diff --git a/.appveyor.yml b/.appveyor.yml index 4a69c2ae375..d2ae3e019a3 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,6 +1,7 @@ image: - Visual Studio 2015 - Visual Studio 2017 + - Visual Studio 2019 build: project: libfabric.sln @@ -8,15 +9,29 @@ build: configuration: - Debug-v140 - Debug-v141 + - Debug-v142 - Release-v140 - Release-v141 + - Release-v142 matrix: exclude: + - configuration: Debug-v140 + image: Visual Studio 2019 - configuration: Debug-v141 image: Visual Studio 2015 + - configuration: Debug-v142 + image: Visual Studio 2015 + - configuration: Debug-v142 + image: Visual Studio 2017 + - configuration: Release-v140 + image: Visual Studio 2019 - configuration: Release-v141 image: Visual Studio 2015 + - configuration: Release-v142 + image: Visual Studio 2015 + - configuration: Release-v142 + image: Visual Studio 2017 before_build: - ps: .appveyor.ps1 -Verbose diff --git a/.gitignore b/.gitignore index f8c16e564b1..437aee891ee 100644 --- a/.gitignore +++ b/.gitignore @@ -66,10 +66,12 @@ prov/*/*.spec .vs fabtests.spec +fabtests/config fabtests/ubertest/fabtest fabtests/ubertest/fi_ubertest fabtests/benchmarks/fi_* fabtests/functional/fi_* fabtests/unit/fi_* +fabtests/multinode/fi_* pingpong/fi_* diff --git a/.travis.yml b/.travis.yml index 4542a996ac6..2f226839765 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,4 @@ -dist: trusty +dist: bionic language: c compiler: - clang @@ -6,6 +6,7 @@ compiler: os: - linux - osx +osx_image: xcode12.2 addons: apt: packages: @@ -33,20 +34,36 @@ addons: - wget - abi-compliance-checker - abi-dumper -# 32 bit support packages - - gcc-multilib - ssh_known_hosts: - - www.openfabrics.org - - git.kernel.org + coverity_scan: + project: + name: "ofiwg/libfabric" + description: "Libfabric project coverity scans" + notification_email: sean.hefty@intel.com + build_command_prepend: "./autogen.sh; ./configure" + build_command: "make -j2" + # It might be overkill to run a full scan across the compiler test matrix + # for every PR to master. The coverity addon can not selectively run for + # certain OSes or compilers. Once a couple runs succeed, change this to a + # coverity-scan branch that we push to on-demand during releases or as + # needed.. + branch_pattern: master env: global: - PREFIX=$HOME/install - PATH=$PREFIX/bin:$PATH - - CPPFLAGS="-Werror -I$PREFIX/include" + - CPPFLAGS="-I$PREFIX/include" - LDFLAGS=-L$PREFIX/lib - LD_LIBRARY_PATH=$PREFIX/lib - - LIBFABRIC_CONFIGURE_ARGS="--prefix=$PREFIX --enable-sockets" + - LIBFABRIC_CONFIGURE_ARGS="--prefix=$PREFIX --enable-tcp" + # Temporarily disable -Werror testing (Jan 2020) because + # there are some warnings about unaligned atomics that I + # do not know how to fix + #- MAKE_FLAGS="AM_CFLAGS=-Werror" + - MAKE_FLAGS= + - ASAN_OPTIONS=detect_leaks=0 + # Encrypted COVERITY_SCAN_TOKEN + - secure: "gDU1pbiuGsuPHezMp0X2DEC9+bBu2F+XDqR93JMkIzHNI7ygQX/kXeJT6ly9MH60paSpIolfQFNA6QotKtpZ62X3a9wrhv3In1viB+EJr1wmsPrKfprI+JfZYevPLTn6LUQM0d2zoclRWNJzY/uldc6bEaXXxDKIaRk8pgmNZR4=" # Brew update GNU Autotools so that autogen can succeed before_install: @@ -56,53 +73,74 @@ before_install: install: - ./autogen.sh - # Build rdma-core because ubuntu trusty doesn't have a sufficiently new version of ibverbs/rdma-core - # Build verbs only in linux as OS X doesn't have verbs support + # Build rdma-core because ubuntu doesn't have a sufficiently new version of + # ibverbs/rdma-core for EFA and PSM3. OS X doesn't have verbs support. 
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] ; then - RDMA_CORE_BRANCH=v13 ; - git clone --depth 1 -b $RDMA_CORE_BRANCH https://github.com/linux-rdma/rdma-core.git && cd rdma-core && bash build.sh && cd - ; + RDMA_CORE_BRANCH="v27.0"; + git clone --depth 1 -b $RDMA_CORE_BRANCH https://github.com/linux-rdma/rdma-core.git && cd rdma-core && bash build.sh && cd -; RDMA_CORE_PATH=$PWD/rdma-core/build ; export LD_LIBRARY_PATH="$RDMA_CORE_PATH/lib:$LD_LIBRARY_PATH" ; LIBFABRIC_CONFIGURE_ARGS="$LIBFABRIC_CONFIGURE_ARGS --enable-usnic - --enable-verbs=$RDMA_CORE_PATH --enable-mlx=$HOME/mlx"; - UCX_BRANCH=v1.2.x; - git clone --depth 1 -b $UCX_BRANCH https://github.com/openucx/ucx.git && cd ucx && ./autogen.sh && ./configure --prefix=$HOME/mlx CFLAGS="-w" && make -j2 install && cd -; - fi - - if [[ "$TRAVIS_OS_NAME" == "linux" && "`basename $CC`" == "clang" ]]; then - ./configure CFLAGS="-Werror $CFLAGS" $LIBFABRIC_CONFIGURE_ARGS - --enable-debug && make -j2; + --enable-psm3=$RDMA_CORE_PATH + --enable-verbs=$RDMA_CORE_PATH + --enable-efa=$RDMA_CORE_PATH"; fi # Test fabric direct - - ./configure --prefix=$PREFIX --enable-direct=sockets --enable-udp=no - --enable-psm=no --enable-gni=no --enable-psm2=no --enable-verbs=no - --enable-usnic=no --enable-rxm=no --enable-rxd=no --enable-mlx=no - - make -j2 + # (all other providers are automatically disabled by configure) + - ./configure --prefix=$PREFIX --enable-direct=sockets + - make -j2 $MAKE_FLAGS # Test loadable library option - - ./configure --enable-sockets=dl --disable-udp --disable-rxm --disable-rxd - --disable-verbs --disable-usnic --disable-mlx --prefix=$PREFIX - - make -j2 + # List of providers current as of Jan 2020 + - ./configure --prefix=$PREFIX --enable-tcp=dl + --disable-bgq + --disable-efa + --disable-gni + --disable-hook_debug + --disable-mrail + --disable-perf + --disable-psm + --disable-psm2 + --disable-psm3 + --disable-rstream + --disable-rxd + --disable-rxm + --disable-shm + --disable-tcp + --disable-udp + --disable-usnic + --disable-verbs + - make -j2 $MAKE_FLAGS - make install - make test - rm -rf $PREFIX + # Test debug build + - echo "Final libfabric configure args $LIBFABRIC_CONFIGURE_ARGS" + - ./configure $LIBFABRIC_CONFIGURE_ARGS --enable-debug + - make -j2 $MAKE_FLAGS # Test regular build - - ./configure $LIBFABRIC_CONFIGURE_ARGS - - make -j2 + - CFLAGS="-fsanitize=address" ./configure $LIBFABRIC_CONFIGURE_ARGS + - make -j2 $MAKE_FLAGS - make install - make test - make distcheck - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make rpm; fi - # Prepare build for fabtests - - ./configure $LIBFABRIC_CONFIGURE_ARGS - - make -j2 - - make install - - make test - - make distcheck - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make rpm; fi + # We don't want to use LIBFABRIC_CONFIGURE_ARGS here as the standard + # prefix should be tested when building the RPM. + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + make dist; + config_options="--enable-efa=$RDMA_CORE_PATH + --enable-psm3=$RDMA_CORE_PATH + --enable-verbs=$RDMA_CORE_PATH --enable-usnic"; + LDFLAGS=-Wl,--build-id rpmbuild -ta + --define "configopts $config_options" libfabric-*.tar.bz2; + fi script: - cd fabtests - ./autogen.sh - - ./configure --prefix=$PREFIX --with-libfabric=$PREFIX + - CFLAGS="-fsanitize=address" ./configure --prefix=$PREFIX --with-libfabric=$PREFIX + # Do not use MAKE_FLAGS here because we use AM_CFLAGS in the + # normal fabtests' Makefile.am (i.e., overriding it on the command + # line removes information that we need to build fabtests itself). 
- make -j2 - make install - make test diff --git a/AUTHORS b/AUTHORS index c3a53a3cefe..cf7e6151b53 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,12 +1,18 @@ +Adam Goldman aikatz aingerson aingerson Ajay Kulkarni aleksandra.justa +Alex McKinley +Alex McKinley Amith Abraham Ana Guerrero López Anatoliy Rozanov Andrew Friedley +Andrey Lobanov +Anthony Zinger +Ao Li Arun C Ilango arun ilango Arun Ilango @@ -17,9 +23,13 @@ Benjamin Drung Ben Menadue Ben Turrubiates Ben Turrubiates +Bernd Schubert +Brian Barrett +Brian J. Murrell Brian Li Chang Hyun Park Charles J Archer +Chenwei Zhang Chen Zhao Chris Dolan Chuck Fossen @@ -27,8 +37,10 @@ Coni Gehler Dardo D Kleiner Dave Goodell David Noel +Dipti Kothari Dmitry Durnov Dmitry Gladkov +Doug Oucharek Erik Paulson Erik Paulson Evan Harvey @@ -38,29 +50,40 @@ Evgeny Leksikov Ezra Kissel Firas Jahjah Frank Zago +Gal Pressman +Gengbin Zheng germanafro Gilles Gouaillardet Gilles Gouaillardet +Goldman, Adam Hefty Holger Hoffstätte +Honggang Li Howard Pritchard +Hui Zhou +Ian Ziemba Ignacio Hernandez Ira Weiny +iziemba <57813515+iziemba@users.noreply.github.com> Jaime Arteaga James Dinan James Shimek James Swaro +James Swaro James Swaro Jason Godfrey Jason Gunthorpe Jay Sternberg +Jean-Yves VET Jeff Hammond Jeff Hammond Jeff Squyres Jerome Berryhill Jerome Boyd Berryhill Jerome Soumagne +Jiakun Yan Jianxin Xiong +Jie Zhang Jim Snow Jithin Jose Joe Doyle @@ -71,11 +94,13 @@ Jonathan Behrens jose jose JoZie +jroznova Ken Raffenetti Kevan rehm Kevan Rehm kseager Latchesar Ionkov +Leena Radeke Lisanna Dettwyler Lisanna Dettwyler Marcin Salnik @@ -87,16 +112,27 @@ Mikhail Khalilov Mikhail Khalilov Mohan Gandhi Neil Spruit +Nicolas Morey-Chaisemartin +nikhilnanal +nikhilnanal +Nikhil Nanal +nikhilnanal Nikita Gusev +Nikola Dancejic Oblomov, Sergey Oblomov, Sergey OFIWG Bot Paolo Inaudi +patrickbueb <70724661+patrickbueb@users.noreply.github.com> +Patrick Bueb Patrick MacArthur Patrick McCormick Paul Coffman Pavan Balaji +Peter Gottesman Peter Gottesman +Phil Carns +Philip Davis Pierre Roux Prankur Gupta Raghu Raja @@ -104,16 +140,19 @@ Raghu Raja Reese Faucette Richard Halkyard Robert Wespetal +Rohit Zambre Sannikov, Alexander Sayantan Sur Sean Hefty Sergey Oblomov Shantonu Hossain +Shi Jin soblomov Solovyev, Dmitriy Spruit, Neil R Srdjan Milakovic Stan Smith +Stephen Oost Steven Vormwald Steve Welch Sung-Eun Choi @@ -121,17 +160,23 @@ Sung-Eun Choi Sylvain Didelot Sylvain Didelot Thananon Patinyasakdikul +Thibault BREZILLON Thomas Smith Tony Zinger tonyzinger +Trevor Hendricks Venkata Krishna Nimmagadda Venkata Krishna Nimmagadda +Wei Zhang Wei Zhang Wesley Bland William Zhang +Xuezhao Liu Xuyang Wang Yohann Burette yohann +Yulu Jia +Zach Tiffany Zach Tiffany Zach ztaylor diff --git a/COPYING b/COPYING index 31bc30a75ee..a786c78ba28 100644 --- a/COPYING +++ b/COPYING @@ -3,7 +3,7 @@ licenses. You may choose to be licensed under the terms of the the BSD license or the GNU General Public License (GPL) Version 2, both included below. -Copyright (c) 2015-2019 Intel Corporation. All rights reserved. +Copyright (c) 2015-2021 Intel Corporation. All rights reserved. Copyright (c) 2015-2019 Cisco Systems, Inc. All rights reserved. ================================================================== diff --git a/Makefile.am b/Makefile.am index a48dda167ec..187b5ddcfaf 100644 --- a/Makefile.am +++ b/Makefile.am @@ -2,6 +2,7 @@ # Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2017-2018 Intel Corporation, Inc. All right reserved. 
# Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All rights reserved. +# (C) Copyright 2020 Hewlett Packard Enterprise Development LP # # Makefile.am for libfabric @@ -39,6 +40,11 @@ rdmainclude_HEADERS = # internal utility functions shared by in-tree providers: common_srcs = \ + src/hmem.c \ + src/hmem_rocr.c \ + src/hmem_cuda.c \ + src/hmem_cuda_gdrcopy.c \ + src/hmem_ze.c \ src/common.c \ src/enosys.c \ src/rbtree.c \ @@ -66,10 +72,15 @@ common_srcs = \ prov/util/src/util_ns.c \ prov/util/src/util_shm.c \ prov/util/src/util_mem_monitor.c\ - prov/util/src/util_mr_cache.c + prov/util/src/util_mem_hooks.c \ + prov/util/src/util_mr_cache.c \ + prov/util/src/cuda_mem_monitor.c \ + prov/util/src/rocr_mem_monitor.c \ + prov/util/src/util_coll.c if MACOS +common_srcs += src/osx/osd.c common_srcs += src/unix/osd.c common_srcs += include/osx/osd.h common_srcs += include/unix/osd.h @@ -116,6 +127,7 @@ util_fi_pingpong_LDADD = $(linkback) nodist_src_libfabric_la_SOURCES = src_libfabric_la_SOURCES = \ + include/ofi_hmem.h \ include/ofi.h \ include/ofi_abi.h \ include/ofi_atom.h \ @@ -125,6 +137,7 @@ src_libfabric_la_SOURCES = \ include/ofi_indexer.h \ include/ofi_iov.h \ include/ofi_list.h \ + include/ofi_bitmask.h \ include/shared/ofi_str.h \ include/ofi_lock.h \ include/ofi_mem.h \ @@ -141,6 +154,7 @@ src_libfabric_la_SOURCES = \ include/ofi_mr.h \ include/ofi_net.h \ include/ofi_perf.h \ + include/ofi_coll.h \ include/fasthash.h \ include/rbtree.h \ include/uthash.h \ @@ -171,7 +185,7 @@ src_libfabric_la_LIBADD = src_libfabric_la_DEPENDENCIES = libfabric.map if !EMBEDDED -src_libfabric_la_LDFLAGS += -version-info 12:0:11 +src_libfabric_la_LDFLAGS += -version-info 16:0:15 endif src_libfabric_la_LDFLAGS += -export-dynamic \ $(libfabric_version_script) @@ -179,6 +193,7 @@ rdmainclude_HEADERS += \ $(top_srcdir)/include/rdma/fabric.h \ $(top_srcdir)/include/rdma/fi_atomic.h \ $(top_srcdir)/include/rdma/fi_cm.h \ + $(top_srcdir)/include/rdma/fi_collective.h \ $(top_srcdir)/include/rdma/fi_domain.h \ $(top_srcdir)/include/rdma/fi_eq.h \ $(top_srcdir)/include/rdma/fi_rma.h \ @@ -394,12 +409,12 @@ include prov/efa/Makefile.include include prov/usnic/Makefile.include include prov/psm/Makefile.include include prov/psm2/Makefile.include +include prov/psm3/Makefile.include include prov/gni/Makefile.include include prov/rxm/Makefile.include include prov/mrail/Makefile.include include prov/rxd/Makefile.include include prov/bgq/Makefile.include -include prov/mlx/Makefile.include include prov/shm/Makefile.include include prov/tcp/Makefile.include include prov/rstream/Makefile.include diff --git a/NEWS.md b/NEWS.md index c87126afc02..63356499c6f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,7 +3,812 @@ Libfabric release notes This file contains the main features as well as overviews of specific bug fixes (and other actions) for each version of Libfabric since -version 1.0. +version 1.0. New major releases include all fixes from minor +releases with earlier release dates. 
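The `-version-info` bump from 12:0:11 to 16:0:15 in the Makefile.am hunk above follows libtool's current:revision:age scheme. A quick sketch of what that maps to on Linux (the build path below is illustrative, not taken from this patch):

```
# libtool names the installed library libfabric.so.(C-A).(A).(R):
#   12:0:11 -> libfabric.so.1.11.0
#   16:0:15 -> libfabric.so.1.15.0
# C-A is unchanged, so the SONAME stays libfabric.so.1 and existing
# consumers keep loading the new library.
objdump -p src/.libs/libfabric.so | grep SONAME
```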
+ +v1.12.0, Mon Mar 8, 2021 +========================= + +## Core + +- Added re-entrant version of fi_tostr +- Added fi_control commands for accessing fid-specific attributes +- Added Ze (level-0) HMEM API support +- Fixed RoCR memory checks +- Minor code cleanups, restructuring, and fixes +- Fix possible stack buffer overflow with address string conversion +- Handle macOS socket API size limitations +- Verify and improve support for CUDA devices +- Update internal string functions to protect against buffer overflow +- Support gdrcopy in addition to cudaMemcpy to avoid deadlocks +- Properly mark if addresses support only local communication +- Prevent providers from layering over each other non-optimally + +## EFA +- Added support for FI_DELIVERY_COMPLETE via an acknowledgment packet in the + provider. Applications that request FI_DELIVERY_COMPLETE will see a + performance impact from this release onward. The default delivery semantic + for EFA is still FI_TRANSMIT_COMPLETE and acknowledgment packets will not be + sent in this mode. +- Added ability for the provider to notify device that it can correctly handle + receiver not ready (RNR) errors. There are still known issues so this is + currently turned off by default; the device is still configured to retry + indefinitely. +- Disable FI_HMEM when FI_LOCAL_COMM is requested due to problems in the + provider with loopback support for FI_HMEM buffers. +- Use a loopback read to copy from host memory to FI_HMEM buffers in the + receive path. This has a performance impact, but using the native copy API + for CUDA can cause a deadlock when the EFA provider is used with NCCL. +- Only allow fork support when the cache is disabled, i.e. the application + handles registrations (FI_MR_LOCAL) to prevent potential data corruption. + General fork support will be addressed in a future release. +- Moved EFA fork handler check to only trigger when an EFA device is present + and EFA is selected by an application. +- Changed default memory registration cache monitor back to userfaultfd due to + a conflict with the memory hooks installed by Open MPI. +- Fixed an issue where packets were incorrectly queued which caused message + ordering issues for messages the EFA provider sent via SHM provider. +- Fixed a bug where bounce buffers were used instead of application provided + memory registration descriptors. +- Various fixes for AV and FI_HMEM capability checks in the getinfo path. +- Fix bug in the GPUDirect support detection path. +- Various fixes and refactoring to the protocol implementation to resolve some + memory leaks and hangs. 
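As a companion to the Core and EFA notes above, a minimal smoke-test sketch (hypothetical commands, not part of this patch) using the fi_info and fi_pingpong utilities that ship with libfabric; FI_MR_CACHE_MONITOR and FI_LOG_LEVEL are standard libfabric environment variables:

```
# Confirm the EFA provider is visible after installing 1.12.0
fi_info -p efa

# Pin the registration-cache monitor (userfaultfd, memhooks, or disabled)
# mentioned in the EFA notes and run a quick ping-pong; start this on the
# server, then run "fi_pingpong -p efa <server>" on the client.
FI_MR_CACHE_MONITOR=userfaultfd FI_LOG_LEVEL=warn fi_pingpong -p efa
```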
+ +## PSM3 + +- New Intel provider optimized for verbs UD QPs + +## RxD + +- Added missing cleanup to free peer endpoint data with AV +- Add support for FI_SYNC_ERR flag + +## RxM + +- Cleanup atomic buffer pool lock resources +- Fix unexpected message handling when using multi-recv buffers +- Handle SAR and rendezvous messages received into multi-recv buffers +- Give application entire size of eager buffer region +- Minor code cleanups based on static code analysis +- Simplify rendezvous message code paths +- Avoid passing internal errors handling progress directly to applications +- Limit fi_cancel to canceling at most 1 receive operation +- Remove incorrect handling if errors occur writing to a CQ +- Only write 1 CQ entry if a SAR message fails +- Continue processing if the receive buffer pool is full and reposting delayed +- Add support for dynamic receive buffering when layering over tcp +- Add support for direct send to avoid send bounce buffers in certain cases +- Prioritize credit messages to avoid deadlock +- Fix conversion to message provider's mr access flags +- Reduce inject size by the minimum packet header needed by rxm +- Fix checks to enable shared rx when creating an endpoint +- Minor code restructuring +- Fix trying to access freed memory in error handling case +- Use optimized inject limits to avoid bounce buffer copies +- Fix possible invalid pointer access handling rx errors +- Add support for HMEM if supported by msg provider +- Add missing locks around progress to silence thread-sanitizer +- Support re-connecting to peers if peer disconnects (client-server model) +- Cleanup rendezvous protocol handling +- Add support for RMA write rendezvous protocol + +## SHM + +- Add support for Ze IPC protocol +- Only perform IPC protocol related cleanup when using IPC +- Disable cross-memory attach protocol when HMEM is enabled +- Fix cross-memory attach support when running in containers +- Always call SAR protocol's progress function +- Enable cross-memory attach protocol when sending to self +- Minor code cleanups and restructuring for maintenance + +## Sockets + +- Verify CM data size is less than supported value +- Handle FI_SYNC_ERR flag on AV insert +- Improve destination IP address checks +- Minor coding cleanups based on static code analysis + +## TCP + +- Fix hangs on windows during connection setup +- Relax CQ checks when enabling EP to handle send/recv only EPs +- Fix possible use of unset return value in EP enable +- Minor coding cleanups based on static code analysis +- Handle EAGAIN during CM message exchanges +- Set sockets to nonblocking on creation to avoid possible hangs at scale +- Improve CM state tracking and optimize CM message flows +- Make passive endpoints nonblocking to avoid hangs +- Allow reading buffered data from disconnected endpoints +- Implement fi_cancel for receive queues +- Flush outstanding operations to user when an EP is disabled +- Support dynamic receive buffering - removes need for bounce buffers +- Add direct send feature - removes need for bounce buffers +- Minor code cleanups and restructuring to improve maintenance +- Add support for fo_domain_bind + +## Util + +- Improve checks that EPs are bound to necessary CQs +- Fix mistaking the AV's total size with current count to size properly +- Fix CQ buffer overrun protection mechanisms to avoid lost events + +## Verbs + +- Add SW credit flow control to improve performance over Ethernet +- Skip verbs devices that report faulty information +- Limit inline messages to iov = 1 to support 
more devices +- Minor code improvements and restructuring to improve maintenance +- Enable caching of device memory (RoCR, CUDA, Ze) registrations +- Add HMEM support, including proprietary verbs support for P2P +- Add support for registering device memory +- Support GIDs at any GID index, not just 0 +- Fix macro definitions to cleanup build warnings +- Support GID based connection establishment, removes ipoib requirement +- Reduce per peer memory footprint for large scale fabrics + +v1.11.2, Tue Dec 15, 2020 +========================= + +## Core + +- Handle data transfers > 4GB on OS X over tcp sockets +- Fixed spelling and syntax in man pages +- Fix pmem instruction checks + +## EFA + +- Use memory registration for emulated read protocol +- Update send paths to use app memory descriptor if available +- Remove unneeded check for local memory registration +- Do not install fork handler if EFA is not used +- Fix medium message RTM protocol +- Fix memory registration leak in error path +- Fix posting of REQ packets when using shm provider + +## RxM + +- Fix provider initialization when built as a dynamic library + +## SHM + +- Reverts SAR buffer locking patch +- Include correct header file for process_vm_readv/writev syscalls +- Skip atomic fetch processing for non-fetch operations + +## TCP + +- Fix swapping of address and CQ data in RMA inject path + +## Util + +- Fix error code returned for invalid AV flags +- Fix a bug finding the end of a page when the address is aligned + +## Verbs + +- Fix build warning in XRC CM log messages +- Fix build warnings in debug macros + +v1.11.1, Fri Oct 9, 2020 +======================== + +## Core + +- Remove calls to cuInit to prevent indirect call to fork +- Ignore case when comparing provider names +- Prevent layering util providers over EFA +- Fix segfault if passed a NULL address to print +- Fail build if CUDA is requested but not available + +## EFA + +- Switch to memhooks monitor +- Avoid potential deadlock copying data to GPU buffers +- Allow creating packet pools with non-huge pages +- Check return value when processing data packets +- Minor code restructuring and bug fixes +- Check if outstanding TX limit has been reached prior to sending +- Move RDMA read registration to post time +- Do not overwrite a packet's associated MR when copying packets +- Pass in correct packet when determining the header size +- Do not release rx_entry in EAGAIN case +- Disable MR cache if fork support is requested +- Turn off MR cache if user supports FI_MR_LOCAL +- Add FI_REMOTE_READ to shm registrations +- Remove use_cnt assert closing domain to allow driver cleanup +- Fix off by 1 returned AV address when using AV map +- Ensure setting FI_HMEM capability is backwards compatible + +## RxD + +- Fix bug that prevents sending timely ACKs for segmented messages +- Remove calls that recursively try to acquire the EP lock + +## RxM + +- Allow re-connecting to peers + +## SHM + +- Create duplicate fi_info's when reporting FI_HMEM support +- Handle transfers larger than 2GB +- Register for signal using SA_ONSTACK +- Fix segfault if peer has not been inserted into local AV +- Fix command/buffer tracking for sending connection requests +- Return proper errno on AV lookup failures +- Remove duplicate call to ofi_hmem_init +- Fix using incorrect peer id for mid-sized message transfers +- Fix addressing race conditions +- Fix mixing of shm AV index values with fi_addr_t values +- Fix initialization synchronization +- Ensure progress is invoked for mid-sized message transfers 
+- Always use CMA when sending data to self +- Fix hang using SAR protocol + +## Sockets + +- Retry address lookup for messages received during CM setup + +## TCP + +- Fix possible deadlock during EP shutdown due lock inversion +- Rework CM state machine to fix lock inversion handling disconnect + +## Util + +- Correctly mark if addresses support local/remote communication +- Check madvise memhook advice +- Update mmap intercept hook function +- Replace memhooks implementation to intercept syscalls +- Fix shmat intercept hook handling +- Fix error handling obtaining page sizes +- Fix incorrect locking in MR cache +- Fix memory leak in rbtree cleanup + +## Verbs + +- Fix XRC transport shared INI QP locking +- Account for off-by-one flow control credit issue +- Fix disabling of receive queue flow control +- Reduce overall memory footprint on fully connected apps +- Skip reporting native IB addresses when network interface is requested + +v1.11.0, Fri Aug 14, 2020 +========================= + +## Core + +- Add generalized hmem_ops interface for device ops +- Add FI_HMEM_CUDA, FI_HMEM_ROCR, and FI_HMEM_ZE interfaces and device support +- Add CUDA and ROCR memory monitors and support for multiple monitors +- Add fi_tostr for FI_HMEM_* interfaces +- Add utility interface and device support +- Add documentation for hmem override ops +- Save mr_map mem_desc as ofi_mr +- Rework and reorganize memory monitor code +- Add mr_cache argument flush_lru to ofi_mr_cache_flush +- Fix 1.1 ABI domain, EP, and tx attributes +- Add loading of DL providers by name +- Add CMA wrappers and define CMA for OSX +- Fix util getinfo: use base fi_info caps, altering mr_mode properly, + FI_MR_HMEM support, NULL hints, set CQ FI_MSG flag, query FI_COLLECTIVE, + list FI_MATCH_COMPLETE, select and request specific core provider +- Add rbmap interface to get root node +- Add support of AF_IB to addr manipulation functions +- Windows: Map strtok_r() to strtok_s() +- Define OFI_IB_IP_{PORT,PS}_MASK +- Make fi_addr_format() public +- Remove mr_cache entry subscribed field +- Update memhooks brk and implement sbrk intercepts +- Fix vrb_speed units +- Fix possible null dereference in ofi_create_filter +- Add ofi_idx_ordered_remove +- Add functions ofi_generate_seed() and ofi_xorshift_random_r() +- Call correct close fd call in util_wait_fd_close +- Set a libfabric default universe size +- Add compatibility with SUSE packaging +- Windows: Handle socket API size limitations +- Fix UBSAN warnings +- Save and restore the errno in FI_LOG +- Ensure that access to atomic handlers are in range +- Ensure ifa_name is null terminated in ofi_get_list_of_addr +- Buffer pools fallback to normal allocations when hugepage allocations fail + +## EFA + +- Add support to use user posted receive buffers with RDM EP when requested +- Various fixes to FI_HMEM support +- Added fork handler and abort if rdma-core is incorrectly configured +- Fix bandwidth regression due to increased structure size +- Reuse verbs protection domain when in same process address space +- Periodically flush MR cache to reduce MR usage +- Properly handle setting/unsetting RDMAV_HUGEPAGES_SAFE +- Fix provider_version reported by EFA +- Populate additional fields in fid_nic +- Fix various bugs in the completion, info, and domain paths +- Fix various memory leaks + +## PSM2 + +- Treat dynamic connection errors as fatal +- Add missing return status checking for PSM2 AM calls + +## RxD + +- updated AV design to be dynamically extensible using indexer and index map. 
+- updated static allocation of peers with runtime allocation during rts. +- added wrapper to fetch pointer to a peer from the peers data structure. +- Updated to show correct msg_ordering. +- Check datatype size when handling atomic ops. +- Verify atomic opcode in range for fixing Klocwork issue. +- Corrected use of addr in rxd_atomic_inject for retrieving rxd_addr. + +## RxM + +- Align reporting of FI_COLLECTIVE with man pages +- Show correct ordering of atomic operations +- Fix error handling inserting IP addresses into an AV +- Minor code cleanups and bug fixes +- Select different optimizations based on running over tcp vs verbs +- Use SRX by default when using tcp to improve scaling +- Correct CQ size calculation when using SRX +- Fix MR registration error path when handling iov's +- Allow selecting tcp wait objects separate from verbs +- Only repost Rx buffers if necessary + +## SHM + +- Fix a CMA check bug +- Fix shm provider signal handler calling the original handler +- Add initial framework for IPC device copies +- Add FI_HMEM support and integrate hmem_ops +- Fix error handling path in smr_create +- Fix AV insertion error handling +- Verify atomic op value +- Redefine shm addrlen to not use NAME_MAX +- Fix snprintf to exclude byte for null terminator +- Mark smr_region as volatile +- Fix memory leaks + +## Sockets + +- Fix backwards compatibility accessing struct fi_mr_attr +- Fix use after free error in CM threads +- Free unclaimed messages during endpoint cleanup to avoid memory leaks +- Improve handling of socket disconnection +- Limit time spent in progress when expected list is long +- Avoid thread starvation by converting spinlocks to mutex + +## TCP + +- Minor bug fixes +- Verify received opcode values are valid +- Avoid possible receive buffer overflow from malformed packets +- Fix fi_cq_sread failing with ECANCELED +- Optimize receive progress handling +- Do not alter pseudo random sequence numbers +- Increase default listen backlog size to improve scaling +- Handle processing of NACK packets during connection setup +- Fix wrong error handling during passive endpoint creation +- Add logging messages during shutdown handling +- Improve logging and error handling +- Fix possible use after free issues during CM setup +- Minor code restructuring + +## Util + +- Use internal flags in place of epoll flags for portability +- Support HMEM with the mr_cache +- Verify application requested FI_HMEM prior to accessing fi_mr_attr fields +- Fix memory leak when using POLLFD wait sets +- Ensure AV data is aligned even if address length is not +- Fix handling of mr mode bits for API < 1.5 +- Allow user to force use of userfaultfd memory monitor + +## Verbs + +- Add support for AF_IB and native IB addressing +- Minor code cleanups +- Avoid possible string overrun parsing interface names +- Fix memory leak handling duplication interface names +- Add XRC shared Rx CQ credit reservation +- Fix possible segfault when closing an XRC SRQ +- Fix verbs speed units to MBps +- Add flow control support to avoid RQ overruns +- Fix memory leak of address data when creating endpoints + +v1.10.1, Fri May 8, 2020 +======================== + +## Core + +- Fixed library version + +## EFA + +- Allow endpoint to choose shm usage +- Fix handling of REQ packets +- Fix logic writing a Tx completion entry +- Use correct Tx operation flags for msg sends + +## Fabtests + +- Use pax tar format when creating source packages + +## RxD + +- Use correct peer address for atomic_inject calls + +## SHM + +- Fix BSD 
build failure + +## TCP + +- Add locking around signaling a wait fd + +v1.10.0, Fri Apr 24, 2020 +========================= + +## Core + +- Added new pollfd wait object to API +- Added ability to query for a fid's wait object type +- Updated most providers to a new provider versioning format +- Support using multiple fds for blocking calls, in place of epoll +- Fix memory leak when destroying rbtrees +- Record interface names and network names for IP addressable providers +- Improved performance of timing calculations +- Improvements to MR caching mechanism + +## EFA + +- Replaces custom admin commands with native use of rdma-core APIs +- Added support for FI_RMA using RDMA Reads +- Added rendezvous protocol for long messages using RDMA Reads +- Added support for CUDA buffers (FI_HMEM) +- Added medium-message protocol +- Added support for atomic operations +- Added randomized Queue Key assignment to endpoints +- Improved support for client-server applications +- Disables use of shared-memory if FI_LOCAL_COMM is not required +- Updated protocol to v4 +- Refactor packet handling functions and headers for better extensibility +- Added handshake protocol to negotiate protocol features with peers +- Refactor send/recv paths for improved memory descriptor handling +- Use inlined device sends for FI_INJECT +- Removes fork() to detect CMA support from the init path +- Better reuse of MRs keys across EFA and SHM control path +- Squashes the MR functions in the RxR and EFA layers +- Squashes the AV functions in the RxR and EFA layers +- Use 0-based offset if FI_MR_VIRT_ADDR not set +- Retries memory registration in MR cache error paths +- Fixes to addr_format handling in the RDM endpoint +- Fixes memory leaks +- Fixes AV error handling paths +- Fixes shm error handling paths +- Fixes compiler warnings + +## PSM2 + +- Improve source address translation for scalable endpoints + +## RxM + +- Add support for pollfd wait objects +- Fix double free in error path +- Report CQ errors for failed RMA transfers +- Fixing locking in tagged receive path +- Remove incorrect rx_attr capability bits +- Handle unexpected messages when posting multi-recv buffers +- Repost multi-recv buffers to the receive queue head +- Fix unexpected message handling +- Fix stall in collective progress caused by lost receive buffers +- Add support for collection operations + +## RxD + +- Replace rxd_ep_wait_fd_add with direct call to ofi_wait_fd_add +- Reorganize attr caps +- Add rxd to fi_provider man page + +## SHM + +- Fix pointer ofi_cq_init progress pointer +- Add CQ wait object support with new FI_WAIT_YIELD wait type +- Include string terminator in addrlen +- Fix av_insert address cast +- Fix unexpected messaging processing on empty receive queue +- Fix unexpected messaging locking +- Progress unexpected queue for non-tagged receives +- Move ep_name_list initialization/cleanup and fix signal handling +- Reorganize attr caps +- Warn once on peer mapping failures +- Add FI_DELIVERY_COMPLETE support +- Fix FI_MULTI_RECV reporting and allow writing to overflow CQ for unexpected MULTI_RECV +- Refactor and simplify msg processing, formating, and recv posting +- Rename ep_entry to rx_entry and add tx_entry for pending outgoing messages +- Properly align cmd data +- Return correct addrlen on av lookup +- Fix id passed into rma fast path +- Fix typo +- Fix potential data ordering issue in atomic fetch path +- Add proper RMA read protocol without CMA +- Add runtime CMA check during mapping +- Add mmap-based fallback protocol for large 
messages without CMA +- Add large message segmentation fallback protocol for large messages without CMA and + add FI_SHM_SAR_THRESHOLD to control switching between segmentation and mmap +- Define macros for address translation +- Allow building of shm provider on older kernels with x86 arch +- Rename peer_addr to peer_data +- Change locking when progressing response entries +- Fix cmd_cnt increment on RMA ops +- Add error handling when inserting more than SMR_MAX_PEERS +- Add shm size space check +- Fix locking when processing response from self +- Add locking around the ep_name_list + +## TCP + +- Fix incorrect signaling of fd waking up thread in fi_cq_sread +- Switch to using pollfd wait object instead of epoll as default +- Add missing ep lock to fix possible ep list corruption +- Remove incorrectly reported CQ events posted to EQ +- Update domain name to IP network name +- Improved socket processing to improve scalability and performance +- Remove incorrect implementation of FI_MULTI_RECV support +- Report error completions even if successful completion is suppressed +- Report correct EQ event for aborted connections + +## Verbs + +- Fix XRC request identification +- Fix small memory leak for XRC connections +- Add retry logic for XRC connections +- Fix mapping of domains to NICs when multiple NICs are present +- Allow filtering of device names via environment variable +- Fix compilation with -fno-common option +- Code restructuring to improve maintenance + +v1.9.1, Fri Mar 6, 2020 +======================= + +## Core + +- Fix gcc 9.2 warnings +- Fix thread hangs in MR cache when using userfaultfd monitor +- Add missing header for FreeBSD build +- Allow a core provider to discover and use filtered providers + +## EFA + +- Change MR cache count and size limits +- Fixes to 32-bit msg_id wraparound handling +- Adds address map to look up EFA address from shm address +- Remove unnecessary EFA device name check +- Detect availability of CMA directly from EFA provider +- Use OFI_GETINFO_HIDDEN flag when querying for shm +- Allow use of EFA when shm is unavailable +- Fixes info and domain capabilities for RDM endpoint +- Fixes to dest_addr returned with info objects +- Fixes segfault in efa_mr_cache_entry_dereg() +- Fixes compilation warning in DSO build of the provider +- Fixes compilation errors with -fno-common +- Fixes to send-side control path + +## PSM2 + +- Clean up of AV entries that have been removed + +## RxM + +- Fix multi-recv buffer handling to use entire buffer +- Consume entire multi-recv buffer before using buffer +- Continue execution after handling transfer errors +- Properly cleanup CM progress thread +- Minor code cleanups and restructuring + +## SHM + +- Properly restore captured signals +- Track ptrace_scope globally, and allow disabling +- Properly initialize endpoint name list +- Fix potential deadlock resulting from missed handling of unexpected messages +- Fix multi-threading issue accessing unexpected messages +- Handle multiple addresses passed to fi_av_insert +- NULL terminate address strings +- Pass correct pointer to ofi_cq_init + +## TCP + +- Removed incorrect implementation for multi-recv buffer support +- Always report error completions +- Report correct EQ event for aborted connection requests +- Improve connection data corner cases + +## Verbs + +- Fix segfault handling error completions +- Avoid null dereference handling EQ events +- Remove possible deadlock in XRC error path +- Enable credit tracking to avoid SQ, RQ, and CQ overruns +- Verify that CQ space 
is available for bound EPs +- Minor code cleanups and restructuring + + +v1.9.0, Fri Nov 22, 2019 +======================== + +## Core + +- Add generic implementation for collective operations +- Add support for traffic class selection +- Fixes and enhancements to memory registration cache +- Add support for older kernels to the MR cache (hook malloc related calls) +- Fix setting loopback address byte ordering +- Fix MR cache locking from spinlock to a mutex to avoid starvation +- Add API enhancements for heterogeneous memory (e.g. GPUs) +- Limit default size of MR cache to avoid out of memory errors +- Fix g++ compile error +- Enhanced the hooking provider infrastructure +- Enhanced windows support for IPv6 and NIC selection +- Fix timeout calculation in wait operations +- Add simple spell checker for FI_PROVIDER +- Fix red-black tree possible use after free issue +- Fix segfault running libfabric within a linux container +- Minor cleanups and bug fixes +- Work-around possible long delay in getaddrinfo() + +## EFA + +- Introduce support for shared-memory communication using shm provider +- Enable Memory Registration caching by default +- Refactor TX and CQ handling functions to reduce branching +- Use application-provided MR descriptors when available +- Optimize progress engine polling loop for shm and EFA completions +- Enable inline registration for emulated RMA reads +- Inherit FI_UNIVERSE_SIZE for AV sizing +- Increase default min AV size to 16K +- Fix uninitialized objects with DSO build of the provider +- Fix handling of FI_AV_UNSPEC +- Fix crash and resource leak with fi_cancel() implementation +- Fix issues with EFA's registration cache under efa;ofi_rxd +- Fix MR allocation handlers to use correct pointer and size +- Fix error handling in multi-recv completion code +- Fix compilation errors when built with valgrind annotations +- Fix compilation errors when packet poisoning was enabled +- Fix incorrect parameter definitions +- Fix leaks of internal resources +- Miscellaneous cleanups and bug fixes + +## MRail + +- Renamed address control environment variable +- Implement large message striping using rendezvous +- Properly set tx/rx op flags + +## PSM2 + +- Fix memory leaks +- Add fi_nic support +- Report correct value for max_order_raw_size +- Report max_msg_size as a page aligned value +- Fix potential multi-threaded race condition +- Avoid potential deadlock in disconnect protocol + +## RxD + +- Fix default AV count +- Minor cleanups and optimizations +- Handle errors unpacking packets +- Report all failures when inserting addresses into AV +- Remove unneeded posted buffer tracking + +## RxM + +- Fix inject completion semantics +- Fix MR key handling when mismatched with core provider +- Add basic support for some collective operations +- Fix senddata desc parameter mismatch +- Serialize EQ processing to avoid use after free issue +- Minor cleanup and optimizations +- Remove atomic buffer limitations +- Provide mechanism to force auto-progress for poorly designed apps +- Fix high memory usage when using RMA +- Fix segfault handling memory deregistration +- Discard canceled receive buffers when closing msg ep +- Fix memory leaks in connection management + +## SHM + +- Cleanup tmpfs after unclean shutdown +- Increase the size of endpoint names +- Align endpoint count attribute with maximum supported peer count +- Add user ID to shared memory name +- Only support small transfers if ptrace is restricted +- Fix incorrect reporting of completion buffer +- Return correct addrlen on 
fi_getname +- Round tx/rx sizes up in case sizes are not already a power of two +- Skip utility providers for shm provider + +## TCP + +- Report aborted requests as canceled +- Fixed support for 0-length transfers +- Return positive error code for CQ entries +- Bind ports using SO_REUSEADDR +- Properly check for correct recv completion length +- Fix potential deadlock due to lock ordering issue + +## Verbs + +- Enable on-demand paging memory registration option +- Enable send queue overflow optimization for mlx devices +- Cleanup EQ when closing an associated endpoint +- Minor optimizations and code restructuring +- Avoid potential deadlock accessing EQ and EP +- Speedup XRC connection setup +- Handle IPv6 link local address scope id +- Updates to support new versions of rdma-core libraries +- XRC connection optimizations, cleanups, and error handling improvements +- Fix possible segfault in error handling path +- Remove support for vendor specific and experimental verbs +- Handle 0-length memory registrations +- Fix EQ trywait behavior to check for software events + + +v1.8.1, Mon Sep 30, 2019 +======================== + +## Core + +- Limit default size of memory registration cache +- Verify that correct entry is removed from MR cache + +## EFA + +- Fixes to fi_cancel() when used with multi-recv buffers +- Fixes to registered memory handling after a fork() +- Fixes to the long message flow-control protocol +- Use FI_AV_TABLE as the preferred AV type +- Fixes to the bufpool allocation handlers +- Fixes to RTS handler +- Fix to use correct arch detection preprocessor macro +- Expose fid_nic information +- Fix memory leaks + +## PSM2 + +- Fix incorrect value of max_order_raw_size +- Report page aligned max_msg_size +- Always enable the lock accessed by the disconnection thread +- Fix race condition with progress thread and FI_THREAD_DOMAIN +- Avoid a potential deadlock in disconnection protocol + +## RxD +- Fix default AV count with environment variable FI_OFI_RXD_MAX_PEERS + +## RxM + +- Fix connection handle shutdown/CQ processing race +- Fix RMA ordering bits for FI_ATOMIC + +## SHM +- Add correct reporting of FI_MR_BASIC +- Add correct reporting and proper support of FI_DIRECTED_RECV + +## Verbs + +- Allow zero length memory registrations +- Improve connection scale up by removing synchronous calls in fi_getinfo +- Fix missing serialization to event channel during CM ID migration +- Protect XRC EQ processing from EP API connect/accept calls +- Fix XRC connection tag to EP return value in error case +- return EAGAIN to user if an unhandled rdmacm event is received +- handle IPv6 link local addresses correctly + v1.8.0, Fri Jun 28, 2019 ======================== @@ -127,7 +932,7 @@ v1.7.2, Fri Jun 14, 2019 - Fix message windowing - Limit number of transfer entries that can be active - Use utility CQ calls to handle CQ overflow -- Set correct opcde when completing read completions +- Set correct opcode when completing read completions - Preset and fix tx and rx transfer flags - Fix segfault diff --git a/README.md b/README.md index 1ed9a8a0417..23cc0f65ed9 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ [libfabric master branch Travis CI status](https://travis-ci.org/ofiwg/libfabric) -[libfabric Coverity scan suild status](https://scan.coverity.com/projects/4274) +[libfabric Coverity scan build status](https://scan.coverity.com/projects/4274) +[libfabric master branch AppVeyor CI status](https://ci.appveyor.com/api/projects/status/github/ofiwg/libfabric) [![libfabric release 
version](https://img.shields.io/github/release/ofiwg/libfabric.svg)](https://github.com/ofiwg/libfabric/releases/latest) # libfabric @@ -140,12 +141,6 @@ See the `fi_gni(7)` man page for more details. - The `gni` provider requires `gcc` version 4.9 or higher. -### mxm - -*** - -The MXM provider has been deprecated and was removed after the 1.4.0 release. - ### psm *** @@ -177,12 +172,26 @@ Intel TrueScale Fabric. See the `fi_psm2(7)` man page for more details. +### psm3 + +*** + +The `psm3` provider provides optimized performance and scalability for most +verbs UD devices. Additional features and optimizations can be enabled when +running over Intel's E810 Ethernet NICs and using Intel's rendezvous kernel +module ([`rv`](https://github.com/intel/iefs-kernel-updates)). PSM 3.x fully integrates the OFI provider and the underlying +PSM3 protocols/implementation and only exports the OFI APIs. + +See [`fi_psm3`(7)](https://ofiwg.github.io/libfabric/master/man/fi_psm3.7.html) for more details. + ### rxm +*** + The `ofi_rxm` provider is an utility provider that supports RDM endpoints emulated over MSG endpoints of a core provider. -See [`fi_rxm`(7)](fi_rxm.7.html) for more information. +See [`fi_rxm`(7)](https://ofiwg.github.io/libfabric/master/man/fi_rxm.7.html) for more information. ### sockets @@ -336,27 +345,6 @@ See the `fi_netdir(7)` man page for more details. root of provier directory, i.e. \prov\netdir\NetDirect, where NetDirect contains the header files), specify them in the configuration properties of the VS project. -### mlx - -*** - -The MLX provider enables applications using OFI to be run over UCX -communication library. It uses libucp for connections control and data transfer operations. -Supported UCP API version: 1.2 - -See the `fi_mlx(7)` man page for more details. - -#### Dependencies - -- The MLX provider requires UCP API 1.2 capable libucp and libucs (tested with hpcx v1.8.0, v1.9.7). - If you are compiling Libfabric from source and want to enable MLX - support, you will also need the matching header files for UCX. - If the libraries and header files are not in default paths, specify them using: - -``` ---with-mlx= -``` - ### shm *** @@ -380,7 +368,7 @@ EC2 Elastic Fabric Adapter (EFA)](https://aws.amazon.com/hpc/efa/), a custom-built OS bypass hardware interface for inter-instance communication on EC2. -See [`fi_efa`(7)](fi_efa.7.html) for more information. +See [`fi_efa`(7)](https://ofiwg.github.io/libfabric/master/man/fi_efa.7.html) for more information. ## WINDOWS Instructions @@ -406,6 +394,7 @@ Even though windows isn't fully supported yet it is possible to compile and link 1-2: Debug/Release ICC (restricted support for Intel Compiler XE 15.0 only) 3-4: Debug/Release v140 (VS 2015 tool set) 5-6: Debug/Release v141 (VS 2017 tool set) + 7-8: Debug/Release v142 (VS 2019 tool set) make sure you choose the correct target fitting your compiler. By default the library will be compiled to `\x64\` diff --git a/config/fi_provider.m4 b/config/fi_provider.m4 index 704e0eaaa92..e01e3373c9f 100644 --- a/config/fi_provider.m4 +++ b/config/fi_provider.m4 @@ -9,8 +9,6 @@ AC_DEFUN([FI_PROVIDER_INIT],[ PROVIDERS_DL= PROVIDERS_STATIC= PROVIDERS_COUNT= - - m4_include(config/fi_check_package.m4) ]) dnl diff --git a/configure.ac b/configure.ac index 2190e409917..2594de363a2 100644 --- a/configure.ac +++ b/configure.ac @@ -1,18 +1,20 @@ dnl dnl Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. -dnl Copyright (c) 2019 Intel, Inc. All rights reserved. 
-dnl Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All rights reserved. +dnl Copyright (c) 2019-2021 Intel, Inc. All rights reserved. +dnl Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved. +dnl (C) Copyright 2020 Hewlett Packard Enterprise Development LP dnl dnl Process this file with autoconf to produce a configure script. AC_PREREQ([2.60]) -AC_INIT([libfabric], [1.9.0a1], [ofiwg@lists.openfabrics.org]) +AC_INIT([libfabric], [1.12.0rc2], [ofiwg@lists.openfabrics.org]) AC_CONFIG_SRCDIR([src/fabric.c]) AC_CONFIG_AUX_DIR(config) AC_CONFIG_MACRO_DIR(config) AC_CONFIG_HEADERS(config.h) -AM_INIT_AUTOMAKE([1.11 dist-bzip2 foreign -Wall -Werror subdir-objects parallel-tests tar-ustar]) +AM_INIT_AUTOMAKE([1.11 dist-bzip2 foreign -Wall -Werror subdir-objects parallel-tests tar-pax]) m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) +m4_include(config/fi_check_package.m4) AC_CANONICAL_HOST @@ -93,6 +95,9 @@ AC_ARG_ENABLE([atomics], dnl Checks for programs AC_PROG_CC_C99 +AS_IF([test "$ac_cv_prog_cc_c99" = "no"], + [AC_MSG_WARN([Libfabric requires a C99-compliant compiler]) + AC_MSG_ERROR([Cannot continue])]) AM_PROG_CC_C_O AC_PROG_CPP @@ -127,6 +132,16 @@ AS_IF([test x"$enable_debug" != x"no"], AC_DEFINE_UNQUOTED([ENABLE_DEBUG],[$dbg], [defined to 1 if libfabric was configured with --enable-debug, 0 otherwise]) +AC_ARG_ENABLE([asan], + [AS_HELP_STRING([--enable-asan], + [Enable address sanitizer @<:@default=no@:>@]) + ], + [], + [enable_asan=no]) + +AS_IF([test x"$enable_asan" != x"no"], + [CFLAGS="-fsanitize=address $CFLAGS"]) + dnl Checks for header files. AC_HEADER_STDC @@ -180,10 +195,19 @@ fi AC_DEFINE_UNQUOTED([PT_LOCK_SPIN], [$have_spinlock], [Define to 1 if pthread_spin_init is available.]) -AC_CHECK_FUNCS([epoll_create]) -if test "$ac_cv_func_epoll_create" = yes; then - AC_DEFINE([HAVE_EPOLL], [1], [Define if you have epoll support.]) -fi +AC_ARG_ENABLE([epoll], + [AS_HELP_STRING([--disable-epoll], + [Disable epoll if available@<:@default=no@:>@])], + [], + [enable_epoll=auto] +) + +AS_IF([test x"$enable_epoll" != x"no"], + [AC_CHECK_FUNCS([epoll_create]) + if test "$ac_cv_func_epoll_create" = yes; then + AC_DEFINE([HAVE_EPOLL], [1], [Define if you have epoll support.]) + fi] +) AC_CHECK_HEADER([linux/perf_event.h], [AC_CHECK_DECL([__builtin_ia32_rdpmc], @@ -349,8 +373,8 @@ AS_IF([test "$icc_symver_hack"], [AC_MSG_RESULT(disabled)], [ -AC_TRY_LINK([], - [__asm__(".symver main_, main@ABIVER_1.0");], +AC_TRY_LINK([__asm__(".symver main_, main@ABIVER_1.0");], + [], [ AC_MSG_RESULT(yes) ac_asm_symver_support=1 @@ -372,7 +396,7 @@ AC_TRY_LINK( int foo(int arg) { return arg + 3; }; int foo2(int arg) __attribute__ (( __alias__("foo"))); ], - [ /* empty main */ ], + [ foo2(1); ], [ AC_MSG_RESULT(yes) ac_prog_cc_alias_symbols=1 @@ -451,17 +475,207 @@ AS_IF([test $have_uffd -eq 1], AC_DEFINE_UNQUOTED([HAVE_UFFD_UNMAP], [$have_uffd], [Define to 1 if platform supports userfault fd unmap]) +dnl Check support to intercept syscalls +AC_CHECK_HEADERS_ONCE(elf.h sys/auxv.h) + +dnl Check support to clock_gettime +have_clock_gettime=0 + +AC_SEARCH_LIBS([clock_gettime],[rt], + [have_clock_gettime=1], + []) + +AC_DEFINE_UNQUOTED(HAVE_CLOCK_GETTIME, [$have_clock_gettime], + [Define to 1 if clock_gettime is available.]) +AM_CONDITIONAL(HAVE_CLOCK_GETTIME, [test $have_clock_gettime -eq 1]) + +dnl Check for CUDA runtime libraries. 
+AC_ARG_WITH([cuda], + [AC_HELP_STRING([--with-cuda=DIR], + [Provide path to where the CUDA development + and runtime libraries are installed.])], + [], []) + +have_libcuda=0 +AS_IF([test x"$with_cuda" != x"no"], + [FI_CHECK_PACKAGE([cuda], + [cuda_runtime.h], + [cudart], + [cudaMemcpy], + [-lcuda], + [$with_cuda], + [], + [have_libcuda=1], + [], + [])], + []) + +AS_IF([test x"$with_cuda" != x"no" && test -n "$with_cuda" && test "$have_libcuda" = "0" ], + [AC_MSG_ERROR([CUDA support requested but CUDA runtime not available.])], + []) +AC_DEFINE_UNQUOTED([HAVE_LIBCUDA], [$have_libcuda], [Whether we have CUDA runtime or not]) + +AC_ARG_ENABLE([cuda-dlopen], + [AS_HELP_STRING([--enable-cuda-dlopen], + [Enable dlopen of CUDA libraries @<:@default=no@:>@]) + ], + [ + AS_IF([test "$freebsd" == "0"], [ + AC_CHECK_LIB(dl, dlopen, [], + [AC_MSG_ERROR([dlopen not found. libfabric requires libdl.])]) + ]) + AC_DEFINE([ENABLE_CUDA_DLOPEN], [1], [dlopen CUDA libraries]) + ], + [enable_cuda_dlopen=no]) + +AC_ARG_WITH([ze], + AC_HELP_STRING([--with-ze=DIR], [Provide path to where the ZE + libraries and headers are installed.]), + [], []) + +AS_IF([test x"$with_ze" != x"no"], + [FI_CHECK_PACKAGE([ze], + [level_zero/ze_api.h], + [ze_loader], + [zeInit], + [], + [$with_ze], + [], + [AC_DEFINE([HAVE_LIBZE], [1],[ZE support])], + [], []) + CPPFLAGS="$CPPFLAGS $ze_CPPFLAGS" + LDFLAGS="$LDFLAGS $ze_LDFLAGS" + LIBS="$LIBS $ze_LIBS"], + []) + +enable_memhooks=1 +AC_ARG_ENABLE([memhooks-monitor], + [AC_HELP_STRING([--disable-memhooks-monitor], + [Determine whether memhooks memory monitor is disabled.])], + [enable_memhooks=0], + []) + +AC_DEFINE_UNQUOTED(ENABLE_MEMHOOKS_MONITOR, [$enable_memhooks], + [Define to 1 to enable memhooks memory monitor]) + +AS_IF([test "$enable_memhooks" == "1"], [ + AC_CHECK_FUNCS([__curbrk __clear_cache]) + AC_CHECK_HEADERS([linux/mman.h sys/syscall.h]) + AC_CHECK_DECLS([__syscall], [], [], [#include ]) + AC_CHECK_FUNCS([__syscall]) + ], []) + +enable_uffd=1 +AC_ARG_ENABLE([uffd-monitor], + [AC_HELP_STRING([--disable-uffd-monitor], + [Determine whether uffd memory monitor is disabled.])], + [enable_uffd=0], + []) + +AC_DEFINE_UNQUOTED(ENABLE_UFFD_MONITOR, [$enable_uffd], + [Define to 1 to enable uffd memory monitor]) + + +AH_BOTTOM([ +#if defined(__linux__) && (defined(__x86_64__) || defined(__amd64__) || defined(__aarch64__)) && ENABLE_MEMHOOKS_MONITOR +#define HAVE_MEMHOOKS_MONITOR 1 +#else +#define HAVE_MEMHOOKS_MONITOR 0 +#endif + +#if HAVE_UFFD_UNMAP && ENABLE_UFFD_MONITOR +#define HAVE_UFFD_MONITOR 1 +#else +#define HAVE_UFFD_MONITOR 0 +#endif +]) + +CPPFLAGS="$CPPFLAGS $cuda_CPPFLAGS" +LDFLAGS="$LDFLAGS $cuda_LDFLAGS" + +AS_IF([test x"$enable_cuda_dlopen" != x"yes"], [LIBS="$LIBS $cuda_LIBS"]) + +#gdrcopy related configs +AC_ARG_WITH([gdrcopy], + [AC_HELP_STRING([--with-gdrcopy=DIR], + [Provide path to where the gdrcopy development + and runtime libraries are installed.])], + [], []) + +FI_CHECK_PACKAGE([gdrcopy], + [gdrapi.h], + [gdrapi], + [gdr_open], + [-lgdrapi], + [$with_gdrcopy], + [], + [AC_DEFINE([HAVE_GDRCOPY], [1],[gdrcopy support])], + [], []) + +AC_ARG_ENABLE([gdrcopy-dlopen], + [AS_HELP_STRING([--enable-gdrcopy-dlopen], + [Enable dlopen of gdrcopy libraries @<:@default=no@:>@]) + ], + [ + AS_IF([test "$freebsd" == "0"], [ + AC_CHECK_LIB(dl, dlopen, [], + [AC_MSG_ERROR([dlopen not found. 
libfabric requires libdl.])]) + ]) + AC_DEFINE([ENABLE_GDRCOPY_DLOPEN], [1], [dlopen CUDA libraries]) + ], + [enable_gdrcopy_dlopen=no]) + +CPPFLAGS="$CPPFLAGS $gdrcopy_CPPFLAGS" +LDFLAGS="$LDFLAGS $gdrcopy_LDFLAGS" +AS_IF([test x"$enable_gdrcopy_dlopen" != x"yes"], [LIBS="$LIBS $gdrcopy_LIBS"]) +#end gdrcopy configures + +dnl Check for ROCR runtime libraries. +AC_ARG_WITH([rocr], + [AC_HELP_STRING([--with-rocr=DIR], + [Provide path to where the ROCR/HSA development + and runtime libraries are installed.])], + [], []) + +AC_ARG_ENABLE([rocr-dlopen], + [AS_HELP_STRING([--enable-rocr-dlopen], + [Enable dlopen of ROCR libraries @<:@default=no@:>@]) + ], + [ + AS_IF([test "$freebsd" == "0"], [ + AC_CHECK_LIB(dl, dlopen, [], + [AC_MSG_ERROR([dlopen not found. libfabric requires libdl.])]) + ]) + AC_DEFINE([ENABLE_ROCR_DLOPEN], [1], [dlopen ROCR libraries]) + ], + [enable_rocr_dlopen=no]) + +FI_CHECK_PACKAGE([rocr], + [hsa/hsa_ext_amd.h], + [hsa-runtime64], + [hsa_amd_pointer_info], + [], + [$with_rocr], + [$with_rocr/lib], + [AC_DEFINE([HAVE_ROCR], [1], [ROCR HSA support])], + [], []) + +CPPFLAGS="$CPPFLAGS $rocr_CPPFLAGS" +LDFLAGS="$LDFLAGS $rocr_LDFLAGS" + +AS_IF([test x"$enable_rocr_dlopen" != x"yes"], [LIBS="$LIBS $rocr_LIBS"]) + dnl Provider-specific checks FI_PROVIDER_INIT FI_PROVIDER_SETUP([psm]) FI_PROVIDER_SETUP([psm2]) +FI_PROVIDER_SETUP([psm3]) FI_PROVIDER_SETUP([sockets]) FI_PROVIDER_SETUP([verbs]) FI_PROVIDER_SETUP([efa]) dnl The usnic provider must be setup after the verbs provider. See dnl prov/usnic/configure.m4 for details. FI_PROVIDER_SETUP([usnic]) -FI_PROVIDER_SETUP([mlx]) FI_PROVIDER_SETUP([gni]) FI_PROVIDER_SETUP([udp]) FI_PROVIDER_SETUP([tcp]) diff --git a/contrib/buildrpm/buildrpmLibfabric.sh b/contrib/buildrpm/buildrpmLibfabric.sh index 3e8cef4fb86..dd3c3ba971f 100755 --- a/contrib/buildrpm/buildrpmLibfabric.sh +++ b/contrib/buildrpm/buildrpmLibfabric.sh @@ -105,7 +105,7 @@ error() # usage information ################### usage="Usage: $0 [-i provider_name] [-e provider_name] - [-n] [-o] [-m] [-d] [-s] [-c] [-r] [-v] [-h] tarball + [-n] [-o] [-l] [-m] [-d] [-s] [-c] [-r] [-v] [-h] tarball Provider options: @@ -121,6 +121,9 @@ usage="Usage: $0 [-i provider_name] [-e provider_name] -o install under /opt/libfabric/_VERSION_ {default: install under /usr/ } + -l create symbolic link 'default' to _VERSION_ (requires -o option) + {default: link not create} + -m install modulefile {default: don't install modulefile} @@ -160,7 +163,7 @@ usage="Usage: $0 [-i provider_name] [-e provider_name] # parse args ############ export arguments="$@" -while getopts DP:M:V:nomi:e:dc:r:svh flag; do +while getopts DP:M:V:nolmi:e:dc:r:svh flag; do case "$flag" in n) noop="true" ;; @@ -191,6 +194,8 @@ while getopts DP:M:V:nomi:e:dc:r:svh flag; do ;; v) verbose="true" ;; + l) version_symbolic_link="true" + ;; h) echo "$usage" exit 0 ;; @@ -261,6 +266,9 @@ if [[ -n "$install_in_opt" ]]; then if [[ -z "$prefix" ]] ; then prefix=$default_opt_prefix fi + if [[ -n "$version_symbolic_link" ]]; then + rpmbuild_options="$rpmbuild_options --define '_version_symbolic_link $prefix/libfabric/default'" + fi prefix="$prefix/libfabric/$version" if [[ -n "$modulepath" ]] ; then diff --git a/contrib/cray/Jenkinsfile.verbs b/contrib/cray/Jenkinsfile.verbs index 1d6d9d462ef..22ad4246a09 100644 --- a/contrib/cray/Jenkinsfile.verbs +++ b/contrib/cray/Jenkinsfile.verbs @@ -3,6 +3,12 @@ @Library(['CrayNetworkCI@master', 'dst-shared@master']) _ +if (!isBuildable()) { + echo "build request is not valid, skipping build" + 
currentBuild.result = 'SUCCESS' + return +} + pipeline { options { // Generic build options @@ -24,8 +30,11 @@ pipeline { steps { // creating git short hash script { - GIT_SHORT_COMMIT = sh(returnStdout: true, script: "git log -n 1 --pretty=format:'%h'").trim() + GIT_DESCRIPTION = sh(returnStdout: true, script: "git describe --tags").trim() LIBFABRIC_INSTALL = pwd tmp: true + if (changeRequest()) { + SFT_PR_ENV_VAR = 'SFT_PR=1' + } } dir ('contrib/cray/bin') { @@ -37,13 +46,17 @@ pipeline { } } } - stage('Build') { + stage('Build CUDA and ROCR') { options { timeout (time: 5, unit: 'MINUTES') } + environment { + LD_LIBRARY_PATH = "$ROCR_INSTALL_PATH/lib:$CUDA_INSTALL_PATH/lib64:$LD_LIBRARY_PATH" + } steps { sh './autogen.sh' - sh "./configure --prefix=$LIBFABRIC_INSTALL" + sh """./configure --prefix=$LIBFABRIC_INSTALL --disable-memhooks-monitor \ + --with-cuda=$CUDA_INSTALL_PATH --with-rocr=$ROCR_INSTALL_PATH""" sh "make -j 12" sh "make install" dir ("fabtests") { @@ -54,14 +67,122 @@ pipeline { } } } + stage("Verify CUDA and ROCR Build") { + steps { + script { + cuda_link_count = sh(returnStdout: true, + script: """objdump -a -x $LIBFABRIC_INSTALL/lib/libfabric.so | + grep NEED | grep cuda | wc -l""").trim() + if (cuda_link_count != "2") { + error("libfabric failed to link to CUDA") + } + rocr_link_count = sh(returnStdout: true, + script: """objdump -a -x $LIBFABRIC_INSTALL/lib/libfabric.so | + grep NEED | grep hsa | wc -l""").trim() + if (rocr_link_count != "1") { + error("libfabric failed to link to ROCR") + } + } + } + } + stage('Build CUDA and ROCR dlopen') { + options { + timeout (time: 5, unit: 'MINUTES') + } + steps { + sh './autogen.sh' + sh """./configure --prefix=$LIBFABRIC_INSTALL --disable-memhooks-monitor \ + --with-cuda=$CUDA_INSTALL_PATH --enable-cuda-dlopen \ + --with-rocr=$ROCR_INSTALL_PATH --enable-rocr-dlopen""" + sh "make -j 12" + sh "make install" + dir ("fabtests") { + sh './autogen.sh' + sh "./configure --with-libfabric=$LIBFABRIC_INSTALL --prefix=$FABTEST_PATH" + sh "make -j12" + sh "make -j12 install" + } + } + } + stage("Verify CUDA and ROCR Build dlopen") { + steps { + script { + cuda_link_count = sh(returnStdout: true, + script: """objdump -a -x $LIBFABRIC_INSTALL/lib/libfabric.so | + grep NEED | grep cuda | wc -l""").trim() + if (cuda_link_count != "0") { + error("libfabric failed to link to CUDA") + } + rocr_link_count = sh(returnStdout: true, + script: """objdump -a -x $LIBFABRIC_INSTALL/lib/libfabric.so | + grep NEED | grep hsa | wc -l""").trim() + if (rocr_link_count != "0") { + error("libfabric failed to link to ROCR") + } + } + } + } + stage('Build LTU') { + options { + timeout (time: 5, unit: 'MINUTES') + } + steps { + dir ('libfabric-test-utils') { + deleteDir () + } + dir ('libfabric-test-utils') { + git url: "ssh://${env.LTU_GIT_REPO}", credentialsId: 'jenkins-nw-cje2-sshkey', branch: "${env.SRC_GIT_BRANCH}" + sh "git remote -v" + script { + LTU_VERSION = sh(returnStdout: true, script: "cat .version").trim() + GIT_SHORT_COMMIT = sh(returnStdout: true, script: "git log -n 1 --pretty=format:'%h'").trim() + LTU_VERSION = "${LTU_VERSION}" + "_${GIT_SHORT_COMMIT}" + } + echo "*** Building libfabric-test-utils, Version: ${LTU_VERSION} ***" + sh "./autogen.sh" + sh """./configure --prefix=${LIBFABRIC_INSTALL} --with-libfabric=${LIBFABRIC_INSTALL} \ + --with-nvidia=${CUDA_INSTALL_PATH} --with-amd=${ROCR_INSTALL_PATH} \ + --with-pmi=${PMI_INSTALL_PATH} --with-pmi_include=${PMI_INCLUDE_PATH} \ + --with-ltu-build-string=\"libfabric-test-utils-${LTU_VERSION}\"""" 
+ sh "make -j 10" + sh "make install" + } + } + } + stage('Build SFT') { + options { + timeout (time: 5, unit: 'MINUTES') + } + steps { + dir ('libfabric-sft') { + deleteDir () + } + dir ('libfabric-sft') { + git url: "ssh://${env.SFT_GIT_REPO}", credentialsId: 'jenkins-nw-cje2-sshkey', branch: "${env.SRC_GIT_BRANCH}" + sh "git remote -v" + script { + SFT_VERSION = sh(returnStdout: true, script: "cat .version").trim() + GIT_SHORT_COMMIT = sh(returnStdout: true, script: "git log -n 1 --pretty=format:'%h'").trim() + SFT_VERSION = "${SFT_VERSION}" + "_${GIT_SHORT_COMMIT}" + } + echo "*** Building libfabric-sft, Version: ${SFT_VERSION} ***" + sh "./autogen.sh" + sh """./configure --prefix=${LIBFABRIC_INSTALL} --with-libfabric=${LIBFABRIC_INSTALL} \ + --with-libltu=${LIBFABRIC_INSTALL} \ + --with-sft-build-string=\"libfabric-sft-${LTU_VERSION}\"""" + sh "make -j 10" + sh "make install" + } + } + } stage('Test') { environment { LD_LIBRARY_PATH = "$LIBFABRIC_INSTALL/lib:$LD_LIBRARY_PATH" MPIR_CVAR_OFI_USE_PROVIDER = 'verbs;ofi_rxm' LIBFABRIC_INSTALL_PATH = "$LIBFABRIC_INSTALL" - SFT_BIN = "${SFT_INSTALL_PATH + '/bin'}" + SFT_BIN = "${LIBFABRIC_INSTALL + '/bin'}" SFT_MAX_JOB_TIME = '3' - SFT_NUM_JOBS = '4' + SFT_NUM_JOBS = '6' SFT_PROVIDER = 'verbs;ofi_rxm' SFT_BASELINE_DIR = "contrib/cray" SFT_BASELINE_RESULTS_FILE = 'sft_test_results_baseline.txt' @@ -71,6 +192,7 @@ pipeline { SFT_TEST_RESULTS_EXPECTED = 'expected_' SFT_TEST_RESULTS_PREFIX = 'BUILD_' SFT_TEST_RESULTS_CI = 'sft_ci_results.yaml' + FI_VERBS_MIN_RNR_TIMER= '4' } options { timeout (time: 22, unit: 'MINUTES') @@ -164,7 +286,7 @@ pipeline { try { dir ("${SFT_BIN}") { sh """ - ./ci-all.sh \\ + ${SFT_PR_ENV_VAR} ./ci-all.sh \\ --provider '${SFT_PROVIDER}' \\ -L ${SFT_TEST_RESULTS_DIR} \\ --num-jobs ${SFT_NUM_JOBS} \\ @@ -211,7 +333,7 @@ pipeline { try { dir ("${SFT_BIN}") { sh """ - ./ci-all.sh \\ + ${SFT_PR_ENV_VAR} ./ci-all.sh \\ --provider '${SFT_PROVIDER}' \\ -L ${SFT_TEST_RESULTS_DIR} \\ --num-jobs ${SFT_NUM_JOBS} \\ @@ -251,17 +373,27 @@ pipeline { } stage("Applications") { steps { - tee ('mpi.tap') { + tee ('omb.tap') { timeout(time: 10, unit: 'MINUTES') { - sh '${BATS_INSTALL_PATH}/bats -t contrib/cray/bats/mpi.bats' + sh '${BATS_INSTALL_PATH}/bats -t contrib/cray/bats/omb.bats' } } + tee ('imb.tap') { + timeout(time: 20, unit: 'MINUTES') { + sh '${BATS_INSTALL_PATH}/bats -t contrib/cray/bats/imb.bats' + } + } } post { always { sh """contrib/cray/bin/parse_logfiles.sh \\ - -r mpi.tap \\ - -w mpi.xml \\ + -r omb.tap \\ + -w omb.xml \\ + tap applications.mpi applications""" + + sh """contrib/cray/bin/parse_logfiles.sh \\ + -r imb.tap \\ + -w imb.xml \\ tap applications.mpi applications""" } } @@ -269,61 +401,82 @@ pipeline { } post { always { - step ([$class: 'XUnitBuilder', + step ([$class: 'XUnitPublisher', thresholds: [ [$class: 'FailedThreshold', unstableThreshold: '0']], tools: [[$class: 'JUnitType', pattern: "smoketests.xml"]]]) - step ([$class: 'XUnitBuilder', + step ([$class: 'XUnitPublisher', thresholds: [ [$class: 'FailedThreshold', unstableThreshold: '0']], tools: [[$class: 'JUnitType', pattern: "*-rc.xml"]]]) - step ([$class: 'XUnitBuilder', + step ([$class: 'XUnitPublisher', thresholds: [ [$class: 'FailedThreshold', unstableThreshold: '0']], tools: [[$class: 'JUnitType', pattern: "*-xrc.xml"]]]) - step ([$class: 'XUnitBuilder', + step ([$class: 'XUnitPublisher', thresholds: [ [$class: 'FailedThreshold', unstableThreshold: '0']], tools: [[$class: 'JUnitType', pattern: "sft_test_results/RC/sft_*_test_results.xml"]]]) - 
step ([$class: 'XUnitBuilder', + step ([$class: 'XUnitPublisher', thresholds: [ [$class: 'FailedThreshold', unstableThreshold: '0']], tools: [[$class: 'JUnitType', pattern: "sft_test_results/XRC/sft_*_test_results.xml"]]]) - step ([$class: 'XUnitBuilder', + step ([$class: 'XUnitPublisher', + thresholds: [ + [$class: 'FailedThreshold', unstableThreshold: '0']], + tools: [[$class: 'JUnitType', pattern: "omb.xml"]]]) + step ([$class: 'XUnitPublisher', thresholds: [ [$class: 'FailedThreshold', unstableThreshold: '0']], - tools: [[$class: 'JUnitType', pattern: "mpi.xml"]]]) + tools: [[$class: 'JUnitType', pattern: "imb.xml"]]]) + } + cleanup { + echo "*** Test: Post: Cleanup: env.BRANCH_NAME: ${env.BRANCH_NAME} ***" + echo "*** Test: Post: Cleanup: isOfiwgBuild: " + isOfiwgBuild() + " ***" + script { + if ( isInternalBuild() ) { + echo "*** Test: Post: Cleanup: isInternalBuild: TRUE ***" + } else { + echo "*** Test: Post: Cleanup: isInternalBuild: FALSE ***" + } + } + echo "*** Test: Post: Cleanup: currentBuild.currentResult: ${currentBuild.currentResult} ***" } } } stage("Install Libfabric Build") { when { allOf { - expression { currentBuild.result == 'SUCCESS' } ; - anyOf { - expression { env.BRANCH_NAME == 'master' } ; - buildingTag() ; - } + expression { currentBuild.currentResult == 'SUCCESS' } ; } } environment { - LIBFABRIC_INSTALL_PATH="${LIBFABRIC_BUILD_PATH + '/' + GIT_SHORT_COMMIT}" + LIBFABRIC_INSTALL_PATH="${LIBFABRIC_BUILD_PATH + '/' + GIT_DESCRIPTION}" } steps { - sh './autogen.sh' - sh "./configure --prefix=$LIBFABRIC_INSTALL_PATH" - sh "make -j 12" - sh "make install" + script { + BUILD_LIBFABRIC = 'false' + if ( isInternalBuild && + (( env.BRANCH_NAME == 'master' ) || buildingTag())) { + BUILD_LIBFABRIC = 'true' + } else if ( isOfiwgBuild() && ( env.BRANCH_NAME == 'master' )) { + LIBFABRIC_INSTALL_PATH="${LIBFABRIC_BUILD_PATH + '/' + 'OFIWG_' + GIT_DESCRIPTION}" + BUILD_LIBFABRIC = 'true' + } + echo "*** Install Libfabric Build: BUILD_LIBFABRIC: $BUILD_LIBFABRIC ***" + if ( BUILD_LIBFABRIC == 'true' ) { + sh "./autogen.sh" + sh "./configure --prefix=$LIBFABRIC_INSTALL_PATH --disable-memhooks-monitor" + sh "make -j 12" + sh "make install" + } + } } } stage("Deploy") { when { allOf { - expression { currentBuild.result == 'SUCCESS' } ; - anyOf { - expression { env.BRANCH_NAME == 'master' } ; - buildingTag() - } + expression { currentBuild.currentResult == 'SUCCESS' } ; } } options { @@ -336,85 +489,50 @@ pipeline { parallel { stage("Create nightly link") { when { - expression { env.BRANCH_NAME == 'master' } + allOf { + expression { isInternalBuild() } ; + expression { env.BRANCH_NAME == 'master' } + } } steps { dir (env.TAG_DIRECTORY) { sh "rm -f nightly || true" - sh "ln -s ../$GIT_SHORT_COMMIT nightly" + sh "ln -s ../$GIT_DESCRIPTION nightly" } } } stage("Create tagged link") { when { - buildingTag() + allOf { + expression { isInternalBuild() } ; + buildingTag() + } } steps { dir (env.TAG_DIRECTORY) { sh "rm -f $BRANCH_NAME || true" - sh "ln -s ../$GIT_SHORT_COMMIT $BRANCH_NAME" + sh "ln -s ../$GIT_DESCRIPTION $BRANCH_NAME" } } } - stage("Create RPMs") { - steps { - sh 'make dist-bzip2' - sh '''$WORKSPACE/contrib/buildrpm/buildrpmLibfabric.sh \ - -i verbs \ - -i sockets \ - -smv \ - -r '--define "_prefix /opt/cray/libfabric/$version"' \ - -r '--define "modulefile_path /opt/cray/modulefiles"' \ - $(ls libfabric-*.tar.bz2)''' + stage("Create upstream link") { + when { + allOf { + expression { isOfiwgBuild() } ; + expression { env.BRANCH_NAME == 'master' } + } } - post { - success 
{ - stash name: 'rpms', includes: 'rpmbuild/RPMS/**/*' - stash name: 'sources', includes: 'rpmbuild/SOURCES/*' + steps { + dir (env.TAG_DIRECTORY) { + sh "rm -f upstream || true" + sh "ln -s ../OFIWG_$GIT_DESCRIPTION upstream" } } } } } - stage('Publish') { - when { - allOf { - expression { currentBuild.result == 'SUCCESS' } ; - expression { return isRelease("${env.GIT_BRANCH}") } - } - } - agent { - node { - label 'utility_pod' - } - } - steps { - container('utility') { - sh 'tar -cvzf /tmp/libfabric-source.tar.gz --exclude "*.log" .' - - // publishes the source RPM to DST's Artifactory instance - transfer(artifactName: '/tmp/libfabric-source.tar.gz') - - // Sends event to message bus to notify other builds - publishEvents(["os-networking-libfabric-verbs-publish"]) - } - } - } } post { - success { - script { - try { - unstash 'rpms' - unstash 'sources' - archiveArtifacts 'rpmbuild/SOURCES/*' - archiveArtifacts 'rpmbuild/RPMS/**/*' - } - catch (Exception e) { - echo 'No rpms to archive' - } - } - } changed { script { // send email when the state of the pipeline changes @@ -444,9 +562,20 @@ pipeline { ROOT_BUILD_PATH = "/scratch/jenkins/builds" FABTEST_PATH = "${WORKSPACE + '/installs/fabtests'}" LIBFABRIC_BUILD_PATH = "${ROOT_BUILD_PATH + '/libfabric'}" - OMB_BUILD_PATH = "${ROOT_BUILD_PATH + '/osu-micro-benchmarks/5.4.2/libexec/osu-micro-benchmarks/mpi'}" - MPICH_PATH = "${ROOT_BUILD_PATH + '/mpich/3.3b3'}" + OMB_BUILD_PATH = "${ROOT_BUILD_PATH + '/osu-micro-benchmarks/stable/libexec/osu-micro-benchmarks/mpi'}" + IMB_BUILD_PATH = "${ROOT_BUILD_PATH + '/imb/v2019.6'}" + MPICH_PATH = "${ROOT_BUILD_PATH + '/mpich/stable'}" SFT_INSTALL_PATH = "${ROOT_BUILD_PATH + '/libfabric-sft/stable'}" + SFT_PR_ENV_VAR = 'SFT_PR=0' BATS_INSTALL_PATH = "${ROOT_BUILD_PATH + '/bats/stable/bin'}" + CUDA_INSTALL_PATH = "/scratch/opt/cuda" + ROCR_INSTALL_PATH = "/opt/rocm" + PMI_INCLUDE_PATH = "/usr/include/slurm" + PMI_INSTALL_PATH = "/usr/lib64" + LTU_VERSION = "0.0.0" + SFT_VERSION = "0.0.0" + LTU_GIT_REPO = 'git@stash.us.cray.com:7999/ofi-cray/libfabric-test-utils.git' + SFT_GIT_REPO = 'git@stash.us.cray.com:7999/ofi-cray/libfabric-sft.git' + SRC_GIT_BRANCH = 'master' } } diff --git a/contrib/cray/bats/batsgenerator.sh b/contrib/cray/bats/batsgenerator.sh new file mode 100755 index 00000000000..cee576721fc --- /dev/null +++ b/contrib/cray/bats/batsgenerator.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# Generate a bats file to run Intel MPI Benchmarks +# Assumes IMB test suite has been installed and included in Jenkinsfile.verbs file +# Example: +# Add IMB-EXT Windows test running 20 ranks, 5 ranks per node to imb.bats +# ./batsgenerator.sh IMB-EXT windows 20 5 imb.bats + +# Insert shebang and load test helper +shebang="#!/usr/bin/env bats\n\n" +fi_env="XRC_FI_ENV=\"FI_VERBS_XRCD_FILENAME=/tmp/xrc_imb_\$\$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1\"\n\n" + +# Command line input: test suite +# E.g. IMB-EXT +test_suite=$1 +shift + +# Command line input: benchmark +# E.g. windows +benchmark=$1 +shift + +# Command line input: number of ranks +# E.g. 20 +num_ranks=$1 +shift + +# Command line input: number of ranks per node (rpn) +# E.g. 5 +num_rpn=$1 +shift + +#Command line input: name of bats file +# E.g. imb.bats +bats_file=$1 + +shift +if [[ $# -gt 0 ]] ; then + iter_flag=" -iter $1" +else + iter_flag="" +fi + +if [ ! 
-f "${bats_file}" ]; then + printf "${shebang}load test_helper\n\n${fi_env}" >> ${bats_file} +fi + +sed -e "s/@TEST_SUITE@/${test_suite}/g" \ + -e "s/@BENCHMARK@/${benchmark}/g" \ + -e "s/@RANKS@/${num_ranks}/g" \ + -e "s/@RPN@/${num_rpn}/g" \ + -e "s/@ITER_FLAG@/${iter_flag}/g" \ + benchmark.template >> ${bats_file} diff --git a/contrib/cray/bats/benchmark.template b/contrib/cray/bats/benchmark.template new file mode 100644 index 00000000000..8544a0eeeb1 --- /dev/null +++ b/contrib/cray/bats/benchmark.template @@ -0,0 +1,14 @@ +# RC +@test "@TEST_SUITE@ @BENCHMARK@ @RANKS@ ranks, @RPN@ ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher @RANKS@ @RPN@) timeout 300 "$IMB_BUILD_PATH/@TEST_SUITE@ -npmin @RANKS@@ITER_FLAG@ -time 10 -mem 2 -msglog 2:18 @BENCHMARK@" + [ "$status" -eq 0 ] +} + +# XRC +@test "@TEST_SUITE@ @BENCHMARK@ @RANKS@ ranks, @RPN@ ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher @RANKS@ @RPN@) timeout 300 "$IMB_BUILD_PATH/@TEST_SUITE@ -npmin @RANKS@@ITER_FLAG@ -time 10 -mem 2 -msglog 2:18 @BENCHMARK@" + [ "$status" -eq 0 ] +} diff --git a/contrib/cray/bats/imb.bats b/contrib/cray/bats/imb.bats new file mode 100644 index 00000000000..4b9d95626ed --- /dev/null +++ b/contrib/cray/bats/imb.bats @@ -0,0 +1,664 @@ +#!/usr/bin/env bats + +load test_helper + +XRC_FI_ENV="FI_VERBS_XRCD_FILENAME=/tmp/xrc_imb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1" + +# RC +@test "IMB-P2P unirandom 2 ranks, 1 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-P2P -npmin 2 -time 10 -mem 2 -msglog 2:18 unirandom" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-P2P unirandom 2 ranks, 1 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-P2P -npmin 2 -time 10 -mem 2 -msglog 2:18 unirandom" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-P2P birandom 2 ranks, 1 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-P2P -npmin 2 -time 10 -mem 2 -msglog 2:18 birandom" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-P2P birandom 2 ranks, 1 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-P2P -npmin 2 -time 10 -mem 2 -msglog 2:18 birandom" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-P2P corandom 2 ranks, 1 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-P2P -npmin 2 -time 10 -mem 2 -msglog 2:18 corandom" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-P2P corandom 2 ranks, 1 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-P2P -npmin 2 -time 10 -mem 2 -msglog 2:18 corandom" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-RMA bidir_get 2 ranks, 1 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 bidir_get" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-RMA bidir_get 2 ranks, 1 ranks per node using XRC verbs" { + eval 
${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 bidir_get" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-RMA bidir_put 2 ranks, 1 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 bidir_put" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-RMA bidir_put 2 ranks, 1 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 bidir_put" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-RMA unidir_get 2 ranks, 1 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 unidir_get" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-RMA unidir_get 2 ranks, 1 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 unidir_get" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-RMA unidir_put 2 ranks, 1 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 unidir_put" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-RMA unidir_put 2 ranks, 1 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 2 1) timeout 300 "$IMB_BUILD_PATH/IMB-RMA -npmin 2 -time 10 -mem 2 -msglog 2:18 unidir_put" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-EXT window 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-EXT -npmin 20 -time 10 -mem 2 -msglog 2:18 window" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-EXT window 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-EXT -npmin 20 -time 10 -mem 2 -msglog 2:18 window" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-EXT accumulate 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-EXT -npmin 20 -time 10 -mem 2 -msglog 2:18 accumulate" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-EXT accumulate 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-EXT -npmin 20 -time 10 -mem 2 -msglog 2:18 accumulate" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC ialltoall 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ialltoall" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC ialltoall 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 
-mem 2 -msglog 2:18 ialltoall" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC ialltoall_pure 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ialltoall_pure" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC ialltoall_pure 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ialltoall_pure" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC ialltoallv 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ialltoallv" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC ialltoallv 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ialltoallv" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC ialltoallv_pure 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ialltoallv_pure" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC ialltoallv_pure 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ialltoallv_pure" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC iallgather 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgather" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC iallgather 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgather" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC iallgather_pure 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgather_pure" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC iallgather_pure 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgather_pure" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC iallgatherv 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgatherv" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC iallgatherv 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 
"$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgatherv" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC iallgatherv_pure 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgatherv_pure" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC iallgatherv_pure 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallgatherv_pure" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC iallreduce 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallreduce" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC iallreduce 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallreduce" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC iallreduce_pure 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallreduce_pure" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC iallreduce_pure 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iallreduce_pure" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC ibarrier 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibarrier" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC ibarrier 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibarrier" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC ibarrier_pure 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibarrier_pure" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC ibarrier_pure 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibarrier_pure" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC ibcast 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibcast" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC ibcast 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + 
$(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibcast" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC ibcast_pure 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibcast_pure" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC ibcast_pure 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ibcast_pure" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC igather 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igather" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC igather 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igather" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC igather_pure 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igather_pure" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC igather_pure 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igather_pure" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC igatherv 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igatherv" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC igatherv 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igatherv" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC igatherv_pure 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igatherv_pure" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC igatherv_pure 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 igatherv_pure" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC ireduce 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ireduce" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC ireduce 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 
20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ireduce" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC ireduce_pure 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ireduce_pure" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC ireduce_pure 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ireduce_pure" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC ireduce_scatter 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ireduce_scatter" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC ireduce_scatter 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 ireduce_scatter" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC iscatter 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatter" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC iscatter 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatter" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC iscatter_pure 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatter_pure" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC iscatter_pure 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatter_pure" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC iscatterv 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatterv" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC iscatterv 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatterv" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-NBC iscatterv_pure 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatterv_pure" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-NBC iscatterv_pure 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w 
${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-NBC -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 iscatterv_pure" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-MPI1 reduce 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 reduce" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-MPI1 reduce 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 reduce" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-MPI1 reduce_scatter 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 reduce_scatter" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-MPI1 reduce_scatter 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 reduce_scatter" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-MPI1 reduce_scatter_block 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 reduce_scatter_block" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-MPI1 reduce_scatter_block 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 reduce_scatter_block" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-MPI1 allreduce 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 allreduce" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-MPI1 allreduce 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 allreduce" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-MPI1 allgather 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 allgather" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-MPI1 allgather 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 allgather" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-MPI1 allgatherv 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 allgatherv" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-MPI1 allgatherv 20 ranks, 5 ranks per node using XRC verbs" { + 
eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 allgatherv" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-MPI1 scatter 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 scatter" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-MPI1 scatter 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 scatter" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-MPI1 scatterv 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 scatterv" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-MPI1 scatterv 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 scatterv" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-MPI1 gather 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 gather" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-MPI1 gather 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 gather" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-MPI1 gatherv 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 gatherv" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-MPI1 gatherv 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 gatherv" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-MPI1 alltoall 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 alltoall" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-MPI1 alltoall 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 alltoall" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-MPI1 bcast 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 bcast" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-MPI1 bcast 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w 
${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 bcast" + [ "$status" -eq 0 ] +} +# RC +@test "IMB-MPI1 barrier 20 ranks, 5 ranks per node using RC verbs" { + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 barrier" + [ "$status" -eq 0 ] +} + +# XRC +@test "IMB-MPI1 barrier 20 ranks, 5 ranks per node using XRC verbs" { + eval ${XRC_FI_ENV} \ + run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + $(batch_launcher 20 5) timeout 300 "$IMB_BUILD_PATH/IMB-MPI1 -npmin 20 -iter 100 -time 10 -mem 2 -msglog 2:18 barrier" + [ "$status" -eq 0 ] +} diff --git a/contrib/cray/bats/mpi.bats b/contrib/cray/bats/omb.bats similarity index 75% rename from contrib/cray/bats/mpi.bats rename to contrib/cray/bats/omb.bats index 48c96e143c4..37693d860bf 100644 --- a/contrib/cray/bats/mpi.bats +++ b/contrib/cray/bats/omb.bats @@ -203,199 +203,199 @@ load test_helper # XRC @test "osu_latency 2 ranks, 1 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 2 1) timeout 300 $OMB_BUILD_PATH/pt2pt/osu_latency [ "$status" -eq 0 ] } @test "osu_bw 2 ranks, 1 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 2 1) timeout 300 $OMB_BUILD_PATH/pt2pt/osu_bw [ "$status" -eq 0 ] } @test "osu_mbw_mr 8 ranks, 4 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 8 4) timeout 300 $OMB_BUILD_PATH/pt2pt/osu_mbw_mr [ "$status" -eq 0 ] } @test "osu_get_latency 2 ranks, 1 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 2 1) timeout 300 $OMB_BUILD_PATH/one-sided/osu_get_latency [ "$status" -eq 0 ] } @test "osu_get_bw 2 ranks, 1 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 2 1) timeout 300 $OMB_BUILD_PATH/one-sided/osu_get_bw [ "$status" -eq 0 ] } @test "osu_put_latency 2 ranks, 1 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 2 1) timeout 300 $OMB_BUILD_PATH/one-sided/osu_put_latency [ "$status" -eq 0 ] } @test "osu_put_bw 2 ranks, 1 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run 
$CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 2 1) timeout 300 $OMB_BUILD_PATH/one-sided/osu_put_bw [ "$status" -eq 0 ] } @test "osu_put_bibw 2 ranks, 1 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 2 1) timeout 300 $OMB_BUILD_PATH/one-sided/osu_put_bibw [ "$status" -eq 0 ] } @test "osu_allreduce 40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_allreduce [ "$status" -eq 0 ] } @test "osu_allgather 40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_allgather [ "$status" -eq 0 ] } @test "osu_allgatherv 40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_allgatherv [ "$status" -eq 0 ] } @test "osu_alltoall 20 ranks, 5 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 20 5) timeout 300 $OMB_BUILD_PATH/collective/osu_alltoall [ "$status" -eq 0 ] } @test "osu_alltoallv 20 ranks, 5 ranks per node using XRC verbs" { skip "fails consistently at 128k message size" - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 20 5) timeout 300 $OMB_BUILD_PATH/collective/osu_alltoallv [ "$status" -eq 0 ] } @test "osu_barrier 40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_barrier [ "$status" -eq 0 ] } @test "osu_bcast 40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_bcast [ "$status" -eq 0 ] } @test "osu_gather 
40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_gather [ "$status" -eq 0 ] } @test "osu_gatherv 40 ranks, 10 ranks per node using XRC verbs" { skip "fails intermittently" - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_gatherv [ "$status" -eq 0 ] } @test "osu_iallgather 40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_iallgather [ "$status" -eq 0 ] } @test "osu_iallgatherv 40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_iallgatherv [ "$status" -eq 0 ] } @test "osu_ialltoall 20 ranks, 5 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 20 5) timeout 300 $OMB_BUILD_PATH/collective/osu_ialltoall [ "$status" -eq 0 ] } @test "osu_ialltoallv 20 ranks, 5 ranks per node using XRC verbs" { skip "fails consistently at 128k message size" - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 20 5) timeout 300 $OMB_BUILD_PATH/collective/osu_ialltoallv [ "$status" -eq 0 ] } @test "osu_ialltoallw 20 ranks, 5 ranks per node using XRC verbs" { skip "fails consistently at 128k message size" - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 20 5) timeout 300 $OMB_BUILD_PATH/collective/osu_ialltoallw [ "$status" -eq 0 ] } @test "osu_ibarrier 40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_ibarrier [ "$status" -eq 0 ] } @test "osu_ibcast 40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 
FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_ibcast [ "$status" -eq 0 ] } @test "osu_igather 40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_igather [ "$status" -eq 0 ] } @test "osu_igatherv 40 ranks, 10 ranks per node using XRC verbs" { skip "fails intermittently" - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_igatherv [ "$status" -eq 0 ] } @test "osu_iscatter 40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_iscatter [ "$status" -eq 0 ] } @test "osu_iscatterv 40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_iscatterv [ "$status" -eq 0 ] } @test "osu_reduce 40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_reduce [ "$status" -eq 0 ] } @test "osu_reduce_scatter 40 ranks, 10 ranks per node using XRC verbs" { skip "fails consistently at 512K message size" - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_reduce_scatter [ "$status" -eq 0 ] } @test "osu_scatter 40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_scatter [ "$status" -eq 0 ] } @test "osu_scatterv 40 ranks, 10 ranks per node using XRC verbs" { - FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ + FI_VERBS_XRCD_FILENAME=/tmp/xrc_omb_$$ FI_OFI_RXM_USE_SRX=1 FI_VERBS_PREFER_XRC=1 run $CONTRIB_BIN/logwrap -w ${BATS_TEST_LOGFILE} -- \ $(batch_launcher 40 10) timeout 300 $OMB_BUILD_PATH/collective/osu_scatterv [ "$status" -eq 0 ] } diff --git a/contrib/cray/bin/run_libfabric_pipeline 
b/contrib/cray/bin/run_libfabric_pipeline index 6383aea9ca5..88978f4ee12 100755 --- a/contrib/cray/bin/run_libfabric_pipeline +++ b/contrib/cray/bin/run_libfabric_pipeline @@ -21,9 +21,10 @@ BUILD=true TEST=true UNITTEST=true SMOKETEST=true +IMBTEST=true +OMBTEST=true FABTEST=true SFT=true -MPI=true function usage { echo \ @@ -38,15 +39,16 @@ function set_sections_to_run { TEST=false UNITTEST=false SMOKETEST=false + IMBTEST=false + OMBTEST=false FABTEST=false SFT=false - MPI=false sections=$(echo $@ | tr ',' ' ') for section in $sections ; do section_name=$(echo $section | awk '{print toupper($0)}') case $section_name in - 'UNITTEST'|'SMOKETEST'|'FABTEST'|'SFT'|'MPI') + 'UNITTEST'|'SMOKETEST'|'IMBTEST'|'OMBTEST'|'FABTEST'|'SFT') TEST=true eval ${section_name}=true ;; @@ -55,17 +57,19 @@ function set_sections_to_run { TEST=true UNITTEST=true SMOKETEST=true + IMBTEST=true + OMBTEST=true FABTEST=true SFT=true - MPI=true ;; 'TEST') TEST=true UNITTEST=true SMOKETEST=true + IMBTEST=true + OMBTEST=true FABTEST=true SFT=true - MPI=true ;; 'BUILD') BUILD=true @@ -75,7 +79,7 @@ function set_sections_to_run { esac done - for each in BUILD TEST UNITTEST SMOKETEST SFT MPI ; do + for each in BUILD TEST UNITTEST SMOKETEST IMBTEST OMBTEST SFT ; do if $DEBUG ; then echo ${each} = $(eval echo \$$each) ; fi done } @@ -127,7 +131,7 @@ verbose "CLEAN: $CLEAN" verbose "SECTIONS: $SECTIONS" verbose "WORKSPACE: $WORKSPACE" -for each in BUILD TEST UNITTEST SMOKETEST FABTEST MPI SFT ; do +for each in BUILD TEST UNITTEST SMOKETEST IMBTEST OMBTEST FABTEST SFT ; do verbose "$each: $(eval echo \$$each)" done @@ -168,10 +172,11 @@ export ROOT_BUILD_PATH="/scratch/jenkins/builds" export FABTEST_PATH="${WORKSPACE}/fabtests" export LIBFABRIC_BUILD_PATH="${ROOT_BUILD_PATH}/libfabric" export OMB_BUILD_PATH="${ROOT_BUILD_PATH}/osu-micro-benchmarks/5.4.2/libexec/osu-micro-benchmarks/mpi" -export MPICH_PATH="${ROOT_BUILD_PATH}/mpich/3.3b3" +export MPICH_PATH="${ROOT_BUILD_PATH}/mpich/stable" export SFT_INSTALL_PATH="${ROOT_BUILD_PATH}/libfabric-sft/stable" export BATS_INSTALL_PATH="${ROOT_BUILD_PATH}/bats/stable/bin" export BATS_LOG_DIRECTORY="$WORKSPACE/logs" +export IMB_BUILD_PATH="${ROOT_BUILD_PATH}/imb/v2019.6" # End pipeline variables # Start Prologue @@ -228,6 +233,22 @@ $BATS_INSTALL_PATH/bats $@ -t contrib/cray/bats/smoketests.bats | tee smoketests section_end 'smoke tests' fi +if $IMBTEST ; then +section_start 'imb tests' +## Start IMB Tests +$BATS_INSTALL_PATH/bats $@ -t contrib/cray/bats/imb.bats | tee imb.tap +## End IMB Tests +section_end 'imb tests' +fi + +if $OMBTEST ; then +section_start 'omb tests' +## Start OMB Tests +$BATS_INSTALL_PATH/bats $@ -t contrib/cray/bats/omb.bats | tee omb.tap +## End OMB Tests +section_end 'omb tests' +fi + if $FABTEST ; then section_start 'fabtests' ## Start Fabtests @@ -286,8 +307,8 @@ timeout 900 ./ci-all.sh \ --results-file ${SFT_TEST_RESULTS_DIR}/${SFT_TEST_RESULTS_CI} popd -cp ./${SFT_BASELINE_DIR}/${SFT_BASELINE_RESULTS_FILE} ${SFT_TEST_RESULTS_DIR}/ \ - ${SFT_TEST_RESULTS_EXPECTED}${SFT_BASELINE_RESULTS_FILE} +cp ./${SFT_BASELINE_DIR}/${SFT_BASELINE_RESULTS_FILE} \ + ${SFT_TEST_RESULTS_DIR}/${SFT_TEST_RESULTS_EXPECTED}${SFT_BASELINE_RESULTS_FILE} ${SFT_BIN}/sft_parse_test_results.pm \ -b ${SFT_TEST_RESULTS_EXPECTED}${SFT_BASELINE_RESULTS_FILE} \ -d ${SFT_TEST_RESULTS_DIR} \ @@ -304,12 +325,4 @@ rm -rf ${SFT_TEST_RESULTS_DIR} || true section_end 'sft' fi -if $MPI ; then -section_start 'mpi' -## Start MPI Tests -$BATS_INSTALL_PATH/bats -t contrib/cray/bats/mpi.bats | tee 
mpi.tap -## End MPI Tests -section_end 'mpi' -fi - fi diff --git a/contrib/cray/python/parse_results.py b/contrib/cray/python/parse_results.py index f83a81208c1..d24ac1a8545 100755 --- a/contrib/cray/python/parse_results.py +++ b/contrib/cray/python/parse_results.py @@ -86,6 +86,8 @@ def fabtests_testcase_parser(log, classname_prefix): result = 'pass' elif data[1] == 'Notrun': result = 'skip' + elif data[1] == 'Excluded': + result = 'skip' else: result = 'fail' elif line.startswith(' time:'): diff --git a/contrib/intel/jenkins/Jenkinsfile b/contrib/intel/jenkins/Jenkinsfile new file mode 100644 index 00000000000..fba9d6eac32 --- /dev/null +++ b/contrib/intel/jenkins/Jenkinsfile @@ -0,0 +1,216 @@ + +pipeline { + agent any + options { + timestamps() + timeout(activity: true, time: 4, unit: 'HOURS') + } + environment { + JOB_CADENCE = 'PR' + } + + stages { + stage ('fetch-opa-psm2') { + steps { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']) { + dir('opa-psm2-lib') { + + checkout changelog: false, poll: false, scm: [$class: 'GitSCM', \ + branches: [[name: '*/master']], \ + doGenerateSubmoduleConfigurations: false, extensions: [], submoduleCfg: [], \ + userRemoteConfigs: [[url: 'https://github.com/intel/opa-psm2.git']]] + } + } + } + } + + stage ('build-libfabric') { + steps { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']) { + sh """ + python3.7 contrib/intel/jenkins/build.py 'libfabric' --ofi_build_mode='dbg' + echo "libfabric build completed" + """ + } + } + } + stage('build-fabtests') { + steps { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']) { + sh """ + python3.7 contrib/intel/jenkins/build.py 'fabtests' --ofi_build_mode='dbg' + echo 'fabtests build completed' + """ + } + } + } + + stage ('build-shmem') { + steps { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']){ + sh """ + python3.7 contrib/intel/jenkins/build.py 'shmem' --ofi_build_mode='dbg' + echo 'shmem benchmarks built successfully' + """ + } + } + } + stage('build MPICH_bm') { + steps { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']){ + sh """ + python3.7 contrib/intel/jenkins/build.py 'mpich_benchmarks' --ofi_build_mode='dbg' + echo "mpi benchmarks with mpich - built successfully" + """ + } + } + } + stage('build IMPI_bm') { + steps { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']){ + sh """ + python3.7 contrib/intel/jenkins/build.py 'impi_benchmarks' --ofi_build_mode='dbg' + echo 'mpi benchmarks with impi - built successfully' + """ + } + } + } + + stage ('build OMPI_bm') { + steps { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']){ + sh """ + python3.7 contrib/intel/jenkins/build.py 'ompi_benchmarks' --ofi_build_mode='dbg' + echo 'mpi benchmarks with ompi - built successfully' + """ + } + } + } + + stage('parallel-tests') { + parallel { + stage('eth-tcp-dbg') { + agent {node {label 'eth'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) + { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=tcp --ofi_build_mode='dbg' + ) + """ + } + } + } + stage('eth-udp-shm-dbg') { + agent {node {label 'eth'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) + { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=udp --ofi_build_mode='dbg' + python3.7 runtests.py --prov=udp --util=rxd --ofi_build_mode='dbg' + 
python3.7 runtests.py --prov=shm --ofi_build_mode='dbg' + ) + """ + } + } + + } + stage('hfi1-psm2-verbs-dbg') { + agent {node {label 'hfi1'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=psm2 --ofi_build_mode='dbg' + python3.7 runtests.py --prov=verbs --ofi_build_mode='dbg' + ) + """ + } + } + } + + stage('hfi1-verbs_rxd-dbg') { + agent {node {label 'hfi1'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs --util=rxd --ofi_build_mode='dbg' + ) + """ + } + } + } + stage('hfi1-verbs_rxm-dbg') { + agent {node {label 'hfi1'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs --util=rxm --ofi_build_mode='dbg' + ) + """ + } + } + } + stage('mlx5-verbs_rxm-dbg') { + agent {node {label 'mlx5'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs --ofi_build_mode='dbg' + python3.7 runtests.py --prov=verbs --util=rxm --ofi_build_mode='dbg' + ) + """ + } + } + } + stage('mlx5-verbs_rxd-dbg') { + agent {node {label 'mlx5'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs --util=rxd --ofi_build_mode='dbg' + ) + """ + } + } + } + } + } + + } + + post { + cleanup { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh "rm -rf '/mpibuilddir/mpich-build-dir/${env.JOB_NAME}/${env.BUILD_NUMBER}'" + sh "rm -rf '/mpibuilddir/ompi-build-dir/${env.JOB_NAME}/${env.BUILD_NUMBER}'" + sh "rm -rf '/mpibuilddir/mpich-suite-build-dir/${env.JOB_NAME}/${env.BUILD_NUMBER}'" + dir("${env.WORKSPACE}"){ + deleteDir() + } + } + } + } + +} diff --git a/contrib/intel/jenkins/Jenkinsfile.daily b/contrib/intel/jenkins/Jenkinsfile.daily new file mode 100644 index 00000000000..2dbabe09a5b --- /dev/null +++ b/contrib/intel/jenkins/Jenkinsfile.daily @@ -0,0 +1,501 @@ + +pipeline { + agent any + options { + timestamps() + timeout(activity: true, time: 4, unit: 'HOURS') + } + environment { + JOB_CADENCE = 'daily' + } + stages { + stage ('fetch-opa-psm2') { + steps { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']) { + dir('opa-psm2-lib') { + + checkout changelog: false, poll: false, scm: [$class: 'GitSCM', \ + branches: [[name: '*/master']], \ + doGenerateSubmoduleConfigurations: false, extensions: [], submoduleCfg: [], \ + userRemoteConfigs: [[url: 'https://github.com/intel/opa-psm2.git']]] + } + } + } + } + + stage ('build-libfabric') { + steps { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']) { + sh """ + python3.7 contrib/intel/jenkins/build.py 'libfabric' + python3.7 contrib/intel/jenkins/build.py 'libfabric' --ofi_build_mode='dbg' + python3.7 contrib/intel/jenkins/build.py 'libfabric' --ofi_build_mode='dl' + echo "libfabric build completed" + """ + } + } + } + stage('build-fabtests') { + steps { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']) { + sh """ + python3.7 contrib/intel/jenkins/build.py 'fabtests' + 
python3.7 contrib/intel/jenkins/build.py 'fabtests' --ofi_build_mode='dbg' + python3.7 contrib/intel/jenkins/build.py 'fabtests' --ofi_build_mode='dl' + echo 'fabtests build completed' + """ + } + } + } + + stage ('build-shmem') { + steps { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']) { + sh """ + python3.7 contrib/intel/jenkins/build.py 'shmem' + python3.7 contrib/intel/jenkins/build.py 'shmem' --ofi_build_mode='dbg' + python3.7 contrib/intel/jenkins/build.py 'shmem' --ofi_build_mode='dl' + echo 'shmem benchmarks built successfully' + """ + } + } + } + + stage ('build OMPI_bm') { + steps { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']) { + sh """ + python3.7 contrib/intel/jenkins/build.py 'ompi_benchmarks' + python3.7 contrib/intel/jenkins/build.py 'ompi_benchmarks' --ofi_build_mode='dbg' + python3.7 contrib/intel/jenkins/build.py 'ompi_benchmarks' --ofi_build_mode='dl' + echo 'mpi benchmarks with ompi - built successfully' + """ + } + } + } + + stage('build IMPI_bm') { + steps { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']) { + sh """ + python3.7 contrib/intel/jenkins/build.py 'impi_benchmarks' + python3.7 contrib/intel/jenkins/build.py 'impi_benchmarks' --ofi_build_mode='dbg' + python3.7 contrib/intel/jenkins/build.py 'impi_benchmarks' --ofi_build_mode='dl' + echo 'mpi benchmarks with impi - built successfully' + """ + } + } + } + + stage('build MPICH_bm') { + steps { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin']) { + sh """ + python3.7 contrib/intel/jenkins/build.py 'mpich_benchmarks' + python3.7 contrib/intel/jenkins/build.py 'mpich_benchmarks' --ofi_build_mode='dbg' + python3.7 contrib/intel/jenkins/build.py 'mpich_benchmarks' --ofi_build_mode='dl' + echo "mpi benchmarks with mpich - built successfully" + """ + } + } + } + stage('parallel-tests') { + parallel { + stage('eth-sockets') { + agent {node {label 'eth'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) + { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=sockets + ) + """ + } + } + } + stage('eth-tcp') { + agent {node {label 'eth'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) + { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=tcp + ) + """ + } + } + } + stage('eth-udp-rxd-shm') { + agent {node {label 'eth'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) + { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=udp + python3.7 runtests.py --prov=udp --util=rxd + python3.7 runtests.py --prov=shm + ) + """ + } + } + } + stage('hfi1-psm2-verbs') { + agent {node {label 'hfi1'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=psm2 + python3.7 runtests.py --prov=verbs + ) + """ + } + } + + } + stage('hfi1-verbs-rxm') { + agent {node {label 'hfi1'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs --util=rxm + ) + """ + } + } + + } + stage('hfi1-verbs-rxd') { + agent {node {label 'hfi1'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + 
sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs --util=rxd + + ) + """ + } + } + + } + stage('mlx5-verbs-rxm') { + agent {node {label 'mlx5'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs + python3.7 runtests.py --prov=verbs --util=rxm + + ) + """ + } + } + + } + stage('mlx5-verbs-rxd') { + agent {node {label 'mlx5'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs --util=rxd + ) + """ + } + } + + } + stage('eth-sockets-dbg') { + agent {node {label 'eth'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) + { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=sockets --ofi_build_mode='dbg' + ) + """ + } + } + } + stage('eth-tcp-dbg') { + agent {node {label 'eth'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) + { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=tcp --ofi_build_mode='dbg' + ) + """ + } + } + } + stage('eth-udp-rxd-shm-dbg') { + agent {node {label 'eth'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) + { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=udp --ofi_build_mode='dbg' + python3.7 runtests.py --prov=udp --util=rxd --ofi_build_mode='dbg' + python3.7 runtests.py --prov=shm --ofi_build_mode='dbg' + ) + """ + } + } + + } + stage('hfi1-psm2-verbs-dbg') { + agent {node {label 'hfi1'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=psm2 --ofi_build_mode='dbg' + python3.7 runtests.py --prov=verbs --ofi_build_mode='dbg' + ) + """ + } + } + } + stage('hfi1-verbs_rxd-dbg') { + agent {node {label 'hfi1'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs --util=rxd --ofi_build_mode='dbg' + ) + """ + } + } + } + stage('hfi1-verbs_rxm-dbg') { + agent {node {label 'hfi1'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs --util=rxm --ofi_build_mode='dbg' + ) + """ + } + } + } + stage('mlx5-verbs_rxm-dbg') { + agent {node {label 'mlx5'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs --ofi_build_mode='dbg' + python3.7 runtests.py --prov=verbs --util=rxm --ofi_build_mode='dbg' + ) + """ + } + } + } + stage('mlx5-verbs_rxd-dbg') { + agent {node {label 'mlx5'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs --util=rxd --ofi_build_mode='dbg' + ) + """ + } + } + } + stage('eth-sockets-dl') { + agent {node {label 'eth'}} + steps{ + 
withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) + { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=sockets --ofi_build_mode='dl' + ) + """ + } + } + } + stage('eth-tcp-dl') { + agent {node {label 'eth'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) + { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=tcp --ofi_build_mode='dl' + ) + """ + } + } + + } + stage('eth-udp-rxd-shm-dl') { + agent {node {label 'eth'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin/:$PYTHONPATH']) + { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=udp --ofi_build_mode='dl' + python3.7 runtests.py --prov=udp --util=rxd --ofi_build_mode='dl' + python3.7 runtests.py --prov=shm --ofi_build_mode='dl' + ) + """ + } + } + + } + + stage('hfi1-psm2-verbs-dl') { + agent {node {label 'hfi1'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=psm2 --ofi_build_mode='dl' + python3.7 runtests.py --prov=verbs --ofi_build_mode='dl' + ) + """ + } + } + } + stage('hfi1-verbs_rxd-dl') { + agent {node {label 'hfi1'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs --util=rxd --ofi_build_mode='dl' + ) + """ + } + } + } + stage('hfi1-verbs_rxm-dl') { + agent {node {label 'hfi1'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs --util=rxm --ofi_build_mode='dl' + ) + """ + } + } + } + + stage('mlx5-verbs_rxm-dl') { + agent {node {label 'mlx5'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs --ofi_build_mode='dl' + python3.7 runtests.py --prov=verbs --util=rxm --ofi_build_mode='dl' + ) + """ + } + } + + } + stage('mlx5-verbs_rxd-dl') { + agent {node {label 'mlx5'}} + steps{ + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh """ + env + ( + cd ${env.WORKSPACE}/contrib/intel/jenkins/ + python3.7 runtests.py --prov=verbs --util=rxd --ofi_build_mode='dl' + ) + """ + } + } + } + } + + } + + } + + post { + failure { + mail from: 'notification@jenkins-ci.org', + to: "${env.mailrecepient}", + subject: "${env.JOB_NAME} - Build # ${env.BUILD_NUMBER} - ${currentBuild.result}!", + body: " Check console output at ${env.BUILD_URL} to view the results." 
+ + } + cleanup { + withEnv(['PATH+EXTRA=/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/bin:$PYTHONPATH']) { + sh "rm -rf '/mpibuilddir/mpich-build-dir/${env.JOB_NAME}/${env.BUILD_NUMBER}'" + sh "rm -rf '/mpibuilddir/ompi-build-dir/${env.JOB_NAME}/${env.BUILD_NUMBER}'" + sh "rm -rf '/mpibuilddir/mpich-suite-build-dir/${env.JOB_NAME}/${env.BUILD_NUMBER}'" + dir("${env.WORKSPACE}"){ + deleteDir() + } + } + } + } + +} + diff --git a/contrib/intel/jenkins/build.py b/contrib/intel/jenkins/build.py new file mode 100755 index 00000000000..1f655043e1a --- /dev/null +++ b/contrib/intel/jenkins/build.py @@ -0,0 +1,291 @@ +import os +import sys + +# add jenkins config location to PATH +sys.path.append(os.environ['CI_SITE_CONFIG']) + +import ci_site_config +import argparse +import subprocess +import shlex +import common +import re +import shutil + +def build_libfabric(libfab_install_path, mode): + + if (os.path.exists(libfab_install_path) != True): + os.makedirs(libfab_install_path) + + config_cmd = ['./configure', '--prefix={}'.format(libfab_install_path)] + enable_prov_val = 'yes' + + if (mode == 'dbg'): + config_cmd.append('--enable-debug') + elif (mode == 'dl'): + enable_prov_val='dl' + + for prov in common.enabled_prov_list: + config_cmd.append('--enable-{}={}'.format(prov, enable_prov_val)) + for prov in common.disabled_prov_list: + config_cmd.append('--enable-{}=no'.format(prov)) + + config_cmd.append('--with-psm2-src={}/opa-psm2-lib'.format(workspace)) + + + common.run_command(['./autogen.sh']) + common.run_command(shlex.split(" ".join(config_cmd))) + common.run_command(['make','clean']) + common.run_command(['make']) + common.run_command(['make','install']) + + +def build_fabtests(libfab_install_path, mode): + + os.chdir('{}/fabtests'.format(workspace)) + if (mode == 'dbg'): + config_cmd = ['./configure', '--enable-debug', '--prefix={}' \ + .format(libfab_install_path),'--with-libfabric={}' \ + .format(libfab_install_path)] + else: + config_cmd = ['./configure', '--prefix={}'.format(libfab_install_path), + '--with-libfabric={}'.format(libfab_install_path)] + + + common.run_command(['./autogen.sh']) + common.run_command(config_cmd) + common.run_command(['make','clean']) + common.run_command(['make']) + common.run_command(['make', 'install']) + +def build_shmem(shmem_dir, libfab_install_path): + + shmem_tar = ci_site_config.shmem_tar + if(os.path.exists(shmem_dir)): + os.rmdir(shmem_dir) + + os.makedirs(shmem_dir) + os.chdir(shmem_dir) + + os.makedirs('SOS') + common.run_command(['tar', '-xf', shmem_tar, '-C', 'SOS', '--strip-components=1']) + os.chdir('SOS') + + common.run_command(['./autogen.sh']) + + config_cmd = ['./configure', '--prefix={}'.format(shmem_dir), '--disable-fortran', \ + '--enable-remote-virtual-addressing', '--disable-aslr-check', \ + '--enable-pmi-simple', '--with-ofi={}'.format(libfab_install_path), \ + 'LDFLAGS=-fno-pie'] + + common.run_command(config_cmd) + + common.run_command(['make','-j4']) + common.run_command(['make', 'check', 'TESTS=']) + common.run_command(['make', 'install']) + + +def build_ISx(shmem_dir): + + oshcc = '{}/bin/oshcc'.format(shmem_dir) + tmp_isx_src = '{}/ISx'.format(ci_site_config.shmem_root) + shutil.copytree(tmp_isx_src, '{}/ISx'.format(shmem_dir)) + #os.chdir(shmem_dir) + #git_cmd = ['git', 'clone', '--depth', '1', 'https://github.com/ParRes/ISx.git', 'ISx'] + + #common.run_command(git_cmd) + os.chdir('{}/ISx/SHMEM'.format(shmem_dir)) + common.run_command(['make', 'CC={}'.format(oshcc), 'LDLIBS=-lm']) + + +def build_PRK(shmem_dir): + + oshcc = 
'{}/bin/oshcc'.format(shmem_dir) + shmem_src = '{}/SOS'.format(shmem_dir) + tmp_prk_src = '{}/PRK'.format(ci_site_config.shmem_root) + shutil.copytree(tmp_prk_src, '{}/PRK'.format(shmem_dir)) + #os.chdir(shmem_dir) + #git_cmd = ['git', 'clone', '--depth', ' 1', 'https://github.com/ParRes/Kernels.git', 'PRK'] + #common.run_command(git_cmd) + os.chdir('{}/PRK'.format(shmem_dir)) + with open('common/make.defs','w') as f: + f.write('SHMEMCC={} -std=c99\nSHMEMTOP={}\n'.format(oshcc,shmem_src)) + + common.run_command(['make', 'allshmem']) + +def build_uh(shmem_dir): + oshcc_bin = "{}/bin".format(shmem_dir) + os.environ["PATH"] += os.pathsep + oshcc_bin + tmp_uh_src = '{}/tests-uh'.format(ci_site_config.shmem_root) + shutil.copytree(tmp_uh_src, '{}/tests-uh'.format(shmem_dir)) + #os.chdir(shmem_dir) + #git_cmd = ['git', 'clone', '--depth', '1', 'https://github.com/openshmem-org/tests-uh.git', 'tests-uh'] + #common.run_command(git_cmd) + os.chdir('{}/tests-uh'.format(shmem_dir)) + common.run_command(['make', '-j4', 'C_feature_tests']) + + +def build_mpi(mpi, mpisrc, mpi_install_path, libfab_install_path, ofi_build_mode): + + build_mpi_path ="/mpibuilddir/{}-build-dir/{}/{}/{}".format(mpi, jobname, buildno, ofi_build_mode) + if (os.path.exists(build_mpi_path) == False): + os.makedirs(build_mpi_path) + + os.chdir(build_mpi_path) + cmd = ["{}/configure".format(mpisrc), + "--disable-oshmem", "--prefix={}".format(mpi_install_path), + "--with-libfabric={}".format(libfab_install_path)] + + if (mpi == 'ompi'): + cmd.append("--enable-mpi-fortran=no") + elif (mpi == 'mpich'): + cmd.append("--enable-fortran=no") + cmd.append("--with-device=ch4:ofi") + cmd.append("--enable-ch4-direct=netmod") + + configure_cmd = shlex.split(" ".join(cmd)) + common.run_command(configure_cmd) + common.run_command(["make", "clean"]) + common.run_command(["make", "install", "-j32"]) + +def build_mpich_suite(mpi, mpi_install_path, libfab_install_path, ofi_build_mode): + + mpich_suite_build_path = "/mpibuilddir/mpich-suite-build-dir/{}/{}/{}/mpich" \ + .format(jobname, buildno, ofi_build_mode); + if (os.path.exists(mpich_suite_build_path) == False): + shutil.copytree(ci_site_config.mpich_src, mpich_suite_build_path) + + mpich_suite_path = '{}/test/'.format(mpich_suite_build_path) + mpichsuite_installpath= "{}/mpichsuite/test".format(mpi_install_path) + pwd = os.getcwd() + if (mpi == 'impi'): + os.chdir("{}/mpi".format(mpich_suite_path)) + cmd = ["./configure", "--with-mpi={}/intel64" \ + .format(ci_site_config.impi_root)] + + configure_cmd = shlex.split(" ".join(cmd)) + common.run_command(configure_cmd) + common.run_command(["make", "all","-j32"]) + shutil.copytree(mpich_suite_path, mpichsuite_installpath) + common.run_command(["make", "distclean"]) + os.chdir(pwd) + + + +def build_stress_bm(mpi, mpi_install_path, libfab_install_path): + + stress_install_path = "{}/stress".format(mpi_install_path) + if (os.path.exists(stress_install_path) == False): + os.makedirs(stress_install_path) + + if (mpi == 'impi'): + os.environ['LD_LIBRARY_PATH'] = "{}/lib".format(libfab_install_path) + mpicc_path = "{}/intel64/bin/mpicc".format(ci_site_config.impi_root) + else: + os.environ['LD_LIBRARY_PATH'] = "" + mpicc_path = "{}/bin/mpicc".format(mpi_install_path) + + cmd=" ".join([mpicc_path, '-lz', "{}/mpi_stress/mpi_stress.c" \ + .format(ci_site_config.benchmarks['wfr-mpi-tests']),\ + '-o', "{}/mpi_stress".format(stress_install_path)]) + + runcmd = shlex.split(cmd) + common.run_command(runcmd) + + +def build_osu_bm(mpi, mpi_install_path, 
libfab_install_path): + + osu_install_path = "{}/osu".format(mpi_install_path) + if (os.path.exists(osu_install_path) == False): + os.makedirs(osu_install_path) + os.chdir(osu_install_path) + + if (mpi == 'impi'): + os.environ['CC']="{}/intel64/bin/mpicc".format(ci_site_config.impi_root) + os.environ['CXX']="{}/intel64/bin/mpicxx".format(ci_site_config.impi_root) + os.environ['LD_LIBRARY_PATH'] = "{}/lib".format(libfab_install_path) + + else: + os.environ['CC']="{}/bin/mpicc".format(mpi_install_path) + os.environ['CXX']="{}/bin/mpicxx".format(mpi_install_path) + os.environ['LD_LIBRARY_PATH']="" + + + os.environ['CFLAGS']="-I{}/util/".format(ci_site_config.benchmarks['osu']) + cmd = " ".join(["{}/configure".format(ci_site_config.benchmarks['osu']), + "--prefix={}".format(osu_install_path)]) + + configure_cmd = shlex.split(cmd) + + common.run_command(configure_cmd) + common.run_command(["make", "-j4"]) + common.run_command(["make", "install"]) + + +if __name__ == "__main__": +#read Jenkins environment variables + # In Jenkins, JOB_NAME = 'ofi_libfabric/master' vs BRANCH_NAME = 'master' + # job name is better to use to distinguish between builds of different + # jobs but with same branch name. + jobname = os.environ['JOB_NAME'] + buildno = os.environ['BUILD_NUMBER'] + workspace = os.environ['WORKSPACE'] + + + + parser = argparse.ArgumentParser() + parser.add_argument("build_item", help="build libfabric or fabtests", + choices=['libfabric','fabtests', 'impi_benchmarks', \ + 'ompi_benchmarks', 'mpich_benchmarks', 'shmem']) + parser.add_argument("--ofi_build_mode", help="select buildmode debug or dl", \ + choices=['dbg','dl']) + + args = parser.parse_args() + build_item = args.build_item + + if (args.ofi_build_mode): + ofi_build_mode = args.ofi_build_mode + else: + ofi_build_mode = 'reg' + + + + install_path = "{installdir}/{jbname}/{bno}/{bmode}" \ + .format(installdir=ci_site_config.install_dir, + jbname=jobname, bno=buildno,bmode=ofi_build_mode) + + p = re.compile('mpi*') + + if (build_item == 'libfabric'): + build_libfabric(install_path, ofi_build_mode) + + elif (build_item == 'fabtests'): + build_fabtests(install_path, ofi_build_mode) + #if the build_item contains the string 'mpi' + elif (p.search(build_item)): + mpi = build_item[:-11] #extract the mpitype from '*mpi*_benchmarks' build_item + mpi_install_path = "{}/{}".format(install_path, mpi) + + if (os.path.exists(mpi_install_path) == False): + os.makedirs(mpi_install_path) + if (mpi != 'impi'): + mpisrc = ci_site_config.mpich_src if mpi == 'mpich' \ + else ci_site_config.ompi_src + # only need to build ompi or mpich, impi is available as binary + build_mpi(mpi, mpisrc, mpi_install_path, install_path, ofi_build_mode) + + # build mpich_test_suite + build_mpich_suite(mpi, mpi_install_path, install_path, ofi_build_mode) + # run stress and osu benchmarks for all mpitypes + build_stress_bm(mpi, mpi_install_path, install_path) + build_osu_bm(mpi, mpi_install_path, install_path) + elif (build_item == 'shmem'): + # build shmem + shmem_dir = "{}/shmem".format(install_path) + build_shmem(shmem_dir, install_path) + build_ISx(shmem_dir) + build_PRK(shmem_dir) + build_uh(shmem_dir) + + + diff --git a/contrib/intel/jenkins/common.py b/contrib/intel/jenkins/common.py new file mode 100755 index 00000000000..7805edc22e7 --- /dev/null +++ b/contrib/intel/jenkins/common.py @@ -0,0 +1,59 @@ +import collections +import ci_site_config +import subprocess +import sys + +def get_node_name(host, interface): + # This is the pattern we follow in SFS team cluster + 
return "%s-%s" % (host, interface) + +def run_command(command): + print(" ".join(command)) + p = subprocess.Popen(command, stdout=subprocess.PIPE, text=True) + print(p.returncode) + while True: + out = p.stdout.read(1) + if (out == "" and p.poll() != None): + break + if (out != ""): + sys.stdout.write(out) + sys.stdout.flush() + if (p.returncode != 0): + print("exiting with " + str(p.poll())) + sys.exit(p.returncode) + + +Prov = collections.namedtuple('Prov', 'core util') +prov_list = [ + + Prov("psm3", None), + Prov("psm2", None), + Prov("verbs", None), + Prov("verbs", "rxd"), + Prov("verbs", "rxm"), + Prov("sockets", None), + Prov("tcp", None), + Prov("udp", None), + Prov("udp", "rxd"), + Prov("shm", None), +] +enabled_prov_list = [ + "psm2", + "verbs", + "tcp", + "sockets", + "udp", + "shm" +] +disabled_prov_list = [ + "psm3", + 'usnic', + 'psm', + 'efa', + 'perf', + 'rstream', + 'hook_debug', + 'bgq', + 'mrail' +] + diff --git a/contrib/intel/jenkins/run.py b/contrib/intel/jenkins/run.py new file mode 100755 index 00000000000..7873f0f1e77 --- /dev/null +++ b/contrib/intel/jenkins/run.py @@ -0,0 +1,125 @@ +import tests +import subprocess +import sys +import argparse +import os +import common + +sys.path.append(os.environ['CI_SITE_CONFIG']) +import ci_site_config + +# read Jenkins environment variables +# In Jenkins, JOB_NAME = 'ofi_libfabric/master' vs BRANCH_NAME = 'master' +# job name is better to use to distinguish between builds of different +# jobs but with the same branch name. +fab = os.environ['FABRIC']#args.fabric +jbname = os.environ['JOB_NAME']#args.jobname +bno = os.environ['BUILD_NUMBER']#args.buildno + + +#run fi_info test +def fi_info_test(core, hosts, mode,util=None): + + fi_info_test = tests.FiInfoTest(jobname=jbname,buildno=bno,\ + testname="fi_info", core_prov=core, fabric=fab,\ + hosts=hosts, ofi_build_mode=mode, util_prov=util) + print("running fi_info test for {}-{}-{}".format(core, util, fab)) + fi_info_test.execute_cmd() + + +#runfabtests +def fabtests(core, hosts, mode, util=None): + + runfabtest = tests.Fabtest(jobname=jbname,buildno=bno,\ + testname="runfabtests", core_prov=core, fabric=fab,\ + hosts=hosts, ofi_build_mode=mode, util_prov=util) + + if (runfabtest.execute_condn): + print("running fabtests for {}-{}-{}".format(core, util, fab)) + runfabtest.execute_cmd() + else: + print("skipping {} as execute condition fails"\ + .format(runfabtest.testname)) + print("----------------------------------------------------------------------------------------\n") + +def shmemtest(core, hosts, mode, util=None): + runshmemtest = tests.ShmemTest(jobname=jbname,buildno=bno,\ + testname="shmem test", core_prov=core, fabric=fab,\ + hosts=hosts, ofi_build_mode=mode, util_prov=util) + if (runshmemtest.execute_condn): + print("running shmem unit test for {}-{}-{}".format(core, util, fab)) + runshmemtest.execute_cmd("unit") + print("running shmem PRK test for {}-{}-{}".format(core, util, fab)) + runshmemtest.execute_cmd("prk") + print("running shmem ISx test for {}-{}-{}".format(core, util, fab)) + runshmemtest.execute_cmd("isx") + print("running shmem uh test for {}-{}-{}".format(core, util, fab)) + runshmemtest.execute_cmd("uh") + else: + print("skipping {} as execute condition fails"\ + .format(runshmemtest.testname)) + print("----------------------------------------------------------------------------------------\n") + + +#imb-tests +def intel_mpi_benchmark(core, hosts, mpi, mode, util=None): + + imb_test = tests.MpiTestIMB(jobname=jbname,buildno=bno,\ + 
testname="IntelMPIbenchmark",core_prov=core, fabric=fab,\ + hosts=hosts, mpitype=mpi, ofi_build_mode=mode, util_prov=util) + + if (imb_test.execute_condn == True and imb_test.mpi_gen_execute_condn == True): + print("running imb-tests for {}-{}-{}-{}".format(core, util, fab, mpi)) + imb_test.execute_cmd() + else: + print("skipping {} as execute condition fails"\ + .format(imb_test.testname)) + print("----------------------------------------------------------------------------------------\n") + +#mpich_test_suite +def mpich_test_suite(core, hosts, mpi, mode, util=None): + mpich_tests = tests.MpichTestSuite(jobname=jbname,buildno=bno,\ + testname="MpichTestSuite",core_prov=core, fabric=fab,\ + mpitype=mpi, hosts=hosts, ofi_build_mode=mode, \ + util_prov=util) + if (mpich_tests.execute_condn == True and \ + mpich_tests.mpi_gen_execute_condn == True): + print("Running mpich test suite: Spawn coll, comm, dt Tests for {}-{}-{}-{}".format(core, util, fab, mpi)) + os.environ["MPITEST_RETURN_WITH_CODE"] = "1" + mpich_tests.execute_cmd("spawn") + +#mpi_stress benchmark tests +def mpistress_benchmark(core, hosts, mpi, mode, util=None): + + stress_test = tests.MpiTestStress(jobname=jbname,buildno=bno,\ + testname="stress",core_prov=core, fabric=fab, mpitype=mpi,\ + hosts=hosts, ofi_build_mode=mode, util_prov=util) + + if (stress_test.execute_condn == True and stress_test.mpi_gen_execute_condn == True): + print("running mpistress-test for {}-{}-{}-{}".format(core, util, fab, mpi)) + stress_test.execute_cmd() + else: + print("skipping {} as execute condition fails" \ + .format(stress_test.testname)) + print("----------------------------------------------------------------------------------------\n") + +#osu benchmark tests +def osu_benchmark(core, hosts, mpi, mode, util=None): + + osu_test = tests.MpiTestOSU(jobname=jbname, buildno=bno, \ + testname="osu-benchmarks",core_prov=core, fabric=fab, mpitype=mpi, \ + hosts=hosts, ofi_build_mode=mode, util_prov=util) + + if (osu_test.execute_condn == True and osu_test.mpi_gen_execute_condn == True): + print("running osu-test for {}-{}-{}-{}".format(core, util, fab, mpi)) + osu_test.execute_cmd() + else: + print("skipping {} as execute condition fails" \ + .format(osu_test.testname)) + print("----------------------------------------------------------------------------------------\n") + + +if __name__ == "__main__": + pass + + diff --git a/contrib/intel/jenkins/runtests.py b/contrib/intel/jenkins/runtests.py new file mode 100755 index 00000000000..c3cfe285d8f --- /dev/null +++ b/contrib/intel/jenkins/runtests.py @@ -0,0 +1,73 @@ +import argparse +import os +import sys +sys.path.append(os.environ['CI_SITE_CONFIG']) +import ci_site_config +import run +import common + +parser = argparse.ArgumentParser() + +parser.add_argument("--prov", help="core provider", choices=["psm2", "verbs", \ + "tcp", "udp", "sockets", "shm"]) +parser.add_argument("--util", help="utility provider", choices=["rxd", "rxm"]) +parser.add_argument("--ofi_build_mode", help="specify the build configuration", \ + choices = ["dbg", "dl"]) + +args = parser.parse_args() +args_core = args.prov + +args_util = args.util + +if (args.ofi_build_mode): + ofi_build_mode = args.ofi_build_mode +else: + ofi_build_mode='reg' + +node = (os.environ['NODE_NAME']).split('-')[0] +hosts = [node] +# Note: Temporarily disabling all mpich testing +# due to mpich options issues which is causing +# multiple tests to fail. 
+#mpilist = ['impi', 'mpich', 'ompi'] +mpilist = ['impi', 'ompi'] + +#this script is executed from /tmp +#this is done since some mpi tests +#look for a valid location before running +# the test on the secondary host(client) +# but jenkins only creates a valid path on +# the primary host (server/test node) + +os.chdir('/tmp/') + +if(args_core): + for host in ci_site_config.node_map[node]: + hosts.append(host) + + if (args_util == None): + run.fi_info_test(args_core, hosts, ofi_build_mode) + run.fabtests(args_core, hosts, ofi_build_mode) + run.shmemtest(args_core, hosts, ofi_build_mode) + for mpi in mpilist: + run.mpich_test_suite(args_core, hosts, mpi, ofi_build_mode) + run.intel_mpi_benchmark(args_core, hosts, mpi, ofi_build_mode) + run.mpistress_benchmark(args_core, hosts, mpi, ofi_build_mode) + run.osu_benchmark(args_core, hosts, mpi, ofi_build_mode) + else: + run.fi_info_test(args_core, hosts, ofi_build_mode, util=args_util) + run.fabtests(args_core, hosts, ofi_build_mode, util=args_util) + run.shmemtest(args_core, hosts, ofi_build_mode, util=args_util) + for mpi in mpilist: + run.mpich_test_suite(args_core, hosts, mpi, ofi_build_mode, \ + util=args_util) + + run.intel_mpi_benchmark(args_core, hosts, mpi, ofi_build_mode, \ + util=args_util) + run.mpistress_benchmark(args_core, hosts, mpi, ofi_build_mode, \ + util=args_util) + run.osu_benchmark(args_core, hosts, mpi, ofi_build_mode, \ + util=args_util) +else: + print("Error : Specify a core provider to run tests") + diff --git a/contrib/intel/jenkins/tests.py b/contrib/intel/jenkins/tests.py new file mode 100755 index 00000000000..a3e670be85d --- /dev/null +++ b/contrib/intel/jenkins/tests.py @@ -0,0 +1,528 @@ +import sys +import os + +print(os.environ['CI_SITE_CONFIG']) +sys.path.append(os.environ['CI_SITE_CONFIG']) # for adding path for ci_site_config + +import subprocess +import re +import ci_site_config +import common +import shlex +from abc import ABC, abstractmethod # abstract base class for creating abstract classes in python + +job_cadence = os.environ['JOB_CADENCE'] + +# A Jenkins env variable for job name is composed of the name of the jenkins job and the branch name +# it is building for. for e.g. 
in our case jobname = 'ofi_libfabric/master' +class Test: + def __init__ (self, jobname, buildno, testname, core_prov, fabric, + hosts, ofi_build_mode, util_prov=None): + self.jobname = jobname + self.buildno = buildno + self.testname = testname + self.core_prov = core_prov + self.util_prov = "ofi_{}".format(util_prov) if util_prov != None else "" + self.fabric = fabric + self.hosts = hosts + self.ofi_build_mode = ofi_build_mode + self.job_cadence = job_cadence + if (len(hosts) == 2): + self.server = hosts[0] + self.client = hosts[1] + + self.nw_interface = ci_site_config.interface_map[self.fabric] + self.libfab_installpath = "{}/{}/{}/{}".format(ci_site_config.install_dir, + self.jobname, self.buildno, self.ofi_build_mode) + + self.env = [("FI_VERBS_MR_CACHE_ENABLE", "1"),\ + ("FI_VERBS_INLINE_SIZE", "256")] \ + if self.core_prov == "verbs" else [] +class FiInfoTest(Test): + def __init__(self, jobname, buildno, testname, core_prov, fabric, + hosts, ofi_build_mode, util_prov=None): + + super().__init__(jobname, buildno, testname, core_prov, fabric, + hosts, ofi_build_mode, util_prov) + + self.fi_info_testpath = "{}/bin".format(self.libfab_installpath) + + @property + def cmd(self): + return "{}/fi_info ".format(self.fi_info_testpath) + + @property + def options(self): + if (self.util_prov): + opts = "-f -p {};{}".format(self.core_prov, self.util_prov) + else: + opts = "-f -p {}".format(self.core_prov) + + return opts + + def execute_cmd(self): + command = self.cmd + self.options + outputcmd = shlex.split(command) + common.run_command(outputcmd) + + +class Fabtest(Test): + + def __init__(self, jobname, buildno, testname, core_prov, fabric, + hosts, ofi_build_mode, util_prov=None): + + super().__init__(jobname, buildno, testname, core_prov, fabric, + hosts, ofi_build_mode, util_prov) + self.fabtestpath = "{}/bin".format(self.libfab_installpath) + self.fabtestconfigpath = "{}/share/fabtests".format(self.libfab_installpath) + def get_exclude_file(self): + path = self.libfab_installpath + efile_path = "{}/share/fabtests/test_configs".format(path) + + prov = self.util_prov if self.util_prov else self.core_prov + efile_old = "{path}/{prov}/{prov}.exclude".format(path=efile_path, + prov=prov) + + if self.util_prov: + efile = "{path}/{util_prov}/{core_prov}/exclude".format(path=efile_path, + util_prov=self.util_prov, core_prov=self.core_prov) + else: + efile = "{path}/{prov}/exclude".format(path=efile_path, + prov=self.core_prov) + + if os.path.isfile(efile): + return efile + elif os.path.isfile(efile_old): + return efile_old + else: + print("Exclude file: {} not found!".format(efile)) + return None + + @property + def cmd(self): + return "{}/runfabtests.sh ".format(self.fabtestpath) + + @property + def options(self): + opts = "-T 300 -vvv -p {} -S ".format(self.fabtestpath) + if (self.core_prov == "verbs" and self.nw_interface): + opts = "{} -s {} ".format(opts, common.get_node_name(self.server, + self.nw_interface)) # include common.py + opts = "{} -c {} ".format(opts, common.get_node_name(self.client, + self.nw_interface)) # from common.py + + if (self.core_prov == "shm"): + opts = "{} -s {} ".format(opts, self.server) + opts = "{} -c {} ".format(opts, self.client) + opts += "-N " + + if not re.match(".*sockets|udp|tcp.*", self.core_prov): + opts = "{} -t all ".format(opts) + + efile = self.get_exclude_file() + if efile: + opts = "{} -R ".format(opts) + opts = "{} -f {} ".format(opts, efile) + + for key,val in self.env: + opts = "{options} -E {key}={value} ".format(options = opts, + key=key, 
value=val) + + if self.util_prov: + opts = "{options} {core};{util} ".format(options=opts, + core=self.core_prov, util=self.util_prov) + else: + opts = "{options} {core} ".format(options=opts, + core=self.core_prov) + + if (self.core_prov == "shm"): + opts += "{} {} ".format(self.server, self.server) + else: + opts += "{} {} ".format(self.server, self.client) + + return opts + + @property + def execute_condn(self): + return True if (self.core_prov != 'shm' or \ + self.ofi_build_mode == 'dbg') else False + + def execute_cmd(self): + curdir = os.getcwd() + os.chdir(self.fabtestconfigpath) + command = self.cmd + self.options + outputcmd = shlex.split(command) + common.run_command(outputcmd) + os.chdir(curdir) + +class ShmemTest(Test): + def __init__(self, jobname, buildno, testname, core_prov, fabric, + hosts, ofi_build_mode, util_prov=None): + + super().__init__(jobname, buildno, testname, core_prov, fabric, + hosts, ofi_build_mode, util_prov) + + #self.n - number of hosts * number of processes per host + self.n = 4 + # self.ppn - number of processes per node. + self.ppn = 2 + self.shmem_dir = "{}/shmem".format(self.libfab_installpath) + + @property + def cmd(self): + #todo: rename mpi_testpath to testpath to make it generic for shmem and mpitest + return "{}/run_shmem.sh ".format(ci_site_config.mpi_testpath) + + def options(self, shmem_testname): + + if self.util_prov: + prov = "{core};{util} ".format(core=self.core_prov, + util=self.util_prov) + else: + prov = self.core_prov + + opts = "-n {n} -hosts {server},{client} -shmem_dir={shmemdir} \ + -libfabric_path={path}/lib -prov '{provider}' -test {test} \ + -server {server} -inf {inf}" \ + .format(n=self.n, server=self.server, client=self.client, \ + shmemdir=self.shmem_dir, path=self.libfab_installpath, \ + provider=prov, test=shmem_testname, \ + inf=ci_site_config.interface_map[self.fabric]) + return opts + + @property + def execute_condn(self): + return True if (self.job_cadence == 'daily' and \ + (self.core_prov == "psm2" or \ + self.core_prov == "sockets")) \ + else False + + def execute_cmd(self, shmem_testname): + command = self.cmd + self.options(shmem_testname) + outputcmd = shlex.split(command) + common.run_command(outputcmd) + + +class MpiTests(Test): + def __init__(self, jobname, buildno, testname, core_prov, fabric, + mpitype, hosts, ofi_build_mode, util_prov=None): + + super().__init__(jobname, buildno, testname, core_prov, + fabric, hosts, ofi_build_mode, util_prov) + self.mpi = mpitype + + @property + def cmd(self): + if (self.mpi == "impi" or self.mpi == "mpich"): + self.testpath = ci_site_config.mpi_testpath + return "{}/run_{}.sh ".format(self.testpath,self.mpi) + elif(self.mpi =="ompi"): + self.testpath = "{}/ompi/bin".format(self.libfab_installpath) + return "{}/mpirun ".format(self.testpath) + + @property + def options(self): + opts = [] + if (self.mpi == "impi" or self.mpi == "mpich"): + opts = "-n {} -ppn {} -hosts {},{} ".format(self.n,self.ppn, + self.server,self.client) + + if (self.mpi == "impi"): + opts = "{} -mpi_root={} ".format(opts, + ci_site_config.impi_root) + else: + opts = "{} -mpi_root={}/mpich".format(opts, + self.libfab_installpath) + + opts = "{} -libfabric_path={}/lib ".format(opts, + self.libfab_installpath) + + if self.util_prov: + opts = "{options} -prov {core};{util} ".format(options=opts, + core=self.core_prov, util=self.util_prov) + else: + opts = "{} -prov {} ".format(opts, self.core_prov) + + for key, val in self.env: + opts = "{} -genv {} {} ".format(opts, key, val) + + elif (self.mpi == 
"ompi"): + opts = "-np {} ".format(self.n) + hosts = ",".join([":".join([host,str(self.ppn)]) \ + for host in self.hosts]) + + opts = "{} --host {} ".format(opts, hosts) + + if self.util_prov: + opts = "{} --mca mtl_ofi_provider_include {};{} ".format(opts, + self.core_prov,self.util_prov) + else: + opts = "{} --mca mtl_ofi_provider_include {} ".format(opts, + self.core_prov) + + opts += "--mca orte_base_help_aggregate 0 " + opts += "--mca mtl ofi --mca pml cm -tag-output " + for key,val in self.env: + opts = "{} -x {}={} ".format(opts,key,val) + return opts + + @property + def mpi_gen_execute_condn(self): + #Skip MPI tests for udp, verbs(core) providers. + # we would still have MPI tests runnning for + # verbs-rxd and verbs-rxm providers + return True if (self.core_prov != "udp" and \ + self.core_prov != "shm" and \ + (self.core_prov != "verbs" or \ + self.util_prov == "ofi_rxm" or \ + self.util_prov == "ofi_rxd")) else False + +# IMBtests serves as an abstract class for different +# types of intel MPI benchmarks. Currently we have +# the mpi1 and rma tests enabled which are encapsulated +# in the IMB_mpi1 and IMB_rma classes below. + +class IMBtests(ABC): + """ + This is an abstract class for IMB tests. + currently IMB-MPI1 and IMB-RMA tests are + supported. In future there could be more. + All abstract methods must be implemented. + """ + + @property + @abstractmethod + def imb_cmd(self): + pass + + @property + @abstractmethod + def execute_condn(self): + pass + +class IMBmpi1(IMBtests): + + def __init__(self): + self.additional_tests = [ + "Biband", + "Uniband", + "PingPongAnySource", + "PingPingAnySource", + "PingPongSpecificSource", + "PingPingSpecificSource" + ] + + @property + def imb_cmd(self): + return "{}/intel64/bin/IMB-MPI1 -include {}".format(ci_site_config.impi_root, \ + ','.join(self.additional_tests)) + + @property + def execute_condn(self): + return True + +class IMBrma(IMBtests): + def __init__(self, core_prov): + self.core_prov = core_prov + + @property + def imb_cmd(self): + return "{}/intel64/bin/IMB-RMA".format(ci_site_config.impi_root) + + @property + def execute_condn(self): + return True if (self.core_prov != "verbs") else False + +# MpiTestIMB class inherits from the MPITests class. +# It uses the same options method and class variables as all MPI tests. +# It creates IMB_xxx test objects for each kind of IMB test. 
+class MpiTestIMB(MpiTests): + + def __init__(self, jobname, buildno, testname, core_prov, fabric, + mpitype, hosts, ofi_build_mode, util_prov=None): + super().__init__(jobname, buildno, testname, core_prov, fabric, + mpitype, hosts, ofi_build_mode, util_prov) + + self.n = 4 + self.ppn = 1 + self.mpi1 = IMBmpi1() + self.rma = IMBrma(self.core_prov) + + @property + def execute_condn(self): + return True if (self.mpi == "impi") else False + + def execute_cmd(self): + command = self.cmd + self.options + if(self.mpi1.execute_condn): + outputcmd = shlex.split(command + self.mpi1.imb_cmd) + common.run_command(outputcmd) + if (self.rma.execute_condn): + outputcmd = shlex.split(command + self.rma.imb_cmd) + common.run_command(outputcmd) + +class MpichTestSuite(MpiTests): + + def __init__(self, jobname, buildno, testname, core_prov, fabric, + mpitype, hosts, ofi_build_mode, util_prov=None): + super().__init__(jobname, buildno, testname, core_prov, fabric, + mpitype, hosts, ofi_build_mode, util_prov) + self.mpichsuitepath = "{}/{}/mpichsuite/test/mpi/" \ + .format(self.libfab_installpath, self.mpi) + self.pwd = os.getcwd() + + def testgroup(self, testgroupname): + + testpath = "{}/{}".format(self.mpichsuitepath, testgroupname) + tests = [] + with open("{}/testlist".format(testpath)) as file: + for line in file: + if(line[0] != '#' and line[0] != '\n'): + tests.append((line.rstrip('\n')).split(' ')) + + return tests + + def options(self, nprocs, timeout=None): + if (self.mpi == "impi" or self.mpi == "mpich"): + if (self.mpi == "impi"): + mpiroot = ci_site_config.impi_root + else: + mpiroot = "{}/mpich".format(self.libfab_installpath) + if (self.util_prov): + prov = "\"{};{}\"".format(self.core_prov, self.util_prov) + else: + prov = self.core_prov + + if (timeout != None): + os.environ['MPIEXEC_TIMEOUT']=timeout + + opts = "-n {np} -hosts {s},{c} -mpi_root={mpiroot} \ + -libfabric_path={installpath}/lib -prov {provider} " \ + .format(np=nprocs, s=self.server, c=self.client, \ + provider=prov, mpiroot=mpiroot, \ + installpath=self.libfab_installpath) + + elif (self.mpi == "ompi"): + print(self.mpi) + + return opts + + @property + def execute_condn(self): + return True if (self.mpi == 'impi' and self.core_prov != 'psm2' \ + and self.core_prov != 'sockets') else False + + def execute_cmd(self, testgroupname): + print("Running Tests: " + testgroupname) + tests = [] + time = None + os.chdir("{}/{}".format(self.mpichsuitepath,testgroupname)) + tests = self.testgroup(testgroupname) + for test in tests: + testname = test[0] + nprocs = test[1] + args = test[2:] + for item in args: + itemlist = item.split('=') + if (itemlist[0] == 'timelimit'): + time = itemlist[1] + opts = self.options(nprocs, timeout=time) + testcmd = self.cmd + opts +"./{}".format(testname) + outputcmd = shlex.split(testcmd) + common.run_command(outputcmd) + os.chdir(self.pwd) + +class MpiTestStress(MpiTests): + + def __init__(self, jobname, buildno, testname, core_prov, fabric, + mpitype, hosts, ofi_build_mode, util_prov=None): + super().__init__(jobname, buildno, testname, core_prov, fabric, + mpitype, hosts, ofi_build_mode, util_prov) + + + if((self.core_prov == "verbs" or self.core_prov =="psm2")): + self.n = 16 + self.ppn = 8 + else: + self.n = 4 + self.ppn = 2 + + @property + def stress_cmd(self): + return "{}/{}/stress/mpi_stress -dcr".format(self.libfab_installpath, self.mpi) + + @property + def execute_condn(self): + # Todo : run stress test for ompi with libfabirc-dbg builds if it works + # in Jenkins for buildbot these ompi did 
not build with libfabric-dbg + + # Due to an mpich issue when the correct mpich options are enabled during + # mpich builds, stress test is failing. Disabling mpich + stress tests + # until the mpich team fixes the issue. + return True if ((self.job_cadence == 'daily') and \ + (self.mpi != 'ompi' or \ + self.ofi_build_mode != 'dbg')) else False + + def execute_cmd(self): + command = self.cmd + self.options + self.stress_cmd + outputcmd = shlex.split(command) + common.run_command(outputcmd) + + + +class MpiTestOSU(MpiTests): + + def __init__(self, jobname, buildno, testname, core_prov, fabric, + mpitype, hosts, ofi_build_mode, util_prov=None): + super().__init__(jobname, buildno, testname, core_prov, fabric, + mpitype, hosts, ofi_build_mode, util_prov) + + self.n = 4 + self.ppn = 2 + self.two_proc_tests = {'osu_latency', + 'osu_bibw', + 'osu_latency_mt', + 'osu_bw','osu_get_latency', + 'osu_fop_latency', + 'osu_acc_latency', + 'osu_get_bw', + 'osu_put_latency', + 'osu_put_bw', + 'osu_put_bibw', + 'osu_cas_latency', + 'osu_get_acc_latency' + } + + self.osu_mpi_path = "{}/{}/osu/libexec/osu-micro-benchmarks/mpi/". \ + format(self.libfab_installpath,mpitype) + + @property + def execute_condn(self): + # sockets and psm2 have some issues with OSU benchmark testing. + return True if ((self.job_cadence == 'daily') and \ + (self.mpi != "ompi" or \ + (self.core_prov != "sockets" and \ + self.core_prov != "psm2" and \ + self.ofi_build_mode!="dbg"))) \ + else False + + def execute_cmd(self): + assert(self.osu_mpi_path) + p = re.compile('osu_put*') + for root, dirs, tests in os.walk(self.osu_mpi_path): + for test in tests: + if test in self.two_proc_tests: + self.n=2 + self.ppn=1 + else: + self.n=4 + self.ppn=2 + # for sockets provider skip 'osu_put' benchmark tests as they fail. + if(self.core_prov !='sockets' or p.search(test)== None): + launcher = self.cmd + self.options + osu_cmd = os.path.join(root, test) + command = launcher + osu_cmd + outputcmd = shlex.split(command) + common.run_command(outputcmd) + + diff --git a/docs/policy b/docs/policy index 5cf066c143b..25c860ea945 100644 --- a/docs/policy +++ b/docs/policy @@ -4,7 +4,13 @@ This document describes the general policies and procedures that are followed by the libfabric development community. It is best viewed as a guideline, and is not a formal or legal document. -Code contributions + +DEVELOPER GUIDELINES +==================== +The following guidelines are helpful for developers new to the +libfabric community and open source development. + +Code Contributions ------------------ Any developers wishing to contribute to libfabric may do so, provided that they adhere to the CONTRIBUTORS agreement in the root @@ -13,31 +19,6 @@ or documents that must be signed prior to submitting code. Developers need the rights to submit the code being introduced, and the code must meet the license requirements of the project. -Git Repository Admin --------------------- -The number of people with administrative access to the github repo -will be limited. Traditionally, this has been around three developers who -are active in the project, and are from different companies. Admins -will typically have the same limitations as those with write access to -the repo, such as no forced updates. - -Git Write Access ----------------- -Because of the scope of the project, there may be several people (more -than 10) with write access. Most writers are maintainers for a -specific provider in the project.
As a general rule, writers should only -commit changes to the subdirectory that corresponds with the provider -that they are maintaining. Changes made to other providers or the -libfabric core must be approved prior by the relevant owners prior to -being merged. - -Core Changes ------------- -Updates to the libfabric core should be reviewed by at least one other -developer. Changes to the API should be brought to the attention of -the OFIWG mailing list, with significant changes discussed prior to -being implemented. - Patch Submission ---------------- Patches should be submitted directly to github as part of a pull request. @@ -45,6 +26,43 @@ For patches that touch the external API or introduce or modify core functionality significantly, an email should be sent to the ofiwg mail list with a link to the pull request. +Patches should include a clear description of the problem that the patch +is addressing, and how it does so. One or two line descriptions are +almost never sufficient, except for the most trivial code changes. +The description should stand on its own, and provide enough context +for someone to determine what the patch does, without needing to read +the accompanying code changes. Oftentimes, the purpose of a patch is +made clearer as part of a review discussion. When this occurs, the +portion of the discussion clarifying the purpose of a change should be +folded into the patch description. + +Each patch should address a single problem. When a patch description +indicates that a patch does A, B, and C, that's usually the indication +that the patch should have been split into three separate patches. +An exception may be made if an unrelated change occurs in the code that +surrounds the patch, provided that the change is trivial. For example, +white space cleanup or fixing typos in comments may be allowed to slip +through the review process, even though those changes are unrelated to +the patch. + +No single patch should ever break the build or result in incorrect operation. +That is, arbitrarily breaking up a patch into two or more pieces, which all +need to be applied to bring the repository back into a stable state is not +allowed. + +One of the most common reasons that a patch is rejected is that it is +trying to change too many things at once. The standard argument back is +that the developer viewed the entire set of changes as one entity. The +best chance of having code accepted with minimal changes requested is +to keep patches small. If a large set of changes requires restructuring +the existing code, then separate out the restructuring into its own set +of patches. It's okay for a patch to do nothing significant other than +prepare the code for a follow-on patch. In fact, it's often preferred, +as that can help identify alternatives that weren't considered. + +For help on how to write a good patch and patch description, search the +web. There are plenty of helpful tutorials out there. + Pull Requests ------------- A number of continuous integration tests run against all pull requests. @@ -57,6 +75,44 @@ may be ignored, and the pull request merged. It is the responsibility of the person committing the request to the repo to confirm that any CI failures are unrelated to the changes in the pull request. +Core Changes +------------ +Updates to the libfabric core should be reviewed by at least one other +developer. Changes to the API should be brought to the attention of +the OFIWG mailing list, with significant changes discussed prior to +being implemented.
+ +API Changes +----------- +All files under the include/rdma subdirectory are maintained as part of +the stable libfabric API. Any changes to those files will receive a +strongly scrutinized review, as changes there have a much broader impact +across not just the project, but the entire libfabric software ecosystem. +For additional details, see include/ofi_abi.h before deciding that you +really don't need that API change. :) + + +PROJECT ADMINISTRATION +====================== + +Git Repository Admin +-------------------- +The number of people with administrative access to the github repo +will be limited. Traditionally, this has been around three developers who +are active in the project, and are from different companies. Admins +will typically have the same limitations as those with write access to +the repo, such as no forced updates. + +Git Write Access +---------------- +Because of the scope of the project, there may be several people (more +than 10) with write access. Most writers are maintainers for a +specific provider in the project. As a general rule, writers should only +commit changes to the subdirectory that corresponds with the provider +that they are maintaining. Changes made to other providers or the +libfabric core must be approved by the relevant owners prior to +being merged. + Releases -------- A wiki page maintained on github with the repo provides a full checklist diff --git a/docs/providers b/docs/providers index 8fbc0f57013..8ac9b4ba01f 100644 --- a/docs/providers +++ b/docs/providers @@ -106,5 +106,7 @@ fi_poll_fd() - call poll() on an fd fi_wait_cond() - wait on a mutex fi_datatype_size() - return size of an atomic datatype fi_[capability]_allowed() - routines to check caps bits -fi_gettime_ms() - return current time in milliseconds +ofi_gettime_ns() - return current time in nanoseconds +ofi_gettime_us() - return current time in microseconds +ofi_gettime_ms() - return current time in milliseconds fi_fd_nonblock() - set fd to nonblocking diff --git a/fabtests/COPYING b/fabtests/COPYING index 22e8703de8c..14257a21634 100644 --- a/fabtests/COPYING +++ b/fabtests/COPYING @@ -7,7 +7,7 @@ Some parts of the source are 3rd party code which uses MIT license. The description and requirements of the license are available in later part of this file. -Copyright (c) 2015-2018 Intel Corporation. All rights reserved. +Copyright (c) 2015-2020 Intel Corporation. All rights reserved. Copyright (c) 2016-2018 Cisco Systems, Inc. All rights reserved.
================================================================== diff --git a/fabtests/Makefile.am b/fabtests/Makefile.am index 71bade82dbf..95e7b6318b2 100644 --- a/fabtests/Makefile.am +++ b/fabtests/Makefile.am @@ -4,10 +4,12 @@ ACLOCAL_AMFLAGS = -I config if MACOS os_excludes = -f ./test_configs/osx.exclude +AM_CFLAGS += -I$(srcdir)/include/osx endif if FREEBSD os_excludes = -f ./test_configs/freebsd.exclude +AM_CFLAGS += -I$(srcdir)/include/freebsd endif bin_PROGRAMS = \ @@ -16,7 +18,7 @@ bin_PROGRAMS = \ functional/fi_stream_msg \ functional/fi_msg_sockets \ functional/fi_rdm \ - functional/fi_rdm_rma_simple \ + functional/fi_rdm_rma_event \ functional/fi_rdm_rma_trigger \ functional/fi_rdm_deferred_wq \ functional/fi_dgram \ @@ -52,12 +54,14 @@ bin_PROGRAMS = \ unit/fi_eq_test \ unit/fi_cq_test \ unit/fi_mr_test \ + unit/fi_mr_cache_evict \ unit/fi_cntr_test \ unit/fi_av_test \ unit/fi_dom_test \ unit/fi_getinfo_test \ - unit/fi_resource_freeing \ - ubertest/fi_ubertest + ubertest/fi_ubertest \ + multinode/fi_multinode \ + multinode/fi_multinode_coll dist_bin_SCRIPTS = \ scripts/runfabtests.sh \ @@ -82,17 +86,22 @@ nobase_dist_config_DATA = \ test_configs/tcp/tcp.exclude \ test_configs/verbs/all.test \ test_configs/verbs/quick.test \ - test_configs/verbs/exclude \ + test_configs/verbs/verbs.exclude \ test_configs/usnic/all.test \ test_configs/usnic/quick.test \ test_configs/psm/all.test \ test_configs/psm2/all.test \ test_configs/psm2/verify.test \ test_configs/psm2/psm2.exclude \ - test_configs/ofi_rxm/verbs/all.test \ - test_configs/ofi_rxm/verbs/exclude \ - test_configs/ofi_rxd/ofi_rxd.exclude \ + test_configs/psm3/all.test \ + test_configs/psm3/verify.test \ + test_configs/psm3/psm3.exclude \ + test_configs/ofi_rxm/tcp.test \ + test_configs/ofi_rxm/verbs.test \ + test_configs/ofi_rxm/ofi_rxm.exclude \ test_configs/ofi_rxd/udp.test \ + test_configs/ofi_rxd/verbs.test \ + test_configs/ofi_rxd/ofi_rxd.exclude \ test_configs/shm/all.test \ test_configs/shm/shm.exclude \ test_configs/shm/quick.test \ @@ -100,10 +109,16 @@ nobase_dist_config_DATA = \ test_configs/efa/efa.exclude noinst_LTLIBRARIES = libfabtests.la + libfabtests_la_SOURCES = \ common/shared.c \ common/jsmn.c \ + common/hmem.c \ + common/hmem_cuda.c \ + common/hmem_rocr.c \ + common/hmem_ze.c \ include/shared.h \ + include/hmem.h \ include/jsmn.h \ include/unix/osd.h \ include/ft_osd.h @@ -150,9 +165,9 @@ functional_fi_rdm_shared_av_SOURCES = \ functional/rdm_shared_av.c functional_fi_rdm_shared_av_LDADD = libfabtests.la -functional_fi_rdm_rma_simple_SOURCES = \ - functional/rdm_rma_simple.c -functional_fi_rdm_rma_simple_LDADD = libfabtests.la +functional_fi_rdm_rma_event_SOURCES = \ + functional/rdm_rma_event.c +functional_fi_rdm_rma_event_LDADD = libfabtests.la functional_fi_rdm_rma_trigger_SOURCES = \ functional/rdm_rma_trigger.c @@ -298,6 +313,11 @@ unit_fi_mr_test_SOURCES = \ $(unit_srcs) unit_fi_mr_test_LDADD = libfabtests.la +unit_fi_mr_cache_evict_SOURCES = \ + unit/mr_cache_evict.c \ + $(unit_srcs) +unit_fi_mr_cache_evict_LDADD = libfabtests.la + unit_fi_cntr_test_SOURCES = \ unit/cntr_test.c \ $(unit_srcs) @@ -318,10 +338,6 @@ unit_fi_getinfo_test_SOURCES = \ $(unit_srcs) unit_fi_getinfo_test_LDADD = libfabtests.la -unit_fi_resource_freeing_SOURCES = \ - unit/resource_freeing.c -unit_fi_resource_freeing_LDADD = libfabtests.la - ubertest_fi_ubertest_SOURCES = \ ubertest/fabtest.h \ ubertest/ofi_atomic.h \ @@ -337,6 +353,31 @@ ubertest_fi_ubertest_SOURCES = \ ubertest/test_ctrl.c ubertest_fi_ubertest_LDADD = 
libfabtests.la +multinode_fi_multinode_SOURCES = \ + multinode/src/harness.c \ + multinode/src/pattern.c \ + multinode/include/pattern.h \ + multinode/src/core.c \ + multinode/include/core.h + +multinode_fi_multinode_LDADD = libfabtests.la + +multinode_fi_multinode_CFLAGS = \ + $(AM_CFLAGS) \ + -I$(srcdir)/multinode/include + +multinode_fi_multinode_coll_SOURCES = \ + multinode/src/harness.c \ + multinode/src/core_coll.c \ + multinode/include/coll_test.h \ + multinode/include/core.h + +multinode_fi_multinode_coll_LDADD = libfabtests.la + +multinode_fi_multinode_coll_CFLAGS = \ + $(AM_CFLAGS) \ + -I$(srcdir)/multinode/include + real_man_pages = \ man/man7/fabtests.7 @@ -359,7 +400,7 @@ dummy_man_pages = \ man/man1/fi_rdm_deferred_wq.1 \ man/man1/fi_rdm_multi_domain.1 \ man/man1/fi_multi_recv.1 \ - man/man1/fi_rdm_rma_simple.1 \ + man/man1/fi_rdm_rma_event.1 \ man/man1/fi_rdm_rma_trigger.1 \ man/man1/fi_rdm_shared_av.1 \ man/man1/fi_rdm_tagged_peek.1 \ @@ -384,7 +425,6 @@ dummy_man_pages = \ man/man1/fi_eq_test.1 \ man/man1/fi_getinfo_test.1 \ man/man1/fi_mr_test.1 \ - man/man1/fi_resource_freeing.1 \ man/man1/fi_bw.1 \ man/man1/fi_ubertest.1 diff --git a/fabtests/Makefile.win b/fabtests/Makefile.win index 9be5046ffff..2c416dee98f 100644 --- a/fabtests/Makefile.win +++ b/fabtests/Makefile.win @@ -18,6 +18,10 @@ CFLAGS = $(CFLAGS) /Zi /Od /MTd outdir = $(output_root)$(arch)\debug-v141 CFLAGS = $(CFLAGS) /Zi /Od /MTd !endif +!if "$(config)" == "Debug-v142" +outdir = $(output_root)$(arch)\debug-v142 +CFLAGS = $(CFLAGS) /Zi /Od /MTd +!endif !if "$(config)" == "Release-v140" outdir = $(output_root)$(arch)\release-v140 CFLAGS = $(CFLAGS) /O2 /MT @@ -26,9 +30,14 @@ CFLAGS = $(CFLAGS) /O2 /MT outdir = $(output_root)$(arch)\release-v141 CFLAGS = $(CFLAGS) /O2 /MT !endif +!if "$(config)" == "Release-v142" +outdir = $(output_root)$(arch)\release-v142 +CFLAGS = $(CFLAGS) /O2 /MT +!endif -basedeps = common\shared.c common\jsmn.c common\windows\getopt.c \ - common\windows\osd.c +basedeps = common\hmem.c common\shared.c common\jsmn.c \ + common\windows\getopt.c common\windows\osd.c \ + common\hmem_cuda.c common\hmem_rocr.c common\hmem_ze.c includes = /Iinclude /Iinclude\windows /I..\include /FIft_osd.h \ /Iinclude\windows\getopt @@ -59,7 +68,7 @@ benchmarks: $(outdir)\msg_pingpong.exe $(outdir)\rdm_cntr_pingpong.exe \ functional: $(outdir)\cq_data.exe $(outdir)\dgram.exe $(outdir)\dgram_waitset.exe $(outdir)\msg.exe \ $(outdir)\msg_epoll.exe $(outdir)\msg_sockets.exe \ - $(outdir)\poll.exe $(outdir)\rdm.exe $(outdir)\rdm_rma_simple.exe $(outdir)\rdm_rma_trigger.exe \ + $(outdir)\poll.exe $(outdir)\rdm.exe $(outdir)\rdm_rma_event.exe $(outdir)\rdm_rma_trigger.exe \ $(outdir)\rdm_tagged_peek.exe $(outdir)\scalable_ep.exe $(outdir)\inj_complete.exe $(outdir)\bw.exe unit: $(outdir)\av_test.exe $(outdir)\dom_test.exe $(outdir)\eq_test.exe @@ -95,7 +104,7 @@ $(outdir)\poll.exe: {functional}poll.c $(basedeps) $(outdir)\rdm.exe: {functional}rdm.c $(basedeps) -$(outdir)\rdm_rma_simple.exe: {functional}rdm_rma_simple.c $(basedeps) +$(outdir)\rdm_rma_event.exe: {functional}rdm_rma_event.c $(basedeps) $(outdir)\rdm_rma_trigger.exe: {functional}rdm_rma_trigger.c $(basedeps) diff --git a/fabtests/README.md b/fabtests/README.md index 8448191b011..e3085c1c959 100644 --- a/fabtests/README.md +++ b/fabtests/README.md @@ -1,7 +1,3 @@ -[![Build Status](https://travis-ci.org/ofiwg/fabtests.svg?branch=master)](https://travis-ci.org/ofiwg/fabtests) -[![fabtests Coverity scan suild 
status](https://scan.coverity.com/projects/ofiwg-fabtests/badge.svg)](https://scan.coverity.com/projects/ofiwg-fabtests) -[![fabtests release version](https://img.shields.io/github/release/ofiwg/fabtests.svg)](https://github.com/ofiwg/fabtests/releases/latest) - # fabtests Fabtests provides a set of examples that uses diff --git a/fabtests/benchmarks/dgram_pingpong.c b/fabtests/benchmarks/dgram_pingpong.c index bb18adff008..74920a709f3 100644 --- a/fabtests/benchmarks/dgram_pingpong.c +++ b/fabtests/benchmarks/dgram_pingpong.c @@ -59,11 +59,7 @@ static int run(void) for (i = 0; i < TEST_CNT; i++) { if (!ft_use_size(i, opts.sizes_enabled)) continue; - opts.transfer_size = test_size[i].size; - if (opts.transfer_size > fi->ep_attr->max_msg_size) - continue; - init_test(&opts, test_name, sizeof(test_name)); ret = pingpong(); if (ret) @@ -122,6 +118,7 @@ int main(int argc, char **argv) hints->mode |= FI_CONTEXT; hints->domain_attr->mr_mode = opts.mr_mode; hints->domain_attr->threading = FI_THREAD_DOMAIN; + hints->tx_attr->tclass = FI_TC_LOW_LATENCY; ret = run(); diff --git a/fabtests/benchmarks/msg_bw.c b/fabtests/benchmarks/msg_bw.c index f273d6a1f54..84094216506 100644 --- a/fabtests/benchmarks/msg_bw.c +++ b/fabtests/benchmarks/msg_bw.c @@ -107,6 +107,8 @@ int main(int argc, char **argv) hints->domain_attr->resource_mgmt = FI_RM_ENABLED; hints->domain_attr->mr_mode = opts.mr_mode; hints->domain_attr->threading = FI_THREAD_DOMAIN; + hints->addr_format = opts.address_format; + hints->tx_attr->tclass = FI_TC_BULK_DATA; ret = run(); diff --git a/fabtests/benchmarks/msg_pingpong.c b/fabtests/benchmarks/msg_pingpong.c index ef342eae898..28331243108 100644 --- a/fabtests/benchmarks/msg_pingpong.c +++ b/fabtests/benchmarks/msg_pingpong.c @@ -107,6 +107,8 @@ int main(int argc, char **argv) hints->caps = FI_MSG; hints->domain_attr->mr_mode = opts.mr_mode; hints->domain_attr->threading = FI_THREAD_DOMAIN; + hints->addr_format = opts.address_format; + hints->tx_attr->tclass = FI_TC_LOW_LATENCY; ret = run(); diff --git a/fabtests/benchmarks/rdm_cntr_pingpong.c b/fabtests/benchmarks/rdm_cntr_pingpong.c index ab09dd170bf..61c64a6b548 100644 --- a/fabtests/benchmarks/rdm_cntr_pingpong.c +++ b/fabtests/benchmarks/rdm_cntr_pingpong.c @@ -100,6 +100,7 @@ int main(int argc, char **argv) hints->caps = FI_MSG; hints->domain_attr->mr_mode = opts.mr_mode; hints->domain_attr->threading = FI_THREAD_DOMAIN; + hints->tx_attr->tclass = FI_TC_LOW_LATENCY; ret = run(); diff --git a/fabtests/benchmarks/rdm_pingpong.c b/fabtests/benchmarks/rdm_pingpong.c index 800a7b63d01..bd2dc403645 100644 --- a/fabtests/benchmarks/rdm_pingpong.c +++ b/fabtests/benchmarks/rdm_pingpong.c @@ -74,7 +74,7 @@ int main(int argc, char **argv) if (!hints) return EXIT_FAILURE; - while ((op = getopt(argc, argv, "h" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != + while ((op = getopt(argc, argv, "Uh" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) { switch (op) { default: @@ -82,6 +82,9 @@ int main(int argc, char **argv) ft_parseinfo(op, optarg, hints, &opts); ft_parsecsopts(op, optarg, &opts); break; + case 'U': + hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE; + break; case '?': case 'h': ft_csusage(argv[0], "Ping pong client and server using RDM."); @@ -98,6 +101,7 @@ int main(int argc, char **argv) hints->mode = FI_CONTEXT; hints->domain_attr->mr_mode = opts.mr_mode; hints->domain_attr->threading = FI_THREAD_DOMAIN; + hints->tx_attr->tclass = FI_TC_LOW_LATENCY; ret = run(); diff --git a/fabtests/benchmarks/rdm_tagged_bw.c 
b/fabtests/benchmarks/rdm_tagged_bw.c index 5252d448489..323a66dc77b 100644 --- a/fabtests/benchmarks/rdm_tagged_bw.c +++ b/fabtests/benchmarks/rdm_tagged_bw.c @@ -77,13 +77,16 @@ int main(int argc, char **argv) if (!hints) return EXIT_FAILURE; - while ((op = getopt(argc, argv, "h" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) { + while ((op = getopt(argc, argv, "Uh" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) { switch (op) { default: ft_parse_benchmark_opts(op, optarg); ft_parseinfo(op, optarg, hints, &opts); ft_parsecsopts(op, optarg, &opts); break; + case 'U': + hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE; + break; case '?': case 'h': ft_csusage(argv[0], "Bandwidth test for RDM endpoints using tagged messages."); @@ -101,6 +104,7 @@ int main(int argc, char **argv) hints->mode = FI_CONTEXT; hints->domain_attr->mr_mode = opts.mr_mode; hints->domain_attr->threading = FI_THREAD_DOMAIN; + hints->tx_attr->tclass = FI_TC_BULK_DATA; ret = run(); diff --git a/fabtests/benchmarks/rdm_tagged_pingpong.c b/fabtests/benchmarks/rdm_tagged_pingpong.c index f21216aed4f..e0ca5f211d5 100644 --- a/fabtests/benchmarks/rdm_tagged_pingpong.c +++ b/fabtests/benchmarks/rdm_tagged_pingpong.c @@ -76,13 +76,16 @@ int main(int argc, char **argv) if (!hints) return EXIT_FAILURE; - while ((op = getopt(argc, argv, "h" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) { + while ((op = getopt(argc, argv, "Uh" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) { switch (op) { default: ft_parse_benchmark_opts(op, optarg); ft_parseinfo(op, optarg, hints, &opts); ft_parsecsopts(op, optarg, &opts); break; + case 'U': + hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE; + break; case '?': case 'h': ft_csusage(argv[0], "Ping pong client and server using tagged messages."); @@ -99,6 +102,7 @@ int main(int argc, char **argv) hints->mode = FI_CONTEXT; hints->domain_attr->mr_mode = opts.mr_mode; hints->domain_attr->threading = FI_THREAD_DOMAIN; + hints->tx_attr->tclass = FI_TC_LOW_LATENCY; ret = run(); diff --git a/fabtests/benchmarks/rma_bw.c b/fabtests/benchmarks/rma_bw.c index e4351c89bb2..0b2501543bb 100644 --- a/fabtests/benchmarks/rma_bw.c +++ b/fabtests/benchmarks/rma_bw.c @@ -95,8 +95,9 @@ int main(int argc, char **argv) hints->domain_attr->resource_mgmt = FI_RM_ENABLED; hints->mode = FI_CONTEXT; hints->domain_attr->threading = FI_THREAD_DOMAIN; + hints->addr_format = opts.address_format; - while ((op = getopt(argc, argv, "ho:" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) { + while ((op = getopt(argc, argv, "Uho:" CS_OPTS INFO_OPTS BENCHMARK_OPTS)) != -1) { switch (op) { default: ft_parse_benchmark_opts(op, optarg); @@ -106,6 +107,9 @@ int main(int argc, char **argv) if (ret) return ret; break; + case 'U': + hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE; + break; case '?': case 'h': ft_csusage(argv[0], "Bandwidth test using RMA operations."); @@ -123,6 +127,7 @@ int main(int argc, char **argv) opts.dst_addr = argv[optind]; hints->domain_attr->mr_mode = opts.mr_mode; + hints->tx_attr->tclass = FI_TC_BULK_DATA; ret = run(); diff --git a/fabtests/common/hmem.c b/fabtests/common/hmem.c new file mode 100644 index 00000000000..8736817e5bb --- /dev/null +++ b/fabtests/common/hmem.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2020 Intel Corporation. All rights reserved. 
+ * + * This software is available to you under the BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + */ + +#if HAVE_CONFIG_H + #include +#endif + +#include +#include +#include "hmem.h" + +static bool hmem_initialized = false; + +struct ft_hmem_ops { + int (*init)(void); + int (*cleanup)(void); + int (*alloc)(uint64_t device, void **buf, size_t size); + int (*free)(void *buf); + int (*memset)(uint64_t device, void *buf, int value, size_t size); + int (*copy_to_hmem)(uint64_t device, void *dst, const void *src, + size_t size); + int (*copy_from_hmem)(uint64_t device, void *dst, const void *src, + size_t size); +}; + +static struct ft_hmem_ops hmem_ops[] = { + [FI_HMEM_SYSTEM] = { + .init = ft_host_init, + .cleanup = ft_host_cleanup, + .alloc = ft_host_alloc, + .free = ft_host_free, + .memset = ft_host_memset, + .copy_to_hmem = ft_host_memcpy, + .copy_from_hmem = ft_host_memcpy, + }, + [FI_HMEM_CUDA] = { + .init = ft_cuda_init, + .cleanup = ft_cuda_cleanup, + .alloc = ft_cuda_alloc, + .free = ft_cuda_free, + .memset = ft_cuda_memset, + .copy_to_hmem = ft_cuda_copy_to_hmem, + .copy_from_hmem = ft_cuda_copy_from_hmem, + }, + [FI_HMEM_ROCR] = { + .init = ft_rocr_init, + .cleanup = ft_rocr_cleanup, + .alloc = ft_rocr_alloc, + .free = ft_rocr_free, + .memset = ft_rocr_memset, + .copy_to_hmem = ft_rocr_memcpy, + .copy_from_hmem = ft_rocr_memcpy, + }, + [FI_HMEM_ZE] = { + .init = ft_ze_init, + .cleanup = ft_ze_cleanup, + .alloc = ft_ze_alloc, + .free = ft_ze_free, + .memset = ft_ze_memset, + .copy_to_hmem = ft_ze_copy, + .copy_from_hmem = ft_ze_copy, + }, +}; + +int ft_hmem_init(enum fi_hmem_iface iface) +{ + int ret; + + ret = hmem_ops[iface].init(); + if (ret == FI_SUCCESS) + hmem_initialized = true; + + return ret; +} + +int ft_hmem_cleanup(enum fi_hmem_iface iface) +{ + int ret = FI_SUCCESS; + + if (hmem_initialized) { + ret = hmem_ops[iface].cleanup(); + if (ret == FI_SUCCESS) + hmem_initialized = false; + } + + return ret; +} + +int ft_hmem_alloc(enum fi_hmem_iface iface, uint64_t device, void **buf, + size_t size) +{ + return hmem_ops[iface].alloc(device, buf, size); +} + +int ft_hmem_free(enum fi_hmem_iface iface, void *buf) +{ + return hmem_ops[iface].free(buf); +} + +int ft_hmem_memset(enum fi_hmem_iface iface, uint64_t device, void *buf, + int value, size_t size) +{ + return hmem_ops[iface].memset(device, buf, value, size); +} + +int ft_hmem_copy_to(enum fi_hmem_iface iface, uint64_t device, void *dst, + const void *src, size_t size) +{ + return hmem_ops[iface].copy_to_hmem(device, dst, src, size); +} + +int 
ft_hmem_copy_from(enum fi_hmem_iface iface, uint64_t device, void *dst, + const void *src, size_t size) +{ + return hmem_ops[iface].copy_from_hmem(device, dst, src, size); +} diff --git a/fabtests/common/hmem_cuda.c b/fabtests/common/hmem_cuda.c new file mode 100644 index 00000000000..ea3cfb78954 --- /dev/null +++ b/fabtests/common/hmem_cuda.c @@ -0,0 +1,224 @@ +/* + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "hmem.h" +#include "shared.h" + +#ifdef HAVE_CUDA_RUNTIME_H + +#include +#include +#include + +struct cuda_ops { + cudaError_t (*cudaMemcpy)(void *dst, const void *src, size_t count, + enum cudaMemcpyKind kind); + cudaError_t (*cudaMalloc)(void **ptr, size_t size); + cudaError_t (*cudaFree)(void *ptr); + cudaError_t (*cudaMemset)(void *ptr, int value, size_t count); + const char *(*cudaGetErrorName)(cudaError_t error); + const char *(*cudaGetErrorString)(cudaError_t error); +}; + +static struct cuda_ops cuda_ops; +static void *cudart_handle; + +#define CUDA_ERR(err, fmt, ...) 
\ + FT_ERR(fmt ": %s %s", ##__VA_ARGS__, cuda_ops.cudaGetErrorName(err), \ + cuda_ops.cudaGetErrorString(err)) + +int ft_cuda_init(void) +{ + cudart_handle = dlopen("libcudart.so", RTLD_NOW); + if (!cudart_handle) { + FT_ERR("Failed to dlopen libcudart.so"); + goto err; + } + + cuda_ops.cudaMemcpy = dlsym(cudart_handle, "cudaMemcpy"); + if (!cuda_ops.cudaMemcpy) { + FT_ERR("Failed to find cudaMemcpy"); + goto err_dlclose_cuda; + } + + cuda_ops.cudaMalloc = dlsym(cudart_handle, "cudaMalloc"); + if (!cuda_ops.cudaMalloc) { + FT_ERR("Failed to find cudaMalloc"); + goto err_dlclose_cuda; + } + + cuda_ops.cudaFree = dlsym(cudart_handle, "cudaFree"); + if (!cuda_ops.cudaFree) { + FT_ERR("Failed to find cudaFree"); + goto err_dlclose_cuda; + } + + cuda_ops.cudaMemset = dlsym(cudart_handle, "cudaMemset"); + if (!cuda_ops.cudaMemset) { + FT_ERR("Failed to find cudaMemset"); + goto err_dlclose_cuda; + } + + cuda_ops.cudaGetErrorName = dlsym(cudart_handle, "cudaGetErrorName"); + if (!cuda_ops.cudaGetErrorName) { + FT_ERR("Failed to find cudaGetErrorName"); + goto err_dlclose_cuda; + } + + cuda_ops.cudaGetErrorString = dlsym(cudart_handle, + "cudaGetErrorString"); + if (!cuda_ops.cudaGetErrorString) { + FT_ERR("Failed to find cudaGetErrorString"); + goto err_dlclose_cuda; + } + + return FI_SUCCESS; + +err_dlclose_cuda: + dlclose(cudart_handle); +err: + return -FI_ENODATA; +} + +int ft_cuda_cleanup(void) +{ + dlclose(cudart_handle); + return FI_SUCCESS; +} + +int ft_cuda_alloc(uint64_t device, void **buf, size_t size) +{ + cudaError_t cuda_ret; + + cuda_ret = cuda_ops.cudaMalloc(buf, size); + if (cuda_ret == cudaSuccess) + return FI_SUCCESS; + + CUDA_ERR(cuda_ret, "cudaMalloc failed"); + + return -FI_ENOMEM; +} + +int ft_cuda_free(void *buf) +{ + cudaError_t cuda_ret; + + cuda_ret = cuda_ops.cudaFree(buf); + if (cuda_ret == cudaSuccess) + return FI_SUCCESS; + + CUDA_ERR(cuda_ret, "cudaFree failed"); + + return -FI_EIO; +} + +int ft_cuda_memset(uint64_t device, void *buf, int value, size_t size) +{ + cudaError_t cuda_ret; + + cuda_ret = cuda_ops.cudaMemset(buf, value, size); + if (cuda_ret == cudaSuccess) + return FI_SUCCESS; + + CUDA_ERR(cuda_ret, "cudaMemset failed"); + + return -FI_EIO; +} + +int ft_cuda_copy_to_hmem(uint64_t device, void *dst, const void *src, + size_t size) +{ + cudaError_t cuda_ret; + + cuda_ret = cuda_ops.cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice); + if (cuda_ret == cudaSuccess) + return FI_SUCCESS; + + CUDA_ERR(cuda_ret, "cudaMemcpy failed"); + + return -FI_EIO; +} + +int ft_cuda_copy_from_hmem(uint64_t device, void *dst, const void *src, + size_t size) +{ + cudaError_t cuda_ret; + + cuda_ret = cuda_ops.cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost); + if (cuda_ret == cudaSuccess) + return FI_SUCCESS; + + CUDA_ERR(cuda_ret, "cudaMemcpy failed"); + + return -FI_EIO; +} + +#else + +int ft_cuda_init(void) +{ + return -FI_ENOSYS; +} + +int ft_cuda_cleanup(void) +{ + return -FI_ENOSYS; +} + +int ft_cuda_alloc(uint64_t device, void **buf, size_t size) +{ + return -FI_ENOSYS; +} + +int ft_cuda_free(void *buf) +{ + return -FI_ENOSYS; +} + +int ft_cuda_memset(uint64_t device, void *buf, int value, size_t size) +{ + return -FI_ENOSYS; +} + +int ft_cuda_copy_to_hmem(uint64_t device, void *dst, const void *src, + size_t size) +{ + return -FI_ENOSYS; +} + +int ft_cuda_copy_from_hmem(uint64_t device, void *dst, const void *src, + size_t size) +{ + return -FI_ENOSYS; +} + +#endif /* HAVE_CUDA_RUNTIME_H */ diff --git a/fabtests/common/hmem_rocr.c b/fabtests/common/hmem_rocr.c new 
file mode 100644 index 00000000000..3fed6d77e93 --- /dev/null +++ b/fabtests/common/hmem_rocr.c @@ -0,0 +1,378 @@ +/* + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "hmem.h" +#include "shared.h" + +#ifdef HAVE_ROCR_RUNTIME_H + +#include +#include +#include + +struct rocr_ops { + hsa_status_t (*hsa_memory_copy)(void *dst, const void *src, + size_t size); + hsa_status_t (*hsa_init)(void); + hsa_status_t (*hsa_shut_down)(void); + hsa_status_t (*hsa_status_string)(hsa_status_t status, + const char **status_string); + hsa_status_t (*hsa_agent_get_info)(hsa_agent_t agent, + hsa_agent_info_t attribute, + void *value); + hsa_status_t (*hsa_region_get_info)(hsa_region_t region, + hsa_region_info_t attribute, + void *value); + hsa_status_t (*hsa_iterate_agents) + (hsa_status_t (*cb)(hsa_agent_t agent, void* data), void *data); + hsa_status_t (*hsa_agent_iterate_regions) + (hsa_agent_t agent, + hsa_status_t (*cb)(hsa_region_t region, void* data), + void *data); + hsa_status_t (*hsa_memory_allocate)(hsa_region_t region, size_t size, + void **ptr); + hsa_status_t (*hsa_memory_free)(void *ptr); + hsa_status_t (*hsa_amd_memory_fill)(void* ptr, uint32_t value, + size_t count); +}; + +static struct rocr_ops rocr_ops; +static void *rocr_handle; + +static const char *hsa_status_to_string(hsa_status_t status) +{ + const char *str; + hsa_status_t hsa_ret; + + hsa_ret = rocr_ops.hsa_status_string(status, &str); + if (hsa_ret != HSA_STATUS_SUCCESS) + return "unknown error"; + + return str; +} + +#define ROCR_ERR(err, fmt, ...) 
\ + FT_ERR(fmt ": %s", ##__VA_ARGS__, hsa_status_to_string(err)) + +static hsa_agent_t gpu_agent; +static hsa_region_t gpu_region; + +static hsa_status_t agent_cb(hsa_agent_t agent, void *data) +{ + hsa_status_t hsa_ret; + hsa_device_type_t hsa_dev_type; + + hsa_ret = rocr_ops.hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, + (void *) &hsa_dev_type); + + if (hsa_ret == HSA_STATUS_SUCCESS && + hsa_dev_type == HSA_DEVICE_TYPE_GPU) { + gpu_agent = agent; + return HSA_STATUS_INFO_BREAK; + } + + return hsa_ret; +} + +static hsa_status_t region_cb(hsa_region_t region, void *data) +{ + hsa_status_t hsa_ret; + hsa_region_segment_t hsa_segment; + + hsa_ret = rocr_ops.hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, + &hsa_segment); + + if (hsa_ret == HSA_STATUS_SUCCESS && + hsa_segment == HSA_REGION_SEGMENT_GLOBAL) { + gpu_region = region; + return HSA_STATUS_INFO_BREAK; + } + + return hsa_ret; +} + +int ft_rocr_init(void) +{ + hsa_status_t hsa_ret; + + rocr_handle = dlopen("libhsa-runtime64.so", RTLD_NOW); + if (!rocr_handle) { + FT_ERR("Failed to dlopen libhsa-runtime64.so"); + goto err; + } + + rocr_ops.hsa_memory_copy = dlsym(rocr_handle, "hsa_memory_copy"); + if (!rocr_ops.hsa_memory_copy) { + FT_ERR("Failed to find hsa_memory_copy"); + goto err_dlclose_rocr; + } + + rocr_ops.hsa_init = dlsym(rocr_handle, "hsa_init"); + if (!rocr_ops.hsa_init) { + FT_ERR("Failed to find hsa_init"); + goto err_dlclose_rocr; + } + + rocr_ops.hsa_shut_down = dlsym(rocr_handle, "hsa_shut_down"); + if (!rocr_ops.hsa_shut_down) { + FT_ERR("Failed to find hsa_shut_down"); + goto err_dlclose_rocr; + } + + rocr_ops.hsa_status_string = dlsym(rocr_handle, "hsa_status_string"); + if (!rocr_ops.hsa_status_string) { + FT_ERR("Failed to find hsa_status_string"); + goto err_dlclose_rocr; + } + + rocr_ops.hsa_agent_get_info = dlsym(rocr_handle, "hsa_agent_get_info"); + if (!rocr_ops.hsa_agent_get_info) { + FT_ERR("Failed to find hsa_agent_get_info"); + goto err_dlclose_rocr; + } + + rocr_ops.hsa_region_get_info = dlsym(rocr_handle, + "hsa_region_get_info"); + if (!rocr_ops.hsa_region_get_info) { + FT_ERR("Failed to find hsa_region_get_info"); + goto err_dlclose_rocr; + } + + rocr_ops.hsa_iterate_agents = dlsym(rocr_handle, "hsa_iterate_agents"); + if (!rocr_ops.hsa_iterate_agents) { + FT_ERR("Failed to find hsa_iterate_agents"); + goto err_dlclose_rocr; + } + + rocr_ops.hsa_agent_iterate_regions = + dlsym(rocr_handle, "hsa_agent_iterate_regions"); + if (!rocr_ops.hsa_agent_iterate_regions) { + FT_ERR("Failed to find hsa_agent_iterate_regions"); + goto err_dlclose_rocr; + } + + rocr_ops.hsa_memory_allocate = + dlsym(rocr_handle, "hsa_memory_allocate"); + if (!rocr_ops.hsa_memory_allocate) { + FT_ERR("Failed to find hsa_memory_allocate"); + goto err_dlclose_rocr; + } + + rocr_ops.hsa_memory_free = dlsym(rocr_handle, "hsa_memory_free"); + if (!rocr_ops.hsa_memory_free) { + FT_ERR("Failed to find hsa_memory_free"); + goto err_dlclose_rocr; + } + + rocr_ops.hsa_amd_memory_fill = dlsym(rocr_handle, + "hsa_amd_memory_fill"); + if (!rocr_ops.hsa_amd_memory_fill) { + FT_ERR("Failed to find hsa_amd_memory_fill"); + goto err_dlclose_rocr; + } + + hsa_ret = rocr_ops.hsa_init(); + if (hsa_ret != HSA_STATUS_SUCCESS) { + ROCR_ERR(hsa_ret, "hsa_init failed"); + goto err_dlclose_rocr; + } + + hsa_ret = rocr_ops.hsa_iterate_agents(agent_cb, NULL); + if (hsa_ret != HSA_STATUS_INFO_BREAK) { + FT_ERR("Failed to find GPU agent"); + goto err_dlclose_rocr; + } + + hsa_ret = rocr_ops.hsa_agent_iterate_regions(gpu_agent, region_cb, + NULL); + if 
(hsa_ret != HSA_STATUS_INFO_BREAK) { + FT_ERR("Failed to find GPU region"); + goto err_dlclose_rocr; + } + + return FI_SUCCESS; + +err_dlclose_rocr: + dlclose(rocr_handle); +err: + return -FI_ENODATA; +} + +int ft_rocr_cleanup(void) +{ + hsa_status_t hsa_ret; + + hsa_ret = rocr_ops.hsa_shut_down(); + if (hsa_ret != HSA_STATUS_SUCCESS) { + ROCR_ERR(hsa_ret, "hsa_init failed"); + return -FI_ENODATA; + } + + dlclose(rocr_handle); + + return FI_SUCCESS; +} + +int ft_rocr_alloc(uint64_t device, void **buf, size_t size) +{ + hsa_status_t hsa_ret; + + hsa_ret = rocr_ops.hsa_memory_allocate(gpu_region, size, buf); + if (hsa_ret == HSA_STATUS_SUCCESS) + return FI_SUCCESS; + + ROCR_ERR(hsa_ret, "hsa_memory_allocate failed"); + + return -FI_ENOMEM; +} + +int ft_rocr_free(void *buf) +{ + hsa_status_t hsa_ret; + + hsa_ret = rocr_ops.hsa_memory_free(buf); + if (hsa_ret == HSA_STATUS_SUCCESS) + return FI_SUCCESS; + + ROCR_ERR(hsa_ret, "hsa_memory_free failed"); + + return -FI_EIO; +} + +#define ROCR_MEM_FILL_BYTE_ALIGNMENT 4U + +int ft_rocr_memset(uint64_t device, void *buf, int value, size_t size) +{ + unsigned char set_value = value; + void *mem_fill_ptr; + size_t mem_fill_size; + uint32_t mem_fill_value; + hsa_status_t hsa_ret; + unsigned char *ptr = buf; + int ret; + + /* Determine if ROCR memory fill can be used to set device memory. ROCR + * memory fill requires 4-byte alignment. + */ + mem_fill_ptr = (void *) ALIGN((uintptr_t) buf, + ROCR_MEM_FILL_BYTE_ALIGNMENT); + + /* Use ROCR memory copy to fill the start of the buffer until the buffer + * is correctly aligned. + */ + while (ptr != mem_fill_ptr && size > 0) { + ret = ft_rocr_memcpy(device, ptr, &set_value, sizeof(*ptr)); + if (ret != FI_SUCCESS) + return ret; + + size--; + ptr++; + } + + /* Use ROCR memory fill to fill the middle of the buffer. */ + if (size >= ROCR_MEM_FILL_BYTE_ALIGNMENT) { + mem_fill_size = ALIGN_DOWN(size, ROCR_MEM_FILL_BYTE_ALIGNMENT); + + memset(&mem_fill_value, set_value, sizeof(mem_fill_value)); + + hsa_ret = rocr_ops.hsa_amd_memory_fill(mem_fill_ptr, + mem_fill_value, + mem_fill_size / + ROCR_MEM_FILL_BYTE_ALIGNMENT); + if (hsa_ret != HSA_STATUS_SUCCESS) { + ROCR_ERR(hsa_ret, "hsa_amd_memory_fill failed"); + return -FI_EIO; + } + + size -= mem_fill_size; + ptr += mem_fill_size; + } + + /* Use ROCR memory copy to fill the end of the buffer. 
*/ + while (size > 0) { + ret = ft_rocr_memcpy(device, ptr, &set_value, sizeof(*ptr)); + if (ret != FI_SUCCESS) + return ret; + + size--; + ptr++; + } + + return FI_SUCCESS; +} + +int ft_rocr_memcpy(uint64_t device, void *dst, const void *src, size_t size) +{ + hsa_status_t hsa_ret; + + hsa_ret = rocr_ops.hsa_memory_copy(dst, src, size); + if (hsa_ret == HSA_STATUS_SUCCESS) + return FI_SUCCESS; + + ROCR_ERR(hsa_ret, "hsa_memory_copy failed"); + + return -FI_EIO; +} + +#else + +int ft_rocr_init(void) +{ + return -FI_ENOSYS; +} + +int ft_rocr_cleanup(void) +{ + return -FI_ENOSYS; +} + +int ft_rocr_alloc(uint64_t device, void **buf, size_t size) +{ + return -FI_ENOSYS; +} + +int ft_rocr_free(void *buf) +{ + return -FI_ENOSYS; +} + +int ft_rocr_memset(uint64_t device, void *buf, int value, size_t size) +{ + return -FI_ENOSYS; +} + +int ft_rocr_memcpy(uint64_t device, void *dst, const void *src, size_t size) +{ + return -FI_ENOSYS; +} + +#endif /* HAVE_ROCR_RUNTIME_H */ diff --git a/fabtests/common/hmem_ze.c b/fabtests/common/hmem_ze.c new file mode 100644 index 00000000000..759faa34bff --- /dev/null +++ b/fabtests/common/hmem_ze.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2020 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +#include +#endif + +#include "hmem.h" + +#ifdef HAVE_LIBZE + +#include + +#define ZE_MAX_DEVICES 4 + +static ze_context_handle_t context; +static ze_device_handle_t devices[ZE_MAX_DEVICES]; +static ze_command_queue_handle_t cmd_queue[ZE_MAX_DEVICES]; +static int num_devices = 0; + +static const ze_command_queue_desc_t cq_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + .pNext = NULL, + .ordinal = 0, + .index = 0, + .flags = 0, + .mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, + .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, +}; + +static const ze_command_list_desc_t cl_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, + .pNext = NULL, + .commandQueueGroupOrdinal = 0, + .flags = 0, +}; + +static const ze_device_mem_alloc_desc_t device_desc = { + .stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, + .pNext = NULL, + .flags = 0, + .ordinal = 0, +}; + +int ft_ze_init(void) +{ + ze_driver_handle_t driver; + ze_context_desc_t context_desc = {0}; + ze_result_t ze_ret; + uint32_t count; + + ze_ret = zeInit(ZE_INIT_FLAG_GPU_ONLY); + if (ze_ret) + return -FI_EIO; + + count = 1; + ze_ret = zeDriverGet(&count, &driver); + if (ze_ret) + return -FI_EIO; + + ze_ret = zeContextCreate(driver, &context_desc, &context); + if (ze_ret) + return -FI_EIO; + + count = 0; + ze_ret = zeDeviceGet(driver, &count, NULL); + if (ze_ret || count > ZE_MAX_DEVICES) + goto err;; + + ze_ret = zeDeviceGet(driver, &count, devices); + if (ze_ret) + goto err; + + for (num_devices = 0; num_devices < count; num_devices++) { + ze_ret = zeCommandQueueCreate(context, devices[num_devices], &cq_desc, + &cmd_queue[num_devices]); + if (ze_ret) + goto err; + } + + return FI_SUCCESS; + +err: + (void) ft_ze_cleanup(); + return -FI_EIO; +} + +int ft_ze_cleanup(void) +{ + int i, ret = FI_SUCCESS; + + for (i = 0; i < num_devices; i++) { + if (cmd_queue[i] && zeCommandQueueDestroy(cmd_queue[i])) + ret = -FI_EINVAL; + } + + if (zeContextDestroy(context)) + return -FI_EINVAL; + + return ret; +} + +int ft_ze_alloc(uint64_t device, void **buf, size_t size) +{ + return zeMemAllocDevice(context, &device_desc, size, 16, + devices[device], buf) ? -FI_EINVAL : 0; +} + +int ft_ze_free(void *buf) +{ + return zeMemFree(context, buf) ? 
-FI_EINVAL : FI_SUCCESS; +} + +int ft_ze_memset(uint64_t device, void *buf, int value, size_t size) +{ + ze_command_list_handle_t cmd_list; + ze_result_t ze_ret; + + ze_ret = zeCommandListCreate(context, devices[device], &cl_desc, &cmd_list); + if (ze_ret) + return -FI_EIO; + + ze_ret = zeCommandListAppendMemoryFill(cmd_list, buf, &value, + sizeof(value), size, NULL, 0, NULL); + if (ze_ret) + goto free; + + ze_ret = zeCommandListClose(cmd_list); + if (ze_ret) + goto free; + + ze_ret = zeCommandQueueExecuteCommandLists(cmd_queue[device], 1, + &cmd_list, NULL); + +free: + if (!zeCommandListDestroy(cmd_list) && !ze_ret) + return FI_SUCCESS; + + return -FI_EINVAL; +} + +int ft_ze_copy(uint64_t device, void *dst, const void *src, size_t size) +{ + ze_command_list_handle_t cmd_list; + ze_result_t ze_ret; + + ze_ret = zeCommandListCreate(context, devices[device], &cl_desc, &cmd_list); + if (ze_ret) + return -FI_EIO; + + ze_ret = zeCommandListAppendMemoryCopy(cmd_list, dst, src, size, NULL, 0, NULL); + if (ze_ret) + goto free; + + ze_ret = zeCommandListClose(cmd_list); + if (ze_ret) + goto free; + + ze_ret = zeCommandQueueExecuteCommandLists(cmd_queue[device], 1, + &cmd_list, NULL); + +free: + if (!zeCommandListDestroy(cmd_list) && !ze_ret) + return FI_SUCCESS; + + return -FI_EINVAL; +} + +#else + +int ft_ze_init(void) +{ + return -FI_ENOSYS; +} + +int ft_ze_cleanup(void) +{ + return -FI_ENOSYS; +} + +int ft_ze_alloc(uint64_t device, void **buf, size_t size) +{ + return -FI_ENOSYS; +} + +int ft_ze_free(void *buf) +{ + return -FI_ENOSYS; +} + +int ft_ze_memset(uint64_t device, void *buf, int value, size_t size) +{ + return -FI_ENOSYS; +} + +int ft_ze_copy(uint64_t device, void *dst, const void *src, size_t size) +{ + return -FI_ENOSYS; +} + + +#endif /* HAVE_LIBZE */ diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index 44304ab99da..96b90a37986 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -46,8 +46,10 @@ #include #include #include +#include #include +#include struct fi_info *fi_pep, *fi, *hints; struct fid_fabric *fabric; @@ -200,6 +202,10 @@ static void ft_cq_set_wait_attr(void) cq_attr.wait_obj = FI_WAIT_FD; cq_attr.wait_cond = FI_CQ_COND_NONE; break; + case FT_COMP_YIELD: + cq_attr.wait_obj = FI_WAIT_YIELD; + cq_attr.wait_cond = FI_CQ_COND_NONE; + break; default: cq_attr.wait_obj = FI_WAIT_NONE; break; @@ -219,6 +225,9 @@ static void ft_cntr_set_wait_attr(void) case FT_COMP_WAIT_FD: cntr_attr.wait_obj = FI_WAIT_FD; break; + case FT_COMP_YIELD: + cntr_attr.wait_obj = FI_WAIT_YIELD; + break; default: cntr_attr.wait_obj = FI_WAIT_NONE; break; @@ -350,6 +359,48 @@ void ft_free_bit_combo(uint64_t *combo) free(combo); } +static int ft_reg_mr(void *buf, size_t size, uint64_t access, + uint64_t key, struct fid_mr **mr, void **desc) +{ + struct fi_mr_attr attr = {0}; + struct iovec iov = {0}; + int ret; + + if (((!(fi->domain_attr->mr_mode & FI_MR_LOCAL) && + !(opts.options & FT_OPT_USE_DEVICE)) || + (!(fi->domain_attr->mr_mode & FI_MR_HMEM) && + opts.options & FT_OPT_USE_DEVICE)) && + !(fi->caps & (FI_RMA | FI_ATOMIC))) + return 0; + + iov.iov_base = buf; + iov.iov_len = size; + attr.mr_iov = &iov; + attr.iov_count = 1; + attr.access = access; + attr.offset = 0; + attr.requested_key = key; + attr.context = NULL; + attr.iface = opts.iface; + + switch (opts.iface) { + case FI_HMEM_ZE: + attr.device.ze = opts.device; + break; + default: + break; + } + + ret = fi_mr_regattr(domain, &attr, 0, mr); + if (ret) + return ret; + + if (desc) + *desc = fi_mr_desc(*mr); + 
+ return FI_SUCCESS; +} + static int ft_alloc_ctx_array(struct ft_context **mr_array, char ***mr_bufs, char *default_buf, size_t mr_size, uint64_t start_key) @@ -371,25 +422,23 @@ static int ft_alloc_ctx_array(struct ft_context **mr_array, char ***mr_bufs, for (i = 0; i < opts.window_size; i++) { context = &(*mr_array)[i]; if (!(opts.options & FT_OPT_ALLOC_MULT_MR)) { - context->buf = default_buf; + context->buf = default_buf + mr_size * i; + context->mr = mr; + context->desc = mr_desc; continue; } - (*mr_bufs)[i] = calloc(1, mr_size); + ret = ft_hmem_alloc(opts.iface, opts.device, + (void **) &((*mr_bufs)[i]), mr_size); + if (ret) + return ret; + context->buf = (*mr_bufs)[i]; - if (((fi->domain_attr->mr_mode & FI_MR_LOCAL) || - (fi->caps & (FI_RMA | FI_ATOMIC)))) { - ret = fi_mr_reg(domain, context->buf, - mr_size, access, 0, - start_key + i, 0, - &context->mr, NULL); - if (ret) - return ret; - context->desc = fi_mr_desc(context->mr); - } else { - context->mr = NULL; - context->desc = NULL; - } + ret = ft_reg_mr(context->buf, mr_size, access, + start_key + i, &context->mr, + &context->desc); + if (ret) + return ret; } return 0; @@ -426,11 +475,12 @@ static int ft_alloc_msgs(void) } else { ft_set_tx_rx_sizes(&tx_size, &rx_size); tx_mr_size = 0; - rx_mr_size = 0; - buf_size = MAX(tx_size, FT_MAX_CTRL_MSG) + MAX(rx_size, FT_MAX_CTRL_MSG); + rx_mr_size = 0; + buf_size = MAX(tx_size, FT_MAX_CTRL_MSG) * opts.window_size + + MAX(rx_size, FT_MAX_CTRL_MSG) * opts.window_size; } - if (opts.options & FT_OPT_ALIGN) { + if (opts.options & FT_OPT_ALIGN && !(opts.options & FT_OPT_USE_DEVICE)) { alignment = sysconf(_SC_PAGESIZE); if (alignment < 0) return -errno; @@ -443,30 +493,28 @@ static int ft_alloc_msgs(void) return ret; } } else { - buf = malloc(buf_size); - if (!buf) { - perror("malloc"); - return -FI_ENOMEM; - } + ret = ft_hmem_alloc(opts.iface, opts.device, (void **) &buf, buf_size); + if (ret) + return ret; } - memset(buf, 0, buf_size); + ret = ft_hmem_memset(opts.iface, opts.device, (void *) buf, 0, buf_size); + if (ret) + return ret; rx_buf = buf; - tx_buf = (char *) buf + MAX(rx_size, FT_MAX_CTRL_MSG); - tx_buf = (void *) (((uintptr_t) tx_buf + alignment - 1) & - ~(alignment - 1)); + + if (opts.options & FT_OPT_ALLOC_MULT_MR) + tx_buf = (char *) buf + MAX(rx_size, FT_MAX_CTRL_MSG); + else + tx_buf = (char *) buf + MAX(rx_size, FT_MAX_CTRL_MSG) * opts.window_size; remote_cq_data = ft_init_cq_data(fi); - if (!ft_mr_alloc_func && !ft_check_opts(FT_OPT_SKIP_REG_MR) && - ((fi->domain_attr->mr_mode & FI_MR_LOCAL) || - (fi->caps & (FI_RMA | FI_ATOMIC)))) { - ret = fi_mr_reg(domain, buf, buf_size, ft_info_to_mr_access(fi), - 0, FT_MR_KEY, 0, &mr, NULL); - if (ret) { - FT_PRINTERR("fi_mr_reg", ret); + mr = &no_mr; + if (!ft_mr_alloc_func && !ft_check_opts(FT_OPT_SKIP_REG_MR)) { + ret = ft_reg_mr(buf, buf_size, ft_info_to_mr_access(fi), + FT_MR_KEY, &mr, &mr_desc); + if (ret) return ret; - } - mr_desc = ft_check_mr_local_flag(fi) ? 
fi_mr_desc(mr) : NULL; } else { if (ft_mr_alloc_func) { assert(!ft_check_opts(FT_OPT_SKIP_REG_MR)); @@ -474,7 +522,6 @@ static int ft_alloc_msgs(void) if (ret) return ret; } - mr = &no_mr; } ret = ft_alloc_ctx_array(&tx_ctx_arr, &tx_mr_bufs, tx_buf, @@ -512,6 +559,14 @@ int ft_open_fabric_res(void) return ret; } + if (opts.options & FT_OPT_DOMAIN_EQ) { + ret = fi_domain_bind(domain, &eq->fid, 0); + if (ret) { + FT_PRINTERR("fi_domain_bind", ret); + return ret; + } + } + return 0; } @@ -629,15 +684,23 @@ int ft_alloc_active_res(struct fi_info *fi) return 0; } -static void ft_init(void) +static int ft_init(void) { tx_seq = 0; rx_seq = 0; tx_cq_cntr = 0; rx_cq_cntr = 0; + + //If using device memory for transfers, require OOB address + //exchange because extra steps are involved when passing + //device buffers into fi_av_insert + if (opts.options & FT_OPT_ENABLE_HMEM) + opts.options |= FT_OPT_OOB_ADDR_EXCH; + + return ft_hmem_init(opts.iface); } -static int ft_init_oob(void) +int ft_init_oob(void) { int ret, op, err; struct addrinfo *ai = NULL; @@ -697,6 +760,19 @@ static int ft_init_oob(void) return ret; } +int ft_accept_next_client() { + int ret; + + if (!ft_check_opts(FT_OPT_SKIP_MSG_ALLOC) && (fi->caps & (FI_MSG | FI_TAGGED))) { + /* Initial receive will get remote address for unconnected EPs */ + ret = ft_post_rx(ep, MAX(rx_size, FT_MAX_CTRL_MSG), &rx_ctx); + if (ret) + return ret; + } + + return ft_init_av(); +} + int ft_getinfo(struct fi_info *hints, struct fi_info **info) { char *node, *service; @@ -710,6 +786,11 @@ int ft_getinfo(struct fi_info *hints, struct fi_info **info) if (!hints->ep_attr->type) hints->ep_attr->type = FI_EP_RDM; + if (opts.options & FT_OPT_ENABLE_HMEM) { + hints->caps |= FI_HMEM; + hints->domain_attr->mr_mode |= FI_MR_HMEM; + } + ret = fi_getinfo(FT_FIVERSION, node, service, flags, hints, info); if (ret) { FT_PRINTERR("fi_getinfo", ret); @@ -742,7 +823,10 @@ int ft_start_server(void) { int ret; - ft_init(); + ret = ft_init(); + if (ret) + return ret; + ret = ft_init_oob(); if (ret) return ret; @@ -862,6 +946,14 @@ int ft_server_connect(void) goto err; } + if (opts.options & FT_OPT_DOMAIN_EQ) { + ret = fi_domain_bind(domain, &eq->fid, 0); + if (ret) { + FT_PRINTERR("fi_domain_bind", ret); + return ret; + } + } + ret = ft_alloc_active_res(fi); if (ret) goto err; @@ -904,7 +996,10 @@ int ft_client_connect(void) { int ret; - ft_init(); + ret = ft_init(); + if (ret) + return ret; + ret = ft_init_oob(); if (ret) return ret; @@ -936,7 +1031,10 @@ int ft_init_fabric(void) { int ret; - ft_init(); + ret = ft_init(); + if (ret) + return ret; + ret = ft_init_oob(); if (ret) return ret; @@ -995,7 +1093,8 @@ int ft_enable_ep(struct fid_ep *ep, struct fid_eq *eq, struct fid_av *av, uint64_t flags; int ret; - if (fi->ep_attr->type == FI_EP_MSG || fi->caps & FI_MULTICAST) + if ((fi->ep_attr->type == FI_EP_MSG || fi->caps & FI_MULTICAST || + fi->caps & FI_COLLECTIVE) && !(opts.options & FT_OPT_DOMAIN_EQ)) FT_EP_BIND(ep, eq, 0); FT_EP_BIND(ep, av, 0); @@ -1143,7 +1242,7 @@ int ft_exchange_addresses_oob(struct fid_av *av_ptr, struct fid_ep *ep_ptr, ret = ft_av_insert(av_ptr, buf, 1, remote_addr, 0, NULL); if (ret) - return ret; + return ret; return 0; } @@ -1309,7 +1408,7 @@ int ft_exchange_raw_keys(struct fi_rma_iov *peer_iov) if (ret) return ret; - ret = ft_tx(ep, remote_fi_addr, len, &tx_ctx); + ret = ft_tx(ep, remote_fi_addr, len, &tx_ctx); if (ret) return ret; @@ -1422,21 +1521,21 @@ int ft_exchange_keys(struct fi_rma_iov *peer_iov) static void ft_cleanup_mr_array(struct 
ft_context *ctx_arr, char **mr_bufs) { - int i; + int i, ret; if (!mr_bufs) return; for (i = 0; i < opts.window_size; i++) { FT_CLOSE_FID(ctx_arr[i].mr); - free(mr_bufs[i]); + ret = ft_hmem_free(opts.iface, mr_bufs[i]); + if (ret) + FT_PRINTERR("ft_hmem_free", ret); } } static void ft_close_fids(void) { - if (mr != &no_mr) - FT_CLOSE_FID(mr); FT_CLOSE_FID(mc); FT_CLOSE_FID(alias_ep); FT_CLOSE_FID(ep); @@ -1450,15 +1549,19 @@ static void ft_close_fids(void) FT_CLOSE_FID(rxcntr); FT_CLOSE_FID(txcntr); FT_CLOSE_FID(pollset); + if (mr != &no_mr) + FT_CLOSE_FID(mr); FT_CLOSE_FID(av); - FT_CLOSE_FID(eq); FT_CLOSE_FID(domain); + FT_CLOSE_FID(eq); FT_CLOSE_FID(waitset); FT_CLOSE_FID(fabric); } void ft_free_res(void) { + int ret; + ft_cleanup_mr_array(tx_ctx_arr, tx_mr_bufs); ft_cleanup_mr_array(rx_ctx_arr, rx_mr_bufs); @@ -1470,7 +1573,9 @@ void ft_free_res(void) ft_close_fids(); if (buf) { - free(buf); + ret = ft_hmem_free(opts.iface, buf); + if (ret) + FT_PRINTERR("ft_hmem_free", ret); buf = rx_buf = tx_buf = NULL; buf_size = rx_size = tx_size = tx_mr_size = rx_mr_size = 0; } @@ -1486,6 +1591,10 @@ void ft_free_res(void) fi_freeinfo(hints); hints = NULL; } + + ret = ft_hmem_cleanup(opts.iface); + if (ret) + FT_PRINTERR("ft_hmem_cleanup", ret); } static int dupaddr(void **dst_addr, size_t *dst_addrlen, @@ -1547,7 +1656,7 @@ int ft_read_addr_opts(char **node, char **service, struct fi_info *hints, { int ret; - if (opts->dst_addr) { + if (opts->dst_addr && (opts->src_addr || !opts->oob_port)){ if (!opts->dst_port) opts->dst_port = default_port; @@ -1855,7 +1964,7 @@ ssize_t ft_post_rma_inject(enum ft_rma_opcodes op, struct fid_ep *ep, size_t siz switch (op) { case FT_RMA_WRITE: FT_POST(fi_inject_write, ft_progress, txcq, tx_seq, &tx_cq_cntr, - "fi_inject_write", ep, tx_buf, opts.transfer_size, + "fi_inject_write", ep, tx_buf, opts.transfer_size, remote_fi_addr, remote->addr, remote->key); break; case FT_RMA_WRITEDATA: @@ -2140,6 +2249,7 @@ static int ft_get_cq_comp(struct fid_cq *cq, uint64_t *cur, switch (opts.comp_method) { case FT_COMP_SREAD: + case FT_COMP_YIELD: ret = ft_wait_for_comp(cq, cur, total, timeout); break; case FT_COMP_WAIT_FD: @@ -2207,6 +2317,7 @@ static int ft_get_cntr_comp(struct fid_cntr *cntr, uint64_t total, int timeout) case FT_COMP_SREAD: case FT_COMP_WAITSET: case FT_COMP_WAIT_FD: + case FT_COMP_YIELD: ret = ft_wait_for_cntr(cntr, total, timeout); break; default: @@ -2527,7 +2638,6 @@ int ft_finalize_ep(struct fid_ep *ep) int ret; struct fi_context ctx; - strcpy(tx_buf + ft_tx_prefix_size(), "fin"); iov.iov_base = tx_buf; iov.iov_len = 4 + ft_tx_prefix_size(); @@ -2543,7 +2653,9 @@ int ft_finalize_ep(struct fid_ep *ep) tmsg.ignore = 0; tmsg.context = &ctx; - ret = fi_tsendmsg(ep, &tmsg, FI_INJECT | FI_TRANSMIT_COMPLETE); + FT_POST(fi_tsendmsg, ft_progress, txcq, tx_seq, + &tx_cq_cntr, "tsendmsg", ep, &tmsg, + FI_TRANSMIT_COMPLETE); } else { struct fi_msg msg; @@ -2554,15 +2666,12 @@ int ft_finalize_ep(struct fid_ep *ep) msg.addr = remote_fi_addr; msg.context = &ctx; - ret = fi_sendmsg(ep, &msg, FI_INJECT | FI_TRANSMIT_COMPLETE); - } - if (ret) { - FT_PRINTERR("transmit", ret); - return ret; + FT_POST(fi_sendmsg, ft_progress, txcq, tx_seq, + &tx_cq_cntr, "sendmsg", ep, &msg, + FI_TRANSMIT_COMPLETE); } - - ret = ft_get_tx_comp(++tx_seq); + ret = ft_get_tx_comp(tx_seq); if (ret) return ret; @@ -2678,6 +2787,9 @@ void ft_addr_usage() "synchronization over the, optional, port"); FT_PRINT_OPTS_USAGE("-E[=]", "enable out-of-band address exchange only " "over the, optional, 
port"); + FT_PRINT_OPTS_USAGE("-C ", "number of connections to accept before " + "cleaning up a server"); + FT_PRINT_OPTS_USAGE("-F ", "Address format (default:FI_FORMAT_UNSPEC)"); } void ft_usage(char *name, char *desc) @@ -2705,6 +2817,15 @@ void ft_usage(char *name, char *desc) FT_PRINT_OPTS_USAGE("", "fi_resmgmt_test"); FT_PRINT_OPTS_USAGE("", "fi_inj_complete"); FT_PRINT_OPTS_USAGE("", "fi_bw"); + FT_PRINT_OPTS_USAGE("-U", "run fabtests with FI_DELIVERY_COMPLETE set"); + FT_PRINT_OPTS_USAGE("", "Only the following tests support this option for now:"); + FT_PRINT_OPTS_USAGE("", "fi_bw"); + FT_PRINT_OPTS_USAGE("", "fi_rdm"); + FT_PRINT_OPTS_USAGE("", "fi_rdm_atomic"); + FT_PRINT_OPTS_USAGE("", "fi_rdm_pingpong"); + FT_PRINT_OPTS_USAGE("", "fi_rdm_tagged_bw"); + FT_PRINT_OPTS_USAGE("", "fi_rdm_tagged_pingpong"); + FT_PRINT_OPTS_USAGE("", "fi_rma_bw"); FT_PRINT_OPTS_USAGE("-M ", "Disable mode bit from test"); FT_PRINT_OPTS_USAGE("", "mr_local"); FT_PRINT_OPTS_USAGE("-a
", "name of address vector"); @@ -2729,6 +2850,10 @@ void ft_mcusage(char *name, char *desc) FT_PRINT_OPTS_USAGE("-p ", "specific provider name eg sockets, verbs"); FT_PRINT_OPTS_USAGE("-d ", "domain name"); FT_PRINT_OPTS_USAGE("-p ", "specific provider name eg sockets, verbs"); + FT_PRINT_OPTS_USAGE("-D ", "Specify device interface: eg ze (default: None). " + "Automatically enables FI_HMEM (-H)"); + FT_PRINT_OPTS_USAGE("-i ", "Specify which device to use (default: 0)"); + FT_PRINT_OPTS_USAGE("-H", "Enable provider FI_HMEM support"); FT_PRINT_OPTS_USAGE("-h", "display this help output"); return; @@ -2738,12 +2863,15 @@ void ft_csusage(char *name, char *desc) { ft_usage(name, desc); FT_PRINT_OPTS_USAGE("-I ", "number of iterations"); + FT_PRINT_OPTS_USAGE("-Q", "bind EQ to domain (vs. endpoint)"); FT_PRINT_OPTS_USAGE("-w ", "number of warmup iterations"); FT_PRINT_OPTS_USAGE("-S ", "specific transfer size or 'all'"); FT_PRINT_OPTS_USAGE("-l", "align transmit and receive buffers to page size"); FT_PRINT_OPTS_USAGE("-m", "machine readable output"); + FT_PRINT_OPTS_USAGE("-D ", "Specify device interface: eg cuda, ze(default: None). " + "Automatically enables FI_HMEM (-H)"); FT_PRINT_OPTS_USAGE("-t ", "completion type [queue, counter]"); - FT_PRINT_OPTS_USAGE("-c ", "completion method [spin, sread, fd]"); + FT_PRINT_OPTS_USAGE("-c ", "completion method [spin, sread, fd, yield]"); FT_PRINT_OPTS_USAGE("-h", "display this help output"); return; @@ -2795,6 +2923,21 @@ void ft_parseinfo(int op, char *optarg, struct fi_info *hints, if (!strncasecmp("mr_local", optarg, 8)) opts->mr_mode &= ~FI_MR_LOCAL; break; + case 'D': + if (!strncasecmp("ze", optarg, 2)) + opts->iface = FI_HMEM_ZE; + else if (!strncasecmp("cuda", optarg, 4)) + opts->iface = FI_HMEM_CUDA; + else + printf("Unsupported interface\n"); + opts->options |= FT_OPT_ENABLE_HMEM | FT_OPT_USE_DEVICE; + break; + case 'i': + opts->device = atoi(optarg); + break; + case 'H': + opts->options |= FT_OPT_ENABLE_HMEM; + break; default: /* let getopt handle unknown opts*/ break; @@ -2824,6 +2967,19 @@ void ft_parse_addr_opts(int op, char *optarg, struct ft_opts *opts) else opts->oob_port = default_oob_port; break; + case 'F': + if (!strncasecmp("fi_sockaddr_in", optarg, 14)) + opts->address_format = FI_SOCKADDR_IN; + else if (!strncasecmp("fi_sockaddr_in6", optarg, 15)) + opts->address_format = FI_SOCKADDR_IN6; + else if (!strncasecmp("fi_sockaddr_ib", optarg, 14)) + opts->address_format = FI_SOCKADDR_IB; + else if (!strncasecmp("fi_sockaddr", optarg, 11)) /* keep me last */ + opts->address_format = FI_SOCKADDR; + break; + case 'C': + opts->options |= FT_OPT_SERVER_PERSIST; + opts->num_connections = atoi(optarg); default: /* let getopt handle unknown opts*/ break; @@ -2839,12 +2995,15 @@ void ft_parsecsopts(int op, char *optarg, struct ft_opts *opts) opts->options |= FT_OPT_ITER; opts->iterations = atoi(optarg); break; + case 'Q': + opts->options |= FT_OPT_DOMAIN_EQ; + break; case 'S': if (!strncasecmp("all", optarg, 3)) { opts->sizes_enabled = FT_ENABLE_ALL; } else { opts->options |= FT_OPT_SIZE; - opts->transfer_size = atoi(optarg); + opts->transfer_size = atol(optarg); } break; case 'm': @@ -2855,6 +3014,8 @@ void ft_parsecsopts(int op, char *optarg, struct ft_opts *opts) opts->comp_method = FT_COMP_SREAD; else if (!strncasecmp("fd", optarg, 2)) opts->comp_method = FT_COMP_WAIT_FD; + else if (!strncasecmp("yield", optarg, 5)) + opts->comp_method = FT_COMP_YIELD; break; case 't': if (!strncasecmp("counter", optarg, 7)) { @@ -2908,12 +3069,12 @@ int 
ft_parse_rma_opts(int op, char *optarg, struct fi_info *hints, return 0; } -void ft_fill_buf(void *buf, int size) +void ft_fill_buf(void *buf, size_t size) { char *msg_buf; int msg_index; static unsigned int iter = 0; - int i; + size_t i; msg_index = ((iter++)*INTEG_SEED) % integ_alphabet_length; msg_buf = (char *)buf; @@ -2924,13 +3085,13 @@ void ft_fill_buf(void *buf, int size) } } -int ft_check_buf(void *buf, int size) +int ft_check_buf(void *buf, size_t size) { char *recv_data; char c; static unsigned int iter = 0; int msg_index; - int i; + size_t i; msg_index = ((iter++)*INTEG_SEED) % integ_alphabet_length; recv_data = (char *)buf; @@ -2943,7 +3104,7 @@ int ft_check_buf(void *buf, int size) break; } if (i != size) { - printf("Error at iteration=%d size=%d byte=%d\n", + printf("Error at iteration=%d size=%zu byte=%zu\n", iter, size, i); return 1; } @@ -3148,7 +3309,7 @@ int ft_sock_recv(int fd, void *msg, size_t len) } else if (ret == 0) { return -FI_ENOTCONN; } else if (ret < 0) { - FT_PRINTERR("ft_fw_recv", ret); + FT_PRINTERR("ft_sock_recv", -errno); perror("recv"); return -errno; } else { diff --git a/fabtests/configure.ac b/fabtests/configure.ac index 4ca4336eadc..65a5d219597 100644 --- a/fabtests/configure.ac +++ b/fabtests/configure.ac @@ -1,15 +1,15 @@ dnl dnl Copyright (c) 2016-2017 Cisco Systems, Inc. All rights reserved. -dnl Copyright (c) 2018-2019 Intel Corporation, Inc. All rights reserved. +dnl Copyright (c) 2018-2021 Intel Corporation, Inc. All rights reserved. dnl dnl Process this file with autoconf to produce a configure script. AC_PREREQ(2.57) -AC_INIT([fabtests], [1.9.0a1], [ofiwg@lists.openfabrics.org]) +AC_INIT([fabtests], [1.12.0rc2], [ofiwg@lists.openfabrics.org]) AC_CONFIG_AUX_DIR(config) AC_CONFIG_MACRO_DIR(config) AC_CONFIG_HEADERS(config.h) -AM_INIT_AUTOMAKE([1.11 dist-bzip2 foreign -Wall -Werror subdir-objects]) +AM_INIT_AUTOMAKE([1.11 dist-bzip2 foreign -Wall -Werror subdir-objects tar-pax]) m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) AC_CANONICAL_HOST @@ -51,6 +51,16 @@ AC_ARG_ENABLE([debug], AC_DEFINE_UNQUOTED([ENABLE_DEBUG], [$dbg], [defined to 1 if configured with --enable-debug]) +AC_ARG_ENABLE([asan], + [AS_HELP_STRING([--enable-asan], + [Enable address sanitizer @<:@default=no@:>@]) + ], + [], + [enable_asan=no]) + +AS_IF([test x"$enable_asan" != x"no"], + [CFLAGS="-fsanitize=address $CFLAGS"]) + dnl Fix autoconf's habit of adding -g -O2 by default AS_IF([test -z "$CFLAGS"], [CFLAGS="-O2 -DNDEBUG ${base_c_warn_flags}"]) @@ -96,6 +106,36 @@ AC_ARG_WITH([libfabric], LDFLAGS="-L$withval/$fab_libdir $LDFLAGS"], []) +dnl Check for CUDA support. Require fabtests to dlopen CUDA runtime. +AC_ARG_WITH([cuda], + [AC_HELP_STRING([--with-cuda=DIR], + [Provide path to where the CUDA development + and runtime libraries are installed.])], + [AS_IF([test "$freebsd" == "0"], + [AC_CHECK_LIB(dl, dlopen, [], [AC_MSG_ERROR([dlopen not found.])])], + []) + CPPFLAGS="-I$withval/include $CPPFLAGS" + AC_CHECK_HEADER([cuda_runtime.h], + [AC_DEFINE([HAVE_CUDA_RUNTIME_H], [1], + [Define to 1 if you have ])], + [AC_MSG_ERROR([ not found])])], + []) + +dnl Check for ROCR support. Require fabtests to dlopen ROCR. 
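Both the --with-cuda block above and the --with-rocr block that follows only verify that the device headers and dlopen() are available at configure time; the runtime libraries themselves are resolved at run time so that a single fabtests build still loads on machines without any GPU software installed. A minimal sketch of that dlopen pattern for the CUDA case is below. The helper name, the function-pointer names, and the hard-coded "libcudart.so" filename are illustrative assumptions, not the actual fabtests sources.

#include <dlfcn.h>
#include <stddef.h>
#include <stdio.h>

/* Signatures mirror cuda_runtime.h; a return of 0 (cudaSuccess) means success. */
typedef int (*cuda_malloc_fn)(void **devptr, size_t size);
typedef int (*cuda_free_fn)(void *devptr);
typedef int (*cuda_memcpy_fn)(void *dst, const void *src, size_t size, int kind);

static void *cudart_handle;
static cuda_malloc_fn cuda_malloc_ptr;
static cuda_free_fn cuda_free_ptr;
static cuda_memcpy_fn cuda_memcpy_ptr;

/* Hypothetical helper: resolve the CUDA runtime at run time instead of
 * linking against libcudart, so the binary still loads on GPU-less hosts. */
static int cuda_dlopen_sketch(void)
{
        cudart_handle = dlopen("libcudart.so", RTLD_NOW);
        if (!cudart_handle) {
                fprintf(stderr, "dlopen(libcudart.so): %s\n", dlerror());
                return -1;
        }

        cuda_malloc_ptr = (cuda_malloc_fn) dlsym(cudart_handle, "cudaMalloc");
        cuda_free_ptr = (cuda_free_fn) dlsym(cudart_handle, "cudaFree");
        cuda_memcpy_ptr = (cuda_memcpy_fn) dlsym(cudart_handle, "cudaMemcpy");
        if (!cuda_malloc_ptr || !cuda_free_ptr || !cuda_memcpy_ptr) {
                dlclose(cudart_handle);
                cudart_handle = NULL;
                return -1;
        }
        return 0;
}

The same approach applies to the ROCR and Level Zero loaders; when the symbols cannot be resolved, the device option can then surface a clean run-time error instead of a loader failure.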
+AC_ARG_WITH([rocr], + [AC_HELP_STRING([--with-rocr=DIR], + [Provide path to where the ROCR development + and runtime libraries are installed.])], + [AS_IF([test "$freebsd" == "0"], + [AC_CHECK_LIB(dl, dlopen, [], [AC_MSG_ERROR([dlopen not found.])])], + []) + CPPFLAGS="-I$withval/include $CPPFLAGS" + AC_CHECK_HEADER([hsa/hsa.h], + [AC_DEFINE([HAVE_ROCR_RUNTIME_H], [1], + [Define to 1 if you have ])], + [AC_MSG_ERROR([ not found])])], + []) + dnl Checks for libraries AC_CHECK_LIB([fabric], fi_getinfo, [], AC_MSG_ERROR([fi_getinfo() not found. fabtests requires libfabric.])) @@ -105,6 +145,20 @@ AC_HEADER_STDC AC_CHECK_HEADER([rdma/fabric.h], [], [AC_MSG_ERROR([ not found. fabtests requires libfabric.])]) +AC_ARG_WITH([ze], + AC_HELP_STRING([--with-ze], [Use non-default ZE location - default NO]), + [CPPFLAGS="-I$withval/include $CPPFLAGS" + LDFLAGS="-L$withval/$lib $LDFLAGS"], + []) + +dnl Checks for ZE libraries +AS_IF([test x"$with_ze" != x"no"], + [AC_CHECK_LIB([ze_loader], zeInit, + AC_CHECK_HEADER([level_zero/ze_api.h], + AC_DEFINE([HAVE_LIBZE], 1, [ZE support])), + [])] + []) + AC_MSG_CHECKING([for fi_trywait support]) AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include ]], [[fi_trywait(NULL, NULL, 0);]])], @@ -122,4 +176,5 @@ AC_DEFINE_UNQUOTED([HAVE_EPOLL], [$have_epoll], [Defined to 1 if Linux epoll is available]) AC_CONFIG_FILES([Makefile fabtests.spec]) + AC_OUTPUT diff --git a/fabtests/fabtests.sln b/fabtests/fabtests.sln index 2e460cb0283..1735e56da5f 100644 --- a/fabtests/fabtests.sln +++ b/fabtests/fabtests.sln @@ -9,18 +9,24 @@ Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug-v140|x64 = Debug-v140|x64 Debug-v141|x64 = Debug-v141|x64 + Debug-v142|x64 = Debug-v142|x64 Release-v140|x64 = Release-v140|x64 Release-v141|x64 = Release-v141|x64 + Release-v142|x64 = Release-v142|x64 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {076F757A-8827-4D3C-A87F-6E49623C16E1}.Debug-v140|x64.ActiveCfg = Debug-v140|x64 {076F757A-8827-4D3C-A87F-6E49623C16E1}.Debug-v140|x64.Build.0 = Debug-v140|x64 {076F757A-8827-4D3C-A87F-6E49623C16E1}.Debug-v141|x64.ActiveCfg = Debug-v141|x64 {076F757A-8827-4D3C-A87F-6E49623C16E1}.Debug-v141|x64.Build.0 = Debug-v141|x64 + {076F757A-8827-4D3C-A87F-6E49623C16E1}.Debug-v142|x64.ActiveCfg = Debug-v142|x64 + {076F757A-8827-4D3C-A87F-6E49623C16E1}.Debug-v142|x64.Build.0 = Debug-v142|x64 {076F757A-8827-4D3C-A87F-6E49623C16E1}.Release-v140|x64.ActiveCfg = Release-v140|x64 {076F757A-8827-4D3C-A87F-6E49623C16E1}.Release-v140|x64.Build.0 = Release-v140|x64 {076F757A-8827-4D3C-A87F-6E49623C16E1}.Release-v141|x64.ActiveCfg = Release-v141|x64 {076F757A-8827-4D3C-A87F-6E49623C16E1}.Release-v141|x64.Build.0 = Release-v141|x64 + {076F757A-8827-4D3C-A87F-6E49623C16E1}.Release-v142|x64.ActiveCfg = Release-v142|x64 + {076F757A-8827-4D3C-A87F-6E49623C16E1}.Release-v142|x64.Build.0 = Release-v142|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/fabtests/fabtests.vcxproj b/fabtests/fabtests.vcxproj index d6f06bf4962..5242eb86d53 100644 --- a/fabtests/fabtests.vcxproj +++ b/fabtests/fabtests.vcxproj @@ -9,6 +9,10 @@ Debug-v141 x64 + + Debug-v142 + x64 + Release-v140 x64 @@ -17,6 +21,10 @@ Release-v141 x64 + + Release-v142 + x64 + {076F757A-8827-4D3C-A87F-6E49623C16E1} @@ -37,6 +45,13 @@ true MultiByte + + Makefile + true + v142 + true + MultiByte + Makefile false @@ -51,6 +66,13 @@ true MultiByte + + Makefile + false + v142 + true + MultiByte + @@ -62,12 +84,18 @@ + + + + + + 
WIN32;_DEBUG;$(NMakePreprocessorDefinitions) @@ -83,6 +111,13 @@ nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean all nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean + + WIN32;_DEBUG;$(NMakePreprocessorDefinitions) + $(ProjectDir)Include;$(ExecutablePath) + nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) all + nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean all + nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean + WIN32;NDEBUG;$(NMakePreprocessorDefinitions) $(ProjectDir)Include;$(ExecutablePath) @@ -97,6 +132,13 @@ nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean all nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean + + WIN32;NDEBUG;$(NMakePreprocessorDefinitions) + $(ProjectDir)Include;$(ExecutablePath) + nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) all + nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean all + nmake /F Makefile.win config=$(Configuration) arch=x$(PlatformArchitecture) clean + @@ -108,6 +150,10 @@ + + + + @@ -128,7 +174,7 @@ - + @@ -150,6 +196,7 @@ + diff --git a/fabtests/fabtests.vcxproj.filters b/fabtests/fabtests.vcxproj.filters index 2370a8e239d..207c5cb7892 100644 --- a/fabtests/fabtests.vcxproj.filters +++ b/fabtests/fabtests.vcxproj.filters @@ -48,6 +48,18 @@ Source Files\common + + Source Files\common + + + Source Files\common + + + Source Files\common + + + Source Files\common + Source Files\common @@ -75,7 +87,7 @@ Source Files\functional - + Source Files\functional @@ -185,6 +197,9 @@ Header Files + + Header Files + Header Files diff --git a/fabtests/functional/bw.c b/fabtests/functional/bw.c index 4da65402b08..ff48be698cc 100644 --- a/fabtests/functional/bw.c +++ b/fabtests/functional/bw.c @@ -199,7 +199,7 @@ int main(int argc, char **argv) hints->ep_attr->type = FI_EP_RDM; - while ((op = getopt(argc, argv, "W:vT:h" CS_OPTS ADDR_OPTS INFO_OPTS)) != -1) { + while ((op = getopt(argc, argv, "UW:vT:h" CS_OPTS ADDR_OPTS INFO_OPTS)) != -1) { switch (op) { default: ft_parse_addr_opts(op, optarg, &opts); @@ -209,6 +209,9 @@ int main(int argc, char **argv) case 'W': opts.window_size = atoi(optarg); break; + case 'U': + hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE; + break; case 'v': opts.options |= FT_OPT_VERIFY_DATA; break; @@ -218,6 +221,11 @@ int main(int argc, char **argv) case '?': case 'h': ft_usage(argv[0], "A bandwidth test with data verification."); + FT_PRINT_OPTS_USAGE("-T sleep_time", + "Receive side delay before starting"); + FT_PRINT_OPTS_USAGE("-v", "Enable data verification"); + FT_PRINT_OPTS_USAGE("-W window_size", + "Set transmit window size before waiting for completion"); return EXIT_FAILURE; } } diff --git a/fabtests/functional/cm_data.c b/fabtests/functional/cm_data.c index ba92370b432..74fec72c767 100644 --- a/fabtests/functional/cm_data.c +++ b/fabtests/functional/cm_data.c @@ -448,6 +448,7 @@ static int run(void) ft_sock_shutdown(sock); err2: free(entry); + free(cm_data); return ret; } @@ -456,7 +457,7 @@ int main(int argc, char **argv) int op, ret; opts = INIT_OPTS; - opts.options |= FT_OPT_SIZE | FT_OPT_SKIP_REG_MR; + opts.options |= FT_OPT_SIZE | FT_OPT_SKIP_REG_MR | FT_OPT_SKIP_MSG_ALLOC; hints = fi_allocinfo(); if (!hints) diff --git a/fabtests/functional/msg_sockets.c b/fabtests/functional/msg_sockets.c index a4a7da8a072..95551c0ebe8 100644 --- 
a/fabtests/functional/msg_sockets.c +++ b/fabtests/functional/msg_sockets.c @@ -54,6 +54,9 @@ union sockaddr_any { static union sockaddr_any bound_addr; static size_t bound_addr_len = sizeof bound_addr; +/* string format is [%s]:%s */ +#define MAXADDRSTR ((BUFSIZ * 2) + 4) + /* Wrapper for memcmp for sockaddr. Note that the sockaddr structure may * contain holes, so sockaddr's are expected to have been initialized to all @@ -113,7 +116,7 @@ sockaddrstr(const union sockaddr_any *addr, socklen_t len, char *buf, size_t buf static int check_address(struct fid *fid, const char *message) { - char buf1[BUFSIZ], buf2[BUFSIZ]; + char buf1[MAXADDRSTR], buf2[MAXADDRSTR]; union sockaddr_any tmp; size_t tmplen; const char *ep_addr, *addr_expected; @@ -127,13 +130,14 @@ static int check_address(struct fid *fid, const char *message) } if (sockaddrcmp(&tmp, tmplen, &bound_addr, bound_addr_len)) { - ep_addr = sockaddrstr(&tmp, tmplen, buf1, BUFSIZ); + ep_addr = sockaddrstr(&tmp, tmplen, buf1, sizeof buf1); if (!ep_addr) { FT_ERR("Unable to get ep_addr as string!"); return -FI_EINVAL; } - addr_expected = sockaddrstr(&bound_addr, bound_addr_len, buf2, BUFSIZ); + addr_expected = sockaddrstr(&bound_addr, bound_addr_len, buf2, + sizeof buf2); if (!addr_expected) { FT_ERR("Unable to get addr_expected as string!"); return -FI_EINVAL; @@ -302,7 +306,7 @@ static int client_connect(void) static int setup_handle(void) { - static char buf[BUFSIZ]; + static char buf[MAXADDRSTR]; struct addrinfo *ai, aihints; const char *bound_addr_str; char *saved_addr; @@ -398,7 +402,8 @@ static int setup_handle(void) break; } - bound_addr_str = sockaddrstr(&bound_addr, bound_addr_len, buf, BUFSIZ); + bound_addr_str = sockaddrstr(&bound_addr, bound_addr_len, buf, + sizeof buf); if (!bound_addr_str) { FT_ERR("Unable to get bound_addr as string!"); ret = -FI_EINVAL; diff --git a/fabtests/functional/multi_ep.c b/fabtests/functional/multi_ep.c index ccbabc0bfba..3e122cadbfe 100644 --- a/fabtests/functional/multi_ep.c +++ b/fabtests/functional/multi_ep.c @@ -300,7 +300,7 @@ int main(int argc, char **argv) ft_usage(argv[0], "Multi endpoint test"); FT_PRINT_OPTS_USAGE("-c ", "number of endpoints to create and test (def 3)"); - FT_PRINT_OPTS_USAGE("-v", "Enable DataCheck testing"); + FT_PRINT_OPTS_USAGE("-v", "Enable data verification"); return EXIT_FAILURE; } } diff --git a/fabtests/functional/multi_mr.c b/fabtests/functional/multi_mr.c index 23aebc88ee7..6a814fcaeb3 100644 --- a/fabtests/functional/multi_mr.c +++ b/fabtests/functional/multi_mr.c @@ -303,9 +303,10 @@ int main(int argc, char **argv) case '?': case 'h': ft_usage(argv[0], "Ping-pong multi memory region test"); - FT_PRINT_OPTS_USAGE("-c ", "number of memory regions to create and test"); + FT_PRINT_OPTS_USAGE("-c ", + "number of memory regions to create and test"); FT_PRINT_OPTS_USAGE("-V", "Enable verbose printing"); - FT_PRINT_OPTS_USAGE("-v", "Enable DataCheck testing"); + FT_PRINT_OPTS_USAGE("-v", "Enable data verification"); return EXIT_FAILURE; } } diff --git a/fabtests/functional/multi_recv.c b/fabtests/functional/multi_recv.c index 62870b13fad..c01a575196b 100644 --- a/fabtests/functional/multi_recv.c +++ b/fabtests/functional/multi_recv.c @@ -37,25 +37,22 @@ #include -// MULTI_BUF_SIZE_FACTOR defines how large the multi recv buffer will be. -// The minimum value of the factor is 2 which will set the multi recv buffer -// size to be twice the size of the send buffer. 
In order to use FI_MULTI_RECV -// feature efficiently, we need to have a large recv buffer so that we don't -// to repost the buffer often to get the remaining data when the buffer is full -#define MULTI_BUF_SIZE_FACTOR 4 -#define DEFAULT_MULTI_BUF_SIZE (1024 * 1024) +#define MAX_XFER_SIZE (1 << 20) static struct fid_mr *mr_multi_recv; struct fi_context ctx_multi_recv[2]; -static int use_recvmsg; +static int use_recvmsg, comp_per_buf; -static int repost_recv(int iteration) { +static int repost_recv(int iteration) +{ struct fi_msg msg; struct iovec msg_iov; + void *buf_addr; int ret; + buf_addr = rx_buf + (rx_size / 2) * iteration; if (use_recvmsg) { - msg_iov.iov_base = rx_buf + (rx_size / 2) * iteration; + msg_iov.iov_base = buf_addr; msg_iov.iov_len = rx_size / 2; msg.msg_iov = &msg_iov; msg.desc = fi_mr_desc(mr_multi_recv); @@ -69,9 +66,9 @@ static int repost_recv(int iteration) { return ret; } } else { - ret = fi_recv(ep, rx_buf + (rx_size / 2) * iteration, - rx_size / 2, fi_mr_desc(mr_multi_recv), - 0, &ctx_multi_recv[iteration]); + ret = fi_recv(ep, buf_addr, rx_size / 2, + fi_mr_desc(mr_multi_recv), 0, + &ctx_multi_recv[iteration]); if (ret) { FT_PRINTERR("fi_recv", ret); return ret; @@ -82,9 +79,9 @@ static int repost_recv(int iteration) { } -int wait_for_recv_completion(int num_completions) +static int wait_for_recv_completion(int num_completions) { - int i, ret; + int i, ret, per_buf_cnt = 0; struct fi_cq_data_entry comp; while (num_completions > 0) { @@ -97,21 +94,27 @@ int wait_for_recv_completion(int num_completions) return ret; } - if (comp.len) - num_completions--; - - if (ft_check_opts(FT_OPT_VERIFY_DATA | FT_OPT_ACTIVE)) { + if (comp.flags & FI_RECV) { if (comp.len != opts.transfer_size) { - FT_ERR("comp.len != opts.transfer_size"); - return -FI_EOTHER; + FT_ERR("completion length %lu, expected %lu", + comp.len, opts.transfer_size); + return -FI_EIO; } - ret = ft_check_buf(comp.buf, opts.transfer_size); - if (ret) - return ret; + if (ft_check_opts(FT_OPT_VERIFY_DATA | FT_OPT_ACTIVE) && + ft_check_buf(comp.buf, opts.transfer_size)) + return -FI_EIO; + per_buf_cnt++; + num_completions--; } if (comp.flags & FI_MULTI_RECV) { - i = (comp.op_context == &ctx_multi_recv[0]) ? 0 : 1; + if (per_buf_cnt != comp_per_buf) { + FT_ERR("Received %d completions per buffer, expected %d", + per_buf_cnt, comp_per_buf); + return -FI_EIO; + } + per_buf_cnt = 0; + i = comp.op_context == &ctx_multi_recv[1]; ret = repost_recv(i); if (ret) @@ -121,18 +124,6 @@ int wait_for_recv_completion(int num_completions) return 0; } -static int sync_test(void) -{ - int ret; - - ret = opts.dst_addr ? ft_tx(ep, remote_fi_addr, 1, &tx_ctx) : wait_for_recv_completion(1); - if (ret) - return ret; - - ret = opts.dst_addr ? wait_for_recv_completion(1) : ft_tx(ep, remote_fi_addr, 1, &tx_ctx); - return ret; -} - /* * Post buffer as two halves, so that we can repost one half * when the other half is full. 
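For readers new to FI_MULTI_RECV, the pattern this test exercises is: post one large buffer split into two halves, let the provider pack consecutive incoming messages into the active half, and repost a half only after a completion carrying the FI_MULTI_RECV flag reports that the provider has released it. A condensed sketch of that consume-and-repost loop follows; it assumes the endpoint's receive op_flags include FI_MULTI_RECV, as this test sets in its hints, and exit handling is trimmed.

#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_eq.h>
#include <rdma/fi_errno.h>

/* Sketch only: consume messages from a multi-recv buffer and repost halves. */
static int multi_recv_loop_sketch(struct fid_ep *ep, struct fid_cq *rxcq,
                                  struct fid_mr *mr, char *rx_buf,
                                  size_t rx_size, size_t min_multi_recv)
{
        struct fi_cq_data_entry comp;
        struct fi_context ctx[2];
        ssize_t ret;
        int half;

        /* Below this much remaining space, the provider retires the buffer. */
        ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV,
                        &min_multi_recv, sizeof min_multi_recv);
        if (ret)
                return (int) ret;

        /* Post both halves up front. */
        for (half = 0; half < 2; half++) {
                ret = fi_recv(ep, rx_buf + (rx_size / 2) * half, rx_size / 2,
                              fi_mr_desc(mr), 0, &ctx[half]);
                if (ret)
                        return (int) ret;
        }

        for (;;) {
                ret = fi_cq_read(rxcq, &comp, 1);
                if (ret == -FI_EAGAIN)
                        continue;
                if (ret < 0)
                        return (int) ret;

                if (comp.flags & FI_RECV) {
                        /* One message landed; comp.buf and comp.len describe
                         * where it was placed inside the posted half. */
                }

                if (comp.flags & FI_MULTI_RECV) {
                        /* Provider released this half; it is safe to repost. */
                        half = (comp.op_context == &ctx[1]);
                        ret = fi_recv(ep, rx_buf + (rx_size / 2) * half,
                                      rx_size / 2, fi_mr_desc(mr), 0,
                                      &ctx[half]);
                        if (ret)
                                return (int) ret;
                }
        }
}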
@@ -154,23 +145,21 @@ static int run_test(void) { int ret, i; - ret = sync_test(); - if (ret) { - fprintf(stderr, "sync_test failed!\n"); - goto out; - } + ret = ft_sync(); + if (ret) + return ret; ft_start(); if (opts.dst_addr) { for (i = 0; i < opts.iterations; i++) { ret = ft_tx(ep, remote_fi_addr, opts.transfer_size, &tx_ctx); if (ret) - goto out; + return ret; } } else { ret = wait_for_recv_completion(opts.iterations); if (ret) - goto out; + return ret; } ft_stop(); @@ -178,10 +167,9 @@ static int run_test(void) show_perf_mr(opts.transfer_size, opts.iterations, &start, &end, 1, opts.argc, opts.argv); else - show_perf(test_name, opts.transfer_size, opts.iterations, + show_perf(NULL, opts.transfer_size, opts.iterations, &start, &end, 1); -out: return ret; } @@ -202,7 +190,7 @@ static int alloc_ep_res(struct fi_info *fi) { int ret; - tx_size = MAX(FT_MAX_CTRL_MSG, opts.transfer_size); + tx_size = opts.transfer_size; if (tx_size > fi->ep_attr->max_msg_size) { fprintf(stderr, "transfer size is larger than the maximum size " "of the data transfer supported by the provider\n"); @@ -222,8 +210,16 @@ static int alloc_ep_res(struct fi_info *fi) return ret; } - // set the multi buffer size to be allocated - rx_size = MAX(tx_size, DEFAULT_MULTI_BUF_SIZE) * MULTI_BUF_SIZE_FACTOR; + /* We only ues the common code to send messages, so + * set mr_desc to the tx buffer's region. + */ + mr_desc = fi_mr_desc(mr); + + //Each multi recv buffer will be able to hold at least 2 and + //up to 64 messages, allowing proper testing of multi recv + //completions and reposting + rx_size = MIN(tx_size * 128, MAX_XFER_SIZE * 4); + comp_per_buf = rx_size / 2 / opts.transfer_size; rx_buf = malloc(rx_size); if (!rx_buf) { fprintf(stderr, "Cannot allocate rx_buf\n"); @@ -237,25 +233,15 @@ static int alloc_ep_res(struct fi_info *fi) return ret; } - ret = ft_alloc_active_res(fi); - if (ret) - return ret; - return 0; } -static int init_fabric(void) +static int run(void) { - int ret; - - ret = ft_getinfo(hints, &fi); - if (ret) - return ret; - - // set FI_MULTI_RECV flag for all recv operations - fi->rx_attr->op_flags = FI_MULTI_RECV; + int ret = 0; - ret = ft_open_fabric_res(); + ret = hints->ep_attr->type == FI_EP_MSG ? 
+ ft_init_fabric_cm() : ft_init_fabric(); if (ret) return ret; @@ -263,183 +249,20 @@ static int init_fabric(void) if (ret) return ret; - ret = ft_enable_ep_recv(); - if (ret) - return ret; - ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV, &tx_size, sizeof(tx_size)); if (ret) return ret; ret = post_multi_recv_buffer(); - return ret; -} - -static int init_av(void) -{ - size_t addrlen; - int ret; - - if (opts.dst_addr) { - ret = ft_av_insert(av, fi->dest_addr, 1, &remote_fi_addr, 0, NULL); - if (ret) - return ret; - - addrlen = 64; - ret = fi_getname(&ep->fid, tx_buf, &addrlen); - if (ret) { - FT_PRINTERR("fi_getname", ret); - return ret; - } - - ret = ft_tx(ep, remote_fi_addr, addrlen, &tx_ctx); - if (ret) - return ret; - } else { - ret = wait_for_recv_completion(1); - if (ret) - return ret; - - ret = ft_av_insert(av, rx_buf, 1, &remote_fi_addr, 0, NULL); - if (ret) - return ret; - } - - return 0; -} - -int start_server(void) -{ - int ret; - - tx_seq = 0; - rx_seq = 0; - tx_cq_cntr = 0; - rx_cq_cntr = 0; - - - ret = ft_getinfo(hints, &fi_pep); if (ret) return ret; - // set FI_MULTI_RECV flag for all recv operations - fi_pep->rx_attr->op_flags = FI_MULTI_RECV; - - ret = fi_fabric(fi_pep->fabric_attr, &fabric, NULL); - if (ret) { - FT_PRINTERR("fi_fabric", ret); - return ret; - } - - ret = fi_eq_open(fabric, &eq_attr, &eq, NULL); - if (ret) { - FT_PRINTERR("fi_eq_open", ret); - return ret; - } - - ret = fi_passive_ep(fabric, fi_pep, &pep, NULL); - if (ret) { - FT_PRINTERR("fi_passive_ep", ret); - return ret; - } - - ret = fi_pep_bind(pep, &eq->fid, 0); - if (ret) { - FT_PRINTERR("fi_pep_bind", ret); - return ret; - } - - ret = fi_listen(pep); - if (ret) { - FT_PRINTERR("fi_listen", ret); - return ret; - } - - return 0; -} - -int server_connect(void) -{ - int ret; - - ret = ft_retrieve_conn_req(eq, &fi); - if (ret) - goto err; - - ret = fi_domain(fabric, fi, &domain, NULL); - if (ret) { - FT_PRINTERR("fi_domain", ret); - goto err; - } - - ret = alloc_ep_res(fi); - if (ret) - goto err; - - ret = ft_enable_ep_recv(); - if (ret) - goto err; - - ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV, - &tx_size, sizeof(tx_size)); - if (ret) - goto err; - - ret = post_multi_recv_buffer(); - if (ret) - goto err; - - ret = ft_accept_connection(ep, eq); - if (ret) - goto err; - - return 0; -err: - fi_reject(pep, fi->handle, NULL, 0); - return ret; -} - -static int client_connect(void) -{ - int ret; - - ret = init_fabric(); - if (ret) - return ret; - - return ft_connect_ep(ep, eq, fi->dest_addr); -} - -static int run(void) -{ - int ret = 0; - - if (hints->ep_attr->type == FI_EP_MSG) { - if (!opts.dst_addr) { - ret = start_server(); - if (ret) - goto out; - } - - ret = opts.dst_addr ? 
client_connect() : server_connect(); - if (ret) - goto out; - } else { - ret = init_fabric(); - if (ret) - goto out; - - ret = init_av(); - if (ret) - goto out; - } - ret = run_test(); rx_seq++; ft_finalize(); -out: + return ret; } @@ -448,7 +271,8 @@ int main(int argc, char **argv) int op, ret; opts = INIT_OPTS; - opts.options |= FT_OPT_SIZE | FT_OPT_SKIP_MSG_ALLOC; + opts.options |= FT_OPT_SIZE | FT_OPT_SKIP_MSG_ALLOC | FT_OPT_OOB_SYNC | + FT_OPT_OOB_ADDR_EXCH; use_recvmsg = 0; hints = fi_allocinfo(); @@ -469,8 +293,10 @@ int main(int argc, char **argv) break; case '?': case 'h': - ft_csusage(argv[0], "Streaming RDM client-server using multi recv buffer."); + ft_csusage(argv[0], + "Streaming RDM client-server using multi recv buffer."); FT_PRINT_OPTS_USAGE("-M", "enable testing with fi_recvmsg"); + FT_PRINT_OPTS_USAGE("-v", "Enable data verification"); return EXIT_FAILURE; } } @@ -478,9 +304,15 @@ int main(int argc, char **argv) if (optind < argc) opts.dst_addr = argv[optind]; + if (opts.transfer_size > MAX_XFER_SIZE) { + FT_ERR("Use smaller transfer size (max %d)", MAX_XFER_SIZE); + return EIO; + } + hints->caps = FI_MSG | FI_MULTI_RECV; hints->mode = FI_CONTEXT; hints->domain_attr->mr_mode = opts.mr_mode; + hints->rx_attr->op_flags = FI_MULTI_RECV; cq_attr.format = FI_CQ_FORMAT_DATA; diff --git a/fabtests/functional/rdm.c b/fabtests/functional/rdm.c index 84cecd38a2a..6fe12c860e5 100644 --- a/fabtests/functional/rdm.c +++ b/fabtests/functional/rdm.c @@ -34,15 +34,28 @@ #include + static int run(void) { int ret; + int nconn = 1; ret = ft_init_fabric(); if (ret) return ret; - return ft_send_recv_greeting(ep); + if ((opts.options & FT_OPT_SERVER_PERSIST) && !opts.dst_addr) + nconn = opts.num_connections; + + while (nconn && !ret) { + ret = ft_send_recv_greeting(ep); + + if (--nconn && !ret) { + ret = ft_accept_next_client(); + } + } + + return ret; } int main(int argc, char **argv) @@ -56,12 +69,15 @@ int main(int argc, char **argv) if (!hints) return EXIT_FAILURE; - while ((op = getopt(argc, argv, "h" ADDR_OPTS INFO_OPTS)) != -1) { + while ((op = getopt(argc, argv, "Uh" ADDR_OPTS INFO_OPTS)) != -1) { switch (op) { default: ft_parse_addr_opts(op, optarg, &opts); ft_parseinfo(op, optarg, hints, &opts); break; + case 'U': + hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE; + break; case '?': case 'h': ft_usage(argv[0], "A simple RDM client-sever example."); diff --git a/fabtests/functional/rdm_atomic.c b/fabtests/functional/rdm_atomic.c index c2b3655ffbe..369b3240f35 100644 --- a/fabtests/functional/rdm_atomic.c +++ b/fabtests/functional/rdm_atomic.c @@ -435,6 +435,10 @@ static int init_fabric(void) { int ret; + ret = ft_init_oob(); + if (ret) + return ret; + ret = ft_getinfo(hints, &fi); if (ret) return ret; @@ -491,7 +495,7 @@ int main(int argc, char **argv) if (!hints) return EXIT_FAILURE; - while ((op = getopt(argc, argv, "ho:z:" CS_OPTS INFO_OPTS)) != -1) { + while ((op = getopt(argc, argv, "ho:Uz:" CS_OPTS INFO_OPTS)) != -1) { switch (op) { case 'o': if (!strncasecmp("all", optarg, 3)) { @@ -505,6 +509,9 @@ int main(int argc, char **argv) } } break; + case 'U': + hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE; + break; case 'z': if (!strncasecmp("all", optarg, 3)) { run_all_datatypes = 1; diff --git a/fabtests/functional/rdm_rma_simple.c b/fabtests/functional/rdm_rma_event.c similarity index 100% rename from fabtests/functional/rdm_rma_simple.c rename to fabtests/functional/rdm_rma_event.c diff --git a/fabtests/functional/recv_cancel.c b/fabtests/functional/recv_cancel.c index 
e75569471ca..eb7c5c82dd6 100644 --- a/fabtests/functional/recv_cancel.c +++ b/fabtests/functional/recv_cancel.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2017 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2020 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -47,7 +47,8 @@ static int recv_cancel_client(void) return ret; ft_tag = CANCEL_TAG; - ret = ft_post_tx(ep, remote_fi_addr, opts.transfer_size, NO_CQ_DATA, &tx_ctx); + ret = ft_post_tx(ep, remote_fi_addr, opts.transfer_size, NO_CQ_DATA, + &tx_ctx); if (ret) return ret; @@ -55,7 +56,8 @@ static int recv_cancel_client(void) fprintf(stdout, "CANCEL msg posted to server\n"); ft_tag = STANDARD_TAG; - ret = ft_post_tx(ep, remote_fi_addr, opts.transfer_size, NO_CQ_DATA, &tx_ctx); + ret = ft_post_tx(ep, remote_fi_addr, opts.transfer_size, NO_CQ_DATA, + &tx_ctx); if (ret) return ret; @@ -110,16 +112,18 @@ static int recv_cancel_host(void) usleep(1000); } while ((ret == -FI_EAGAIN) && (retries < 5000)); if (retries >= 5000) { - FT_PRINTERR("ERROR: failed to detect error CQ entry in cq_read", -FI_EOTHER); + FT_PRINTERR("ERROR: no error CQ entry in cq_read deteceted", + -FI_EOTHER); return -FI_EOTHER; } else { if (opts.verbose) - fprintf(stdout, "GOOD: detected error cq entry in cq_read\n"); + fprintf(stdout, "GOOD: detected error cq entry\n"); } /* Verify the error CQ has been populated */ if (fi_cq_readerr(rxcq, &cancel_error_entry, 0) != 1) { - FT_PRINTERR("ERROR: No cancel CQ error entry was populated", -FI_EOTHER); + FT_PRINTERR("ERROR: No cancel CQ error entry was populated", + -FI_EOTHER); return -FI_EOTHER; } @@ -129,7 +133,8 @@ static int recv_cancel_host(void) } if (!(cancel_error_entry.flags & FI_RECV)) { - FT_PRINTERR("ERROR: cancelled completion flags is incorrect", -FI_EOTHER); + FT_PRINTERR("ERROR: cancelled completion flags are incorrect", + -FI_EOTHER); return -FI_EOTHER; } @@ -138,19 +143,21 @@ static int recv_cancel_host(void) /* Verify only one CQ err entry can be read */ if (fi_cq_readerr(rxcq, &cancel_error_entry, 0) != -FI_EAGAIN) { - FT_PRINTERR("ERROR: Another CQ error entry was populated", -FI_EOTHER); + FT_PRINTERR("ERROR: Another CQ error entry was populated", + -FI_EOTHER); return -FI_EOTHER; } if (opts.verbose) - fprintf(stdout, "GOOD: no additional error entries have been detected\n"); + fprintf(stdout, "GOOD: no extra error entries detected\n"); /* Check for second recv completion*/ do { ret = fi_cq_read(rxcq, &recv_completion, 1); if (ret > 0) { if (recv_completion.op_context != &standard_recv_ctx) { - FT_PRINTERR("ERROR: op_context does not match recv ctx", -FI_EOTHER); + FT_PRINTERR("ERROR: op_context does not match", + -FI_EOTHER); return -FI_EOTHER; } } else if ((ret <= 0) && (ret != -FI_EAGAIN)) { @@ -161,6 +168,28 @@ static int recv_cancel_host(void) if (opts.verbose) fprintf(stdout, "GOOD: Completed uncancelled recv\n"); + /* Repost cancelled recv and get completion */ + ft_tag = CANCEL_TAG; + ret = ft_post_rx(ep, opts.transfer_size, &cancel_recv_ctx); + if (ret) + return ret; + + do { + ret = fi_cq_read(rxcq, &recv_completion, 1); + if (ret > 0) { + if (recv_completion.op_context != &cancel_recv_ctx) { + FT_PRINTERR("ERROR: op_context does not match", + -FI_EOTHER); + return -FI_EOTHER; + } + } else if ((ret <= 0) && (ret != -FI_EAGAIN)) { + FT_PRINTERR("fi_cq_read", ret); + } + } while (ret == -FI_EAGAIN); + + if (opts.verbose) + fprintf(stdout, "GOOD: Completed 
reposted cancelled recv\n"); + fprintf(stdout, "GOOD: Completed Recv Cancel Test\n"); return 0; diff --git a/fabtests/functional/unexpected_msg.c b/fabtests/functional/unexpected_msg.c index 006ba4baeb1..1c9ef71e5d5 100644 --- a/fabtests/functional/unexpected_msg.c +++ b/fabtests/functional/unexpected_msg.c @@ -94,7 +94,7 @@ static char *get_rx_buf(int index) return rx_buf + rx_size * index; } -static int wait_recvs() +static int wait_recv(void) { struct fi_cq_tagged_entry entry; int ret; @@ -139,11 +139,14 @@ static int run_test_loop(void) ret = ft_post_tx_buf(ep, remote_fi_addr, opts.transfer_size, op_data, &tx_ctx_arr[j].context, - op_buf, mr_desc, op_tag); + op_buf, mr_desc, op_tag + j); if (ret) { printf("ERROR send_msg returned %d\n", ret); return ret; } + + /* Request send progress */ + (void) fi_cq_read(txcq, NULL, 0); } ret = ft_sync(); @@ -154,15 +157,17 @@ static int run_test_loop(void) op_buf = get_rx_buf(j); ret = ft_post_rx_buf(ep, opts.transfer_size, &rx_ctx_arr[j].context, op_buf, - mr_desc, op_tag); + mr_desc, + op_tag + (concurrent_msgs - 1) - j); if (ret) { printf("ERROR recv_msg returned %d\n", ret); return ret; } - } - for (j = 0; j < concurrent_msgs; j++) { - ret = wait_recvs(); + /* Progress sends */ + (void) fi_cq_read(txcq, NULL, 0); + + ret = wait_recv(); if (ret < 1) return ret; } @@ -222,42 +227,24 @@ int main(int argc, char **argv) if (!hints) return EXIT_FAILURE; - while ((op = getopt(argc, argv, "m:i:c:vdSh" ADDR_OPTS INFO_OPTS)) != -1) { + while ((op = getopt(argc, argv, "CM:h" CS_OPTS INFO_OPTS)) != -1) { switch (op) { default: + ft_parsecsopts(op, optarg, &opts); ft_parse_addr_opts(op, optarg, &opts); ft_parseinfo(op, optarg, hints, &opts); break; - case 'c': - concurrent_msgs = strtoul(optarg, NULL, 0); - break; - case 'i': - num_iters = strtoul(optarg, NULL, 0); - break; - case 'S': - opts.comp_method = FT_COMP_SREAD; - break; - case 'v': - opts.options |= FT_OPT_VERIFY_DATA; - break; - case 'm': - opts.transfer_size = strtoul(optarg, NULL, 0); - break; - case 'd': + case 'C': send_data = true; break; + case 'M': + concurrent_msgs = strtoul(optarg, NULL, 0); + break; case '?': case 'h': - ft_usage(argv[0], "Unexpected message functional test"); - FT_PRINT_OPTS_USAGE("-c ", - "Concurrent messages per iteration "); - FT_PRINT_OPTS_USAGE("-v", "Enable data verification"); - FT_PRINT_OPTS_USAGE("-i ", "Number of iterations"); - FT_PRINT_OPTS_USAGE("-S", - "Use fi_cq_sread instead of polling fi_cq_read"); - FT_PRINT_OPTS_USAGE("-m ", - "Size of unexpected messages"); - FT_PRINT_OPTS_USAGE("-d", "Send remote CQ data"); + ft_csusage(argv[0], "Unexpected message handling test."); + FT_PRINT_OPTS_USAGE("-C", "transfer remote CQ data"); + FT_PRINT_OPTS_USAGE("-M ", "number of concurrent msgs"); return EXIT_FAILURE; } } diff --git a/prov/mlx/src/mlx_cq.c b/fabtests/include/freebsd/malloc.h similarity index 70% rename from prov/mlx/src/mlx_cq.c rename to fabtests/include/freebsd/malloc.h index a0e9a5e7e35..30abe10b2f6 100644 --- a/prov/mlx/src/mlx_cq.c +++ b/fabtests/include/freebsd/malloc.h @@ -1,11 +1,11 @@ /* - * Copyright (c) 2016 Intel Corporation. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the - * OpenFabrics.org BSD license below: + * BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following @@ -29,30 +29,16 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include -#include "mlx.h" -int mlx_cq_open ( - struct fid_domain *domain, struct fi_cq_attr *attr, - struct fid_cq **cq_fid, void *context) -{ - int status = FI_SUCCESS; - struct util_cq *u_cq; - - u_cq = calloc(1, sizeof(struct util_cq)); - if (!u_cq) { - return -FI_ENOMEM; - } +#ifndef _FABTESTS_FREEBSD_MALLOC_H_ +#define _FABTESTS_FREEBSD_MALLOC_H_ - status = ofi_cq_init( - &mlx_prov, domain, - attr, u_cq, &ofi_cq_progress, context); - if (status) { - free(u_cq); - return status; - } +#define M_MMAP_THRESHOLD -3 - *cq_fid = &(u_cq->cq_fid); - return FI_SUCCESS; +int mallopt(int param, int value) +{ + /* Not supported. */ + return 0; } +#endif /* _FABTESTS_FREEBSD_MALLOC_H_ */ diff --git a/fabtests/include/hmem.h b/fabtests/include/hmem.h new file mode 100644 index 00000000000..813e1b0932a --- /dev/null +++ b/fabtests/include/hmem.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2020 Intel Corporation. All rights reserved. + * + * This software is available to you under the BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + */ + +#ifndef _HMEM_H_ +#define _HMEM_H_ +#if HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ + +#include +#include +#include + +int ft_ze_init(void); +int ft_ze_cleanup(void); +int ft_ze_alloc(uint64_t device, void **buf, size_t size); +int ft_ze_free(void *buf); +int ft_ze_memset(uint64_t device, void *buf, int value, size_t size); +int ft_ze_copy(uint64_t device, void *dst, const void *src, size_t size); + +static inline int ft_host_init(void) +{ + return FI_SUCCESS; +} + +static inline int ft_host_cleanup(void) +{ + return FI_SUCCESS; +} + +static inline int ft_host_alloc(uint64_t device, void **buffer, size_t size) +{ + *buffer = malloc(size); + return !*buffer ? 
-FI_ENOMEM : FI_SUCCESS; +} + +static inline int ft_host_free(void *buf) +{ + free(buf); + return FI_SUCCESS; +} + +static inline int ft_host_memset(uint64_t device, void *buf, int value, + size_t size) +{ + memset(buf, value, size); + return FI_SUCCESS; +} + +static inline int ft_host_memcpy(uint64_t device, void *dst, const void *src, + size_t size) +{ + memcpy(dst, src, size); + return FI_SUCCESS; +} + +int ft_cuda_init(void); +int ft_cuda_cleanup(void); +int ft_cuda_alloc(uint64_t device, void **buf, size_t size); +int ft_cuda_free(void *buf); +int ft_cuda_memset(uint64_t device, void *buf, int value, size_t size); +int ft_cuda_copy_to_hmem(uint64_t device, void *dst, const void *src, + size_t size); +int ft_cuda_copy_from_hmem(uint64_t device, void *dst, const void *src, + size_t size); + +int ft_rocr_init(void); +int ft_rocr_cleanup(void); +int ft_rocr_alloc(uint64_t device, void **buf, size_t size); +int ft_rocr_free(void *buf); +int ft_rocr_memset(uint64_t device, void *buf, int value, size_t size); +int ft_rocr_memcpy(uint64_t device, void *dst, const void *src, size_t size); + +int ft_hmem_init(enum fi_hmem_iface iface); +int ft_hmem_cleanup(enum fi_hmem_iface iface); +int ft_hmem_alloc(enum fi_hmem_iface iface, uint64_t device, void **buf, + size_t size); +int ft_hmem_free(enum fi_hmem_iface iface, void *buf); +int ft_hmem_memset(enum fi_hmem_iface iface, uint64_t device, void *buf, + int value, size_t size); +int ft_hmem_copy_to(enum fi_hmem_iface iface, uint64_t device, void *dst, + const void *src, size_t size); +int ft_hmem_copy_from(enum fi_hmem_iface iface, uint64_t device, void *dst, + const void *src, size_t size); + +#endif /* _HMEM_H_ */ diff --git a/fabtests/include/osx/malloc.h b/fabtests/include/osx/malloc.h new file mode 100644 index 00000000000..b7a9369c904 --- /dev/null +++ b/fabtests/include/osx/malloc.h @@ -0,0 +1,44 @@ +/* + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FABTESTS_OSX_MALLOC_H_ +#define _FABTESTS_OSX_MALLOC_H_ + +#define M_MMAP_THRESHOLD -3 + +int mallopt(int param, int value) +{ + /* Not supported. 
*/ + return 0; +} + +#endif /* _FABTESTS_OSX_MALLOC_H_ */ diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index 6a04f1368d4..46bb751e35f 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -49,13 +49,17 @@ extern "C" { #endif #ifndef FT_FIVERSION -#define FT_FIVERSION FI_VERSION(1,5) +#define FT_FIVERSION FI_VERSION(1,9) #endif #include "ft_osd.h" #define OFI_UTIL_PREFIX "ofi_" #define OFI_NAME_DELIM ';' +#define ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define ALIGN(x, a) ALIGN_MASK(x, (typeof(x))(a) - 1) +#define ALIGN_DOWN(x, a) ALIGN((x) - ((a) - 1), (a)) + #define OFI_MR_BASIC_MAP (FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_VIRT_ADDR) /* exit codes must be 0-255 */ @@ -65,9 +69,6 @@ static inline int ft_exit_code(int ret) return absret > 255 ? EXIT_FAILURE : absret; } -#define ft_foreach_info(fi, info) \ - for (fi = info; fi; fi = fi->next) - #define ft_sa_family(addr) (((struct sockaddr *)(addr))->sa_family) struct test_size_param { @@ -82,13 +83,6 @@ extern const unsigned int test_cnt; #define FT_ENABLE_ALL (~0) #define FT_DEFAULT_SIZE (1 << 0) -static inline int ft_use_size(int index, int enable_flags) -{ - return (enable_flags == FT_ENABLE_ALL) || - (enable_flags & test_size[index].enable_flags); -} - - enum precision { NANO = 1, MICRO = 1000, @@ -99,7 +93,8 @@ enum ft_comp_method { FT_COMP_SPIN = 0, FT_COMP_SREAD, FT_COMP_WAITSET, - FT_COMP_WAIT_FD + FT_COMP_WAIT_FD, + FT_COMP_YIELD, }; enum { @@ -119,6 +114,10 @@ enum { FT_OPT_SKIP_REG_MR = 1 << 13, FT_OPT_OOB_ADDR_EXCH = 1 << 14, FT_OPT_ALLOC_MULT_MR = 1 << 15, + FT_OPT_SERVER_PERSIST = 1 << 16, + FT_OPT_ENABLE_HMEM = 1 << 17, + FT_OPT_USE_DEVICE = 1 << 18, + FT_OPT_DOMAIN_EQ = 1 << 19, FT_OPT_OOB_CTRL = FT_OPT_OOB_SYNC | FT_OPT_OOB_ADDR_EXCH, }; @@ -136,9 +135,15 @@ enum ft_atomic_opcodes { FT_ATOMIC_COMPARE, }; +enum op_state { + OP_DONE = 0, + OP_PENDING +}; + struct ft_context { char *buf; void *desc; + enum op_state state; struct fid_mr *mr; struct fi_context2 context; }; @@ -164,10 +169,15 @@ struct ft_opts { enum ft_rma_opcodes rma_op; char *oob_port; int argc; + int num_connections; + int address_format; uint64_t mr_mode; /* Fail if the selected provider does not support FI_MSG_PREFIX. 
*/ int force_prefix; + enum fi_hmem_iface iface; + uint64_t device; + char **argv; }; @@ -220,8 +230,8 @@ void ft_usage(char *name, char *desc); void ft_mcusage(char *name, char *desc); void ft_csusage(char *name, char *desc); -void ft_fill_buf(void *buf, int size); -int ft_check_buf(void *buf, int size); +void ft_fill_buf(void *buf, size_t size); +int ft_check_buf(void *buf, size_t size); int ft_check_opts(uint64_t flags); uint64_t ft_init_cq_data(struct fi_info *info); int ft_sock_listen(char *node, char *service); @@ -237,10 +247,10 @@ extern int ft_parent_proc; extern int ft_socket_pair[2]; extern int sock; extern int listen_sock; -#define ADDR_OPTS "B:P:s:a:b::E::" -#define FAB_OPTS "f:d:p:" +#define ADDR_OPTS "B:P:s:a:b::E::C:F:" +#define FAB_OPTS "f:d:p:D:i:H" #define INFO_OPTS FAB_OPTS "e:M:" -#define CS_OPTS ADDR_OPTS "I:S:mc:t:w:l" +#define CS_OPTS ADDR_OPTS "I:QS:mc:t:w:l" #define NO_CQ_DATA 0 extern char default_port[8]; @@ -259,14 +269,17 @@ extern char default_port[8]; .rma_op = FT_RMA_WRITE, \ .oob_port = NULL, \ .mr_mode = FI_MR_LOCAL | OFI_MR_BASIC_MAP, \ - .argc = argc, .argv = argv \ + .iface = FI_HMEM_SYSTEM, \ + .device = 0, \ + .argc = argc, .argv = argv, \ + .address_format = FI_FORMAT_UNSPEC \ } #define FT_STR_LEN 32 -#define FT_MAX_CTRL_MSG 64 +#define FT_MAX_CTRL_MSG 256 #define FT_MR_KEY 0xC0DE #define FT_TX_MR_KEY (FT_MR_KEY + 1) -#define FT_RX_MR_KEY 0xFFFF +#define FT_RX_MR_KEY 0xFFFF #define FT_MSG_MR_ACCESS (FI_SEND | FI_RECV) #define FT_RMA_MR_ACCESS (FI_READ | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE) @@ -278,6 +291,13 @@ char *cnt_str(char str[FT_STR_LEN], long long cnt); int size_to_count(int size); size_t datatype_to_size(enum fi_datatype datatype); +static inline int ft_use_size(int index, int enable_flags) +{ + return test_size[index].size <= fi->ep_attr->max_msg_size && + ((enable_flags == FT_ENABLE_ALL) || + (enable_flags & test_size[index].enable_flags)); +} + #define FT_PRINTERR(call, retv) \ do { fprintf(stderr, call "(): %s:%d, ret=%d (%s)\n", __FILE__, __LINE__, \ (int) (retv), fi_strerror((int) -(retv))); } while (0) @@ -295,17 +315,17 @@ size_t datatype_to_size(enum fi_datatype datatype); #define FT_DEBUG(fmt, ...) 
#endif -#define FT_EQ_ERR(eq, entry, buf, len) \ - FT_ERR("eq_readerr (Provider errno: %d) : %s", \ - entry.prov_errno, fi_eq_strerror(eq, entry.err, \ - entry.err_data, \ - buf, len)) \ +#define FT_EQ_ERR(eq, entry, buf, len) \ + FT_ERR("eq_readerr (Provider errno: %d) : %s", \ + entry.prov_errno, fi_eq_strerror(eq, entry.prov_errno, \ + entry.err_data, \ + buf, len)) \ -#define FT_CQ_ERR(cq, entry, buf, len) \ - FT_ERR("cq_readerr (Provider errno: %d) : %s", \ - entry.prov_errno, fi_cq_strerror(cq, entry.err, \ - entry.err_data, \ - buf, len)) \ +#define FT_CQ_ERR(cq, entry, buf, len) \ + FT_ERR("cq_readerr (Provider errno: %d) : %s", \ + entry.prov_errno, fi_cq_strerror(cq, entry.prov_errno, \ + entry.err_data, \ + buf, len)) \ #define FT_CLOSE_FID(fd) \ do { \ @@ -347,6 +367,7 @@ int ft_alloc_bufs(); int ft_open_fabric_res(); int ft_getinfo(struct fi_info *hints, struct fi_info **info); int ft_init_fabric(); +int ft_init_oob(); int ft_start_server(); int ft_server_connect(); int ft_client_connect(); @@ -473,6 +494,8 @@ int ft_send_recv_greeting(struct fid_ep *ep); int ft_send_greeting(struct fid_ep *ep); int ft_recv_greeting(struct fid_ep *ep); +int ft_accept_next_client(); + int check_recv_msg(const char *message); uint64_t ft_info_to_mr_access(struct fi_info *info); int ft_alloc_bit_combo(uint64_t fixed, uint64_t opt, uint64_t **combos, int *len); diff --git a/fabtests/man/fabtests.7.md b/fabtests/man/fabtests.7.md index da2a8c9c6a5..e1271213493 100644 --- a/fabtests/man/fabtests.7.md +++ b/fabtests/man/fabtests.7.md @@ -38,7 +38,7 @@ These tests are a mix of very basic functionality tests that show major features of libfabric. *fi_av_xfer* -: Tests communication for unconnected endpoints, as addresses +: Tests communication for connectionless endpoints, as addresses are inserted and removed from the local address vector. *fi_cm_data* @@ -51,7 +51,7 @@ features of libfabric. : A basic datagram endpoint example. *fi_dgram_waitset* -: Transfers datagrams using waitsets for completion notifcation. +: Transfers datagrams using waitsets for completion notification. *fi_inj_complete* : Sends messages using the FI_INJECT_COMPLETE operation flag. @@ -64,7 +64,7 @@ features of libfabric. *fi_msg_epoll* : Transfers messages with completion queues configured to use file - descriptors as wait objetcts. The file descriptors are retrieved + descriptors as wait objects. The file descriptors are retrieved by the program and used directly with the Linux epoll API. *fi_msg_sockets* @@ -101,8 +101,9 @@ features of libfabric. : Transfers multiple messages over an RDM endpoint that are received into a single buffer, posted using the FI_MULTI_RECV flag. -*fi_rdm_rma_simple* -: A simple RMA write example over an RDM endpoint. +*fi_rdm_rma_event* +: An RMA write example over an RDM endpoint that uses RMA events + to notify the peer that the RMA transfer has completed. *fi_rdm_rma_trigger* : A basic example of queuing an RMA write operation that is initiated @@ -121,7 +122,7 @@ features of libfabric. *fi_resmgmt_test* : Tests the resource management enabled feature. This verifies that the - provider prevents applications from overruning local and remote command + provider prevents applications from overrunning local and remote command queues and completion queues. This corresponds to setting the domain attribute resource_mgmt to FI_RM_ENABLED. @@ -209,8 +210,17 @@ testing scope is limited. *fi_mr_test* : Tests memory registration. 
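fi_mr_test, like most of the tests above, depends on registering local buffers with the domain before they can be used for transfers. As a point of reference, the sketch below registers a send/receive buffer with fi_mr_reg() using the same access bits and key that the FT_MSG_MR_ACCESS and FT_MR_KEY macros in shared.h describe; it is a generic illustration, not the code the unit test itself runs.

#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

/* Illustration only: register a buffer for send/recv with a fixed key. */
static int register_msg_buffer(struct fid_domain *domain, void *buf,
                               size_t len, struct fid_mr **mr)
{
        /* Providers with FI_MR_LOCAL require local buffers to be registered
         * for message transfers; FI_MR_PROV_KEY providers will override the
         * requested key (0xC0DE mirrors FT_MR_KEY). */
        return fi_mr_reg(domain, buf, len, FI_SEND | FI_RECV,
                         0, 0xC0DE, 0, mr, NULL);
}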
-*fi_resource_freeing* -: Allocates and closes fabric resources to check for proper cleanup. +*fi_mr_cache_evict* +: Tests provider MR cache eviction capabilities. + +# Multinode + +This test runs a series of tests over multiple formats and patterns to help +validate at scale. The patterns are an all to all, one to all, all to one and +a ring. The tests also run across multiple capabilities, such as messages, rma, +atomics, and tagged messages. Currently, there is no option to run these +capabilities and patterns independently, however the test is short enough to be +all run at once. # Ubertest @@ -220,8 +230,8 @@ number of tests by iterating over a large number of test variables. As a result, a full ubertest run can take a significant amount of time. Because ubertest iterates over input variables, it relies on a test configuration file for control, rather than extensive command line options that are used -by other fabtests. A configuration file must be constructured for each -provider. Example test configurations are at /test_configs. +by other fabtests. A configuration file must be constructed for each +provider. Example test configurations are at test_configs. *fi_ubertest* : This test takes a configure file as input. The file contains a list of @@ -234,11 +244,94 @@ provider. Example test configurations are at /test_configs. ### Config file options -TODO: add all supported config options +The following keys and respective key values may be used in the config file. + +*prov_name* +: Identify the provider(s) to test. E.g. udp, tcp, verbs, + ofi_rxm;verbs; ofi_rxd;udp. + +*test_type* +: FT_TEST_LATENCY, FT_TEST_BANDWIDTH, FT_TEST_UNIT + +*test_class* +: FT_CAP_MSG, FT_CAP_TAGGED, FT_CAP_RMA, FT_CAP_ATOMIC + +*class_function* +: For FT_CAP_MSG and FT_CAP_TAGGED: FT_FUNC_SEND, FT_FUNC_SENDV, FT_FUNC_SENDMSG, + FT_FUNC_INJECT, FT_FUNC_INJECTDATA, FT_FUNC_SENDDATA + + For FT_CAP_RMA: FT_FUNC_WRITE, FT_FUNC_WRITEV, FT_FUNC_WRITEMSG, + FT_FUNC_WRITEDATA, FT_FUNC_INJECT_WRITE, FT_FUNC_INJECT_WRITEDATA + FT_FUNC_READ, FT_FUNC_READV, FT_FUNC_READMSG + + For FT_CAP_ATOMIC: FT_FUNC_ATOMIC, FT_FUNC_ATOMICV, FT_FUNC_ATOMICMSG, + FT_FUNC_INJECT_ATOMIC, FT_FUNC_FETCH_ATOMIC, FT_FUNC_FETCH_ATOMICV, + FT_FUNC_FETCH_ATOMICMSG, FT_FUNC_COMPARE_ATOMIC, FT_FUNC_COMPARE_ATOMICV, + FT_FUNC_COMPARE_ATOMICMSG + +*constant_caps - values OR'ed together* +: FI_RMA, FI_MSG, FI_SEND, FI_RECV, FI_READ, + FI_WRITE, FI_REMOTE_READ, FI_REMOTE_WRITE, FI_TAGGED, FI_DIRECTED_RECV + +*mode - values OR'ed together* +: FI_CONTEXT, FI_RX_CQ_DATA + +*ep_type* +: FI_EP_MSG, FI_EP_DGRAM, FI_EP_RDM + +*comp_type* +: FT_COMP_QUEUE, FT_COMP_CNTR, FT_COMP_ALL + +*av_type* +: FI_AV_MAP, FI_AV_TABLE + +*eq_wait_obj* +: FI_WAIT_NONE, FI_WAIT_UNSPEC, FI_WAIT_FD, FI_WAIT_MUTEX_COND + +*cq_wait_obj* +: FI_WAIT_NONE, FI_WAIT_UNSPEC, FI_WAIT_FD, FI_WAIT_MUTEX_COND + +*cntr_wait_obj* +: FI_WAIT_NONE, FI_WAIT_UNSPEC, FI_WAIT_FD, FI_WAIT_MUTEX_COND + +*threading* +: FI_THREAD_UNSPEC, FI_THREAD_SAFE, FI_THREAD_FID, FI_THREAD_DOMAIN, + FI_THREAD_COMPLETION, FI_THREAD_ENDPOINT -- *threading* - Specify a list of threading levels. This is a hints only config: ubertest - doesn't spawn multiple threads to verify functionality. 
+*progress* +: FI_PROGRESS_MANUAL, FI_PROGRESS_AUTO, FI_PROGRESS_UNSPEC + +*mr_mode* +: (Values OR'ed together) FI_MR_LOCAL, FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, + FI_MR_PROV_KEY + +*op* +: For FT_CAP_ATOMIC: FI_MIN, FI_MAX, FI_SUM, FI_PROD, FI_LOR, FI_LAND, FI_BOR, + FI_BAND, FI_LXOR, FI_BXOR, FI_ATOMIC_READ, FI_ATOMIC_WRITE, FI_CSWAP, + FI_CSWAP_NE, FI_CSWAP_LE, FI_CSWAP_LT, FI_CSWAP_GE, FI_CSWAP_GT, FI_MSWAP + +*datatype* +: For FT_CAP_ATOMIC: FI_INT8, FI_UINT8, FI_INT16, FI_UINT16, FI_INT32, + FI_UINT32, FI_INT64, FI_UINT64, FI_FLOAT, FI_DOUBLE, FI_FLOAT_COMPLEX, + FI_DOUBLE_COMPLEX, FI_LONG_DOUBLE, FI_LONG_DOUBLE_COMPLEX + +*msg_flags - values OR'ed together* +: For FT_FUNC_[SEND,WRITE,READ,ATOMIC]MSG: FI_REMOTE_CQ_DATA, FI_COMPLETION + +*rx_cq_bind_flags - values OR'ed together* +: FI_SELECTIVE_COMPLETION + +*tx_cq_bind_flags - values OR'ed together* +: FI_SELECTIVE_COMPLETION + +*rx_op_flags - values OR'ed together* +: FI_COMPLETION + +*tx_op_flags - values OR'ed together* +: FI_COMPLETION + +*test_flags - values OR'ed together* +: FT_FLAG_QUICKTEST # HOW TO RUN TESTS @@ -274,6 +367,10 @@ the list available for that test. : Use the specified endpoint type for the test. Valid options are msg, dgram, and rdm. The default endpoint type is rdm. +*-D * +: Allocate data buffers on the specified device, rather than in host + memory. Valid options are ze and cuda. + *-a
* : The name of a shared address vector. This option only applies to tests that support shared address vectors. @@ -287,6 +384,9 @@ the list available for that test. *-s
* : Specifies the address of the local endpoint. +*-F +: Specifies the address format. + *-b[=oob_port]* : Enables out-of-band (via sockets) address exchange and test synchronization. A port for the out-of-band connection may be specified @@ -297,9 +397,15 @@ the list available for that test. out-of-band connection may be specified as part of this option to override the default. Cannot be used together with the '-b' option. +*-U* +: Run fabtests with FI_DELIVERY_COMPLETE. + *-I * : Number of data transfer iterations. +*-Q* +: Associated any EQ with the domain, rather than directly with the EP. + *-w * : Number of warm-up data transfer iterations. @@ -336,6 +442,9 @@ the list available for that test. *-M * : For multicast tests, specifies the address of the multicast group to join. +*-v* +: Add data verification check to data transfers. + # USAGE EXAMPLES ## A simple example @@ -357,6 +466,15 @@ This will run "fi_rdm_atomic" for all atomic operations with - 1024 bytes message size - server node as 123.168.0.123 +## Run multinode tests + + Server and clients are invoked with the same command: + fi_multinode -n -s -C + + A process on the server must be started before any of the clients can be started + succesfully. -C lists the mode that the tests will run in. Currently the options are + for rma and msg. If not provided, the test will default to msg. + ## Run fi_ubertest run server: fi_ubertest diff --git a/fabtests/man/man1/fi_rdm_rma_simple.1 b/fabtests/man/man1/fi_multinode.1 similarity index 100% rename from fabtests/man/man1/fi_rdm_rma_simple.1 rename to fabtests/man/man1/fi_multinode.1 diff --git a/fabtests/man/man1/fi_rdm_rma_event.1 b/fabtests/man/man1/fi_rdm_rma_event.1 new file mode 100644 index 00000000000..3f6ccf96f11 --- /dev/null +++ b/fabtests/man/man1/fi_rdm_rma_event.1 @@ -0,0 +1 @@ +.so man7/fabtests.7 diff --git a/fabtests/man/man7/fabtests.7 b/fabtests/man/man7/fabtests.7 index 26c9682cd28..165187c9dc7 100644 --- a/fabtests/man/man7/fabtests.7 +++ b/fabtests/man/man7/fabtests.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fabtests" "7" "2019\-07\-12" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fabtests" "7" "2020\-12\-01" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -37,8 +37,8 @@ These tests are a mix of very basic functionality tests that show major features of libfabric. .TP .B \f[I]fi_av_xfer\f[] -Tests communication for unconnected endpoints, as addresses are inserted -and removed from the local address vector. +Tests communication for connectionless endpoints, as addresses are +inserted and removed from the local address vector. .RS .RE .TP @@ -58,7 +58,7 @@ A basic datagram endpoint example. .RE .TP .B \f[I]fi_dgram_waitset\f[] -Transfers datagrams using waitsets for completion notifcation. +Transfers datagrams using waitsets for completion notification. .RS .RE .TP @@ -79,7 +79,7 @@ A basic message endpoint example. .TP .B \f[I]fi_msg_epoll\f[] Transfers messages with completion queues configured to use file -descriptors as wait objetcts. +descriptors as wait objects. The file descriptors are retrieved by the program and used directly with the Linux epoll API. .RS @@ -138,8 +138,9 @@ a single buffer, posted using the FI_MULTI_RECV flag. .RS .RE .TP -.B \f[I]fi_rdm_rma_simple\f[] -A simple RMA write example over an RDM endpoint. +.B \f[I]fi_rdm_rma_event\f[] +An RMA write example over an RDM endpoint that uses RMA events to notify +the peer that the RMA transfer has completed. 
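The renamed fi_rdm_rma_event test depends on the FI_RMA_EVENT capability, which lets the target of an RMA operation observe that a peer's write has landed without the peer sending a separate message. One common arrangement is sketched below: bind a completion counter for FI_REMOTE_WRITE events to the registered target region and wait on it. Whether the test binds the counter to the memory region or to the endpoint is not visible in this patch, so treat the code as illustrative; cleanup and error paths are trimmed.

#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

/* Target-side sketch: observe inbound RMA writes via a remote-write counter.
 * Requires FI_RMA_EVENT in the endpoint capabilities. */
static int wait_for_remote_write(struct fid_domain *domain, struct fid_mr *mr,
                                 uint64_t expected, int timeout_ms)
{
        struct fi_cntr_attr attr = {
                .events = FI_CNTR_EVENTS_COMP,
                .wait_obj = FI_WAIT_UNSPEC,
        };
        struct fid_cntr *cntr;
        int ret;

        ret = fi_cntr_open(domain, &attr, &cntr, NULL);
        if (ret)
                return ret;

        /* Count completions of remote writes that target this region. */
        ret = fi_mr_bind(mr, &cntr->fid, FI_REMOTE_WRITE);
        if (ret)
                return ret;

        /* Blocks until 'expected' remote writes have completed locally. */
        return fi_cntr_wait(cntr, expected, timeout_ms);
}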
.RS .RE .TP @@ -169,7 +170,7 @@ Tests canceling posted receives for tagged messages. .TP .B \f[I]fi_resmgmt_test\f[] Tests the resource management enabled feature. -This verifies that the provider prevents applications from overruning +This verifies that the provider prevents applications from overrunning local and remote command queues and completion queues. This corresponds to setting the domain attribute resource_mgmt to FI_RM_ENABLED. @@ -303,10 +304,19 @@ Tests memory registration. .RS .RE .TP -.B \f[I]fi_resource_freeing\f[] -Allocates and closes fabric resources to check for proper cleanup. +.B \f[I]fi_mr_cache_evict\f[] +Tests provider MR cache eviction capabilities. .RS .RE +.SH Multinode +.PP +This test runs a series of tests over multiple formats and patterns to +help validate at scale. +The patterns are an all to all, one to all, all to one and a ring. +The tests also run across multiple capabilities, such as messages, rma, +atomics, and tagged messages. +Currently, there is no option to run these capabilities and patterns +independently, however the test is short enough to be all run at once. .SH Ubertest .PP This is a comprehensive latency, bandwidth, and functionality test that @@ -317,8 +327,8 @@ As a result, a full ubertest run can take a significant amount of time. Because ubertest iterates over input variables, it relies on a test configuration file for control, rather than extensive command line options that are used by other fabtests. -A configuration file must be constructured for each provider. -Example test configurations are at /test_configs. +A configuration file must be constructed for each provider. +Example test configurations are at test_configs. .TP .B \f[I]fi_ubertest\f[] This test takes a configure file as input. @@ -334,11 +344,144 @@ iterations of each test. .RE .SS Config file options .PP -TODO: add all supported config options -.IP \[bu] 2 -\f[I]threading\f[] Specify a list of threading levels. -This is a hints only config: ubertest doesn\[aq]t spawn multiple threads -to verify functionality. +The following keys and respective key values may be used in the config +file. +.TP +.B \f[I]prov_name\f[] +Identify the provider(s) to test. +E.g. +udp, tcp, verbs, ofi_rxm;verbs; ofi_rxd;udp. 
+.RS +.RE +.TP +.B \f[I]test_type\f[] +FT_TEST_LATENCY, FT_TEST_BANDWIDTH, FT_TEST_UNIT +.RS +.RE +.TP +.B \f[I]test_class\f[] +FT_CAP_MSG, FT_CAP_TAGGED, FT_CAP_RMA, FT_CAP_ATOMIC +.RS +.RE +.TP +.B \f[I]class_function\f[] +For FT_CAP_MSG and FT_CAP_TAGGED: FT_FUNC_SEND, FT_FUNC_SENDV, +FT_FUNC_SENDMSG, FT_FUNC_INJECT, FT_FUNC_INJECTDATA, FT_FUNC_SENDDATA +.RS +.RE +.PP +For FT_CAP_RMA: FT_FUNC_WRITE, FT_FUNC_WRITEV, FT_FUNC_WRITEMSG, +FT_FUNC_WRITEDATA, FT_FUNC_INJECT_WRITE, FT_FUNC_INJECT_WRITEDATA +FT_FUNC_READ, FT_FUNC_READV, FT_FUNC_READMSG +.PP +For FT_CAP_ATOMIC: FT_FUNC_ATOMIC, FT_FUNC_ATOMICV, FT_FUNC_ATOMICMSG, +FT_FUNC_INJECT_ATOMIC, FT_FUNC_FETCH_ATOMIC, FT_FUNC_FETCH_ATOMICV, +FT_FUNC_FETCH_ATOMICMSG, FT_FUNC_COMPARE_ATOMIC, +FT_FUNC_COMPARE_ATOMICV, FT_FUNC_COMPARE_ATOMICMSG +.TP +.B \f[I]constant_caps \- values OR\[aq]ed together\f[] +FI_RMA, FI_MSG, FI_SEND, FI_RECV, FI_READ, FI_WRITE, FI_REMOTE_READ, +FI_REMOTE_WRITE, FI_TAGGED, FI_DIRECTED_RECV +.RS +.RE +.TP +.B \f[I]mode \- values OR\[aq]ed together\f[] +FI_CONTEXT, FI_RX_CQ_DATA +.RS +.RE +.TP +.B \f[I]ep_type\f[] +FI_EP_MSG, FI_EP_DGRAM, FI_EP_RDM +.RS +.RE +.TP +.B \f[I]comp_type\f[] +FT_COMP_QUEUE, FT_COMP_CNTR, FT_COMP_ALL +.RS +.RE +.TP +.B \f[I]av_type\f[] +FI_AV_MAP, FI_AV_TABLE +.RS +.RE +.TP +.B \f[I]eq_wait_obj\f[] +FI_WAIT_NONE, FI_WAIT_UNSPEC, FI_WAIT_FD, FI_WAIT_MUTEX_COND +.RS +.RE +.TP +.B \f[I]cq_wait_obj\f[] +FI_WAIT_NONE, FI_WAIT_UNSPEC, FI_WAIT_FD, FI_WAIT_MUTEX_COND +.RS +.RE +.TP +.B \f[I]cntr_wait_obj\f[] +FI_WAIT_NONE, FI_WAIT_UNSPEC, FI_WAIT_FD, FI_WAIT_MUTEX_COND +.RS +.RE +.TP +.B \f[I]threading\f[] +FI_THREAD_UNSPEC, FI_THREAD_SAFE, FI_THREAD_FID, FI_THREAD_DOMAIN, +FI_THREAD_COMPLETION, FI_THREAD_ENDPOINT +.RS +.RE +.TP +.B \f[I]progress\f[] +FI_PROGRESS_MANUAL, FI_PROGRESS_AUTO, FI_PROGRESS_UNSPEC +.RS +.RE +.TP +.B \f[I]mr_mode\f[] +(Values OR\[aq]ed together) FI_MR_LOCAL, FI_MR_VIRT_ADDR, +FI_MR_ALLOCATED, FI_MR_PROV_KEY +.RS +.RE +.TP +.B \f[I]op\f[] +For FT_CAP_ATOMIC: FI_MIN, FI_MAX, FI_SUM, FI_PROD, FI_LOR, FI_LAND, +FI_BOR, FI_BAND, FI_LXOR, FI_BXOR, FI_ATOMIC_READ, FI_ATOMIC_WRITE, +FI_CSWAP, FI_CSWAP_NE, FI_CSWAP_LE, FI_CSWAP_LT, FI_CSWAP_GE, +FI_CSWAP_GT, FI_MSWAP +.RS +.RE +.TP +.B \f[I]datatype\f[] +For FT_CAP_ATOMIC: FI_INT8, FI_UINT8, FI_INT16, FI_UINT16, FI_INT32, +FI_UINT32, FI_INT64, FI_UINT64, FI_FLOAT, FI_DOUBLE, FI_FLOAT_COMPLEX, +FI_DOUBLE_COMPLEX, FI_LONG_DOUBLE, FI_LONG_DOUBLE_COMPLEX +.RS +.RE +.TP +.B \f[I]msg_flags \- values OR\[aq]ed together\f[] +For FT_FUNC_[SEND,WRITE,READ,ATOMIC]MSG: FI_REMOTE_CQ_DATA, +FI_COMPLETION +.RS +.RE +.TP +.B \f[I]rx_cq_bind_flags \- values OR\[aq]ed together\f[] +FI_SELECTIVE_COMPLETION +.RS +.RE +.TP +.B \f[I]tx_cq_bind_flags \- values OR\[aq]ed together\f[] +FI_SELECTIVE_COMPLETION +.RS +.RE +.TP +.B \f[I]rx_op_flags \- values OR\[aq]ed together\f[] +FI_COMPLETION +.RS +.RE +.TP +.B \f[I]tx_op_flags \- values OR\[aq]ed together\f[] +FI_COMPLETION +.RS +.RE +.TP +.B \f[I]test_flags \- values OR\[aq]ed together\f[] +FT_FLAG_QUICKTEST +.RS +.RE .SH HOW TO RUN TESTS .IP "(1)" 4 Fabtests requires that libfabric be installed on the system, and at @@ -386,6 +529,13 @@ Valid options are msg, dgram, and rdm. The default endpoint type is rdm. .RS .RE +.TP +.B \f[I]\-D \f[] +Allocate data buffers on the specified device, rather than in host +memory. +Valid options are ze and cuda. +.RS +.RE *\-a .IP \[bu] 2 : The name of a shared address vector. @@ -404,6 +554,11 @@ Specifies the port number of the peer endpoint, overriding the default. 
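To make the ubertest config keys listed above concrete, here is a minimal illustrative entry (a sketch only; the layout mirrors the example files shipped under test_configs, and the particular values are just one possible selection):

    {
        prov_name: tcp,
        test_type: [
            FT_TEST_LATENCY,
        ],
        class_function: [
            FT_FUNC_SEND,
        ],
        ep_type: [
            FI_EP_RDM,
        ],
        comp_type: [
            FT_COMP_QUEUE,
        ],
        test_class: [
            FT_CAP_MSG,
        ],
        test_flags: FT_FLAG_QUICKTEST
    },

A file holding one or more such entries is what fi_ubertest consumes (passed via -u, as in the runfabtests.sh changes below).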
.IP \[bu] 2 : Specifies the address of the local endpoint. .TP +.B *\-F +Specifies the address format. +.RS +.RE +.TP .B \f[I]\-b[=oob_port]\f[] Enables out\-of\-band (via sockets) address exchange and test synchronization. @@ -420,11 +575,21 @@ Cannot be used together with the \[aq]\-b\[aq] option. .RS .RE .TP +.B \f[I]\-U\f[] +Run fabtests with FI_DELIVERY_COMPLETE. +.RS +.RE +.TP .B \f[I]\-I \f[] Number of data transfer iterations. .RS .RE .TP +.B \f[I]\-Q\f[] +Associated any EQ with the domain, rather than directly with the EP. +.RS +.RE +.TP .B \f[I]\-w \f[] Number of warm\-up data transfer iterations. .RS @@ -482,6 +647,11 @@ For multicast tests, specifies the address of the multicast group to join. .RS .RE +.TP +.B \f[I]\-v\f[] +Add data verification check to data transfers. +.RS +.RE .SH USAGE EXAMPLES .SS A simple example .IP @@ -512,6 +682,20 @@ This will run "fi_rdm_atomic" for all atomic operations with \-\ server\ node\ as\ 123.168.0.123 \f[] .fi +.SS Run multinode tests +.IP +.nf +\f[C] +Server\ and\ clients\ are\ invoked\ with\ the\ same\ command: +\ \ \ \ fi_multinode\ \-n\ \ \-s\ \ \-C\ + +A\ process\ on\ the\ server\ must\ be\ started\ before\ any\ of\ the\ clients\ can\ be\ started +succesfully.\ \-C\ lists\ the\ mode\ that\ the\ tests\ will\ run\ in.\ Currently\ the\ options\ are +\f[] +.fi +.PP +for rma and msg. +If not provided, the test will default to msg. .SS Run fi_ubertest .IP .nf diff --git a/fabtests/multinode/include/coll_test.h b/fabtests/multinode/include/coll_test.h new file mode 100644 index 00000000000..92eae06804d --- /dev/null +++ b/fabtests/multinode/include/coll_test.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
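To illustrate the multinode usage described above, a hypothetical three-process job (the rank count and address below are made-up example values) starts the server first and then runs the identical command on each client node:

    server  $ fi_multinode -n 3 -s 192.168.0.123 -C rma
    client1 $ fi_multinode -n 3 -s 192.168.0.123 -C rma
    client2 $ fi_multinode -n 3 -s 192.168.0.123 -C rma

All ranks point -s at the server's address for the out-of-band setup; -C selects rma or msg, with msg used when -C is omitted or unrecognized.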
+ */ +#pragma once + + + +typedef int (*coll_test_setup_t)(); +typedef int (*coll_test_run_t)(); +typedef void (*coll_test_teardown_t)(); + +struct coll_test { + char *name; + coll_test_setup_t setup; + coll_test_run_t run; + coll_test_teardown_t teardown; +}; \ No newline at end of file diff --git a/fabtests/multinode/include/core.h b/fabtests/multinode/include/core.h new file mode 100644 index 00000000000..fc9d64ab397 --- /dev/null +++ b/fabtests/multinode/include/core.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2017-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include + +#include "pattern.h" + +#define PM_DEFAULT_OOB_PORT (8228) + +enum multi_xfer{ + multi_msg, + multi_rma, +}; + +struct multi_xfer_method { + char* name; + int (*send)(); + int (*recv)(); + int (*wait)(); +}; + +struct pm_job_info { + size_t my_rank; + size_t num_ranks; + int sock; + int *clients; //only valid for server + struct fi_rma_iov *multi_iovs; + + struct sockaddr_storage oob_server_addr; + size_t server_addr_len; + void *names; + size_t name_len; + fi_addr_t *fi_addrs; + enum multi_xfer transfer_method; +}; + +struct multinode_xfer_state { + int iteration; + size_t recvs_posted; + size_t sends_posted; + + size_t tx_window; + size_t rx_window; + + /* pattern iterator state */ + int cur_source; + int cur_target; + + bool all_recvs_posted; + bool all_sends_posted; + bool all_completions_done; + + uint64_t tx_flags; + uint64_t rx_flags; +}; + +extern struct pm_job_info pm_job; +int multinode_run_tests(int argc, char **argv); +int pm_allgather(void *my_item, void *items, int item_size); +void pm_barrier(); +int multi_msg_send(); +int multi_msg_recv(); +int multi_msg_wait(); +int multi_rma_write(); +int multi_rma_recv(); +int multi_rma_wait(); diff --git a/prov/efa/src/efa_verbs/efa_ib.h b/fabtests/multinode/include/pattern.h similarity index 68% rename from prov/efa/src/efa_verbs/efa_ib.h rename to fabtests/multinode/include/pattern.h index 5c86e3978fb..43c36bfc490 100644 --- a/prov/efa/src/efa_verbs/efa_ib.h +++ b/fabtests/multinode/include/pattern.h @@ -1,13 +1,11 @@ /* - * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. 
- * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2017-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright (c) 2017-2019 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: + * BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following @@ -32,22 +30,25 @@ * SOFTWARE. */ -#ifndef EFA_IB_H -#define EFA_IB_H +#pragma once -#include "config.h" -#include -#include +#include +#include +#include +#include -#include "infiniband/efa_verbs.h" -#include "efa-abi.h" -#include "efa.h" +/* Initial value for iterator position. */ +#define PATTERN_NO_CURRENT (-1) -#define HIDDEN __attribute__((visibility("hidden"))) +/* Number of patterns to test */ +extern const int NUM_TESTS; -extern HIDDEN int abi_ver; +struct pattern_ops { + char *name; + int (*next_source)(int *cur); + int (*next_target) (int *cur); +}; + +extern struct pattern_ops patterns[]; -HIDDEN int efa_ib_init(struct ibv_device ***list); -char *get_sysfs_path(void); -#endif /* EFA_IB_H */ diff --git a/fabtests/multinode/src/core.c b/fabtests/multinode/src/core.c new file mode 100644 index 00000000000..f39efc23136 --- /dev/null +++ b/fabtests/multinode/src/core.c @@ -0,0 +1,507 @@ +/* + * Copyright (c) 2017-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHWARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. const NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER const AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS const THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +char *tx_barrier; +char *rx_barrier; +struct fid_mr *mr_barrier; +struct fi_context2 *barrier_tx_ctx, *barrier_rx_ctx; + +struct pattern_ops *pattern; +struct multinode_xfer_state state; +struct multi_xfer_method method; +struct multi_xfer_method multi_xfer_methods[] = { + { + .name = "send/recv", + .send = multi_msg_send, + .recv = multi_msg_recv, + .wait = multi_msg_wait, + }, + { + .name = "rma", + .send = multi_rma_write, + .recv = multi_rma_recv, + .wait = multi_rma_wait, + } +}; + +static int multi_setup_fabric(int argc, char **argv) +{ + char my_name[FT_MAX_CTRL_MSG]; + size_t len; + int i, ret; + struct fi_rma_iov *remote = malloc(sizeof(*remote)); + + hints->ep_attr->type = FI_EP_RDM; + hints->mode = FI_CONTEXT; + hints->domain_attr->mr_mode = opts.mr_mode; + + if (pm_job.transfer_method == multi_msg) { + hints->caps = FI_MSG; + } else if (pm_job.transfer_method == multi_rma) { + hints->caps = FI_MSG | FI_RMA; + } else { + printf("Not a valid cabability\n"); + return -FI_ENODATA; + } + + method = multi_xfer_methods[pm_job.transfer_method]; + + tx_seq = 0; + rx_seq = 0; + tx_cq_cntr = 0; + rx_cq_cntr = 0; + + ret = ft_hmem_init(opts.iface); + if (ret) + return ret; + + if (pm_job.my_rank != 0) + pm_barrier(); + + ret = ft_getinfo(hints, &fi); + if (ret) + return ret; + + ret = ft_open_fabric_res(); + if (ret) + return ret; + + opts.av_size = pm_job.num_ranks; + ret = ft_alloc_active_res(fi); + if (ret) + return ret; + + ret = ft_enable_ep(ep, eq, av, txcq, rxcq, txcntr, rxcntr); + if (ret) + return ret; + + len = FT_MAX_CTRL_MSG; + ret = fi_getname(&ep->fid, (void *) my_name, &len); + if (ret) { + FT_PRINTERR("error determining local endpoint name\n", ret); + goto err; + } + + pm_job.name_len = 256; + pm_job.names = malloc(pm_job.name_len * pm_job.num_ranks); + if (!pm_job.names) { + FT_ERR("error allocating memory for address exchange\n"); + ret = -FI_ENOMEM; + goto err; + } + + if (pm_job.my_rank == 0) + pm_barrier(); + + ret = pm_allgather(my_name, pm_job.names, pm_job.name_len); + if (ret) { + FT_PRINTERR("error exchanging addresses\n", ret); + goto err; + } + + pm_job.fi_addrs = calloc(pm_job.num_ranks, sizeof(*pm_job.fi_addrs)); + if (!pm_job.fi_addrs) { + FT_ERR("error allocating memory for av fi addrs\n"); + ret = -FI_ENOMEM; + goto err; + } + + for (i = 0; i < pm_job.num_ranks; i++) { + ret = fi_av_insert(av, (char*)pm_job.names + i * pm_job.name_len, 1, + &pm_job.fi_addrs[i], 0, NULL); + if (ret != 1) { + FT_ERR("unable to insert all addresses into AV table\n"); + ret = -1; + goto err; + } + } + + pm_job.multi_iovs = malloc(sizeof(*(pm_job.multi_iovs)) * pm_job.num_ranks); + if (!pm_job.multi_iovs) { + FT_ERR("error allocation memory for rma_iovs\n"); + goto err; + } + + if (fi->domain_attr->mr_mode & FI_MR_VIRT_ADDR) + remote->addr = (uintptr_t) rx_buf; + else + remote->addr = 0; + + remote->key = fi_mr_key(mr); + remote->len = rx_size; + + ret = pm_allgather(remote, pm_job.multi_iovs, sizeof(*remote)); + if (ret) { + FT_ERR("error exchanging rma_iovs\n"); + goto err; + } + for (i = 0; i < pm_job.num_ranks; i++) { + pm_job.multi_iovs[i].addr += (tx_size * pm_job.my_rank); + } + + return 0; +err: + ft_free_res(); + return ft_exit_code(ret); +} + +static int ft_progress(struct fid_cq *cq, uint64_t total, uint64_t *cq_cntr) +{ + struct fi_cq_err_entry 
comp; + int ret; + + ret = fi_cq_read(cq, &comp, 1); + if (ret > 0) + (*cq_cntr)++; + + if (ret >= 0 || ret == -FI_EAGAIN) + return 0; + + if (ret == -FI_EAVAIL) { + ret = ft_cq_readerr(cq); + (*cq_cntr)++; + } else { + FT_PRINTERR("fi_cq_read/sread", ret); + } + return ret; +} + +int multi_msg_recv() +{ + int ret, offset; + + /* post receives */ + while (!state.all_recvs_posted && state.rx_window) { + + ret = pattern->next_source(&state.cur_source); + if (ret == -FI_ENODATA) { + state.all_recvs_posted = true; + break; + } else if (ret < 0) { + return ret; + } + + offset = state.recvs_posted % opts.window_size ; + assert(rx_ctx_arr[offset].state == OP_DONE); + + ret = ft_post_rx_buf(ep, opts.transfer_size, + &rx_ctx_arr[offset].context, + rx_ctx_arr[offset].buf, + rx_ctx_arr[offset].desc, 0); + if (ret) + return ret; + + rx_ctx_arr[offset].state = OP_PENDING; + state.recvs_posted++; + state.rx_window--; + } + return 0; +} + +int multi_msg_send() +{ + int ret, offset; + fi_addr_t dest; + + while (!state.all_sends_posted && state.tx_window) { + + ret = pattern->next_target(&state.cur_target); + if (ret == -FI_ENODATA) { + state.all_sends_posted = true; + break; + } else if (ret < 0) { + return ret; + } + + offset = state.sends_posted % opts.window_size; + assert(tx_ctx_arr[offset].state == OP_DONE); + + dest = pm_job.fi_addrs[state.cur_target]; + ret = ft_post_tx_buf(ep, dest, opts.transfer_size, + NO_CQ_DATA, + &tx_ctx_arr[offset].context, + tx_ctx_arr[offset].buf, + tx_ctx_arr[offset].desc, 0); + if (ret) + return ret; + + tx_ctx_arr[offset].state = OP_PENDING; + state.sends_posted++; + state.tx_window--; + } + return 0; +} + +int multi_msg_wait() +{ + int ret, i; + + ret = ft_get_tx_comp(tx_seq); + if (ret) + return ret; + + ret = ft_get_rx_comp(rx_seq); + if (ret) + return ret; + + for (i = 0; i < opts.window_size; i++) { + rx_ctx_arr[i].state = OP_DONE; + tx_ctx_arr[i].state = OP_DONE; + } + + state.rx_window = opts.window_size; + state.tx_window = opts.window_size; + + if (state.all_recvs_posted && state.all_sends_posted) + state.all_completions_done = true; + + return 0; +} + +int multi_rma_write() +{ + int ret, rc; + + while (!state.all_sends_posted && state.tx_window) { + + ret = pattern->next_target(&state.cur_target); + if (ret == -FI_ENODATA) { + state.all_sends_posted = true; + break; + } else if (ret < 0) { + return ret; + } + + snprintf((char*) tx_buf + tx_size * state.cur_target, tx_size, + "Hello World! 
from %zu to %i on the %zuth iteration, %s test", + pm_job.my_rank, state.cur_target, + (size_t) tx_seq, pattern->name); + + while (1) { + ret = fi_write(ep, + tx_buf + tx_size * state.cur_target, + opts.transfer_size, mr_desc, + pm_job.fi_addrs[state.cur_target], + pm_job.multi_iovs[state.cur_target].addr, + pm_job.multi_iovs[state.cur_target].key, + &tx_ctx_arr[state.tx_window].context); + if (!ret) + break; + + if (ret != -FI_EAGAIN) { + printf("RMA write failed"); + return ret; + } + + rc = ft_progress(txcq, tx_seq, &tx_cq_cntr); + if (rc && rc != -FI_EAGAIN) { + printf("Failed to get rma completion"); + return rc; + } + } + tx_seq++; + + state.sends_posted++; + state.tx_window--; + } + return 0; +} + +int multi_rma_recv() +{ + state.all_recvs_posted = true; + return 0; +} + +int multi_rma_wait() +{ + int ret; + + ret = ft_get_tx_comp(tx_seq); + if (ret) + return ret; + + state.rx_window = opts.window_size; + state.tx_window = opts.window_size; + + if (state.all_recvs_posted && state.all_sends_posted) + state.all_completions_done = true; + + return 0; +} + +int send_recv_barrier(int sync) +{ + int ret, i; + + for(i = 0; i < pm_job.num_ranks; i++) { + + ret = ft_post_rx_buf(ep, opts.transfer_size, + &barrier_rx_ctx[i], + rx_buf, mr_desc, 0); + if (ret) + return ret; + } + + for (i = 0; i < pm_job.num_ranks; i++) { + ret = ft_post_tx_buf(ep, pm_job.fi_addrs[i], 0, + NO_CQ_DATA, &barrier_tx_ctx[i], + tx_buf, mr_desc, 0); + if (ret) + return ret; + } + + ret = ft_get_tx_comp(tx_seq); + if (ret) + return ret; + + ret = ft_get_rx_comp(rx_seq); + + return ret; +} + +static inline void multi_init_state() +{ + state.cur_source = PATTERN_NO_CURRENT; + state.cur_target = PATTERN_NO_CURRENT; + + state.all_completions_done = false; + state.all_recvs_posted = false; + state.all_sends_posted = false; + + state.rx_window = opts.window_size; + state.tx_window = opts.window_size; +} + +static int multi_run_test() +{ + int ret; + int iter; + + for (iter = 0; iter < opts.iterations; iter++) { + + multi_init_state(); + while (!state.all_completions_done || + !state.all_recvs_posted || + !state.all_sends_posted) { + ret = method.recv(); + if (ret) + return ret; + + ret = method.send(); + if (ret) + return ret; + + ret = method.wait(); + if (ret) + return ret; + } + + ret = send_recv_barrier(iter); + if (ret) + return ret; + } + return 0; +} + +static void pm_job_free_res() +{ + free(pm_job.names); + free(pm_job.fi_addrs); + free(pm_job.multi_iovs); + + free(barrier_tx_ctx); + free(barrier_rx_ctx); + + FT_CLOSE_FID(mr_barrier); +} + +int multinode_run_tests(int argc, char **argv) +{ + int ret = FI_SUCCESS; + int i; + + + barrier_tx_ctx = malloc(sizeof(*barrier_tx_ctx) * pm_job.num_ranks); + if (!barrier_tx_ctx) + return -FI_ENOMEM; + + barrier_rx_ctx = malloc(sizeof(*barrier_rx_ctx) * pm_job.num_ranks); + if (!barrier_rx_ctx) + return -FI_ENOMEM; + + ret = multi_setup_fabric(argc, argv); + if (ret) + return ret; + + + for (i = 0; i < NUM_TESTS && !ret; i++) { + printf("starting %s... ", patterns[i].name); + pattern = &patterns[i]; + ret = multi_run_test(); + if (ret) + printf("failed\n"); + else + printf("passed\n"); + + fflush(stdout); + } + + pm_job_free_res(); + ft_free_res(); + return ft_exit_code(ret); +} + diff --git a/fabtests/multinode/src/core_coll.c b/fabtests/multinode/src/core_coll.c new file mode 100644 index 00000000000..29fcc9aefa7 --- /dev/null +++ b/fabtests/multinode/src/core_coll.c @@ -0,0 +1,581 @@ +/* + * Copyright (c) 2017-2019 Intel Corporation. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHWARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. const NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER const AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS const THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +struct fid_av_set *av_set; +fi_addr_t world_addr; +fi_addr_t coll_addr; +struct fid_mc *coll_mc; + + +static int wait_for_event(uint32_t event) +{ + uint32_t ev; + int err; + struct fi_cq_err_entry comp = { 0 }; + + do { + err = fi_eq_read(eq, &ev, NULL, 0, 0); + if (err >= 0) { + FT_DEBUG("found eq entry %d\n", event); + if (ev == event) { + return FI_SUCCESS; + } + } else if (err != -EAGAIN) { + return err; + } + + err = fi_cq_read(rxcq, &comp, 1); + if (err < 0 && err != -EAGAIN) { + return err; + } + + err = fi_cq_read(txcq, &comp, 1); + if (err < 0 && err != -EAGAIN) { + return err; + } + } while (err == -FI_EAGAIN); + + return err; +} + +static int wait_for_comp(void *ctx) +{ + int err; + struct fi_cq_err_entry comp = { 0 }; + + do { + err = fi_cq_read(rxcq, &comp, 1); + if (err < 0 && err != -EAGAIN) { + return err; + } + + if (comp.op_context && comp.op_context == ctx) { + return FI_SUCCESS; + } + + err = fi_cq_read(txcq, &comp, 1); + if (err < 0 && err != -EAGAIN) { + return err; + } + + if (comp.op_context && comp.op_context == ctx) { + return FI_SUCCESS; + } + } while (err == -FI_EAGAIN); + + return err; +} + +static int coll_setup() +{ + int err; + struct fi_av_set_attr av_set_attr; + + av_set_attr.count = pm_job.num_ranks; + av_set_attr.start_addr = 0; + av_set_attr.end_addr = pm_job.num_ranks - 1; + av_set_attr.stride = 1; + + err = fi_av_set(av, &av_set_attr, &av_set, NULL); + if (err) { + FT_DEBUG("av_set creation failed ret = %d\n", err); + } + + err = fi_av_set_addr(av_set, &world_addr); + if (err) { + FT_DEBUG("failed to get collective addr = %d (%s)\n", err, + fi_strerror(err)); + return err; + } + + err = fi_join_collective(ep, world_addr, av_set, 0, &coll_mc, NULL); + if (err) { + FT_DEBUG("collective join failed ret = %d (%s)\n", err, fi_strerror(err)); + return err; + } + + return wait_for_event(FI_JOIN_COMPLETE); +} + +static void coll_teardown() +{ + fi_close(&coll_mc->fid); + fi_close(&av_set->fid); +} + +static int join_test_run() +{ + return 
FI_SUCCESS; +} + +static int barrier_test_run() +{ + int err; + uint64_t done_flag; + struct fi_collective_attr attr; + + attr.op = FI_NOOP; + attr.datatype = FI_VOID; + attr.mode = 0; + err = fi_query_collective(domain, FI_BARRIER, &attr, 0); + if (err) { + FT_DEBUG("barrier collective not supported: %d (%s)\n", err, + fi_strerror(err)); + return err; + } + + coll_addr = fi_mc_addr(coll_mc); + err = fi_barrier(ep, coll_addr, &done_flag); + if (err) { + FT_DEBUG("collective barrier failed: %d (%s)\n", err, fi_strerror(err)); + return err; + } + + return wait_for_comp(&done_flag); +} + +static int sum_all_reduce_test_run() +{ + int err; + uint64_t done_flag; + uint64_t result = 0; + uint64_t expect_result = 0; + uint64_t data = pm_job.my_rank; + size_t count = 1; + uint64_t i; + struct fi_collective_attr attr; + + attr.op = FI_SUM; + attr.datatype = FI_UINT64; + attr.mode = 0; + err = fi_query_collective(domain, FI_ALLREDUCE, &attr, 0); + if (err) { + FT_DEBUG("SUM AllReduce collective not supported: %d (%s)\n", err, + fi_strerror(err)); + return err; + } + + for (i = 0; i < pm_job.num_ranks; i++) { + expect_result += i; + } + + coll_addr = fi_mc_addr(coll_mc); + err = fi_allreduce(ep, &data, count, NULL, &result, NULL, coll_addr, FI_UINT64, + FI_SUM, 0, &done_flag); + if (err) { + FT_DEBUG("collective allreduce failed: %d (%s)\n", err, fi_strerror(err)); + return err; + } + + err = wait_for_comp(&done_flag); + if (err) + return err; + + if (result == expect_result) + return FI_SUCCESS; + + FT_DEBUG("allreduce failed; expect: %ld, actual: %ld\n", expect_result, result); + return -FI_ENOEQ; +} + +static int all_gather_test_run() +{ + int err; + uint64_t done_flag; + uint64_t *result; + uint64_t *expect_result; + uint64_t data = pm_job.my_rank; + size_t count = 1; + uint64_t i; + struct fi_collective_attr attr; + + attr.op = FI_NOOP; + attr.datatype = FI_UINT64; + attr.mode = 0; + err = fi_query_collective(domain, FI_ALLGATHER, &attr, 0); + if (err) { + FT_DEBUG("SUM AllReduce collective not supported: %d (%s)\n", err, + fi_strerror(err)); + return err; + } + + result = malloc(pm_job.num_ranks * sizeof(*expect_result)); + expect_result = malloc(pm_job.num_ranks * sizeof(*expect_result)); + for (i = 0; i < pm_job.num_ranks; i++) { + expect_result[i] = i; + } + + coll_addr = fi_mc_addr(coll_mc); + err = fi_allgather(ep, &data, count, NULL, result, NULL, coll_addr, FI_UINT64, 0, + &done_flag); + if (err) { + FT_DEBUG("collective allreduce failed: %d (%s)\n", err, fi_strerror(err)); + goto errout; + } + + err = wait_for_comp(&done_flag); + if (err) + goto errout; + + for (i = 0; i < pm_job.num_ranks; i++) { + if ((expect_result[i]) != result[i]) { + FT_DEBUG("allgather failed; expect[%ld]: %ld, actual[%ld]: %ld\n", + i, expect_result[i], i, result[i]); + err = -1; + goto errout; + } + } + return FI_SUCCESS; + +errout: + free(expect_result); + free(result); + return err; +} + +static int scatter_test_run() +{ + int err; + uint64_t done_flag; + uint64_t result; + uint64_t *data; + uint64_t i; + struct fi_collective_attr attr; + fi_addr_t root = 0; + size_t data_size = pm_job.num_ranks * sizeof(*data); + + attr.op = FI_NOOP; + attr.datatype = FI_UINT64; + attr.mode = 0; + err = fi_query_collective(domain, FI_SCATTER, &attr, 0); + if (err) { + FT_DEBUG("Scatter collective not supported: %d (%s)\n", err, + fi_strerror(err)); + return err; + } + + data = malloc(data_size); + if (!data) + return -FI_ENOMEM; + + for (i = 0; i < pm_job.num_ranks; i++) { + data[i] = i; + } + + coll_addr = 
fi_mc_addr(coll_mc); + if (pm_job.my_rank == root) + err = fi_scatter(ep, data, 1, NULL, &result, NULL, coll_addr, root, + FI_UINT64, 0, &done_flag); + else + err = fi_scatter(ep, NULL, 1, NULL, &result, NULL, coll_addr, root, + FI_UINT64, 0, &done_flag); + + if (err) { + FT_DEBUG("collective scatter failed: %d (%s)\n", err, fi_strerror(err)); + goto errout; + } + + err = wait_for_comp(&done_flag); + if (err) + goto errout; + + if (data[pm_job.my_rank] != result) { + FT_DEBUG("scatter failed; expect: %ld, actual: %ld\n", + data[pm_job.my_rank], result); + err = -1; + goto errout; + } + return FI_SUCCESS; + +errout: + free(data); + return err; +} + +static int broadcast_test_run() +{ + int err; + uint64_t done_flag; + uint64_t *result, *data; + uint64_t i; + struct fi_collective_attr attr; + fi_addr_t root = 0; + size_t data_cnt = pm_job.num_ranks; + + attr.op = FI_NOOP; + attr.datatype = FI_UINT64; + attr.mode = 0; + err = fi_query_collective(domain, FI_BROADCAST, &attr, 0); + if (err) { + FT_DEBUG("Broadcast collective not supported: %d (%s)\n", err, + fi_strerror(err)); + return err; + } + + result = malloc(data_cnt * sizeof(*result)); + if (!result) + return -FI_ENOMEM; + + data = malloc(data_cnt * sizeof(*data)); + if (!data) + return -FI_ENOMEM; + + for (i = 0; i < pm_job.num_ranks; ++i) { + data[i] = pm_job.num_ranks - 1 - i; + } + + coll_addr = fi_mc_addr(coll_mc); + if (pm_job.my_rank == root) + err = fi_broadcast(ep, data, data_cnt, NULL, coll_addr, root, FI_UINT64, + 0, &done_flag); + else + err = fi_broadcast(ep, result, data_cnt, NULL, coll_addr, root, FI_UINT64, + 0, &done_flag); + + if (err) { + FT_DEBUG("broadcast scatter failed: %d (%s)\n", err, fi_strerror(err)); + goto out; + } + + err = wait_for_comp(&done_flag); + if (err) + goto out; + + if (pm_job.my_rank == root) { + err = FI_SUCCESS; + goto out; + } + + for (i = 0; i < data_cnt; i++) { + if (result[i] != data[i]) { + FT_DEBUG("broadcast failed; expect: %ld, actual: %ld\n", data[i], + result[i]); + err = -1; + goto out; + } + } + err = FI_SUCCESS; + +out: + free(data); + free(result); + return err; +} + +struct coll_test tests[] = { + { + .name = "join_test", + .setup = coll_setup, + .run = join_test_run, + .teardown = coll_teardown + }, + { + .name = "barrier_test", + .setup = coll_setup, + .run = barrier_test_run, + .teardown = coll_teardown + }, + { + .name = "sum_all_reduce_test", + .setup = coll_setup, + .run = sum_all_reduce_test_run, + .teardown = coll_teardown + }, + { + .name = "all_gather_test", + .setup = coll_setup, + .run = all_gather_test_run, + .teardown = coll_teardown + }, + { + .name = "scatter_test", + .setup = coll_setup, + .run = scatter_test_run, + .teardown = coll_teardown + }, + { + .name = "broadcast_test", + .setup = coll_setup, + .run = broadcast_test_run, + .teardown = coll_teardown, + }, +}; + +const int NUM_TESTS = ARRAY_SIZE(tests); + +static inline int setup_hints() +{ + hints->ep_attr->type = FI_EP_RDM; + hints->caps = FI_MSG | FI_COLLECTIVE; + hints->mode = FI_CONTEXT; + hints->domain_attr->control_progress = FI_PROGRESS_MANUAL; + hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + hints->fabric_attr->prov_name = strdup("tcp"); + return FI_SUCCESS; +} + +static int multinode_setup_fabric(int argc, char **argv) +{ + char my_name[FT_MAX_CTRL_MSG]; + size_t len; + int err; + + setup_hints(); + + err = ft_getinfo(hints, &fi); + if (err) + return err; + + err = ft_open_fabric_res(); + if (err) + return err; + + opts.av_size = pm_job.num_ranks; + + av_attr.type = FI_AV_TABLE; + err = 
ft_alloc_active_res(fi); + if (err) + return err; + + err = ft_enable_ep(ep, eq, av, txcq, rxcq, txcntr, rxcntr); + if (err) + return err; + + len = FT_MAX_CTRL_MSG; + err = fi_getname(&ep->fid, (void *) my_name, &len); + if (err) { + FT_PRINTERR("error determining local endpoint name", err); + goto errout; + } + + pm_job.name_len = len; + pm_job.names = malloc(len * pm_job.num_ranks); + if (!pm_job.names) { + FT_ERR("error allocating memory for address exchange\n"); + err = -FI_ENOMEM; + goto errout; + } + + err = pm_allgather(my_name, pm_job.names, pm_job.name_len); + if (err) { + FT_PRINTERR("error exchanging addresses", err); + goto errout; + } + + pm_job.fi_addrs = calloc(pm_job.num_ranks, sizeof(*pm_job.fi_addrs)); + if (!pm_job.fi_addrs) { + FT_ERR("error allocating memory for av fi addrs\n"); + err = -FI_ENOMEM; + goto errout; + } + + err = fi_av_insert(av, pm_job.names, pm_job.num_ranks, pm_job.fi_addrs, 0, NULL); + if (err != pm_job.num_ranks) { + FT_ERR("unable to insert all addresses into AV table: %d (%s)\n", err, + fi_strerror(err)); + err = -1; + goto errout; + } + return 0; +errout: + ft_free_res(); + return ft_exit_code(err); +} + +static void pm_job_free_res() +{ + free(pm_job.names); + + free(pm_job.fi_addrs); +} + +int multinode_run_tests(int argc, char **argv) +{ + int ret = FI_SUCCESS; + int i; + + ret = multinode_setup_fabric(argc, argv); + if (ret) + return ret; + + for (i = 0; i < NUM_TESTS && !ret; i++) { + FT_DEBUG("Running Test: %s \n", tests[i].name); + + ret = tests[i].setup(); + FT_DEBUG("Setup Complete...\n"); + if (ret) + goto out; + + ret = tests[i].run(); + if (ret) + goto out; + + pm_barrier(); + tests[i].teardown(); + FT_DEBUG("Run Complete...\n"); + FT_DEBUG("Test Complete: %s \n", tests[i].name); + } + +out: + if (ret) + printf("failed\n"); + else + printf("passed\n"); + + pm_job_free_res(); + ft_free_res(); + return ft_exit_code(ret); +} diff --git a/fabtests/multinode/src/harness.c b/fabtests/multinode/src/harness.c new file mode 100644 index 00000000000..29fab855c9e --- /dev/null +++ b/fabtests/multinode/src/harness.c @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +struct pm_job_info pm_job; + +static int parse_caps(char* caps) +{ + if (strcmp(caps, "msg") == 0) { + return multi_msg; + } else if (strcmp(caps, "rma") == 0) { + return multi_rma; + } else { + printf("Warn: Invalid capability, defaulting to msg\n"); + return multi_msg; + } +} + +static inline ssize_t socket_send(int sock, void *buf, size_t len, int flags) +{ + ssize_t ret; + size_t m = 0; + uint8_t *ptr = (uint8_t *) buf; + + do { + ret = send(sock, (void *) &ptr[m], len-m, flags); + if (ret < 0) + return ret; + + m += ret; + } while (m != len); + + return len; +} + +static inline int socket_recv(int sock, void *buf, size_t len, int flags) +{ + ssize_t ret; + size_t m = 0; + uint8_t *ptr = (uint8_t *) buf; + + do { + ret = recv(sock, (void *) &ptr[m], len-m, flags); + if (ret <= 0) + return -1; + + m += ret; + } while (m < len); + + return len; +} + +int pm_allgather(void *my_item, void *items, int item_size) +{ + int i, ret; + uint8_t *offset; + + /* client */ + if (!pm_job.clients) { + ret = socket_send(pm_job.sock, my_item, item_size, 0); + if (ret < 0) + return errno == EPIPE ? -FI_ENOTCONN : -errno; + + ret = socket_recv(pm_job.sock, items, + pm_job.num_ranks*item_size, 0); + if (ret <= 0) + return (ret)? -errno : -FI_ENOTCONN; + + return 0; + } + + /* server */ + memcpy(items, my_item, item_size); + + for (i = 0; i < pm_job.num_ranks-1; i++) { + offset = (uint8_t *)items + item_size * (i+1); + + ret = socket_recv(pm_job.clients[i], (void *)offset, + item_size, 0); + if (ret <= 0) + return ret; + } + + for (i = 0; i < pm_job.num_ranks-1; i++) { + ret = socket_send(pm_job.clients[i], items, + pm_job.num_ranks*item_size, 0); + if (ret < 0) + return ret; + } + return 0; +} + +void pm_barrier() +{ + char ch; + char chs[pm_job.num_ranks]; + + pm_allgather(&ch, chs, 1); +} + +static int pm_init_ranks() +{ + int ret; + int i; + size_t send_rank; + + if (pm_job.clients) { + for(i = 0; i < pm_job.num_ranks-1; i++) { + send_rank = i + 1; + ret = socket_send(pm_job.clients[i], &send_rank, sizeof(send_rank), 0); + if (ret < 0) + return ret; + } + } else { + ret = socket_recv(pm_job.sock, &(pm_job.my_rank), sizeof(pm_job.my_rank), 0); + } + + return ret; +} + +static int server_connect() +{ + int new_sock; + int ret, i; + + ret = listen(pm_job.sock, pm_job.num_ranks); + if (ret) + return ret; + + pm_job.clients = calloc(pm_job.num_ranks, sizeof(int)); + if (!pm_job.clients) + return -FI_ENOMEM; + + for (i = 0; i < pm_job.num_ranks-1; i++) { + new_sock = accept(pm_job.sock, NULL, NULL); + if (new_sock < 0) { + FT_ERR("error during server init\n"); + goto err; + } + pm_job.clients[i] = new_sock; + FT_DEBUG("connection established\n"); + } + close(pm_job.sock); + return 0; +err: + while (i--) { + close(pm_job.clients[i]); + } + free(pm_job.clients); + return new_sock; +} + +static int pm_conn_setup() +{ + int sock, ret; + int optval = 1; + + sock = socket(pm_job.oob_server_addr.ss_family, SOCK_STREAM, 0); + if (sock < 0) + return -1; + + pm_job.sock = sock; + + ret = setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *) &optval, + sizeof(optval)); + if (ret) { + FT_ERR("error setting socket options\n"); + return ret; + } + + ret = bind(sock, (struct sockaddr *)&pm_job.oob_server_addr, + pm_job.server_addr_len); + if (ret == 0) { + ret = server_connect(); + } else { + opts.dst_addr = opts.src_addr; + opts.dst_port = opts.src_port; + opts.src_addr = NULL; + opts.src_port = 
0; + ret = connect(pm_job.sock, (struct sockaddr *)&pm_job.oob_server_addr, + pm_job.server_addr_len); + } + if (ret) { + FT_ERR("OOB conn failed - %s\n", strerror(errno)); + return ret; + } + + return 0; +} + +static void pm_finalize() +{ + int i; + + if (!pm_job.clients) { + close(pm_job.sock); + return; + } + + for (i = 0; i < pm_job.num_ranks-1; i++) { + close(pm_job.clients[i]); + } + free(pm_job.clients); +} + +int pm_get_oob_server_addr() +{ + struct addrinfo *res; + struct sockaddr_in *in; + struct sockaddr_in6 *in6; + int ret; + + ret = getaddrinfo(opts.src_addr, NULL, NULL, &res); + if (ret) { + FT_ERR( "getaddrinfo failed\n"); + return ret; + } + + memcpy(&pm_job.oob_server_addr, res->ai_addr, res->ai_addrlen); + pm_job.server_addr_len = res->ai_addrlen; + + switch (pm_job.oob_server_addr.ss_family) { + case AF_INET: + in = (struct sockaddr_in *) &pm_job.oob_server_addr; + in->sin_port = PM_DEFAULT_OOB_PORT; + break; + case AF_INET6: + in6 = (struct sockaddr_in6 *) &pm_job.oob_server_addr; + in6->sin6_port = PM_DEFAULT_OOB_PORT; + break; + default: + FT_ERR( "Unsupported Address family\n"); + ret = -1; + break; + } + + freeaddrinfo(res); + return ret; +} + +int main(int argc, char **argv) +{ + extern char *optarg; + int c, ret; + + opts = INIT_OPTS; + opts.options |= FT_OPT_SIZE; + + pm_job.clients = NULL; + + hints = fi_allocinfo(); + if (!hints) + return EXIT_FAILURE; + + while ((c = getopt(argc, argv, "n:C:h" CS_OPTS INFO_OPTS)) != -1) { + switch (c) { + default: + ft_parse_addr_opts(c, optarg, &opts); + ft_parseinfo(c, optarg, hints, &opts); + ft_parsecsopts(c, optarg, &opts); + break; + case 'n': + pm_job.num_ranks = atoi(optarg); + break; + case 'C': + pm_job.transfer_method = parse_caps(optarg); + break; + case '?': + case 'h': + ft_usage(argv[0], "A simple multinode test"); + return EXIT_FAILURE; + } + } + + ret = pm_get_oob_server_addr(); + if (ret) + goto err1; + + ret = pm_conn_setup(); + if (ret) { + FT_ERR("connection setup failed\n"); + goto err1; + } + + ret = pm_init_ranks(); + if (ret < 0) { + FT_ERR("rank initialization failed\n"); + goto err2; + } + + FT_DEBUG("OOB job setup done\n"); + + ret = multinode_run_tests(argc, argv); + if (ret) { + FT_ERR( "Tests failed\n"); + goto err2; + } + FT_DEBUG("Tests Passed\n"); +err2: + pm_finalize(); +err1: + return ret; +} diff --git a/fabtests/multinode/src/pattern.c b/fabtests/multinode/src/pattern.c new file mode 100644 index 00000000000..3cf705929d0 --- /dev/null +++ b/fabtests/multinode/src/pattern.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2017-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
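The pattern iterators defined below share one calling convention: seed the cursor with PATTERN_NO_CURRENT and keep calling next_source()/next_target() until -FI_ENODATA indicates the pattern is exhausted. A minimal caller sketch, mirroring the loop in core.c's multi_msg_send() (send_to_rank() is a hypothetical placeholder, not a fabtests routine):

    int cur = PATTERN_NO_CURRENT;
    int ret;

    /* visit every rank this process targets under the active pattern */
    while ((ret = pattern->next_target(&cur)) == 0)
        send_to_rank(pm_job.fi_addrs[cur]);

    /* -FI_ENODATA only means the pattern ran out of peers; anything else is a real error */
    if (ret != -FI_ENODATA)
        return ret;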
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include + +static int broadcast_gather_next(int *cur) +{ + int next; + if (pm_job.my_rank) + return -FI_ENODATA; + next = *cur + 1; + + if (next >= pm_job.num_ranks) + return -FI_ENODATA; + if (next == 0) + next = 1; + + *cur = next; + + return 0; +} + +static int broadcast_gather_current(int *cur) +{ + int next; + if (!pm_job.my_rank) + return -FI_ENODATA; + + next = *cur + 1; + + if (next > 0) + return -FI_ENODATA; + + *cur = next; + + return 0; +} + +static int ring_next(int *cur) +{ + if ((pm_job.my_rank == 0 && pm_job.num_ranks - 1 == *cur) || + (pm_job.my_rank != 0 && pm_job.my_rank - 1 == *cur)) + return -FI_ENODATA; + + if (pm_job.my_rank == 0) + *cur = pm_job.num_ranks - 1; + else + *cur = pm_job.my_rank - 1; + return 0; +} + +static int ring_current(int *cur) +{ + if ((pm_job.my_rank + 1) % pm_job.num_ranks == *cur) + return -FI_ENODATA; + + *cur = (pm_job.my_rank + 1) % pm_job.num_ranks; + return 0; + +} + +static int mesh_next(int *cur) +{ + int next = *cur + 1; + + if (next >= pm_job.num_ranks) + return -FI_ENODATA; + + *cur = next; + return 0; +} + +struct pattern_ops patterns[] = { + { + .name = "ring", + .next_source = ring_next, + .next_target = ring_current, + }, + { + .name = "gather", + .next_source = broadcast_gather_next, + .next_target = broadcast_gather_current, + }, + { + .name = "broadcast", + .next_source = broadcast_gather_current, + .next_target = broadcast_gather_next, + }, + { + .name = "full_mesh", + .next_source = mesh_next, + .next_target = mesh_next, + }, +}; + +const int NUM_TESTS = ARRAY_SIZE(patterns); diff --git a/fabtests/scripts/runfabtests.cmd b/fabtests/scripts/runfabtests.cmd index eb74c521599..d6f0a957be1 100644 --- a/fabtests/scripts/runfabtests.cmd +++ b/fabtests/scripts/runfabtests.cmd @@ -24,7 +24,7 @@ set functional_tests=^ "poll -t queue"^ "poll -t counter"^ "rdm"^ - "rdm_rma_simple"^ + "rdm_rma_event"^ "rdm_rma_trigger"^ "rdm_tagged_peek"^ "bw -e rdm -v -T 1"^ diff --git a/fabtests/scripts/runfabtests.sh b/fabtests/scripts/runfabtests.sh index 030b321cc48..7e9dbf9e87f 100755 --- a/fabtests/scripts/runfabtests.sh +++ b/fabtests/scripts/runfabtests.sh @@ -46,6 +46,8 @@ trap cleanup_and_exit SIGINT # declare BIN_PATH declare PROV="" +declare CORE="" +declare UTIL="" declare TEST_TYPE="quick" declare SERVER="127.0.0.1" declare CLIENT="127.0.0.1" @@ -56,6 +58,7 @@ declare COMPLEX_CFG declare TIMEOUT_VAL="120" declare STRICT_MODE=0 declare FORK=0 +declare OOB=0 declare C_ARGS="" declare S_ARGS="" @@ -71,12 +74,19 @@ declare -i pass_count=0 declare -i fail_count=0 declare -i total_failures=0 +python=$(which python3 2>/dev/null) || python=$(which python2 2>/dev/null) || python=$(which python 2>/dev/null) + +if [ $? -ne 0 ]; then + echo "Unable to find python dependency, exiting..." 
+ exit 1 +fi + if [[ "$(uname)" == "FreeBSD" ]]; then - declare -ri FI_ENODATA=$(python -c 'import errno; print(errno.ENOMSG)') + declare -ri FI_ENODATA=$($python -c 'import errno; print(errno.ENOMSG)') else - declare -ri FI_ENODATA=$(python -c 'import errno; print(errno.ENODATA)') + declare -ri FI_ENODATA=$($python -c 'import errno; print(errno.ENODATA)') fi -declare -ri FI_ENOSYS=$(python -c 'import errno; print(errno.ENOSYS)') +declare -ri FI_ENOSYS=$($python -c 'import errno; print(errno.ENOSYS)') neg_unit_tests=( "fi_dgram g00n13s" @@ -99,7 +109,8 @@ functional_tests=( "fi_poll -t queue" "fi_poll -t counter" "fi_rdm" - "fi_rdm_rma_simple" + "fi_rdm -U" + "fi_rdm_rma_event" "fi_rdm_rma_trigger" "fi_shared_ctx" "fi_shared_ctx --no-tx-shared-ctx" @@ -118,8 +129,6 @@ functional_tests=( "fi_recv_cancel -e rdm -V" "fi_unexpected_msg -e msg -i 10" "fi_unexpected_msg -e rdm -i 10" - "fi_unexpected_msg -e msg -S -i 10" - "fi_unexpected_msg -e rdm -S -i 10" "fi_inj_complete -e msg" "fi_inj_complete -e rdm" "fi_inj_complete -e dgram" @@ -127,6 +136,7 @@ functional_tests=( "fi_inj_complete -e rdm -SR" "fi_inj_complete -e dgram -SR" "fi_bw -e rdm -v -T 1" + "fi_bw -e rdm -v -T 1 -U" "fi_bw -e msg -v -T 1" ) @@ -139,18 +149,28 @@ short_tests=( "fi_rma_bw -e msg -o read -I 5" "fi_rma_bw -e msg -o writedata -I 5" "fi_rma_bw -e rdm -o write -I 5" + "fi_rma_bw -e rdm -o write -I 5 -U" "fi_rma_bw -e rdm -o read -I 5" + "fi_rma_bw -e rdm -o read -I 5 -U" "fi_rma_bw -e rdm -o writedata -I 5" + "fi_rma_bw -e rdm -o writedata -I 5 -U" "fi_rdm_atomic -I 5 -o all" + "fi_rdm_atomic -I 5 -o all -U" "fi_rdm_cntr_pingpong -I 5" "fi_multi_recv -e rdm -I 5" "fi_multi_recv -e msg -I 5" "fi_rdm_pingpong -I 5" + "fi_rdm_pingpong -I 5 -U" "fi_rdm_pingpong -I 5 -v" + "fi_rdm_pingpong -I 5 -v -U" "fi_rdm_tagged_pingpong -I 5" + "fi_rdm_tagged_pingpong -I 5 -U" "fi_rdm_tagged_pingpong -I 5 -v" + "fi_rdm_tagged_pingpong -I 5 -v -U" "fi_rdm_tagged_bw -I 5" + "fi_rdm_tagged_bw -I 5 -U" "fi_rdm_tagged_bw -I 5 -v" + "fi_rdm_tagged_bw -I 5 -v -U" "fi_dgram_pingpong -I 5" ) @@ -165,20 +185,32 @@ standard_tests=( "fi_rma_bw -e msg -o read" "fi_rma_bw -e msg -o writedata" "fi_rma_bw -e rdm -o write" + "fi_rma_bw -e rdm -o write -U" "fi_rma_bw -e rdm -o read" + "fi_rma_bw -e rdm -o read -U" "fi_rma_bw -e rdm -o writedata" + "fi_rma_bw -e rdm -o writedata -U" "fi_rdm_atomic -o all -I 1000" + "fi_rdm_atomic -o all -I 1000 -U" "fi_rdm_cntr_pingpong" "fi_multi_recv -e rdm" "fi_multi_recv -e msg" "fi_rdm_pingpong" + "fi_rdm_pingpong -U" "fi_rdm_pingpong -v" + "fi_rdm_pingpong -v -U" "fi_rdm_pingpong -k" + "fi_rdm_pingpong -k -U" "fi_rdm_pingpong -k -v" + "fi_rdm_pingpong -k -v -U" "fi_rdm_tagged_pingpong" + "fi_rdm_tagged_pingpong -U" "fi_rdm_tagged_pingpong -v" + "fi_rdm_tagged_pingpong -v -U" "fi_rdm_tagged_bw" + "fi_rdm_tagged_bw -U" "fi_rdm_tagged_bw -v" + "fi_rdm_tagged_bw -v -U" "fi_dgram_pingpong" "fi_dgram_pingpong -k" ) @@ -197,6 +229,12 @@ complex_tests=( "fi_ubertest" ) +multinode_tests=( + "fi_multinode -C msg" + "fi_multinode -C rma" + "fi_multinode_coll" +) + function errcho { >&2 echo $* } @@ -298,12 +336,17 @@ function read_exclude_file { function auto_exclude { local excl_file + local name=$UTIL + + if [ -z $UTIL ]; then + name=$CORE + fi - excl_file="./fabtests/test_configs/${PROV}/${PROV}.exclude" + excl_file="./fabtests/test_configs/${name}/${name}.exclude" if [[ ! -f "$excl_file" ]]; then - excl_file="./test_configs/${PROV}/${PROV}.exclude" + excl_file="./test_configs/${name}/${name}.exclude" if [[ ! 
-f "$excl_file" ]]; then - excl_file="../test_configs/${PROV}/${PROV}.exclude" + excl_file="../test_configs/${name}/${name}.exclude" if [[ ! -f "$excl_file" ]]; then return fi @@ -352,8 +395,12 @@ function unit_test { local test=$1 local is_neg=$2 local ret1=0 + local s_interface=$(eval "if [ $OOB -eq 1 ]; \ + then echo $GOOD_ADDR; \ + else echo $S_INTERFACE; \ + fi") local test_exe=$(echo "${test} -p \"$PROV\"" | \ - sed -e "s/GOOD_ADDR/$GOOD_ADDR/g" -e "s/SERVER_ADDR/${S_INTERFACE}/g") + sed -e "s/GOOD_ADDR/$GOOD_ADDR/g" -e "s/SERVER_ADDR/$s_interface/g") local start_time local end_time local test_time @@ -407,12 +454,22 @@ function cs_test { start_time=$(date '+%s') - s_cmd="${BIN_PATH}${test_exe} ${S_ARGS} -s $S_INTERFACE" + if [[ $OOB -eq 1 ]]; then + s_arg="-E" + else + s_arg="-s $S_INTERFACE" + fi + s_cmd="${BIN_PATH}${test_exe} ${S_ARGS} $s_arg" ${SERVER_CMD} "${EXPORT_ENV} $s_cmd" &> $s_outp & s_pid=$! sleep 1 - c_cmd="${BIN_PATH}${test_exe} ${C_ARGS} -s $C_INTERFACE $S_INTERFACE" + if [[ $OOB -eq 1 ]]; then + c_arg="-E $S_INTERFACE" + else + c_arg="-s $C_INTERFACE $S_INTERFACE" + fi + c_cmd="${BIN_PATH}${test_exe} ${C_ARGS} $c_arg" ${CLIENT_CMD} "${EXPORT_ENV} $c_cmd" &> $c_outp & c_pid=$! @@ -443,9 +500,31 @@ function cs_test { fi } +function set_cfg_file { + local cfg_file + local parent=$UTIL + local name=$CORE + + if [ -z $UTIL ]; then + parent=$CORE + name=$1 + fi + + cfg_file="${PWD}/fabtests/test_configs/${parent}/${name}.test" + if [[ ! -f "$cfg_file" ]]; then + cfg_file="${PWD}/test_configs/${parent}/${name}.test" + if [[ ! -f "$cfg_file" ]]; then + return + fi + fi + + COMPLEX_CFG=${cfg_file} +} + function complex_test { local test=$1 local config=$2 + local path=${PROV/;/\/} local test_exe="${test}" local s_ret=0 local c_ret=0 @@ -454,6 +533,9 @@ function complex_test { local test_time is_excluded "$test" && return + if [[ -z "$COMPLEX_CFG" ]]; then + set_cfg_file $config + fi start_time=$(date '+%s') @@ -463,12 +545,16 @@ function complex_test { opts="" fi + if [[ $OOB -eq 1 ]]; then + opts+=" -E" + fi + s_cmd="${BIN_PATH}${test_exe} -x $opts" FI_LOG_LEVEL=error ${SERVER_CMD} "${EXPORT_ENV} $s_cmd" &> $s_outp & s_pid=$! sleep 1 - c_cmd="${BIN_PATH}${test_exe} -p \"${PROV}\" -t $config $S_INTERFACE $opts" + c_cmd="${BIN_PATH}${test_exe} -u "${COMPLEX_CFG}" $S_INTERFACE $opts" FI_LOG_LEVEL=error ${CLIENT_CMD} "${EXPORT_ENV} $c_cmd" &> $c_outp & c_pid=$! @@ -510,30 +596,121 @@ function complex_test { fi } +function multinode_test { + local test="$1" + local s_ret=0 + local c_ret=0 + local c_out_arr=() + local num_procs=$2 + local test_exe="${test} -n $num_procs -p \"${PROV}\"" + local c_out + local start_time + local end_time + local test_time + + is_excluded "$test" && return + + start_time=$(date '+%s') + + s_cmd="${BIN_PATH}${test_exe} ${S_ARGS} -s ${S_INTERFACE}" + ${SERVER_CMD} "${EXPORT_ENV} $s_cmd" &> $s_outp & + s_pid=$! + sleep 1 + + c_pid_arr=() + for ((i=1; i $c_out & + c_pid_arr+=($!) + c_out_arr+=($c_out) + done + + for pid in ${c_pid_arr[*]}; do + wait $pid + c_ret=($?)||$c_ret + done + + [[ c_ret -ne 0 ]] && kill -9 $s_pid 2> /dev/null + + wait $s_pid + s_ret=$? 
+ echo "server finished" + + end_time=$(date '+%s') + test_time=$(compute_duration "$start_time" "$end_time") + + pe=1 + if [[ $STRICT_MODE -eq 0 && $s_ret -eq $FI_ENODATA && $c_ret -eq $FI_ENODATA ]] || + [[ $STRICT_MODE -eq 0 && $s_ret -eq $FI_ENOSYS && $c_ret -eq $FI_ENOSYS ]]; then + print_results "$test_exe" "Notrun" "$test_time" "$s_outp" "$s_cmd" "" "$c_cmd" + for c_out in "${c_out_arr[@]}" + do + printf -- " client_stdout $pe: |\n" + sed -e 's/^/ /' < $c_out + pe=$((pe+1)) + done + skip_count+=1 + elif [ $s_ret -ne 0 -o $c_ret -ne 0 ]; then + print_results "$test_exe" "Fail" "$test_time" "$s_outp" "$s_cmd" "" "$c_cmd" + for c_out in "${c_out_arr[@]}" + do + printf -- " client_stdout $pe: |\n" + sed -e 's/^/ /' < $c_out + pe=$((pe+1)) + done + if [ $s_ret -eq 124 -o $c_ret -eq 124 ]; then + cleanup + fi + fail_count+=1 + else + print_results "$test_exe" "Pass" "$test_time" "$s_outp" "$s_cmd" "" "$c_cmd" + for c_out in "${c_out_arr[@]}" + do + printf -- " client_stdout $pe: |\n" + sed -e 's/^/ /' < $c_out + pe=$((pe+1)) + done + pass_count+=1 + fi +} + +function set_core_util { + prov_arr=$(echo $PROV | tr ";" " ") + CORE="" + UTIL="" + for p in $prov_arr; do + if [[ -z $CORE ]]; then + CORE=$p + else + UTIL=$p + fi + done +} + function main { skip_count=0 pass_count=0 fail_count=0 - local complex_cfg="quick" + local complex_type="quick" + set_core_util set_excludes + if [[ $1 == "quick" ]]; then local -r tests="unit functional short" elif [[ $1 == "verify" ]]; then local -r tests="complex" - complex_cfg=$1 + complex_type=$1 else - local -r tests=$(echo $1 | sed 's/all/unit,functional,standard,complex/g' | tr ',' ' ') - if [[ $1 == "all" ]]; then - complex_cfg=$1 + local -r tests=$(echo $1 | sed 's/all/unit,functional,standard,complex,multinode/g' | tr ',' ' ') + if [[ $1 == "all" || $1 == "complex" ]]; then + complex_type="all" fi fi - if [[ -n "$COMPLEX_CFG" ]]; then - complex_cfg="$COMPLEX_CFG" - fi - if [ $VERBOSE -eq 0 ] ; then printf "# %-68s%10s\n" "Test" "Result" print_border @@ -569,8 +746,12 @@ function main { ;; complex) for test in "${complex_tests[@]}"; do - complex_test $test $complex_cfg - + complex_test $test $complex_type + done + ;; + multinode) + for test in "${multinode_tests[@]}"; do + multinode_test "$test" 3 done ;; *) @@ -615,6 +796,7 @@ function usage { regex patterns e.g. \"dgram,rma.*write\"" errcho -e " -E\texport provided variable name and value to ssh client and server processes. 
options must of of the form '-E var=value'" + errcho -e " -U\trun fabtests with FI_DELIVERY_COMPLETE set" errcho -e " -f\texclude tests file: File containing list of test names / regex patterns to exclude (one per line)" errcho -e " -N\tskip negative unit tests" @@ -626,10 +808,11 @@ function usage { errcho -e " -S\tStrict mode: -FI_ENODATA, -FI_ENOSYS errors would be treated as failures instead of skipped/notrun" errcho -e " -C\tAdditional client test arguments: Parameters to pass to client fabtests" errcho -e " -L\tAdditional server test arguments: Parameters to pass to server fabtests" + errcho -e " -b\tenable out-of-band address exchange over the default port" exit 1 } -while getopts ":vt:p:g:e:f:c:s:u:T:C:L:NRSkE:" opt; do +while getopts ":vt:p:g:e:f:c:s:u:T:C:L:NRSbkE:" opt; do case ${opt} in t) TEST_TYPE=$OPTARG ;; @@ -658,6 +841,8 @@ case ${opt} in ;; S) STRICT_MODE=1 ;; + b) OOB=1 + ;; k) FORK=1 ;; C) C_ARGS="${OPTARG}" diff --git a/fabtests/test_configs/efa/efa.exclude b/fabtests/test_configs/efa/efa.exclude index 861f99a23e5..8ec42b4c28a 100644 --- a/fabtests/test_configs/efa/efa.exclude +++ b/fabtests/test_configs/efa/efa.exclude @@ -57,10 +57,8 @@ inj_complete # Exclude trigger ops trigger -# Exclude all atomic tests -atomic -rdm_cntr_pingpong +#rdm_cntr_pingpong # This test requires ENA IPs for the OOB sync @@ -95,3 +93,6 @@ rdm_tagged_peek # fail on timeout - cannot be supported dgram_bw + +# Multinode tests failing with an unsupported address format +multinode diff --git a/fabtests/test_configs/eq_cq.test b/fabtests/test_configs/eq_cq.test index 6113eb932b9..1641eef2154 100644 --- a/fabtests/test_configs/eq_cq.test +++ b/fabtests/test_configs/eq_cq.test @@ -1,6 +1,6 @@ -#: "Tests different wait objects for EQ and CQ across sockets and verbs providers" +#: "Tests different wait objects for EQ and CQ across tcp and verbs providers" { - prov_name: sockets, + prov_name: tcp, test_type: [ FT_TEST_LATENCY, ], @@ -25,16 +25,13 @@ cq_wait_obj: [ FI_WAIT_NONE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, ], test_flags: FT_FLAG_QUICKTEST }, { - prov_name: sockets, + prov_name: tcp, test_type: [ FT_TEST_LATENCY, FT_TEST_BANDWIDTH, @@ -61,9 +58,6 @@ FI_WAIT_FD, FI_WAIT_MUTEX_COND, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, ], @@ -91,9 +85,6 @@ cq_wait_obj: [ FI_WAIT_NONE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, ], @@ -122,9 +113,6 @@ FI_WAIT_UNSPEC, FI_WAIT_FD, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, ], diff --git a/fabtests/test_configs/lat_bw.test b/fabtests/test_configs/lat_bw.test index cc184a2dc17..4696d2dce69 100644 --- a/fabtests/test_configs/lat_bw.test +++ b/fabtests/test_configs/lat_bw.test @@ -1,6 +1,6 @@ -#: "Latency and bandwidth tests for all providers" +#: "Latency and bandwidth tests" { - prov_name: sockets, + prov_name: tcp, test_type: [ FT_TEST_LATENCY, FT_TEST_BANDWIDTH, @@ -19,9 +19,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -43,9 +40,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, ], @@ -69,9 +63,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, ], diff --git a/fabtests/test_configs/ofi_rxd/udp.test b/fabtests/test_configs/ofi_rxd/udp.test index 51c11120b41..ee105367e27 100644 --- a/fabtests/test_configs/ofi_rxd/udp.test +++ b/fabtests/test_configs/ofi_rxd/udp.test @@ -16,9 +16,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], 
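As an illustrative invocation of the new multinode group (assuming runfabtests.sh's usual provider/server/client positional arguments; the addresses here are placeholders):

    ./runfabtests.sh -t multinode tcp 192.168.0.123 192.168.0.124

With -t multinode, each entry in multinode_tests above is launched with three processes; the new -b flag enables out-of-band address exchange for the unit, client/server, and complex test groups.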
test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, diff --git a/fabtests/test_configs/ofi_rxd/verbs.test b/fabtests/test_configs/ofi_rxd/verbs.test new file mode 100644 index 00000000000..e41d086a21a --- /dev/null +++ b/fabtests/test_configs/ofi_rxd/verbs.test @@ -0,0 +1,23 @@ +{ + prov_name: verbs;ofi_rxd, + test_type: [ + FT_TEST_LATENCY, + FT_TEST_BANDWIDTH, + ], + class_function: [ + FT_FUNC_SEND, + FT_FUNC_SENDV, + FT_FUNC_SENDMSG, + FT_FUNC_SENDDATA, + ], + ep_type: [ + FI_EP_RDM + ], + comp_type: [ + FT_COMP_QUEUE, + ], + test_class: [ + FT_CAP_MSG, + FT_CAP_TAGGED, + ], +}, diff --git a/fabtests/test_configs/ofi_rxm/ofi_rxm.exclude b/fabtests/test_configs/ofi_rxm/ofi_rxm.exclude index c65fed985c1..ec8121b0784 100644 --- a/fabtests/test_configs/ofi_rxm/ofi_rxm.exclude +++ b/fabtests/test_configs/ofi_rxm/ofi_rxm.exclude @@ -9,13 +9,10 @@ -e dgram cm_data -rdm_rma_simple +rdm_rma_event trigger shared_ctx scalable_ep shared_av multi_mr atomic - -# Remove this once ubertest supports setting MR modes -ubertest diff --git a/fabtests/test_configs/ofi_rxm/tcp.test b/fabtests/test_configs/ofi_rxm/tcp.test new file mode 100644 index 00000000000..baec15f1972 --- /dev/null +++ b/fabtests/test_configs/ofi_rxm/tcp.test @@ -0,0 +1,26 @@ +{ + prov_name: tcp;ofi_rxm, + test_type: [ + FT_TEST_LATENCY, + FT_TEST_BANDWIDTH, + ], + class_function: [ + FT_FUNC_SEND, + FT_FUNC_SENDV, + FT_FUNC_SENDDATA, + FT_FUNC_INJECT, + FT_FUNC_INJECTDATA, + ], + ep_type: [ + FI_EP_RDM, + ], + comp_type: [ + FT_COMP_QUEUE, + ], + test_class: [ + FT_CAP_MSG, + FT_CAP_TAGGED, + ], + mr_mode: [FI_MR_LOCAL, FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, FI_MR_PROV_KEY], + progress: [FI_PROGRESS_MANUAL, FI_PROGRESS_AUTO], +}, diff --git a/fabtests/test_configs/ofi_rxm/verbs/all.test b/fabtests/test_configs/ofi_rxm/verbs.test similarity index 100% rename from fabtests/test_configs/ofi_rxm/verbs/all.test rename to fabtests/test_configs/ofi_rxm/verbs.test diff --git a/fabtests/test_configs/psm/all.test b/fabtests/test_configs/psm/all.test index a65a1607d84..c864bc28dda 100644 --- a/fabtests/test_configs/psm/all.test +++ b/fabtests/test_configs/psm/all.test @@ -20,7 +20,7 @@ FT_COMP_QUEUE, ], mode: [ - FT_MODE_ALL, + FI_CONTEXT, ], test_class: [ FT_CAP_MSG, @@ -53,7 +53,7 @@ FT_COMP_QUEUE, ], mode: [ - FT_MODE_ALL, + FI_CONTEXT, ], test_class: [ FT_CAP_RMA, diff --git a/fabtests/test_configs/psm2/all.test b/fabtests/test_configs/psm2/all.test index e573e602f1e..e482d905803 100644 --- a/fabtests/test_configs/psm2/all.test +++ b/fabtests/test_configs/psm2/all.test @@ -2,7 +2,6 @@ prov_name: psm2, test_type: [ FT_TEST_LATENCY, - FT_TEST_BANDWIDTH, ], class_function: [ FT_FUNC_SEND, @@ -21,7 +20,35 @@ FT_COMP_QUEUE, ], mode: [ - FT_MODE_ALL, + FI_CONTEXT, + ], + test_class: [ + FT_CAP_MSG, + FT_CAP_TAGGED, + ], + test_flags: FT_FLAG_QUICKTEST +}, +{ + prov_name: psm2, + test_type: [ + FT_TEST_BANDWIDTH, + ], + class_function: [ + FT_FUNC_SEND, + FT_FUNC_INJECT, + ], + ep_type: [ + FI_EP_RDM + ], + av_type: [ + FI_AV_TABLE + FI_AV_MAP, + ], + comp_type: [ + FT_COMP_QUEUE, + ], + mode: [ + FI_CONTEXT, ], test_class: [ FT_CAP_MSG, @@ -50,7 +77,7 @@ FT_COMP_QUEUE, ], mode: [ - FT_MODE_ALL, + FI_CONTEXT, ], test_class: [ FT_CAP_MSG, @@ -84,7 +111,7 @@ FT_COMP_QUEUE, ], mode: [ - FT_MODE_ALL, + FI_CONTEXT, ], test_class: [ FT_CAP_RMA, diff --git a/fabtests/test_configs/psm2/psm2.exclude b/fabtests/test_configs/psm2/psm2.exclude index 21c7f986920..30303e02273 100644 --- a/fabtests/test_configs/psm2/psm2.exclude +++ b/fabtests/test_configs/psm2/psm2.exclude @@ 
-14,3 +14,4 @@ shared_ctx scalable_ep shared_av rdm_cntr_pingpong +multi_recv diff --git a/fabtests/test_configs/psm2/verify.test b/fabtests/test_configs/psm2/verify.test index 0e2328c1492..2b685f4f393 100644 --- a/fabtests/test_configs/psm2/verify.test +++ b/fabtests/test_configs/psm2/verify.test @@ -19,7 +19,7 @@ FT_COMP_QUEUE, ], mode: [ - FT_MODE_ALL, + FI_CONTEXT, ], test_class: [ FT_CAP_MSG, @@ -46,7 +46,7 @@ FT_COMP_QUEUE, ], mode: [ - FT_MODE_ALL, + FI_CONTEXT, ], test_class: [ FT_CAP_MSG, @@ -78,7 +78,7 @@ FT_COMP_QUEUE, ], mode: [ - FT_MODE_ALL, + FI_CONTEXT, ], test_class: [ FT_CAP_RMA, @@ -138,7 +138,7 @@ FT_COMP_QUEUE, ], mode: [ - FT_MODE_ALL, + FI_CONTEXT, ], test_class: [ FT_CAP_ATOMIC, @@ -185,7 +185,7 @@ FT_COMP_CNTR, ], mode: [ - FT_MODE_ALL, + FI_CONTEXT, ], test_class: [ FT_CAP_ATOMIC, @@ -237,7 +237,7 @@ FT_COMP_QUEUE, ], mode: [ - FT_MODE_ALL, + FI_CONTEXT, ], test_class: [ FT_CAP_ATOMIC, diff --git a/fabtests/test_configs/psm3/all.test b/fabtests/test_configs/psm3/all.test new file mode 100644 index 00000000000..f18a9da9306 --- /dev/null +++ b/fabtests/test_configs/psm3/all.test @@ -0,0 +1,120 @@ +{ + prov_name: psm3, + test_type: [ + FT_TEST_LATENCY, + ], + class_function: [ + FT_FUNC_SEND, + FT_FUNC_SENDV, + FT_FUNC_SENDMSG, + FT_FUNC_INJECT, + ], + ep_type: [ + FI_EP_RDM + ], + av_type: [ + FI_AV_TABLE + FI_AV_MAP, + ], + comp_type: [ + FT_COMP_QUEUE, + ], + mode: [ + FI_CONTEXT, + ], + test_class: [ + FT_CAP_MSG, + FT_CAP_TAGGED, + ], + test_flags: FT_FLAG_QUICKTEST +}, +{ + prov_name: psm3, + test_type: [ + FT_TEST_BANDWIDTH, + ], + class_function: [ + FT_FUNC_SEND, + FT_FUNC_INJECT, + ], + ep_type: [ + FI_EP_RDM + ], + av_type: [ + FI_AV_TABLE + FI_AV_MAP, + ], + comp_type: [ + FT_COMP_QUEUE, + ], + mode: [ + FI_CONTEXT, + ], + test_class: [ + FT_CAP_MSG, + FT_CAP_TAGGED, + ], + test_flags: FT_FLAG_QUICKTEST +}, +{ + prov_name: psm3, + test_type: [ + FT_TEST_LATENCY, + FT_TEST_BANDWIDTH, + ], + class_function: [ + FT_FUNC_SENDDATA, + FT_FUNC_INJECTDATA, + ], + ep_type: [ + FI_EP_RDM + ], + av_type: [ + FI_AV_TABLE + FI_AV_MAP, + ], + comp_type: [ + FT_COMP_QUEUE, + ], + mode: [ + FI_CONTEXT, + ], + test_class: [ + FT_CAP_MSG, + ], + test_flags: FT_FLAG_QUICKTEST +}, +{ + prov_name: psm3, + test_type: [ + FT_TEST_LATENCY, + FT_TEST_BANDWIDTH, + ], + class_function: [ + FT_FUNC_WRITE, + FT_FUNC_WRITEV, + FT_FUNC_WRITEMSG, + FT_FUNC_INJECT_WRITE, + FT_FUNC_WRITEDATA, + FT_FUNC_READ, + FT_FUNC_READV, + FT_FUNC_READMSG, + ], + ep_type: [ + FI_EP_RDM + ], + av_type: [ + FI_AV_TABLE + FI_AV_MAP, + ], + comp_type: [ + FT_COMP_QUEUE, + ], + mode: [ + FI_CONTEXT, + ], + test_class: [ + FT_CAP_RMA, + ], + test_flags: FT_FLAG_QUICKTEST +}, diff --git a/fabtests/test_configs/ofi_rxm/verbs/exclude b/fabtests/test_configs/psm3/psm3.exclude similarity index 66% rename from fabtests/test_configs/ofi_rxm/verbs/exclude rename to fabtests/test_configs/psm3/psm3.exclude index 648ca004333..30303e02273 100644 --- a/fabtests/test_configs/ofi_rxm/verbs/exclude +++ b/fabtests/test_configs/psm3/psm3.exclude @@ -3,16 +3,15 @@ # Exclude all prefix tests -k +# av_test supports only FI_SOCKADDR +av_test + ^fi_msg -e msg -^fi_dgram --e dgram cm_data -rdm_rma_simple -trigger shared_ctx scalable_ep shared_av -multi_mr -atomic +rdm_cntr_pingpong +multi_recv diff --git a/fabtests/test_configs/psm3/verify.test b/fabtests/test_configs/psm3/verify.test new file mode 100644 index 00000000000..eb2d45261e4 --- /dev/null +++ b/fabtests/test_configs/psm3/verify.test @@ -0,0 +1,246 @@ +{ + prov_name: 
psm3, + test_type: [ + FT_TEST_UNIT, + ], + class_function: [ + FT_FUNC_SEND, + FT_FUNC_SENDV, + FT_FUNC_SENDMSG, + FT_FUNC_INJECT, + ], + ep_type: [ + FI_EP_RDM + ], + av_type: [ + FI_AV_TABLE + ], + comp_type: [ + FT_COMP_QUEUE, + ], + mode: [ + FI_CONTEXT, + ], + test_class: [ + FT_CAP_MSG, + FT_CAP_TAGGED, + ], + test_flags: FT_FLAG_QUICKTEST +}, +{ + prov_name: psm3, + test_type: [ + FT_TEST_UNIT, + ], + class_function: [ + FT_FUNC_SENDDATA, + FT_FUNC_INJECTDATA, + ], + ep_type: [ + FI_EP_RDM + ], + av_type: [ + FI_AV_TABLE + ], + comp_type: [ + FT_COMP_QUEUE, + ], + mode: [ + FI_CONTEXT, + ], + test_class: [ + FT_CAP_MSG, + ], + test_flags: FT_FLAG_QUICKTEST +}, +{ + prov_name: psm3, + test_type: [ + FT_TEST_UNIT, + ], + class_function: [ + FT_FUNC_WRITE, + FT_FUNC_WRITEV, + FT_FUNC_WRITEMSG, + FT_FUNC_INJECT_WRITE, + FT_FUNC_WRITEDATA, + FT_FUNC_READ, + FT_FUNC_READV, + FT_FUNC_READMSG, + ], + ep_type: [ + FI_EP_RDM + ], + av_type: [ + FI_AV_TABLE + ], + comp_type: [ + FT_COMP_QUEUE, + ], + mode: [ + FI_CONTEXT, + ], + test_class: [ + FT_CAP_RMA, + ], + test_flags: FT_FLAG_QUICKTEST +}, +{ + prov_name: psm3, + test_type: [ + FT_TEST_UNIT, + ], + class_function: [ + FT_FUNC_ATOMIC, + FT_FUNC_ATOMICV, + FT_FUNC_ATOMICMSG, + FT_FUNC_FETCH_ATOMIC, + FT_FUNC_FETCH_ATOMICV, + FT_FUNC_FETCH_ATOMICMSG, + FT_FUNC_INJECT_ATOMIC, + ], + op:[ + FI_MIN, + FI_MAX, + FI_SUM, + FI_PROD, + FI_LOR, + FI_LAND, + FI_BOR, + FI_BAND, + FI_LXOR, + FI_BXOR, + FI_ATOMIC_WRITE, + ], + datatype:[ + FI_INT8, + FI_UINT8, + FI_INT16, + FI_UINT16, + FI_INT32, + FI_UINT32, + FI_INT64, + FI_UINT64, + FI_FLOAT, + FI_DOUBLE, + FI_LONG_DOUBLE, + FI_FLOAT_COMPLEX, + FI_DOUBLE_COMPLEX, + FI_LONG_DOUBLE_COMPLEX, + ], + ep_type: [ + FI_EP_RDM, + ], + av_type: [ + FI_AV_TABLE, + ], + comp_type: [ + FT_COMP_QUEUE, + ], + mode: [ + FI_CONTEXT, + ], + test_class: [ + FT_CAP_ATOMIC, + ], + test_flags: FT_FLAG_QUICKTEST +}, +{ + prov_name: psm3, + test_type: [ + FT_TEST_UNIT, + ], + class_function: [ + FT_FUNC_FETCH_ATOMIC, + FT_FUNC_FETCH_ATOMICV, + FT_FUNC_FETCH_ATOMICMSG, + ], + op:[ + FI_ATOMIC_READ, + ], + datatype:[ + FI_INT8, + FI_UINT8, + FI_INT16, + FI_UINT16, + FI_INT32, + FI_UINT32, + FI_INT64, + FI_UINT64, + FI_FLOAT, + FI_DOUBLE, + FI_LONG_DOUBLE, + FI_FLOAT_COMPLEX, + FI_DOUBLE_COMPLEX, + FI_LONG_DOUBLE_COMPLEX, + ], + ep_type: [ + FI_EP_RDM, + ], + av_type: [ + FI_AV_TABLE, + ], + comp_type: [ + FT_COMP_QUEUE, + FT_COMP_CNTR, + ], + mode: [ + FI_CONTEXT, + ], + test_class: [ + FT_CAP_ATOMIC, + ], + test_flags: FT_FLAG_QUICKTEST +}, +{ + prov_name: psm3, + test_type: [ + FT_TEST_UNIT, + ], + class_function: [ + FT_FUNC_COMPARE_ATOMIC, + FT_FUNC_COMPARE_ATOMICV, + FT_FUNC_COMPARE_ATOMICMSG, + ], + op:[ + FI_CSWAP, + FI_CSWAP_NE, + FI_CSWAP_LE, + FI_CSWAP_LT, + FI_CSWAP_GE, + FI_CSWAP_GT, + FI_MSWAP, + ], + datatype:[ + FI_INT8, + FI_UINT8, + FI_INT16, + FI_UINT16, + FI_INT32, + FI_UINT32, + FI_INT64, + FI_UINT64, + FI_FLOAT, + FI_DOUBLE, + FI_LONG_DOUBLE, + FI_FLOAT_COMPLEX, + FI_DOUBLE_COMPLEX, + FI_LONG_DOUBLE_COMPLEX, + ], + ep_type: [ + FI_EP_RDM, + ], + av_type: [ + FI_AV_TABLE, + ], + comp_type: [ + FT_COMP_QUEUE, + ], + mode: [ + FI_CONTEXT, + ], + test_class: [ + FT_CAP_ATOMIC, + ], + test_flags: FT_FLAG_QUICKTEST +}, diff --git a/fabtests/test_configs/shm/all.test b/fabtests/test_configs/shm/all.test index 33522c25af2..a975a7a75d9 100644 --- a/fabtests/test_configs/shm/all.test +++ b/fabtests/test_configs/shm/all.test @@ -15,9 +15,6 @@ ep_type: [ FI_EP_RDM, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ 
FT_CAP_MSG, FT_CAP_TAGGED, @@ -41,9 +38,6 @@ ep_type: [ FI_EP_RDM, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -74,9 +68,6 @@ ep_type: [ FI_EP_RDM, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -104,9 +95,6 @@ ep_type: [ FI_EP_RDM, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -142,9 +130,6 @@ ep_type: [ FI_EP_RDM, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_RMA, ], @@ -176,9 +161,6 @@ ep_type: [ FI_EP_RDM, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_RMA, ], @@ -219,9 +201,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -259,9 +238,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -303,9 +279,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -347,9 +320,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -382,9 +352,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -423,9 +390,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], diff --git a/fabtests/test_configs/shm/quick.test b/fabtests/test_configs/shm/quick.test index b95011a5455..94c98b6558d 100644 --- a/fabtests/test_configs/shm/quick.test +++ b/fabtests/test_configs/shm/quick.test @@ -15,9 +15,6 @@ ep_type: [ FI_EP_RDM, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -43,9 +40,6 @@ ep_type: [ FI_EP_RDM, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -77,9 +71,6 @@ ep_type: [ FI_EP_RDM, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -108,9 +99,6 @@ ep_type: [ FI_EP_RDM, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -147,9 +135,6 @@ ep_type: [ FI_EP_RDM, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_RMA, ], @@ -182,9 +167,6 @@ ep_type: [ FI_EP_RDM, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_RMA, ], @@ -228,9 +210,6 @@ FT_COMP_QUEUE, FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -269,9 +248,6 @@ FT_COMP_QUEUE, FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -314,9 +290,6 @@ FT_COMP_QUEUE, FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -359,9 +332,6 @@ FT_COMP_QUEUE, FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -395,9 +365,6 @@ FT_COMP_QUEUE, FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -437,9 +404,6 @@ FT_COMP_QUEUE, FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], diff --git a/fabtests/test_configs/shm/verify.test b/fabtests/test_configs/shm/verify.test index 156c4415c29..d94394be585 100644 --- a/fabtests/test_configs/shm/verify.test +++ b/fabtests/test_configs/shm/verify.test @@ -14,9 +14,6 @@ ep_type: [ FI_EP_RDM, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -48,9 +45,6 @@ ep_type: [ FI_EP_RDM, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_RMA, ], @@ -81,9 +75,6 @@ ep_type: [ FI_EP_RDM, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_RMA, ], @@ -141,9 +132,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -187,9 +175,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], 
test_class: [ FT_CAP_ATOMIC, ], @@ -239,9 +224,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -299,9 +281,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -342,9 +321,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -391,9 +367,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], diff --git a/fabtests/test_configs/sockets/all.test b/fabtests/test_configs/sockets/all.test index 31080aa8b8d..f7c27f710d7 100644 --- a/fabtests/test_configs/sockets/all.test +++ b/fabtests/test_configs/sockets/all.test @@ -14,7 +14,6 @@ ], ep_type: [ FI_EP_MSG, - FI_EP_DGRAM, FI_EP_RDM, ], av_type: [ @@ -24,9 +23,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -58,9 +54,6 @@ cq_wait_obj: [ FI_WAIT_NONE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, ], @@ -76,7 +69,6 @@ ], ep_type: [ FI_EP_MSG, - FI_EP_DGRAM, ], av_type: [ FI_AV_TABLE, @@ -93,9 +85,6 @@ FI_WAIT_FD, FI_WAIT_MUTEX_COND, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, ], diff --git a/fabtests/test_configs/sockets/complete.test b/fabtests/test_configs/sockets/complete.test index 8483413f701..b6c932daf42 100644 --- a/fabtests/test_configs/sockets/complete.test +++ b/fabtests/test_configs/sockets/complete.test @@ -15,7 +15,6 @@ ], ep_type: [ FI_EP_MSG, - FI_EP_DGRAM, FI_EP_RDM, ], av_type: [ @@ -26,10 +25,6 @@ FT_COMP_QUEUE, FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - FT_MODE_NONE, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -60,10 +55,6 @@ FI_WAIT_FD, FI_WAIT_MUTEX_COND, ], - mode: [ - FT_MODE_ALL, - FT_MODE_NONE, - ], test_class: [ FT_CAP_MSG, ], @@ -80,7 +71,6 @@ ], ep_type: [ FI_EP_MSG, - FI_EP_DGRAM, ], av_type: [ FI_AV_TABLE, @@ -97,10 +87,6 @@ FI_WAIT_FD, FI_WAIT_MUTEX_COND, ], - mode: [ - FT_MODE_ALL, - FT_MODE_NONE, - ], test_class: [ FT_CAP_MSG, ], @@ -117,7 +103,6 @@ ], ep_type: [ FI_EP_MSG, - FI_EP_DGRAM, ], av_type: [ FI_AV_TABLE, @@ -134,10 +119,6 @@ FI_WAIT_FD, FI_WAIT_MUTEX_COND, ], - mode: [ - FT_MODE_ALL, - FT_MODE_NONE, - ], test_class: [ FT_CAP_MSG, ], @@ -171,10 +152,6 @@ FT_COMP_QUEUE, FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - FT_MODE_NONE, - ], test_class: [ FT_CAP_RMA, ], @@ -235,10 +212,6 @@ FT_COMP_QUEUE, FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - FT_MODE_NONE, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -285,10 +258,6 @@ FT_COMP_QUEUE, FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - FT_MODE_NONE, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -341,10 +310,6 @@ FT_COMP_QUEUE, FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - FT_MODE_NONE, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -368,9 +333,6 @@ comp_type: [ FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -404,9 +366,6 @@ comp_type: [ FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -441,9 +400,6 @@ comp_type: [ FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -475,9 +431,6 @@ FT_COMP_QUEUE, FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -509,9 +462,6 @@ FT_COMP_QUEUE, FT_COMP_CNTR, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, diff --git a/fabtests/test_configs/sockets/quick.test b/fabtests/test_configs/sockets/quick.test index b913732390f..b929c5d7e91 100644 --- a/fabtests/test_configs/sockets/quick.test +++ 
b/fabtests/test_configs/sockets/quick.test @@ -15,7 +15,6 @@ ], ep_type: [ FI_EP_MSG, - FI_EP_DGRAM, FI_EP_RDM, ], av_type: [ @@ -25,9 +24,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -60,9 +56,6 @@ cq_wait_obj: [ FI_WAIT_NONE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, ], @@ -79,7 +72,6 @@ ], ep_type: [ FI_EP_MSG, - FI_EP_DGRAM, ], av_type: [ FI_AV_TABLE, @@ -96,9 +88,6 @@ FI_WAIT_FD, FI_WAIT_MUTEX_COND, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, ], @@ -131,9 +120,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_RMA, ], @@ -171,9 +157,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -206,9 +189,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -241,9 +221,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], diff --git a/fabtests/test_configs/sockets/verify.test b/fabtests/test_configs/sockets/verify.test index 79ad5084a7f..a284a8c2577 100644 --- a/fabtests/test_configs/sockets/verify.test +++ b/fabtests/test_configs/sockets/verify.test @@ -20,9 +20,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -54,9 +51,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_RMA, ], @@ -114,9 +108,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -160,9 +151,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -212,9 +200,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_ATOMIC, ], @@ -239,9 +224,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, FT_CAP_TAGGED, @@ -268,9 +250,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_RMA, ], diff --git a/fabtests/test_configs/tcp/quick.test b/fabtests/test_configs/tcp/quick.test new file mode 100644 index 00000000000..34b323e958e --- /dev/null +++ b/fabtests/test_configs/tcp/quick.test @@ -0,0 +1,54 @@ +#: "Suite of tests for the tcp provider" +{ + prov_name: tcp, + test_type: [ + FT_TEST_LATENCY, + FT_TEST_BANDWIDTH, + FT_TEST_UNIT + ], + class_function: [ + FT_FUNC_SEND, + FT_FUNC_SENDV, + FT_FUNC_SENDMSG, + FT_FUNC_INJECT, + FT_FUNC_INJECTDATA, + FT_FUNC_SENDDATA, + ], + ep_type: [ + FI_EP_MSG, + ], + comp_type: [ + FT_COMP_QUEUE + ], + test_class: [ + FT_CAP_MSG, + ], + test_flags: FT_FLAG_QUICKTEST +}, +{ + prov_name: tcp, + test_type: [ + FT_TEST_LATENCY, + FT_TEST_BANDWIDTH, + FT_TEST_UNIT + ], + class_function: [ + FT_FUNC_READ, + FT_FUNC_READV, + FT_FUNC_READMSG, + FT_FUNC_WRITE, + FT_FUNC_WRITEV, + FT_FUNC_WRITEMSG, + FT_FUNC_WRITEDATA + ], + ep_type: [ + FI_EP_MSG, + ], + comp_type: [ + FT_COMP_QUEUE + ], + test_class: [ + FT_CAP_RMA, + ], + test_flags: FT_FLAG_QUICKTEST +} diff --git a/fabtests/test_configs/tcp/tcp.exclude b/fabtests/test_configs/tcp/tcp.exclude index 566316bb118..9600858800f 100644 --- a/fabtests/test_configs/tcp/tcp.exclude +++ b/fabtests/test_configs/tcp/tcp.exclude @@ -3,7 +3,7 @@ ^fi_dgram -e dgram -rdm_rma_simple +rdm_rma_event rdm_rma_trigger shared_ctx scalable_ep @@ -12,6 +12,7 @@ multi_mr atomic inj_complete -e msg unexpected_msg -e msg +multi_recv # TODO. Following fails with macOS. 
will fix them later cq_data -e rdm diff --git a/fabtests/test_configs/udp/all.test b/fabtests/test_configs/udp/all.test index bb1c20b27fc..a5c05823a7e 100644 --- a/fabtests/test_configs/udp/all.test +++ b/fabtests/test_configs/udp/all.test @@ -31,9 +31,6 @@ FI_WAIT_UNSPEC, FI_WAIT_FD, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, ], diff --git a/fabtests/test_configs/udp/functional.test b/fabtests/test_configs/udp/functional.test index 37d7af1427a..6adc2dff2d0 100644 --- a/fabtests/test_configs/udp/functional.test +++ b/fabtests/test_configs/udp/functional.test @@ -22,9 +22,6 @@ cq_wait_obj: [ FI_WAIT_NONE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, ], diff --git a/fabtests/test_configs/udp/lat_bw.test b/fabtests/test_configs/udp/lat_bw.test index fca389de07b..7d0420f491b 100644 --- a/fabtests/test_configs/udp/lat_bw.test +++ b/fabtests/test_configs/udp/lat_bw.test @@ -17,9 +17,6 @@ comp_type: [ FT_COMP_QUEUE, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, ], diff --git a/fabtests/test_configs/udp/quick.test b/fabtests/test_configs/udp/quick.test index 3a6ae212169..1ac216ba0ad 100644 --- a/fabtests/test_configs/udp/quick.test +++ b/fabtests/test_configs/udp/quick.test @@ -31,9 +31,6 @@ FI_WAIT_UNSPEC, FI_WAIT_FD, ], - mode: [ - FT_MODE_ALL, - ], test_class: [ FT_CAP_MSG, ], diff --git a/fabtests/test_configs/usnic/all.test b/fabtests/test_configs/usnic/all.test index 958cc7e52c9..7c93a0ce53a 100644 --- a/fabtests/test_configs/usnic/all.test +++ b/fabtests/test_configs/usnic/all.test @@ -32,7 +32,7 @@ FI_WAIT_UNSPEC, ], mode: [ - FT_MODE_ALL, + FI_CONTEXT, FI_RX_CQ_DATA, ], test_class: [ FT_CAP_MSG, diff --git a/fabtests/test_configs/usnic/quick.test b/fabtests/test_configs/usnic/quick.test index 9ca00afd8ca..225d4cd77a5 100644 --- a/fabtests/test_configs/usnic/quick.test +++ b/fabtests/test_configs/usnic/quick.test @@ -32,7 +32,7 @@ FI_WAIT_UNSPEC, ], mode: [ - FT_MODE_ALL, + FI_CONTEXT, FI_RX_CQ_DATA, ], test_class: [ FT_CAP_MSG, diff --git a/fabtests/test_configs/verbs/all.test b/fabtests/test_configs/verbs/all.test index d5ec27b768f..2ad282abfd9 100644 --- a/fabtests/test_configs/verbs/all.test +++ b/fabtests/test_configs/verbs/all.test @@ -25,6 +25,7 @@ test_class: [ FT_CAP_MSG, ], + mode: [FI_CONTEXT, FI_RX_CQ_DATA], mr_mode: [FI_MR_LOCAL, FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, FI_MR_PROV_KEY], }, { @@ -45,6 +46,7 @@ test_class: [ FT_CAP_MSG, ], + mode: [FI_CONTEXT, FI_RX_CQ_DATA], mr_mode: [FI_MR_LOCAL, FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, FI_MR_PROV_KEY], msg_flags: FI_REMOTE_CQ_DATA, }, @@ -71,6 +73,7 @@ test_class: [ FT_CAP_RMA, ], + mode: [FI_CONTEXT, FI_RX_CQ_DATA], mr_mode: [FI_MR_LOCAL, FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, FI_MR_PROV_KEY], }, { @@ -91,6 +94,7 @@ test_class: [ FT_CAP_RMA, ], + mode: [FI_CONTEXT, FI_RX_CQ_DATA], mr_mode: [FI_MR_LOCAL, FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, FI_MR_PROV_KEY], msg_flags: FI_REMOTE_CQ_DATA, }, @@ -119,6 +123,7 @@ test_class: [ FT_CAP_MSG, ], + mode: [FI_CONTEXT, FI_RX_CQ_DATA], mr_mode: [FI_MR_LOCAL, FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, FI_MR_PROV_KEY], }, { @@ -147,5 +152,6 @@ test_class: [ FT_CAP_MSG, ], + mode: [FI_CONTEXT, FI_RX_CQ_DATA], mr_mode: [FI_MR_LOCAL, FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, FI_MR_PROV_KEY], }, diff --git a/fabtests/test_configs/verbs/quick.test b/fabtests/test_configs/verbs/quick.test index a8f0a230d07..2797f8b23e3 100644 --- a/fabtests/test_configs/verbs/quick.test +++ b/fabtests/test_configs/verbs/quick.test @@ -18,9 +18,8 @@ comp_type: [ FT_COMP_QUEUE ], - mode: [ - FT_MODE_ALL - ], + 
mr_mode: [ FI_MR_LOCAL, FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, FI_MR_PROV_KEY ], + mode: [ FI_CONTEXT, FI_RX_CQ_DATA ], test_class: [ FT_CAP_MSG, ], @@ -48,9 +47,8 @@ cq_wait_obj: [ FI_WAIT_NONE ], - mode: [ - FT_MODE_ALL - ], + mr_mode: [ FI_MR_LOCAL, FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, FI_MR_PROV_KEY ], + mode: [ FI_CONTEXT, FI_RX_CQ_DATA ], test_class: [ FT_CAP_MSG, ], @@ -79,9 +77,8 @@ FI_WAIT_UNSPEC, FI_WAIT_FD, ], - mode: [ - FT_MODE_ALL - ], + mr_mode: [ FI_MR_LOCAL, FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, FI_MR_PROV_KEY ], + mode: [ FI_CONTEXT, FI_RX_CQ_DATA ], test_class: [ FT_CAP_MSG, ], diff --git a/fabtests/test_configs/verbs/exclude b/fabtests/test_configs/verbs/verbs.exclude similarity index 100% rename from fabtests/test_configs/verbs/exclude rename to fabtests/test_configs/verbs/verbs.exclude diff --git a/fabtests/ubertest/config.c b/fabtests/ubertest/config.c index 16eec1c33d3..b050b6eea48 100644 --- a/fabtests/ubertest/config.c +++ b/fabtests/ubertest/config.c @@ -38,8 +38,6 @@ #define FT_CAP_RMA FI_RMA | FI_READ | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE #define FT_CAP_ATOMIC FI_ATOMICS | FI_READ | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE -#define FT_MODE_ALL FI_CONTEXT | FI_LOCAL_MR | FI_RX_CQ_DATA /*| FI_MSG_PREFIX*/ -#define FT_MODE_NONE ~0ULL struct key_t { char *str; @@ -48,68 +46,6 @@ struct key_t { int val_size; }; -static struct ft_set test_sets_default[] = { - { - .prov_name = "sockets", - .test_type = { - FT_TEST_LATENCY, - FT_TEST_BANDWIDTH - }, - .class_function = { - FT_FUNC_SEND, - FT_FUNC_SENDV, - FT_FUNC_SENDMSG - }, - .ep_type = { - FI_EP_MSG, - FI_EP_DGRAM, - FI_EP_RDM - }, - .av_type = { - FI_AV_TABLE, - FI_AV_MAP - }, - .comp_type = { - FT_COMP_QUEUE - }, - .mode = { - FT_MODE_ALL - }, - .test_class = { - FT_CAP_MSG, - FT_CAP_TAGGED, -// FT_CAP_RMA, -// FT_CAP_ATOMIC - }, - .test_flags = FT_FLAG_QUICKTEST - }, - { - .prov_name = "verbs", - .test_type = { - FT_TEST_LATENCY, - FT_TEST_BANDWIDTH - }, - .class_function = { - FT_FUNC_SEND, - FT_FUNC_SENDV, - FT_FUNC_SENDMSG - }, - .ep_type = { - FI_EP_MSG, - }, - .comp_type = { - FT_COMP_QUEUE - }, - .mode = { - FT_MODE_ALL - }, - .test_class = { - FT_CAP_MSG, - }, - .test_flags = FT_FLAG_QUICKTEST - }, -}; - static struct ft_series test_series; size_t sm_size_array[] = { @@ -329,12 +265,12 @@ static int ft_parse_num(char *str, int len, struct key_t *key, void *buf) TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_SENDDATA, enum ft_class_function, buf); TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_WRITE, enum ft_class_function, buf); - TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_WRITEV, enum ft_class_function, buf); + TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_WRITEV, enum ft_class_function, buf); TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_WRITEMSG, enum ft_class_function, buf); TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_WRITEDATA, enum ft_class_function, buf); TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_INJECT_WRITE, enum ft_class_function, buf); TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_INJECT_WRITEDATA, enum ft_class_function, buf); - + TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_READ, enum ft_class_function, buf); TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_READV, enum ft_class_function, buf); TEST_ENUM_SET_N_RETURN(str, len, FT_FUNC_READMSG, enum ft_class_function, buf); @@ -458,14 +394,19 @@ static int ft_parse_num(char *str, int len, struct key_t *key, void *buf) } else if (!strncmp(key->str, "tx_op_flags", strlen("tx_op_flags"))) { TEST_ENUM_SET_N_RETURN(str, len, FI_COMPLETION, uint64_t, buf); FT_ERR("Unknown tx_op_flags"); - } else { 
+ } else if (!strncmp(key->str, "comp_type", strlen("comp_type"))) { TEST_ENUM_SET_N_RETURN(str, len, FT_COMP_QUEUE, enum ft_comp_type, buf); TEST_ENUM_SET_N_RETURN(str, len, FT_COMP_CNTR, enum ft_comp_type, buf); TEST_ENUM_SET_N_RETURN(str, len, FT_COMP_ALL, enum ft_comp_type, buf); - TEST_SET_N_RETURN(str, len, "FT_MODE_ALL", FT_MODE_ALL, uint64_t, buf); - TEST_SET_N_RETURN(str, len, "FT_MODE_NONE", FT_MODE_NONE, uint64_t, buf); + FT_ERR("Unknown comp_type"); + } else if (!strncmp(key->str, "mode", strlen("mode"))) { + TEST_ENUM_SET_N_RETURN(str, len, FI_CONTEXT, uint64_t, buf); + TEST_ENUM_SET_N_RETURN(str, len, FI_RX_CQ_DATA, uint64_t, buf); + FT_ERR("Unsupported mode bit"); + } else if (!strncmp(key->str, "test_flags", strlen("test_flags"))) { TEST_SET_N_RETURN(str, len, "FT_FLAG_QUICKTEST", FT_FLAG_QUICKTEST, uint64_t, buf); - FT_ERR("Unknown comp_type/mode/test_flags"); + } else { + FT_ERR("Unknown test configuration key"); } return -1; @@ -563,8 +504,8 @@ static int ft_parse_config(char *config, int size, * JSMN_STRING * JSMN_STRING : * JSMN_STRING : - * In our case, JSMN_OBJECT would represent a ft_set structure. The rest - * of the tokens would be treated as key-value pairs. The first JSMN_STRING + * In our case, JSMN_OBJECT would represent a ft_set structure. The rest + * of the tokens would be treated as key-value pairs. The first JSMN_STRING * would represent a key and the next would represent a value. A value * can also be an array. jsmntok_t.size would represent the length of * the array. @@ -682,9 +623,8 @@ struct ft_series *fts_load(char *filename) free(config); fclose(fp); } else { - printf("No config file given. Using default tests.\n"); - test_series.sets = test_sets_default; - test_series.nsets = sizeof(test_sets_default) / sizeof(test_sets_default[0]); + printf("Test config file required.\n"); + exit(1); } for (fts_start(&test_series, 0); !fts_end(&test_series, 0); @@ -704,8 +644,7 @@ struct ft_series *fts_load(char *filename) void fts_close(struct ft_series *series) { - if (series->sets != test_sets_default) - free(series->sets); + free(series->sets); } void fts_start(struct ft_series *series, int index) @@ -894,8 +833,11 @@ void fts_cur_info(struct ft_series *series, struct ft_info *info) while (set->tx_op_flags[i]) info->tx_op_flags |= set->tx_op_flags[i++]; } - info->mode = !set->mode[series->cur_mode] ? 
- FT_MODE_ALL : set->mode[series->cur_mode]; + if (set->mode[0]) { + i = 0; + while (set->mode[i]) + info->mode |= set->mode[i++]; + } info->ep_type = set->ep_type[series->cur_ep]; info->av_type = set->av_type[series->cur_av]; @@ -911,16 +853,17 @@ void fts_cur_info(struct ft_series *series, struct ft_info *info) info->cntr_wait_obj = set->cntr_wait_obj[series->cur_cntr_wait_obj]; if (set->node[0]) - strncpy(info->node, set->node, sizeof(info->node) - 1); + strncpy(info->node, set->node, sizeof(info->node)); else if (opts.dst_addr) - strncpy(info->node, opts.dst_addr, sizeof(info->node) - 1); + strncpy(info->node, opts.dst_addr, sizeof(info->node)); + info->node[sizeof(info->node) - 1] = '\0'; + if (set->service[0]) - strncpy(info->service, set->service, sizeof(info->service) - 1); + strncpy(info->service, set->service, sizeof(info->service)); else if (opts.dst_port) - strncpy(info->service, opts.dst_port, sizeof(info->service) - 1); - strncpy(info->prov_name, set->prov_name, sizeof(info->prov_name) - 1); - - info->node[sizeof(info->node) - 1] = '\0'; + strncpy(info->service, opts.dst_port, sizeof(info->service)); info->service[sizeof(info->service) - 1] = '\0'; + + strncpy(info->prov_name, set->prov_name, sizeof(info->prov_name)); info->prov_name[sizeof(info->prov_name) - 1] = '\0'; } diff --git a/fabtests/ubertest/fabtest.h b/fabtests/ubertest/fabtest.h index d88a8354c44..0cc83ea9ff7 100644 --- a/fabtests/ubertest/fabtest.h +++ b/fabtests/ubertest/fabtest.h @@ -327,10 +327,6 @@ struct ft_msg { uint8_t data[124]; }; -int ft_fw_send(int fd, void *msg, size_t len); -int ft_fw_recv(int fd, void *msg, size_t len); - - int ft_open_control(); ssize_t ft_get_event(uint32_t *event, void *buf, size_t len, uint32_t event_check, size_t len_check); diff --git a/fabtests/ubertest/ofi_atomic.c b/fabtests/ubertest/ofi_atomic.c index 075d722c1e0..1737a4981d4 100644 --- a/fabtests/ubertest/ofi_atomic.c +++ b/fabtests/ubertest/ofi_atomic.c @@ -55,12 +55,12 @@ #define OFI_OP_READ(type,dst,src) /* src unused, dst is written to result */ #define OFI_OP_WRITE(type,dst,src) (dst) = (src) -#define OFI_OP_CSWAP_EQ(type,dst,src,cmp) if ((dst) == (cmp)) (dst) = (src) -#define OFI_OP_CSWAP_NE(type,dst,src,cmp) if ((dst) != (cmp)) (dst) = (src) -#define OFI_OP_CSWAP_LE(type,dst,src,cmp) if ((dst) <= (cmp)) (dst) = (src) -#define OFI_OP_CSWAP_LT(type,dst,src,cmp) if ((dst) < (cmp)) (dst) = (src) -#define OFI_OP_CSWAP_GE(type,dst,src,cmp) if ((dst) >= (cmp)) (dst) = (src) -#define OFI_OP_CSWAP_GT(type,dst,src,cmp) if ((dst) > (cmp)) (dst) = (src) +#define OFI_OP_CSWAP_EQ(type,dst,src,cmp) if ((cmp) == (dst)) (dst) = (src) +#define OFI_OP_CSWAP_NE(type,dst,src,cmp) if ((cmp) != (dst)) (dst) = (src) +#define OFI_OP_CSWAP_LE(type,dst,src,cmp) if ((cmp) <= (dst)) (dst) = (src) +#define OFI_OP_CSWAP_LT(type,dst,src,cmp) if ((cmp) < (dst)) (dst) = (src) +#define OFI_OP_CSWAP_GE(type,dst,src,cmp) if ((cmp) >= (dst)) (dst) = (src) +#define OFI_OP_CSWAP_GT(type,dst,src,cmp) if ((cmp) > (dst)) (dst) = (src) #define OFI_OP_MSWAP(type,dst,src,cmp) (dst) = (((src) & (cmp)) | \ ((dst) & ~(cmp))) @@ -294,7 +294,7 @@ OFI_DEFINE_ALL_HANDLERS(WRITE, FUNC, OFI_OP_LXOR) OFI_DEFINE_INT_HANDLERS(WRITE, FUNC, OFI_OP_BXOR) OFI_DEFINE_ALL_HANDLERS(WRITE, FUNC, OFI_OP_WRITE) -void (*ofi_atomic_write_handlers[OFI_WRITE_OP_LAST][FI_DATATYPE_LAST]) +void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST]) (void *dst, const void *src, size_t cnt) = { { OFI_DEFINE_REALNO_HANDLERS(WRITE, NAME, OFI_OP_MIN) }, @@ -330,7 +330,7 @@ 
OFI_DEFINE_INT_HANDLERS(READWRITE, FUNC, OFI_OP_BXOR) OFI_DEFINE_ALL_HANDLERS(READ, FUNC, OFI_OP_READ) OFI_DEFINE_ALL_HANDLERS(READWRITE, FUNC, OFI_OP_WRITE) -void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_LAST][FI_DATATYPE_LAST]) +void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][FI_DATATYPE_LAST]) (void *dst, const void *src, void *res, size_t cnt) = { { OFI_DEFINE_REALNO_HANDLERS(READWRITE, NAME, OFI_OP_MIN) }, @@ -360,7 +360,7 @@ OFI_DEFINE_REALNO_HANDLERS(CSWAP, FUNC, OFI_OP_CSWAP_GE) OFI_DEFINE_REALNO_HANDLERS(CSWAP, FUNC, OFI_OP_CSWAP_GT) OFI_DEFINE_INT_HANDLERS(CSWAP, FUNC, OFI_OP_MSWAP) -void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_LAST][FI_DATATYPE_LAST]) +void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][FI_DATATYPE_LAST]) (void *dst, const void *src, const void *cmp, void *res, size_t cnt) = { { OFI_DEFINE_ALL_HANDLERS(CSWAP, NAME, OFI_OP_CSWAP_EQ) }, diff --git a/fabtests/ubertest/ofi_atomic.h b/fabtests/ubertest/ofi_atomic.h index 98966e238d7..aec830b728f 100644 --- a/fabtests/ubertest/ofi_atomic.h +++ b/fabtests/ubertest/ofi_atomic.h @@ -44,19 +44,39 @@ typedef float complex ofi_complex_float; typedef double complex ofi_complex_double; typedef long double complex ofi_complex_long_double; -#define OFI_WRITE_OP_LAST FI_CSWAP -#define OFI_READWRITE_OP_LAST FI_CSWAP +#define OFI_WRITE_OP_START FI_MIN +#define OFI_WRITE_OP_LAST (FI_ATOMIC_WRITE + 1) +#define OFI_WRITE_OP_CNT (OFI_WRITE_OP_LAST - OFI_WRITE_OP_START) +#define OFI_READWRITE_OP_START FI_MIN +#define OFI_READWRITE_OP_LAST (FI_ATOMIC_WRITE + 1) +#define OFI_READWRITE_OP_CNT (OFI_READWRITE_OP_LAST - OFI_READWRITE_OP_START) #define OFI_SWAP_OP_START FI_CSWAP -#define OFI_SWAP_OP_LAST (FI_MSWAP - FI_CSWAP + 1) +#define OFI_SWAP_OP_LAST (FI_MSWAP + 1) +#define OFI_SWAP_OP_CNT (OFI_SWAP_OP_LAST - OFI_SWAP_OP_START) -extern void (*ofi_atomic_write_handlers[OFI_WRITE_OP_LAST][FI_DATATYPE_LAST]) +#define ofi_atomic_iswrite_op(op) \ + (op >= OFI_WRITE_OP_START && op < OFI_WRITE_OP_LAST && op != FI_ATOMIC_READ) +#define ofi_atomic_isreadwrite_op(op) \ + (op >= OFI_READWRITE_OP_START && op < OFI_READWRITE_OP_LAST) +#define ofi_atomic_isswap_op(op) \ + (op >= OFI_SWAP_OP_START && op < OFI_SWAP_OP_LAST) + +extern void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST]) (void *dst, const void *src, size_t cnt); -extern void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_LAST][FI_DATATYPE_LAST]) +extern void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][FI_DATATYPE_LAST]) (void *dst, const void *src, void *res, size_t cnt); -extern void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_LAST][FI_DATATYPE_LAST]) +extern void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][FI_DATATYPE_LAST]) (void *dst, const void *src, const void *cmp, void *res, size_t cnt); +#define ofi_atomic_write_handler(op, datatype, dst, src, cnt) \ + ofi_atomic_write_handlers[op][datatype](dst, src, cnt) +#define ofi_atomic_readwrite_handler(op, datatype, dst, src, res, cnt) \ + ofi_atomic_readwrite_handlers[op][datatype](dst, src, res, cnt) +#define ofi_atomic_swap_handler(op, datatype, dst, src, cmp, res, cnt) \ + ofi_atomic_swap_handlers[op - OFI_SWAP_OP_START][datatype](dst, src, \ + cmp, res, cnt) + #define OFI_DEF_COMPLEX_OPS(type) \ static inline int ofi_complex_eq_## type \ (ofi_complex_## type a, ofi_complex_## type b) \ diff --git a/fabtests/ubertest/test_ctrl.c b/fabtests/ubertest/test_ctrl.c index cf3757f2dc7..dac2a7e0902 100644 --- a/fabtests/ubertest/test_ctrl.c +++ b/fabtests/ubertest/test_ctrl.c @@ -70,7 +70,7 @@ static int 
ft_init_rx_control(void) ft_rx_ctrl.cq_format = FI_CQ_FORMAT_DATA; ft_rx_ctrl.addr = FI_ADDR_UNSPEC; - ft_rx_ctrl.msg_size = med_size_array[med_size_cnt - 1]; + ft_rx_ctrl.msg_size = ft_ctrl.size_array[ft_ctrl.size_cnt - 1]; if (fabric_info && fabric_info->ep_attr && fabric_info->ep_attr->max_msg_size && fabric_info->ep_attr->max_msg_size < ft_rx_ctrl.msg_size) @@ -120,13 +120,8 @@ static int ft_init_control(void) ft_ctrl.iov_array = sm_size_array; ft_ctrl.iov_cnt = sm_size_cnt; - if (test_info.test_class & FI_RMA) { - ft_ctrl.size_array = lg_size_array; - ft_ctrl.size_cnt = lg_size_cnt; - } else { - ft_ctrl.size_array = med_size_array; - ft_ctrl.size_cnt = med_size_cnt; - } + ft_ctrl.size_array = lg_size_array; + ft_ctrl.size_cnt = lg_size_cnt; ret = ft_init_rx_control(); if (ret) diff --git a/fabtests/ubertest/uber.c b/fabtests/ubertest/uber.c index 52ed2f68cff..eb6ef3b29d5 100644 --- a/fabtests/ubertest/uber.c +++ b/fabtests/ubertest/uber.c @@ -68,8 +68,6 @@ enum { static int results[FT_MAX_RESULT]; static char *filename = NULL; -static char *provname = NULL; -static char *testname = NULL; static int ft_nullstr(char *str) @@ -190,7 +188,7 @@ static void ft_print_comp(struct ft_info *test) printf(", rx: "); ft_print_comp_flag(test->rx_cq_bind_flags, test->rx_op_flags); printf(", "); -} +} static void ft_show_test_info(void) { @@ -272,11 +270,8 @@ static void ft_fw_convert_info(struct fi_info *info, struct ft_info *test_info) info->domain_attr->cq_data_size = 4; } -static void -ft_fw_update_info(struct ft_info *test_info, struct fi_info *info, int subindex) +static void ft_fw_update_info(struct ft_info *test_info, struct fi_info *info) { - test_info->test_subindex = subindex; - if (info->ep_attr) { test_info->protocol = info->ep_attr->protocol; test_info->protocol_version = info->ep_attr->protocol_version; @@ -296,6 +291,7 @@ ft_fw_update_info(struct ft_info *test_info, struct fi_info *info, int subindex) if (info->domain_attr) { test_info->progress = info->domain_attr->data_progress; test_info->threading = info->domain_attr->threading; + test_info->mr_mode = info->domain_attr->mr_mode; } test_info->mode = info->mode; @@ -306,12 +302,14 @@ static int ft_fw_result_index(int fi_errno) switch (fi_errno) { case 0: return FT_SUCCESS; - case FI_ENODATA: + case -FI_ENODATA: return FT_ENODATA; - case FI_ENOSYS: + case -FI_ENOSYS: return FT_ENOSYS; - case FI_EIO: + case -FI_EIO: return FT_EIO; + case -FT_SKIP: + return FT_SKIP; default: return FT_ERROR; } @@ -332,289 +330,125 @@ static int ft_recv_test_info(void) return 0; } -static int ft_exchange_uint32(uint32_t local, uint32_t *remote) +static int ft_send_result(int err, struct fi_info *info) { - uint32_t local_net = htonl(local); int ret; - - ret = ft_sock_send(sock, &local_net, sizeof local); + ret = ft_sock_send(sock, &err, sizeof err); if (ret) { FT_PRINTERR("ft_sock_send", ret); return ret; } - - ret = ft_sock_recv(sock, remote, sizeof *remote); - if (ret) { - FT_PRINTERR("ft_sock_recv", ret); - return ret; + if (err) { + printf("Ending test %d, result: %s\n", test_info.test_index, + fi_strerror(-err)); + return err; } - *remote = ntohl(*remote); - return 0; } -static int ft_skip_info(struct fi_info *hints, struct fi_info *info) +static int ft_recv_result(struct fi_info *info) { - uint32_t remote_protocol, skip, remote_skip; - size_t len; - int ret; - - //make sure remote side is using the same protocol - ret = ft_exchange_uint32(info->ep_attr->protocol, &remote_protocol); - if (ret) - return ret; - - if (info->ep_attr->protocol != 
remote_protocol) - return 1; - - //check needed to skip utility providers, unless requested - skip = (!ft_util_name(hints->fabric_attr->prov_name, &len) && - strcmp(hints->fabric_attr->prov_name, - info->fabric_attr->prov_name)); - - ret = ft_exchange_uint32(skip, &remote_skip); - if (ret) - return ret; - - return skip || remote_skip; -} - -static int ft_transfer_subindex(int subindex, int *remote_idx) -{ - int ret; - - ret = ft_sock_send(sock, &subindex, sizeof subindex); - if (ret) { - FT_PRINTERR("ft_sock_send", ret); - return ret; - } - - ret = ft_sock_recv(sock, remote_idx, sizeof *remote_idx); + int ret, err = 0; + ret = ft_sock_recv(sock, &err, sizeof err); if (ret) { FT_PRINTERR("ft_sock_recv", ret); return ret; } + if (err) { + printf("Ending test %d, result: %s\n", test_info.test_index, + fi_strerror(-err)); + } - return 0; + return err; } -static int ft_fw_process_list_server(struct fi_info *hints, struct fi_info *info) +static int ft_server_setup(struct fi_info *hints, struct fi_info *info) { - int ret, subindex, remote_idx = 0, result = -FI_ENODATA, end_test = 0; - int server_ready = 0; - struct fi_info *open_res_info; + int ret = 0; - ret = ft_sock_send(sock, &test_info, sizeof test_info); - if (ret) { - FT_PRINTERR("ft_sock_send", ret); - return ret; + hints = fi_allocinfo(); + if (!hints) { + ret = -FI_ENOMEM; + goto err; } - for (subindex = 1, fabric_info = info; fabric_info; - fabric_info = fabric_info->next, subindex++) { - - ret = ft_check_info(hints, fabric_info); - if (ret) - return ret; - - /* Stores the fabric_info into a tmp variable, resolves an issue caused - * by ft_accept with FI_EP_MSG which overwrites the fabric_info. - */ - open_res_info = fabric_info; - while (1) { - fabric_info = open_res_info; - ret = ft_open_res(); - if (ret) { - FT_PRINTERR("ft_open_res", ret); - return ret; - } + ft_fw_convert_info(hints, &test_info); - if (!server_ready) { - server_ready = 1; - ret = ft_sock_send(sock, &server_ready, sizeof server_ready); - if (ret) { - FT_PRINTERR("ft_sock_send", ret); - return ret; - } - } + ret = fi_getinfo(FT_FIVERSION, ft_strptr(test_info.node), + ft_strptr(test_info.service), FI_SOURCE, hints, &info); + if (ret) { + FT_PRINTERR("fi_getinfo", ret); + goto err; + } - ret = ft_sock_recv(sock, &end_test, sizeof end_test); - if (ret) { - FT_PRINTERR("ft_sock_recv", ret); - return ret; - } - if (end_test) { - ft_cleanup(); - break; - } + fabric_info = info; - if (ft_skip_info(hints, fabric_info)) { - ft_cleanup(); - continue; - } + ret = ft_check_info(hints, fabric_info); + if (ret) + goto err; - ret = ft_transfer_subindex(subindex, &remote_idx); - if (ret) - return ret; + ret = ft_open_res(); + if (ret) + goto err; - ft_fw_update_info(&test_info, fabric_info, subindex); + ft_fw_update_info(&test_info, fabric_info); - printf("Starting test %d-%d-%d: ", test_info.test_index, - subindex, remote_idx); - ft_show_test_info(); + return 0; +err: + ft_send_result(ret, info); + return ret; +} - result = ft_init_test(); - if (result) - continue; +static int ft_server_child() +{ + struct fi_info *hints = NULL; + struct fi_info *info = NULL; + int ret, result; - result = ft_run_test(); + printf("Starting test %d:\n", test_info.test_index); - ret = ft_sock_send(sock, &result, sizeof result); - if (result) { - FT_PRINTERR("ft_run_test", result); - } else if (ret) { - FT_PRINTERR("ft_sock_send", ret); - return ret; - } - } + ret = ft_server_setup(hints, info); + if (ret) + return ret; - end_test = (fabric_info->next == NULL); - ret = ft_sock_send(sock, &end_test, 
sizeof end_test); - if (ret) { - FT_PRINTERR("ft_sock_send", ret); - return ret; - } - } + ret = ft_send_result(0, info); + if (ret) + return ret; - test_info.prov_name[0] = '\0'; ret = ft_sock_send(sock, &test_info, sizeof test_info); if (ret) { FT_PRINTERR("ft_sock_send", ret); return ret; } - if (subindex == 1) - return -FI_ENODATA; - - return result; -} - -static int ft_fw_process_list_client(struct fi_info *hints, struct fi_info *info) -{ - int ret, subindex, remote_idx = 0, result = -FI_ENODATA, sresult, end_test = 0; - - while (!end_test) { - for (subindex = 1, fabric_info = info; fabric_info; - fabric_info = fabric_info->next, subindex++) { - - end_test = 0; - ret = ft_sock_send(sock, &end_test, sizeof end_test); - if (ret) { - FT_PRINTERR("ft_sock_send", ret); - return ret; - } - - if (ft_skip_info(hints, fabric_info)) - continue; - - ret = ft_transfer_subindex(subindex, &remote_idx); - if (ret) - return ret; - - ret = ft_check_info(hints, fabric_info); - if (ret) - return ret; - - ft_fw_update_info(&test_info, fabric_info, subindex); - printf("Starting test %d-%d-%d: ", test_info.test_index, - subindex, remote_idx); - ft_show_test_info(); + ret = ft_recv_result(info); + if (ret) + return ret; - ret = ft_open_res(); - if (ret) { - FT_PRINTERR("ft_open_res", ret); - return ret; - } + ret = ft_init_test(); + if (ret) + return ret; - result = ft_init_test(); - if (result) - continue; - - result = ft_run_test(); - - ret = ft_sock_recv(sock, &sresult, sizeof sresult); - if (result && result != -FI_EIO) { - FT_PRINTERR("ft_run_test", result); - fprintf(stderr, "Node: %s\nService: %s \n", - test_info.node, test_info.service); - fprintf(stderr, "%s\n", fi_tostr(hints, FI_TYPE_INFO)); - return -FI_EOTHER; - } else if (ret) { - FT_PRINTERR("ft_sock_recv", ret); - result = ret; - return -FI_EOTHER; - } else if (sresult) { - result = sresult; - if (sresult != -FI_EIO) - return -FI_EOTHER; - } - } - end_test = 1; - ret = ft_sock_send(sock, &end_test, sizeof end_test); - if (ret) { - FT_PRINTERR("ft_sock_send", ret); - return ret; - } + result = ft_run_test(); - ret = ft_sock_recv(sock, &end_test, sizeof end_test); - if (ret) { - FT_PRINTERR("ft_sock_recv", ret); - return ret; - } + ret = ft_sock_send(sock, &result, sizeof result); + if (result) { + FT_PRINTERR("ft_run_test", result); } - if (subindex == 1) - return -FI_ENODATA; - - return result; -} - -static int ft_server_child() -{ - struct fi_info *hints, *info; - int ret; - - hints = fi_allocinfo(); - if (!hints) - return -FI_ENOMEM; - - ft_fw_convert_info(hints, &test_info); - printf("Starting test %d:\n", test_info.test_index); + fi_freeinfo(hints); + ft_cleanup(); - ret = fi_getinfo(FT_FIVERSION, ft_strptr(test_info.node), - ft_strptr(test_info.service), FI_SOURCE, - hints, &info); - if (ret && ret != -FI_ENODATA) { - FT_PRINTERR("fi_getinfo", ret); - } else { - ret = ft_fw_process_list_server(hints, info); - if (ret != -FI_ENODATA) - fi_freeinfo(info); - - if (ret && ret != -FI_EIO) { - FT_PRINTERR("ft_fw_process_list", ret); - printf("Node: %s\nService: %s\n", - test_info.node, test_info.service); - printf("%s\n", fi_tostr(hints, FI_TYPE_INFO)); - } + if (ret) { + FT_PRINTERR("ft_sock_send", ret); + return ret; } - fi_freeinfo(hints); printf("Ending test %d, result: %s\n", test_info.test_index, fi_strerror(-ret)); - return ret; + return result; } static int ft_fw_server(void) @@ -645,74 +479,118 @@ static int ft_fw_server(void) results[ft_fw_result_index(ret)]++; - } while (!ret || ret == FI_EIO || ret == FI_ENODATA); + } while (!ret 
|| ret == -FI_EIO || ret == -FI_ENODATA || ret == -FT_SKIP); return ret; } - -static int ft_client_child(void) +static int ft_client_setup(struct fi_info *hints, struct fi_info *info) { - struct fi_info *hints, *info; - int ret, result, server_ready = 0; + int ret; + ret = ft_recv_test_info(); + if (ret) + goto err; - result = -FI_ENODATA; hints = fi_allocinfo(); - if (!hints) - return -FI_ENOMEM; + if (!hints) { + ret = -FI_ENOMEM; + goto err; + } ret = ft_getsrcaddr(opts.src_addr, opts.src_port, hints); if (ret) - return ret; + goto err; ft_fw_convert_info(hints, &test_info); + ft_show_test_info(); + + ret = fi_getinfo(FT_FIVERSION, ft_strptr(test_info.node), + ft_strptr(test_info.service), 0, hints, &info); + if (ret) + goto err; + + fabric_info = info; + + ret = ft_check_info(hints, fabric_info); + if (ret) + goto err; + + ft_fw_update_info(&test_info, fabric_info); + + ret = ft_open_res(); + + return 0; + +err: + ft_send_result(ret, info); + return ret; +} +static int ft_client_child(void) +{ + struct fi_info *hints = NULL; + struct fi_info *info = NULL; + int ret, result, sresult = 0; + result = -FI_ENODATA; + + ret = ft_sock_send(sock, &test_info, sizeof test_info); + if (ret) + goto err; + printf("Starting test %d / %d:\n", test_info.test_index, series->test_count); - while (!ft_nullstr(test_info.prov_name)) { - printf("Starting test %d-%d: ", test_info.test_index, - test_info.test_subindex); - ft_show_test_info(); - ret = ft_sock_recv(sock, &server_ready, sizeof server_ready); - if (ret) - return ret; - - if (!server_ready) - return -FI_EOTHER; + ret = ft_recv_result(info); + if (ret) + return ret; - result = fi_getinfo(FT_FIVERSION, ft_strptr(test_info.node), - ft_strptr(test_info.service), 0, hints, &info); - if (result) { - FT_PRINTERR("fi_getinfo", result); - } + ret = ft_client_setup(hints, info); + if (ret) + return ret; - ret = ft_fw_process_list_client(hints, info); - if (ret != -FI_ENODATA) - fi_freeinfo(info); - else - goto out; + ret = ft_send_result(0, info); + if (ret) + return ret; - ret = ft_recv_test_info(); - if (ret) { - FT_PRINTERR("ft_recv_test_info", ret); - goto out; - } - ft_fw_convert_info(hints, &test_info); + result = ft_init_test(); + if (result) + return result; + + result = ft_run_test(); + ret = ft_sock_recv(sock, &sresult, sizeof sresult); + if (result && result != -FI_EIO) { + FT_PRINTERR("ft_run_test", result); + fprintf(stderr, "Node: %s\nService: %s \n", + test_info.node, test_info.service); + fprintf(stderr, "%s\n", fi_tostr(hints, FI_TYPE_INFO)); + ret = -FI_EOTHER; + } else if (ret) { + FT_PRINTERR("ft_sock_recv", ret); + result = ret; + ret = -FI_EOTHER; + } else if (sresult) { + result = sresult; + if (sresult != -FI_EIO) + ret = -FI_EOTHER; } printf("Ending test %d / %d, result: %s\n", test_info.test_index, series->test_count, fi_strerror(-result)); -out: + fi_freeinfo(hints); + ft_cleanup(); + + return 0; + +err: + ft_send_result(ret, info); return result; } static int ft_fw_client(void) { - int ret, result; + int result; pid_t pid; - for (fts_start(series, test_start_index); !fts_end(series, test_end_index); fts_next(series)) { @@ -725,18 +603,6 @@ static int ft_fw_client(void) continue; } - ret = ft_sock_send(sock, &test_info, sizeof test_info); - if (ret) { - FT_PRINTERR("ft_sock_send", ret); - return ret; - } - - ret = ft_recv_test_info(); - if (ret) { - FT_PRINTERR("ft_recv_test_info", ret); - return ret; - } - if (do_fork) { pid = fork(); if (!pid) { @@ -769,17 +635,14 @@ static void ft_fw_usage(char *program) { fprintf(stderr, 
"Usage:\n"); fprintf(stderr, " %s [OPTIONS] \t\t\tstart server\n", program); - fprintf(stderr, " %s [OPTIONS] \tconnect to server\n", program); + fprintf(stderr, " %s [OPTIONS] -u config_file \tconnect to server\n", program); fprintf(stderr, "\nOptions:\n"); FT_PRINT_OPTS_USAGE("-q ", "Management port for test"); FT_PRINT_OPTS_USAGE("-h", "display this help output"); fprintf(stderr, "\nServer only options:\n"); FT_PRINT_OPTS_USAGE("-x", "exit after test run"); fprintf(stderr, "\nClient only options:\n"); - FT_PRINT_OPTS_USAGE("-u ", "test configuration file " - "(Either config file or both provider and test name are required)"); - FT_PRINT_OPTS_USAGE("-p ", " provider name"); - FT_PRINT_OPTS_USAGE("-t ", "test name"); + FT_PRINT_OPTS_USAGE("-u ", "test configuration file "); FT_PRINT_OPTS_USAGE("-y ", ""); FT_PRINT_OPTS_USAGE("-z ", ""); FT_PRINT_OPTS_USAGE("-s
", "source address"); @@ -792,77 +655,6 @@ void ft_free() { if (filename) free(filename); - if (testname) - free(testname); - if (provname) - free(provname); -} - -static int ft_get_config_file(char *provname, char *testname, char **filename) -{ - char **prov_vec, **path_vec, *str; - size_t i, prov_count, path_count, len; - int ret = -FI_ENOMEM; - - // TODO use macro for ";" - prov_vec = ft_split_and_alloc(provname, ";", &prov_count); - if (!prov_vec) { - FT_ERR("Unable to split provname\n"); - return -FI_EINVAL; - } - - /* prov_count + count_of(CONFIG_PATH, "test_configs", "testname", ".test") */ - path_count = prov_count + 4; - path_vec = calloc(path_count, sizeof(*path_vec)); - if (!path_vec) - goto err1; - - path_vec[0] = CONFIG_PATH; - path_vec[1] = "test_configs"; - - /* Path for "prov1;prov2;prov3;..." is ".../prov3/prov2/prov1" */ - for (i = 0; i < prov_count; i++) - path_vec[i + 2] = prov_vec[prov_count - i - 1]; - - path_vec[prov_count + 2] = testname; - path_vec[prov_count + 3] = "test"; - - for (i = 0, len = 0; i < path_count; i++) - len += strlen(path_vec[i]) + 1; - - // NULL char at the end - len++; - - *filename = calloc(1, len); - if (!*filename) - goto err2; - - for (i = 0, str = *filename; i < path_count; i++) { - if (i < path_count - 1) - ret = snprintf(str, len, "/%s", path_vec[i]); - else - ret = snprintf(str, len, ".%s", path_vec[i]); - if (ret < 0) - goto err3; - - if (ret >= (int)len) { - ret = -FI_ETRUNC; - goto err3; - } - str += ret; - len -= ret; - } - free(path_vec); - ft_free_string_array(prov_vec); - return 0; -err3: - free(*filename); - *filename = NULL; -err2: - free(path_vec); -err1: - ft_free_string_array(prov_vec); - return ret; } int main(int argc, char **argv) @@ -871,17 +663,11 @@ int main(int argc, char **argv) opts = INIT_OPTS; int ret, op; - while ((op = getopt(argc, argv, "p:u:t:q:xy:z:hf" ADDR_OPTS)) != -1) { + while ((op = getopt(argc, argv, "u:q:xy:z:hf" ADDR_OPTS)) != -1) { switch (op) { case 'u': filename = strdup(optarg); break; - case 'p': - provname = strdup(optarg); - break; - case 't': - testname = strdup(optarg); - break; case 'q': service = optarg; break; @@ -919,21 +705,9 @@ int main(int argc, char **argv) if (!opts.dst_port) opts.dst_port = default_port; if (!filename) { - if (!testname || !provname) { - ft_fw_usage(argv[0]); - ft_free(); - exit(1); - } else { - ret = ft_get_config_file(provname, testname, - &filename); - if (ret < 0) { - ft_free(); - exit(1); - } - } - } else { - testname = NULL; - provname = NULL; + ft_fw_usage(argv[0]); + ft_free(); + exit(1); } series = fts_load(filename); if (!series) { diff --git a/fabtests/ubertest/verify.c b/fabtests/ubertest/verify.c index 08011b5c242..b58fa927461 100644 --- a/fabtests/ubertest/verify.c +++ b/fabtests/ubertest/verify.c @@ -132,13 +132,11 @@ static int verify_atomic(void) } if (is_compare_func(test_info.class_function)) { - ofi_atomic_swap_handlers[op - OFI_SWAP_OP_START][type](dst, - src, cmp, tmp, count); + ofi_atomic_swap_handler(op, type, dst, src, cmp, tmp, count); } else if (is_fetch_func(test_info.class_function)) { - ofi_atomic_readwrite_handlers[op][type](dst, - src, tmp, count); + ofi_atomic_readwrite_handler(op, type, dst, src, tmp, count); } else { - ofi_atomic_write_handlers[op][type](dst, src, count); + ofi_atomic_write_handler(op, type, dst, src, count); } SWITCH_TYPES(type, CHECK_LOCAL, dst, ft_mr_ctrl.buf, count, ret); diff --git a/fabtests/unit/av_test.c b/fabtests/unit/av_test.c index 81fd4d5ad69..edcd2bd8c97 100644 --- a/fabtests/unit/av_test.c +++ 
b/fabtests/unit/av_test.c @@ -515,6 +515,82 @@ av_goodbad_vector_sync() return TEST_RET_VAL(ret, testret); } +/* + * Tests: + * - sync vector with 1 good and 1 bad using FI_SYNC_ERR + */ +static int +av_goodbad_vector_sync_err() +{ + int testret, ret; + struct fid_av *av; + struct fi_av_attr attr; + uint8_t addrbuf[4096]; + int buflen; + int sync_err[2]; + + if (av_type != FI_AV_TABLE) { + ret = 0; + testret = SKIPPED; + sprintf(err_buf, "test not valid for AV type FI_AV_MAP"); + goto out; + } + + testret = FAIL; + + memset(&attr, 0, sizeof(attr)); + attr.type = av_type; + attr.count = 32; + + av = NULL; + ret = fi_av_open(domain, &attr, &av, NULL); + if (ret != 0) { + sprintf(err_buf, "fi_av_open(%s) = %d, %s", + fi_tostr(&av_type, FI_TYPE_AV_TYPE), + ret, fi_strerror(-ret)); + goto fail; + } + + sync_err[0] = -1; + sync_err[1] = 0; + + buflen = sizeof(addrbuf); + + /* vector is good address + bad address */ + ret = av_create_address_list(good_address, 0, 1, addrbuf, 0, buflen); + if (ret < 0) { + goto fail; // av_create_address_list filled err_buf + } + ret = av_create_address_list(bad_address, 0, 1, addrbuf, 1, buflen); + if (ret < 0) { + goto fail; // av_create_address_list filled err_buf + } + ret = fi_av_insert(av, addrbuf, 2, NULL, FI_SYNC_ERR, sync_err); + if (ret != 1) { + if (ret == -FI_EBADFLAGS) { + sprintf(err_buf, "FI_SYNC_ERR not supported\n"); + ret = -FI_ENOSYS; + } + sprintf(err_buf, "fi_av_insert ret=%d, should be 1", ret); + goto fail; + } + + if (sync_err[0] != 0) { + sprintf(err_buf, "sync_err[0] != 0"); + goto fail; + } + if (sync_err[1] == 0) { + sprintf(err_buf, "sync_err[1] = 0"); + goto fail; + } + + testret = PASS; +fail: + FT_CLOSE_FID(av); +out: + return TEST_RET_VAL(ret, testret); +} + /* * Tests: * - async good vector @@ -958,10 +1034,10 @@ struct test_entry test_array_good[] = { TEST_ENTRY(av_good_sync, "Test sync AV insert with good address"), TEST_ENTRY(av_null_fi_addr, "Test AV insert without specifying fi_addr"), TEST_ENTRY(av_good_vector_async, - "Test async AV insert with vector of good addresses"), + "Test async AV insert with vector of good addresses"), TEST_ENTRY(av_zero_async, "Test async insert AV insert of zero addresses"), TEST_ENTRY(av_good_2vector_async, - "Test async AV inserts with two address vectors"), + "Test async AV inserts with two address vectors"), TEST_ENTRY(av_insert_stages, "Test AV insert at various stages"), { NULL, "" } }; @@ -969,9 +1045,11 @@ struct test_entry test_array_good[] = { struct test_entry test_array_bad[] = { TEST_ENTRY(av_bad_sync, "Test sync AV insert of bad address"), TEST_ENTRY(av_goodbad_vector_sync, - "Test sync AV insert of 1 good and 1 bad address"), + "Test sync AV insert of 1 good and 1 bad address"), TEST_ENTRY(av_goodbad_vector_async, - "Test async AV insert with good and bad address"), + "Test async AV insert with good and bad address"), + TEST_ENTRY(av_goodbad_vector_sync_err, + "Test AV insert of 1 good, 1 bad address using FI_SYNC_ERR"), { NULL, "" } }; diff --git a/fabtests/unit/cq_test.c b/fabtests/unit/cq_test.c index d33fc1544f2..6396fc3f747 100644 --- a/fabtests/unit/cq_test.c +++ b/fabtests/unit/cq_test.c @@ -41,6 +41,7 @@ #include "unit_common.h" #include "shared.h" +static int test_max = 1 << 15; static char err_buf[512]; static int @@ -70,7 +71,7 @@ static int cq_open_close_simultaneous(void) int testret = FAIL; struct fid_cq **cq_array; - count = fi->domain_attr->cq_cnt; + count = MIN(fi->domain_attr->cq_cnt, test_max); FT_DEBUG("testing creation of up to %zu simultaneous CQs\n", count); 
cq_array = calloc(count, sizeof(*cq_array)); @@ -81,6 +82,10 @@ static int cq_open_close_simultaneous(void) for (opened = 0; opened < count && !ret; opened++) { ret = create_cq(&cq_array[opened], 0, 0, FI_CQ_FORMAT_UNSPEC, FI_WAIT_UNSPEC); + if (ret) { + ret = create_cq(&cq_array[opened], 0, 0, + FI_CQ_FORMAT_UNSPEC, FI_WAIT_NONE); + } } if (ret) { FT_WARN("fi_cq_open failed after %d (cq_cnt: %zu): %s", @@ -114,6 +119,11 @@ cq_open_close_sizes() size = (i < 0) ? 0 : 1 << i; ret = create_cq(&cq, size, 0, FI_CQ_FORMAT_UNSPEC, FI_WAIT_UNSPEC); + if (ret != 0) { + ret = create_cq(&cq, size, 0, FI_CQ_FORMAT_UNSPEC, + FI_WAIT_NONE); + } + if (ret == -FI_EINVAL) { FT_WARN("\nSuccessfully completed %d iterations up to " "size %d before the provider returned " @@ -123,8 +133,7 @@ cq_open_close_sizes() goto pass; } if (ret != 0) { - sprintf(err_buf, "fi_cq_open(%d, 0, FI_CQ_FORMAT_UNSPEC, " - "FI_WAIT_UNSPEC) = %d, %s", + sprintf(err_buf, "fi_cq_open with size %d returned %d, %s", size, ret, fi_strerror(-ret)); goto fail; } @@ -209,6 +218,7 @@ struct test_entry test_array[] = { static void usage(void) { ft_unit_usage("cq_test", "Unit test for Completion Queue (CQ)"); + FT_PRINT_OPTS_USAGE("-L ", "Limit of CQs to open. Default: 32k"); } int main(int argc, char **argv) @@ -220,8 +230,11 @@ int main(int argc, char **argv) if (!hints) return EXIT_FAILURE; - while ((op = getopt(argc, argv, FAB_OPTS "h")) != -1) { + while ((op = getopt(argc, argv, FAB_OPTS "hL:")) != -1) { switch (op) { + case 'L': + test_max = atoi(optarg); + break; default: ft_parseinfo(op, optarg, hints, &opts); break; diff --git a/fabtests/unit/getinfo_test.c b/fabtests/unit/getinfo_test.c index 63b2cd8734a..31bf406ca3e 100644 --- a/fabtests/unit/getinfo_test.c +++ b/fabtests/unit/getinfo_test.c @@ -45,8 +45,11 @@ getinfo_ ## name ## _desc) typedef int (*ft_getinfo_init)(struct fi_info *); -typedef int (*ft_getinfo_test)(char *, char *, uint64_t, struct fi_info *, struct fi_info **); +typedef int (*ft_getinfo_test)(char *, char *, uint64_t, struct fi_info *, + struct fi_info **); typedef int (*ft_getinfo_check)(struct fi_info *); +typedef int (*ft_getinfo_init_val)(struct fi_info *, uint64_t); +typedef int (*ft_getinfo_check_val)(struct fi_info *, uint64_t); static char err_buf[512]; static char new_prov_var[128]; @@ -110,91 +113,204 @@ static int invalid_dom(struct fi_info *hints) return 0; } -static int validate_msg_ordering_bits(char *node, char *service, uint64_t flags, - struct fi_info *hints, struct fi_info **info) +static int validate_bit_combos(char *node, char *service, uint64_t flags, + struct fi_info *hints, struct fi_info **info, uint64_t bits, + ft_getinfo_init_val init, ft_getinfo_check_val check) { int i, ret; - uint64_t ordering_bits = (FI_ORDER_STRICT | FI_ORDER_DATA); - uint64_t *msg_order_combinations; - int cnt; + uint64_t *combinations; + int cnt, fail, skipped; - ret = ft_alloc_bit_combo(0, ordering_bits, &msg_order_combinations, &cnt); + ret = ft_alloc_bit_combo(0, bits, &combinations, &cnt); if (ret) { FT_UNIT_STRERR(err_buf, "ft_alloc_bit_combo failed", ret); return ret; } - /* test for what ordering support exists on this provider */ - /* test ordering support in TX ATTRIBUTE */ - for (i = 0; i < cnt; i++) { - hints->tx_attr->msg_order = msg_order_combinations[i]; + for (i = 0, fail = skipped = 0; i < cnt; i++) { + init(hints, combinations[i]); ret = fi_getinfo(FT_FIVERSION, node, service, flags, hints, info); if (ret) { - if (ret == -FI_ENODATA) + if (ret == -FI_ENODATA) { + skipped++; continue; + } 
FT_UNIT_STRERR(err_buf, "fi_getinfo failed", ret); - goto failed_getinfo; + goto out; } - ft_foreach_info(fi, *info) { - FT_DEBUG("\nTesting for fabric: %s, domain: %s, endpoint type: %d", - fi->fabric_attr->name, fi->domain_attr->name, - fi->ep_attr->type); - if (hints->tx_attr->msg_order) { - if ((fi->tx_attr->msg_order & hints->tx_attr->msg_order) != - hints->tx_attr->msg_order) { - FT_DEBUG("tx msg_order not matching - hints: %" - PRIx64 " prov: %" PRIx64 "\n", - hints->tx_attr->msg_order, - fi->tx_attr->msg_order); - ret = -FI_EOTHER; - fi_freeinfo(*info); - goto failed_getinfo; - } + for (fi = *info; fi; fi = fi->next) { + if (check && check(fi, combinations[i])) { + FT_DEBUG("%s:failed check for caps [%s]\n", + fi->fabric_attr->prov_name, + fi_tostr(&combinations[i], + FI_TYPE_CAPS)); + ret = -FI_EIO; } } + if (ret) + fail++; + fi_freeinfo(*info); + *info = NULL; } + ret = 0; + printf("(passed)(skipped) (%d)(%d)/%d combinations\n", + cnt - (fail + skipped), skipped, cnt); +out: + fi = NULL; + ft_free_bit_combo(combinations); + return fail ? -FI_EIO : ret; +} - /* test ordering support in RX ATTRIBUTE */ - for (i = 0; i < cnt; i++) { - hints->tx_attr->msg_order = 0; - hints->rx_attr->msg_order = msg_order_combinations[i]; - ret = fi_getinfo(FT_FIVERSION, node, service, flags, hints, info); - if (ret) { - if (ret == -FI_ENODATA) - continue; - FT_UNIT_STRERR(err_buf, "fi_getinfo failed", ret); - goto failed_getinfo; - } - ft_foreach_info(fi, *info) { - FT_DEBUG("\nTesting for fabric: %s, domain: %s, endpoint type: %d", - fi->fabric_attr->name, fi->domain_attr->name, - fi->ep_attr->type); - if (hints->rx_attr->msg_order) { - if ((fi->rx_attr->msg_order & hints->rx_attr->msg_order) != - hints->rx_attr->msg_order) { - FT_DEBUG("rx msg_order not matching - hints: %" - PRIx64 " prov: %" PRIx64 "\n", - hints->rx_attr->msg_order, - fi->rx_attr->msg_order); - ret = -FI_EOTHER; - fi_freeinfo(*info); - goto failed_getinfo; - } - } - } - fi_freeinfo(*info); +#define check_has_bits(val, bits) (((val) & (bits)) != (bits)) +#define check_only_has_bits(val, bits) ((val) & ~(bits)) + +static int init_tx_order(struct fi_info *hints, uint64_t order) +{ + hints->tx_attr->msg_order = order; + return 0; +} + +static int check_tx_order(struct fi_info *info, uint64_t order) +{ + return check_has_bits(info->tx_attr->msg_order, order); +} + +static int validate_tx_ordering_bits(char *node, char *service, uint64_t flags, + struct fi_info *hints, struct fi_info **info) +{ + return validate_bit_combos(node, service, flags, hints, info, + FI_ORDER_STRICT | FI_ORDER_DATA, + init_tx_order, check_tx_order); +} + +static int init_rx_order(struct fi_info *hints, uint64_t order) +{ + hints->rx_attr->msg_order = order; + return 0; +} + +static int check_rx_order(struct fi_info *info, uint64_t order) +{ + return check_has_bits(info->rx_attr->msg_order, order); +} + +static int validate_rx_ordering_bits(char *node, char *service, uint64_t flags, + struct fi_info *hints, struct fi_info **info) +{ + return validate_bit_combos(node, service, flags, hints, info, + FI_ORDER_STRICT | FI_ORDER_DATA, + init_rx_order, check_rx_order); +} + +static int init_caps(struct fi_info *hints, uint64_t bits) +{ + hints->caps = bits; + return 0; +} + +#define PRIMARY_TX_CAPS (FI_MSG | FI_RMA | FI_TAGGED | FI_ATOMIC | \ + FI_MULTICAST | FI_NAMED_RX_CTX | FI_HMEM | \ + FI_COLLECTIVE) +#define PRIMARY_RX_CAPS (FI_MSG | FI_RMA | FI_TAGGED | FI_ATOMIC | \ + FI_DIRECTED_RECV | FI_VARIABLE_MSG | \ + FI_HMEM | FI_COLLECTIVE) + +#define PRIMARY_CAPS 
(PRIMARY_TX_CAPS | PRIMARY_RX_CAPS) +#define DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_SHARED_AV) +#define SEC_TX_CAPS (FI_TRIGGER | FI_FENCE | FI_RMA_PMEM) +#define SEC_RX_CAPS (FI_RMA_PMEM | FI_SOURCE | FI_SOURCE_ERR | \ + FI_RMA_EVENT | FI_MULTI_RECV | FI_TRIGGER) +#define MOD_TX_CAPS (FI_SEND | FI_READ | FI_WRITE) +#define MOD_RX_CAPS (FI_RECV | FI_REMOTE_READ | FI_REMOTE_WRITE) +#define OPT_TX_CAPS (MOD_TX_CAPS | SEC_TX_CAPS) +#define OPT_RX_CAPS (MOD_RX_CAPS | SEC_RX_CAPS) +#define OPT_CAPS (DOMAIN_CAPS | OPT_TX_CAPS | OPT_RX_CAPS) + +static void print_incorrect_caps(char *prov, char *attr, + uint64_t expected, uint64_t actual) +{ + FT_DEBUG("%s: %s->caps has unexpected caps -\n", prov, attr); + FT_DEBUG("expected\t[%s]\n", fi_tostr(&expected, FI_TYPE_CAPS)); + FT_DEBUG("actual\t[%s]\n", fi_tostr(&actual, FI_TYPE_CAPS)); +} + +static int check_no_extra_caps(struct fi_info *info, uint64_t caps) +{ + if (caps & check_only_has_bits(info->caps, caps | OPT_CAPS)) { + print_incorrect_caps(info->fabric_attr->prov_name, "info", + caps & PRIMARY_CAPS, info->caps & ~OPT_CAPS); + return 1; + } + if (check_only_has_bits(info->tx_attr->caps, + PRIMARY_TX_CAPS | OPT_TX_CAPS)) { + print_incorrect_caps(info->fabric_attr->prov_name, "tx_attr", + caps & PRIMARY_TX_CAPS, + info->tx_attr->caps & ~OPT_TX_CAPS); + return 1; + } + if (check_only_has_bits(info->tx_attr->caps, info->caps)) { + print_incorrect_caps(info->fabric_attr->prov_name, "tx_attr", + info->caps & (PRIMARY_TX_CAPS | OPT_TX_CAPS), + info->tx_attr->caps); + } + if (check_only_has_bits(info->rx_attr->caps, + PRIMARY_RX_CAPS | OPT_RX_CAPS)) { + print_incorrect_caps(info->fabric_attr->prov_name, "rx_attr", + caps & PRIMARY_RX_CAPS, + info->rx_attr->caps & ~OPT_RX_CAPS); + return 1; + } + if (check_only_has_bits(info->rx_attr->caps, info->caps)) { + print_incorrect_caps(info->fabric_attr->prov_name, "rx_attr", + info->caps & (PRIMARY_RX_CAPS | OPT_RX_CAPS), + info->rx_attr->caps); + return 1; + } + return 0; +} + +static int check_caps(struct fi_info *info, uint64_t caps) +{ + int ret; + + ret = check_no_extra_caps(info, caps); + if (!caps) + return ret; + + if (check_has_bits(info->caps, caps)) { + print_incorrect_caps(info->fabric_attr->prov_name, "info", + caps & PRIMARY_CAPS, info->caps & ~OPT_CAPS); + return 1; + } + if (check_has_bits(info->tx_attr->caps, caps & PRIMARY_TX_CAPS)) { + print_incorrect_caps(info->fabric_attr->prov_name, "tx_attr", + caps & PRIMARY_TX_CAPS, + info->tx_attr->caps & ~OPT_TX_CAPS); + return 1; + } + if (check_has_bits(info->rx_attr->caps, caps & PRIMARY_RX_CAPS)) { + print_incorrect_caps(info->fabric_attr->prov_name, "rx_attr", + caps & PRIMARY_RX_CAPS, + info->rx_attr->caps & ~OPT_RX_CAPS); + return 1; } - *info = NULL; - ft_free_bit_combo(msg_order_combinations); return 0; +} -failed_getinfo: - *info = NULL; - ft_free_bit_combo(msg_order_combinations); - return ret; +static int validate_primary_caps(char *node, char *service, uint64_t flags, + struct fi_info *hints, struct fi_info **info) +{ + return validate_bit_combos(node, service, flags, hints, info, + PRIMARY_TX_CAPS | PRIMARY_RX_CAPS, + init_caps, check_caps); +} + +static int test_null_hints_caps(struct fi_info *info) +{ + return check_no_extra_caps(info, 0); } static int init_valid_rma_RAW_ordering_no_set_size(struct fi_info *hints) @@ -218,11 +334,14 @@ static int init_valid_rma_RAW_ordering_set_size(struct fi_info *hints) ret = fi_getinfo(FT_FIVERSION, NULL, NULL, 0, hints, &fi); if (ret) { - sprintf(err_buf, "fi_getinfo failed %s(%d)", 
fi_strerror(-ret), -ret); + sprintf(err_buf, "fi_getinfo returned %d - %s", + -ret, fi_strerror(-ret)); return ret; } - if (fi->ep_attr->max_order_raw_size > 0) - hints->ep_attr->max_order_raw_size = fi->ep_attr->max_order_raw_size - 1; + if (fi->ep_attr->max_order_raw_size > 0) { + hints->ep_attr->max_order_raw_size = + fi->ep_attr->max_order_raw_size - 1; + } fi_freeinfo(fi); @@ -250,11 +369,14 @@ static int init_valid_rma_WAR_ordering_set_size(struct fi_info *hints) ret = fi_getinfo(FT_FIVERSION, NULL, NULL, 0, hints, &fi); if (ret) { - sprintf(err_buf, "fi_getinfo failed %s(%d)", fi_strerror(-ret), -ret); + sprintf(err_buf, "fi_getinfo returned %d - %s", + -ret, fi_strerror(-ret)); return ret; } - if (fi->ep_attr->max_order_war_size > 0) - hints->ep_attr->max_order_war_size = fi->ep_attr->max_order_war_size - 1; + if (fi->ep_attr->max_order_war_size > 0) { + hints->ep_attr->max_order_war_size = + fi->ep_attr->max_order_war_size - 1; + } fi_freeinfo(fi); @@ -281,11 +403,14 @@ static int init_valid_rma_WAW_ordering_set_size(struct fi_info *hints) hints->rx_attr->msg_order = FI_ORDER_WAW; ret = fi_getinfo(FT_FIVERSION, NULL, NULL, 0, hints, &fi); if (ret) { - sprintf(err_buf, "fi_getinfo failed %s(%d)", fi_strerror(-ret), -ret); + sprintf(err_buf, "fi_getinfo returned %d - %s", + -ret, fi_strerror(-ret)); return ret; } - if (fi->ep_attr->max_order_waw_size > 0) - hints->ep_attr->max_order_waw_size = fi->ep_attr->max_order_waw_size - 1; + if (fi->ep_attr->max_order_waw_size > 0) { + hints->ep_attr->max_order_waw_size = + fi->ep_attr->max_order_waw_size - 1; + } fi_freeinfo(fi); @@ -299,7 +424,8 @@ static int check_valid_rma_ordering_sizes(struct fi_info *info) if (info->ep_attr->max_order_raw_size <= 0) return EXIT_FAILURE; if (hints->ep_attr->max_order_raw_size) { - if (info->ep_attr->max_order_raw_size < hints->ep_attr->max_order_raw_size) + if (info->ep_attr->max_order_raw_size < + hints->ep_attr->max_order_raw_size) return EXIT_FAILURE; } } @@ -308,7 +434,8 @@ static int check_valid_rma_ordering_sizes(struct fi_info *info) if (info->ep_attr->max_order_war_size <= 0) return EXIT_FAILURE; if (hints->ep_attr->max_order_war_size) { - if (info->ep_attr->max_order_war_size < hints->ep_attr->max_order_war_size) + if (info->ep_attr->max_order_war_size < + hints->ep_attr->max_order_war_size) return EXIT_FAILURE; } } @@ -317,7 +444,8 @@ static int check_valid_rma_ordering_sizes(struct fi_info *info) if (info->ep_attr->max_order_waw_size <= 0) return EXIT_FAILURE; if (hints->ep_attr->max_order_waw_size) { - if (info->ep_attr->max_order_waw_size < hints->ep_attr->max_order_waw_size) + if (info->ep_attr->max_order_waw_size < + hints->ep_attr->max_order_waw_size) return EXIT_FAILURE; } } @@ -338,12 +466,15 @@ static int init_invalid_rma_RAW_ordering_size(struct fi_info *hints) ret = fi_getinfo(FT_FIVERSION, NULL, NULL, 0, hints, &fi); if (ret) { - sprintf(err_buf, "fi_getinfo failed %s(%d)", fi_strerror(-ret), -ret); + sprintf(err_buf, "fi_getinfo returned %d - %s", + -ret, fi_strerror(-ret)); return ret; } - if (fi->ep_attr->max_order_raw_size) - hints->ep_attr->max_order_raw_size = fi->ep_attr->max_order_raw_size + 1; + if (fi->ep_attr->max_order_raw_size) { + hints->ep_attr->max_order_raw_size = + fi->ep_attr->max_order_raw_size + 1; + } fi_freeinfo(fi); @@ -363,12 +494,15 @@ static int init_invalid_rma_WAR_ordering_size(struct fi_info *hints) ret = fi_getinfo(FT_FIVERSION, NULL, NULL, 0, hints, &fi); if (ret) { - sprintf(err_buf, "fi_getinfo failed %s(%d)", fi_strerror(-ret), -ret); + 
sprintf(err_buf, "fi_getinfo returned %d - %s", + -ret, fi_strerror(-ret)); return ret; } - if (fi->ep_attr->max_order_war_size) - hints->ep_attr->max_order_war_size = fi->ep_attr->max_order_war_size + 1; + if (fi->ep_attr->max_order_war_size) { + hints->ep_attr->max_order_war_size = + fi->ep_attr->max_order_war_size + 1; + } fi_freeinfo(fi); @@ -388,12 +522,15 @@ static int init_invalid_rma_WAW_ordering_size(struct fi_info *hints) ret = fi_getinfo(FT_FIVERSION, NULL, NULL, 0, hints, &fi); if (ret) { - sprintf(err_buf, "fi_getinfo failed %s(%d)", fi_strerror(-ret), -ret); + sprintf(err_buf, "fi_getinfo returned %d - %s", + -ret, fi_strerror(-ret)); return ret; } - if (fi->ep_attr->max_order_waw_size) - hints->ep_attr->max_order_waw_size = fi->ep_attr->max_order_waw_size + 1; + if (fi->ep_attr->max_order_waw_size) { + hints->ep_attr->max_order_waw_size = + fi->ep_attr->max_order_waw_size + 1; + } fi_freeinfo(fi); @@ -440,7 +577,8 @@ static int init_mr_unspec(struct fi_info *hints) static int test_mr_v1_0(char *node, char *service, uint64_t flags, struct fi_info *test_hints, struct fi_info **info) { - return fi_getinfo(FI_VERSION(1, 0), node, service, flags, test_hints, info); + return fi_getinfo(FI_VERSION(1, 0), node, service, flags, + test_hints, info); } static int check_mr_unspec(struct fi_info *info) @@ -450,43 +588,145 @@ static int check_mr_unspec(struct fi_info *info) EXIT_FAILURE : 0; } -static int test_mr_modes(char *node, char *service, uint64_t flags, - struct fi_info *hints, struct fi_info **info) +static int init_mr_mode(struct fi_info *hints, uint64_t mode) { - struct fi_info *fi; - uint64_t *mr_modes; - int i, cnt, ret; + hints->domain_attr->mr_mode = (uint32_t) mode; + return 0; +} + +static int check_mr_mode(struct fi_info *info, uint64_t mode) +{ + return check_only_has_bits(info->domain_attr->mr_mode, mode); +} - ret = ft_alloc_bit_combo(0, FI_MR_LOCAL | FI_MR_RAW | FI_MR_VIRT_ADDR | +static int validate_mr_modes(char *node, char *service, uint64_t flags, + struct fi_info *hints, struct fi_info **info) +{ + uint64_t mode_bits = FI_MR_LOCAL | FI_MR_RAW | FI_MR_VIRT_ADDR | FI_MR_ALLOCATED | FI_MR_PROV_KEY | FI_MR_MMU_NOTIFY | - FI_MR_RMA_EVENT | FI_MR_ENDPOINT, &mr_modes, &cnt); + FI_MR_RMA_EVENT | FI_MR_ENDPOINT; + + return validate_bit_combos(node, service, flags, hints, info, mode_bits, + init_mr_mode, check_mr_mode); +} + +/* + * Progress checks + */ +static int init_data_manual(struct fi_info *hints) +{ + hints->domain_attr->data_progress = FI_PROGRESS_MANUAL; + return 0; +} + +static int init_data_auto(struct fi_info *hints) +{ + hints->domain_attr->data_progress = FI_PROGRESS_AUTO; + return 0; +} + +static int init_ctrl_manual(struct fi_info *hints) +{ + hints->domain_attr->control_progress = FI_PROGRESS_MANUAL; + return 0; +} + +static int init_ctrl_auto(struct fi_info *hints) +{ + hints->domain_attr->control_progress = FI_PROGRESS_AUTO; + return 0; +} + +static int check_data_manual(struct fi_info *info) +{ + return (info->domain_attr->data_progress != FI_PROGRESS_MANUAL) ? + EXIT_FAILURE : 0; +} + +static int check_data_auto(struct fi_info *info) +{ + return (info->domain_attr->data_progress != FI_PROGRESS_AUTO) ? + EXIT_FAILURE : 0; +} + +static int check_ctrl_manual(struct fi_info *info) +{ + return (info->domain_attr->control_progress != FI_PROGRESS_MANUAL) ? + EXIT_FAILURE : 0; +} + +static int check_ctrl_auto(struct fi_info *info) +{ + return (info->domain_attr->control_progress != FI_PROGRESS_AUTO) ? 
+ EXIT_FAILURE : 0; +} + + +static int init_domain_caps(struct fi_info *hints, uint64_t caps) +{ + hints->domain_attr->caps = caps; + return 0; +} + +static int check_domain_caps(struct fi_info *info, uint64_t caps) +{ + return check_has_bits(info->domain_attr->caps, caps); +} + +static int validate_domain_caps(char *node, char *service, uint64_t flags, + struct fi_info *hints, struct fi_info **info) +{ + return validate_bit_combos(node, service, flags, hints, info, + FI_LOCAL_COMM | FI_REMOTE_COMM | FI_SHARED_AV, + init_domain_caps, check_domain_caps); +} + +/* Some apps (MPI) request all fi_info structures, and use the output to + * form the hints for a second call. This usage breaks if the provider + * adds a new capability bit that also requires setting a mode or mr_mode + * bit (new or otherwise), which the app does not set. + * This is really a problem with the app, but avoid a regression + * by verifying that providers do not add new requirements for apps that + * inadvertently pick up a new capability bit. + */ +static int test_caps_regression(char *node, char *service, uint64_t flags, + struct fi_info *hints, struct fi_info **info) +{ + struct fi_info *fi; + int ret; + + ret = fi_getinfo(FT_FIVERSION, node, service, flags, NULL, info); if (ret) return ret; - for (i = 0; i < cnt; i++) { - hints->domain_attr->mr_mode = (uint32_t) mr_modes[i]; - ret = fi_getinfo(FT_FIVERSION, node, service, flags, hints, info); - if (ret) { - if (ret == -FI_ENODATA) - continue; - FT_UNIT_STRERR(err_buf, "fi_getinfo failed", ret); - goto out; - } - - ft_foreach_info(fi, *info) { - if (fi->domain_attr->mr_mode & ~hints->domain_attr->mr_mode) { - ret = -FI_EOTHER; - fi_freeinfo(*info); - goto out; - } + if (!hints || !hints->fabric_attr || !hints->fabric_attr->prov_name) { + fi = *info; + } else { + for (fi = *info; fi; fi = fi->next) { + if (!strcasecmp(hints->fabric_attr->prov_name, + (*info)->fabric_attr->prov_name)) + break; } - fi_freeinfo(*info); } -out: + if (!fi) + return 0; + + /* Limit mode bits to common, older options only */ + hints->caps |= fi->caps; + hints->mode = FI_CONTEXT; + hints->domain_attr->mr_mode = FI_MR_LOCAL | OFI_MR_BASIC_MAP; + + fi_freeinfo(*info); *info = NULL; - ft_free_bit_combo(mr_modes); - return ret; + + ret = fi_getinfo(FT_FIVERSION, node, service, flags, hints, info); + if (ret) { + printf("regression: new mode/mr_mode bits required..."); + return -FI_EINVAL; + } + + return 0; } @@ -494,8 +734,8 @@ static int test_mr_modes(char *node, char *service, uint64_t flags, * getinfo test */ static int getinfo_unit_test(char *node, char *service, uint64_t flags, - struct fi_info *base_hints, ft_getinfo_init init, ft_getinfo_test test, - ft_getinfo_check check, int ret_exp) + struct fi_info *base_hints, ft_getinfo_init init, + ft_getinfo_test test, ft_getinfo_check check, int ret_exp) { struct fi_info *info = NULL, *fi, *test_hints = NULL; int ret; @@ -512,27 +752,30 @@ static int getinfo_unit_test(char *node, char *service, uint64_t flags, goto out; } - if (test) + if (test) { ret = test(node, service, flags, test_hints, &info); - else - ret = fi_getinfo(FT_FIVERSION, node, service, flags, test_hints, &info); + } else { + ret = fi_getinfo(FT_FIVERSION, node, service, flags, + test_hints, &info); + } if (ret) { if (ret == ret_exp) { ret = 0; goto out; } - sprintf(err_buf, "fi_getinfo failed %s(%d)", fi_strerror(-ret), -ret); + sprintf(err_buf, "fi_getinfo returned %d - %s", + -ret, fi_strerror(-ret)); goto out; } if (!info || !check) goto out; - ft_foreach_info(fi, info) { + 
for (fi = info; fi; fi = fi->next) { FT_DEBUG("\nTesting for fabric: %s, domain: %s, endpoint type: %d", - fi->fabric_attr->name, fi->domain_attr->name, - fi->ep_attr->type); - ret = check(info); + fi->fabric_attr->prov_name, fi->domain_attr->name, + fi->ep_attr->type); + ret = check(fi); if (ret) break; } @@ -542,19 +785,19 @@ static int getinfo_unit_test(char *node, char *service, uint64_t flags, return ret; } -#define getinfo_test(name, num, desc, node, service, flags, hints, init, test, check, \ - ret_exp) \ -char *getinfo_ ## name ## num ## _desc = desc; \ -static int getinfo_ ## name ## num(void) \ -{ \ - int ret, testret = FAIL; \ - ret = getinfo_unit_test(node, service, flags, hints, init, test, check, \ - ret_exp); \ - if (ret) \ - goto fail; \ - testret = PASS; \ -fail: \ - return TEST_RET_VAL(ret, testret); \ +#define getinfo_test(name, num, desc, node, service, flags, hints, \ + init, test, check, ret_exp) \ +char *getinfo_ ## name ## num ## _desc = desc; \ +static int getinfo_ ## name ## num(void) \ +{ \ + int ret, testret = FAIL; \ + ret = getinfo_unit_test(node, service, flags, hints, init, \ + test, check, ret_exp); \ + if (ret) \ + goto fail; \ + testret = PASS; \ +fail: \ + return TEST_RET_VAL(ret, testret); \ } /* @@ -570,82 +813,85 @@ fail: \ /* 1.1 Source address only tests */ getinfo_test(no_hints, 1, "Test with no node, service, flags or hints", - NULL, NULL, 0, NULL, NULL, NULL, check_srcaddr, 0) -getinfo_test(no_hints, 2, "Test with node, no service, FI_SOURCE flag and no hints", - opts.src_addr ? opts.src_addr : "localhost", NULL, FI_SOURCE, - NULL, NULL, NULL, check_srcaddr, 0) -getinfo_test(no_hints, 3, "Test with service, FI_SOURCE flag and no node or hints", + NULL, NULL, 0, NULL, NULL, NULL, check_srcaddr, 0) +getinfo_test(no_hints, 2, "Test with node, no service, FI_SOURCE flag, no hints", + opts.src_addr ? opts.src_addr : "localhost", NULL, FI_SOURCE, + NULL, NULL, NULL, check_srcaddr, 0) +getinfo_test(no_hints, 3, "Test with service, FI_SOURCE flag, no node, no hints", NULL, opts.src_port, FI_SOURCE, NULL, NULL, - NULL, check_srcaddr, 0) // TODO should we check for wildcard addr? -getinfo_test(no_hints, 4, "Test with node, service, FI_SOURCE flags and no hints", - opts.src_addr ? opts.src_addr : "localhost", opts.src_port, - FI_SOURCE, NULL, NULL, NULL, check_srcaddr, 0) + NULL, check_srcaddr, 0) +getinfo_test(no_hints, 4, "Test with node, service, FI_SOURCE flag, no hints", + opts.src_addr ? opts.src_addr : "localhost", opts.src_port, + FI_SOURCE, NULL, NULL, NULL, check_srcaddr, 0) /* 1.2 Source and destination address tests */ -getinfo_test(no_hints, 5, "Test with node, service and no hints", - opts.dst_addr ? opts.dst_addr : "localhost", opts.dst_port, - 0, NULL, NULL, NULL, check_src_dest_addr, 0) +getinfo_test(no_hints, 5, "Test with node, service, no hints", + opts.dst_addr ? opts.dst_addr : "localhost", opts.dst_port, + 0, NULL, NULL, NULL, check_src_dest_addr, 0) -/* 2. Test with hints */ +/* 2. Tests, most with hints */ /* 2.1 Source address only tests */ getinfo_test(src, 1, "Test with no node, service, or flags", - NULL, NULL, 0, hints, NULL, NULL, check_srcaddr, 0) + NULL, NULL, 0, hints, NULL, NULL, check_srcaddr, 0) getinfo_test(src, 2, "Test with node, no service, FI_SOURCE flag", - opts.src_addr ? 
opts.src_addr : "localhost", NULL, FI_SOURCE, - hints, NULL, NULL, check_srcaddr, 0) -getinfo_test(src, 3, "Test with service, FI_SOURCE flag and no node", - NULL, opts.src_port, FI_SOURCE, hints, NULL, - NULL, check_srcaddr, 0) // TODO should we check for wildcard addr? -getinfo_test(src, 4, "Test with node, service, FI_SOURCE flags", - opts.src_addr ? opts.src_addr : "localhost", opts.src_port, - FI_SOURCE, hints, NULL, NULL, check_srcaddr, 0) + opts.src_addr ? opts.src_addr : "localhost", NULL, FI_SOURCE, + hints, NULL, NULL, check_srcaddr, 0) +getinfo_test(src, 3, "Test with service, FI_SOURCE flag, no node", + NULL, opts.src_port, FI_SOURCE, hints, NULL, + NULL, check_srcaddr, 0) +getinfo_test(src, 4, "Test with node, service, FI_SOURCE flag", + opts.src_addr ? opts.src_addr : "localhost", opts.src_port, + FI_SOURCE, hints, NULL, NULL, check_srcaddr, 0) /* 2.2 Source and destination address tests */ getinfo_test(src_dest, 1, "Test with node, service", - opts.dst_addr ? opts.dst_addr : "localhost", opts.dst_port, - 0, hints, NULL, NULL, check_src_dest_addr, 0) + opts.dst_addr ? opts.dst_addr : "localhost", opts.dst_port, + 0, hints, NULL, NULL, check_src_dest_addr, 0) getinfo_test(src_dest, 2, "Test API version", - NULL, NULL, 0, hints, NULL, NULL, check_api_version , 0) + NULL, NULL, 0, hints, NULL, NULL, check_api_version , 0) /* Negative tests */ getinfo_test(neg, 1, "Test with non-existent domain name", - NULL, NULL, 0, hints, invalid_dom, NULL, NULL, -FI_ENODATA) + NULL, NULL, 0, hints, invalid_dom, NULL, NULL, -FI_ENODATA) /* Utility provider tests */ getinfo_test(util, 1, "Test if we get utility provider when requested", - NULL, NULL, 0, hints, NULL, NULL, check_util_prov, 0) + NULL, NULL, 0, hints, NULL, NULL, check_util_prov, 0) /* Message Ordering Tests */ -getinfo_test(msg_ordering, 1, "Test msg ordering bits supported are set", - NULL, NULL, 0, hints, NULL, validate_msg_ordering_bits, NULL, 0) +getinfo_test(msg_ordering, 1, "Test tx ordering bits supported are set", + NULL, NULL, 0, hints, NULL, validate_tx_ordering_bits, NULL, 0) +getinfo_test(msg_ordering, 2, "Test rx ordering bits supported are set", + NULL, NULL, 0, hints, NULL, validate_rx_ordering_bits, NULL, 0) + getinfo_test(raw_ordering, 1, "Test rma RAW ordering size is set", - NULL, NULL, 0, hints, init_valid_rma_RAW_ordering_no_set_size, - NULL, check_valid_rma_ordering_sizes, 0) + NULL, NULL, 0, hints, init_valid_rma_RAW_ordering_no_set_size, + NULL, check_valid_rma_ordering_sizes, 0) getinfo_test(raw_ordering, 2, "Test rma RAW ordering size is set to hints", - NULL, NULL, 0, hints, init_valid_rma_RAW_ordering_set_size, - NULL, check_valid_rma_ordering_sizes, 0) + NULL, NULL, 0, hints, init_valid_rma_RAW_ordering_set_size, + NULL, check_valid_rma_ordering_sizes, 0) getinfo_test(war_ordering, 1, "Test rma WAR ordering size is set", - NULL, NULL, 0, hints, init_valid_rma_WAR_ordering_no_set_size, - NULL, check_valid_rma_ordering_sizes, 0) + NULL, NULL, 0, hints, init_valid_rma_WAR_ordering_no_set_size, + NULL, check_valid_rma_ordering_sizes, 0) getinfo_test(war_ordering, 2, "Test rma WAR ordering size is set to hints", - NULL, NULL, 0, hints, init_valid_rma_WAR_ordering_set_size, - NULL, check_valid_rma_ordering_sizes, 0) + NULL, NULL, 0, hints, init_valid_rma_WAR_ordering_set_size, + NULL, check_valid_rma_ordering_sizes, 0) getinfo_test(waw_ordering, 1, "Test rma WAW ordering size is set", - NULL, NULL, 0, hints, init_valid_rma_WAW_ordering_no_set_size, - NULL, check_valid_rma_ordering_sizes, 0) + NULL, NULL, 0, 
hints, init_valid_rma_WAW_ordering_no_set_size, + NULL, check_valid_rma_ordering_sizes, 0) getinfo_test(waw_ordering, 2, "Test rma WAW ordering size is set to hints", - NULL, NULL, 0, hints, init_valid_rma_WAW_ordering_set_size, - NULL, check_valid_rma_ordering_sizes, 0) + NULL, NULL, 0, hints, init_valid_rma_WAW_ordering_set_size, + NULL, check_valid_rma_ordering_sizes, 0) getinfo_test(bad_raw_ordering, 1, "Test invalid rma RAW ordering size", - NULL, NULL, 0, hints, init_invalid_rma_RAW_ordering_size, - NULL, NULL, -FI_ENODATA) + NULL, NULL, 0, hints, init_invalid_rma_RAW_ordering_size, + NULL, NULL, -FI_ENODATA) getinfo_test(bad_war_ordering, 1, "Test invalid rma WAR ordering size", - NULL, NULL, 0, hints, init_invalid_rma_WAR_ordering_size, - NULL, NULL, -FI_ENODATA) + NULL, NULL, 0, hints, init_invalid_rma_WAR_ordering_size, + NULL, NULL, -FI_ENODATA) getinfo_test(bad_waw_ordering, 1, "Test invalid rma WAW ordering size", - NULL, NULL, 0, hints, init_invalid_rma_WAW_ordering_size, - NULL, NULL, -FI_ENODATA) + NULL, NULL, 0, hints, init_invalid_rma_WAW_ordering_size, + NULL, NULL, -FI_ENODATA) /* MR mode tests */ getinfo_test(mr_mode, 1, "Test FI_MR_BASIC", NULL, NULL, 0, @@ -657,15 +903,37 @@ getinfo_test(mr_mode, 3, "Test FI_MR_UNSPEC (v1.0)", NULL, NULL, 0, getinfo_test(mr_mode, 4, "Test FI_MR_BASIC (v1.0)", NULL, NULL, 0, hints, init_mr_basic, test_mr_v1_0, check_mr_basic, -FI_ENODATA) getinfo_test(mr_mode, 5, "Test FI_MR_SCALABLE (v1.0)", NULL, NULL, 0, - hints, init_mr_scalable, test_mr_v1_0, check_mr_scalable, -FI_ENODATA) + hints, init_mr_scalable, test_mr_v1_0, check_mr_scalable, + -FI_ENODATA) getinfo_test(mr_mode, 6, "Test mr_mode bits", NULL, NULL, 0, - hints, NULL, test_mr_modes, NULL, 0) + hints, NULL, validate_mr_modes, NULL, 0) + +/* Progress tests */ +getinfo_test(progress, 1, "Test data manual progress", NULL, NULL, 0, + hints, init_data_manual, NULL, check_data_manual, 0) +getinfo_test(progress, 2, "Test data auto progress", NULL, NULL, 0, + hints, init_data_auto, NULL, check_data_auto, 0) +getinfo_test(progress, 3, "Test ctrl manual progress", NULL, NULL, 0, + hints, init_ctrl_manual, NULL, check_ctrl_manual, 0) +getinfo_test(progress, 4, "Test ctrl auto progress", NULL, NULL, 0, + hints, init_ctrl_auto, NULL, check_ctrl_auto, 0) + +/* Capability test */ +getinfo_test(caps, 1, "Test capability bits supported are set", + NULL, NULL, 0, hints, NULL, validate_primary_caps, NULL, 0) +getinfo_test(caps, 2, "Test capability with no hints", + NULL, NULL, 0, NULL, NULL, NULL, test_null_hints_caps, 0) +getinfo_test(caps, 3, "Test domain capabilities", NULL, NULL, 0, + hints, NULL, validate_domain_caps, NULL, 0) +getinfo_test(caps, 4, "Test for capability bit regression", + NULL, NULL, 0, hints, NULL, test_caps_regression, NULL, 0) static void usage(void) { ft_unit_usage("getinfo_test", "Unit tests for fi_getinfo"); - FT_PRINT_OPTS_USAGE("-e ", "Endpoint type: msg|rdm|dgram (default:rdm)"); + FT_PRINT_OPTS_USAGE("-e ", + "Endpoint type: msg|rdm|dgram (default:rdm)"); ft_addr_usage(); } @@ -718,6 +986,7 @@ int main(int argc, char **argv) TEST_ENTRY_GETINFO(src_dest1), TEST_ENTRY_GETINFO(src_dest2), TEST_ENTRY_GETINFO(msg_ordering1), + TEST_ENTRY_GETINFO(msg_ordering2), TEST_ENTRY_GETINFO(raw_ordering1), TEST_ENTRY_GETINFO(raw_ordering2), TEST_ENTRY_GETINFO(war_ordering1), @@ -734,6 +1003,14 @@ int main(int argc, char **argv) TEST_ENTRY_GETINFO(mr_mode4), TEST_ENTRY_GETINFO(mr_mode5), TEST_ENTRY_GETINFO(mr_mode6), + TEST_ENTRY_GETINFO(progress1), + 
TEST_ENTRY_GETINFO(progress2), + TEST_ENTRY_GETINFO(progress3), + TEST_ENTRY_GETINFO(progress4), + TEST_ENTRY_GETINFO(caps1), + TEST_ENTRY_GETINFO(caps2), + TEST_ENTRY_GETINFO(caps3), + TEST_ENTRY_GETINFO(caps4), { NULL, "" } }; @@ -771,15 +1048,16 @@ int main(int argc, char **argv) opts.src_port = "9228"; hints->mode = ~0; + hints->domain_attr->mr_mode = opts.mr_mode; if (hints->fabric_attr->prov_name) { if (set_prov(hints->fabric_attr->prov_name)) return EXIT_FAILURE; } else { - FT_WARN("\nTests getinfo1 to getinfo5 may not run exclusively " - "for a particular provider since we don't pass hints.\n" - "So the failures in any of those tests may not be " - "attributable to a single provider.\n"); + FT_WARN("\nSome tests do not pass in hints, and may not run " + "exclusively for a particular provider.\n" + "Failures in any of those tests may not be " + "attributable to a specific provider.\n"); } failed = run_tests(no_hint_tests, err_buf); diff --git a/fabtests/unit/mr_cache_evict.c b/fabtests/unit/mr_cache_evict.c new file mode 100644 index 00000000000..4e72e0f7529 --- /dev/null +++ b/fabtests/unit/mr_cache_evict.c @@ -0,0 +1,844 @@ +/* + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "unit_common.h" +#include "shared.h" +#include "hmem.h" + +/* Supported memory region types. */ +enum alloc_type { + MMAP, + BRK, + SBRK, + CUDA, + ROCR, +}; + +static void *reuse_addr = NULL; +static char err_buf[512]; +static size_t mr_buf_size = 16384; + +/* Given a time value, determine the expected cached time value. The assumption + * is the cache value should at least have a CACHE_IMPROVEMENT_PERCENT time + * improvement over the original time value. + */ +#define CACHE_IMPROVEMENT_PERCENT 80 +#define CACHE_TIME_MAX_VALUE(time) ((time) / 100 * \ + (100 - CACHE_IMPROVEMENT_PERCENT)) + +#define PAGEMAP_ENTRY_SIZE 8 +#define PAGEMAP_PFN_PRESENT (1ULL << 63) +#define PAGEMAP_PFN_MASK ((1ULL << 55) - 1) + +/* Function used to get physical address from a virtual address. Must be root + * to read pagemap. 
+ */ +static int virt_to_phys(const void *va_addr, uint64_t *phy_addr) +{ + int fd; + int ret; + uint64_t entry; + ssize_t read_size; + off_t seek_ret; + off_t seek_offset; + int page_size; + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd == -1) { + ret = -errno; + FT_UNIT_STRERR(err_buf, "open of /proc/self/pagemap failed", + ret); + return ret; + } + + page_size = sysconf(_SC_PAGESIZE); + if (page_size == -1) { + ret = -errno; + FT_UNIT_STRERR(err_buf, "sysconf(_SC_PAGESIZE) failed", ret); + goto out; + } + + /* Map virtual address to offset in pagemap. */ + seek_offset = (uintptr_t) va_addr / page_size * PAGEMAP_ENTRY_SIZE; + + seek_ret = lseek(fd, seek_offset, SEEK_SET); + if (seek_ret == -1) { + ret = -errno; + FT_UNIT_STRERR(err_buf, "lseek failed", ret); + goto out; + } + + read_size = read(fd, &entry, sizeof(entry)); + if (read_size == -1) { + ret = -errno; + FT_UNIT_STRERR(err_buf, "read failed", ret); + goto out; + } else if (read_size != sizeof(entry)) { + ret = -ENOSPC; + FT_UNIT_STRERR(err_buf, "short read", ret); + goto out; + } + + if (entry & PAGEMAP_PFN_PRESENT) { + ret = 0; + *phy_addr = (entry & PAGEMAP_PFN_MASK) * page_size; + } else { + ret = -EFAULT; + FT_UNIT_STRERR(err_buf, "Failed to find physical address", ret); + } + +out: + close(fd); + + return ret; +} + +/* Sbrk/brk allocations are only intended to support a single outstanding + * allocation at a time. Extra handling of the program break is needed to make + * sbrk/brk allocations more flexible including making allocations thread safe. + */ +static void sbrk_free(void *ptr) +{ + void *cur_brk = (void *) ((uint64_t) ptr + mr_buf_size); + void *rewind_brk = ptr; + + FT_DEBUG("Resetting program break from %p to %p", cur_brk, rewind_brk); + cur_brk = sbrk(-(intptr_t) mr_buf_size); + if (cur_brk == (void *) -1) { + FT_UNIT_STRERR(err_buf, "sbrk failed", -errno); + return; + } + + /* Verify the program break was reset to the expected location. */ + cur_brk = sbrk(0); + if (cur_brk == (void *) -1) { + FT_UNIT_STRERR(err_buf, "sbrk failed", -errno); + return; + } + + if (cur_brk != rewind_brk) + FT_UNIT_STRERR(err_buf, "Failed to reset program break", + -ENOMEM); +} + +static void *sbrk_alloc(void) +{ + void *prev_brk; + void *cur_brk; + + prev_brk = sbrk((intptr_t) mr_buf_size); + if (prev_brk == (void *) -1) { + FT_UNIT_STRERR(err_buf, "sbrk failed", -errno); + return NULL; + } + + /* Determine the size of the newly allocated buffer. If this operation + * fails, memory is leaked. + */ + cur_brk = sbrk(0); + if (cur_brk == (void *) -1) { + FT_UNIT_STRERR(err_buf, "sbrk failed", -errno); + return NULL; + } + + FT_DEBUG("Moved program break from %p to %p", prev_brk, cur_brk); + + return prev_brk; +} + +static void brk_free(void *ptr) +{ + void *cur_brk = (void *) ((uint64_t) ptr + mr_buf_size); + void *rewind_brk = ptr; + int ret; + + FT_DEBUG("Resetting program break from %p to %p", cur_brk, rewind_brk); + ret = brk(rewind_brk); + if (ret) { + FT_UNIT_STRERR(err_buf, "brk failed", -errno); + return; + } + + /* Verify the program break was reset to the expected location. */ + cur_brk = sbrk(0); + if (cur_brk == (void *) -1) { + FT_UNIT_STRERR(err_buf, "sbrk failed", -errno); + return; + } + + if (cur_brk != rewind_brk) + FT_UNIT_STRERR(err_buf, "Failed to reset program break", + -ENOMEM); +} + +static void *brk_alloc(void) +{ + void *prev_brk; + void *cur_brk; + int ret; + + /* Use sbrk to determine the current program break. This is needed to + * determine the brk allocation size.
+ */ + prev_brk = sbrk(0); + if (prev_brk == (void *) -1) { + FT_UNIT_STRERR(err_buf, "sbrk failed", -errno); + return NULL; + } + + cur_brk = (void *) ((intptr_t) prev_brk + mr_buf_size); + ret = brk(cur_brk); + if (ret) { + FT_UNIT_STRERR(err_buf, "brk failed", -errno); + return NULL; + } + + /* Determine the size of the newly allocated buffer. If this operation + * fails, memory is leaked. + */ + cur_brk = sbrk(0); + if (cur_brk == (void *) -1) { + FT_UNIT_STRERR(err_buf, "sbrk failed", -errno); + return NULL; + } + + FT_DEBUG("Moved program break from %p to %p", prev_brk, cur_brk); + + return prev_brk; +} + +/* Mmap allocations are only intended to support a single outstanding + * allocation at a time. Extra handling of the mmap reuse address needs to occur + * to make mmap allocations more flexible including making allocations thread + * safe. + */ +static void mmap_free(void *ptr) +{ + if (munmap(ptr, mr_buf_size)) + FT_UNIT_STRERR(err_buf, "munmap failed", -errno); +} + +static void *mmap_alloc(void) +{ + void *ptr; + int flags = MAP_ANONYMOUS | MAP_PRIVATE; + + /* If a reuse address is defined, request MAP_FIXED to require the mmap + * allocation to reuse this address. + */ + if (reuse_addr) + flags |= MAP_FIXED; + + ptr = mmap(reuse_addr, mr_buf_size, PROT_READ | PROT_WRITE, flags, -1, + 0); + if (ptr == MAP_FAILED) { + FT_UNIT_STRERR(err_buf, "mmap failed", -errno); + return NULL; + } + + /* Cache this virtual address to reuse for future allocations. */ + reuse_addr = ptr; + + return ptr; +} + +static void rocr_free(void *ptr) +{ + ft_hmem_free(FI_HMEM_ROCR, ptr); +} + +static void *rocr_malloc(void) +{ + int ret; + void *ptr; + + ret = ft_hmem_alloc(FI_HMEM_ROCR, 0, &ptr, mr_buf_size); + if (ret) + return NULL; + return ptr; +} + + +static void cuda_free(void *ptr) +{ + ft_hmem_free(FI_HMEM_CUDA, ptr); +} + +static void *cuda_malloc(void) +{ + int ret; + void *ptr; + + ret = ft_hmem_alloc(FI_HMEM_CUDA, 0, &ptr, mr_buf_size); + if (ret) + return NULL; + return ptr; +} + +/* Generic allocation/deallocation function. Only a single allocation of any + * type should be outstanding. + */ +static void mem_free(void *ptr, enum alloc_type type) +{ + switch (type) { + case SBRK: + sbrk_free(ptr); + break; + case MMAP: + mmap_free(ptr); + break; + case BRK: + brk_free(ptr); + break; + case CUDA: + cuda_free(ptr); + break; + case ROCR: + rocr_free(ptr); + break; + default: + return; + } + + FT_DEBUG("Memory freed: va=%p", ptr); +} + +static enum fi_hmem_iface alloc_type_to_iface(enum alloc_type type) +{ + switch (type) { + case CUDA: + return FI_HMEM_CUDA; + case ROCR: + return FI_HMEM_ROCR; + default: + return FI_HMEM_SYSTEM; + } +} + +/* User defined global mr_buf_size controls allocation size. */ +static void *mem_alloc(enum alloc_type type) +{ + uint64_t phys_addr = 0; + void *ptr; + int ret; + + switch (type) { + case SBRK: + ptr = sbrk_alloc(); + break; + case MMAP: + ptr = mmap_alloc(); + break; + case BRK: + ptr = brk_alloc(); + break; + case CUDA: + ptr = cuda_malloc(); + break; + case ROCR: + ptr = rocr_malloc(); + break; + default: + return NULL; + } + + if (ptr) { + if (geteuid() == 0 && + alloc_type_to_iface(type) == FI_HMEM_SYSTEM) { + /* Perform a write to the buffer to ensure the kernel + * has faulted in a page for this allocation. This will + * help prevent virt_to_phys() from returning an error + * due to no PFN. 
+ */ + *(uint8_t *) ptr = 0; + ret = virt_to_phys(ptr, &phys_addr); + if (ret) + FT_DEBUG("virt_to_phys() failed: %s", + fi_strerror(-ret)); + } + + FT_DEBUG("Memory allocated: va=%p size=%lu phys_addr=0x%lx", + ptr, mr_buf_size, phys_addr); + } + + return ptr; +} + +/* MR registration function which returns the MR and the elapsed time, in + * nanoseconds, to register the MR. + */ +static int mr_register(const void *buf, struct fid_mr **mr, int64_t *elapsed, + enum fi_hmem_iface iface) +{ + int ret; + const struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = mr_buf_size, + }; + struct fi_mr_attr mr_attr = { + .mr_iov = &iov, + .iov_count = 1, + .access = ft_info_to_mr_access(fi), + .requested_key = FT_MR_KEY, + .iface = iface, + }; + + ft_start(); + ret = fi_mr_regattr(domain, &mr_attr, 0, mr); + ft_stop(); + + if (ret != FI_SUCCESS) { + FT_UNIT_STRERR(err_buf, "fi_mr_regattr failed", ret); + return ret; + } + + *elapsed = get_elapsed(&start, &end, NANO); + + return 0; +} + +/* Run a test verifying the eviction of MR cache entries. The following is how the + * test works: + * 1. Prime CPU caches by registering a priming MR. This MR is not used for + * cache measurements. + * + * 2. Allocate a buffer using mem_alloc() with either MMAP, BRK, SBRK, CUDA, or ROCR. The + * mem_alloc() allocator is constructed to return the same virtual address + * during buffer reallocation. + * + * 3. Measure MR registration time of the mem_alloc() buffer. Since this buffer + * has not been previously registered, the elapsed time for this MR + * registration should be long. This is referred to as the initial MR + * registration time. + * + * 4. Measure MR registration time of the mem_alloc() buffer again. Since this + * buffer has been previously registered, the elapsed time for this MR + * registration should be significantly less than the initial MR registration + * time. If the registration time is not significantly less, it is assumed + * the provider does not support MR caching, and the test will exit. This + * elapsed time is referred to as the cached MR registration time. + * + * 5. If the provider supports caching, the mem_alloc() buffer is freed and + * reallocated. Measures are in place to have the reallocated mem_alloc() + * buffer return the same virtual address. During this time, the + * provider's MR cache should experience an eviction. + * + * 6. Measure MR registration time of the mem_alloc() buffer a third time. Since + * the provider should have experienced an MR cache eviction, the elapsed time + * for this MR registration should not be significantly less than the initial + * MR registration time. If this registration time is significantly less, it is + * assumed this MR registration incorrectly found a cached MR entry. This + * elapsed time is referred to as the reallocated MR registration time. + */ +static int mr_cache_test(enum alloc_type type) +{ + void *prime_buf = NULL; + struct fid_mr *prime_mr = NULL; + void *buf = NULL; + struct fid_mr *mr = NULL; + int64_t mr_reg_time; + struct fid_mr *cached_mr = NULL; + int64_t cached_mr_reg_time; + struct fid_mr *realloc_mr = NULL; + int64_t realloc_mr_reg_time; + int ret; + void *prev_buf; + int testret = FAIL; + enum fi_hmem_iface iface = alloc_type_to_iface(type); + + /* Reallocate the domain to reset the MR cache.
*/ + if (!domain) { + ret = -EINVAL; + FT_UNIT_STRERR(err_buf, "no domain allocated", ret); + goto cleanup; + } + + ret = fi_close(&domain->fid); + if (ret) { + FT_UNIT_STRERR(err_buf, "Failed to close the domain", ret); + domain = NULL; + goto cleanup; + } + + ret = fi_domain(fabric, fi, &domain, NULL); + if (ret) { + FT_UNIT_STRERR(err_buf, "fi_domain failed", ret); + domain = NULL; + goto cleanup; + } + + /* A priming MR registration is used to ensure the first timed MR + * registration does not take into account the setting up of CPU caches. + */ + switch (iface) { + case FI_HMEM_CUDA: + prime_buf = cuda_malloc(); + if (!prime_buf) { + ret = -ENOMEM; + FT_UNIT_STRERR(err_buf, "cuda_malloc failed", ret); + goto cleanup; + } + break; + + case FI_HMEM_ROCR: + prime_buf = rocr_malloc(); + if (!prime_buf) { + ret = -ENOMEM; + FT_UNIT_STRERR(err_buf, "rocr_malloc failed", ret); + goto cleanup; + } + break; + + default: + prime_buf = malloc(mr_buf_size); + if (!prime_buf) { + ret = -ENOMEM; + FT_UNIT_STRERR(err_buf, "malloc failed", ret); + goto cleanup; + } + break; + } + + ret = mr_register(prime_buf, &prime_mr, &mr_reg_time, iface); + if (ret) { + FT_UNIT_STRERR(err_buf, "mr_register failed", ret); + goto cleanup; + } + + /* Perform initial MR registration. MR registration elapsed time is + * recorded for future comparison. + */ + buf = mem_alloc(type); + if (!buf) { + ret = -ENOMEM; + FT_UNIT_STRERR(err_buf, "mem_alloc failed", ret); + goto cleanup; + } + + ret = mr_register(buf, &mr, &mr_reg_time, iface); + if (ret) { + FT_UNIT_STRERR(err_buf, "mr_register failed", ret); + goto cleanup; + } + + FT_DEBUG("Initial MR registration time: %ld nsecs", mr_reg_time); + + /* Perform another allocation using the same buffer. This should hit the + * MR cache. + */ + ret = mr_register(buf, &cached_mr, &cached_mr_reg_time, iface); + if (ret) { + FT_UNIT_STRERR(err_buf, "mr_register failed", ret); + goto cleanup; + } + + FT_DEBUG("Cached MR registration time: %ld nsecs", cached_mr_reg_time); + + /* If cached allocation is not within the expected duration, assume the + * provider does not support MR caching. + */ + if (cached_mr_reg_time > CACHE_TIME_MAX_VALUE(mr_reg_time)) { + ret = -FI_ENOSYS; + sprintf(err_buf, "Assuming MR cache not enabled by provider"); + goto cleanup; + } + + /* Free the buffer without freeing the MR. This should result in the MR + * cache evicting/invalidating the MR entry. The buffer will then be + * reallocated and re-registered. The newly registered MR should not + * have been cached. + */ + prev_buf = buf; + mem_free(buf, type); + + buf = mem_alloc(type); + if (!buf) { + ret = -ENOMEM; + FT_UNIT_STRERR(err_buf, "mem_alloc failed", ret); + goto cleanup; + } + + /* We NEED the same pointer to be returned for this test to be valid. */ + if (buf != prev_buf) { + ret = -EFAULT; + FT_UNIT_STRERR(err_buf, + "Failed to reallocate same virtual address", + ret); + goto cleanup; + } + + /* Verify reallocated MR registration time is close to the initial MR + * registration time and greater than the cached MR registration time. + */ + ret = mr_register(buf, &realloc_mr, &realloc_mr_reg_time, iface); + if (ret) { + FT_UNIT_STRERR(err_buf, "mr_register failed", ret); + goto cleanup; + } + + FT_DEBUG("Reallocated MR registration time: %ld nsecs", + realloc_mr_reg_time); + + if (realloc_mr_reg_time <= CACHE_TIME_MAX_VALUE(mr_reg_time)) { + ret = -EEXIST; + FT_UNIT_STRERR(err_buf, + "Reallocated MR registration time too low.
" + "Cached MR may have been incorrectly used.", + ret); + } else { + testret = PASS; + } + +cleanup: + if (realloc_mr) + fi_close(&realloc_mr->fid); + + if (cached_mr) + fi_close(&cached_mr->fid); + + if (mr) + fi_close(&mr->fid); + + if (buf) + mem_free(buf, type); + + if (prime_mr) + fi_close(&prime_mr->fid); + + if (prime_buf) { + switch (iface) { + case FI_HMEM_CUDA: + cuda_free(prime_buf); + break; + + case FI_HMEM_ROCR: + rocr_free(prime_buf); + break; + + default: + free(prime_buf); + break; + } + } + + return TEST_RET_VAL(ret, testret); +} + +/* Run tests using MMAP, BRK, and SBRK. */ +static int mr_cache_mmap_test(void) +{ + return mr_cache_test(MMAP); +} + +static int mr_cache_brk_test(void) +{ + return mr_cache_test(BRK); +} + +static int mr_cache_sbrk_test(void) +{ + return mr_cache_test(SBRK); +} + +static int mr_cache_cuda_test(void) +{ + int ret; + + if (!(opts.options & FT_OPT_ENABLE_HMEM)) { + sprintf(err_buf, "FI_HMEM support not requested"); + return SKIPPED; + } + + ret = ft_hmem_init(FI_HMEM_CUDA); + if (ret) { + sprintf(err_buf, "ft_hmem_init(FI_HMEM_CUDA) failed"); + return TEST_RET_VAL(ret, FAIL); + } + + ret = mr_cache_test(CUDA); + + ft_hmem_cleanup(FI_HMEM_CUDA); + + return ret; +} + +static int mr_cache_rocr_test(void) +{ + int ret; + + if (!(opts.options & FT_OPT_ENABLE_HMEM)) { + sprintf(err_buf, "FI_HMEM support not requested"); + return SKIPPED; + } + + ret = ft_hmem_init(FI_HMEM_ROCR); + if (ret) { + sprintf(err_buf, "ft_hmem_init(FI_HMEM_ROCR) failed"); + return TEST_RET_VAL(ret, FAIL); + } + + ret = mr_cache_test(ROCR); + + ft_hmem_cleanup(FI_HMEM_ROCR); + + return ret; +} + +struct test_entry test_array[] = { + TEST_ENTRY(mr_cache_mmap_test, "MR cache eviction test using MMAP"), + TEST_ENTRY(mr_cache_brk_test, "MR cache eviction test using BRK"), + TEST_ENTRY(mr_cache_sbrk_test, "MR cache eviction test using SBRK"), + TEST_ENTRY(mr_cache_cuda_test, "MR cache eviction test using CUDA"), + TEST_ENTRY(mr_cache_rocr_test, "MR cache eviction test using ROCR"), + { NULL, "" } +}; + +static void usage(void) +{ + ft_unit_usage("fi_mr_cache_evict", + "Test a provider's ability to evict MR cache entries.\n" + "Evictions are verified using MMAP, BRK, SBRK, CUDA and ROCR\n" + "allocations. FI_HMEM support must be enabled to run CUDA and\n" + "ROCR tests.\n\n" + "With debug enabled, when running as root, the physical \n" + "address of the first page of the MMAP, BRK, and SBRK \n" + "allocation is returned. This can be used to verify the \n" + "underlying physical memory changes between MMAP, BRK, and \n" + "SBRK allocations. When running as non-root, the reported \n" + "physical address is always zero."); + FT_PRINT_OPTS_USAGE("-s ", "Memory region size to be tested."); + FT_PRINT_OPTS_USAGE("-H", "Enable provider FI_HMEM support"); +} + +int main(int argc, char **argv) +{ + int ret; + int op; + int failed = 0; + + /* Force malloc to use mmap by setting M_MMAP_THRESHOLD to 1. This + * allows for this application to control the program break. Note that + * not all operating systems may support this call. Thus, failure of + * mallopt() is not treated as an error. But, this could impact the + * results of the test. + */ + ret = mallopt(M_MMAP_THRESHOLD, 1); + if (ret != 1) + FT_PRINTERR("Failed to set M_MMAP_THRESHOLD to 1. " + "System may not support M_MMAP_THRESHOLD. 
" + "Proceeding with test.", -EINVAL); + + hints = fi_allocinfo(); + if (!hints) + return EXIT_FAILURE; + + while ((op = getopt(argc, argv, FAB_OPTS "h" "s:")) != -1) { + switch (op) { + default: + ft_parseinfo(op, optarg, hints, &opts); + break; + case 's': + errno = 0; + mr_buf_size = strtoul(optarg, NULL, 10); + if (mr_buf_size == 0) + ret = -EINVAL; + else if (mr_buf_size == ULONG_MAX && errno) + ret = -errno; + else + ret = 0; + + if (ret) { + FT_PRINTERR("Invalid memory region size", ret); + goto out; + } + break; + case '?': + case 'h': + usage(); + return EXIT_FAILURE; + } + } + + hints->mode = ~0; + hints->domain_attr->mode = ~0; + hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); + hints->caps |= FI_MSG | FI_RMA; + + if (opts.options & FT_OPT_ENABLE_HMEM) + hints->caps |= FI_HMEM; + + ret = fi_getinfo(FT_FIVERSION, NULL, 0, 0, hints, &fi); + if (ret) { + hints->caps &= ~FI_RMA; + ret = fi_getinfo(FT_FIVERSION, NULL, 0, 0, hints, &fi); + if (ret) { + FT_PRINTERR("fi_getinfo", ret); + goto out; + } + } + + if (!ft_info_to_mr_access(fi)) + goto out; + + if (fi->domain_attr->mr_iov_limit == 0) { + ret = -EINVAL; + FT_PRINTERR("mr_iov_limit not set", ret); + goto out; + } + + ret = ft_open_fabric_res(); + if (ret) + goto out; + + printf("Testing MR cache on fabric %s domain %s\n", + fi->fabric_attr->name, fi->domain_attr->name); + + failed = run_tests(test_array, err_buf); + if (failed > 0) + printf("Summary: %d tests failed\n", failed); + else + printf("Summary: all tests passed\n"); + +out: + ft_free_res(); + return ret ? ft_exit_code(ret) : (failed > 0) ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/fabtests/unit/mr_test.c b/fabtests/unit/mr_test.c index a7eca891d33..c1f94d804f4 100644 --- a/fabtests/unit/mr_test.c +++ b/fabtests/unit/mr_test.c @@ -142,7 +142,7 @@ static int mr_regattr() int testret = FAIL; struct fid_mr *mr; struct iovec *iov; - struct fi_mr_attr attr; + struct fi_mr_attr attr = {0}; char *base; attr.access = ft_info_to_mr_access(fi); diff --git a/fabtests/unit/resource_freeing.c b/fabtests/unit/resource_freeing.c deleted file mode 100644 index 21dbe6ce0db..00000000000 --- a/fabtests/unit/resource_freeing.c +++ /dev/null @@ -1,303 +0,0 @@ -/* - * Copyright (c) 2017 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include -#include "shared.h" - -#define lengthof(arr) (sizeof(arr) / sizeof(*arr)) - -enum test_depth { - DEPTH_FABRIC, - DEPTH_DOMAIN, - DEPTH_ENABLE_ENDPOINT -}; - -int test_resource_freeing(enum test_depth test_depth, - const char *fabric_service) -{ - int our_ret = FI_SUCCESS; - int ret; - uint64_t flags; - struct fi_info *info; - - /* Setup fabric */ - - hints = fi_allocinfo(); - if (!hints) { - our_ret = -FI_ENOMEM; - goto error_return; - } - - flags = FI_SOURCE; - hints->caps = FI_RMA; - hints->ep_attr->type = FI_EP_RDM; - - ret = fi_getinfo(FT_FIVERSION, NULL, fabric_service, flags, - hints, &info); - if (ret) { - FT_PRINTERR("fi_getinfo", ret); - our_ret = ret; - goto free_hints; - } - - ret = fi_fabric(info->fabric_attr, &fabric, NULL); - if (ret) { - FT_PRINTERR("fi_fabric", ret); - our_ret = ret; - goto free_info; - } - - if (test_depth == DEPTH_FABRIC) { - goto close_fabric; - } - - ret = fi_domain(fabric, info, &domain, NULL); - if (ret) { - FT_PRINTERR("fi_domain", ret); - our_ret = ret; - goto close_fabric; - } - - if (test_depth == DEPTH_DOMAIN) { - goto close_domain; - } - - /* Create pre-endpoint resources */ - - av_attr.type = info->domain_attr->av_type; - av_attr.count = 0; - av_attr.name = NULL; - ret = fi_av_open(domain, &av_attr, &av, NULL); - if (ret) { - FT_PRINTERR("fi_av_open", ret); - our_ret = ret; - goto close_domain; - } - - cntr_attr.events = FI_CNTR_EVENTS_COMP; - cntr_attr.wait_obj = FI_WAIT_UNSPEC; - ret = fi_cntr_open(domain, &cntr_attr, &txcntr, NULL); - if (ret) { - FT_PRINTERR("fi_cntr_open", ret); - our_ret = ret; - goto close_av; - } - - ret = fi_cq_open(domain, &cq_attr, &txcq, NULL); - if (ret) { - FT_PRINTERR("fi_cq_open", ret); - our_ret = ret; - goto close_txcntr; - } - - ret = fi_endpoint(domain, info, &ep, NULL); - if (ret) { - FT_PRINTERR("fi_endpoint", ret); - our_ret = ret; - goto close_txcq; - } - - /* Bind pre-endpoint resources to ep */ - - ret = fi_ep_bind(ep, &txcntr->fid, FI_WRITE); - if (ret) { - FT_PRINTERR("fi_ep_bind", ret); - our_ret = ret; - goto close_ep; - } - - ret = fi_ep_bind(ep, &av->fid, 0); - if (ret) { - FT_PRINTERR("fi_ep_bind", ret); - our_ret = ret; - goto close_ep; - } - - ret = fi_ep_bind(ep, &txcq->fid, FI_TRANSMIT); - if (ret) { - FT_PRINTERR("fi_ep_bind", ret); - our_ret = ret; - goto close_ep; - } - - /* Enable ep */ - - ret = fi_enable(ep); - if (ret) { - FT_PRINTERR("fi_enable", ret); - our_ret = ret; - goto close_ep; - } - - if (test_depth == DEPTH_ENABLE_ENDPOINT) { - goto close_ep; - } - -close_ep: - ret = fi_close(&ep->fid); - if (ret) { - FT_PRINTERR("fi_close", ret); - our_ret = our_ret ? our_ret : ret; - } - -close_txcq: - ret = fi_close(&txcq->fid); - if (ret) { - FT_PRINTERR("fi_close", ret); - our_ret = our_ret ? our_ret : ret; - } - -close_txcntr: - ret = fi_close(&txcntr->fid); - if (ret) { - FT_PRINTERR("fi_close", ret); - our_ret = our_ret ? our_ret : ret; - } - -close_av: - ret = fi_close(&av->fid); - if (ret) { - FT_PRINTERR("fi_close", ret); - our_ret = our_ret ? our_ret : ret; - } - -close_domain: - ret = fi_close(&domain->fid); - if (ret) { - FT_PRINTERR("fi_close", ret); - our_ret = our_ret ? 
our_ret : ret; - } - -close_fabric: - ret = fi_close(&fabric->fid); - if (ret) { - FT_PRINTERR("fi_close", ret); - our_ret = our_ret ? our_ret : ret; - } - -free_info: - fi_freeinfo(info); - -free_hints: - fi_freeinfo(hints); - -error_return: - return our_ret; -} - -void print_test_resource_freeing_call(enum test_depth test_depth, int iter) -{ - fprintf(stdout, - "Running test_resource_freeing with " - "[%s] for %d iterations\n", - (test_depth == DEPTH_FABRIC) ? "DEPTH_FABRIC" - : (test_depth == DEPTH_DOMAIN) ? "DEPTH_DOMAIN" - : (test_depth == DEPTH_ENABLE_ENDPOINT) ? "DEPTH_ENABLE_ENDPOINT" - : "(unknown test depth)", - iter - ); - - fflush(stderr); - fflush(stdout); -} - -void print_test_resource_freeing_result_call(int success, - enum test_depth test_depth, - int iter) -{ - fprintf(success ? stdout : stderr, - "%s: test_resource_freeing %s with " - "[%s]\n", - success ? "GOOD" : "ERROR", - success ? "succeeded" : "failed", - (test_depth == DEPTH_FABRIC) ? "DEPTH_FABRIC" - : (test_depth == DEPTH_DOMAIN) ? "DEPTH_DOMAIN" - : (test_depth == DEPTH_ENABLE_ENDPOINT) ? "DEPTH_ENABLE_ENDPOINT" - : "(unknown test depth)" - ); - - fflush(stderr); - fflush(stdout); -} - -int main(int argc, char **argv) -{ - int op, i, td_idx, ret = 0, iters = 2, exit_code = 0; - - opts = INIT_OPTS; - - hints = fi_allocinfo(); - if (!hints) - return EXIT_FAILURE; - - while ((op = getopt(argc, argv, "i:h" ADDR_OPTS INFO_OPTS)) != -1) { - switch (op) { - default: - ft_parse_addr_opts(op, optarg, &opts); - ft_parseinfo(op, optarg, hints, &opts); - break; - case 'i': - iters = atoi(optarg); - break; - case '?': - case 'h': - ft_usage(argv[0], "Test which exercises resource freeing in a provider\n"); - FT_PRINT_OPTS_USAGE("-i ", "number of iterations to test"); - return EXIT_FAILURE; - } - } - - enum test_depth test_depth[] = { - DEPTH_FABRIC, DEPTH_DOMAIN, DEPTH_ENABLE_ENDPOINT}; - - for (td_idx = 0; td_idx < lengthof(test_depth); td_idx += 1) { - print_test_resource_freeing_call( - test_depth[td_idx], iters); - for (i = 0; i < iters; i += 1) { - ret = test_resource_freeing( - test_depth[td_idx], default_port); - if (ret) { - exit_code = EXIT_FAILURE; - break; - } - } - print_test_resource_freeing_result_call( - !ret, /* int success */ - test_depth[td_idx], - i); - } - - return ft_exit_code(exit_code); -} diff --git a/hello_world.c b/hello_world.c new file mode 100644 index 00000000000..3ea9873412f --- /dev/null +++ b/hello_world.c @@ -0,0 +1,9 @@ +#include +#include + +int main(int argc, char *argv[]){ + + printf("Hello World!\n"); + + return EXIT_SUCCESS; +} diff --git a/hello_world_1.c b/hello_world_1.c new file mode 100644 index 00000000000..3ea9873412f --- /dev/null +++ b/hello_world_1.c @@ -0,0 +1,9 @@ +#include +#include + +int main(int argc, char *argv[]){ + + printf("Hello World!\n"); + + return EXIT_SUCCESS; +} diff --git a/include/freebsd/osd.h b/include/freebsd/osd.h index fa3691d9780..c185d90e545 100644 --- a/include/freebsd/osd.h +++ b/include/freebsd/osd.h @@ -75,6 +75,82 @@ static inline size_t ofi_ifaddr_get_speed(struct ifaddrs *ifa) return 0; } +static inline ssize_t ofi_process_vm_readv(pid_t pid, + const struct iovec *local_iov, + unsigned long liovcnt, + const struct iovec *remote_iov, + unsigned long riovcnt, + unsigned long flags) +{ + return -FI_ENOSYS; +} + +static inline size_t ofi_process_vm_writev(pid_t pid, + const struct iovec *local_iov, + unsigned long liovcnt, + const struct iovec *remote_iov, + unsigned long riovcnt, + unsigned long flags) +{ + return -FI_ENOSYS; +} + +static inline 
ssize_t ofi_read_socket(SOCKET fd, void *buf, size_t count) +{ + return read(fd, buf, count); +} + +static inline ssize_t ofi_write_socket(SOCKET fd, const void *buf, size_t count) +{ + return write(fd, buf, count); +} + +static inline ssize_t ofi_recv_socket(SOCKET fd, void *buf, size_t count, + int flags) +{ + return recv(fd, buf, count, flags); +} + +static inline ssize_t ofi_recvfrom_socket(SOCKET fd, void *buf, size_t count, int flags, + struct sockaddr *from, socklen_t *fromlen) +{ + return recvfrom(fd, buf, count, flags, from, fromlen); +} + +static inline ssize_t ofi_send_socket(SOCKET fd, const void *buf, size_t count, + int flags) +{ + return send(fd, buf, count, flags); +} + +static inline ssize_t ofi_sendto_socket(SOCKET fd, const void *buf, size_t count, int flags, + const struct sockaddr *to, socklen_t tolen) +{ + return sendto(fd, buf, count, flags, to, tolen); +} + +static inline ssize_t ofi_writev_socket(SOCKET fd, struct iovec *iov, size_t iov_cnt) +{ + return writev(fd, iov, iov_cnt); +} + +static inline ssize_t ofi_readv_socket(SOCKET fd, struct iovec *iov, int iov_cnt) +{ + return readv(fd, iov, iov_cnt); +} + +static inline ssize_t +ofi_sendmsg_tcp(SOCKET fd, const struct msghdr *msg, int flags) +{ + return sendmsg(fd, msg, flags); +} + +static inline ssize_t +ofi_recvmsg_tcp(SOCKET fd, struct msghdr *msg, int flags) +{ + return recvmsg(fd, msg, flags); +} + #endif /* _FREEBSD_OSD_H_ */ diff --git a/include/linux/osd.h b/include/linux/osd.h index fe222d2d4cb..66c0e658e90 100644 --- a/include/linux/osd.h +++ b/include/linux/osd.h @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include #include "unix/osd.h" @@ -95,4 +97,90 @@ static inline int ofi_hugepage_enabled(void) size_t ofi_ifaddr_get_speed(struct ifaddrs *ifa); +#ifndef __NR_process_vm_readv +# define __NR_process_vm_readv 310 +#endif + +#ifndef __NR_process_vm_writev +# define __NR_process_vm_writev 311 +#endif + +static inline ssize_t ofi_process_vm_readv(pid_t pid, + const struct iovec *local_iov, + unsigned long liovcnt, + const struct iovec *remote_iov, + unsigned long riovcnt, + unsigned long flags) +{ + return syscall(__NR_process_vm_readv, pid, local_iov, liovcnt, + remote_iov, riovcnt, flags); +} + +static inline size_t ofi_process_vm_writev(pid_t pid, + const struct iovec *local_iov, + unsigned long liovcnt, + const struct iovec *remote_iov, + unsigned long riovcnt, + unsigned long flags) +{ + return syscall(__NR_process_vm_writev, pid, local_iov, liovcnt, + remote_iov, riovcnt, flags); +} + +static inline ssize_t ofi_read_socket(SOCKET fd, void *buf, size_t count) +{ + return read(fd, buf, count); +} + +static inline ssize_t ofi_write_socket(SOCKET fd, const void *buf, size_t count) +{ + return write(fd, buf, count); +} + +static inline ssize_t ofi_recv_socket(SOCKET fd, void *buf, size_t count, + int flags) +{ + return recv(fd, buf, count, flags); +} + +static inline ssize_t ofi_recvfrom_socket(SOCKET fd, void *buf, size_t count, int flags, + struct sockaddr *from, socklen_t *fromlen) +{ + return recvfrom(fd, buf, count, flags, from, fromlen); +} + +static inline ssize_t ofi_send_socket(SOCKET fd, const void *buf, size_t count, + int flags) +{ + return send(fd, buf, count, flags); +} + +static inline ssize_t ofi_sendto_socket(SOCKET fd, const void *buf, size_t count, int flags, + const struct sockaddr *to, socklen_t tolen) +{ + return sendto(fd, buf, count, flags, to, tolen); +} + +static inline ssize_t ofi_writev_socket(SOCKET fd, struct iovec *iov, size_t iov_cnt) +{ + return 
writev(fd, iov, iov_cnt); +} + +static inline ssize_t ofi_readv_socket(SOCKET fd, struct iovec *iov, int iov_cnt) +{ + return readv(fd, iov, iov_cnt); +} + +static inline ssize_t +ofi_sendmsg_tcp(SOCKET fd, const struct msghdr *msg, int flags) +{ + return sendmsg(fd, msg, flags); +} + +static inline ssize_t +ofi_recvmsg_tcp(SOCKET fd, struct msghdr *msg, int flags) +{ + return recvmsg(fd, msg, flags); +} + #endif /* _LINUX_OSD_H_ */ diff --git a/include/ofi.h b/include/ofi.h index 3b93cfd65e7..46d61940724 100644 --- a/include/ofi.h +++ b/include/ofi.h @@ -65,8 +65,16 @@ extern "C" { #endif +/* For in-tree providers */ +#define OFI_VERSION_LATEST FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION) +/* The lower minor digit is reserved for custom libfabric builds */ +#define OFI_VERSION_DEF_PROV \ + FI_VERSION(FI_MAJOR_VERSION * 100 + FI_MINOR_VERSION, \ + FI_REVISION_VERSION * 10) + #define OFI_GETINFO_INTERNAL (1ULL << 58) #define OFI_CORE_PROV_ONLY (1ULL << 59) +#define OFI_GETINFO_HIDDEN (1ULL << 60) #define OFI_ORDER_RAR_SET (FI_ORDER_RAR | FI_ORDER_RMA_RAR | \ FI_ORDER_ATOMIC_RAR) @@ -77,6 +85,15 @@ extern "C" { #define OFI_ORDER_WAW_SET (FI_ORDER_WAW | FI_ORDER_RMA_WAW | \ FI_ORDER_ATOMIC_WAW) +#define OFI_IGNORED_TX_CAPS /* older Rx caps not applicable to Tx */ \ + (FI_REMOTE_READ | FI_REMOTE_WRITE | FI_RECV | FI_DIRECTED_RECV | \ + FI_VARIABLE_MSG | FI_MULTI_RECV | FI_SOURCE | FI_RMA_EVENT | \ + FI_SOURCE_ERR) +#define OFI_IGNORED_RX_CAPS /* Older Tx caps not applicable to Rx */ \ + (FI_READ | FI_WRITE | FI_SEND | FI_FENCE | FI_MULTICAST | \ + FI_NAMED_RX_CTX) + + #define sizeof_field(type, field) sizeof(((type *)0)->field) #ifndef MIN @@ -95,7 +112,26 @@ extern "C" { #define ofi_div_ceil(a, b) ((a + b - 1) / b) -#define OFI_MAGIC_64 (0x0F1C0DE0F1C0DE64) +static inline int ofi_val64_gt(uint64_t x, uint64_t y) { + return ((int64_t) (x - y)) > 0; +} +static inline int ofi_val64_ge(uint64_t x, uint64_t y) { + return ((int64_t) (x - y)) >= 0; +} +#define ofi_val64_lt(x, y) ofi_val64_gt(y, x) + +static inline int ofi_val32_gt(uint32_t x, uint32_t y) { + return ((int32_t) (x - y)) > 0; +} +static inline int ofi_val32_ge(uint32_t x, uint32_t y) { + return ((int32_t) (x - y)) >= 0; +} +#define ofi_val32_lt(x, y) ofi_val32_gt(y, x) + +#define ofi_val32_inrange(start, length, value) \ + ofi_val32_ge(value, start) && ofi_val32_lt(value, start + length) +#define ofi_val64_inrange(start, length, value) \ + ofi_val64_ge(value, start) && ofi_val64_lt(value, start + length) #ifndef BIT #define BIT(nr) (1UL << (nr)) @@ -119,28 +155,24 @@ extern "C" { #define TAB " " -#define CASEENUMSTR(SYM) \ - case SYM: { ofi_strcatf(buf, #SYM); break; } -#define IFFLAGSTR(flags, SYM) \ - do { if (flags & SYM) ofi_strcatf(buf, #SYM ", "); } while(0) #define CASEENUMSTRN(SYM, N) \ case SYM: { ofi_strncatf(buf, N, #SYM); break; } #define IFFLAGSTRN(flags, SYM, N) \ do { if (flags & SYM) ofi_strncatf(buf, N, #SYM ", "); } while(0) -#define ofi_strcatf(dest, ...) 
\ - ofi_strncatf(dest, OFI_BUFSIZ, __VA_ARGS__) /* * CPU specific features */ + +/* X86_64 */ enum { - OFI_CLWB_REG = 2, + OFI_CLWB_REG = 1, OFI_CLWB_BIT = (1 << 24), OFI_CLFLUSHOPT_REG = 1, - OFI_CLFLUSHOPT_BIT = (1 << 24), + OFI_CLFLUSHOPT_BIT = (1 << 23), OFI_CLFLUSH_REG = 3, - OFI_CLFLUSH_BIT = (1 << 23), + OFI_CLFLUSH_BIT = (1 << 19), }; int ofi_cpu_supports(unsigned func, unsigned reg, unsigned bit); @@ -156,6 +188,7 @@ enum ofi_prov_type { struct fi_prov_context { enum ofi_prov_type type; int disable_logging; + int disable_layering; }; struct fi_filter { @@ -205,6 +238,12 @@ static inline uint64_t roundup_power_of_two(uint64_t n) return n; } +static inline uint64_t rounddown_power_of_two(uint64_t n) +{ + uint64_t pof2 = roundup_power_of_two(n); + return (pof2 > n) ? (pof2 >> 1) : pof2; +} + static inline size_t ofi_get_aligned_size(size_t size, size_t alignment) { return ((size % alignment) == 0) ? @@ -218,14 +257,20 @@ static inline void *ofi_get_page_start(const void *addr, size_t page_size) static inline void *ofi_get_page_end(const void *addr, size_t page_size) { - return ofi_get_page_start((const char *) addr + page_size -1, page_size); + return (void *)((uintptr_t)ofi_get_page_start((const char *)addr + + page_size, page_size) - 1); } static inline size_t ofi_get_page_bytes(const void *addr, size_t len, size_t page_size) { - return (char *)ofi_get_page_end((const char *) addr + len, page_size) - - (char *)ofi_get_page_start(addr, page_size); + char *start = ofi_get_page_start(addr, page_size); + char *end = (char *)ofi_get_page_start((const char*)addr + len - 1, page_size) + + page_size; + size_t result = end - start; + + assert(result % page_size == 0); + return result; } #define FI_TAG_GENERIC 0xAAAAAAAAAAAAAAAAULL @@ -236,26 +281,32 @@ uint64_t ofi_tag_format(uint64_t max_tag); uint8_t ofi_msb(uint64_t num); uint8_t ofi_lsb(uint64_t num); -int ofi_send_allowed(uint64_t caps); -int ofi_recv_allowed(uint64_t caps); -int ofi_rma_initiate_allowed(uint64_t caps); -int ofi_rma_target_allowed(uint64_t caps); +extern size_t ofi_universe_size; + +bool ofi_send_allowed(uint64_t caps); +bool ofi_recv_allowed(uint64_t caps); +bool ofi_rma_initiate_allowed(uint64_t caps); +bool ofi_rma_target_allowed(uint64_t caps); +bool ofi_needs_tx(uint64_t caps); +bool ofi_needs_rx(uint64_t caps); + int ofi_ep_bind_valid(const struct fi_provider *prov, struct fid *bfid, uint64_t flags); int ofi_check_rx_mode(const struct fi_info *info, uint64_t flags); -uint64_t fi_gettime_ms(void); -uint64_t fi_gettime_us(void); +uint64_t ofi_gettime_ns(void); +uint64_t ofi_gettime_us(void); +uint64_t ofi_gettime_ms(void); static inline uint64_t ofi_timeout_time(int timeout) { - return (timeout >= 0) ? fi_gettime_ms() + timeout : 0; + return (timeout >= 0) ? ofi_gettime_ms() + timeout : 0; } static inline int ofi_adjust_timeout(uint64_t timeout_time, int *timeout) { if (*timeout >= 0) { - *timeout = (int) (timeout_time - fi_gettime_ms()); + *timeout = (int) (timeout_time - ofi_gettime_ms()); return (*timeout <= 0) ? -FI_ETIMEDOUT : 0; } return 0; @@ -300,6 +351,33 @@ static inline uint64_t ofi_key2idx(struct ofi_key_idx *key_idx, uint64_t key) return key & ((1ULL << key_idx->idx_bits) - 1); } +static inline uint32_t ofi_xorshift_random(uint32_t val) +{ + /* + * Xorshift Random Number Generators are from 224. + * R. P. Brent, Some long-period random number + * generators using shifts and xors, ANZIAM + * Journal 48 (CTAC2006), C188-C202, 2007. 
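/*
 * Standalone sketch (not libfabric code) of what the corrected
 * ofi_get_page_bytes() above computes: the size of the whole pages covering
 * [addr, addr + len), assuming page_size is a power of two.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uintptr_t page_bytes(uintptr_t addr, uintptr_t len, uintptr_t page_size)
{
	uintptr_t start = addr & ~(page_size - 1);			  /* round down */
	uintptr_t end = ((addr + len - 1) & ~(page_size - 1)) + page_size; /* round up */

	return end - start;
}

int main(void)
{
	/* 100 bytes starting 10 bytes before a 4 KiB boundary span two pages */
	assert(page_bytes(4096 - 10, 100, 4096) == 2 * 4096);
	assert(page_bytes(0, 1, 4096) == 4096);
	printf("page span checks passed\n");
	return 0;
}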
+ * Presented at the 13th Biennial Computational + * Techniques and Applications + * Conference (CTAC06), Townsville, 2-5 July 2006. + * arXiv:1004.3115v1 + */ + val ^= val << 13; + val ^= val >> 17; + val ^= val << 5; + + return val; +} + +static inline uint32_t ofi_xorshift_random_r(uint32_t *seed) +{ + return *seed = ofi_xorshift_random(*seed); +} + +uint32_t ofi_generate_seed(void); + +size_t ofi_vrb_speed(uint8_t speed, uint8_t width); #ifdef __cplusplus } diff --git a/include/ofi_abi.h b/include/ofi_abi.h index 8ba35a4a373..f11b0698873 100644 --- a/include/ofi_abi.h +++ b/include/ofi_abi.h @@ -90,7 +90,7 @@ extern "C" { * { * ... * } - * DEFAULT_SYMVER(bar_, bar, "MYLIB_1.0"); + * DEFAULT_SYMVER(bar_, bar, MYLIB_1.0); * * This function is the main entry point for function foo. * int DEFAULT_SYMVER_PRE(foo)(void) @@ -105,13 +105,13 @@ extern "C" { * { * ... * } - * COMPAT_SYMVER(foo_1_0, foo, "MYLIB_1.0"); + * COMPAT_SYMVER(foo_1_0, foo, MYLIB_1.0); * * By convention, the name of compatibility functions is the exported function * name appended with the ABI version that it is compatible with. */ -#define CURRENT_ABI "FABRIC_1.2" +#define CURRENT_ABI "FABRIC_1.4" #if HAVE_ALIAS_ATTRIBUTE == 1 #define DEFAULT_SYMVER_PRE(a) a##_ diff --git a/include/ofi_atom.h b/include/ofi_atom.h index 8d1b6d9b625..46d6eb98dec 100644 --- a/include/ofi_atom.h +++ b/include/ofi_atom.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -123,6 +124,41 @@ typedef atomic_long ofi_atomic_int64_t; ATOMIC_IS_INITIALIZED(atomic); \ return (int##radix##_t)atomic_fetch_sub_explicit(&atomic->val, val, \ memory_order_acq_rel) - val; \ + } \ + /** \ + * Compare and swap, strong version \ + * \ + * @return true if atomic matches expected and the change is done, false \ + * otherwise. \ + */ \ + static inline \ + bool ofi_atomic_cas_bool_strong##radix(ofi_atomic##radix##_t *atomic, \ + int##radix##_t expected, \ + int##radix##_t desired) \ + { \ + ATOMIC_IS_INITIALIZED(atomic); \ + return atomic_compare_exchange_strong_explicit(&atomic->val, &expected, desired, \ + memory_order_acq_rel, \ + memory_order_relaxed); \ + } \ + /** \ + * Compare and swap, weak version \ + * \ + * @return true if atomic matches expected and the change is done, false \ + * otherwise. \ + * This is the weak version and may incorrectly report a failed match. \ + * As a result it is most useful in loops that wait until the check succeeds. 
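/*
 * Standalone sketch of how the xorshift generator introduced above is meant
 * to be used: seed with any non-zero value and advance the state on every
 * call, as ofi_xorshift_random_r() does through its seed pointer.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t xorshift32(uint32_t val)
{
	val ^= val << 13;
	val ^= val >> 17;
	val ^= val << 5;
	return val;
}

int main(void)
{
	uint32_t seed = 2463534242u;	/* must not be zero */
	int i;

	for (i = 0; i < 4; i++) {
		seed = xorshift32(seed);	/* state = next(state) */
		printf("%u\n", seed);
	}
	return 0;
}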
\ + */ \ + static inline \ + bool ofi_atomic_cas_bool_weak##radix(ofi_atomic##radix##_t *atomic, \ + int##radix##_t expected, \ + int##radix##_t desired) \ + { \ + ATOMIC_IS_INITIALIZED(atomic); \ + return atomic_compare_exchange_weak_explicit(&atomic->val, \ + &expected, desired, \ + memory_order_acq_rel, \ + memory_order_relaxed); \ } #elif defined HAVE_BUILTIN_ATOMICS @@ -184,8 +220,30 @@ typedef atomic_long ofi_atomic_int64_t; { \ *(ofi_atomic_ptr(atomic)) = value; \ ATOMIC_INIT(atomic); \ + } \ + static inline \ + bool ofi_atomic_cas_bool##radix(ofi_atomic##radix##_t *atomic, \ + int##radix##_t expected, \ + int##radix##_t desired) \ + { \ + ATOMIC_IS_INITIALIZED(atomic); \ + return ofi_atomic_cas_bool(radix, ofi_atomic_ptr(atomic), expected, desired); \ + } \ + static inline \ + bool ofi_atomic_cas_bool_strong##radix(ofi_atomic##radix##_t *atomic, \ + int##radix##_t expected, \ + int##radix##_t desired) \ + { \ + return ofi_atomic_cas_bool##radix(atomic, expected, desired); \ + } \ + static inline \ + bool ofi_atomic_cas_bool_weak##radix(ofi_atomic##radix##_t *atomic, \ + int##radix##_t expected, \ + int##radix##_t desired) \ + { \ + return ofi_atomic_cas_bool##radix(atomic, expected, desired); \ } - + #else /* HAVE_ATOMICS */ #define OFI_ATOMIC_DEFINE(radix) \ @@ -261,7 +319,37 @@ typedef atomic_long ofi_atomic_int64_t; v = atomic->val; \ fastlock_release(&atomic->lock); \ return v; \ + } \ + static inline \ + bool ofi_atomic_cas_bool##radix(ofi_atomic##radix##_t *atomic, \ + int##radix##_t expected, \ + int##radix##_t desired) \ + { \ + bool ret = false; \ + ATOMIC_IS_INITIALIZED(atomic); \ + fastlock_acquire(&atomic->lock); \ + if (atomic->val == expected) { \ + atomic->val = desired; \ + ret = true; \ + } \ + fastlock_release(&atomic->lock); \ + return ret; \ + } \ + static inline \ + bool ofi_atomic_cas_bool_strong##radix(ofi_atomic##radix##_t *atomic, \ + int##radix##_t expected, \ + int##radix##_t desired) \ + { \ + return ofi_atomic_cas_bool##radix(atomic, expected, desired); \ + } \ + static inline \ + bool ofi_atomic_cas_bool_weak##radix(ofi_atomic##radix##_t *atomic, \ + int##radix##_t expected, \ + int##radix##_t desired) \ + { \ + return ofi_atomic_cas_bool##radix(atomic, expected, desired); \ } + #endif // HAVE_ATOMICS OFI_ATOMIC_DEFINE(32) diff --git a/include/ofi_atomic.h b/include/ofi_atomic.h index a146e75bc20..fc956945fd3 100644 --- a/include/ofi_atomic.h +++ b/include/ofi_atomic.h @@ -43,19 +43,42 @@ extern "C" { size_t ofi_datatype_size(enum fi_datatype datatype); -#define OFI_WRITE_OP_LAST FI_CSWAP -#define OFI_READWRITE_OP_LAST FI_CSWAP +/* The START value is included, LAST is exclusive, which matches the public + * header file use of LAST. CNT is the number of valid values. 
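/*
 * Standalone C11 sketch of the retry-loop idiom the new weak compare-and-swap
 * helper targets: because a weak CAS may fail spuriously, it belongs in a
 * loop that re-reads the current value and tries again.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic int32_t counter = 5;

static void add_bounded(int32_t add, int32_t max)
{
	int32_t cur, next;

	do {
		cur = atomic_load_explicit(&counter, memory_order_relaxed);
		next = cur + add;
		if (next > max)
			next = max;
	} while (!atomic_compare_exchange_weak_explicit(&counter, &cur, next,
							memory_order_acq_rel,
							memory_order_relaxed));
}

int main(void)
{
	add_bounded(10, 8);
	printf("%d\n", (int) atomic_load(&counter));	/* prints 8 */
	return 0;
}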
+ */ +#define OFI_WRITE_OP_START FI_MIN +#define OFI_WRITE_OP_LAST (FI_ATOMIC_WRITE + 1) +#define OFI_WRITE_OP_CNT (OFI_WRITE_OP_LAST - OFI_WRITE_OP_START) +#define OFI_READWRITE_OP_START FI_MIN +#define OFI_READWRITE_OP_LAST (FI_ATOMIC_WRITE + 1) +#define OFI_READWRITE_OP_CNT (OFI_READWRITE_OP_LAST - OFI_READWRITE_OP_START) #define OFI_SWAP_OP_START FI_CSWAP -#define OFI_SWAP_OP_LAST (FI_MSWAP - FI_CSWAP + 1) +#define OFI_SWAP_OP_LAST (FI_MSWAP + 1) +#define OFI_SWAP_OP_CNT (OFI_SWAP_OP_LAST - OFI_SWAP_OP_START) + +#define ofi_atomic_iswrite_op(op) \ + (op >= OFI_WRITE_OP_START && op < OFI_WRITE_OP_LAST && op != FI_ATOMIC_READ) +#define ofi_atomic_isreadwrite_op(op) \ + (op >= OFI_READWRITE_OP_START && op < OFI_READWRITE_OP_LAST) +#define ofi_atomic_isswap_op(op) \ + (op >= OFI_SWAP_OP_START && op < OFI_SWAP_OP_LAST) -extern void (*ofi_atomic_write_handlers[OFI_WRITE_OP_LAST][FI_DATATYPE_LAST]) +extern void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST]) (void *dst, const void *src, size_t cnt); -extern void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_LAST][FI_DATATYPE_LAST]) +extern void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][FI_DATATYPE_LAST]) (void *dst, const void *src, void *res, size_t cnt); -extern void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_LAST][FI_DATATYPE_LAST]) +extern void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][FI_DATATYPE_LAST]) (void *dst, const void *src, const void *cmp, void *res, size_t cnt); +#define ofi_atomic_write_handler(op, datatype, dst, src, cnt) \ + ofi_atomic_write_handlers[op][datatype](dst, src, cnt) +#define ofi_atomic_readwrite_handler(op, datatype, dst, src, res, cnt) \ + ofi_atomic_readwrite_handlers[op][datatype](dst, src, res, cnt) +#define ofi_atomic_swap_handler(op, datatype, dst, src, cmp, res, cnt) \ + ofi_atomic_swap_handlers[op - OFI_SWAP_OP_START][datatype](dst, src, \ + cmp, res, cnt) + int ofi_atomic_valid(const struct fi_provider *prov, enum fi_datatype datatype, enum fi_op op, uint64_t flags); diff --git a/include/ofi_bitmask.h b/include/ofi_bitmask.h new file mode 100644 index 00000000000..624792b5ad8 --- /dev/null +++ b/include/ofi_bitmask.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2019-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
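/*
 * Standalone sketch of the table-dispatch pattern behind the new
 * ofi_atomic_*_handler() macros: the op and datatype index a 2-D array of
 * handlers, and a half-open [START, LAST) range check guards the lookup.
 * The enum and handler names here are illustrative, not libfabric's.
 */
#include <stdint.h>
#include <stdio.h>

enum op { OP_MIN, OP_MAX, OP_SUM, OP_LAST };
enum dt { DT_INT32, DT_INT64, DT_LAST };

static void sum_int32(void *dst, const void *src, size_t cnt)
{
	int32_t *d = dst;
	const int32_t *s = src;

	while (cnt--)
		*d++ += *s++;
}

/* only one handler is filled in for brevity */
static void (*write_handlers[OP_LAST][DT_LAST])(void *, const void *, size_t) = {
	[OP_SUM][DT_INT32] = sum_int32,
};

static int do_write(enum op op, enum dt dt, void *dst, const void *src, size_t cnt)
{
	if (op >= OP_LAST || dt >= DT_LAST || !write_handlers[op][dt])
		return -1;	/* op/datatype combination not supported */
	write_handlers[op][dt](dst, src, cnt);
	return 0;
}

int main(void)
{
	int32_t dst[2] = {1, 2}, src[2] = {10, 20};

	do_write(OP_SUM, DT_INT32, dst, src, 2);
	printf("%d %d\n", dst[0], dst[1]);	/* 11 22 */
	return 0;
}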
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _OFI_BITMASK_H_ +#define _OFI_BITMASK_H_ + +#include + +#include +#include +#include +#include + +struct bitmask { + size_t size; + uint8_t *bytes; +}; + +static inline int ofi_bitmask_create(struct bitmask *mask, size_t size) +{ + size_t byte_size = size / 8; + if (byte_size % 8) + byte_size++; + + mask->bytes = calloc(byte_size, 1); + if (!mask->bytes) + return -FI_ENOMEM; + + mask->size = size; + + return FI_SUCCESS; +} + +static inline void ofi_bitmask_free(struct bitmask *mask) +{ + free(mask->bytes); + mask->bytes = NULL; +} + +static inline size_t ofi_bitmask_bytesize(struct bitmask *mask) +{ + return (mask->size % 8) ? (mask->size / 8 + 1) : (mask->size / 8); +} + +static inline void ofi_bitmask_unset(struct bitmask *mask, size_t idx) +{ + assert(idx <= mask->size); + mask->bytes[idx / 8] &= ~(0x01 << (idx % 8)); +} + +static inline void ofi_bitmask_set(struct bitmask *mask, size_t idx) +{ + assert(idx <= mask->size); + mask->bytes[idx / 8] |= (0x01 << (idx % 8)); +} + +static inline void ofi_bitmask_set_all(struct bitmask *mask) +{ + memset(mask->bytes, 0xff, ofi_bitmask_bytesize(mask)); +} + +static inline size_t ofi_bitmask_get_lsbset(struct bitmask mask) +{ + size_t cur; + uint8_t tmp; + size_t ret = 0; + + for (cur = 0; cur < (mask.size/8); cur++) { + if (mask.bytes[cur]) { + tmp = mask.bytes[cur]; + while (!(tmp & 0x1)) { + tmp >>= 1; + ret++; + } + break; + } else { + ret += 8; + } + } + + assert(ret <= (mask.size)); + return ret; +} + +#endif diff --git a/include/ofi_coll.h b/include/ofi_coll.h new file mode 100644 index 00000000000..5c6cf911308 --- /dev/null +++ b/include/ofi_coll.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2019 Intel Corporation, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
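/*
 * Usage sketch for the ofi_bitmask helpers defined above.  This assumes a
 * libfabric build tree so that ofi_bitmask.h and the headers it relies on are
 * on the include path; it is not a standalone program outside that tree.
 */
#include <stdio.h>
#include <ofi_bitmask.h>

int main(void)
{
	struct bitmask mask;

	if (ofi_bitmask_create(&mask, 64))	/* 64 bits, initially clear */
		return 1;

	ofi_bitmask_set(&mask, 9);
	ofi_bitmask_set(&mask, 3);
	printf("lowest set bit: %zu\n", ofi_bitmask_get_lsbset(mask));	/* 3 */

	ofi_bitmask_unset(&mask, 3);
	printf("lowest set bit: %zu\n", ofi_bitmask_get_lsbset(mask));	/* 9 */

	ofi_bitmask_free(&mask);
	return 0;
}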
+ */ + +#ifndef _OFI_COLL_H_ +#define _OFI_COLL_H_ + +#include + +#include +#include +#include + +#define OFI_WORLD_GROUP_ID 0 +#define OFI_MAX_GROUP_ID 256 +#define OFI_COLL_TAG_FLAG (1ULL << 63) + +enum util_coll_op_type { + UTIL_COLL_JOIN_OP, + UTIL_COLL_BARRIER_OP, + UTIL_COLL_ALLREDUCE_OP, + UTIL_COLL_BROADCAST_OP, + UTIL_COLL_ALLGATHER_OP, + UTIL_COLL_SCATTER_OP, +}; + +static const char * const log_util_coll_op_type[] = { + [UTIL_COLL_JOIN_OP] = "COLL_JOIN", + [UTIL_COLL_BARRIER_OP] = "COLL_BARRIER", + [UTIL_COLL_ALLREDUCE_OP] = "COLL_ALLREDUCE", + [UTIL_COLL_BROADCAST_OP] = "COLL_BROADCAST", + [UTIL_COLL_ALLGATHER_OP] = "COLL_ALLGATHER", + [UTIL_COLL_SCATTER_OP] = "COLL_SCATTER" +}; + +struct util_coll_mc { + struct fid_mc mc_fid; + struct fid_ep *ep; + struct util_av_set *av_set; + uint64_t local_rank; + uint16_t group_id; + uint16_t seq; + ofi_atomic32_t ref; +}; + +struct util_av_set { + struct fid_av_set av_set_fid; + struct util_av *av; + fi_addr_t *fi_addr_array; + size_t fi_addr_count; + uint64_t flags; + struct util_coll_mc coll_mc; + ofi_atomic32_t ref; + fastlock_t lock; +}; + +enum coll_work_type { + UTIL_COLL_SEND, + UTIL_COLL_RECV, + UTIL_COLL_REDUCE, + UTIL_COLL_COPY, + UTIL_COLL_COMP, +}; + +enum coll_state { + UTIL_COLL_WAITING, + UTIL_COLL_PROCESSING, + UTIL_COLL_COMPLETE +}; + +static const char * const log_util_coll_state[] = { + [UTIL_COLL_WAITING] = "COLL_WAITING", + [UTIL_COLL_PROCESSING] = "COLL_PROCESSING", + [UTIL_COLL_COMPLETE] = "COLL_COMPLETE" +}; + +struct util_coll_operation; + +struct util_coll_work_item { + struct slist_entry ready_entry; + struct dlist_entry waiting_entry; + struct util_coll_operation *coll_op; + enum coll_work_type type; + enum coll_state state; + int fence; +}; + +struct util_coll_xfer_item { + struct util_coll_work_item hdr; + void *buf; + int count; + enum fi_datatype datatype; + uint64_t tag; + int remote_rank; +}; + +struct util_coll_copy_item { + struct util_coll_work_item hdr; + void *in_buf; + void *out_buf; + int count; + enum fi_datatype datatype; +}; + +struct util_coll_reduce_item { + struct util_coll_work_item hdr; + void *in_buf; + void *inout_buf; + int count; + enum fi_datatype datatype; + enum fi_op op; +}; + +struct join_data { + struct util_coll_mc *new_mc; + struct bitmask data; + struct bitmask tmp; +}; + +struct barrier_data { + uint64_t data; + uint64_t tmp; +}; + +struct allreduce_data { + void *data; + size_t size; +}; + +struct broadcast_data { + void *chunk; + size_t size; + void *scatter; +}; + +struct util_coll_operation; + +typedef void (*util_coll_comp_fn_t)(struct util_coll_operation *coll_op); +struct util_coll_operation { + enum util_coll_op_type type; + uint32_t cid; + void *context; + struct util_coll_mc *mc; + struct dlist_entry work_queue; + union { + struct join_data join; + struct barrier_data barrier; + struct allreduce_data allreduce; + void *scatter; + struct broadcast_data broadcast; + } data; + util_coll_comp_fn_t comp_fn; +}; + +int ofi_query_collective(struct fid_domain *domain, enum fi_collective_op coll, + struct fi_collective_attr *attr, uint64_t flags); + +int ofi_join_collective(struct fid_ep *ep, fi_addr_t coll_addr, + const struct fid_av_set *set, uint64_t flags, + struct fid_mc **mc, void *context); + +int ofi_av_set(struct fid_av *av, struct fi_av_set_attr *attr, + struct fid_av_set **av_set_fid, void *context); + +ssize_t ofi_ep_barrier(struct fid_ep *ep, fi_addr_t coll_addr, void *context); + +ssize_t ofi_ep_allreduce(struct fid_ep *ep, const void *buf, size_t count, void 
*desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, enum fi_op op, uint64_t flags, + void *context); + +ssize_t ofi_ep_allgather(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, uint64_t flags, void *context); + +ssize_t ofi_ep_scatter(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, uint64_t flags, + void *context); + +ssize_t ofi_ep_broadcast(struct fid_ep *ep, void *buf, size_t count, void *desc, + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, void *context); + +int ofi_coll_ep_progress(struct fid_ep *ep); + +void ofi_coll_handle_xfer_comp(uint64_t tag, void *ctx); + + +#endif // _OFI_COLL_H_ diff --git a/include/ofi_cuda.h b/include/ofi_cuda.h new file mode 100644 index 00000000000..564116a40e2 --- /dev/null +++ b/include/ofi_cuda.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
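/*
 * Hedged sketch of the public-API flow that these util_coll internals back
 * (fi_collective(3), available since libfabric 1.9): build an AV set, join
 * it, then issue a barrier on the resulting multicast address.  Assumptions:
 * 'av' and 'ep' are already set up with FI_COLLECTIVE, the first 'nranks' AV
 * entries form the group, and EQ/CQ completion handling (FI_JOIN_COMPLETE,
 * barrier completion) is omitted for brevity.
 */
#include <rdma/fabric.h>
#include <rdma/fi_cm.h>
#include <rdma/fi_collective.h>

static int barrier_example(struct fid_av *av, struct fid_ep *ep, size_t nranks)
{
	struct fi_av_set_attr attr = {
		.count = nranks,
		.start_addr = 0,
		.end_addr = nranks - 1,
		.stride = 1,
	};
	struct fid_av_set *set;
	struct fid_mc *mc;
	fi_addr_t coll_addr;
	int ret;

	ret = fi_av_set(av, &attr, &set, NULL);
	if (ret)
		return ret;

	ret = fi_av_set_addr(set, &coll_addr);
	if (ret)
		return ret;

	ret = fi_join_collective(ep, coll_addr, set, 0, &mc, NULL);
	if (ret)
		return ret;
	/* ... wait for FI_JOIN_COMPLETE on the endpoint's EQ ... */

	return (int) fi_barrier(ep, fi_mc_addr(mc), NULL);
	/* ... then reap the barrier completion and fi_close(&mc->fid) ... */
}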
+ */ +#if HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ + +#ifndef _OFI_CUDA_H_ +#define _OFI_CUDA_H_ +#if HAVE_LIBCUDA + +#include +#include + +static uint64_t +ofi_copy_cuda_iov_buf(const struct iovec *iov, size_t iov_count, + uint64_t iov_offset, void *buf, + uint64_t bufsize, int dir) +{ + uint64_t done = 0, len; + char *iov_buf; + size_t i; + + for (i = 0; i < iov_count && bufsize; i++) { + len = iov[i].iov_len; + + if (iov_offset > len) { + iov_offset -= len; + continue; + } + + iov_buf = (char *)iov[i].iov_base + iov_offset; + len -= iov_offset; + + len = MIN(len, bufsize); + if (dir == OFI_COPY_BUF_TO_IOV) + cudaMemcpy(iov_buf, (char *) buf + done, len, cudaMemcpyHostToDevice); + else if (dir == OFI_COPY_IOV_TO_BUF) + cudaMemcpy((char *) buf + done, iov_buf, len, cudaMemcpyDeviceToHost); + + iov_offset = 0; + bufsize -= len; + done += len; + } + return done; +} + +static inline uint64_t +ofi_copy_from_cuda_iov(void *buf, uint64_t bufsize, + const struct iovec *iov, size_t iov_count, uint64_t iov_offset) +{ + if (iov_count == 1) { + uint64_t size = ((iov_offset > iov[0].iov_len) ? + 0 : MIN(bufsize, iov[0].iov_len - iov_offset)); + + cudaMemcpy(buf, (char *) iov[0].iov_base + iov_offset, + size, cudaMemcpyDeviceToHost); + return size; + } else { + return ofi_copy_cuda_iov_buf(iov, iov_count, iov_offset, buf, + bufsize, OFI_COPY_IOV_TO_BUF); + } +} + +static inline uint64_t +ofi_copy_to_cuda_iov(const struct iovec *iov, size_t iov_count, uint64_t iov_offset, + void *buf, uint64_t bufsize) +{ + if (iov_count == 1) { + uint64_t size = ((iov_offset > iov[0].iov_len) ? + 0 : MIN(bufsize, iov[0].iov_len - iov_offset)); + cudaMemcpy((char *) iov[0].iov_base + iov_offset, + buf, size, cudaMemcpyHostToDevice); + return size; + } else { + return ofi_copy_cuda_iov_buf(iov, iov_count, iov_offset, buf, + bufsize, OFI_COPY_BUF_TO_IOV); + } +} + +#endif /* HAVE_LIBCUDA */ +#endif /* _OFI_CUDA_H_ */ diff --git a/include/ofi_enosys.h b/include/ofi_enosys.h index 58029356c15..5f7148becdf 100644 --- a/include/ofi_enosys.h +++ b/include/ofi_enosys.h @@ -43,6 +43,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -190,6 +191,7 @@ static struct fi_ops_domain X = { .stx_ctx = fi_no_stx_context, .srx_ctx = fi_no_srx_context, .query_atomic = fi_no_query_atomic, + .query_collective = fi_no_query_collective, }; */ int fi_no_av_open(struct fid_domain *domain, struct fi_av_attr *attr, @@ -210,7 +212,8 @@ int fi_no_srx_context(struct fid_domain *domain, struct fi_rx_attr *attr, struct fid_ep **rx_ep, void *context); int fi_no_query_atomic(struct fid_domain *domain, enum fi_datatype datatype, enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags); - +int fi_no_query_collective(struct fid_domain *domain, enum fi_collective_op coll, + struct fi_collective_attr *attr, uint64_t flags); /* static struct fi_ops_mr X = { @@ -454,6 +457,55 @@ int fi_no_av_insertsym(struct fid_av *av, const char *node, size_t nodecnt, int fi_no_av_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, uint64_t flags); +/* +static struct fi_ops_collective X = { + .size = sizeof(struct fi_ops_collective), + .barrier = fi_coll_no_barrier, + .broadcast = fi_coll_no_broadcast, + .alltoall = fi_coll_no_alltoall, + .allreduce = fi_coll_no_allreduce, + .allgather = fi_coll_no_allgather, + .reduce_scatter = fi_coll_no_reduce_scatter, + .reduce = fi_coll_no_reduce, + .scatter = fi_coll_no_scatter, + .gather = fi_coll_no_gather, + .msg = fi_coll_no_msg, +}; +*/ +ssize_t fi_coll_no_barrier(struct fid_ep *ep, 
fi_addr_t coll_addr, void *context); +ssize_t fi_coll_no_broadcast(struct fid_ep *ep, void *buf, size_t count, void *desc, + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, void *context); +ssize_t fi_coll_no_alltoall(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, uint64_t flags, void *context); +ssize_t fi_coll_no_allreduce(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, enum fi_op op, uint64_t flags, + void *context); +ssize_t fi_coll_no_allgather(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, uint64_t flags, void *context); +ssize_t fi_coll_no_reduce_scatter(struct fid_ep *ep, const void *buf, size_t count, + void *desc, void *result, void *result_desc, + fi_addr_t coll_addr, enum fi_datatype datatype, + enum fi_op op, uint64_t flags, void *context); +ssize_t fi_coll_no_reduce(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, enum fi_op op, + uint64_t flags, void *context); +ssize_t fi_coll_no_scatter(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, uint64_t flags, + void *context); +ssize_t fi_coll_no_gather(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, uint64_t flags, + void *context); +ssize_t fi_coll_no_msg(struct fid_ep *ep, const struct fi_msg_collective *msg, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + uint64_t flags); + #ifdef __cplusplus } #endif diff --git a/include/ofi_epoll.h b/include/ofi_epoll.h index def823ed3f3..532b7ae37f4 100644 --- a/include/ofi_epoll.h +++ b/include/ofi_epoll.h @@ -37,27 +37,65 @@ #include #include #include +#include #include #include #include #include +enum ofi_pollfds_ctl { + POLLFDS_CTL_ADD, + POLLFDS_CTL_DEL, + POLLFDS_CTL_MOD, +}; + +struct ofi_pollfds_work_item { + int fd; + uint32_t events; + void *context; + enum ofi_pollfds_ctl type; + struct slist_entry entry; +}; + +struct ofi_pollfds { + int size; + int nfds; + struct pollfd *fds; + void **context; + int index; + struct fd_signal signal; + struct slist work_item_list; + fastlock_t lock; +}; + +int ofi_pollfds_create(struct ofi_pollfds **pfds); +int ofi_pollfds_add(struct ofi_pollfds *pfds, int fd, uint32_t events, + void *context); +int ofi_pollfds_mod(struct ofi_pollfds *pfds, int fd, uint32_t events, + void *context); +int ofi_pollfds_del(struct ofi_pollfds *pfds, int fd); +int ofi_pollfds_wait(struct ofi_pollfds *pfds, void **contexts, + int max_contexts, int timeout); +void ofi_pollfds_close(struct ofi_pollfds *pfds); + + #ifdef HAVE_EPOLL #include -#define FI_EPOLL_IN EPOLLIN -#define FI_EPOLL_OUT EPOLLOUT +#define OFI_EPOLL_IN EPOLLIN +#define OFI_EPOLL_OUT EPOLLOUT -typedef int fi_epoll_t; +typedef int ofi_epoll_t; +#define OFI_EPOLL_INVALID -1 -static inline int fi_epoll_create(int *ep) +static inline int ofi_epoll_create(int *ep) { *ep = epoll_create(4); return *ep < 0 ? 
-ofi_syserr() : 0; } -static inline int fi_epoll_add(int ep, int fd, uint32_t events, void *context) +static inline int ofi_epoll_add(int ep, int fd, uint32_t events, void *context) { struct epoll_event event; int ret; @@ -70,7 +108,7 @@ static inline int fi_epoll_add(int ep, int fd, uint32_t events, void *context) return 0; } -static inline int fi_epoll_mod(int ep, int fd, uint32_t events, void *context) +static inline int ofi_epoll_mod(int ep, int fd, uint32_t events, void *context) { struct epoll_event event; @@ -79,12 +117,12 @@ static inline int fi_epoll_mod(int ep, int fd, uint32_t events, void *context) return epoll_ctl(ep, EPOLL_CTL_MOD, fd, &event) ? -ofi_syserr() : 0; } -static inline int fi_epoll_del(int ep, int fd) +static inline int ofi_epoll_del(int ep, int fd) { return epoll_ctl(ep, EPOLL_CTL_DEL, fd, NULL) ? -ofi_syserr() : 0; } -static inline int fi_epoll_wait(int ep, void **contexts, int max_contexts, +static inline int ofi_epoll_wait(int ep, void **contexts, int max_contexts, int timeout) { struct epoll_event events[max_contexts]; @@ -100,49 +138,29 @@ static inline int fi_epoll_wait(int ep, void **contexts, int max_contexts, return ret; } -static inline void fi_epoll_close(int ep) +static inline void ofi_epoll_close(int ep) { close(ep); } #else -#include -#define FI_EPOLL_IN POLLIN -#define FI_EPOLL_OUT POLLOUT +#define OFI_EPOLL_IN POLLIN +#define OFI_EPOLL_OUT POLLOUT -enum fi_epoll_ctl { - EPOLL_CTL_ADD, - EPOLL_CTL_DEL, - EPOLL_CTL_MOD, -}; +typedef struct ofi_pollfds *ofi_epoll_t; +#define OFI_EPOLL_INVALID NULL -struct fi_epoll_work_item { - int fd; - uint32_t events; - void *context; - enum fi_epoll_ctl type; - struct slist_entry entry; -}; +#define ofi_epoll_create ofi_pollfds_create +#define ofi_epoll_add ofi_pollfds_add +#define ofi_epoll_mod ofi_pollfds_mod +#define ofi_epoll_del ofi_pollfds_del +#define ofi_epoll_wait ofi_pollfds_wait +#define ofi_epoll_close ofi_pollfds_close -typedef struct fi_epoll { - int size; - int nfds; - struct pollfd *fds; - void **context; - int index; - struct fd_signal signal; - struct slist work_item_list; - fastlock_t lock; -} *fi_epoll_t; - -int fi_epoll_create(struct fi_epoll **ep); -int fi_epoll_add(struct fi_epoll *ep, int fd, uint32_t events, void *context); -int fi_epoll_mod(struct fi_epoll *ep, int fd, uint32_t events, void *context); -int fi_epoll_del(struct fi_epoll *ep, int fd); -int fi_epoll_wait(struct fi_epoll *ep, void **contexts, int max_contexts, - int timeout); -void fi_epoll_close(struct fi_epoll *ep); +#define EPOLL_CTL_ADD POLLFDS_CTL_ADD +#define EPOLL_CTL_DEL POLLFDS_CTL_DEL +#define EPOLL_CTL_MOD POLLFDS_CTL_MOD #endif /* HAVE_EPOLL */ diff --git a/include/ofi_hmem.h b/include/ofi_hmem.h new file mode 100644 index 00000000000..cfd9c0a5c3a --- /dev/null +++ b/include/ofi_hmem.h @@ -0,0 +1,206 @@ +/* + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * (C) Copyright 2020-2021 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
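/*
 * Standalone Linux sketch of the pattern ofi_epoll_add()/ofi_epoll_wait()
 * wrap: the caller's context pointer rides in epoll_event.data.ptr and is
 * handed back when the fd becomes ready.
 */
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	struct epoll_event event, out;
	char ctx[] = "pipe-context";
	int ep, fds[2];

	if (pipe(fds) || (ep = epoll_create(4)) < 0)
		return 1;

	memset(&event, 0, sizeof event);
	event.events = EPOLLIN;
	event.data.ptr = ctx;			/* context, as in ofi_epoll_add() */
	if (epoll_ctl(ep, EPOLL_CTL_ADD, fds[0], &event))
		return 1;

	if (write(fds[1], "x", 1) != 1)
		return 1;
	if (epoll_wait(ep, &out, 1, 1000) == 1)	/* one ready fd */
		printf("ready: %s\n", (char *) out.data.ptr);

	close(ep);
	close(fds[0]);
	close(fds[1]);
	return 0;
}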
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _OFI_HMEM_H_ +#define _OFI_HMEM_H_ + +#if HAVE_CONFIG_H +#include +#endif + +#include +#include + +#if HAVE_LIBCUDA + +#include +#include + +/* Libfabric supported CUDA operations. */ +cudaError_t ofi_cudaMemcpy(void* dst, const void* src, size_t count, + enum cudaMemcpyKind kind); +const char *ofi_cudaGetErrorName(cudaError_t error); +const char *ofi_cudaGetErrorString(cudaError_t error); +CUresult ofi_cuPointerGetAttribute(void *data, CUpointer_attribute attribute, + CUdeviceptr ptr); +cudaError_t ofi_cudaHostRegister(void *ptr, size_t size, unsigned int flags); +cudaError_t ofi_cudaHostUnregister(void *ptr); + +#endif /* HAVE_LIBCUDA */ + +#ifdef HAVE_ROCR + +#include + +/* Libfabric support ROCr operations. 
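/*
 * Standalone, host-memory-only sketch of the iov walk that
 * ofi_copy_cuda_iov_buf() performs with cudaMemcpy(): skip 'offset' bytes
 * into the iov list, then copy at most 'bufsize' bytes into a flat buffer.
 */
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

static size_t copy_iov_to_buf(const struct iovec *iov, size_t iov_count,
			      size_t offset, void *buf, size_t bufsize)
{
	size_t done = 0, len, i;

	for (i = 0; i < iov_count && bufsize; i++) {
		len = iov[i].iov_len;
		if (offset > len) {		/* this segment is skipped entirely */
			offset -= len;
			continue;
		}
		len -= offset;
		if (len > bufsize)
			len = bufsize;
		memcpy((char *) buf + done, (char *) iov[i].iov_base + offset, len);
		offset = 0;
		bufsize -= len;
		done += len;
	}
	return done;
}

int main(void)
{
	char a[] = "hello", b[] = "world", out[8] = {0};
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = 5 },
		{ .iov_base = b, .iov_len = 5 },
	};

	/* skip "hel", copy four bytes: "lowo" */
	printf("%zu %s\n", copy_iov_to_buf(iov, 2, 3, out, 4), out);
	return 0;
}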
*/ + +hsa_status_t ofi_hsa_memory_copy(void *dst, const void *src, size_t size); +hsa_status_t ofi_hsa_amd_pointer_info(void *ptr, hsa_amd_pointer_info_t *info, + void *(*alloc)(size_t), + uint32_t *num_agents_accessible, + hsa_agent_t **accessible); +hsa_status_t ofi_hsa_init(void); +hsa_status_t ofi_hsa_shut_down(void); +hsa_status_t ofi_hsa_status_string(hsa_status_t status, + const char **status_string); +const char *ofi_hsa_status_to_string(hsa_status_t status); + +hsa_status_t ofi_hsa_amd_dereg_dealloc_cb(void *ptr, + hsa_amd_deallocation_callback_t cb); +hsa_status_t ofi_hsa_amd_reg_dealloc_cb(void *ptr, + hsa_amd_deallocation_callback_t cb, + void *user_data); + +hsa_status_t ofi_hsa_amd_memory_lock(void *host_ptr, size_t size, + hsa_agent_t *agents, int num_agents, + void **agent_ptr); +hsa_status_t ofi_hsa_amd_memory_unlock(void *host_ptr); + +#endif /* HAVE_ROCR */ + +int rocr_copy_from_dev(uint64_t device, void *dest, const void *src, + size_t size); +int rocr_copy_to_dev(uint64_t device, void *dest, const void *src, + size_t size); +int rocr_hmem_init(void); +int rocr_hmem_cleanup(void); +bool rocr_is_addr_valid(const void *addr); +int rocr_host_register(void *ptr, size_t size); +int rocr_host_unregister(void *ptr); + +int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size); +int cuda_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size); +int cuda_hmem_init(void); +int cuda_hmem_cleanup(void); +bool cuda_is_addr_valid(const void *addr); +int cuda_host_register(void *ptr, size_t size); +int cuda_host_unregister(void *ptr); +int cuda_dev_register(struct fi_mr_attr *mr_attr, uint64_t *handle); +int cuda_dev_unregister(uint64_t handle); + +void cuda_gdrcopy_to_dev(uint64_t handle, void *dev, + const void *host, size_t size); +void cuda_gdrcopy_from_dev(uint64_t handle, void *host, + const void *dev, size_t size); +int cuda_gdrcopy_hmem_init(void); +int cuda_gdrcopy_hmem_cleanup(void); +int cuda_gdrcopy_dev_register(struct fi_mr_attr *mr_attr, uint64_t *handle); +int cuda_gdrcopy_dev_unregister(uint64_t handle); + +#define ZE_MAX_DEVICES 4 +int ze_hmem_copy(uint64_t device, void *dst, const void *src, size_t size); +int ze_hmem_init(void); +int ze_hmem_cleanup(void); +bool ze_is_addr_valid(const void *addr); +int ze_hmem_get_handle(void *dev_buf, void **handle); +int ze_hmem_open_handle(void **handle, uint64_t device, void **ipc_ptr); +int ze_hmem_get_shared_handle(int dev_fd, void *dev_buf, int *ze_fd, + void **handle); +int ze_hmem_open_shared_handle(int dev_fd, void **handle, int *ze_fd, + uint64_t device, void **ipc_ptr); +int ze_hmem_close_handle(void *ipc_ptr); +bool ze_hmem_p2p_enabled(void); +int ze_hmem_get_base_addr(const void *ptr, void **base); +int *ze_hmem_get_dev_fds(int *nfds); + +static inline int ofi_memcpy(uint64_t device, void *dest, const void *src, + size_t size) +{ + memcpy(dest, src, size); + return FI_SUCCESS; +} + +static inline int ofi_hmem_init_noop(void) +{ + return FI_SUCCESS; +} + +static inline int ofi_hmem_cleanup_noop(void) +{ + return FI_SUCCESS; +} + +static inline int ofi_hmem_no_get_handle(void *dev_buffer, void **handle) +{ + return -FI_ENOSYS; +} + +static inline int ofi_hmem_no_open_handle(void **handle, uint64_t device, void **ipc_ptr) +{ + return -FI_ENOSYS; +} + +static inline int ofi_hmem_no_close_handle(void *ipc_ptr) +{ + return -FI_ENOSYS; +} + +static inline int ofi_hmem_register_noop(void *ptr, size_t size) +{ + return FI_SUCCESS; +} + +static inline int ofi_hmem_host_unregister_noop(void 
*ptr) +{ + return FI_SUCCESS; +} + +static inline int ofi_hmem_no_base_addr(const void *ptr, void **base) +{ + return -FI_ENOSYS; +} + +ssize_t ofi_copy_from_hmem_iov(void *dest, size_t size, + enum fi_hmem_iface hmem_iface, uint64_t device, + const struct iovec *hmem_iov, + size_t hmem_iov_count, uint64_t hmem_iov_offset); + +ssize_t ofi_copy_to_hmem_iov(enum fi_hmem_iface hmem_iface, uint64_t device, + const struct iovec *hmem_iov, + size_t hmem_iov_count, uint64_t hmem_iov_offset, + const void *src, size_t size); + +int ofi_hmem_get_handle(enum fi_hmem_iface iface, void *dev_buf, void **handle); +int ofi_hmem_open_handle(enum fi_hmem_iface iface, void **handle, + uint64_t device, void **ipc_ptr); +int ofi_hmem_close_handle(enum fi_hmem_iface iface, void *ipc_ptr); +int ofi_hmem_get_base_addr(enum fi_hmem_iface iface, const void *ptr, + void **base); + +void ofi_hmem_init(void); +void ofi_hmem_cleanup(void); +enum fi_hmem_iface ofi_get_hmem_iface(const void *addr); +int ofi_hmem_host_register(void *ptr, size_t size); +int ofi_hmem_host_unregister(void *ptr); + +#endif /* _OFI_HMEM_H_ */ diff --git a/include/ofi_hook.h b/include/ofi_hook.h index b13bca86985..d7a33cb7013 100644 --- a/include/ofi_hook.h +++ b/include/ofi_hook.h @@ -37,6 +37,7 @@ #include #include +#include #include #include #include diff --git a/include/ofi_indexer.h b/include/ofi_indexer.h index 1b0b09504a9..450324c0cea 100644 --- a/include/ofi_indexer.h +++ b/include/ofi_indexer.h @@ -38,6 +38,7 @@ #include "config.h" #include +#include /* * Indexer: @@ -60,7 +61,7 @@ struct ofi_idx_entry { int next; }; -#define OFI_IDX_INDEX_BITS 16 +#define OFI_IDX_INDEX_BITS 20 #define OFI_IDX_ENTRY_BITS 10 #define OFI_IDX_ENTRY_SIZE (1 << OFI_IDX_ENTRY_BITS) #define OFI_IDX_ARRAY_SIZE (1 << (OFI_IDX_INDEX_BITS - OFI_IDX_ENTRY_BITS)) @@ -79,6 +80,7 @@ struct indexer int ofi_idx_insert(struct indexer *idx, void *item); void *ofi_idx_remove(struct indexer *idx, int index); +void *ofi_idx_remove_ordered(struct indexer *idx, int index); void ofi_idx_replace(struct indexer *idx, int index, void *item); void ofi_idx_reset(struct indexer *idx); @@ -97,6 +99,10 @@ static inline void *ofi_idx_lookup(struct indexer *idx, int index) return ofi_idx_is_valid(idx, index) ? ofi_idx_at(idx, index) : NULL; } +static inline bool ofi_idx_free_list_empty(struct indexer *idx) +{ + return (idx->free_list == 0); +} /* * Index map: * The index map is similar in concept to the indexer. 
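/*
 * Standalone sketch of the indexer idea behind ofi_idx_*: hand out small
 * integer indices for pointers and recycle freed slots through a free list.
 * This toy version has a fixed capacity; the real indexer grows in
 * OFI_IDX_ENTRY_SIZE chunks and, with the change above, addresses 2^20 slots.
 */
#include <stdio.h>

#define CAP 8

struct mini_idx {
	void *item[CAP];
	int next_free[CAP];
	int free_head;
};

static void idx_init(struct mini_idx *idx)
{
	int i;

	idx->free_head = 0;
	for (i = 0; i < CAP; i++) {
		idx->item[i] = NULL;
		idx->next_free[i] = i + 1;	/* CAP means "no free slot" */
	}
}

static int idx_insert(struct mini_idx *idx, void *item)
{
	int i = idx->free_head;

	if (i == CAP)
		return -1;
	idx->free_head = idx->next_free[i];
	idx->item[i] = item;
	return i;
}

static void *idx_remove(struct mini_idx *idx, int i)
{
	void *item = idx->item[i];

	idx->item[i] = NULL;
	idx->next_free[i] = idx->free_head;
	idx->free_head = i;
	return item;
}

int main(void)
{
	struct mini_idx idx;
	char a, b;
	int ia, ib;

	idx_init(&idx);
	ia = idx_insert(&idx, &a);
	ib = idx_insert(&idx, &b);
	printf("%d %d\n", ia, ib);		/* 0 1 */
	idx_remove(&idx, ia);
	printf("%d\n", idx_insert(&idx, &b));	/* 0: freed slot is reused */
	return 0;
}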
It allows the user @@ -120,7 +126,7 @@ struct index_map int ofi_idm_set(struct index_map *idm, int index, void *item); void *ofi_idm_clear(struct index_map *idm, int index); -void ofi_idm_reset(struct index_map *idm); +void ofi_idm_reset(struct index_map *idm, void (*callback)(void *item)); static inline void *ofi_idm_at(struct index_map *idm, int index) { diff --git a/include/ofi_iov.h b/include/ofi_iov.h index e9dde686be3..12cb238ff30 100644 --- a/include/ofi_iov.h +++ b/include/ofi_iov.h @@ -61,6 +61,16 @@ static inline size_t ofi_total_ioc_cnt(const struct fi_ioc *ioc, size_t ioc_coun return cnt; } +static inline size_t ofi_total_rma_iov_len(const struct fi_rma_iov *rma_iov, + size_t iov_count) +{ + size_t i, len = 0; + + for (i = 0; i < iov_count; i++) + len += rma_iov[i].len; + return len; +} + static inline size_t ofi_total_rma_ioc_cnt(const struct fi_rma_ioc *rma_ioc, size_t ioc_count) { @@ -171,7 +181,13 @@ ofi_iov_within(const struct iovec *iov1, const struct iovec *iov2) void ofi_consume_iov(struct iovec *iovec, size_t *iovec_count, size_t offset); -int ofi_truncate_iov(struct iovec *iov, size_t *iov_count, size_t trim_size); +void ofi_consume_iov_desc(struct iovec *iovec, void **desc, + size_t *iovec_count, size_t offset); + +void ofi_consume_rma_iov(struct fi_rma_iov *rma_iov, size_t *rma_iov_count, + size_t length); + +int ofi_truncate_iov(struct iovec *iov, size_t *iov_count, size_t new_size); /* Copy 'len' bytes worth of src iovec to dst */ int ofi_copy_iov_desc(struct iovec *dst_iov, void **dst_desc, size_t *dst_count, diff --git a/include/ofi_list.h b/include/ofi_list.h index 0cbe06b9c8c..1b79d0a2d2c 100644 --- a/include/ofi_list.h +++ b/include/ofi_list.h @@ -712,4 +712,15 @@ static inline int dlistfd_wait_avail(struct dlistfd_head *head, int timeout) return ret ? 
ret : !dlistfd_empty(head); } +static inline struct dlist_entry * +dlistfd_remove_first_match(struct dlistfd_head *head, dlist_func_t *match, + const void *arg) +{ + struct dlist_entry *entry = + dlist_remove_first_match(&head->list, match, arg); + if (entry) + dlistfd_reset(head); + return entry; +} + #endif /* _OFI_LIST_H_ */ diff --git a/include/ofi_lock.h b/include/ofi_lock.h index afde786b0e0..05ed4a1d353 100644 --- a/include/ofi_lock.h +++ b/include/ofi_lock.h @@ -161,6 +161,8 @@ static inline void ofi_fastlock_acquire_noop(fastlock_t *lock) /* These non-op routines must be used only by a single-threaded code*/ assert(!lock->in_use); lock->in_use = 1; +#else + (void) lock; #endif } static inline void ofi_fastlock_release_noop(fastlock_t *lock) @@ -168,6 +170,8 @@ static inline void ofi_fastlock_release_noop(fastlock_t *lock) #if ENABLE_DEBUG assert(lock->in_use); lock->in_use = 0; +#else + (void) lock; #endif } diff --git a/include/ofi_mem.h b/include/ofi_mem.h index 9ffc4ac66a1..03c79fc637c 100644 --- a/include/ofi_mem.h +++ b/include/ofi_mem.h @@ -66,12 +66,14 @@ enum { extern size_t *page_sizes; extern size_t num_page_sizes; -static inline long ofi_get_page_size() +static inline long ofi_get_page_size(void) { return ofi_sysconf(_SC_PAGESIZE); } ssize_t ofi_get_hugepage_size(void); +size_t ofi_get_mem_size(void); + /* We implement memdup to avoid external library dependency */ static inline void *mem_dup(const void *src, size_t size) @@ -98,50 +100,50 @@ static inline int ofi_str_dup(const char *src, char **dst) /* * Buffer pool (free stack) template */ -#define FREESTACK_EMPTY NULL +#define OFI_FREESTACK_EMPTY NULL -#define freestack_get_next(user_buf) ((char *)user_buf - sizeof(void *)) -#define freestack_get_user_buf(entry) ((char *)entry + sizeof(void *)) +#define ofi_freestack_get_next(user_buf) ((char *)user_buf - sizeof(void *)) +#define ofi_freestack_get_user_buf(entry) ((char *)entry + sizeof(void *)) #if ENABLE_DEBUG -#define freestack_init_next(entry) *((void **)entry) = NULL -#define freestack_check_next(entry) assert(*((void **)entry) == NULL) +#define ofi_freestack_init_next(entry) *((void **)entry) = NULL +#define ofi_freestack_check_next(entry) assert(*((void **)entry) == NULL) #else -#define freestack_init_next(entry) -#define freestack_check_next(entry) +#define ofi_freestack_init_next(entry) +#define ofi_freestack_check_next(entry) #endif -#define FREESTACK_HEADER \ +#define OFI_FREESTACK_HEADER \ size_t size; \ void *next; \ -#define freestack_isempty(fs) ((fs)->next == FREESTACK_EMPTY) -#define freestack_push(fs, p) \ +#define ofi_freestack_isempty(fs) ((fs)->next == OFI_FREESTACK_EMPTY) +#define ofi_freestack_push(fs, p) \ do { \ - freestack_check_next(freestack_get_next(p)); \ - *(void **) (freestack_get_next(p)) = (fs)->next; \ - (fs)->next = (freestack_get_next(p)); \ + ofi_freestack_check_next(ofi_freestack_get_next(p)); \ + *(void **) (ofi_freestack_get_next(p)) = (fs)->next; \ + (fs)->next = (ofi_freestack_get_next(p)); \ } while (0) -#define freestack_pop(fs) freestack_pop_impl(fs, (fs)->next) +#define ofi_freestack_pop(fs) ofi_freestack_pop_impl(fs, (fs)->next) -static inline void* freestack_pop_impl(void *fs, void *fs_next) +static inline void* ofi_freestack_pop_impl(void *fs, void *fs_next) { struct _freestack { - FREESTACK_HEADER + OFI_FREESTACK_HEADER } *freestack = (struct _freestack *)fs; - assert(!freestack_isempty(freestack)); + assert(!ofi_freestack_isempty(freestack)); freestack->next = *((void **)fs_next); - freestack_init_next(fs_next); - 
return freestack_get_user_buf(fs_next); + ofi_freestack_init_next(fs_next); + return ofi_freestack_get_user_buf(fs_next); } -#define DECLARE_FREESTACK(entrytype, name) \ +#define OFI_DECLARE_FREESTACK(entrytype, name) \ struct name ## _entry { \ void *next; \ entrytype buf; \ }; \ struct name { \ - FREESTACK_HEADER \ + OFI_FREESTACK_HEADER \ struct name ## _entry entry[]; \ }; \ \ @@ -156,11 +158,11 @@ name ## _init(struct name *fs, size_t size, \ assert(size == roundup_power_of_two(size)); \ assert(sizeof(fs->entry[0].buf) >= sizeof(void *)); \ fs->size = size; \ - fs->next = FREESTACK_EMPTY; \ + fs->next = OFI_FREESTACK_EMPTY; \ for (i = size - 1; i >= 0; i--) { \ if (init) \ init(&fs->entry[i].buf, arg); \ - freestack_push(fs, &fs->entry[i].buf); \ + ofi_freestack_push(fs, &fs->entry[i].buf); \ } \ } \ \ @@ -182,14 +184,16 @@ static inline int name ## _index(struct name *fs, \ entrytype *entry) \ { \ return (int)((struct name ## _entry *) \ - (freestack_get_next(entry)) \ + (ofi_freestack_get_next(entry)) \ - (struct name ## _entry *)fs->entry); \ } \ \ static inline void name ## _free(struct name *fs) \ { \ free(fs); \ -} +} \ +void dummy ## name (void) /* work-around global ; scope */ + /* * Buffer pool (free stack) template for shared memory regions @@ -205,9 +209,9 @@ static inline void name ## _free(struct name *fs) \ #define smr_freestack_push(fs, local_p) \ do { \ void *p = (char **) fs->base_addr + \ - ((char **) freestack_get_next(local_p) - \ + ((char **) ofi_freestack_get_next(local_p) - \ (char **) fs); \ - *(void **) freestack_get_next(local_p) = (fs)->next; \ + *(void **) ofi_freestack_get_next(local_p) = (fs)->next;\ (fs)->next = p; \ } while (0) #define smr_freestack_pop(fs) smr_freestack_pop_impl(fs, fs->next) @@ -225,12 +229,12 @@ static inline void* smr_freestack_pop_impl(void *fs, void *next) (char **) freestack->base_addr); freestack->next = *((void **)local); - freestack_init_next(local); + ofi_freestack_init_next(local); - return freestack_get_user_buf(local); + return ofi_freestack_get_user_buf(local); } -#define DECLARE_SMR_FREESTACK(entrytype, name) \ +#define SMR_DECLARE_FREESTACK(entrytype, name) \ struct name ## _entry { \ void *next; \ entrytype buf; \ @@ -266,14 +270,15 @@ static inline int name ## _index(struct name *fs, \ entrytype *entry) \ { \ return (int)((struct name ## _entry *) \ - (freestack_get_next(entry)) \ + (ofi_freestack_get_next(entry)) \ - (struct name ## _entry *)fs->entry); \ } \ \ static inline void name ## _free(struct name *fs) \ { \ free(fs); \ -} +} \ +void dummy ## name (void) /* work-around global ; scope */ /* @@ -324,9 +329,12 @@ struct ofi_bufpool_region { size_t index; void *context; struct ofi_bufpool *pool; -#ifndef NDEBUG - size_t use_cnt; -#endif + int flags; + OFI_DBG_VAR(size_t, use_cnt) +}; + +struct ofi_bufpool_ftr { + size_t magic; }; struct ofi_bufpool_hdr { @@ -336,6 +344,9 @@ struct ofi_bufpool_hdr { } entry; struct ofi_bufpool_region *region; size_t index; + + OFI_DBG_VAR(struct ofi_bufpool_ftr *, ftr) + OFI_DBG_VAR(size_t, magic) }; int ofi_bufpool_create_attr(struct ofi_bufpool_attr *attr, @@ -387,6 +398,9 @@ static inline void ofi_buf_free(void *buf) { assert(ofi_buf_region(buf)->use_cnt--); assert(!(ofi_buf_pool(buf)->attr.flags & OFI_BUFPOOL_INDEXED)); + assert(ofi_buf_hdr(buf)->magic == OFI_MAGIC_SIZE_T); + assert(ofi_buf_hdr(buf)->ftr->magic == OFI_MAGIC_SIZE_T); + slist_insert_head(&ofi_buf_hdr(buf)->entry.slist, &ofi_buf_pool(buf)->free_list.entries); } @@ -398,13 +412,15 @@ static inline void 
ofi_ibuf_free(void *buf) { struct ofi_bufpool_hdr *buf_hdr; - assert(ofi_buf_pool(buf)->attr.flags & OFI_BUFPOOL_INDEXED); - assert(ofi_buf_region(buf)->use_cnt--); buf_hdr = ofi_buf_hdr(buf); + assert(ofi_buf_region(buf)->use_cnt--); + assert(ofi_buf_pool(buf)->attr.flags & OFI_BUFPOOL_INDEXED); + assert(buf_hdr->magic == OFI_MAGIC_SIZE_T); + assert(buf_hdr->ftr->magic == OFI_MAGIC_SIZE_T); + dlist_insert_order(&buf_hdr->region->free_list, ofi_ibuf_is_lower, &buf_hdr->entry.dlist); - if (dlist_empty(&buf_hdr->region->entry)) { dlist_insert_order(&buf_hdr->region->pool->free_list.regions, ofi_ibufpool_region_is_lower, diff --git a/include/ofi_mr.h b/include/ofi_mr.h index 56cf75ac281..940d4d32f4f 100644 --- a/include/ofi_mr.h +++ b/include/ofi_mr.h @@ -1,5 +1,8 @@ /* * Copyright (c) 2017-2019 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -45,10 +48,12 @@ #include #include #include - +#include struct ofi_mr_info { struct iovec iov; + enum fi_hmem_iface iface; + uint64_t device; }; @@ -96,34 +101,73 @@ static inline uint64_t ofi_mr_get_prov_mode(uint32_t version, } +/* Single lock used by all memory monitors and MR caches. */ +extern pthread_mutex_t mm_lock; +/* The read-write lock is an additional lock used to protect the dlist_entry + * list of ofi_mem_monitor. Due to the necessity of releasing the mm_lock + * while walking the dlist in ofi_monitor_notify, we need a separate lock to + * ensure thread safety. This must be a read-write lock because + * ofi_monitor_notify may be recursive and cannot block multiple walks from + * occurring at the same time. + */ +extern pthread_rwlock_t mm_list_rwlock; + /* * Memory notifier - Report memory mapping changes to address ranges */ struct ofi_mr_cache; +union ofi_mr_hmem_info { + uint64_t cuda_id; +}; + struct ofi_mem_monitor { - fastlock_t lock; struct dlist_entry list; + enum fi_hmem_iface iface; + void (*init)(struct ofi_mem_monitor *monitor); + void (*cleanup)(struct ofi_mem_monitor *monitor); + int (*start)(struct ofi_mem_monitor *monitor); + void (*stop)(struct ofi_mem_monitor *monitor); int (*subscribe)(struct ofi_mem_monitor *notifier, - const void *addr, size_t len); + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info); void (*unsubscribe)(struct ofi_mem_monitor *notifier, - const void *addr, size_t len); + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info); + + /* Valid is a memory monitor operation used to query a memory monitor to + * see if the memory monitor's view of the buffer is still valid. If the + * memory monitor's view of the buffer is no longer valid (e.g. the + * pages behind a given virtual address have changed), the buffer needs + * to be re-registered. 
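/*
 * Illustrative sketch (not part of this patch): how a caller of the MR
 * cache might consult the valid() hook described above before trusting a
 * cached registration.  The wrapper name is hypothetical; the callback
 * signature is the one declared just below.
 */
static inline bool ofi_monitor_view_ok(struct ofi_mem_monitor *monitor,
				       const void *addr, size_t len,
				       union ofi_mr_hmem_info *hmem_info)
{
	/* If the monitor cannot answer, assume the mapping is unchanged. */
	return !monitor->valid ||
	       monitor->valid(monitor, addr, len, hmem_info);
}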
+ */ + bool (*valid)(struct ofi_mem_monitor *notifier, const void *addr, + size_t len, union ofi_mr_hmem_info *hmem_info); }; -void ofi_monitor_init(void); -void ofi_monitor_cleanup(void); -int ofi_monitor_add_cache(struct ofi_mem_monitor *monitor, +void ofi_monitor_init(struct ofi_mem_monitor *monitor); +void ofi_monitor_cleanup(struct ofi_mem_monitor *monitor); +void ofi_monitors_init(void); +void ofi_monitors_cleanup(void); +int ofi_monitors_add_cache(struct ofi_mem_monitor **monitors, struct ofi_mr_cache *cache); -void ofi_monitor_del_cache(struct ofi_mr_cache *cache); +void ofi_monitors_del_cache(struct ofi_mr_cache *cache); void ofi_monitor_notify(struct ofi_mem_monitor *monitor, const void *addr, size_t len); +void ofi_monitor_flush(struct ofi_mem_monitor *monitor); int ofi_monitor_subscribe(struct ofi_mem_monitor *monitor, - const void *addr, size_t len); + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info); void ofi_monitor_unsubscribe(struct ofi_mem_monitor *monitor, - const void *addr, size_t len); + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info); + +extern struct ofi_mem_monitor *default_monitor; +extern struct ofi_mem_monitor *default_cuda_monitor; +extern struct ofi_mem_monitor *default_rocr_monitor; /* * Userfault fd memory monitor @@ -134,11 +178,21 @@ struct ofi_uffd { int fd; }; -int ofi_uffd_init(void); -void ofi_uffd_cleanup(void); - extern struct ofi_mem_monitor *uffd_monitor; +/* + * Memory intercept call memory monitor + */ +struct ofi_memhooks { + struct ofi_mem_monitor monitor; + struct dlist_entry intercept_list; +}; + +extern struct ofi_mem_monitor *memhooks_monitor; + +extern struct ofi_mem_monitor *cuda_monitor; + +extern struct ofi_mem_monitor *rocr_monitor; /* * Used to store registered memory regions into a lookup map. 
This @@ -151,7 +205,7 @@ struct ofi_mr_map { const struct fi_provider *prov; struct ofi_rbmap *rbtree; uint64_t key; - enum fi_mr_mode mode; + int mode; }; int ofi_mr_map_init(const struct fi_provider *in_prov, int mode, @@ -178,13 +232,18 @@ struct ofi_mr { struct util_domain *domain; uint64_t key; uint64_t flags; + enum fi_hmem_iface iface; + uint64_t device; }; +void ofi_mr_update_attr(uint32_t user_version, uint64_t caps, + const struct fi_mr_attr *user_attr, + struct fi_mr_attr *cur_abi_attr); int ofi_mr_close(struct fid *fid); int ofi_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, uint64_t flags, struct fid_mr **mr_fid); int ofi_mr_regv(struct fid *fid, const struct iovec *iov, - size_t count, uint64_t access, uint64_t offset, + size_t count, uint64_t access, uint64_t offset, uint64_t requested_key, uint64_t flags, struct fid_mr **mr_fid, void *context); int ofi_mr_reg(struct fid *fid, const void *buf, size_t len, @@ -200,50 +259,34 @@ int ofi_mr_verify(struct ofi_mr_map *map, ssize_t len, struct ofi_mr_cache_params { size_t max_cnt; size_t max_size; - int merge_regions; + char * monitor; + int cuda_monitor_enabled; + int rocr_monitor_enabled; }; extern struct ofi_mr_cache_params cache_params; struct ofi_mr_entry { struct ofi_mr_info info; - unsigned int cached:1; - unsigned int subscribed:1; + struct ofi_rbnode *node; int use_cnt; - struct dlist_entry lru_entry; + struct dlist_entry list_entry; + union ofi_mr_hmem_info hmem_info; uint8_t data[]; }; -enum ofi_mr_storage_type { - OFI_MR_STORAGE_DEFAULT = 0, - OFI_MR_STORAGE_RBT, - OFI_MR_STORAGE_USER, -}; - -struct ofi_mr_storage { - enum ofi_mr_storage_type type; - void *storage; - - struct ofi_mr_entry * (*find)(struct ofi_mr_storage *storage, - const struct ofi_mr_info *key); - struct ofi_mr_entry * (*overlap)(struct ofi_mr_storage *storage, - const struct iovec *key); - int (*insert)(struct ofi_mr_storage *storage, - struct ofi_mr_info *key, - struct ofi_mr_entry *entry); - int (*erase)(struct ofi_mr_storage *storage, - struct ofi_mr_entry *entry); - void (*destroy)(struct ofi_mr_storage *storage); -}; +#define OFI_HMEM_MAX 4 struct ofi_mr_cache { struct util_domain *domain; - struct ofi_mem_monitor *monitor; - struct dlist_entry notify_entry; + struct ofi_mem_monitor *monitors[OFI_HMEM_MAX]; + struct dlist_entry notify_entries[OFI_HMEM_MAX]; size_t entry_data_size; - struct ofi_mr_storage storage; + struct ofi_rbmap tree; struct dlist_entry lru_list; + struct dlist_entry flush_list; + pthread_mutex_t lock; size_t cached_cnt; size_t cached_size; @@ -261,15 +304,35 @@ struct ofi_mr_cache { struct ofi_mr_entry *entry); }; -int ofi_mr_cache_init(struct util_domain *domain, struct ofi_mem_monitor *monitor, +int ofi_mr_cache_init(struct util_domain *domain, + struct ofi_mem_monitor **monitors, struct ofi_mr_cache *cache); void ofi_mr_cache_cleanup(struct ofi_mr_cache *cache); void ofi_mr_cache_notify(struct ofi_mr_cache *cache, const void *addr, size_t len); -bool ofi_mr_cache_flush(struct ofi_mr_cache *cache); +bool ofi_mr_cache_flush(struct ofi_mr_cache *cache, bool flush_lru); + int ofi_mr_cache_search(struct ofi_mr_cache *cache, const struct fi_mr_attr *attr, struct ofi_mr_entry **entry); +/** + * Given an attr (with an iov range), if the iov range is already registered, + * return the corresponding ofi_mr_entry. Otherwise, return NULL. + * The caller must call ofi_mr_cache_delete on the entry before cleanup if + * the returned entry is not NULL. 
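/*
 * Illustrative pairing of find and delete for the API documented above
 * (sketch only, error handling trimmed; attr describes the iov range the
 * caller wants registered):
 *
 *	struct ofi_mr_entry *entry;
 *
 *	entry = ofi_mr_cache_find(cache, &attr);
 *	if (entry) {
 *		// region already registered: use entry->info.iov, entry->data
 *		ofi_mr_cache_delete(cache, entry);	// required before cleanup
 *	}
 */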
+ * + * @param[in] cache The cache the entry belongs to + * @param[in] attr Information about the region to search + * + * @returns entry The registered entry corresponding to the + * region described in attr + * @returns NULL The region described in attr is not registered + * with the cache. + */ +struct ofi_mr_entry *ofi_mr_cache_find(struct ofi_mr_cache *cache, + const struct fi_mr_attr *attr); +int ofi_mr_cache_reg(struct ofi_mr_cache *cache, const struct fi_mr_attr *attr, + struct ofi_mr_entry **entry); void ofi_mr_cache_delete(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry); diff --git a/include/ofi_net.h b/include/ofi_net.h index e3117fced1c..ee0a6f8c4aa 100644 --- a/include/ofi_net.h +++ b/include/ofi_net.h @@ -125,24 +125,54 @@ int ofi_discard_socket(SOCKET sock, size_t len); #define AF_IB 27 #endif +#define OFI_ADDRSTRLEN (INET6_ADDRSTRLEN + 50) + +/* values taken from librdmacm/rdma_cma.h */ +#define OFI_IB_IP_PS_MASK 0xFFFFFFFFFFFF0000ULL +#define OFI_IB_IP_PORT_MASK 0x000000000000FFFFULL + +struct ofi_sockaddr_ib { + unsigned short int sib_family; /* AF_IB */ + uint16_t sib_pkey; + uint32_t sib_flowinfo; + uint8_t sib_addr[16]; + uint64_t sib_sid; + uint64_t sib_sid_mask; + uint64_t sib_scope_id; +}; + +enum ofi_rdma_port_space { + OFI_RDMA_PS_IPOIB = 0x0002, + OFI_RDMA_PS_IB = 0x013F, + OFI_RDMA_PS_TCP = 0x0106, + OFI_RDMA_PS_UDP = 0x0111, +}; + union ofi_sock_ip { - struct sockaddr sa; - struct sockaddr_in sin; - struct sockaddr_in6 sin6; - uint8_t align[32]; + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + struct ofi_sockaddr_ib sib; + uint8_t align[48]; }; struct ofi_addr_list_entry { - char ipstr[INET6_ADDRSTRLEN]; - union ofi_sock_ip ipaddr; - size_t speed; - struct slist_entry entry; + struct slist_entry entry; + char ipstr[INET6_ADDRSTRLEN]; + union ofi_sock_ip ipaddr; + size_t speed; + char net_name[OFI_ADDRSTRLEN]; + char ifa_name[OFI_ADDRSTRLEN]; + uint64_t comm_caps; }; int ofi_addr_cmp(const struct fi_provider *prov, const struct sockaddr *sa1, const struct sockaddr *sa2); int ofi_getifaddrs(struct ifaddrs **ifap); -void ofi_get_list_of_addr(struct fi_provider *prov, const char *env_name, + +void ofi_set_netmask_str(char *netstr, size_t len, struct ifaddrs *ifa); + +void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, struct slist *addr_list); void ofi_free_list_of_addr(struct slist *addr_list); @@ -153,7 +183,7 @@ void ofi_free_list_of_addr(struct slist *addr_list); #define ofi_sin6_addr(addr) (((struct sockaddr_in6 *)(addr))->sin6_addr) #define ofi_sin6_port(addr) (((struct sockaddr_in6 *)(addr))->sin6_port) -#define OFI_ADDRSTRLEN (INET6_ADDRSTRLEN + 50) +#define ofi_sib_addr(addr) (((struct ofi_sockaddr_ib *)(addr))->sib_addr) static inline size_t ofi_sizeofaddr(const struct sockaddr *addr) { @@ -162,8 +192,10 @@ static inline size_t ofi_sizeofaddr(const struct sockaddr *addr) return sizeof(struct sockaddr_in); case AF_INET6: return sizeof(struct sockaddr_in6); + case AF_IB: + return sizeof(struct ofi_sockaddr_ib); default: - FI_WARN(&core_prov, FI_LOG_CORE, "Unknown address format"); + FI_WARN(&core_prov, FI_LOG_CORE, "Unknown address format\n"); return 0; } } @@ -175,8 +207,10 @@ static inline size_t ofi_sizeofip(const struct sockaddr *addr) return sizeof(struct in_addr); case AF_INET6: return sizeof(struct in6_addr); + case AF_IB: + return sizeof(ofi_sib_addr(addr)); default: - FI_WARN(&core_prov, FI_LOG_CORE, "Unknown address format"); + FI_WARN(&core_prov, FI_LOG_CORE, "Unknown address 
format\n"); return 0; } } @@ -197,42 +231,54 @@ static inline int ofi_translate_addr_format(int family) uint16_t ofi_get_sa_family(const struct fi_info *info); -static inline int ofi_ipv4_is_any_addr(struct sockaddr *sa) +static inline bool ofi_sin_is_any_addr(const struct sockaddr *sa) { struct in_addr ia_any = { .s_addr = INADDR_ANY, }; if (!sa) - return 0; + return false; return !memcmp(&ofi_sin_addr(sa).s_addr, &ia_any, sizeof(ia_any)); } -static inline int ofi_ipv6_is_any_addr(struct sockaddr *sa) +static inline bool ofi_sin6_is_any_addr(const struct sockaddr *sa) { struct in6_addr ia6_any = IN6ADDR_ANY_INIT; if (!sa) - return 0; + return false; return !memcmp(&ofi_sin6_addr(sa), &ia6_any, sizeof(ia6_any)); } -static inline int ofi_is_any_addr(struct sockaddr *sa) +static inline bool ofi_sib_is_any_addr(const struct sockaddr *sa) { + struct in6_addr ia6_any = IN6ADDR_ANY_INIT; + if (!sa) - return 0; + return false; + + return !memcmp(&ofi_sib_addr(sa), &ia6_any, sizeof(ia6_any)); +} + +static inline bool ofi_is_any_addr(const struct sockaddr *sa) +{ + if (!sa) + return false; switch(sa->sa_family) { case AF_INET: - return ofi_ipv4_is_any_addr(sa); + return ofi_sin_is_any_addr(sa); case AF_INET6: - return ofi_ipv6_is_any_addr(sa); + return ofi_sin6_is_any_addr(sa); + case AF_IB: + return ofi_sib_is_any_addr(sa); default: FI_WARN(&core_prov, FI_LOG_CORE, "Unknown address format!\n"); - return 0; + return false; } } @@ -246,15 +292,18 @@ static inline uint16_t ofi_addr_get_port(const struct sockaddr *addr) return ntohs(ofi_sin_port((const struct sockaddr_in *) addr)); case AF_INET6: return ntohs(ofi_sin6_port((const struct sockaddr_in6 *) addr)); + case AF_IB: + return (uint16_t)ntohll(((const struct ofi_sockaddr_ib *)addr)->sib_sid); default: FI_WARN(&core_prov, FI_LOG_FABRIC, "Unknown address format\n"); - assert(0); return 0; } } static inline void ofi_addr_set_port(struct sockaddr *addr, uint16_t port) { + struct ofi_sockaddr_ib *sib; + switch (ofi_sa_family(addr)) { case AF_INET: ofi_sin_port(addr) = htons(port); @@ -262,6 +311,11 @@ static inline void ofi_addr_set_port(struct sockaddr *addr, uint16_t port) case AF_INET6: ofi_sin6_port(addr) = htons(port); break; + case AF_IB: + sib = (struct ofi_sockaddr_ib *)addr; + sib->sib_sid = htonll(((uint64_t)OFI_RDMA_PS_IB << 16) + ntohs(port)); + sib->sib_sid_mask = htonll(OFI_IB_IP_PS_MASK | OFI_IB_IP_PORT_MASK); + break; default: FI_WARN(&core_prov, FI_LOG_FABRIC, "Unknown address format\n"); assert(0); @@ -275,16 +329,23 @@ static inline void * ofi_get_ipaddr(const struct sockaddr *addr) return &ofi_sin_addr((const struct sockaddr_in *) addr); case AF_INET6: return &ofi_sin6_addr((const struct sockaddr_in6 *) addr); + case AF_IB: + return &ofi_sib_addr((const struct ofi_sockaddr_ib *) addr); default: return NULL; } } -static inline int ofi_equals_ipaddr(const struct sockaddr *addr1, +static inline bool ofi_valid_dest_ipaddr(const struct sockaddr *addr) +{ + return ofi_addr_get_port(addr) && !ofi_is_any_addr(addr); +} + +static inline bool ofi_equals_ipaddr(const struct sockaddr *addr1, const struct sockaddr *addr2) { if (addr1->sa_family != addr2->sa_family) - return 0; + return false; switch (addr1->sa_family) { case AF_INET: @@ -293,20 +354,23 @@ static inline int ofi_equals_ipaddr(const struct sockaddr *addr1, case AF_INET6: return !memcmp(&ofi_sin6_addr(addr1), &ofi_sin6_addr(addr2), sizeof(ofi_sin6_addr(addr1))); + case AF_IB: + return !memcmp(&ofi_sib_addr(addr1), &ofi_sib_addr(addr2), + sizeof(ofi_sib_addr(addr1))); default: - 
return 0; + return false; } } -static inline int ofi_equals_sockaddr(const struct sockaddr *addr1, - const struct sockaddr *addr2) +static inline bool ofi_equals_sockaddr(const struct sockaddr *addr1, + const struct sockaddr *addr2) { return (ofi_addr_get_port(addr1) == ofi_addr_get_port(addr2)) && ofi_equals_ipaddr(addr1, addr2); } -int ofi_is_wildcard_listen_addr(const char *node, const char *service, - uint64_t flags, const struct fi_info *hints); +bool ofi_is_wildcard_listen_addr(const char *node, const char *service, + uint64_t flags, const struct fi_info *hints); size_t ofi_mask_addr(struct sockaddr *maskaddr, const struct sockaddr *srcaddr, const struct sockaddr *netmask); @@ -317,6 +381,7 @@ size_t ofi_mask_addr(struct sockaddr *maskaddr, const struct sockaddr *srcaddr, */ const char *ofi_straddr(char *buf, size_t *len, uint32_t addr_format, const void *addr); +uint32_t ofi_addr_format(const char *str); /* Returns allocated address to caller. Caller must free. */ int ofi_str_toaddr(const char *str, uint32_t *addr_format, diff --git a/include/ofi_osd.h b/include/ofi_osd.h index 97be08b7bdb..31bbc08da30 100644 --- a/include/ofi_osd.h +++ b/include/ofi_osd.h @@ -107,4 +107,18 @@ static inline int ofi_detect_endianness(void) } } +#define OFI_MAGIC_64 (0x0F1C0DE0F1C0DE64) +#define OFI_MAGIC_PTR ((void *) (uintptr_t) OFI_MAGIC_64) +#define OFI_MAGIC_SIZE_T ((size_t) OFI_MAGIC_64) + +#ifndef NDEBUG +#define OFI_DBG_VAR(type, name) type name; +#define OFI_DBG_SET(name, val) name = val +#define OFI_DBG_ADD(name, val) name += val +#else +#define OFI_DBG_VAR(type, name) +#define OFI_DBG_SET(name, val) +#define OFI_DBG_ADD(name, val) +#endif + #endif /* _OFI_OSD_H_ */ diff --git a/include/ofi_prov.h b/include/ofi_prov.h index 2fa887b4be9..ff9c1fbe1bc 100644 --- a/include/ofi_prov.h +++ b/include/ofi_prov.h @@ -103,6 +103,17 @@ PSM2_INI ; # define PSM2_INIT NULL #endif +#if (HAVE_PSM3) && (HAVE_PSM3_DL) +# define PSM3_INI FI_EXT_INI +# define PSM3_INIT NULL +#elif (HAVE_PSM3) +# define PSM3_INI INI_SIG(fi_psm3_ini) +# define PSM3_INIT fi_psm3_ini() +PSM3_INI ; +#else +# define PSM3_INIT NULL +#endif + #if (HAVE_SOCKETS) && (HAVE_SOCKETS_DL) # define SOCKETS_INI FI_EXT_INI # define SOCKETS_INIT NULL @@ -125,17 +136,6 @@ USNIC_INI ; # define USNIC_INIT NULL #endif -#if (HAVE_MLX) && (HAVE_MLX_DL) -# define MLX_INI FI_EXT_INI -# define MLX_INIT NULL -#elif (HAVE_MLX) -# define MLX_INI INI_SIG(fi_mlx_ini) -# define MLX_INIT fi_mlx_ini() -MLX_INI ; -#else -# define MLX_INIT NULL -#endif - #if (HAVE_UDP) && (HAVE_UDP_DL) # define UDP_INI FI_EXT_INI # define UDP_INIT NULL diff --git a/include/ofi_rbuf.h b/include/ofi_rbuf.h index 5f2e6e38bec..3d8e1c77cdb 100644 --- a/include/ofi_rbuf.h +++ b/include/ofi_rbuf.h @@ -82,7 +82,8 @@ static inline struct name * name ## _create(size_t size) \ static inline void name ## _free(struct name *cq) \ { \ free(cq); \ -} +} \ +void dummy ## name (void) /* work-around global ; scope */ #define ofi_cirque_isempty(cq) ((cq)->wcnt == (cq)->rcnt) #define ofi_cirque_usedcnt(cq) ((cq)->wcnt - (cq)->rcnt) @@ -91,8 +92,10 @@ static inline void name ## _free(struct name *cq) \ #define ofi_cirque_rindex(cq) ((cq)->rcnt & (cq)->size_mask) #define ofi_cirque_windex(cq) ((cq)->wcnt & (cq)->size_mask) +#define ofi_cirque_tindex(cq) (((cq)->wcnt - 1) & (cq)->size_mask) #define ofi_cirque_head(cq) (&(cq)->buf[ofi_cirque_rindex(cq)]) -#define ofi_cirque_tail(cq) (&(cq)->buf[ofi_cirque_windex(cq)]) +#define ofi_cirque_tail(cq) (&(cq)->buf[ofi_cirque_tindex(cq)]) +#define 
ofi_cirque_next(cq) (&(cq)->buf[ofi_cirque_windex(cq)]) #define ofi_cirque_insert(cq, x) (cq)->buf[(cq)->wcnt++ & (cq)->size_mask] = x #define ofi_cirque_remove(cq) (&(cq)->buf[(cq)->rcnt++ & (cq)->size_mask]) #define ofi_cirque_discard(cq) ((cq)->rcnt++) diff --git a/include/ofi_recvwin.h b/include/ofi_recvwin.h index 917a0c34e99..468c657f8b4 100644 --- a/include/ofi_recvwin.h +++ b/include/ofi_recvwin.h @@ -49,11 +49,11 @@ #include #include -#define OFI_DECL_RECVWIN_BUF(entrytype, name) \ +#define OFI_DECL_RECVWIN_BUF(entrytype, name, id_type) \ OFI_DECLARE_CIRQUE(entrytype, recvwin_cirq); \ struct name { \ - uint64_t exp_msg_id; \ - unsigned int win_size; \ + id_type exp_msg_id; \ + id_type win_size; \ struct recvwin_cirq *pending; \ }; \ \ @@ -74,17 +74,35 @@ ofi_recvwin_free(struct name *recvq) \ } \ \ static inline int \ -ofi_recvwin_queue_msg(struct name *recvq, entrytype * msg, uint64_t id) \ +ofi_recvwin_id_valid(struct name *recvq, id_type id) \ { \ - int write_idx; \ + return ofi_recvwin_id_valid_ ## id_type (recvq, id); \ +} \ + \ +static inline int \ +ofi_recvwin_queue_msg(struct name *recvq, entrytype * msg, id_type id) \ +{ \ + size_t write_idx; \ \ - assert(ofi_recvwin_is_allowed(recvq, id)); \ + assert(ofi_recvwin_id_valid(recvq, id)); \ write_idx = (ofi_cirque_rindex(recvq->pending) \ + (id - recvq->exp_msg_id)) \ & recvq->pending->size_mask; \ recvq->pending->buf[write_idx] = *msg; \ ofi_cirque_commit(recvq->pending); \ return 0; \ +} \ + \ +static inline entrytype * \ +ofi_recvwin_get_msg(struct name *recvq, id_type id) \ +{ \ + size_t read_idx; \ + \ + assert(ofi_recvwin_id_valid(recvq, id)); \ + read_idx = (ofi_cirque_rindex(recvq->pending) \ + + (id - recvq->exp_msg_id)) \ + & recvq->pending->size_mask; \ + return &recvq->pending->buf[read_idx]; \ } \ \ static inline entrytype * \ @@ -111,8 +129,14 @@ ofi_recvwin_slide(struct name *recvq) \ #define ofi_recvwin_exp_inc(rq) ((rq)->exp_msg_id++) #define ofi_recvwin_is_exp(rq, id) ((rq)->exp_msg_id == id) #define ofi_recvwin_next_exp_id(rq) ((rq)->exp_msg_id) -#define ofi_recvwin_is_delayed(rq, id) ((rq)->exp_msg_id > id) -#define ofi_recvwin_is_allowed(rq, id) (id >= rq->exp_msg_id \ - && id < (rq->win_size + rq->exp_msg_id)) +/* + * When exp_msg_id on the receiver has not wrapped around but the sender ID has + * we need to allow the IDs starting from 0 that are valid. These macros use + * the overflow of exp_msg_id to validate that. + */ +#define ofi_recvwin_id_valid_uint32_t(rq, id) \ + ofi_val32_inrange(rq->exp_msg_id, rq->win_size, id) +#define ofi_recvwin_id_valid_uint64_t(rq, id) \ + ofi_val64_inrange(rq->exp_msg_id, rq->win_size, id) #endif /* FI_RECVWIN_H */ diff --git a/include/ofi_shm.h b/include/ofi_shm.h index 0f5351c19a3..8bbb2ec72fa 100644 --- a/include/ofi_shm.h +++ b/include/ofi_shm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 Intel Corporation. All rights reserved. + * Copyright (c) 2016-2021 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -37,11 +37,13 @@ #include #include +#include #include #include #include #include +#include #include @@ -64,6 +66,7 @@ extern "C" { #define SMR_FLAG_DEBUG (0 << 1) #endif +#define SMR_FLAG_IPC_SOCK (1 << 2) #define SMR_CMD_SIZE 128 /* align with 64-byte cache line */ @@ -72,17 +75,31 @@ enum { smr_src_inline, /* command data */ smr_src_inject, /* inject buffers */ smr_src_iov, /* reference iovec via CMA */ + smr_src_mmap, /* mmap-based fallback protocol */ + smr_src_sar, /* segmentation fallback protocol */ + smr_src_ipc, /* device IPC handle protocol */ }; +//reserves 0-255 for defined ops and room for new ops +//256 and beyond reserved for ctrl ops +#define SMR_OP_MAX (1 << 8) + #define SMR_REMOTE_CQ_DATA (1 << 0) #define SMR_RMA_REQ (1 << 1) #define SMR_TX_COMPLETION (1 << 2) #define SMR_RX_COMPLETION (1 << 3) #define SMR_MULTI_RECV (1 << 4) -/* +/* CMA capability */ +enum { + SMR_CMA_CAP_NA, + SMR_CMA_CAP_ON, + SMR_CMA_CAP_OFF, +}; + +/* * Unique smr_op_hdr for smr message protocol: - * addr - local fi_addr of peer sending msg (for shm lookup) + * addr - local shm_id of peer sending msg (for shm lookup) * op - type of op (ex. ofi_op_msg, defined in ofi_proto.h) * op_src - msg src (ex. smr_src_inline, defined above) * op_flags - operation flags (ex. SMR_REMOTE_CQ_DATA, defined above) @@ -91,7 +108,7 @@ enum { */ struct smr_msg_hdr { uint64_t msg_id; - fi_addr_t addr; + int64_t id; uint32_t op; uint16_t op_src; uint16_t op_flags; @@ -108,19 +125,37 @@ struct smr_msg_hdr { }; }; -#define SMR_MSG_DATA_LEN (128 - sizeof(struct smr_msg_hdr)) +#define SMR_MSG_DATA_LEN (SMR_CMD_SIZE - sizeof(struct smr_msg_hdr)) #define SMR_COMP_DATA_LEN (SMR_MSG_DATA_LEN / 2) + +#define IPC_HANDLE_SIZE 64 +struct smr_ipc_info { + uint64_t iface; + union { + uint8_t ipc_handle[IPC_HANDLE_SIZE]; + struct { + uint64_t device; + uint64_t offset; + uint64_t fd_handle; + }; + }; +}; + union smr_cmd_data { uint8_t msg[SMR_MSG_DATA_LEN]; struct { - uint8_t iov_count; - struct iovec iov[(SMR_MSG_DATA_LEN - 8) / + size_t iov_count; + struct iovec iov[(SMR_MSG_DATA_LEN - sizeof(size_t)) / sizeof(struct iovec)]; }; struct { uint8_t buf[SMR_COMP_DATA_LEN]; uint8_t comp[SMR_COMP_DATA_LEN]; }; + struct { + uint64_t sar; + }; + struct smr_ipc_info ipc_info; }; struct smr_cmd_msg { @@ -148,25 +183,46 @@ struct smr_cmd { #define SMR_INJECT_SIZE 4096 #define SMR_COMP_INJECT_SIZE (SMR_INJECT_SIZE / 2) +#define SMR_SAR_SIZE 16384 + +#define SMR_NAME_MAX 256 +#define SMR_SOCK_NAME_MAX sizeof(((struct sockaddr_un *)0)->sun_path) -#define SMR_NAME_SIZE 32 struct smr_addr { - char name[SMR_NAME_SIZE]; - fi_addr_t addr; + char name[SMR_NAME_MAX]; + int64_t id; +}; + +struct smr_peer_data { + struct smr_addr addr; + uint32_t sar_status; + uint32_t name_sent; }; +extern struct dlist_entry ep_name_list; +extern pthread_mutex_t ep_list_lock; + struct smr_region; +struct smr_ep_name { + char name[SMR_NAME_MAX]; + struct smr_region *region; + struct dlist_entry entry; +}; + struct smr_peer { struct smr_addr peer; + fi_addr_t fiaddr; struct smr_region *region; }; #define SMR_MAX_PEERS 256 struct smr_map { - fastlock_t lock; - struct smr_peer peers[SMR_MAX_PEERS]; + fastlock_t lock; + int64_t cur_id; + struct ofi_rbmap rbmap; + struct smr_peer peers[SMR_MAX_PEERS]; }; struct smr_region { @@ -174,6 +230,9 @@ struct smr_region { uint8_t resv; uint16_t flags; int pid; + uint8_t cma_cap_peer; + uint8_t cma_cap_self; + void *base_addr; fastlock_t lock; /* lock for shm access Must hold 
smr->lock before tx/rx cq locks in order to progress or post recv */ @@ -186,13 +245,16 @@ struct smr_region { Might not always be paired consistently with cmd alloc/free depending on protocol (Ex. unexpected messages, RMA requests) */ + size_t sar_cnt; /* offsets from start of smr_region */ size_t cmd_queue_offset; size_t resp_queue_offset; size_t inject_pool_offset; - size_t peer_addr_offset; + size_t sar_pool_offset; + size_t peer_data_offset; size_t name_offset; + size_t sock_name_offset; }; struct smr_resp { @@ -210,9 +272,24 @@ struct smr_inject_buf { }; }; +enum { + SMR_SAR_FREE = 0, /* buffer can be used */ + SMR_SAR_READY, /* buffer has data in it */ +}; + +struct smr_sar_buf { + uint64_t status; + uint8_t buf[SMR_SAR_SIZE]; +}; + +struct smr_sar_msg { + struct smr_sar_buf sar[2]; +}; + OFI_DECLARE_CIRQUE(struct smr_cmd, smr_cmd_queue); OFI_DECLARE_CIRQUE(struct smr_resp, smr_resp_queue); -DECLARE_SMR_FREESTACK(struct smr_inject_buf, smr_inject_pool); +SMR_DECLARE_FREESTACK(struct smr_inject_buf, smr_inject_pool); +SMR_DECLARE_FREESTACK(struct smr_sar_msg, smr_sar_pool); static inline struct smr_region *smr_peer_region(struct smr_region *smr, int i) { @@ -230,15 +307,24 @@ static inline struct smr_inject_pool *smr_inject_pool(struct smr_region *smr) { return (struct smr_inject_pool *) ((char *) smr + smr->inject_pool_offset); } -static inline struct smr_addr *smr_peer_addr(struct smr_region *smr) +static inline struct smr_peer_data *smr_peer_data(struct smr_region *smr) +{ + return (struct smr_peer_data *) ((char *) smr + smr->peer_data_offset); +} +static inline struct smr_sar_pool *smr_sar_pool(struct smr_region *smr) { - return (struct smr_addr *) ((char *) smr + smr->peer_addr_offset); + return (struct smr_sar_pool *) ((char *) smr + smr->sar_pool_offset); } static inline const char *smr_name(struct smr_region *smr) { return (const char *) smr + smr->name_offset; } +static inline char *smr_sock_name(struct smr_region *smr) +{ + return (char *) smr + smr->sock_name_offset; +} + static inline void smr_set_map(struct smr_region *smr, struct smr_map *map) { smr->map = map; @@ -250,22 +336,29 @@ struct smr_attr { size_t tx_count; }; +size_t smr_calculate_size_offsets(size_t tx_count, size_t rx_count, + size_t *cmd_offset, size_t *resp_offset, + size_t *inject_offset, size_t *sar_offset, + size_t *peer_offset, size_t *name_offset, + size_t *sock_offset); +void smr_cma_check(struct smr_region *region, struct smr_region *peer_region); +void smr_cleanup(void); int smr_map_create(const struct fi_provider *prov, int peer_count, struct smr_map **map); int smr_map_to_region(const struct fi_provider *prov, struct smr_peer *peer_buf); -void smr_map_to_endpoint(struct smr_region *region, int index); -void smr_unmap_from_endpoint(struct smr_region *region, int index); +void smr_map_to_endpoint(struct smr_region *region, int64_t id); +void smr_unmap_from_endpoint(struct smr_region *region, int64_t id); void smr_exchange_all_peers(struct smr_region *region); int smr_map_add(const struct fi_provider *prov, - struct smr_map *map, const char *name, int id); -void smr_map_del(struct smr_map *map, int id); + struct smr_map *map, const char *name, int64_t *id); +void smr_map_del(struct smr_map *map, int64_t id); void smr_map_free(struct smr_map *map); -struct smr_region *smr_map_get(struct smr_map *map, int id); +struct smr_region *smr_map_get(struct smr_map *map, int64_t id); int smr_create(const struct fi_provider *prov, struct smr_map *map, - const struct smr_attr *attr, struct smr_region **smr); + 
const struct smr_attr *attr, struct smr_region *volatile *smr); void smr_free(struct smr_region *smr); #ifdef __cplusplus diff --git a/include/ofi_signal.h b/include/ofi_signal.h index b4f3ff36680..fa1fabdc74a 100644 --- a/include/ofi_signal.h +++ b/include/ofi_signal.h @@ -39,11 +39,13 @@ #include #include #include +#include #include #include #include #include +#include #include @@ -52,10 +54,20 @@ enum { FI_WRITE_FD }; +enum ofi_signal_state { + OFI_SIGNAL_UNSET, + OFI_SIGNAL_WRITE_PREPARE, + OFI_SIGNAL_SET, + OFI_SIGNAL_READ_PREPARE, +}; + struct fd_signal { - int rcnt; - int wcnt; + ofi_atomic32_t state; int fd[2]; + +#if ENABLE_DEBUG + ofi_atomic32_t debug_cnt; +#endif }; static inline int fd_signal_init(struct fd_signal *signal) @@ -70,6 +82,11 @@ static inline int fd_signal_init(struct fd_signal *signal) if (ret) goto err; + ofi_atomic_initialize32(&signal->state, OFI_SIGNAL_UNSET); + +#if ENABLE_DEBUG + ofi_atomic_initialize32(&signal->debug_cnt, 0); +#endif return 0; err: @@ -87,19 +104,73 @@ static inline void fd_signal_free(struct fd_signal *signal) static inline void fd_signal_set(struct fd_signal *signal) { char c = 0; - if (signal->wcnt == signal->rcnt) { - if (ofi_write_socket(signal->fd[FI_WRITE_FD], &c, sizeof c) == sizeof c) - signal->wcnt++; + bool cas; /* cas result */ + int write_rc; + + cas = ofi_atomic_cas_bool_strong32(&signal->state, + OFI_SIGNAL_UNSET, + OFI_SIGNAL_WRITE_PREPARE); + if (cas) { + write_rc = ofi_write_socket(signal->fd[FI_WRITE_FD], &c, + sizeof c); + if (write_rc == sizeof c) { +#if ENABLE_DEBUG + assert(ofi_atomic_inc32(&signal->debug_cnt) == 1); +#endif + ofi_atomic_set32(&signal->state, OFI_SIGNAL_SET); + } else { + /* XXX: Setting the signal failed, a polling thread + * will not be woken up now and the system might + * get stuck. + * Also, typically this will be totally + * untested code path, as it basically will never + * come up. + */ + ofi_atomic_set32(&signal->state, OFI_SIGNAL_UNSET); + } } } static inline void fd_signal_reset(struct fd_signal *signal) { char c; - if (signal->rcnt != signal->wcnt) { - if (ofi_read_socket(signal->fd[FI_READ_FD], &c, sizeof c) == sizeof c) - signal->rcnt++; - } + bool cas; /* cas result */ + enum ofi_signal_state state; + int read_rc; + + do { + cas = ofi_atomic_cas_bool_weak32(&signal->state, + OFI_SIGNAL_SET, + OFI_SIGNAL_READ_PREPARE); + if (cas) { + read_rc = ofi_read_socket(signal->fd[FI_READ_FD], &c, + sizeof c); + if (read_rc == sizeof c) { +#if ENABLE_DEBUG + assert(ofi_atomic_dec32(&signal->debug_cnt) == 0); +#endif + ofi_atomic_set32(&signal->state, + OFI_SIGNAL_UNSET); + break; + } else { + ofi_atomic_set32(&signal->state, OFI_SIGNAL_SET); + + /* Avoid spinning forever in this highly + * unlikely code path. + */ + break; + } + } + + state = ofi_atomic_get32(&signal->state); + + /* note that this loop also needs to include + * OFI_SIGNAL_WRITE_PREPARE, as the writing thread sets + * the signal to the socket in _WRITE_PREPARE state. The reading + * thread might then race with the writing thread and then + * end up here before the state was switched to OFI_SIGNAL_SET. 
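/*
 * Illustrative wake-up pattern that the state machine above protects
 * (sketch only; the work-queue calls are hypothetical):
 *
 *	producer thread:
 *		enqueue_work(item);		// hypothetical
 *		fd_signal_set(&signal);		// UNSET -> WRITE_PREPARE -> SET
 *
 *	progress thread:
 *		fd_signal_poll(&signal, timeout); // waits for the fd to become readable
 *		fd_signal_reset(&signal);	// SET -> READ_PREPARE -> UNSET
 *		process_work();			// hypothetical
 */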
+ */ + } while (state == OFI_SIGNAL_WRITE_PREPARE || state == OFI_SIGNAL_SET); } static inline int fd_signal_poll(struct fd_signal *signal, int timeout) diff --git a/include/ofi_tree.h b/include/ofi_tree.h index 5415c66822f..a2efbf622ad 100644 --- a/include/ofi_tree.h +++ b/include/ofi_tree.h @@ -64,6 +64,7 @@ struct ofi_rbnode { struct ofi_rbmap { struct ofi_rbnode *root; struct ofi_rbnode sentinel; + struct ofi_rbnode *free_list; /* compare() * = 0: a == b @@ -81,12 +82,14 @@ void ofi_rbmap_init(struct ofi_rbmap *map, int (*compare)(struct ofi_rbmap *map, void *key, void *data)); void ofi_rbmap_cleanup(struct ofi_rbmap *map); +struct ofi_rbnode *ofi_rbmap_get_root(struct ofi_rbmap *map); struct ofi_rbnode *ofi_rbmap_find(struct ofi_rbmap *map, void *key); struct ofi_rbnode *ofi_rbmap_search(struct ofi_rbmap *map, void *key, int (*compare)(struct ofi_rbmap *map, void *key, void *data)); int ofi_rbmap_insert(struct ofi_rbmap *map, void *key, void *data, struct ofi_rbnode **node); void ofi_rbmap_delete(struct ofi_rbmap *map, struct ofi_rbnode *node); +int ofi_rbmap_find_delete(struct ofi_rbmap *map, void *key); int ofi_rbmap_empty(struct ofi_rbmap *map); diff --git a/include/ofi_util.h b/include/ofi_util.h index 9fb544d4833..a8e7fb97eed 100644 --- a/include/ofi_util.h +++ b/include/ofi_util.h @@ -64,6 +64,7 @@ #include #include #include +#include #include "rbtree.h" #include "uthash.h" @@ -73,21 +74,26 @@ extern "C" { #endif /* EQ / CQ flags - * ERROR: The added entry was the result of an error completion - * OVERFLOW: The CQ has overflowed, and events have been lost + * ERROR: EQ entry was the result of a failed operation, + * or the caller is trying to read the next entry + * if it is an error. + * AUX: CQ entries are stored in the auxiliary queue */ #define UTIL_FLAG_ERROR (1ULL << 60) -#define UTIL_FLAG_OVERFLOW (1ULL << 61) +#define UTIL_FLAG_AUX (1ULL << 61) /* Indicates that an EP has been bound to a counter */ #define OFI_CNTR_ENABLED (1ULL << 61) +/* Memory registration should not be cached */ +#define OFI_MR_NOCACHE BIT_ULL(60) + #define OFI_Q_STRERROR(prov, level, subsys, q, q_str, entry, q_strerror) \ FI_LOG(prov, level, subsys, "fi_" q_str "_readerr: err: %s (%d), " \ "prov_err: %s (%d)\n", strerror((entry)->err), (entry)->err, \ - q_strerror((q), -(entry)->prov_errno, \ + q_strerror((q), (entry)->prov_errno, \ (entry)->err_data, NULL, 0), \ - -(entry)->prov_errno) + (entry)->prov_errno) #define OFI_CQ_STRERROR(prov, level, subsys, cq, entry) \ OFI_Q_STRERROR(prov, level, subsys, cq, "cq", entry, fi_cq_strerror) @@ -205,7 +211,7 @@ struct util_domain { int ofi_domain_init(struct fid_fabric *fabric_fid, const struct fi_info *info, struct util_domain *domain, void *context); -int ofi_domain_bind_eq(struct util_domain *domain, struct util_eq *eq); +int ofi_domain_bind(struct fid *fid, struct fid *bfid, uint64_t flags); int ofi_domain_close(struct util_domain *domain); static const uint64_t ofi_rx_mr_flags[] = { @@ -291,6 +297,9 @@ struct util_ep { fastlock_t lock; ofi_fastlock_acquire_t lock_acquire; ofi_fastlock_release_t lock_release; + + struct bitmask *coll_cid_mask; + struct slist coll_ready_queue; }; int ofi_ep_bind_av(struct util_ep *util_ep, struct util_av *av); @@ -399,35 +408,66 @@ struct util_wait { enum fi_wait_obj wait_obj; fi_wait_signal_func signal; fi_wait_try_func wait_try; + + struct dlist_entry fid_list; + fastlock_t lock; }; -int fi_wait_init(struct util_fabric *fabric, struct fi_wait_attr *attr, - struct util_wait *wait); +int ofi_wait_init(struct util_fabric 
*fabric, struct fi_wait_attr *attr, + struct util_wait *wait); int fi_wait_cleanup(struct util_wait *wait); struct util_wait_fd { struct util_wait util_wait; struct fd_signal signal; - fi_epoll_t epoll_fd; struct dlist_entry fd_list; - fastlock_t lock; + + union { + ofi_epoll_t epoll_fd; + struct ofi_pollfds *pollfds; + }; + uint64_t change_index; }; -typedef int (*ofi_wait_fd_try_func)(void *arg); +typedef int (*ofi_wait_try_func)(void *arg); struct ofi_wait_fd_entry { struct dlist_entry entry; int fd; - ofi_wait_fd_try_func wait_try; + ofi_wait_try_func wait_try; void *arg; ofi_atomic32_t ref; }; +struct ofi_wait_fid_entry { + struct dlist_entry entry; + ofi_wait_try_func wait_try; + fid_t fid; + enum fi_wait_obj wait_obj; + uint32_t events; + ofi_atomic32_t ref; + struct fi_wait_pollfd pollfds; +}; + int ofi_wait_fd_open(struct fid_fabric *fabric, struct fi_wait_attr *attr, struct fid_wait **waitset); -int ofi_wait_fd_add(struct util_wait *wait, int fd, uint32_t events, - ofi_wait_fd_try_func wait_try, void *arg, void *context); -int ofi_wait_fd_del(struct util_wait *wait, int fd); +int ofi_wait_add_fd(struct util_wait *wait, int fd, uint32_t events, + ofi_wait_try_func wait_try, void *arg, void *context); +int ofi_wait_del_fd(struct util_wait *wait, int fd); +int ofi_wait_fdset_del(struct util_wait_fd *wait_fd, int fd); +int ofi_wait_add_fid(struct util_wait *wat, fid_t fid, uint32_t events, + ofi_wait_try_func wait_try); +int ofi_wait_del_fid(struct util_wait *wait, fid_t fid); + + +struct util_wait_yield { + struct util_wait util_wait; + int signal; + fastlock_t signal_lock; +}; + +int ofi_wait_yield_open(struct fid_fabric *fabric, struct fi_wait_attr *attr, + struct fid_wait **waitset); /* * Completion queue @@ -441,8 +481,8 @@ int ofi_wait_fd_del(struct util_wait *wait, int fd); typedef void (*fi_cq_read_func)(void **dst, void *src); -struct util_cq_oflow_err_entry { - struct fi_cq_tagged_entry *parent_comp; +struct util_cq_aux_entry { + struct fi_cq_tagged_entry *cq_slot; struct fi_cq_err_entry comp; fi_addr_t src; struct slist_entry list_entry; @@ -466,7 +506,7 @@ struct util_cq { struct util_comp_cirq *cirq; fi_addr_t *src; - struct slist oflow_err_list; + struct slist aux_queue; fi_cq_read_func read_entry; int internal_wait; ofi_atomic32_t signaled; @@ -492,8 +532,9 @@ ssize_t ofi_cq_sreadfrom(struct fid_cq *cq_fid, void *buf, size_t count, fi_addr_t *src_addr, const void *cond, int timeout); int ofi_cq_signal(struct fid_cq *cq_fid); -int ofi_cq_write_overflow(struct util_cq *cq, void *context, uint64_t flags, size_t len, - void *buf, uint64_t data, uint64_t tag, fi_addr_t src); +int ofi_cq_write_overflow(struct util_cq *cq, void *context, uint64_t flags, + size_t len, void *buf, uint64_t data, uint64_t tag, + fi_addr_t src); static inline void util_cq_signal(struct util_cq *cq) { @@ -502,10 +543,10 @@ static inline void util_cq_signal(struct util_cq *cq) } static inline void -ofi_cq_write_comp_entry(struct util_cq *cq, void *context, uint64_t flags, - size_t len, void *buf, uint64_t data, uint64_t tag) +ofi_cq_write_entry(struct util_cq *cq, void *context, uint64_t flags, + size_t len, void *buf, uint64_t data, uint64_t tag) { - struct fi_cq_tagged_entry *comp = ofi_cirque_tail(cq->cirq); + struct fi_cq_tagged_entry *comp = ofi_cirque_next(cq->cirq); comp->op_context = context; comp->flags = flags; comp->len = len; @@ -515,18 +556,13 @@ ofi_cq_write_comp_entry(struct util_cq *cq, void *context, uint64_t flags, ofi_cirque_commit(cq->cirq); } -static inline int 
-ofi_cq_write_thread_unsafe(struct util_cq *cq, void *context, uint64_t flags, - size_t len, void *buf, uint64_t data, uint64_t tag) +static inline void +ofi_cq_write_src_entry(struct util_cq *cq, void *context, uint64_t flags, + size_t len, void *buf, uint64_t data, uint64_t tag, + fi_addr_t src) { - if (OFI_UNLIKELY(ofi_cirque_isfull(cq->cirq))) { - FI_DBG(cq->domain->prov, FI_LOG_CQ, - "util_cq cirq is full!\n"); - return ofi_cq_write_overflow(cq, context, flags, len, - buf, data, tag, 0); - } - ofi_cq_write_comp_entry(cq, context, flags, len, buf, data, tag); - return 0; + cq->src[ofi_cirque_windex(cq->cirq)] = src; + ofi_cq_write_entry(cq, context, flags, len, buf, data, tag); } static inline int @@ -534,39 +570,40 @@ ofi_cq_write(struct util_cq *cq, void *context, uint64_t flags, size_t len, void *buf, uint64_t data, uint64_t tag) { int ret; + cq->cq_fastlock_acquire(&cq->cq_lock); - ret = ofi_cq_write_thread_unsafe(cq, context, flags, len, buf, data, tag); + if (ofi_cirque_freecnt(cq->cirq) > 1) { + ofi_cq_write_entry(cq, context, flags, len, buf, data, tag); + ret = 0; + } else { + ret = ofi_cq_write_overflow(cq, context, flags, len, + buf, data, tag, FI_ADDR_NOTAVAIL); + } cq->cq_fastlock_release(&cq->cq_lock); return ret; } -static inline int -ofi_cq_write_src_thread_unsafe(struct util_cq *cq, void *context, uint64_t flags, size_t len, - void *buf, uint64_t data, uint64_t tag, fi_addr_t src) -{ - if (OFI_UNLIKELY(ofi_cirque_isfull(cq->cirq))) { - FI_DBG(cq->domain->prov, FI_LOG_CQ, - "util_cq cirq is full!\n"); - return ofi_cq_write_overflow(cq, context, flags, len, - buf, data, tag, src); - } - cq->src[ofi_cirque_windex(cq->cirq)] = src; - ofi_cq_write_comp_entry(cq, context, flags, len, buf, data, tag); - return 0; -} - static inline int ofi_cq_write_src(struct util_cq *cq, void *context, uint64_t flags, size_t len, void *buf, uint64_t data, uint64_t tag, fi_addr_t src) { int ret; + cq->cq_fastlock_acquire(&cq->cq_lock); - ret = ofi_cq_write_src_thread_unsafe(cq, context, flags, len, - buf, data, tag, src); + if (ofi_cirque_freecnt(cq->cirq) > 1) { + ofi_cq_write_src_entry(cq, context, flags, len, buf, data, + tag, src); + ret = 0; + } else { + ret = ofi_cq_write_overflow(cq, context, flags, len, + buf, data, tag, src); + } cq->cq_fastlock_release(&cq->cq_lock); return ret; } +int ofi_cq_insert_error(struct util_cq *cq, + const struct fi_cq_err_entry *err_entry); int ofi_cq_write_error(struct util_cq *cq, const struct fi_cq_err_entry *err_entry); int ofi_cq_write_error_peek(struct util_cq *cq, uint64_t tag, void *context); @@ -646,7 +683,13 @@ static inline void ofi_cntr_inc(struct util_cntr *cntr) struct util_av_entry { ofi_atomic32_t use_cnt; UT_hash_handle hh; - char addr[0]; + /* + * data includes 'addr' and any other additional fields + * associated with av_entry. 'addr' must be the first + * field in 'data' and addr length should be a multiple + * of 8 bytes to ensure alignment of additional fields + */ + char data[]; }; struct util_av { @@ -660,16 +703,27 @@ struct util_av { struct util_av_entry *hash; struct ofi_bufpool *av_entry_pool; + struct util_coll_mc *coll_mc; void *context; uint64_t flags; - size_t count; size_t addrlen; + /* + * context_offset is addrlen + offset (required for alignment), + * if addrlen is a multiple of 8 bytes offset will be 0. 
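/*
 * Worked example of the layout described above (illustrative only):
 * an address of 12 bytes is padded to the next 8-byte boundary, so
 *
 *	addrlen        = 12
 *	context_offset = 16	(12 rounded up to a multiple of 8)
 *
 * A per-entry context lookup could then be written as (hypothetical
 * helper, assuming context_offset is measured from the start of data[]):
 */
static inline void *util_av_entry_context(struct util_av *av,
					  struct util_av_entry *entry)
{
	return entry->data + av->context_offset;
}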
+ */ + size_t context_offset; struct dlist_entry ep_list; fastlock_t ep_list_lock; }; struct util_av_attr { + /* Must be a multiple of 8 bytes */ size_t addrlen; + /* + * Specify the length of additional fields to be added + * to av_entry other than struct util_av_entry and addr + */ + size_t context_len; int flags; }; @@ -684,6 +738,7 @@ int ofi_av_init_lightweight(struct util_domain *domain, const struct fi_av_attr int ofi_av_close(struct util_av *av); int ofi_av_close_lightweight(struct util_av *av); +size_t ofi_av_size(struct util_av *av); int ofi_av_insert_addr(struct util_av *av, const void *addr, fi_addr_t *fi_addr); int ofi_av_remove_addr(struct util_av *av, fi_addr_t fi_addr); fi_addr_t ofi_av_lookup_fi_addr_unsafe(struct util_av *av, const void *addr); @@ -711,9 +766,10 @@ int ofi_get_src_addr(uint32_t addr_format, void ofi_getnodename(uint16_t sa_family, char *buf, int buflen); int ofi_av_get_index(struct util_av *av, const void *addr); -int ofi_verify_av_insert(struct util_av *av, uint64_t flags); +int ofi_verify_av_insert(struct util_av *av, uint64_t flags, void *context); int ofi_ip_av_insertv(struct util_av *av, const void *addr, size_t addrlen, - size_t count, fi_addr_t *fi_addr, void *context); + size_t count, fi_addr_t *fi_addr, uint64_t flags, + void *context); /* Caller should free *addr */ int ofi_ip_av_sym_getaddr(struct util_av *av, const char *node, size_t nodecnt, const char *service, @@ -764,10 +820,10 @@ struct util_eq { struct util_event { struct slist_entry entry; - int size; + ssize_t size; int event; int err; - uint8_t data[0]; + uint8_t data[]; /* offset should be 8-byte aligned */ }; int ofi_eq_create(struct fid_fabric *fabric, struct fi_eq_attr *attr, @@ -800,7 +856,8 @@ const char *ofi_eq_strerror(struct fid_eq *eq_fid, int prov_errno, #define FI_PRIMARY_CAPS (FI_MSG | FI_RMA | FI_TAGGED | FI_ATOMICS | FI_MULTICAST | \ FI_NAMED_RX_CTX | FI_DIRECTED_RECV | \ FI_READ | FI_WRITE | FI_RECV | FI_SEND | \ - FI_REMOTE_READ | FI_REMOTE_WRITE) + FI_REMOTE_READ | FI_REMOTE_WRITE | FI_COLLECTIVE | \ + FI_HMEM) #define FI_SECONDARY_CAPS (FI_MULTI_RECV | FI_SOURCE | FI_RMA_EVENT | \ FI_SHARED_AV | FI_TRIGGER | FI_FENCE | \ @@ -811,6 +868,9 @@ const char *ofi_eq_strerror(struct fid_eq *eq_fid, int prov_errno, #define OFI_TX_RMA_CAPS (FI_RMA | FI_READ | FI_WRITE) #define OFI_RX_RMA_CAPS (FI_RMA | FI_REMOTE_READ | FI_REMOTE_WRITE) +int ofi_check_ep_type(const struct fi_provider *prov, + const struct fi_ep_attr *prov_attr, + const struct fi_ep_attr *user_attr); int ofi_check_mr_mode(const struct fi_provider *prov, uint32_t api_version, int prov_mode, const struct fi_info *user_info); int ofi_check_fabric_attr(const struct fi_provider *prov, @@ -841,6 +901,14 @@ int ofi_prov_check_dup_info(const struct util_prov *util_prov, uint32_t api_version, const struct fi_info *user_info, struct fi_info **info); +static inline uint64_t +ofi_pick_core_flags(uint64_t all_util_flags, uint64_t all_core_flags, + uint64_t use_core_flags) +{ + return (all_util_flags & ~use_core_flags) | + (all_core_flags & use_core_flags); +} + int ofi_check_info(const struct util_prov *util_prov, const struct fi_info *prov_info, uint32_t api_version, const struct fi_info *user_info); @@ -851,6 +919,9 @@ struct fi_info *ofi_allocinfo_internal(void); int util_getinfo(const struct util_prov *util_prov, uint32_t version, const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct fi_info **info); +int ofi_ip_getinfo(const struct util_prov *prov, uint32_t version, + const char 
*node, const char *service, uint64_t flags, + const struct fi_info *hints, struct fi_info **info); struct fid_list_entry { @@ -864,7 +935,6 @@ void fid_list_remove(struct dlist_entry *fid_list, fastlock_t *lock, struct fid *fid); void ofi_fabric_insert(struct util_fabric *fabric); -struct util_fabric *ofi_fabric_find(struct util_fabric_info *fabric_info); void ofi_fabric_remove(struct util_fabric *fabric); /* @@ -880,11 +950,14 @@ static inline int ofi_has_util_prefix(const char *str) } typedef int (*ofi_alter_info_t)(uint32_t version, const struct fi_info *src_info, + const struct fi_info *base_info, struct fi_info *dest_info); int ofi_get_core_info(uint32_t version, const char *node, const char *service, uint64_t flags, const struct util_prov *util_prov, - const struct fi_info *util_hints, ofi_alter_info_t info_to_core, + const struct fi_info *util_hints, + const struct fi_info *base_attr, + ofi_alter_info_t info_to_core, struct fi_info **core_info); int ofix_getinfo(uint32_t version, const char *node, const char *service, uint64_t flags, const struct util_prov *util_prov, @@ -941,6 +1014,36 @@ int ofi_ns_del_local_name(struct util_ns *ns, void *service, void *name); void *ofi_ns_resolve_name(struct util_ns *ns, const char *server, void *service); + +/* Setup coordination for credit based flow control between core and util. + * threshold - When number of available RQ credits > threshold, the send + * handler will be invoked + * add_credits - Increments the number of peer RQ credits available + * send_handler - Called to have util code send credit message. If the + * credit message cannot be sent, the credits should be returned to + * the core by calling add_credits. + */ +#define OFI_OPS_FLOW_CTRL "ofix_flow_ctrl_v1" + +struct ofi_ops_flow_ctrl { + size_t size; + void (*set_threshold)(struct fid_ep *ep, uint64_t threshold); + void (*add_credits)(struct fid_ep *ep, uint64_t credits); + int (*enable)(struct fid_ep *ep); + void (*set_send_handler)(struct fid_domain *domain, + ssize_t (*send_handler)(struct fid_ep *ep, uint64_t credits)); +}; + + +/* Dynamic receive buffering support. */ +#define OFI_OPS_DYNAMIC_RBUF "ofix_dynamic_rbuf" + +struct ofi_ops_dynamic_rbuf { + size_t size; + ssize_t (*get_rbuf)(struct fi_cq_data_entry *entry, struct iovec *iov, + size_t *count); +}; + #ifdef __cplusplus } #endif diff --git a/include/osx/osd.h b/include/osx/osd.h index 4280560deb0..6d671b9f361 100644 --- a/include/osx/osd.h +++ b/include/osx/osd.h @@ -47,6 +47,8 @@ #include +#include + #include "unix/osd.h" #include "rdma/fi_errno.h" #include "config.h" @@ -95,6 +97,71 @@ static inline int ofi_hugepage_enabled(void) return 0; } +static inline ssize_t ofi_process_vm_readv(pid_t pid, + const struct iovec *local_iov, + unsigned long liovcnt, + const struct iovec *remote_iov, + unsigned long riovcnt, + unsigned long flags) +{ + return -FI_ENOSYS; +} + +static inline size_t ofi_process_vm_writev(pid_t pid, + const struct iovec *local_iov, + unsigned long liovcnt, + const struct iovec *remote_iov, + unsigned long riovcnt, + unsigned long flags) +{ + return -FI_ENOSYS; +} + +static inline ssize_t +ofi_recv_socket(SOCKET fd, void *buf, size_t count, int flags) +{ + size_t len = count > INT_MAX ? INT_MAX : count; + return recv(fd, buf, len, flags); +} + +static inline ssize_t +ofi_send_socket(SOCKET fd, const void *buf, size_t count, int flags) +{ + size_t len = count > INT_MAX ? 
INT_MAX : count; + return send(fd, buf, len, flags); +} + +static inline ssize_t ofi_read_socket(SOCKET fd, void *buf, size_t count) +{ + return ofi_recv_socket(fd, buf, count, 0); +} + +static inline ssize_t ofi_write_socket(SOCKET fd, const void *buf, size_t count) +{ + return ofi_send_socket(fd, buf, count, 0); +} + +static inline ssize_t +ofi_recvfrom_socket(SOCKET fd, void *buf, size_t count, int flags, + struct sockaddr *from, socklen_t *fromlen) +{ + size_t len = count > INT_MAX ? INT_MAX : count; + return recvfrom(fd, buf, len, flags, from, fromlen); +} + +static inline ssize_t +ofi_sendto_socket(SOCKET fd, const void *buf, size_t count, int flags, + const struct sockaddr *to, socklen_t tolen) +{ + size_t len = count > INT_MAX ? INT_MAX : count; + return sendto(fd, buf, len, flags, to, tolen); +} + +ssize_t ofi_writev_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt); +ssize_t ofi_readv_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt); +ssize_t ofi_sendmsg_tcp(SOCKET fd, const struct msghdr *msg, int flags); +ssize_t ofi_recvmsg_tcp(SOCKET fd, struct msghdr *msg, int flags); + #ifdef __cplusplus } #endif diff --git a/include/rdma/fabric.h b/include/rdma/fabric.h index d905ef526dc..9ddf5d121f0 100644 --- a/include/rdma/fabric.h +++ b/include/rdma/fabric.h @@ -1,6 +1,7 @@ /* * Copyright (c) 2013-2017 Intel Corporation. All rights reserved. * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -38,6 +39,7 @@ #include #include #include +#include #ifdef __GNUC__ #define FI_DEPRECATED_FUNC __attribute__((deprecated)) @@ -77,7 +79,8 @@ extern "C" { #endif #define FI_MAJOR_VERSION 1 -#define FI_MINOR_VERSION 8 +#define FI_MINOR_VERSION 12 +#define FI_REVISION_VERSION 0 enum { FI_PATH_MAX = 256, @@ -85,7 +88,7 @@ enum { FI_VERSION_MAX = 64 }; -#define FI_VERSION(major, minor) ((major << 16) | (minor)) +#define FI_VERSION(major, minor) (((major) << 16) | (minor)) #define FI_MAJOR(version) (version >> 16) #define FI_MINOR(version) (version & 0xFFFF) #define FI_VERSION_GE(v1, v2) ((FI_MAJOR(v1) > FI_MAJOR(v2)) || \ @@ -151,6 +154,7 @@ typedef struct fid *fid_t; #define FI_PEEK (1ULL << 19) #define FI_TRIGGER (1ULL << 20) #define FI_FENCE (1ULL << 21) +#define FI_PRIORITY (1ULL << 22) #define FI_COMPLETION (1ULL << 24) #define FI_EVENT FI_COMPLETION @@ -162,6 +166,7 @@ typedef struct fid *fid_t; #define FI_COMMIT_COMPLETE (1ULL << 30) #define FI_MATCH_COMPLETE (1ULL << 31) +#define FI_HMEM (1ULL << 47) #define FI_VARIABLE_MSG (1ULL << 48) #define FI_RMA_PMEM (1ULL << 49) #define FI_SOURCE_ERR (1ULL << 50) @@ -203,6 +208,7 @@ enum { FI_ADDR_PSMX2, /* uint64_t[2] */ FI_ADDR_IB_UD, /* uint64_t[4] */ FI_ADDR_EFA, + FI_ADDR_PSMX3, /* uint64_t[2] */ }; #define FI_ADDR_UNSPEC ((uint64_t) -1) @@ -231,6 +237,7 @@ enum fi_mr_mode { #define FI_MR_MMU_NOTIFY (1 << 7) #define FI_MR_RMA_EVENT (1 << 8) #define FI_MR_ENDPOINT (1 << 9) +#define FI_MR_HMEM (1 << 10) enum fi_progress { FI_PROGRESS_UNSPEC, @@ -313,9 +320,32 @@ enum { FI_PROTO_MRAIL, FI_PROTO_RSTREAM, FI_PROTO_RDMA_CM_IB_XRC, - FI_PROTO_EFA + FI_PROTO_EFA, + FI_PROTO_PSMX3 }; +enum { + FI_TC_UNSPEC = 0, + FI_TC_DSCP = 0x100, + FI_TC_LABEL = 0x200, + FI_TC_BEST_EFFORT = FI_TC_LABEL, + FI_TC_LOW_LATENCY, + FI_TC_DEDICATED_ACCESS, + FI_TC_BULK_DATA, + FI_TC_SCAVENGER, + FI_TC_NETWORK_CTRL, +}; + +static inline uint32_t 
fi_tc_dscp_set(uint8_t dscp) +{ + return ((uint32_t) dscp) | FI_TC_DSCP; +} + +static inline uint8_t fi_tc_dscp_get(uint32_t tclass) +{ + return tclass & FI_TC_DSCP ? (uint8_t) tclass : 0; +} + /* Mode bits */ #define FI_CONTEXT (1ULL << 59) #define FI_MSG_PREFIX (1ULL << 58) @@ -337,6 +367,7 @@ struct fi_tx_attr { size_t size; size_t iov_limit; size_t rma_iov_limit; + uint32_t tclass; }; struct fi_rx_attr { @@ -393,6 +424,7 @@ struct fi_domain_attr { size_t auth_key_size; size_t max_err_data; size_t mr_cnt; + uint32_t tclass; }; struct fi_fabric_attr { @@ -486,6 +518,7 @@ enum { FI_CLASS_CONNREQ, FI_CLASS_MC, FI_CLASS_NIC, + FI_CLASS_AV_SET, }; struct fi_eq_attr; @@ -502,6 +535,8 @@ struct fi_ops { int (*ops_open)(struct fid *fid, const char *name, uint64_t flags, void **ops, void *context); int (*tostr)(const struct fid *fid, char *buf, size_t len); + int (*ops_set)(struct fid *fid, const char *name, uint64_t flags, + void *ops, void *context); }; /* All fabric interface descriptors must start with this structure */ @@ -565,6 +600,11 @@ struct fi_alias { uint64_t flags; }; +struct fi_fid_var { + int name; + void *val; +}; + struct fi_mr_raw_attr { uint64_t flags; uint64_t *base_addr; @@ -598,6 +638,9 @@ enum { FI_FLUSH_WORK, /* NULL */ FI_REFRESH, /* mr: fi_mr_modify */ FI_DUP, /* struct fid ** */ + FI_GETWAITOBJ, /*enum fi_wait_obj * */ + FI_GET_VAL, /* struct fi_fid_var */ + FI_SET_VAL, /* struct fi_fid_var */ }; static inline int fi_control(struct fid *fid, int command, void *arg) @@ -613,6 +656,28 @@ static inline int fi_alias(struct fid *fid, struct fid **alias_fid, uint64_t fla return fi_control(fid, FI_ALIAS, &alias); } +/* fid value names */ +/* + * Currently no common name is defined. Provider specific names should + * have the FI_PROV_SPECIFIC bit set. + */ + +static inline int fi_get_val(struct fid *fid, int name, void *val) +{ + struct fi_fid_var var; + var.name = name; + var.val = val; + return fi_control(fid, FI_GET_VAL, &var); +} + +static inline int fi_set_val(struct fid *fid, int name, void *val) +{ + struct fi_fid_var var; + var.name = name; + var.val = val; + return fi_control(fid, FI_SET_VAL, &var); +} + static inline int fi_open_ops(struct fid *fid, const char *name, uint64_t flags, void **ops, void *context) @@ -620,6 +685,14 @@ fi_open_ops(struct fid *fid, const char *name, uint64_t flags, return fid->ops->ops_open(fid, name, flags, ops, context); } +static inline int +fi_set_ops(struct fid *fid, const char *name, uint64_t flags, + void *ops, void *context) +{ + return FI_CHECK_OP(fid->ops, struct fi_ops, ops_set) ? 
+ fid->ops->ops_set(fid, name, flags, ops, context) : -FI_ENOSYS; +} + enum fi_type { FI_TYPE_INFO, FI_TYPE_EP_TYPE, @@ -645,9 +718,13 @@ enum fi_type { FI_TYPE_MR_MODE, FI_TYPE_OP_TYPE, FI_TYPE_FID, + FI_TYPE_COLLECTIVE_OP, + FI_TYPE_HMEM_IFACE, }; char *fi_tostr(const void *data, enum fi_type datatype); +char *fi_tostr_r(char *buf, size_t len, const void *data, + enum fi_type datatype); enum fi_param_type { FI_PARAM_STRING, @@ -666,7 +743,6 @@ struct fi_param { int fi_getparams(struct fi_param **params, int *count); void fi_freeparams(struct fi_param *params); - #ifdef FABRIC_DIRECT #include #endif /* FABRIC_DIRECT */ diff --git a/include/rdma/fi_atomic.h b/include/rdma/fi_atomic.h index a7dc068b7f3..cc8b1e52054 100644 --- a/include/rdma/fi_atomic.h +++ b/include/rdma/fi_atomic.h @@ -44,7 +44,6 @@ extern "C" { /* Atomic flags */ -#define FI_SCATTER (1ULL << 57) #define FI_FETCH_ATOMIC (1ULL << 58) #define FI_COMPARE_ATOMIC (1ULL << 59) diff --git a/include/rdma/fi_collective.h b/include/rdma/fi_collective.h index 67eff6a28be..41528b54fdd 100644 --- a/include/rdma/fi_collective.h +++ b/include/rdma/fi_collective.h @@ -42,6 +42,10 @@ extern "C" { #endif +#ifdef FABRIC_DIRECT +#include +#endif /* FABRIC_DIRECT */ + struct fi_ops_av_set { size_t size; @@ -52,6 +56,7 @@ struct fi_ops_av_set { int (*diff)(struct fid_av_set *dst, const struct fid_av_set *src); int (*insert)(struct fid_av_set *set, fi_addr_t addr); int (*remove)(struct fid_av_set *set, fi_addr_t addr); + int (*addr)(struct fid_av_set *set, fi_addr_t *coll_addr); }; struct fid_av_set { @@ -59,11 +64,12 @@ struct fid_av_set { struct fi_ops_av_set *ops; }; - struct fi_collective_attr { - struct fi_atomic_attr datatype_attr; - size_t max_members; - uint64_t mode; + enum fi_op op; + enum fi_datatype datatype; + struct fi_atomic_attr datatype_attr; + size_t max_members; + uint64_t mode; }; struct fi_collective_addr { @@ -76,6 +82,8 @@ struct fi_msg_collective { void **desc; size_t iov_count; fi_addr_t coll_addr; + fi_addr_t root_addr; + enum fi_collective_op coll; enum fi_datatype datatype; enum fi_op op; void *context; @@ -83,14 +91,47 @@ struct fi_msg_collective { struct fi_ops_collective { size_t size; + ssize_t (*barrier)(struct fid_ep *ep, fi_addr_t coll_addr, void *context); - ssize_t (*writeread)(struct fid_ep *ep, + ssize_t (*broadcast)(struct fid_ep *ep, + void *buf, size_t count, void *desc, + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, void *context); + ssize_t (*alltoall)(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, uint64_t flags, void *context); + ssize_t (*allreduce)(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, enum fi_op op, + uint64_t flags, void *context); + ssize_t (*allgather)(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, uint64_t flags, void *context); + ssize_t (*reduce_scatter)(struct fid_ep *ep, const void *buf, size_t count, void *desc, void *result, void *result_desc, fi_addr_t coll_addr, enum fi_datatype datatype, enum fi_op op, uint64_t flags, void *context); - ssize_t (*writereadmsg)(struct fid_ep *ep, + ssize_t (*reduce)(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t 
root_addr, enum fi_datatype datatype, enum fi_op op, + uint64_t flags, void *context); + ssize_t (*scatter)(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, void *context); + ssize_t (*gather)(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, void *context); + ssize_t (*msg)(struct fid_ep *ep, const struct fi_msg_collective *msg, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags); @@ -105,10 +146,10 @@ struct fi_ops_collective { static inline int fi_av_set(struct fid_av *av, struct fi_av_set_attr *attr, - struct fid_av_set **av_set, void * context) + struct fid_av_set **set, void * context) { return FI_CHECK_OP(av->ops, struct fi_ops_av, av_set) ? - av->ops->av_set(av, attr, av_set, context) : -FI_ENOSYS; + av->ops->av_set(av, attr, set, context) : -FI_ENOSYS; } static inline int @@ -141,6 +182,12 @@ fi_av_set_remove(struct fid_av_set *set, fi_addr_t addr) return set->ops->remove(set, addr); } +static inline int +fi_av_set_addr(struct fid_av_set *set, fi_addr_t *coll_addr) +{ + return set->ops->addr(set, coll_addr); +} + static inline int fi_join_collective(struct fid_ep *ep, fi_addr_t coll_addr, const struct fid_av_set *set, @@ -149,7 +196,7 @@ fi_join_collective(struct fid_ep *ep, fi_addr_t coll_addr, struct fi_collective_addr addr; addr.set = set; - addr.join_addr = coll_addr; + addr.coll_addr = coll_addr; return fi_join(ep, &addr, flags | FI_COLLECTIVE, mc, context); } @@ -161,16 +208,21 @@ fi_barrier(struct fid_ep *ep, fi_addr_t coll_addr, void *context) static inline ssize_t fi_broadcast(struct fid_ep *ep, void *buf, size_t count, void *desc, - fi_addr_t coll_addr, enum fi_datatype datatype, - enum fi_op op, uint64_t flags, void *context) + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, void *context) +{ + return ep->collective->broadcast(ep, buf, count, desc, + coll_addr, root_addr, datatype, flags, context); +} + +static inline ssize_t +fi_alltoall(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t coll_addr, enum fi_datatype datatype, + uint64_t flags, void *context) { - if (flags & FI_SEND) { - return ep->collective->writeread(ep, buf, count, desc, - NULL, NULL, coll_addr, datatype, op, flags, context); - } else { - return ep->collective->writeread(ep, NULL, count, NULL, - buf, desc, coll_addr, datatype, op, flags, context); - } + return ep->collective->alltoall(ep, buf, count, desc, + result, result_desc, coll_addr, datatype, flags, context); } static inline ssize_t @@ -179,50 +231,68 @@ fi_allreduce(struct fid_ep *ep, const void *buf, size_t count, void *desc, enum fi_datatype datatype, enum fi_op op, uint64_t flags, void *context) { - return ep->collective->writeread(ep, buf, count, desc, + return ep->collective->allreduce(ep, buf, count, desc, result, result_desc, coll_addr, datatype, op, flags, context); } +static inline ssize_t +fi_allgather(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, uint64_t flags, void *context) +{ + return ep->collective->allgather(ep, buf, count, desc, + result, result_desc, coll_addr, datatype, flags, context); +} + static inline ssize_t 
fi_reduce_scatter(struct fid_ep *ep, const void *buf, size_t count, void *desc, - void *result, void *result_desc, - fi_addr_t coll_addr, enum fi_datatype datatype, enum fi_op op, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, enum fi_op op, uint64_t flags, void *context) { - return ep->collective->writeread(ep, buf, count, desc, - result, result_desc, coll_addr, datatype, op, - flags | FI_SCATTER, context); + return ep->collective->reduce_scatter(ep, buf, count, desc, + result, result_desc, coll_addr, datatype, op, flags, context); } static inline ssize_t -fi_alltoall(struct fid_ep *ep, const void *buf, size_t count, void *desc, - void *result, void *result_desc, - fi_addr_t coll_addr, enum fi_datatype datatype, - uint64_t flags, void *context) +fi_reduce(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, enum fi_op op, + uint64_t flags, void *context) { - return ep->collective->writeread(ep, buf, count, desc, - result, result_desc, coll_addr, datatype, FI_ALLTOALL, - flags, context); + return ep->collective->reduce(ep, buf, count, desc, result, result_desc, + coll_addr, root_addr, datatype, op, flags, context); } + static inline ssize_t -fi_allgather(struct fid_ep *ep, const void *buf, size_t count, void *desc, - void *result, void *result_desc, - fi_addr_t coll_addr, enum fi_datatype datatype, - uint64_t flags, void *context) +fi_scatter(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, + uint64_t flags, void *context) { - return ep->collective->writeread(ep, buf, count, desc, - result, result_desc, coll_addr, datatype, FI_ALLGATHER, - flags, context); + return ep->collective->scatter(ep, buf, count, desc, result, result_desc, + coll_addr, root_addr, datatype, flags, context); } -static inline int -fi_query_collective(struct fid_domain *domain, - enum fi_datatype datatype, enum fi_op op, - struct fi_collective_attr *attr, uint64_t flags) + +static inline ssize_t +fi_gather(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, + uint64_t flags, void *context) +{ + return ep->collective->gather(ep, buf, count, desc, result, result_desc, + coll_addr, root_addr, datatype, flags, context); +} + +static inline +int fi_query_collective(struct fid_domain *domain, enum fi_collective_op coll, + struct fi_collective_attr *attr, uint64_t flags) { - return fi_query_atomic(domain, datatype, op, &attr->datatype_attr, - flags | FI_COLLECTIVE); + return FI_CHECK_OP(domain->ops, struct fi_ops_domain, query_collective) ? + domain->ops->query_collective(domain, coll, attr, flags) : + -FI_ENOSYS; } #endif diff --git a/include/rdma/fi_domain.h b/include/rdma/fi_domain.h index 75e0bc456ae..27d6dd398b2 100644 --- a/include/rdma/fi_domain.h +++ b/include/rdma/fi_domain.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2013-2017 Intel Corporation. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -33,6 +34,7 @@ #ifndef FI_DOMAIN_H #define FI_DOMAIN_H +#include #include #include @@ -111,6 +113,13 @@ struct fid_mr { uint64_t key; }; +enum fi_hmem_iface { + FI_HMEM_SYSTEM = 0, + FI_HMEM_CUDA, + FI_HMEM_ROCR, + FI_HMEM_ZE, +}; + struct fi_mr_attr { const struct iovec *mr_iov; size_t iov_count; @@ -120,6 +129,12 @@ struct fi_mr_attr { void *context; size_t auth_key_size; uint8_t *auth_key; + enum fi_hmem_iface iface; + union { + uint64_t reserved; + int cuda; + int ze; + } device; }; struct fi_mr_modify { @@ -127,6 +142,23 @@ struct fi_mr_modify { struct fi_mr_attr attr; }; +#define FI_SET_OPS_HMEM_OVERRIDE "hmem_override_ops" + +struct fi_hmem_override_ops { + size_t size; + + ssize_t (*copy_from_hmem_iov)(void *dest, size_t size, + enum fi_hmem_iface iface, uint64_t device, + const struct iovec *hmem_iov, + size_t hmem_iov_count, + uint64_t hmem_iov_offset); + + ssize_t (*copy_to_hmem_iov)(enum fi_hmem_iface iface, uint64_t device, + const struct iovec *hmem_iov, + size_t hmem_iov_count, + uint64_t hmem_iov_offset, const void *src, + size_t size); +}; #ifdef FABRIC_DIRECT #include @@ -181,11 +213,24 @@ enum fi_op { /* End of point to point atomic ops */ FI_ATOMIC_OP_LAST, - /* Collective only ops */ - FI_BARRIER = FI_COLLECTIVE_OFFSET, + /* Collective datatypes */ + FI_NOOP = FI_COLLECTIVE_OFFSET, +}; + +#endif + +#ifndef FABRIC_DIRECT_COLLECTIVE_DEF + +enum fi_collective_op { + FI_BARRIER, FI_BROADCAST, FI_ALLTOALL, + FI_ALLREDUCE, FI_ALLGATHER, + FI_REDUCE_SCATTER, + FI_REDUCE, + FI_SCATTER, + FI_GATHER, }; #endif @@ -194,6 +239,7 @@ enum fi_op { struct fi_atomic_attr; struct fi_cq_attr; struct fi_cntr_attr; +struct fi_collective_attr; struct fi_ops_domain { size_t size; @@ -218,6 +264,9 @@ struct fi_ops_domain { int (*query_atomic)(struct fid_domain *domain, enum fi_datatype datatype, enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags); + int (*query_collective)(struct fid_domain *domain, + enum fi_collective_op coll, + struct fi_collective_attr *attr, uint64_t flags); }; /* Memory registration flags */ @@ -369,7 +418,8 @@ static inline int fi_mr_refresh(struct fid_mr *mr, const struct iovec *iov, size_t count, uint64_t flags) { - struct fi_mr_modify modify = { 0 }; + struct fi_mr_modify modify; + memset(&modify, 0, sizeof(modify)); modify.flags = flags; modify.attr.mr_iov = iov; modify.attr.iov_count = count; diff --git a/include/rdma/fi_eq.h b/include/rdma/fi_eq.h index 1cf7a6f4787..f3d78779380 100644 --- a/include/rdma/fi_eq.h +++ b/include/rdma/fi_eq.h @@ -33,6 +33,8 @@ #ifndef FI_EQ_H #define FI_EQ_H +#include + #ifndef _WIN32 #include #endif /* _WIN32 */ @@ -58,7 +60,9 @@ enum fi_wait_obj { FI_WAIT_UNSPEC, FI_WAIT_SET, FI_WAIT_FD, - FI_WAIT_MUTEX_COND /* pthread mutex & cond */ + FI_WAIT_MUTEX_COND, /* pthread mutex & cond */ + FI_WAIT_YIELD, + FI_WAIT_POLLFD, }; struct fi_wait_attr { @@ -83,6 +87,11 @@ struct fi_mutex_cond { }; #endif /* _WIN32 */ +struct fi_wait_pollfd { + uint64_t change_index; + size_t nfds; + struct pollfd *fd; +}; /* * Poll Set diff --git a/include/rdma/providers/fi_log.h b/include/rdma/providers/fi_log.h index a326df77882..acf6f246e55 100644 --- a/include/rdma/providers/fi_log.h +++ b/include/rdma/providers/fi_log.h @@ -74,9 +74,12 @@ void fi_log(const struct fi_provider *prov, enum fi_log_level level, #define FI_LOG(prov, level, subsystem, ...) 
\ do { \ - if (fi_log_enabled(prov, level, subsystem)) \ + if (fi_log_enabled(prov, level, subsystem)) { \ + int saved_errno = errno; \ fi_log(prov, level, subsystem, \ __func__, __LINE__, __VA_ARGS__); \ + errno = saved_errno; \ + } \ } while (0) #define FI_WARN(prov, subsystem, ...) \ @@ -100,6 +103,17 @@ void fi_log(const struct fi_provider *prov, enum fi_log_level level, do {} while (0) #endif +#define FI_WARN_ONCE(prov, subsystem, ...) ({ \ + static int warned; \ + if (!warned && fi_log_enabled(prov, FI_LOG_WARN, subsystem)) { \ + int saved_errno = errno; \ + fi_log(prov, FI_LOG_WARN, subsystem, \ + __func__, __LINE__, __VA_ARGS__); \ + warned = 1; \ + errno = saved_errno; \ + } \ +}) + #ifdef __cplusplus } #endif diff --git a/include/unix/osd.h b/include/unix/osd.h index 5f6a4d211aa..7078b19e170 100644 --- a/include/unix/osd.h +++ b/include/unix/osd.h @@ -73,6 +73,10 @@ (((err) == EAGAIN) || \ ((err) == EWOULDBLOCK)) +#define OFI_SOCK_TRY_ACCEPT_AGAIN(err) \ + (((err) == EAGAIN) || \ + ((err) == EWOULDBLOCK)) + #define OFI_SOCK_TRY_CONN_AGAIN(err) \ ((err) == EINPROGRESS) @@ -119,68 +123,12 @@ static inline SOCKET ofi_socket(int domain, int type, int protocol) return socket(domain, type, protocol); } -static inline ssize_t ofi_read_socket(SOCKET fd, void *buf, size_t count) -{ - return read(fd, buf, count); -} - -static inline ssize_t ofi_write_socket(SOCKET fd, const void *buf, size_t count) -{ - return write(fd, buf, count); -} - -static inline ssize_t ofi_recv_socket(SOCKET fd, void *buf, size_t count, - int flags) -{ - return recv(fd, buf, count, flags); -} - -static inline ssize_t ofi_recvfrom_socket(SOCKET fd, void *buf, size_t count, int flags, - struct sockaddr *from, socklen_t *fromlen) -{ - return recvfrom(fd, buf, count, flags, from, fromlen); -} - -static inline ssize_t ofi_send_socket(SOCKET fd, const void *buf, size_t count, - int flags) -{ - return send(fd, buf, count, flags); -} - -static inline ssize_t ofi_sendto_socket(SOCKET fd, const void *buf, size_t count, int flags, - const struct sockaddr *to, socklen_t tolen) -{ - return sendto(fd, buf, count, flags, to, tolen); -} - -static inline ssize_t ofi_writev_socket(SOCKET fd, struct iovec *iov, size_t iov_cnt) -{ - return writev(fd, iov, iov_cnt); -} - -static inline ssize_t ofi_readv_socket(SOCKET fd, struct iovec *iov, int iov_cnt) -{ - return readv(fd, iov, iov_cnt); -} - -static inline ssize_t -ofi_sendmsg_tcp(SOCKET fd, const struct msghdr *msg, int flags) -{ - return sendmsg(fd, msg, flags); -} - static inline ssize_t ofi_sendmsg_udp(SOCKET fd, const struct msghdr *msg, int flags) { return sendmsg(fd, msg, flags); } -static inline ssize_t -ofi_recvmsg_tcp(SOCKET fd, struct msghdr *msg, int flags) -{ - return recvmsg(fd, msg, flags); -} - static inline ssize_t ofi_recvmsg_udp(SOCKET fd, struct msghdr *msg, int flags) { @@ -199,6 +147,8 @@ static inline int ofi_close_socket(SOCKET socket) int fi_fd_nonblock(int fd); +int fi_fd_block(int fd); + static inline int ofi_sockerr(void) { return errno; @@ -229,14 +179,25 @@ static inline long ofi_sysconf(int name) static inline int ofi_is_loopback_addr(struct sockaddr *addr) { return (addr->sa_family == AF_INET && - ((struct sockaddr_in *)addr)->sin_addr.s_addr == ntohl(INADDR_LOOPBACK)) || + ((struct sockaddr_in *)addr)->sin_addr.s_addr == htonl(INADDR_LOOPBACK)) || (addr->sa_family == AF_INET6 && ((struct sockaddr_in6 *)addr)->sin6_addr.s6_addr32[0] == 0 && ((struct sockaddr_in6 *)addr)->sin6_addr.s6_addr32[1] == 0 && ((struct sockaddr_in6 
*)addr)->sin6_addr.s6_addr32[2] == 0 && - ((struct sockaddr_in6 *)addr)->sin6_addr.s6_addr32[3] == ntohl(1)); + ((struct sockaddr_in6 *)addr)->sin6_addr.s6_addr32[3] == htonl(1)); } +#if !HAVE_CLOCK_GETTIME + +#define CLOCK_REALTIME 0 +#define CLOCK_REALTIME_COARSE 0 +#define CLOCK_MONOTONIC 0 + +typedef int clockid_t; + +int clock_gettime(clockid_t clk_id, struct timespec *tp); + +#endif /* !HAVE_CLOCK_GETTIME */ /* complex operations implementation */ @@ -285,6 +246,8 @@ OFI_DEF_COMPLEX_OPS(long_double) #ifdef HAVE_BUILTIN_ATOMICS #define ofi_atomic_add_and_fetch(radix, ptr, val) __sync_add_and_fetch((ptr), (val)) #define ofi_atomic_sub_and_fetch(radix, ptr, val) __sync_sub_and_fetch((ptr), (val)) +#define ofi_atomic_cas_bool(radix, ptr, expected, desired) \ + __sync_bool_compare_and_swap((ptr), (expected), (desired)) #endif /* HAVE_BUILTIN_ATOMICS */ int ofi_set_thread_affinity(const char *s); diff --git a/include/windows/config.h b/include/windows/config.h index 0e3031974b6..7a1f526b3ed 100644 --- a/include/windows/config.h +++ b/include/windows/config.h @@ -159,20 +159,20 @@ #define PACKAGE_BUGREPORT "ofiwg@lists.openfabrics.org" /* Define to the full name of this package. */ -#define PACKAGE_NAME "libfabric" - -/* Define to the full name and version of this package. */ -#define PACKAGE_STRING "libfabric 1.9.0a1" +#define PACKAGE_NAME PACKAGE /* Define to the one symbol short name of this package. */ -#define PACKAGE_TARNAME "libfabric" +#define PACKAGE_TARNAME PACKAGE + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "1.12.0rc2" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING PACKAGE_NAME " " PACKAGE_VERSION /* Define to the home page for this package. */ #define PACKAGE_URL "" -/* Define to the version of this package. */ -#define PACKAGE_VERSION "1.9.0a1" - /* Define to 1 if pthread_spin_init is available. */ /* #undef PT_LOCK_SPIN */ @@ -195,7 +195,7 @@ /* Version number of package */ #define _FI_EXP(s) #s #define _FI_TO_STRING(s) _FI_EXP(s) -#define VERSION _FI_TO_STRING(FI_MAJOR_VERSION) "." _FI_TO_STRING(FI_MINOR_VERSION) ".1a1" +#define VERSION _FI_TO_STRING(FI_MAJOR_VERSION) "." _FI_TO_STRING(FI_MINOR_VERSION) "." _FI_TO_STRING(FI_REVISION_VERSION) #ifndef BUILD_ID #define BUILD_ID "" diff --git a/include/windows/osd.h b/include/windows/osd.h index 5c2ea78cbea..2cb9bbcdb09 100644 --- a/include/windows/osd.h +++ b/include/windows/osd.h @@ -32,6 +32,7 @@ #include "pthread.h" #include +#include #include #include @@ -213,9 +214,6 @@ extern "C" { #define SHUT_RDWR SD_BOTH #endif -#ifndef _SC_PAGESIZE -#define _SC_PAGESIZE 0 -#endif #define FI_DESTRUCTOR(func) void func @@ -228,6 +226,10 @@ extern "C" { ((err) == EWOULDBLOCK) || \ ((err) == EAGAIN)) +#define OFI_SOCK_TRY_ACCEPT_AGAIN(err) \ + (((err) == EAGAIN) || \ + ((err) == EWOULDBLOCK)) + #define OFI_SOCK_TRY_CONN_AGAIN(err) \ (((err) == EWOULDBLOCK) || \ ((err) == EINPROGRESS)) @@ -263,6 +265,7 @@ do \ #define strcasecmp _stricmp #define snprintf _snprintf #define sleep(x) Sleep(x * 1000) +#define strtok_r strtok_s #define __PRI64_PREFIX "ll" @@ -711,40 +714,53 @@ static inline SOCKET ofi_socket(int domain, int type, int protocol) return socket(domain, type, protocol); } -static inline ssize_t ofi_read_socket(SOCKET fd, void *buf, size_t count) +/* + * The windows API limits socket send/recv transfers to INT_MAX. + * For nonblocking, stream sockets, we limit send/recv calls to that + * size, since the sockets aren't guaranteed to send the full amount + * requested. 
For datagram sockets, we don't expect any transfers to + * be larger than a few KB. + * We do not handle blocking sockets that attempt to transfer more + * than INT_MAX data at a time. + */ +static inline ssize_t +ofi_recv_socket(SOCKET fd, void *buf, size_t count, int flags) { - return recv(fd, (char *)buf, (int)count, 0); + int len = count > INT_MAX ? INT_MAX : (int) count; + return (ssize_t) recv(fd, (char *) buf, len, flags); } -static inline ssize_t ofi_write_socket(SOCKET fd, const void *buf, size_t count) +static inline ssize_t +ofi_send_socket(SOCKET fd, const void *buf, size_t count, int flags) { - return send(fd, (const char*)buf, (int)count, 0); + int len = count > INT_MAX ? INT_MAX : (int) count; + return (ssize_t) send(fd, (const char*) buf, len, flags); } -static inline ssize_t ofi_recv_socket(SOCKET fd, void *buf, size_t count, - int flags) +static inline ssize_t ofi_read_socket(SOCKET fd, void *buf, size_t count) { - return recv(fd, (char *)buf, (int)count, flags); + return ofi_recv_socket(fd, buf, count, 0); } -static inline ssize_t -ofi_recvfrom_socket(SOCKET fd, void *buf, size_t count, int flags, - struct sockaddr *from, socklen_t *fromlen) +static inline ssize_t ofi_write_socket(SOCKET fd, const void *buf, size_t count) { - return recvfrom(fd, (char*)buf, (int)count, flags, from, fromlen); + return ofi_send_socket(fd, buf, count, 0); } -static inline ssize_t ofi_send_socket(SOCKET fd, const void *buf, size_t count, - int flags) +static inline ssize_t +ofi_recvfrom_socket(SOCKET fd, void *buf, size_t count, int flags, + struct sockaddr *from, socklen_t *fromlen) { - return send(fd, (const char*)buf, (int)count, flags); + int len = count > INT_MAX ? INT_MAX : (int) count; + return recvfrom(fd, (char*) buf, len, flags, from, (int *) fromlen); } static inline ssize_t ofi_sendto_socket(SOCKET fd, const void *buf, size_t count, int flags, const struct sockaddr *to, socklen_t tolen) { - return sendto(fd, (const char*)buf, (int)count, flags, to, tolen); + int len = count > INT_MAX ? INT_MAX : (int) count; + return sendto(fd, (const char*) buf, len, flags, to, (int) tolen); } ssize_t ofi_writev_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt); @@ -780,6 +796,12 @@ static inline int fi_fd_nonblock(SOCKET fd) return ioctlsocket(fd, FIONBIO, &argp) ? -WSAGetLastError() : 0; } +static inline int fi_fd_block(SOCKET fd) +{ + u_long argp = 0; + return ioctlsocket(fd, FIONBIO, &argp) ? 
-WSAGetLastError() : 0; +} + /* Note: Use static variable `errno` for libc routines * (such as fopen, lseek and etc) * If you need to define which function/variable is needed @@ -831,15 +853,35 @@ static inline char * strndup(char const *src, size_t n) return dst; } +char *strcasestr(const char *haystack, const char *needle); + +#ifndef _SC_PAGESIZE +#define _SC_PAGESIZE 0 +#endif + +#ifndef _SC_NPROCESSORS_ONLN +#define _SC_NPROCESSORS_ONLN 1 +#endif + +#ifndef _SC_PHYS_PAGES +#define _SC_PHYS_PAGES 2 +#endif + static inline long ofi_sysconf(int name) { SYSTEM_INFO si; + ULONGLONG mem_size = 0; GetSystemInfo(&si); switch (name) { case _SC_PAGESIZE: return si.dwPageSize; + case _SC_NPROCESSORS_ONLN: + return si.dwNumberOfProcessors; + case _SC_PHYS_PAGES: + GetPhysicallyInstalledSystemMemory(&mem_size); + return mem_size / si.dwPageSize; default: errno = EINVAL; return -1; @@ -870,7 +912,7 @@ static inline int ofi_hugepage_enabled(void) static inline int ofi_is_loopback_addr(struct sockaddr *addr) { return (addr->sa_family == AF_INET && - ((struct sockaddr_in *)addr)->sin_addr.s_addr == ntohl(INADDR_LOOPBACK)) || + ((struct sockaddr_in *)addr)->sin_addr.s_addr == htonl(INADDR_LOOPBACK)) || (addr->sa_family == AF_INET6 && ((struct sockaddr_in6 *)addr)->sin6_addr.u.Word[0] == 0 && ((struct sockaddr_in6 *)addr)->sin6_addr.u.Word[1] == 0 && @@ -879,11 +921,30 @@ static inline int ofi_is_loopback_addr(struct sockaddr *addr) { ((struct sockaddr_in6 *)addr)->sin6_addr.u.Word[4] == 0 && ((struct sockaddr_in6 *)addr)->sin6_addr.u.Word[5] == 0 && ((struct sockaddr_in6 *)addr)->sin6_addr.u.Word[6] == 0 && - ((struct sockaddr_in6 *)addr)->sin6_addr.u.Word[7] == ntohs(1)); + ((struct sockaddr_in6 *)addr)->sin6_addr.u.Word[7] == htons(1)); } size_t ofi_ifaddr_get_speed(struct ifaddrs *ifa); +#define file2unix_time 10000000i64 +#define win2unix_epoch 116444736000000000i64 +#define CLOCK_MONOTONIC 1 + +/* Own implementation of clock_gettime*/ +static inline +int clock_gettime(int which_clock, struct timespec *spec) +{ + __int64 wintime; + + GetSystemTimeAsFileTime((FILETIME*)&wintime); + wintime -= win2unix_epoch; + + spec->tv_sec = wintime / file2unix_time; + spec->tv_nsec = wintime % file2unix_time * 100; + + return 0; +} + /* complex operations implementation */ #define OFI_DEF_COMPLEX(type) \ @@ -950,11 +1011,15 @@ OFI_DEF_COMPLEX(long_double) /* atomics primitives */ #ifdef HAVE_BUILTIN_ATOMICS #define InterlockedAdd32 InterlockedAdd +#define InterlockedCompareExchange32 InterlockedCompareExchange typedef LONG ofi_atomic_int_32_t; typedef LONGLONG ofi_atomic_int_64_t; #define ofi_atomic_add_and_fetch(radix, ptr, val) InterlockedAdd##radix((ofi_atomic_int_##radix##_t *)(ptr), (ofi_atomic_int_##radix##_t)(val)) #define ofi_atomic_sub_and_fetch(radix, ptr, val) InterlockedAdd##radix((ofi_atomic_int_##radix##_t *)(ptr), -(ofi_atomic_int_##radix##_t)(val)) +#define ofi_atomic_cas_bool(radix, ptr, expected, desired) \ + (InterlockedCompareExchange##radix(ptr, desired, expected) == expected) + #endif /* HAVE_BUILTIN_ATOMICS */ static inline int ofi_set_thread_affinity(const char *s) diff --git a/include/windows/pthread.h b/include/windows/pthread.h index 3548b54f852..67e3f9cef2b 100644 --- a/include/windows/pthread.h +++ b/include/windows/pthread.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2017 Intel Corporation. All rights reserved. +* Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -38,8 +39,10 @@ #include #include #include +#include #define PTHREAD_MUTEX_INITIALIZER {0} +#define PTHREAD_RWLOCK_INITIALIZER {0} #define pthread_cond_signal WakeConditionVariable #define pthread_cond_broadcast WakeAllConditionVariable @@ -138,6 +141,12 @@ static inline pthread_t pthread_self(void) return (pthread_t) ENOSYS; } +static inline int pthread_yield(void) +{ + (void) SwitchToThread(); + return 0; +} + /* * TODO: temporary solution * Need to re-implement @@ -148,6 +157,84 @@ typedef struct pthread_cleanup_t pthread_cleanup_callback_t routine; void *arg; } pthread_cleanup_t; + +/* Read-Write lock implementation */ + +typedef struct { + SRWLOCK lock; /* Windows Slim Reader Writer Lock */ + bool write_mode; +} pthread_rwlock_t; +typedef void pthread_rwlockattr_t; + +static inline int pthread_rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr) +{ + (void)attr; + if (rwlock) { + InitializeSRWLock(&(rwlock->lock)); + rwlock->write_mode = false; + return 0; + } + return 1; +} + +static inline int pthread_rwlock_destroy(pthread_rwlock_t *rwlock) +{ + /* No SRWLock cleanup function */ + (void)rwlock; + return 0; +} + +static inline int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock) +{ + if (rwlock) { + AcquireSRWLockShared(&(rwlock->lock)); + return 0; + } + return 1; +} + +static inline int pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock) +{ + if (rwlock && TryAcquireSRWLockShared(&(rwlock->lock))) { + return 0; + } + return 1; +} + +static inline int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock) +{ + if (rwlock) { + AcquireSRWLockExclusive(&(rwlock->lock)); + rwlock->write_mode = true; + return 0; + } + return 1; +} + +static inline int pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock) +{ + if (rwlock && TryAcquireSRWLockExclusive(&(rwlock->lock))) { + rwlock->write_mode = true; + return 0; + } + return 1; +} + + +static inline int pthread_rwlock_unlock(pthread_rwlock_t *rwlock) +{ + if (rwlock) { + if (rwlock->write_mode) { + rwlock->write_mode = false; + ReleaseSRWLockExclusive(&(rwlock->lock)); + } else { + ReleaseSRWLockShared(&(rwlock->lock)); + } + return 0; + } + return 1; +} + #ifndef __cplusplus #define pthread_cleanup_push(_rout, _arg) \ { \ diff --git a/info.vcxproj b/info.vcxproj index e772aea94a5..ec60d27d1eb 100644 --- a/info.vcxproj +++ b/info.vcxproj @@ -13,6 +13,10 @@ Debug-v140 x64 + + Debug-v142 + x64 + Release-ICC x64 @@ -25,6 +29,10 @@ Release-v140 x64 + + Release-v142 + x64 + {90850937-D15C-491D-B294-66DCA165254D} @@ -45,6 +53,12 @@ v141 Unicode + + Application + true + v142 + Unicode + Application true @@ -65,6 +79,13 @@ true Unicode + + Application + false + v142 + true + Unicode + Application false @@ -83,6 +104,9 @@ + + + @@ -92,6 +116,9 @@ + + + @@ -106,6 +133,11 @@ $(Platform)\$(Configuration)\info\ fi_$(ProjectName) + + true + $(Platform)\$(Configuration)\info\ + fi_$(ProjectName) + true $(Platform)\$(Configuration)\info\ @@ -121,6 +153,11 @@ $(Platform)\$(Configuration)\info\ fi_$(ProjectName) + + false + $(Platform)\$(Configuration)\info\ + fi_$(ProjectName) + false $(Platform)\$(Configuration)\info\ @@ -156,6 +193,21 @@ true + + + + + Level3 + Disabled + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(SoludionDir)util\windows\getopt;$(SolutionDir)include;$(SolutionDir)include\windows;%(AdditionalIncludeDirectories) + MultiThreadedDebug + + + Console + true + + @@ -209,6 +261,25 @@ true + + + Level3 + + + MaxSpeed + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 
+ $(SoludionDir)util\windows\getopt;$(SolutionDir)include;$(SolutionDir)include\windows;%(AdditionalIncludeDirectories) + MultiThreaded + + + Console + true + true + true + + Level3 @@ -232,12 +303,15 @@ true true + true true true true + true true + @@ -250,4 +324,4 @@ - \ No newline at end of file + diff --git a/info.vcxproj.filters b/info.vcxproj.filters index 1d2dc9fcdbb..447ea6df8e8 100644 --- a/info.vcxproj.filters +++ b/info.vcxproj.filters @@ -18,6 +18,9 @@ Source Files + + Source Files + Source Files diff --git a/libfabric.def b/libfabric.def index cc3c447fdcd..e870b4489ed 100644 --- a/libfabric.def +++ b/libfabric.def @@ -1,5 +1,6 @@ EXPORTS + fi_version = fi_version fi_dupinfo = fi_dupinfo fi_getinfo = fi_getinfo fi_freeinfo = fi_freeinfo diff --git a/libfabric.map.in b/libfabric.map.in index 53a3ed3c942..2bcc349330f 100644 --- a/libfabric.map.in +++ b/libfabric.map.in @@ -31,3 +31,15 @@ FABRIC_1.2 { fi_freeinfo; fi_dupinfo; } FABRIC_1.1; + +FABRIC_1.3 { + global: + fi_getinfo; + fi_freeinfo; + fi_dupinfo; +} FABRIC_1.2; + +FABRIC_1.4 { + global: + fi_tostr_r; +} FABRIC_1.3; \ No newline at end of file diff --git a/libfabric.sln b/libfabric.sln index 6258724c334..39734ce17d6 100644 --- a/libfabric.sln +++ b/libfabric.sln @@ -1,7 +1,7 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 14 -VisualStudioVersion = 14.0.25420.1 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.29709.97 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libfabric", "libfabric.vcxproj", "{6B3A874F-B14C-4F16-B7C3-31E94859AE3E}" EndProject @@ -25,9 +25,11 @@ Global Debug-ICC|x64 = Debug-ICC|x64 Debug-v140|x64 = Debug-v140|x64 Debug-v141|x64 = Debug-v141|x64 + Debug-v142|x64 = Debug-v142|x64 Release-ICC|x64 = Release-ICC|x64 Release-v140|x64 = Release-v140|x64 Release-v141|x64 = Release-v141|x64 + Release-v142|x64 = Release-v142|x64 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Debug-ICC|x64.ActiveCfg = Debug-ICC|x64 @@ -36,50 +38,69 @@ Global {6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Debug-v140|x64.Build.0 = Debug-v140|x64 {6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Debug-v141|x64.ActiveCfg = Debug-v141|x64 {6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Debug-v141|x64.Build.0 = Debug-v141|x64 + {6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Debug-v142|x64.ActiveCfg = Debug-v142|x64 + {6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Debug-v142|x64.Build.0 = Debug-v142|x64 {6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-ICC|x64.ActiveCfg = Release-ICC|x64 {6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-ICC|x64.Build.0 = Release-ICC|x64 {6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-v140|x64.ActiveCfg = Release-v140|x64 {6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-v140|x64.Build.0 = Release-v140|x64 {6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-v141|x64.ActiveCfg = Release-v141|x64 {6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-v141|x64.Build.0 = Release-v141|x64 + {6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-v142|x64.ActiveCfg = Release-v142|x64 + {6B3A874F-B14C-4F16-B7C3-31E94859AE3E}.Release-v142|x64.Build.0 = Release-v142|x64 {90850937-D15C-491D-B294-66DCA165254D}.Debug-ICC|x64.ActiveCfg = Debug-ICC|x64 {90850937-D15C-491D-B294-66DCA165254D}.Debug-ICC|x64.Build.0 = Debug-ICC|x64 {90850937-D15C-491D-B294-66DCA165254D}.Debug-v140|x64.ActiveCfg = Debug-v140|x64 {90850937-D15C-491D-B294-66DCA165254D}.Debug-v140|x64.Build.0 = Debug-v140|x64 
{90850937-D15C-491D-B294-66DCA165254D}.Debug-v141|x64.ActiveCfg = Debug-v141|x64 {90850937-D15C-491D-B294-66DCA165254D}.Debug-v141|x64.Build.0 = Debug-v141|x64 + {90850937-D15C-491D-B294-66DCA165254D}.Debug-v142|x64.ActiveCfg = Debug-v142|x64 + {90850937-D15C-491D-B294-66DCA165254D}.Debug-v142|x64.Build.0 = Debug-v142|x64 {90850937-D15C-491D-B294-66DCA165254D}.Release-ICC|x64.ActiveCfg = Release-ICC|x64 {90850937-D15C-491D-B294-66DCA165254D}.Release-ICC|x64.Build.0 = Release-ICC|x64 {90850937-D15C-491D-B294-66DCA165254D}.Release-v140|x64.ActiveCfg = Release-v140|x64 {90850937-D15C-491D-B294-66DCA165254D}.Release-v140|x64.Build.0 = Release-v140|x64 {90850937-D15C-491D-B294-66DCA165254D}.Release-v141|x64.ActiveCfg = Release-v141|x64 {90850937-D15C-491D-B294-66DCA165254D}.Release-v141|x64.Build.0 = Release-v141|x64 + {90850937-D15C-491D-B294-66DCA165254D}.Release-v142|x64.ActiveCfg = Release-v142|x64 + {90850937-D15C-491D-B294-66DCA165254D}.Release-v142|x64.Build.0 = Release-v142|x64 {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-ICC|x64.ActiveCfg = Debug-ICC|x64 {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-ICC|x64.Build.0 = Debug-ICC|x64 {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-v140|x64.ActiveCfg = Debug-v140|x64 {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-v140|x64.Build.0 = Debug-v140|x64 {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-v141|x64.ActiveCfg = Debug-v141|x64 {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-v141|x64.Build.0 = Debug-v141|x64 + {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-v142|x64.ActiveCfg = Debug-v142|x64 + {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Debug-v142|x64.Build.0 = Debug-v142|x64 {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-ICC|x64.ActiveCfg = Release-ICC|x64 {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-ICC|x64.Build.0 = Release-ICC|x64 {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-v140|x64.ActiveCfg = Release-v140|x64 {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-v140|x64.Build.0 = Release-v140|x64 {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-v141|x64.ActiveCfg = Release-v141|x64 {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-v141|x64.Build.0 = Release-v141|x64 + {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-v142|x64.ActiveCfg = Release-v142|x64 + {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2}.Release-v142|x64.Build.0 = Release-v142|x64 {C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-ICC|x64.ActiveCfg = Debug-ICC|x64 {C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-ICC|x64.Build.0 = Debug-ICC|x64 {C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-v140|x64.ActiveCfg = Debug-v140|x64 {C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-v140|x64.Build.0 = Debug-v140|x64 {C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-v141|x64.ActiveCfg = Debug-v141|x64 {C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-v141|x64.Build.0 = Debug-v141|x64 + {C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-v142|x64.ActiveCfg = Debug-v142|x64 + {C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Debug-v142|x64.Build.0 = Debug-v142|x64 {C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-ICC|x64.ActiveCfg = Release-ICC|x64 {C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-ICC|x64.Build.0 = Release-ICC|x64 {C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-v140|x64.ActiveCfg = Release-v140|x64 {C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-v140|x64.Build.0 = Release-v140|x64 {C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-v141|x64.ActiveCfg = Release-v141|x64 {C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-v141|x64.Build.0 = Release-v141|x64 + {C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-v142|x64.ActiveCfg = Release-v142|x64 + 
{C835FB00-8E80-4D4A-9791-4B7D6D37168A}.Release-v142|x64.Build.0 = Release-v142|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {081E384D-462B-4FB7-AB58-B39108563DB3} + EndGlobalSection EndGlobal diff --git a/libfabric.spec.in b/libfabric.spec.in index 71ca8fe55df..a1d5e1ff2b9 100644 --- a/libfabric.spec.in +++ b/libfabric.spec.in @@ -20,6 +20,9 @@ License: GPLv2 or BSD Url: http://www.github.com/ofiwg/libfabric Source: http://www.github.org/ofiwg/%{name}/releases/download/v{%version}/%{name}-%{version}.tar.bz2 BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) +%if 0%{?suse_version} >= 1 +Provides: libfabric1 = %{version}-%{release} +%endif %description libfabric provides a user-space API to access high-performance fabric @@ -75,9 +78,12 @@ EOF %endif %endif -%makeinstall installdirs +%make_install installdirs # remove unpackaged files from the buildroot rm -f %{buildroot}%{_libdir}/*.la +%if 0%{?_version_symbolic_link:1} +%{__ln_s} %{version} %{buildroot}/%{_version_symbolic_link} +%endif %clean rm -rf %{buildroot} @@ -91,6 +97,9 @@ rm -rf %{buildroot} %{_bindir}/fi_info %{_bindir}/fi_strerror %{_bindir}/fi_pingpong +%if 0%{?_version_symbolic_link:1} +%{_version_symbolic_link} +%endif %dir %{_libdir}/libfabric/ %doc AUTHORS COPYING README diff --git a/libfabric.vcxproj b/libfabric.vcxproj index 5a084bd6990..fd7765932e0 100644 --- a/libfabric.vcxproj +++ b/libfabric.vcxproj @@ -13,6 +13,10 @@ Debug-v141 x64 + + Debug-v142 + x64 + Release-ICC x64 @@ -25,6 +29,10 @@ Release-v141 x64 + + Release-v142 + x64 + {6B3A874F-B14C-4F16-B7C3-31E94859AE3E} @@ -45,6 +53,12 @@ v141 MultiByte + + DynamicLibrary + true + v142 + MultiByte + DynamicLibrary true @@ -65,6 +79,13 @@ true MultiByte + + DynamicLibrary + false + v142 + true + MultiByte + DynamicLibrary false @@ -81,6 +102,9 @@ + + + @@ -90,6 +114,9 @@ + + + @@ -102,6 +129,10 @@ true $(Platform)\$(Configuration)\libfabric\ + + true + $(Platform)\$(Configuration)\libfabric\ + true $(Platform)\$(Configuration)\libfabric\ @@ -114,25 +145,22 @@ false $(Platform)\$(Configuration)\libfabric\ + + false + $(Platform)\$(Configuration)\libfabric\ + false $(Platform)\$(Configuration)\libfabric\ - + NotUsing Level4 - Disabled - WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINSOCK_DEPRECATED_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;ENABLE_DEBUG;%(PreprocessorDefinitions) + $(ProjectDir)include;$(ProjectDir)include\windows;$(ProjectDir)prov\netdir\NetDirect;$(ProjectDir)prov\hook\src;$(ProjectDir)prov\hook\include;$(ProjectDir)prov\hook\perf\include;$(ProjectDir)prov\efa\src;$(ProjectDir)prov\efa\include;$(ProjectDir)prov\efa\src\rxr;$(ProjectDir)prov\efa\src\efa_verbs;$(ProjectDir)prov\efa\src\efa_verbs\plat true - $(ProjectDir)include;$(ProjectDir)include\windows;$(ProjectDir)prov\netdir\NetDirect;$(ProjectDir)prov\hook\src;$(ProjectDir)prov\hook\include;$(ProjectDir)prov\hook\perf\include CompileAsC - 4127;4200;4204;4221;4115;4201;4100 - true - MultiThreadedDebug false - false - true Windows @@ -141,6 +169,17 @@ libfabric.def + + + Disabled + WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINSOCK_DEPRECATED_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;ENABLE_DEBUG;%(PreprocessorDefinitions) + 4127;4200;4204;4221;4115;4201;4100 + true + MultiThreadedDebug + false + true + + NotUsing @@ -157,12 +196,17 @@ false true - - Windows - true - 
Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;iphlpapi.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - libfabric.def - + + + + Disabled + WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINSOCK_DEPRECATED_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;ENABLE_DEBUG;%(PreprocessorDefinitions) + 4127;4200;4204;4221;4115;4201;4100 + true + MultiThreadedDebug + false + true + @@ -201,6 +245,7 @@ false MultiThreaded false + /DNDEBUG %(AdditionalOptions) Windows @@ -225,15 +270,32 @@ true false MultiThreaded - false + /DNDEBUG %(AdditionalOptions) + + + true + true + + + + + Level4 + NotUsing + MaxSpeed + true + true + WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINSOCK_DEPRECATED_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;%(PreprocessorDefinitions) + $(ProjectDir)include;$(ProjectDir)include\windows;$(ProjectDir)prov\netdir\NetDirect;$(ProjectDir)prov\hook\src;$(ProjectDir)prov\hook\include;$(ProjectDir)prov\hook\perf\include; + 4127;4200;4204;4221;4115;4201;4100 + true + true + false + MultiThreaded + /DNDEBUG %(AdditionalOptions) - Windows - true true true - Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;iphlpapi.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) - libfabric.def @@ -295,12 +357,16 @@ + + + + @@ -318,12 +384,16 @@ + + + + @@ -338,180 +408,234 @@ + + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + 
$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) 
$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) 
$(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) + $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) $(ProjectDir)prov\sockets\include;%(AdditionalIncludeDirectories) @@ -535,9 +659,11 @@ ofi_osd.h ofi_osd.h + ofi_osd.h ofi_osd.h ofi_osd.h ofi_osd.h + ofi_osd.h ofi_osd.h @@ -545,6 +671,7 @@ + @@ -557,7 +684,10 @@ + + + 4127;869 @@ -566,6 +696,10 @@ + + + + @@ -583,9 +717,11 @@ + + @@ -613,6 +749,7 @@ + diff --git a/libfabric.vcxproj.filters b/libfabric.vcxproj.filters index 67a99fca620..55f0e0440a0 100644 --- 
a/libfabric.vcxproj.filters +++ b/libfabric.vcxproj.filters @@ -138,6 +138,15 @@ Source Files\src + + Source Files\src + + + Source Files\src + + + Source Files\src + Source Files\src @@ -180,9 +189,18 @@ Source Files\prov\util + + Source Files\prov\util + Source Files\prov\util + + Source Files\prov\util + + + Source Files\prov\util + Source Files\src\windows @@ -399,6 +417,9 @@ Source Files\prov\util + + Source Files\prov\util + Source Files\prov\util @@ -491,9 +512,15 @@ Header Files + + Header Files + Header Files + + Header Files + Header Files @@ -548,6 +575,9 @@ Header Files\rdma + + Header Files\rdma + Header Files\windows diff --git a/man/fabric.7.md b/man/fabric.7.md index 6a16c3717cf..29f76f17ed8 100644 --- a/man/fabric.7.md +++ b/man/fabric.7.md @@ -297,6 +297,68 @@ portability across providers. fabric domain may not be available in a child process because of copy on write restrictions. +# ABI CHANGES + +libfabric releases maintain compatibility with older releases, so that +compiled applications can continue to work as-is, and previously written +applications will compile against newer versions of the library without +needing source code changes. The changes below describe ABI updates +that have occurred and which libfabric release corresponds to the +changes. + +Note that because most functions called by applications actually call +static inline functions, which in turn reference function pointers in +order to call directly into providers, libfabric only exports a handful +of functions directly. ABI changes are limited to those functions, +most notably the fi_getinfo call and its returned attribute structures. + +The ABI version is independent from the libfabric release version. + +## ABI 1.0 + +The initial libfabric release (1.0.0) also corresponds to ABI version 1.0. +The 1.0 ABI was unchanged for libfabric major.minor versions 1.0, 1.1, 1.2, +1.3, and 1.4. + +## ABI 1.1 + +A number of external data structures were appended starting with libfabric +version 1.5. These changes included adding the fields to the following +data structures. The 1.1 ABI was exported by libfabric versions 1.5 and +1.6. + +*fi_fabric_attr* +: Added api_version + +*fi_domain_attr* +: Added cntr_cnt, mr_iov_limit, caps, mode, auth_key, auth_key_size, + max_err_data, and mr_cnt fields. The mr_mode field was also changed + from an enum to an integer flag field. + +*fi_ep_attr* +: Added auth_key_size and auth_key fields. + +## ABI 1.2 + +The 1.2 ABI version was exported by libfabric versions 1.7 and 1.8, and +expanded the following structure. + +*fi_info* +: The fi_info structure was expanded to reference a new fabric object, + fid_nic. When available, the fid_nic references a new set of attributes + related to network hardware details. + +## ABI 1.3 + +The 1.3 ABI is also the current ABI version. All libfabric releases +starting at 1.9 export this ABI. + +*fi_domain_attr* +: Added tclass + +*fi_tx_attr* +: Added tclass + # SEE ALSO [`fi_info`(1)](fi_info.1.html), diff --git a/man/fi_atomic.3.md b/man/fi_atomic.3.md index 37f9daaea0d..33758563d51 100644 --- a/man/fi_atomic.3.md +++ b/man/fi_atomic.3.md @@ -130,7 +130,7 @@ int fi_query_atomic(struct fid_domain *domain, *desc / compare_desc / result_desc* : Data descriptor associated with the local data buffer, local compare - buffer, and local result buffer, respectively. + buffer, and local result buffer, respectively. See [`fi_mr`(3)](fi_mr.3.html). *dest_addr* : Destination address for connectionless atomic operations. 
Ignored for @@ -382,7 +382,7 @@ and type of parameters that they accept as input. Otherwise, they perform the same general function. The call fi_atomic transfers the data contained in the user-specified -data buffer to a remote node. For unconnected endpoints, the destination +data buffer to a remote node. For connectionless endpoints, the destination endpoint is specified through the dest_addr parameter. Unless the endpoint has been configured differently, the data buffer passed into fi_atomic must not be touched by the application until the @@ -405,7 +405,7 @@ discussion below for more details. The requested message size that can be used with fi_inject_atomic is limited by inject_size. The fi_atomicmsg call supports atomic functions over both connected -and unconnected endpoints, with the ability to control the atomic +and connectionless endpoints, with the ability to control the atomic operation per call through the use of flags. The fi_atomicmsg function takes a struct fi_msg_atomic as input. @@ -600,7 +600,7 @@ with atomic message calls. targeting the same peer endpoint have completed. Operations posted after the fencing will see and/or replace the results of any operations initiated prior to the fenced operation. - + The ordering of operations starting at the posting of the fenced operation (inclusive) to the posting of a subsequent fenced operation (exclusive) is controlled by the endpoint's ordering semantics. diff --git a/man/fi_av_set.3.md b/man/fi_av_set.3.md index f8004b47f10..38f439d5fd4 100644 --- a/man/fi_av_set.3.md +++ b/man/fi_av_set.3.md @@ -9,17 +9,47 @@ tagline: Libfabric Programmer's Manual fi_av_set \- Address vector set operations -fi_av_open / fi_close -: Open or close an address vector +fi_av_set / fi_close +: Open or close an address vector set + +fi_av_set_union +: Perform a set union operation on two AV sets + +fi_av_set_intersect +: Perform a set intersect operation on two AV sets + +fi_av_set_diff +: Perform a set difference operation on two AV sets + +fi_av_set_insert +: Add an address to an AV set + +fi_av_set_remove +: Remove an address from an AV set + +fi_av_set_addr +: Obtain a collective address for current addresses in an AV set # SYNOPSIS ```c -#include +#include + +int fi_av_set(struct fid_av *av, struct fi_av_set_attr *attr, + struct fid_av_set **set, void * context); + +int fi_av_set_union(struct fid_av_set *dst, const struct fid_av_set *src); + +int fi_av_set_intersect(struct fid_av_set *dst, const struct fid_av_set *src); + +int fi_av_set_diff(struct fid_av_set *dst, const struct fid_av_set *src); + +int fi_av_set_insert(struct fid_av_set *set, fi_addr_t addr); -int fi_av_open(struct fid_domain *domain, struct fi_av_attr *attr, - struct fid_av **av, void *context); +int fi_av_set_remove(struct fid_av_set *set, fi_addr_t addr); + +int fi_av_set_addr(struct fid_av_set *set, fi_addr_t *coll_addr); int fi_close(struct fid *av_set); ``` @@ -29,6 +59,15 @@ int fi_close(struct fid *av_set); *av* : Address vector +*set* +: Address vector set + +*dst* +: Address vector set updated by set operation + +*src* +: Address vector set providing input to a set operation + *attr* : Address vector set attributes @@ -38,6 +77,12 @@ int fi_close(struct fid *av_set); *flags* : Additional flags to apply to the operation. +*addr* +: Destination address to insert into or remove from the AV set. + +*coll_addr* +: Address identifying collective group. 
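As an illustration only (not part of this patch), the calls above might be combined as follows. The sketch assumes an FI_AV_TABLE type AV named `av` whose first `nmembers` entries are the intended group members, an endpoint `ep` already bound to that AV and to an event queue; error handling is omitted and `nmembers` is a hypothetical value.

```c
/* Illustrative sketch only: build an AV set from existing AV entries,
 * derive a collective address for it, and request to join the group. */
size_t nmembers = 4;                    /* hypothetical group size */
struct fi_av_set_attr attr = {
	.count      = nmembers,
	.start_addr = 0,                /* first AV index to include */
	.end_addr   = nmembers - 1,     /* last AV index, inclusive */
	.stride     = 1,                /* take every entry in the range */
};
struct fid_av_set *set;
struct fid_mc *mc;
fi_addr_t coll_addr;

fi_av_set(av, &attr, &set, NULL);       /* create the AV set */
fi_av_set_addr(set, &coll_addr);        /* local address naming the group */
fi_join_collective(ep, coll_addr, set, 0, &mc, NULL);
/* Wait for the join completion event on the endpoint's EQ before
 * issuing collective operations addressed to the returned group. */
```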
+ # DESCRIPTION An address vector set (AV set) represents an ordered subset of addresses of an @@ -77,6 +122,8 @@ struct fi_av_set_attr { *count* : Indicates the expected the number of members that will be a part of the AV set. The provider uses this to optimize resource allocations. + If count is 0, the provider will select a size based on + available system configuration data or underlying limitations. *start_addr / end_addr* : The starting and ending addresses, inclusive, to @@ -88,6 +135,9 @@ struct fi_av_set_attr { empty AV set, a communication key is being provided, or the AV is of type FI_AV_MAP. + The number of addresses between start_addr and end_addr must be less than + or equal to the specified count value. + *stride* : The number of entries between successive addresses included in the AV set. The AV set will include all addresses from start_addr + stride x i, @@ -139,6 +189,21 @@ The AV set insert call appends the specified address to the end of the AV set. The AV set remove call removes the specified address from the given AV set. The order of the remaining addresses in the AV set is unchanged. +## fi_av_set_addr + +Returns an address that may be used to communicate with all current members +of an AV set. This is a local operation only that does not involve network +communication. The returned address may be used as input into +fi_join_collective. Note that attempting to use the address returned from +fi_av_set_addr (e.g. passing it to fi_join_collective) while simultaneously +modifying the addresses stored in an AV set results in undefined behavior. + +## fi_close + +Closes an AV set and releases all resources associated with it. Any +operations active at the time an AV set is closed will be aborted, with +the result of the collective undefined. + # NOTES Developers who are familiar with MPI will find that AV sets are similar to diff --git a/man/fi_cntr.3.md b/man/fi_cntr.3.md index 9d4b5c0c5b1..cc87f5ab909 100644 --- a/man/fi_cntr.3.md +++ b/man/fi_cntr.3.md @@ -131,8 +131,8 @@ struct fi_cntr_attr { object associated with a counter, in order to use it in other system calls. The following values may be used to specify the type of wait object associated with a counter: FI_WAIT_NONE, FI_WAIT_UNSPEC, - FI_WAIT_SET, FI_WAIT_FD, and FI_WAIT_MUTEX_COND. The default is - FI_WAIT_NONE. + FI_WAIT_SET, FI_WAIT_FD, FI_WAIT_MUTEX_COND, and FI_WAIT_YIELD. + The default is FI_WAIT_NONE. - *FI_WAIT_NONE* : Used to indicate that the user will not block (wait) for events on @@ -161,6 +161,10 @@ struct fi_cntr_attr { : Specifies that the counter should use a pthread mutex and cond variable as a wait object. +- *FI_WAIT_YIELD* +: Indicates that the counter will wait without a wait object but instead + yield on every wait. Allows usage of fi_cntr_wait through a spin. + *wait_set* : If wait_obj is FI_WAIT_SET, this field references a wait object to which the event counter should attach. When an event is added to diff --git a/man/fi_collective.3.md b/man/fi_collective.3.md index e0a0246f107..e4035cc1c1f 100644 --- a/man/fi_collective.3.md +++ b/man/fi_collective.3.md @@ -15,22 +15,33 @@ fi_barrier the barrier call. fi_broadcast -: A single sender transmits data to all receiver peers. +: A single sender transmits data to all peers, including itself. + +fi_alltoall +: Each peer distributes a slice of its local data to all peers. fi_allreduce : Collective operation where all peers broadcast an atomic operation to all other peers. 
+fi_allgather +: Each peer sends a complete copy of its local data to all peers. + fi_reduce_scatter : Collective call where data is collected from all peers and merged (reduced). The results of the reduction is distributed back to the peers, with each peer receiving a slice of the results. -fi_alltoall -: Each peer distributes a slice of its local data to all peers. +fi_reduce +: Collective call where data is collected from all peers to a root peer + and merged (reduced). -fi_allgather -: Each peer sends a complete copy of its local data to all peers. +fi_scatter +: A single sender distributes (scatters) a slice of its local data to + all peers. + +fi_gather +: All peers send their data to a root peer. fi_query_collective : Returns information about which collective operations are supported by a @@ -49,32 +60,46 @@ ssize_t fi_barrier(struct fid_ep *ep, fi_addr_t coll_addr, void *context); ssize_t fi_broadcast(struct fid_ep *ep, void *buf, size_t count, void *desc, - fi_addr_t coll_addr, enum fi_datatype datatype, enum fi_op op, + fi_addr_t coll_addr, fi_addr_t root_addr, enum fi_datatype datatype, uint64_t flags, void *context); -ssize_t fi_allreduce(struct fid_ep *ep, const void *buf, size_t count, +ssize_t fi_alltoall(struct fid_ep *ep, const void *buf, size_t count, void *desc, void *result, void *result_desc, - fi_addr_t coll_addr, enum fi_datatype datatype, enum fi_op op, + fi_addr_t coll_addr, enum fi_datatype datatype, uint64_t flags, void *context); -ssize_t fi_reduce_scatter(struct fid_ep *ep, const void *buf, size_t count, +ssize_t fi_allreduce(struct fid_ep *ep, const void *buf, size_t count, void *desc, void *result, void *result_desc, fi_addr_t coll_addr, enum fi_datatype datatype, enum fi_op op, uint64_t flags, void *context); -ssize_t fi_alltoall(struct fid_ep *ep, const void *buf, size_t count, +ssize_t fi_allgather(struct fid_ep *ep, const void *buf, size_t count, void *desc, void *result, void *result_desc, fi_addr_t coll_addr, enum fi_datatype datatype, uint64_t flags, void *context); -ssize_t fi_allgather(struct fid_ep *ep, const void *buf, size_t count, +ssize_t fi_reduce_scatter(struct fid_ep *ep, const void *buf, size_t count, void *desc, void *result, void *result_desc, - fi_addr_t coll_addr, enum fi_datatype datatype, + fi_addr_t coll_addr, enum fi_datatype datatype, enum fi_op op, + uint64_t flags, void *context); + +ssize_t fi_reduce(struct fid_ep *ep, const void *buf, size_t count, + void *desc, void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, enum fi_op op, + uint64_t flags, void *context); + +ssize_t fi_scatter(struct fid_ep *ep, const void *buf, size_t count, + void *desc, void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, + uint64_t flags, void *context); + +ssize_t fi_gather(struct fid_ep *ep, const void *buf, size_t count, + void *desc, void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, uint64_t flags, void *context); int fi_query_collective(struct fid_domain *domain, - enum fi_datatype datatype, enum fi_op op, - struct fi_collective_attr *attr, uint64_t flags); + fi_collective_op coll, struct fi_collective_attr *attr, uint64_t flags); ``` # ARGUMENTS @@ -107,6 +132,9 @@ int fi_query_collective(struct fid_domain *domain, *coll_addr* : Address referring to the collective group of endpoints. +*root_addr* +: Single endpoint that is the source or destination of collective data. 
+ *flags* : Additional flags to apply for the atomic operation @@ -115,7 +143,13 @@ int fi_query_collective(struct fid_domain *domain, ignored if the operation will not generate a successful completion, unless an op flag specifies the context parameter be used for required input. -# DESCRIPTION +# DESCRIPTION (EXPERIMENTAL APIs) + +The collective APIs are new to the 1.9 libfabric release. Although, efforts +have been made to design the APIs such that they align well with applications +and are implementable by the providers, the APIs should be considered +experimental and may be subject to change in future versions of the +library until the experimental tag has been removed. In general collective operations can be thought of as coordinated atomic operations between a set of peer endpoints. Readers should refer to the @@ -166,9 +200,8 @@ associated synchronously with an AV using the fi_ep_bind() call. Upon completion of the fi_join_collective operation, an fi_addr is provided that is used as the target address when invoking a collective operation. -For developer convenience, a set of collective APIs are defined. However, -these are inline wrappers around the atomic interfaces. Collective APIs -differ from message and RMA interfaces in that the format of the data is +For developer convenience, a set of collective APIs are defined. Collective +APIs differ from message and RMA interfaces in that the format of the data is known to the provider, and the collective may perform an operation on that data. This aligns collective operations closely with the atomic interfaces. @@ -198,12 +231,12 @@ will report that the join has completed. Application managed collective memberships are an exception. With application managed memberships, the fi_join_collective call may be completed locally without fabric communication. For provider managed memberships, the join collective call requires as -input a coll_addr that refers to an existing collective group. The -fi_join_collective call will create a new collective subgroup. If there is -no existing collective group (e.g. this is the first group being created), -or if application managed memberships are used, coll_addr should be set to -FI_ADDR_UNAVAIL. For provider managed memberships, this will result in -using all entries in the associated AV as the base. +input a coll_addr that refers to either an address associated with an +AV set (see fi_av_set_addr) or an existing collective group (obtained through +a previous call to fi_join_collective). The +fi_join_collective call will create a new collective subgroup. +If application managed memberships are used, coll_addr should be set to +FI_ADDR_UNAVAIL. Applications must call fi_close on the collective group to disconnect the endpoint from the group. After a join operation has completed, the @@ -222,11 +255,9 @@ completed prior to them calling barrier has finished. ## Broadcast (fi_broadcast) fi_broadcast transfers an array of data from a single sender to all other -members of the collective group. The sender of the broadcast data must -specify the FI_SEND flag, while receivers use the FI_RECV flag. The input -buf parameter is treated as either the transmit buffer, if FI_SEND is set, or -the receive buffer, if FI_RECV is set. Either the FI_SEND or FI_RECV flag -must be set. The broadcast operation acts as an atomic write or read to a +members of the collective group. The input buf parameter is treated as the +transmit buffer if the local rank is the root, otherwise it is the receive +buffer. 
The broadcast operation acts as an atomic write or read to a data array. As a result, the format of the data in buf is specified through the datatype parameter. Any non-void datatype may be broadcast. @@ -242,6 +273,31 @@ array of integers to a group of peers. broadcast ``` +## All to All (fi_alltoall) + +The fi_alltoall collective involves distributing (or scattering) different +portions of an array of data to peers. It is best explained using an +example. Here three peers perform an all to all collective to exchange +different entries in an integer array. + +``` +[1] [2] [3] +[5] [6] [7] +[9] [10] [11] + \ | / + All to all + / | \ +[1] [5] [9] +[2] [6] [10] +[3] [7] [11] +``` + +Each peer sends a piece of its data to the other peers. + +All to all operations may be performed on any non-void datatype. However, +all to all does not perform an operation on the data itself, so no operation +is specified. + ## All Reduce (fi_allreduce) fi_allreduce can be described as all peers providing input into an atomic @@ -270,28 +326,25 @@ between three peers. All Reduce ``` -## All to All (fi_alltoall) +## All Gather (fi_allgather) -The fi_alltoall collective involves distributing (or scattering) different -portions of an array of data to peers. It is best explained using an -example. Here three peers perform an all to all collective to exchange -different entries in an integer array. +Conceptually, all gather can be viewed as the opposite of the scatter +component from reduce-scatter. All gather collects data from all peers into +a single array, then copies that array back to each peer. ``` -[1] [2] [3] -[5] [6] [7] -[9] [10] [11] - \ | / - All to all - / | \ -[1] [5] [9] -[5] [6] [7] -[9] [10] [11] +[1] [5] [9] + \ | / + All gather + / | \ +[1] [1] [1] +[5] [5] [5] +[9] [9] [9] ``` -All to all operations may be performed on any non-void datatype. However, -all to all does not perform an operation on the data itself, so no operation -is specified. +All gather may be performed on any non-void datatype. However, all gather +does not perform an operation on the data itself, so no operation is +specified. ## Reduce-Scatter (fi_reduce_scatter) @@ -321,25 +374,69 @@ This is shown by the following example: The reduce scatter call supports the same datatype and atomic operation as fi_allreduce. -## All Gather (fi_allgather) +## Reduce (fi_reduce) -Conceptually, all gather can be viewed as the opposite of the scatter -component from reduce-scatter. All gather collects data from all peers into -a single array, then copies that array back to each peer. +The fi_reduce collective is the first half of an fi_allreduce operation. +With reduce, all peers provide input into an atomic operation, with the +the results collected by a single 'root' endpoint. + +This is shown by the following example, with the leftmost peer identified +as the root: ``` -[1] [5] [9] - \ | / - All gather - / | \ [1] [1] [1] [5] [5] [5] [9] [9] [9] + \ | / + sum (reduce) + / + [3] +[15] +[27] ``` -All gather may be performed on any non-void datatype. However, all gather -does not perform an operation on the data itself, so no operation is -specified. +The reduce call supports the same datatype and atomic operation as +fi_allreduce. + +## Scatter (fi_scatter) + +The fi_scatter collective is the second half of an fi_reduce_scatter operation. +The data from a single 'root' endpoint is split and distributed to all peers. 
+ +This is shown by the following example: + +``` + [3] +[15] +[27] + \ + scatter + / | \ +[3] [15] [27] +``` + +The scatter operation is used to distribute results to the peers. No atomic +operation is performed on the data. + +## Gather (fi_gather) + +The fi_gather operation is used to collect (gather) the results from all peers +and store them at a 'root' peer. + +This is shown by the following example, with the leftmost peer identified +as the root. + +``` +[1] [5] [9] + \ | / + gather + / +[1] +[5] +[9] +``` + +The gather operation does not perform any operation on the data itself. ## Query Collective Attributes (fi_query_collective) @@ -350,34 +447,47 @@ by the provider must be implemented by the application. The query call checks whether a provider supports a specific collective operation for a given datatype and operation, if applicable. -The datatype and operation of the collective are provided as input -into fi_query_collective. For operations that do not exchange -application data, such as fi_barrier, the datatype should be set to -FI_VOID. The op parameter may reference one of these atomic opcodes: -FI_MIN, FI_MAX, FI_SUM, FI_PROD, FI_LOR, FI_LAND, FI_BOR, FI_BAND, -FI_LXOR, FI_BXOR, or a collective operation: FI_BARRIER, FI_BROADCAST, -FI_ALLTOALL, FI_ALLGATHER. The use of an atomic opcode will indicate -if the provider supports the fi_allreduce() call for the given -operation and datatype, unless the FI_SCATTER flag has been specified. If -FI_SCATTER has been set, query will return if the provider supports the -fi_reduce_scatter() call for the given operation and datatype. -Specifying a collective operation for the op parameter queries support -for the corresponding collective. - -On success, fi_query_collective will provide information about -the supported limits through the struct fi_collective_attr parameter. +The name of the collective, as well as the datatype and associated +operation, if applicable, are provided as input +into fi_query_collective. + +The coll parameter may reference one of these collectives: +FI_BARRIER, FI_BROADCAST, FI_ALLTOALL, FI_ALLREDUCE, FI_ALLGATHER, +FI_REDUCE_SCATTER, FI_REDUCE, FI_SCATTER, or FI_GATHER. Additional +details on the collective operation are specified through the struct +fi_collective_attr parameter. For collectives that act on data, the +operation and related data type must be specified through the given +attributes. {% highlight c %} struct fi_collective_attr { - struct fi_atomic_attr datatype_attr; - size_t max_members; - uint64_t mode; + enum fi_op op; + enum fi_datatype datatype; + struct fi_atomic_attr datatype_attr; + size_t max_members; + uint64_t mode; }; {% endhighlight %} For a description of struct fi_atomic_attr, see [`fi_atomic`(3)](fi_atomic.3.html). +*op* +: On input, this specifies the atomic operation involved with the collective + call. This should be set to one of the following values: FI_MIN, FI_MAX, + FI_SUM, FI_PROD, FI_LOR, FI_LAND, FI_BOR, FI_BAND, FI_LXOR, FI_BXOR, + FI_ATOMIC_READ, FI_ATOMIC_WRITE, or FI_NOOP. For collectives that do + not exchange application data (fi_barrier), this should be set to FI_NOOP. + +*datatype* +: On input, specifies the datatype of the data being modified by the + collective. This should be set to one of the following values: + FI_INT8, FI_UINT8, FI_INT16, FI_UINT16, FI_INT32, FI_UINT32, FI_INT64, + FI_UINT64, FI_FLOAT, FI_DOUBLE, FI_FLOAT_COMPLEX, FI_DOUBLE_COMPLEX, + FI_LONG_DOUBLE, FI_LONG_DOUBLE_COMPLEX, or FI_VOID.
For collectives + that do not exchange application data (fi_barrier), this should be set + to FI_VOID. + *datatype_attr.count* : The maximum number of elements that may be used with the collective. @@ -393,7 +503,7 @@ For a description of struct fi_atomic_attr, see *mode* : This field is reserved and should be 0. -If a collective operation is supported, the query call will return 0, +If a collective operation is supported, the query call will return FI_SUCCESS, along with attributes on the limits for using that collective operation through the provider. @@ -409,15 +519,6 @@ point atomic operations. The following flags are defined for the specified operations. -*FI_SEND* -: Applies to fi_broadcast() operations. This indicates that the caller - is the transmitter of the broadcast data. There should only be a single - transmitter for each broadcast collective operation. - -*FI_RECV* -: Applies to fi_broadcast() operation. This indicates that the caller - is the receiver of broadcase data. - *FI_SCATTER* : Applies to fi_query_collective. When set, requests attribute information on the reduce-scatter collective operation. diff --git a/man/fi_control.3.md b/man/fi_control.3.md index 61f41a5f9db..648f3f326ff 100644 --- a/man/fi_control.3.md +++ b/man/fi_control.3.md @@ -15,6 +15,9 @@ fi_control \- Perform an operation on a fabric resource. #include int fi_control(struct fid *fid, int command, void *arg); +int fi_alias(struct fid *fid, struct fid **alias_fid, uint64_t flags); +int fi_get_val(struct fid *fid, int name, void *val); +int fi_set_val(struct fid *fid, int name, void *val); ``` @@ -38,6 +41,15 @@ resource being operated on, the specified command, and any provided arguments for the command. For specific details, see the fabric resource specific help pages noted below. +fi_alias, fi_get_val, and fi_set_val are wrappers for fi_control with +commands FI_ALIAS, FI_GET_VAL, FI_SET_VAL, respectively. fi_alias creates +an alias of the specified fabric resource. fi_get_val reads the value of +the named parameter associated with the fabric resource, while fi_set_val +updates that value. Available parameter names depend on the type of the +fabric resource and the provider in use. Providers may define provider +specific names in the provider extension header files ('rdma/fi_ext_*.h'). +Please refer to the provider man pages for details. + # SEE ALSO [`fi_endpoint`(3)](fi_endpoint.3.html), diff --git a/man/fi_cq.3.md b/man/fi_cq.3.md index c7d0cff5432..e16009572a7 100644 --- a/man/fi_cq.3.md +++ b/man/fi_cq.3.md @@ -221,8 +221,8 @@ struct fi_cq_tagged_entry { fi_control to retrieve the underlying wait object associated with a CQ, in order to use it in other system calls. The following values may be used to specify the type of wait object associated with a - CQ: FI_WAIT_NONE, FI_WAIT_UNSPEC, FI_WAIT_SET, FI_WAIT_FD, and - FI_WAIT_MUTEX_COND. The default is FI_WAIT_NONE. + CQ: FI_WAIT_NONE, FI_WAIT_UNSPEC, FI_WAIT_SET, FI_WAIT_FD, + FI_WAIT_MUTEX_COND, and FI_WAIT_YIELD. The default is FI_WAIT_NONE. - *FI_WAIT_NONE* : Used to indicate that the user will not block (wait) for completions @@ -252,9 +252,10 @@ struct fi_cq_tagged_entry { : Specifies that the CQ should use a pthread mutex and cond variable as a wait object. -- *FI_WAIT_CRITSEC_COND* -: Windows specific. Specifies that the CQ should use a critical - section and condition variable as a wait object. +- *FI_WAIT_YIELD* +: Indicates that the CQ will wait without a wait object but instead + yield on every wait. 
Allows usage of fi_cq_sread and fi_cq_sreadfrom + through a spin. *signaling_vector* : If the FI_AFFINITY flag is set, this indicates the logical cpu number @@ -518,11 +519,15 @@ of these fields are the same for all CQ entry structure formats. on converting this error value into a human readable string. *err_data* -: On an error, err_data may reference a provider specific amount of data - associated with an error. The use of this field and its meaning is +: The err_data field is used to return provider specific information, if + available, about the error. On input, err_data should reference a data + buffer of size err_data_size. On output, the provider will fill in this + buffer with any provider specific data which may help identify the cause + of the error. The contents of the err_data field and its meaning are provider specific. It is intended to be used as a debugging aid. See fi_cq_strerror for additional details on converting this error data into - a human readable string. + a human readable string. See the compatibility note below on how this + field is used for older libfabric releases. *err_data_size* : On input, err_data_size indicates the size of the err_data buffer in bytes. @@ -530,9 +535,12 @@ of these fields are the same for all CQ entry structure formats. err_data buffer. The err_data information is typically used with fi_cq_strerror to provide details about the type of error that occurred. - For compatibility purposes, if err_data_size is 0 on input, or the fabric - was opened with release < 1.5, err_data will be set to a data buffer - owned by the provider. The contents of the buffer will remain valid until a + For compatibility purposes, the behavior of the err_data and err_data_size + fields may be modified from that listed above. If err_data_size is 0 + on input, or the fabric was opened with release < 1.5, then any buffer + referenced by err_data will be ignored on input. In this situation, on + output err_data will be set to a data buffer owned by the provider. + The contents of the buffer will remain valid until a subsequent read call against the CQ. Applications must serialize access to the CQ when processing errors to ensure that the buffer referenced by err_data does not change. @@ -723,6 +731,7 @@ The operational flags for the described completion levels are defined below. claiming the message or results. As a result, match complete may involve additional provider level acknowledgements or lengthy delays. However, this completion model enables peer applications to synchronize their execution. + Many providers may not support this semantic. *FI_COMMIT_COMPLETE* : Indicates that a completion should not be generated (locally or at the @@ -734,6 +743,26 @@ The operational flags for the described completion levels are defined below. memory regions over reliable endpoints. This completion mode is experimental. +*FI_FENCE* +: This is not a completion level, but plays a role in the completion + ordering between operations that would not normally be ordered. An + operation that is marked with the FI_FENCE flag and all + operations posted after the fenced operation are deferred until all + previous operations targeting the same peer endpoint have completed. + Additionally, the completion of the fenced operation indicates that + prior operations have met the same completion level as the fenced + operation.
For example, if an operation is posted as + FI_DELIVERY_COMPLETE | FI_FENCE, then its completion indicates prior + operations have met the semantic required for FI_DELIVERY_COMPLETE. + This is true even if the prior operation was posted with a lower + completion level, such as FI_TRANSMIT_COMPLETE or FI_INJECT_COMPLETE. + + Note that a completion generated for an operation posted prior to + the fenced operation only guarantees that the completion level + that was originally requested has been met. It is the completion + of the fenced operation that guarantees that the additional + semantics have been met. + # NOTES A completion queue must be bound to at least one enabled endpoint before any diff --git a/man/fi_domain.3.md b/man/fi_domain.3.md index 28d52daf65d..7acda47cd3f 100644 --- a/man/fi_domain.3.md +++ b/man/fi_domain.3.md @@ -26,6 +26,9 @@ int fi_domain_bind(struct fid_domain *domain, struct fid *eq, int fi_open_ops(struct fid *domain, const char *name, uint64_t flags, void **ops, void *context); + +int fi_set_ops(struct fid *domain, const char *name, uint64_t flags, + void *ops, void *context); ``` # ARGUMENTS @@ -74,6 +77,74 @@ interfaces may be used to access low-level resources and operations that are specific to the opened resource domain. The details of domain interfaces are outside the scope of this documentation. +## fi_set_ops + +fi_set_ops assigns callbacks that a provider should invoke in place +of performing selected tasks. This allows users to modify or control +a provider's default behavior. Conceptually, it allows the user to +hook specific functions used by a provider and replace it with their +own. + +The operations being modified are identified using a well-known +character string, passed as the name parameter. The format of the +ops parameter is dependent upon the name value. The ops parameter will +reference a structure containing the callbacks and other fields needed +by the provider to invoke the user's functions. + +If a provider accepts the override, it will return FI_SUCCESS. If the +override is unknown or not supported, the provider will return +-FI_ENOSYS. Overrides should be set prior to allocating resources on +the domain. + +The following fi_set_ops operations and corresponding callback +structures are defined. + +**FI_SET_OPS_HMEM_OVERRIDE -- Heterogeneous Memory Overrides** + +HMEM override allows users to override HMEM related operations a +provider may perform. Currently, the scope of the HMEM override +is to allow a user to define the memory movement functions a provider +should use when accessing a user buffer. The user-defined memory +movement functions need to account for all the different HMEM iface +types a provider may encounter. + +All objects allocated against a domain will inherit this override. + +The following is the HMEM override operation name and structure. + +```c +#define FI_SET_OPS_HMEM_OVERRIDE "hmem_override_ops" + +struct fi_hmem_override_ops { + size_t size; + + ssize_t (*copy_from_hmem_iov)(void *dest, size_t size, + enum fi_hmem_iface iface, uint64_t device, const struct iovec *hmem_iov, + size_t hmem_iov_count, uint64_t hmem_iov_offset); + + ssize_t (*copy_to_hmem_iov)(enum fi_hmem_iface iface, uint64_t device, + const struct iovec *hmem_iov, size_t hmem_iov_count, + uint64_t hmem_iov_offset, const void *src, size_t size); +}; +``` + +All fields in struct fi_hmem_override_ops must be set (non-null) to a +valid value. + +*size* +: This should be set to the sizeof(struct fi_hmem_override_ops). 
The +size field is used for forward and backward compatibility purposes. + +*copy_from_hmem_iov* +: Copy data from the device/hmem to host memory. This function should +return a negative fi_errno on error, or the number of bytes copied on +success. + +*copy_to_hmem_iov* +: Copy data from host memory to the device/hmem. This function should +return a negative fi_errno on error, or the number of bytes copied on +success. + ## fi_domain_bind Associates an event queue with the domain. An event queue bound to a @@ -134,6 +205,7 @@ struct fi_domain_attr { size_t auth_key_size; size_t max_err_data; size_t mr_cnt; + uint32_t tclass; }; ``` @@ -145,6 +217,10 @@ fi_getinfo, if no domain was specified, but the user has an opened instance of the named domain, this will reference the first opened instance. If no instance has been opened, this field will be NULL. +The domain instance returned by fi_getinfo should only be considered +valid if the application does not close any domain instances from +another thread while fi_getinfo is being processed. + ## Name The name of the access domain. @@ -158,17 +234,31 @@ accessed by multiple threads. Applications which can guarantee serialization in their access of provider allocated resources and interfaces enables a provider to eliminate lower-level locks. -*FI_THREAD_UNSPEC* -: This value indicates that no threading model has been defined. It - may be used on input hints to the fi_getinfo call. When specified, - providers will return a threading model that allows for the greatest - level of parallelism. +*FI_THREAD_COMPLETION* +: The completion threading model is intended for providers that make use + of manual progress. Applications must serialize access to all objects + that are associated through the use of having a shared completion + structure. This includes endpoint, transmit context, receive context, + completion queue, counter, wait set, and poll set objects. -*FI_THREAD_SAFE* -: A thread safe serialization model allows a multi-threaded - application to access any allocated resources through any interface - without restriction. All providers are required to support - FI_THREAD_SAFE. + For example, threads must serialize access to an endpoint and its + bound completion queue(s) and/or counters. Access to endpoints that + share the same completion queue must also be serialized. + + The use of FI_THREAD_COMPLETION can increase parallelism over + FI_THREAD_SAFE, but requires the use of isolated resources. + +*FI_THREAD_DOMAIN* +: A domain serialization model requires applications to serialize + access to all objects belonging to a domain. + +*FI_THREAD_ENDPOINT* +: The endpoint threading model is similar to FI_THREAD_FID, but with + the added restriction that serialization is required when accessing + the same endpoint, even if multiple transmit and receive contexts are + used. Conceptually, FI_THREAD_ENDPOINT maps well to providers that + implement fabric services in hardware but use a single command + queue to access different data flows. *FI_THREAD_FID* : A fabric descriptor (FID) serialization model requires applications @@ -194,31 +284,17 @@ interfaces enables a provider to eliminate lower-level locks. fabric services in hardware and provide separate command queues to different data flows. -*FI_THREAD_ENDPOINT* -: The endpoint threading model is similar to FI_THREAD_FID, but with - the added restriction that serialization is required when accessing - the same endpoint, even if multiple transmit and receive contexts are - used. 
Conceptually, FI_THREAD_ENDPOINT maps well to providers that - implement fabric services in hardware but use a single command - queue to access different data flows. - -*FI_THREAD_COMPLETION* -: The completion threading model is intended for providers that make use - of manual progress. Applications must serialize access to all objects - that are associated through the use of having a shared completion - structure. This includes endpoint, transmit context, receive context, - completion queue, counter, wait set, and poll set objects. - - For example, threads must serialize access to an endpoint and its - bound completion queue(s) and/or counters. Access to endpoints that - share the same completion queue must also be serialized. - - The use of FI_THREAD_COMPLETION can increase parallelism over - FI_THREAD_SAFE, but requires the use of isolated resources. +*FI_THREAD_SAFE* +: A thread safe serialization model allows a multi-threaded + application to access any allocated resources through any interface + without restriction. All providers are required to support + FI_THREAD_SAFE. -*FI_THREAD_DOMAIN* -: A domain serialization model requires applications to serialize - access to all objects belonging to a domain. +*FI_THREAD_UNSPEC* +: This value indicates that no threading model has been defined. It + may be used on input hints to the fi_getinfo call. When specified, + providers will return a threading model that allows for the greatest + level of parallelism. ## Progress Models (control_progress / data_progress) @@ -249,10 +325,6 @@ reliable transfers, as a result of retry and acknowledgement processing. To balance between performance and ease of use, two progress models are defined. -*FI_PROGRESS_UNSPEC* -: This value indicates that no progress model has been defined. It - may be used on input hints to the fi_getinfo call. - *FI_PROGRESS_AUTO* : This progress model indicates that the provider will make forward progress on an asynchronous operation without further intervention @@ -288,6 +360,10 @@ are defined. manual progress may still need application assistance to process received operations. +*FI_PROGRESS_UNSPEC* +: This value indicates that no progress model has been defined. It + may be used on input hints to the fi_getinfo call. + ## Resource Management (resource_mgmt) Resource management (RM) is provider and protocol support to protect @@ -309,10 +385,6 @@ provider implementation and protocol may still provide some level of protection against overruns. However, such protection is not guaranteed. The following values for resource management are defined. -*FI_RM_UNSPEC* -: This value indicates that no resource management model has been defined. - It may be used on input hints to the fi_getinfo call. - *FI_RM_DISABLED* : The provider is free to select an implementation and protocol that does not protect against resource overruns. The application is responsible @@ -321,6 +393,10 @@ The following values for resource management are defined. *FI_RM_ENABLED* : Resource management is enabled for this provider domain. +*FI_RM_UNSPEC* +: This value indicates that no resource management model has been defined. + It may be used on input hints to the fi_getinfo call. + The behavior of the various resource management options depends on whether the endpoint is reliable or unreliable, as well as provider and protocol specific implementation details, as shown in the following table. The @@ -395,7 +471,7 @@ transfer operation. 
When a resource management error occurs on an endpoint, the endpoint is transitioned into a disabled state. Any operations which have not -already completed will fail and be discarded. For unconnected endpoints, +already completed will fail and be discarded. For connectionless endpoints, the endpoint must be re-enabled before it will accept new data transfer operations. For connected endpoints, the connection is torn down and must be re-established. @@ -422,15 +498,15 @@ Specifies the type of address vectors that are usable with this domain. For additional details on AV type, see [`fi_av`(3)](fi_av.3.html). The following values may be specified. -*FI_AV_UNSPEC* -: Any address vector format is requested and supported. - *FI_AV_MAP* : Only address vectors of type AV map are requested or supported. *FI_AV_TABLE* : Only address vectors of type AV index are requested or supported. +*FI_AV_UNSPEC* +: Any address vector format is requested and supported. + Address vectors are only used by connectionless endpoints. Applications that require the use of a specific type of address vector should set the domain attribute av_type to the necessary value when calling fi_getinfo. @@ -445,6 +521,13 @@ Defines memory registration specific mode bits used with this domain. Full details on MR mode options are available in [`fi_mr`(3)](fi_mr.3.html). The following values may be specified. +*FI_MR_ALLOCATED* +: Indicates that memory registration occurs on allocated data buffers, and + physical pages must back all virtual addresses being registered. + +*FI_MR_ENDPOINT* +: Memory registration occurs at the endpoint level, rather than domain. + *FI_MR_LOCAL* : The provider is optimized around having applications register memory for locally accessed data buffers. Data buffers used in send and @@ -452,39 +535,32 @@ The following values may be specified. operations must be registered by the application for access domains opened with this capability. -*FI_MR_RAW* -: The provider requires additional setup as part of their memory registration - process. This mode is required by providers that use a memory key - that is larger than 64-bits. - -*FI_MR_VIRT_ADDR* -: Registered memory regions are referenced by peers using the virtual address - of the registered memory region, rather than a 0-based offset. - -*FI_MR_ALLOCATED* -: Indicates that memory registration occurs on allocated data buffers, and - physical pages must back all virtual addresses being registered. - -*FI_MR_PROV_KEY* -: Memory registration keys are selected and returned by the provider. - *FI_MR_MMU_NOTIFY* : Indicates that the application is responsible for notifying the provider when the page tables referencing a registered memory region may have been updated. +*FI_MR_PROV_KEY* +: Memory registration keys are selected and returned by the provider. + +*FI_MR_RAW* +: The provider requires additional setup as part of their memory registration + process. This mode is required by providers that use a memory key + that is larger than 64-bits. + *FI_MR_RMA_EVENT* : Indicates that the memory regions associated with completion counters must be explicitly enabled after being bound to any counter. -*FI_MR_ENDPOINT* -: Memory registration occurs at the endpoint level, rather than domain. - *FI_MR_UNSPEC* : Defined for compatibility -- library versions 1.4 and earlier. Setting mr_mode to 0 indicates that FI_MR_BASIC or FI_MR_SCALABLE are requested and supported. 
+*FI_MR_VIRT_ADDR* +: Registered memory regions are referenced by peers using the virtual address + of the registered memory region, rather than a 0-based offset. + *FI_MR_BASIC* : Defined for compatibility -- library versions 1.4 and earlier. Only basic memory registration operations are requested or supported. @@ -541,7 +617,11 @@ fixed value of the maximum number of endpoints supported by the underlying hardware, or may be a dynamic value, based on the default attributes of an allocated endpoint, such as the endpoint capabilities and size. The endpoint count is the number of addressable endpoints -supported by the provider. +supported by the provider. Providers return capability limits based on +configured hardware maximum capabilities. Providers cannot predict all +possible system limitations that may further reduce these hardware +maximums (e.g. application memory consumption, FD usage, etc.), since +such limits are only known at runtime. ## Transmit Context Count (tx_ctx_cnt) @@ -666,6 +746,12 @@ Applications can set the mr_cnt on input to fi_getinfo, in order to indicate their memory registration requirements. Doing so may allow the provider to optimize any memory registration cache or lookup tables. +## Traffic Class (tclass) + +This specifies the default traffic class that will be associated with any endpoints +created within the domain. See [`fi_endpoint`(3)](fi_endpoint.3.html) +for additional information. + # RETURN VALUE Returns 0 on success. On error, a negative value corresponding to fabric diff --git a/man/fi_efa.7.md index c71eebf6266..89a3dfc6e23 100644 --- a/man/fi_efa.7.md +++ b/man/fi_efa.7.md @@ -53,8 +53,10 @@ The following features are supported: registrations on the DGRAM endpoint. *Memory registration modes* -: The RDM endpoint does not require memory registration and the - *FI_EP_DGRAM* endpoint only supports *FI_MR_LOCAL*. +: The RDM endpoint does not require memory registration for send and receive + operations, i.e. it does not require *FI_MR_LOCAL*. Applications may specify + *FI_MR_LOCAL* in the MR mode flags in order to use descriptors provided by the + application. The *FI_EP_DGRAM* endpoint only supports *FI_MR_LOCAL*. *Progress* : The RDM endpoint supports both *FI_PROGRESS_AUTO* and *FI_PROGRESS_MANUAL*, @@ -69,14 +71,14 @@ The following features are supported: # LIMITATIONS -The provider does not support *FI_ATOMIC* interfaces. For RMA operations, +The DGRAM endpoint does not support *FI_ATOMIC* interfaces. For RMA operations, completion events for RMA targets (*FI_RMA_EVENT*) is not supported. The DGRAM endpoint does not fully protect against resource overruns, so resource management is disabled for this endpoint (*FI_RM_DISABLED*). No support for selective completions. -No support for counters. +No support for counters for the DGRAM endpoint. No support for inject. @@ -121,10 +123,6 @@ These OFI runtime parameters apply only to the RDM endpoint. buffer for iov's larger than max_memcpy_size. Defaults to true. When disabled, only uses a bounce buffer -*FI_EFA_MR_CACHE_MERGE_REGIONS* -: Enables merging overlapping and adjacent memory registration regions. - Defaults to true. - *FI_EFA_MR_MAX_CACHED_COUNT* : Sets the maximum number of memory registrations that can be cached at any time. @@ -155,6 +153,22 @@ These OFI runtime parameters apply only to the RDM endpoint. : Time interval (us) for the base timeout to use for exponential backoff to a peer after a receiver not ready error.
+*FI_EFA_ENABLE_SHM_TRANSFER* +: Enable SHM provider to provide the communication across all intra-node processes. + SHM transfer will be disabled in the case where + [`ptrace protection`](https://wiki.ubuntu.com/SecurityTeam/Roadmap/KernelHardening#ptrace_Protection) + is turned on. You can turn it off to enable shm transfer. + +*FI_EFA_SHM_AV_SIZE* +: Defines the maximum number of entries in SHM provider's address vector. + +*FI_EFA_SHM_MAX_MEDIUM_SIZE* +: Defines the switch point between small/medium message and large message. The message + larger than this switch point will be transferred with large message protocol. + +*FI_EFA_INTER_MAX_MEDIUM_MESSAGE_SIZE* +: The maximum size for inter EFA messages to be sent by using medium message protocol. Messages which can fit in one packet will be sent as eager message. Messages whose sizes are smaller than this value will be sent using medium message protocol. Other messages will be sent using CTS based long message protocol. + # SEE ALSO [`fabric`(7)](fabric.7.html), diff --git a/man/fi_endpoint.3.md b/man/fi_endpoint.3.md index 433762c48f1..7797e303487 100644 --- a/man/fi_endpoint.3.md +++ b/man/fi_endpoint.3.md @@ -41,6 +41,9 @@ fi_getopt / fi_setopt fi_rx_context / fi_tx_context / fi_srx_context / fi_stx_context : Open a transmit or receive context. +fi_tc_dscp_set / fi_tc_dscp_get +: Convert between a DSCP value and a network traffic class + fi_rx_size_left / fi_tx_size_left (DEPRECATED) : Query the lower bound on how many RX/TX operations may be posted without an operation returning -FI_EAGAIN. This functions have been deprecated @@ -100,6 +103,10 @@ int fi_getopt(struct fid *ep, int level, int optname, int fi_setopt(struct fid *ep, int level, int optname, const void *optval, size_t optlen); +uint32_t fi_tc_dscp_set(uint8_t dscp); + +uint8_t fi_tc_dscp_get(uint32_t tclass); + DEPRECATED ssize_t fi_rx_size_left(struct fid_ep *ep); DEPRECATED ssize_t fi_tx_size_left(struct fid_ep *ep); @@ -191,7 +198,7 @@ Additionally, endpoints that use manual progress must be associated with relevant completion queues or event queues in order to drive progress. For endpoints that are only used as the target of RMA or atomic operations, this means binding the endpoint to a completion -queue associated with receive processing. Unconnected endpoints must +queue associated with receive processing. Connectionless endpoints must be bound to an address vector. Once an endpoint has been activated, it may be associated with an address @@ -279,11 +286,6 @@ CQs, based on the type of operation. This is specified using fi_ep_bind flags. The following flags may be OR'ed together when binding an endpoint to a completion domain CQ. -*FI_TRANSMIT* -: Directs the completion of outbound data transfer requests to the - specified completion queue. This includes send message, RMA, and - atomic operations. - *FI_RECV* : Directs the notification of inbound data transfers to the specified completion queue. This includes received messages. This binding @@ -310,28 +312,24 @@ binding an endpoint to a completion domain CQ. See Notes section below for additional information on how this flag interacts with the FI_CONTEXT and FI_CONTEXT2 mode bits. +*FI_TRANSMIT* +: Directs the completion of outbound data transfer requests to the + specified completion queue. This includes send message, RMA, and + atomic operations. + An endpoint may optionally be bound to a completion counter. Associating an endpoint with a counter is in addition to binding the EP with a CQ. 
When binding an endpoint to a counter, the following flags may be specified. -*FI_SEND* -: Increments the specified counter whenever a message transfer initiated - over the endpoint has completed successfully or in error. Sent messages - include both tagged and normal message operations. - -*FI_RECV* -: Increments the specified counter whenever a message is - received over the endpoint. Received messages include both tagged - and normal message operations. - *FI_READ* : Increments the specified counter whenever an RMA read, atomic fetch, or atomic compare operation initiated from the endpoint has completed successfully or in error. -*FI_WRITE* -: Increments the specified counter whenever an RMA write or base atomic - operation initiated from the endpoint has completed successfully or in error. +*FI_RECV* +: Increments the specified counter whenever a message is + received over the endpoint. Received messages include both tagged + and normal message operations. *FI_REMOTE_READ* : Increments the specified counter whenever an RMA read, atomic fetch, or @@ -345,6 +343,15 @@ binding an endpoint to a counter, the following flags may be specified. the given endpoint. Use of this flag requires that the endpoint be created using FI_RMA_EVENT. +*FI_SEND* +: Increments the specified counter whenever a message transfer initiated + over the endpoint has completed successfully or in error. Sent messages + include both tagged and normal message operations. + +*FI_WRITE* +: Increments the specified counter whenever an RMA write or base atomic + operation initiated from the endpoint has completed successfully or in error. + An endpoint may only be bound to a single CQ or counter for a given type of operation. For example, a EP may not bind to two counters both using FI_WRITE. Furthermore, providers may limit CQ and counter @@ -429,6 +436,10 @@ The base operation of an endpoint is selected during creation using struct fi_info. The following control commands and arguments may be assigned to an endpoint. +**FI_BACKLOG - int *value** +: This option only applies to passive endpoints. It is used to set the + connection request backlog for listening endpoints. + **FI_GETOPSFLAG -- uint64_t *flags** : Used to retrieve the current value of flags associated with the data transfer operations initiated on the endpoint. The control argument must @@ -436,6 +447,14 @@ assigned to an endpoint. data transfer flags to be returned. See below for a list of control flags. +**FI_GETWAIT -- void \*\*** +: This command allows the user to retrieve the file descriptor associated + with a socket endpoint. The fi_control arg parameter should be an address + where a pointer to the returned file descriptor will be written. See fi_eq.3 + for addition details using fi_control with FI_GETWAIT. The file descriptor + may be used for notification that the endpoint is ready to send or receive + data. + **FI_SETOPSFLAG -- uint64_t *flags** : Used to change the data transfer operation flags associated with an endpoint. The control argument must include FI_TRANSMIT or FI_RECV (not both) @@ -444,18 +463,6 @@ assigned to an endpoint. attributes that were set when the endpoint was created. Valid control flags are defined below. -**FI_BACKLOG - int *value** -: This option only applies to passive endpoints. It is used to set the - connection request backlog for listening endpoints. - -*FI_GETWAIT (void \*\*)* -: This command allows the user to retrieve the file descriptor associated - with a socket endpoint. 
The fi_control arg parameter should be an address - where a pointer to the returned file descriptor will be written. See fi_eq.3 - for addition details using fi_control with FI_GETWAIT. The file descriptor - may be used for notification that the endpoint is ready to send or receive - data. - ## fi_getopt / fi_setopt Endpoint protocol operations may be retrieved using fi_getopt or set @@ -468,28 +475,11 @@ The following option levels and option names and parameters are defined. *FI_OPT_ENDPOINT* -- *FI_OPT_MIN_MULTI_RECV - size_t* -: Defines the minimum receive buffer space available when the receive - buffer is released by the provider (see FI_MULTI_RECV). Modifying this - value is only guaranteed to set the minimum buffer space needed on - receives posted after the value has been changed. It is recommended - that applications that want to override the default MIN_MULTI_RECV - value set this option before enabling the corresponding endpoint. - -- *FI_OPT_CM_DATA_SIZE - size_t* -: Defines the size of available space in CM messages for user-defined - data. This value limits the amount of data that applications can exchange - between peer endpoints using the fi_connect, fi_accept, and fi_reject - operations. The size returned is dependent upon the properties of the - endpoint, except in the case of passive endpoints, in which the size reflects - the maximum size of the data that may be present as part of a connection - request event. This option is read only. - - *FI_OPT_BUFFERED_LIMIT - size_t* : Defines the maximum size of a buffered message that will be reported to users as part of a receive completion when the FI_BUFFERED_RECV mode is enabled on an endpoint. - + fi_getopt() will return the currently configured threshold, or the provider's default threshold if one has not be set by the application. fi_setopt() allows an application to configure the threshold. If the @@ -509,10 +499,37 @@ The following option levels and option names and parameters are defined. to discard or claim a buffered receive or when to claim a buffered receive on getting a buffered receive completion. The value is typically used by a provider when sending a rendezvous protocol request where it would send - atleast FI_OPT_BUFFERED_MIN bytes of application data along with it. A smaller - sized renedezvous protocol message usually results in better latency for the + at least FI_OPT_BUFFERED_MIN bytes of application data along with it. A smaller + sized rendezvous protocol message usually results in better latency for the overall transfer of a large message. +- *FI_OPT_CM_DATA_SIZE - size_t* +: Defines the size of available space in CM messages for user-defined + data. This value limits the amount of data that applications can exchange + between peer endpoints using the fi_connect, fi_accept, and fi_reject + operations. The size returned is dependent upon the properties of the + endpoint, except in the case of passive endpoints, in which the size reflects + the maximum size of the data that may be present as part of a connection + request event. This option is read only. + +- *FI_OPT_MIN_MULTI_RECV - size_t* +: Defines the minimum receive buffer space available when the receive + buffer is released by the provider (see FI_MULTI_RECV). Modifying this + value is only guaranteed to set the minimum buffer space needed on + receives posted after the value has been changed. It is recommended + that applications that want to override the default MIN_MULTI_RECV + value set this option before enabling the corresponding endpoint. 
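As a hedged illustration of the fi_setopt / fi_getopt usage described above (an editorial sketch, not part of the patch), the snippet below sets the multi-recv threshold before the endpoint is enabled and reads back the CM data size limit. The helper name tune_endpoint and the 4 KiB threshold are arbitrary, and error handling is abbreviated.

```c
#include <rdma/fi_endpoint.h>

/* Hypothetical helper: ep was created with fi_endpoint() but not yet
 * enabled.  Names and values here are illustrative only. */
static int tune_endpoint(struct fid_ep *ep)
{
	size_t min_multi_recv = 4096;	/* release buffer when < 4 KiB remain */
	size_t cm_size, len = sizeof(cm_size);
	int ret;

	ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_MIN_MULTI_RECV,
			&min_multi_recv, sizeof(min_multi_recv));
	if (ret)
		return ret;

	/* FI_OPT_CM_DATA_SIZE is read only; it bounds the user data that
	 * may accompany fi_connect, fi_accept, and fi_reject. */
	return fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_CM_DATA_SIZE,
			 &cm_size, &len);
}
```

Setting FI_OPT_MIN_MULTI_RECV before enabling the endpoint follows the recommendation above that the default value be overridden before the corresponding endpoint is enabled.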
+ +## fi_tc_dscp_set + +This call converts a DSCP defined value into a libfabric traffic class value. +It should be used when assigning a DSCP value when setting the tclass field +in either domain or endpoint attributes + +## fi_tc_dscp_get + +This call returns the DSCP value associated with the tclass field for the +domain or endpoint attributes. ## fi_rx_size_left (DEPRECATED) @@ -567,25 +584,26 @@ struct fi_ep_attr { If specified, indicates the type of fabric interface communication desired. Supported types are: -*FI_EP_UNSPEC* -: The type of endpoint is not specified. This is usually provided as - input, with other attributes of the endpoint or the provider - selecting the type. - -*FI_EP_MSG* -: Provides a reliable, connection-oriented data transfer service with - flow control that maintains message boundaries. - *FI_EP_DGRAM* : Supports a connectionless, unreliable datagram communication. Message boundaries are maintained, but the maximum message size may be limited to the fabric MTU. Flow control is not guaranteed. +*FI_EP_MSG* +: Provides a reliable, connection-oriented data transfer service with + flow control that maintains message boundaries. + *FI_EP_RDM* -: Reliable datagram message. Provides a reliable, unconnected data +: Reliable datagram message. Provides a reliable, connectionless data transfer service with flow control that maintains message boundaries. +*FI_EP_SOCK_DGRAM* +: A connectionless, unreliable datagram endpoint with UDP socket-like + semantics. FI_EP_SOCK_DGRAM is most useful for applications designed + around using UDP sockets. See the SOCKET ENDPOINT section for additional + details and restrictions that apply to datagram socket endpoints. + *FI_EP_SOCK_STREAM* : Data streaming endpoint with TCP socket-like semantics. Provides a reliable, connection-oriented data transfer service that does @@ -594,11 +612,10 @@ desired. Supported types are: ENDPOINT section for additional details and restrictions that apply to stream endpoints. -*FI_EP_SOCK_DGRAM* -: A connectionless, unreliable datagram endpoint with UDP socket-like - semantics. FI_EP_SOCK_DGRAM is most useful for applications designed - around using UDP sockets. See the SOCKET ENDPOINT section for additional - details and restrictions that apply to datagram socket endpoints. +*FI_EP_UNSPEC* +: The type of endpoint is not specified. This is usually provided as + input, with other attributes of the endpoint or the provider + selecting the type. ## Protocol @@ -609,65 +626,69 @@ Provider specific protocols are also allowed. Provider specific protocols will be indicated by having the upper bit of the protocol value set to one. -*FI_PROTO_UNSPEC* -: The protocol is not specified. This is usually provided as input, - with other attributes of the socket or the provider selecting the - actual protocol. +*FI_PROTO_GNI* +: Protocol runs over Cray GNI low-level interface. -*FI_PROTO_RDMA_CM_IB_RC* -: The protocol runs over Infiniband reliable-connected queue pairs, - using the RDMA CM protocol for connection establishment. +*FI_PROTO_IB_RDM* +: Reliable-datagram protocol implemented over InfiniBand reliable-connected + queue pairs. + +*FI_PROTO_IB_UD* +: The protocol runs over Infiniband unreliable datagram queue pairs. *FI_PROTO_IWARP* : The protocol runs over the Internet wide area RDMA protocol transport. -*FI_PROTO_IB_UD* -: The protocol runs over Infiniband unreliable datagram queue pairs. +*FI_PROTO_IWARP_RDM* +: Reliable-datagram protocol implemented over iWarp reliable-connected + queue pairs. 
+ +*FI_PROTO_NETWORKDIRECT* +: Protocol runs over Microsoft NetworkDirect service provider interface. + This adds reliable-datagram semantics over the NetworkDirect connection- + oriented endpoint semantics. *FI_PROTO_PSMX* : The protocol is based on an Intel proprietary protocol known as PSM, performance scaled messaging. PSMX is an extended version of the PSM protocol to support the libfabric interfaces. -*FI_PROTO_UDP* -: The protocol sends and receives UDP datagrams. For example, an - endpoint using *FI_PROTO_UDP* will be able to communicate with a - remote peer that is using Berkeley *SOCK_DGRAM* sockets using - *IPPROTO_UDP*. - -*FI_PROTO_SOCK_TCP* -: The protocol is layered over TCP packets. +*FI_PROTO_PSMX2* +: The protocol is based on an Intel proprietary protocol known as PSM2, + performance scaled messaging version 2. PSMX2 is an extended version of the + PSM2 protocol to support the libfabric interfaces. -*FI_PROTO_IWARP_RDM* -: Reliable-datagram protocol implemented over iWarp reliable-connected - queue pairs. +*FI_PROTO_PSMX3* +: The protocol is Intel's protocol known as PSM3, performance scaled + messaging version 3. PSMX3 is implemented over RoCEv2 and verbs. -*FI_PROTO_IB_RDM* -: Reliable-datagram protocol implemented over InfiniBand reliable-connected - queue pairs. +*FI_PROTO_RDMA_CM_IB_RC* +: The protocol runs over Infiniband reliable-connected queue pairs, + using the RDMA CM protocol for connection establishment. -*FI_PROTO_GNI* -: Protocol runs over Cray GNI low-level interface. +*FI_PROTO_RXD* +: Reliable-datagram protocol implemented over datagram endpoints. RXD is + a libfabric utility component that adds RDM endpoint semantics over + DGRAM endpoint semantics. *FI_PROTO_RXM* : Reliable-datagram protocol implemented over message endpoints. RXM is a libfabric utility component that adds RDM endpoint semantics over MSG endpoint semantics. -*FI_PROTO_RXD* -: Reliable-datagram protocol implemented over datagram endpoints. RXD is - a libfabric utility component that adds RDM endpoint semantics over - DGRAM endpoint semantics. +*FI_PROTO_SOCK_TCP* +: The protocol is layered over TCP packets. -*FI_PROTO_NETWORKDIRECT* -: Protocol runs over Microsoft NetworkDirect service provider interface. - This adds reliable-datagram semantics over the NetworkDirect connection- - oriented endpoint semantics. +*FI_PROTO_UDP* +: The protocol sends and receives UDP datagrams. For example, an + endpoint using *FI_PROTO_UDP* will be able to communicate with a + remote peer that is using Berkeley *SOCK_DGRAM* sockets using + *IPPROTO_UDP*. -*FI_PROTO_PSMX2* -: The protocol is based on an Intel proprietary protocol known as PSM2, - performance scaled messaging version 2. PSMX2 is an extended version of the - PSM2 protocol to support the libfabric interfaces. +*FI_PROTO_UNSPEC* +: The protocol is not specified. This is usually provided as input, + with other attributes of the socket or the provider selecting the + actual protocol. ## protocol_version - Protocol Version @@ -676,7 +697,7 @@ The protocol version allows providers to extend an existing protocol, by adding support for additional features or functionality for example, in a backward compatible manner. Providers that support different versions of the same protocol should inter-operate, but only when using the -capabilities defined for the lesser version. +capabilities defined for the lesser version. ## max_msg_size - Max Message Size @@ -813,7 +834,7 @@ details. 
## auth_key_size - Authorization Key Length The length of the authorization key in bytes. This field will be 0 if -authorization keys are not available or used. This field is ignored +authorization keys are not available or used. This field is ignored unless the fabric is opened with API version 1.5 or greater. ## auth_key - Authorization Key @@ -824,9 +845,9 @@ to limit communication between endpoints. Only peer endpoints that are programmed to use the same authorization key may communicate. Authorization keys are often used to implement job keys, to ensure that processes running in different jobs do not accidentally -cross traffic. The domain authorization key will be used if auth_key_size +cross traffic. The domain authorization key will be used if auth_key_size is set to 0. This field is ignored unless the fabric is opened with API -version 1.5 or greater. +version 1.5 or greater. # TRANSMIT CONTEXT ATTRIBUTES @@ -844,6 +865,7 @@ struct fi_tx_attr { size_t size; size_t iov_limit; size_t rma_iov_limit; + uint32_t tclass; }; {% endhighlight %} @@ -852,8 +874,21 @@ struct fi_tx_attr { The requested capabilities of the context. The capabilities must be a subset of those requested of the associated endpoint. See the CAPABILITIES section of fi_getinfo(3) for capability details. If -the caps field is 0 on input to fi_getinfo(3), the caps value from the -fi_info structure will be used. +the caps field is 0 on input to fi_getinfo(3), the applicable +capability bits from the fi_info structure will be used. + +The following capabilities apply to the transmit attributes: FI_MSG, +FI_RMA, FI_TAGGED, FI_ATOMIC, FI_READ, FI_WRITE, FI_SEND, FI_HMEM, +FI_TRIGGER, FI_FENCE, FI_MULTICAST, FI_RMA_PMEM, FI_NAMED_RX_CTX, +and FI_COLLECTIVE. + +Many applications will be able to ignore this field and rely solely +on the fi_info::caps field. Use of this field provides fine grained +control over the transmit capabilities associated with an endpoint. +It is useful when handling scalable endpoints, with multiple transmit +contexts, for example, and allows configuring a specific transmit +context with fewer capabilities than that supported by the endpoint +or other transmit contexts. ## mode @@ -892,6 +927,30 @@ which message data is sent or received by the transport layer. Message ordering requires matching ordering semantics on the receiving side of a data transfer operation in order to guarantee that ordering is met. +*FI_ORDER_ATOMIC_RAR* +: Atomic read after read. If set, atomic fetch operations are + transmitted in the order submitted relative to other + atomic fetch operations. If not set, atomic fetches + may be transmitted out of order from their submission. + +*FI_ORDER_ATOMIC_RAW* +: Atomic read after write. If set, atomic fetch operations are + transmitted in the order submitted relative to atomic update + operations. If not set, atomic fetches may be transmitted ahead + of atomic updates. + +*FI_ORDER_ATOMIC_WAR* +: RMA write after read. If set, atomic update operations are + transmitted in the order submitted relative to atomic fetch + operations. If not set, atomic updates may be transmitted + ahead of atomic fetches. + +*FI_ORDER_ATOMIC_WAW* +: RMA write after write. If set, atomic update operations are + transmitted in the order submitted relative to other atomic + update operations. If not atomic updates may be + transmitted out of order from their submission. + *FI_ORDER_NONE* : No ordering is specified. 
This value may be used as input in order to obtain the default message order supported by the provider. FI_ORDER_NONE @@ -903,54 +962,18 @@ transfer operation in order to guarantee that ordering is met. RMA and atomic read operations. If not set, RMA and atomic reads may be transmitted out of order from their submission. -*FI_ORDER_RAW* -: Read after write. If set, RMA and atomic read operations are - transmitted in the order submitted relative to RMA and atomic write - operations. If not set, RMA and atomic reads may be transmitted ahead - of RMA and atomic writes. - *FI_ORDER_RAS* : Read after send. If set, RMA and atomic read operations are transmitted in the order submitted relative to message send operations, including tagged sends. If not set, RMA and atomic reads may be transmitted ahead of sends. -*FI_ORDER_WAR* -: Write after read. If set, RMA and atomic write operations are - transmitted in the order submitted relative to RMA and atomic read - operations. If not set, RMA and atomic writes may be transmitted - ahead of RMA and atomic reads. - -*FI_ORDER_WAW* -: Write after write. If set, RMA and atomic write operations are - transmitted in the order submitted relative to other RMA and atomic - write operations. If not set, RMA and atomic writes may be - transmitted out of order from their submission. - -*FI_ORDER_WAS* -: Write after send. If set, RMA and atomic write operations are - transmitted in the order submitted relative to message send - operations, including tagged sends. If not set, RMA and atomic - writes may be transmitted ahead of sends. - -*FI_ORDER_SAR* -: Send after read. If set, message send operations, including tagged - sends, are transmitted in order submitted relative to RMA and atomic - read operations. If not set, message sends may be transmitted ahead - of RMA and atomic reads. - -*FI_ORDER_SAW* -: Send after write. If set, message send operations, including tagged - sends, are transmitted in order submitted relative to RMA and atomic - write operations. If not set, message sends may be transmitted ahead +*FI_ORDER_RAW* +: Read after write. If set, RMA and atomic read operations are + transmitted in the order submitted relative to RMA and atomic write + operations. If not set, RMA and atomic reads may be transmitted ahead of RMA and atomic writes. -*FI_ORDER_SAS* -: Send after send. If set, message send operations, including tagged - sends, are transmitted in the order submitted relative to other - message send. If not set, message sends may be transmitted out of - order from their submission. - *FI_ORDER_RMA_RAR* : RMA read after read. If set, RMA read operations are transmitted in the order submitted relative to other @@ -975,28 +998,40 @@ transfer operation in order to guarantee that ordering is met. write operations. If not set, RMA writes may be transmitted out of order from their submission. -*FI_ORDER_ATOMIC_RAR* -: Atomic read after read. If set, atomic fetch operations are - transmitted in the order submitted relative to other - atomic fetch operations. If not set, atomic fetches - may be transmitted out of order from their submission. +*FI_ORDER_SAR* +: Send after read. If set, message send operations, including tagged + sends, are transmitted in order submitted relative to RMA and atomic + read operations. If not set, message sends may be transmitted ahead + of RMA and atomic reads. -*FI_ORDER_ATOMIC_RAW* -: Atomic read after write. If set, atomic fetch operations are - transmitted in the order submitted relative to atomic update - operations. 
If not set, atomic fetches may be transmitted ahead - of atomic updates. +*FI_ORDER_SAS* +: Send after send. If set, message send operations, including tagged + sends, are transmitted in the order submitted relative to other + message send. If not set, message sends may be transmitted out of + order from their submission. -*FI_ORDER_ATOMIC_WAR* -: RMA write after read. If set, atomic update operations are - transmitted in the order submitted relative to atomic fetch - operations. If not set, atomic updates may be transmitted - ahead of atomic fetches. +*FI_ORDER_SAW* +: Send after write. If set, message send operations, including tagged + sends, are transmitted in order submitted relative to RMA and atomic + write operations. If not set, message sends may be transmitted ahead + of RMA and atomic writes. -*FI_ORDER_ATOMIC_WAW* -: RMA write after write. If set, atomic update operations are - transmitted in the order submitted relative to other atomic - update operations. If not atomic updates may be +*FI_ORDER_WAR* +: Write after read. If set, RMA and atomic write operations are + transmitted in the order submitted relative to RMA and atomic read + operations. If not set, RMA and atomic writes may be transmitted + ahead of RMA and atomic reads. + +*FI_ORDER_WAS* +: Write after send. If set, RMA and atomic write operations are + transmitted in the order submitted relative to message send + operations, including tagged sends. If not set, RMA and atomic + writes may be transmitted ahead of sends. + +*FI_ORDER_WAW* +: Write after write. If set, RMA and atomic write operations are + transmitted in the order submitted relative to other RMA and atomic + write operations. If not set, RMA and atomic writes may be transmitted out of order from their submission. ## comp_order - Completion Ordering @@ -1007,7 +1042,7 @@ message order. Relaxed completion order may enable faster reporting of completed transfers, allow acknowledgments to be sent over different fabric paths, and support more sophisticated retry mechanisms. This can result in lower-latency completions, particularly when -using unconnected endpoints. Strict completion ordering may require +using connectionless endpoints. Strict completion ordering may require that providers queue completed operations or limit available optimizations. For transmit requests, completion ordering depends on the endpoint @@ -1041,9 +1076,21 @@ be used with the FI_INJECT data transfer flag. ## size -The size of the context. The size is specified as the minimum number -of transmit operations that may be posted to the endpoint without the -operation returning -FI_EAGAIN. +The size of the transmit context. The mapping of the size value to resources +is provider specific, but it is directly related to the number of command +entries allocated for the endpoint. A smaller size value consumes fewer +hardware and software resources, while a larger size allows queuing more +transmit requests. + +While the size attribute guides the size of underlying endpoint transmit +queue, there is not necessarily a one-to-one mapping between a transmit +operation and a queue entry. A single transmit operation may consume +multiple queue entries; for example, one per scatter-gather entry. +Additionally, the size field is intended to guide the allocation of the +endpoint's transmit context. Specifically, for connectionless endpoints, +there may be lower-level queues use to track communication on a per peer basis. 
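To make the ordering and size guidance concrete, here is a hedged sketch that fills in a few fi_tx_attr fields through hints and then treats -FI_EAGAIN from fi_send as a transient queue-full condition, reading the completion queue before retrying. The depth of 256, the DSCP value of 46, and the helper names are illustrative, and error handling is trimmed.

```c
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>
#include <rdma/fi_cq.h>
#include <rdma/fi_errno.h>

/* Illustrative transmit queue depth; a provider may adjust it. */
#define TX_DEPTH 256

static void request_tx_attrs(struct fi_info *hints)
{
	hints->tx_attr->size = TX_DEPTH;
	hints->tx_attr->msg_order = FI_ORDER_SAS;      /* only what we rely on */
	hints->tx_attr->tclass = fi_tc_dscp_set(46);   /* DSCP 46; see tclass below */
}

/* Post a send, draining completions whenever the transmit queue is full. */
static int send_with_retry(struct fid_ep *ep, struct fid_cq *cq,
			   const void *buf, size_t len, fi_addr_t dest)
{
	struct fi_cq_entry comp;
	ssize_t ret;

	do {
		ret = fi_send(ep, buf, len, NULL, dest, NULL);
		if (ret == -FI_EAGAIN)
			(void) fi_cq_read(cq, &comp, 1);   /* make progress */
	} while (ret == -FI_EAGAIN);

	return (int) ret;
}
```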
+The sizes of any lower-level queues may only be significantly smaller than +the endpoint's transmit size, in order to reduce resource utilization. ## iov_limit @@ -1061,6 +1108,57 @@ number of RMA IO vectors that may be specified when initiating an operation from the local endpoint, as well as the maximum number of IO vectors that may be carried in a single request from a remote endpoint. +## Traffic Class (tclass) + +Traffic classes can be a differentiated services +code point (DSCP) value, one of the following defined labels, or a +provider-specific definition. If tclass is unset or set to FI_TC_UNSPEC, +the endpoint will use the default traffic class associated with the +domain. + +*FI_TC_BEST_EFFORT* +: This is the default in the absence of any other local or fabric configuration. + This class carries the traffic for a number of applications executing + concurrently over the same network infrastructure. Even though it is shared, + network capacity and resource allocation are distributed fairly across the + applications. + +*FI_TC_BULK_DATA* +: This class is intended for large data transfers associated with I/O and + is present to separate sustained I/O transfers from other application + inter-process communications. + +*FI_TC_DEDICATED_ACCESS* +: This class operates at the highest priority, except the management class. + It carries a high bandwidth allocation, minimum latency targets, and the + highest scheduling and arbitration priority. + +*FI_TC_LOW_LATENCY* +: This class supports low latency, low jitter data patterns typically caused by + transactional data exchanges, barrier synchronizations, and collective + operations that are typical of HPC applications. This class often requires + maximum tolerable latencies that data transfers must achieve for correct or + performance operations. Fulfillment of such requests in this class will + typically require accompanying bandwidth and message size limitations so + as not to consume excessive bandwidth at high priority. + +*FI_TC_NETWORK_CTRL* +: This class is intended for traffic directly related to fabric (network) + management, which is critical to the correct operation of the network. + Its use is typically restricted to privileged network management applications. + +*FI_TC_SCAVENGER* +: This class is used for data that is desired but does not have strict delivery + requirements, such as in-band network or application level monitoring data. + Use of this class indicates that the traffic is considered lower priority + and should not interfere with higher priority workflows. + +*fi_tc_dscp_set / fi_tc_dscp_get* +: DSCP values are supported via the DSCP get and set functions. The + definitions for DSCP values are outside the scope of libfabric. See + the fi_tc_dscp_set and fi_tc_dscp_get function definitions for details + on their use. + # RECEIVE CONTEXT ATTRIBUTES Attributes specific to the receive capabilities of an endpoint are @@ -1084,8 +1182,22 @@ struct fi_rx_attr { The requested capabilities of the context. The capabilities must be a subset of those requested of the associated endpoint. See the CAPABILITIES section if fi_getinfo(3) for capability details. If -the caps field is 0 on input to fi_getinfo(3), the caps value from the -fi_info structure will be used. +the caps field is 0 on input to fi_getinfo(3), the applicable +capability bits from the fi_info structure will be used. 
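The fine-grained capability control described for transmit and receive contexts is most relevant to scalable endpoints. A minimal sketch follows, assuming the provider accepts a per-context caps value that is a subset of the endpoint's caps; the context index and the helper name are illustrative.

```c
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

/* Sketch: derive a receive context that only needs tagged receives,
 * even though the scalable endpoint was opened with a wider
 * capability set.  Error handling is omitted for brevity. */
static int open_narrow_rx_ctx(struct fid_ep *sep, struct fi_info *info,
			      struct fid_ep **rx_ctx)
{
	struct fi_rx_attr rx_attr = *info->rx_attr;

	rx_attr.caps = FI_TAGGED | FI_RECV;   /* subset of endpoint caps */
	return fi_rx_context(sep, 0, &rx_attr, rx_ctx, NULL);
}
```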
+ +The following capabilities apply to the receive attributes: FI_MSG, +FI_RMA, FI_TAGGED, FI_ATOMIC, FI_REMOTE_READ, FI_REMOTE_WRITE, FI_RECV, +FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, FI_VARIABLE_MSG, +FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, FI_SOURCE_ERR, and +FI_COLLECTIVE. + +Many applications will be able to ignore this field and rely solely +on the fi_info::caps field. Use of this field provides fine grained +control over the receive capabilities associated with an endpoint. +It is useful when handling scalable endpoints, with multiple receive +contexts, for example, and allows configuring a specific receive +context with fewer capabilities than that supported by the endpoint +or other receive contexts. ## mode @@ -1125,6 +1237,11 @@ FI_ORDER_ATOMIC_RAW, FI_ORDER_ATOMIC_WAR, and FI_ORDER_ATOMIC_WAW. For a description of completion ordering, see the comp_order field in the _Transmit Context Attribute_ section. +*FI_ORDER_DATA* +: When set, this bit indicates that received data is written into memory + in order. Data ordering applies to memory accessed as part of a single + operation and between operations if message ordering is guaranteed. + *FI_ORDER_NONE* : No ordering is defined for completed operations. Receive operations may complete in any order, regardless of their submission order. @@ -1133,11 +1250,6 @@ the _Transmit Context Attribute_ section. : Receive operations complete in the order in which they are processed by the receive context, based on the receive side msg_order attribute. -*FI_ORDER_DATA* -: When set, this bit indicates that received data is written into memory - in order. Data ordering applies to memory accessed as part of a single - operation and between operations if message ordering is guaranteed. - ## total_buffered_recv This field is supported for backwards compatibility purposes. @@ -1158,9 +1270,21 @@ anticipate receiving unexpected messages, rather than modifying this value. ## size -The size of the context. The size is specified as the minimum number -of receive operations that may be posted to the endpoint without the -operation returning -FI_EAGAIN. +The size of the receive context. The mapping of the size value to resources +is provider specific, but it is directly related to the number of command +entries allocated for the endpoint. A smaller size value consumes fewer +hardware and software resources, while a larger size allows queuing more +transmit requests. + +While the size attribute guides the size of underlying endpoint receive +queue, there is not necessarily a one-to-one mapping between a receive +operation and a queue entry. A single receive operation may consume +multiple queue entries; for example, one per scatter-gather entry. +Additionally, the size field is intended to guide the allocation of the +endpoint's receive context. Specifically, for connectionless endpoints, +there may be lower-level queues use to track communication on a per peer basis. +The sizes of any lower-level queues may only be significantly smaller than +the endpoint's receive size, in order to reduce resource utilization. ## iov_limit @@ -1265,7 +1389,7 @@ associated with completion queues or counters. Completed receive operations are posted to the CQs bound to the endpoint. An endpoint may only be associated with a single receive context, and all connectionless endpoints associated with a shared receive context must -also share the same address vector. +also share the same address vector. 
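A hedged sketch of the shared receive context arrangement described above: fi_srx_context and fi_ep_bind are the standard calls, while the helper name and the assumption that both endpoints already share a single address vector are illustrative.

```c
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

/* Sketch: two endpoints sharing one receive context.  Connectionless
 * endpoints bound to srx would also need to share the same address
 * vector, as noted above. */
static int share_rx_ctx(struct fid_domain *domain, struct fi_info *info,
			struct fid_ep *ep1, struct fid_ep *ep2)
{
	struct fid_ep *srx;
	int ret;

	ret = fi_srx_context(domain, info->rx_attr, &srx, NULL);
	if (ret)
		return ret;

	ret = fi_ep_bind(ep1, &srx->fid, 0);
	if (!ret)
		ret = fi_ep_bind(ep2, &srx->fid, 0);
	return ret;
}
```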
Endpoints associated with a shared transmit context may use dedicated receive contexts, and vice-versa. Or an endpoint may use shared @@ -1338,6 +1462,24 @@ transfer operations, where a flags parameter is not available. Data transfer operations that take flags as input override the op_flags value of transmit or receive context attributes of an endpoint. +*FI_COMMIT_COMPLETE* +: Indicates that a completion should not be generated (locally or at the + peer) until the result of an operation have been made persistent. + See [`fi_cq`(3)](fi_cq.3.html) for additional details on completion + semantics. + +*FI_COMPLETION* +: Indicates that a completion queue entry should be written for data + transfer operations. This flag only applies to operations issued on an + endpoint that was bound to a completion queue with the + FI_SELECTIVE_COMPLETION flag set, otherwise, it is ignored. See the + fi_ep_bind section above for more detail. + +*FI_DELIVERY_COMPLETE* +: Indicates that a completion should be generated when the operation has been + processed by the destination endpoint(s). See [`fi_cq`(3)](fi_cq.3.html) + for additional details on completion semantics. + *FI_INJECT* : Indicates that all outbound data buffers should be returned to the user's control immediately after a data transfer call returns, even @@ -1348,6 +1490,16 @@ value of transmit or receive context attributes of an endpoint. this flag. This limit is indicated using inject_size (see inject_size above). +*FI_INJECT_COMPLETE* +: Indicates that a completion should be generated when the + source buffer(s) may be reused. See [`fi_cq`(3)](fi_cq.3.html) for + additional details on completion semantics. + +*FI_MULTICAST* +: Indicates that data transfers will target multicast addresses by default. + Any fi_addr_t passed into a data transfer operation will be treated as a + multicast address. + *FI_MULTI_RECV* : Applies to posted receive operations. This flag allows the user to post a single buffer that will receive multiple incoming messages. @@ -1360,39 +1512,11 @@ value of transmit or receive context attributes of an endpoint. available buffer space falls below the specified minimum (see FI_OPT_MIN_MULTI_RECV). -*FI_COMPLETION* -: Indicates that a completion queue entry should be written for data - transfer operations. This flag only applies to operations issued on an - endpoint that was bound to a completion queue with the - FI_SELECTIVE_COMPLETION flag set, otherwise, it is ignored. See the - fi_ep_bind section above for more detail. - -*FI_INJECT_COMPLETE* -: Indicates that a completion should be generated when the - source buffer(s) may be reused. See [`fi_cq`(3)](fi_cq.3.html) for - additional details on completion semantics. - *FI_TRANSMIT_COMPLETE* : Indicates that a completion should be generated when the transmit operation has completed relative to the local provider. See [`fi_cq`(3)](fi_cq.3.html) for additional details on completion semantics. -*FI_DELIVERY_COMPLETE* -: Indicates that a completion should be generated when the operation has been - processed by the destination endpoint(s). See [`fi_cq`(3)](fi_cq.3.html) - for additional details on completion semantics. - -*FI_COMMIT_COMPLETE* -: Indicates that a completion should not be generated (locally or at the - peer) until the result of an operation have been made persistent. - See [`fi_cq`(3)](fi_cq.3.html) for additional details on completion - semantics. - -*FI_MULTICAST* -: Indicates that data transfers will target multicast addresses by default. 
- Any fi_addr_t passed into a data transfer operation will be treated as a - multicast address. - # NOTES Users should call fi_close to release all resources allocated to the diff --git a/man/fi_eq.3.md b/man/fi_eq.3.md index 59006e0f845..cb39caba4cf 100644 --- a/man/fi_eq.3.md +++ b/man/fi_eq.3.md @@ -176,9 +176,9 @@ struct fi_eq_attr { : Specifies that the EQ should use a pthread mutex and cond variable as a wait object. -- *FI_WAIT_CRITSEC_COND* -: Windows specific. Specifies that the EQ should use a critical - section and condition variable as a wait object. +- *FI_WAIT_YIELD* +: Indicates that the EQ will wait without a wait object but instead + yield on every wait. Allows usage of fi_eq_sread through a spin. *signaling_vector* : If the FI_AFFINITY flag is set, this indicates the logical cpu number diff --git a/man/fi_fabric.3.md b/man/fi_fabric.3.md index bb3aa0af479..4e71633fbd9 100644 --- a/man/fi_fabric.3.md +++ b/man/fi_fabric.3.md @@ -12,7 +12,7 @@ fi_fabric \- Fabric domain operations fi_fabric / fi_close : Open / close a fabric domain -fi_tostr +fi_tostr / fi_tostr_r : Convert fabric attributes, flags, and capabilities to printable string # SYNOPSIS @@ -26,6 +26,9 @@ int fi_fabric(struct fi_fabric_attr *attr, int fi_close(struct fid *fabric); char * fi_tostr(const void *data, enum fi_type datatype); + +char * fi_tostr(char *buf, size_t len, const void *data, + enum fi_type datatype); ``` # ARGUMENTS @@ -40,6 +43,19 @@ char * fi_tostr(const void *data, enum fi_type datatype); : User specified context associated with the opened object. This context is returned as part of any associated asynchronous event. +*buf* +: Output buffer to write string. + +*len* +: Size in bytes of memory referenced by buf. + +*data* +: Input data to convert into a string. The format of data is determined + by the datatype parameter. + +*datatype* +: Indicates the data to convert to a printable string. + # DESCRIPTION A fabric domain represents a collection of hardware and software @@ -60,7 +76,7 @@ The fi_close call is used to release all resources associated with a fabric domain or interface. All items associated with the opened fabric must be released prior to calling fi_close. -## fi_tostr +## fi_tostr / fi_tostr_r Converts fabric interface attributes, capabilities, flags, and enum values into a printable string. The data parameter accepts a pointer @@ -144,10 +160,17 @@ datatype or field value. *FI_TYPE_FID* : struct fid * +*FI_TYPE_HMEM_IFACE* +: enum fi_hmem_iface * + fi_tostr() will return a pointer to an internal libfabric buffer that should not be modified, and will be overwritten the next time fi_tostr() is invoked. fi_tostr() is not thread safe. +The fi_tostr_r() function is a re-entrant and thread safe version of +fi_tostr(). It writes the string into a buffer provided by the caller. +fi_tostr_r() returns the start of the caller's buffer. + # NOTES The following resources are associated with fabric domains: access @@ -176,6 +199,10 @@ fi_getinfo, if no fabric was specified, but the user has an opened instance of the named fabric, this will reference the first opened instance. If no instance has been opened, this field will be NULL. +The fabric instance returned by fi_getinfo should only be considered +valid if the application does not close any fabric instances from +another thread while fi_getinfo is being processed. + ## name A fabric identifier. 
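A minimal sketch of the re-entrant string conversion documented above; the buffer size and the choice of fields to print are arbitrary.

```c
#include <stdio.h>
#include <rdma/fabric.h>

/* Sketch: thread-safe stringification of a few fi_info fields using
 * fi_tostr_r.  Long attribute sets may be truncated to fit buf. */
static void print_info_summary(const struct fi_info *info)
{
	char buf[256];

	printf("caps: %s\n",
	       fi_tostr_r(buf, sizeof(buf), &info->caps, FI_TYPE_CAPS));
	printf("ep type: %s\n",
	       fi_tostr_r(buf, sizeof(buf), &info->ep_attr->type,
			  FI_TYPE_EP_TYPE));
}
```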
diff --git a/man/fi_getinfo.3.md b/man/fi_getinfo.3.md index 8fab0f32394..e78dbd541db 100644 --- a/man/fi_getinfo.3.md +++ b/man/fi_getinfo.3.md @@ -237,8 +237,8 @@ struct fi_info { : Optional attributes related to the hardware NIC associated with the specified fabric, domain, and endpoint data. This field is only valid for providers where the corresponding attributes are - closely associated with a hardware NIC. See [`fi_nic`(3)] - (fi_nic.3.html) for details. + closely associated with a hardware NIC. See + [`fi_nic`(3)](fi_nic.3.html) for details. # CAPABILITIES @@ -254,37 +254,6 @@ Applications may use this feature to request a minimal set of requirements, then check the returned capabilities to enable additional optimizations. -*FI_MSG* -: Specifies that an endpoint should support sending and receiving - messages or datagrams. Message capabilities imply support for send - and/or receive queues. Endpoints supporting this capability support - operations defined by struct fi_ops_msg. - - The caps may be used to specify or restrict the type of messaging - operations that are supported. In the absence of any relevant - flags, FI_MSG implies the ability to send and receive messages. - Applications can use the FI_SEND and FI_RECV flags to optimize an - endpoint as send-only or receive-only. - -*FI_RMA* -: Specifies that the endpoint should support RMA read and write - operations. Endpoints supporting this capability support operations - defined by struct fi_ops_rma. In the absence of any relevant flags, - FI_RMA implies the ability to initiate and be the target of remote - memory reads and writes. Applications can use the FI_READ, - FI_WRITE, FI_REMOTE_READ, and FI_REMOTE_WRITE flags to restrict the - types of RMA operations supported by an endpoint. - -*FI_TAGGED* -: Specifies that the endpoint should handle tagged message transfers. - Tagged message transfers associate a user-specified key or tag with - each message that is used for matching purposes at the remote side. - Endpoints supporting this capability support operations defined by - struct fi_ops_tagged. In the absence of any relevant flags, - FI_TAGGED implies the ability to send and receive tagged messages. - Applications can use the FI_SEND and FI_RECV flags to optimize an - endpoint as send-only or receive-only. - *FI_ATOMIC* : Specifies that the endpoint supports some set of atomic operations. Endpoints supporting this capability support operations defined by @@ -294,15 +263,10 @@ additional optimizations. FI_WRITE, FI_REMOTE_READ, and FI_REMOTE_WRITE flags to restrict the types of atomic operations supported by an endpoint. -*FI_MULTICAST* -: Indicates that the endpoint support multicast data transfers. This - capability must be paired with at least one other data transfer capability, - (e.g. FI_MSG, FI_SEND, FI_RECV, ...). - -*FI_NAMED_RX_CTX* -: Requests that endpoints which support multiple receive contexts - allow an initiator to target (or name) a specific receive context as - part of a data transfer operation. +*FI_COLLECTIVE* +: Requests support for collective operations. Endpoints that support + this capability support the collective operations defined in + [`fi_collective`(3)](fi_collective.3.html). *FI_DIRECTED_RECV* : Requests that the communication endpoint use the source address of @@ -310,40 +274,71 @@ additional optimizations. capability is not set, then the src_addr parameter for msg and tagged receive operations is ignored. 
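As a hedged illustration of FI_DIRECTED_RECV, the sketch below posts a tagged receive whose matching is restricted to a single peer; the helper name is illustrative, and without the capability the src_addr argument would simply be ignored, as noted above.

```c
#include <rdma/fabric.h>
#include <rdma/fi_tagged.h>

/* Sketch: with FI_DIRECTED_RECV enabled, src_addr restricts matching to
 * messages arriving from 'peer'. */
static ssize_t post_directed_recv(struct fid_ep *ep, void *buf, size_t len,
				  fi_addr_t peer, uint64_t tag)
{
	return fi_trecv(ep, buf, len, NULL, peer, tag, 0, NULL);
}
```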
+*FI_FENCE* +: Indicates that the endpoint support the FI_FENCE flag on data + transfer operations. Support requires tracking that all previous + transmit requests to a specified remote endpoint complete prior + to initiating the fenced operation. Fenced operations are often + used to enforce ordering between operations that are not otherwise + guaranteed by the underlying provider or protocol. + +*FI_HMEM* +: Specifies that the endpoint should support transfers to and from + device memory. + +*FI_LOCAL_COMM* +: Indicates that the endpoint support host local communication. This + flag may be used in conjunction with FI_REMOTE_COMM to indicate that + local and remote communication are required. If neither FI_LOCAL_COMM + or FI_REMOTE_COMM are specified, then the provider will indicate + support for the configuration that minimally affects performance. + Providers that set FI_LOCAL_COMM but not FI_REMOTE_COMM, for example + a shared memory provider, may only be used to communication between + processes on the same system. + +*FI_MSG* +: Specifies that an endpoint should support sending and receiving + messages or datagrams. Message capabilities imply support for send + and/or receive queues. Endpoints supporting this capability support + operations defined by struct fi_ops_msg. + + The caps may be used to specify or restrict the type of messaging + operations that are supported. In the absence of any relevant + flags, FI_MSG implies the ability to send and receive messages. + Applications can use the FI_SEND and FI_RECV flags to optimize an + endpoint as send-only or receive-only. + +*FI_MULTICAST* +: Indicates that the endpoint support multicast data transfers. This + capability must be paired with FI_MSG. Applications can use FI_SEND + and FI_RECV to optimize multicast as send-only or receive-only. + *FI_MULTI_RECV* : Specifies that the endpoint must support the FI_MULTI_RECV flag when posting receive buffers. -*FI_SOURCE* -: Requests that the endpoint return source addressing data as part of - its completion data. This capability only applies to connectionless - endpoints. Note that returning source address information may - require that the provider perform address translation and/or look-up - based on data available in the underlying protocol in order to - provide the requested data, which may adversely affect performance. - The performance impact may be greater for address vectors of type - FI_AV_TABLE. +*FI_NAMED_RX_CTX* +: Requests that endpoints which support multiple receive contexts + allow an initiator to target (or name) a specific receive context as + part of a data transfer operation. *FI_READ* : Indicates that the user requires an endpoint capable of initiating reads against remote memory regions. This flag requires that FI_RMA and/or FI_ATOMIC be set. -*FI_WRITE* -: Indicates that the user requires an endpoint capable of initiating - writes against remote memory regions. This flag requires that FI_RMA - and/or FI_ATOMIC be set. - -*FI_SEND* -: Indicates that the user requires an endpoint capable of sending - message data transfers. Message transfers include base message - operations as well as tagged message functionality. - *FI_RECV* : Indicates that the user requires an endpoint capable of receiving message data transfers. Message transfers include base message operations as well as tagged message functionality. +*FI_REMOTE_COMM* +: Indicates that the endpoint support communication with endpoints + located at remote nodes (across the fabric). 
See FI_LOCAL_COMM for + additional details. Providers that set FI_REMOTE_COMM but not + FI_LOCAL_COMM, for example NICs that lack loopback support, cannot + be used to communicate with processes on the same system. + *FI_REMOTE_READ* : Indicates that the user requires an endpoint capable of receiving read memory operations from remote endpoints. This flag requires @@ -354,45 +349,47 @@ additional optimizations. write memory operations from remote endpoints. This flag requires that FI_RMA and/or FI_ATOMIC be set. +*FI_RMA* +: Specifies that the endpoint should support RMA read and write + operations. Endpoints supporting this capability support operations + defined by struct fi_ops_rma. In the absence of any relevant flags, + FI_RMA implies the ability to initiate and be the target of remote + memory reads and writes. Applications can use the FI_READ, + FI_WRITE, FI_REMOTE_READ, and FI_REMOTE_WRITE flags to restrict the + types of RMA operations supported by an endpoint. + *FI_RMA_EVENT* : Requests that an endpoint support the generation of completion events when it is the target of an RMA and/or atomic operation. This flag requires that FI_REMOTE_READ and/or FI_REMOTE_WRITE be enabled on the endpoint. +*FI_RMA_PMEM* +: Indicates that the provider is 'persistent memory aware' and supports + RMA operations to and from persistent memory. Persistent memory aware + providers must support registration of memory that is backed by non- + volatile memory, RMA transfers to/from persistent memory, and enhanced + completion semantics. This flag requires that FI_RMA be set. + This capability is experimental. + +*FI_SEND* +: Indicates that the user requires an endpoint capable of sending + message data transfers. Message transfers include base message + operations as well as tagged message functionality. + *FI_SHARED_AV* : Requests or indicates support for address vectors which may be shared among multiple processes. -*FI_TRIGGER* -: Indicates that the endpoint should support triggered operations. - Endpoints support this capability must meet the usage model as - described by fi_trigger.3. - -*FI_FENCE* -: Indicates that the endpoint support the FI_FENCE flag on data - transfer operations. Support requires tracking that all previous - transmit requests to a specified remote endpoint complete prior - to initiating the fenced operation. Fenced operations are often - used to enforce ordering between operations that are not otherwise - guaranteed by the underlying provider or protocol. - -*FI_LOCAL_COMM* -: Indicates that the endpoint support host local communication. This - flag may be used in conjunction with FI_REMOTE_COMM to indicate that - local and remote communication are required. If neither FI_LOCAL_COMM - or FI_REMOTE_COMM are specified, then the provider will indicate - support for the configuration that minimally affects performance. - Providers that set FI_LOCAL_COMM but not FI_REMOTE_COMM, for example - a shared memory provider, may only be used to communication between - processes on the same system. - -*FI_REMOTE_COMM* -: Indicates that the endpoint support communication with endpoints - located at remote nodes (across the fabric). See FI_LOCAL_COMM for - additional details. Providers that set FI_REMOTE_COMM but not - FI_LOCAL_COMM, for example NICs that lack loopback support, cannot - be used to communicate with processes on the same system. +*FI_SOURCE* +: Requests that the endpoint return source addressing data as part of + its completion data. 
This capability only applies to connectionless + endpoints. Note that returning source address information may + require that the provider perform address translation and/or look-up + based on data available in the underlying protocol in order to + provide the requested data, which may adversely affect performance. + The performance impact may be greater for address vectors of type + FI_AV_TABLE. *FI_SOURCE_ERR* : Must be paired with FI_SOURCE. When specified, this requests that @@ -402,13 +399,20 @@ additional optimizations. validate incoming source address data against addresses stored in the local address vector, which may adversely affect performance. -*FI_RMA_PMEM* -: Indicates that the provider is 'persistent memory aware' and supports - RMA operations to and from persistent memory. Persistent memory aware - providers must support registration of memory that is backed by non- - volatile memory, RMA transfers to/from persistent memory, and enhanced - completion semantics. This flag requires that FI_RMA be set. - This capability is experimental. +*FI_TAGGED* +: Specifies that the endpoint should handle tagged message transfers. + Tagged message transfers associate a user-specified key or tag with + each message that is used for matching purposes at the remote side. + Endpoints supporting this capability support operations defined by + struct fi_ops_tagged. In the absence of any relevant flags, + FI_TAGGED implies the ability to send and receive tagged messages. + Applications can use the FI_SEND and FI_RECV flags to optimize an + endpoint as send-only or receive-only. + +*FI_TRIGGER* +: Indicates that the endpoint should support triggered operations. + Endpoints support this capability must meet the usage model as + described by fi_trigger.3. *FI_VARIABLE_MSG* @@ -420,18 +424,31 @@ additional optimizations. are any messages larger than an endpoint configurable size. This flag requires that FI_MSG and/or FI_TAGGED be set. -Capabilities may be grouped into two general categories: primary and -secondary. Primary capabilities must explicitly be requested by an -application, and a provider must enable support for only those primary -capabilities which were selected. Secondary capabilities may optionally -be requested by an application. If requested, a provider must support -the capability or fail the fi_getinfo request (FI_ENODATA). A provider +*FI_WRITE* +: Indicates that the user requires an endpoint capable of initiating + writes against remote memory regions. This flag requires that FI_RMA + and/or FI_ATOMIC be set. + +Capabilities may be grouped into three general categories: primary, +secondary, and primary modifiers. Primary capabilities must explicitly +be requested by an application, and a provider must enable support for +only those primary capabilities which were selected. Primary modifiers +are used to limit a primary capability, such as restricting an endpoint +to being send-only. If no modifiers are specified for an applicable +capability, all relevant modifiers are assumed. See above definitions +for details. + +Secondary capabilities may optionally be requested by an application. +If requested, a provider must support the capability or fail the +fi_getinfo request (FI_ENODATA). A provider may optionally report non-selected secondary capabilities if doing so would not compromise performance or security. 
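A minimal sketch of this request-and-verify pattern, assuming API version 1.9; the helper name and the particular capability bits chosen are illustrative.

```c
#include <stdbool.h>
#include <rdma/fabric.h>
#include <rdma/fi_errno.h>

/* Sketch: request only the primary capabilities the application needs,
 * then check for an optional secondary capability on output instead of
 * requiring it up front. */
static int query_caps(struct fi_info **info)
{
	struct fi_info *hints = fi_allocinfo();
	bool have_source;
	int ret;

	if (!hints)
		return -FI_ENOMEM;

	hints->caps = FI_MSG | FI_RMA;   /* primary capabilities */
	ret = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, info);
	fi_freeinfo(hints);
	if (ret)
		return ret;

	/* FI_SOURCE was not requested; rely on it only if the provider
	 * reported it anyway. */
	have_source = ((*info)->caps & FI_SOURCE) != 0;
	(void) have_source;
	return 0;
}
```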
Primary capabilities: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, FI_MULTICAST, -FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_READ, FI_WRITE, FI_RECV, FI_SEND, -FI_REMOTE_READ, FI_REMOTE_WRITE, and FI_VARIABLE_MSG. +FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_VARIABLE_MSG, FI_HMEM, FI_COLLECTIVE + +Primary modifiers: FI_READ, FI_WRITE, FI_RECV, FI_SEND, +FI_REMOTE_READ, FI_REMOTE_WRITE Secondary capabilities: FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, FI_SHARED_AV, FI_TRIGGER, FI_FENCE, FI_LOCAL_COMM, FI_REMOTE_COMM, FI_SOURCE_ERR, FI_RMA_PMEM. @@ -454,6 +471,30 @@ created using the returned fi_info. The set of modes are listed below. If a NULL hints structure is provided, then the provider's supported set of modes will be returned in the info structure(s). +*FI_ASYNC_IOV* +: Applications can reference multiple data buffers as part of a single + operation through the use of IO vectors (SGEs). Typically, + the contents of an IO vector are copied by the provider into an + internal buffer area, or directly to the underlying hardware. + However, when a large number of IOV entries are supported, + IOV buffering may have a negative impact on performance and memory + consumption. The FI_ASYNC_IOV mode indicates that the application + must provide the buffering needed for the IO vectors. When set, + an application must not modify an IO vector of length > 1, including any + related memory descriptor array, until the associated + operation has completed. + +*FI_BUFFERED_RECV* +: The buffered receive mode bit indicates that the provider owns the + data buffer(s) that are accessed by the networking layer for received + messages. Typically, this implies that data must be copied from the + provider buffer into the application buffer. Applications that can + handle message processing from network allocated data buffers can set + this mode bit to avoid copies. For full details on application + requirements to support this mode, see the 'Buffered Receives' section + in [`fi_msg`(3)](fi_msg.3.html). This mode bit applies to FI_MSG and + FI_TAGGED receive operations. + *FI_CONTEXT* : Specifies that the provider requires that applications use struct fi_context as their per operation context parameter for operations @@ -530,25 +571,6 @@ supported set of modes will be returned in the info structure(s). must be a contiguous region, though it may or may not be directly adjacent to the payload portion of the buffer. -*FI_ASYNC_IOV* -: Applications can reference multiple data buffers as part of a single - operation through the use of IO vectors (SGEs). Typically, - the contents of an IO vector are copied by the provider into an - internal buffer area, or directly to the underlying hardware. - However, when a large number of IOV entries are supported, - IOV buffering may have a negative impact on performance and memory - consumption. The FI_ASYNC_IOV mode indicates that the application - must provide the buffering needed for the IO vectors. When set, - an application must not modify an IO vector of length > 1, including any - related memory descriptor array, until the associated - operation has completed. - -*FI_RX_CQ_DATA* -: This mode bit only applies to data transfers that set FI_REMOTE_CQ_DATA. - When set, a data transfer that carries remote CQ data will consume a - receive buffer at the target. This is true even for operations that would - normally not consume posted receive buffers, such as RMA write operations. 
- *FI_NOTIFY_FLAGS_ONLY* : This bit indicates that general completion flags may not be set by the provider, and are not needed by the application. If specified, @@ -563,16 +585,11 @@ supported set of modes will be returned in the info structure(s). and counters among endpoints, transmit contexts, and receive contexts that have the same set of capability flags. -*FI_BUFFERED_RECV* -: The buffered receive mode bit indicates that the provider owns the - data buffer(s) that are accessed by the networking layer for received - messages. Typically, this implies that data must be copied from the - provider buffer into the application buffer. Applications that can - handle message processing from network allocated data buffers can set - this mode bit to avoid copies. For full details on application - requirements to support this mode, see the 'Buffered Receives' section - in [`fi_msg`(3)](fi_msg.3.html). This mode bit applies to FI_MSG and - FI_TAGGED receive operations. +*FI_RX_CQ_DATA* +: This mode bit only applies to data transfers that set FI_REMOTE_CQ_DATA. + When set, a data transfer that carries remote CQ data will consume a + receive buffer at the target. This is true even for operations that would + normally not consume posted receive buffers, such as RMA write operations. # ADDRESSING FORMATS @@ -587,6 +604,47 @@ formats. In some cases, a selected addressing format may need to be translated or mapped into an address which is native to the fabric. See [`fi_av`(3)](fi_av.3.html). +*FI_ADDR_BGQ* +: Address is an IBM proprietary format that is used with their Blue Gene Q + systems. + +*FI_ADDR_EFA* +: Address is an Amazon Elastic Fabric Adapter (EFA) proprietary format. + +*FI_ADDR_GNI* +: Address is a Cray proprietary format that is used with their GNI + protocol. + +*FI_ADDR_PSMX* +: Address is an Intel proprietary format used with their Performance Scaled + Messaging protocol. + +*FI_ADDR_PSMX2* +: Address is an Intel proprietary format used with their Performance Scaled + Messaging protocol version 2. + +*FI_ADDR_PSMX3* +: Address is an Intel proprietary format used with their Performance Scaled + Messaging protocol version 3. + +*FI_ADDR_STR* +: Address is a formatted character string. The length and content of + the string is address and/or provider specific, but in general follows + a URI model: + +``` +address_format[://[node][:[service][/[field3]...][?[key=value][&k2=v2]...]]] +``` + + Examples: + - fi_sockaddr://10.31.6.12:7471 + - fi_sockaddr_in6://[fe80::6:12]:7471 + - fi_sockaddr://10.31.6.12:7471?qos=3 + + Since the string formatted address does not contain any provider + information, the prov_name field of the fabric attribute structure should + be used to filter by provider if necessary. + *FI_FORMAT_UNSPEC* : FI_FORMAT_UNSPEC indicates that a provider specific address format should be selected. Provider specific addresses may be protocol @@ -602,41 +660,19 @@ fabric. See [`fi_av`(3)](fi_av.3.html). will be determined at run time by interfaces examining the sa_family field. +*FI_SOCKADDR_IB* +: Address is of type sockaddr_ib (defined in Linux kernel source) + *FI_SOCKADDR_IN* : Address is of type sockaddr_in (IPv4). *FI_SOCKADDR_IN6* : Address is of type sockaddr_in6 (IPv6). -*FI_SOCKADDR_IB* -: Address is of type sockaddr_ib (defined in Linux kernel source) - *FI_ADDR_PSMX* : Address is an Intel proprietary format that is used with their PSMX (extended performance scaled messaging) protocol. 
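A hedged sketch of resolving a string formatted address as described above; the address is a placeholder, and passing it through the node parameter together with FI_SOURCE assumes it names a local interface.

```c
#include <rdma/fabric.h>
#include <rdma/fi_errno.h>

/* Sketch: select a local source address using FI_ADDR_STR.  The scheme
 * prefix (fi_sockaddr) identifies the underlying address format, not
 * the provider. */
static int get_info_by_str_addr(struct fi_info **info)
{
	struct fi_info *hints = fi_allocinfo();
	int ret;

	if (!hints)
		return -FI_ENOMEM;

	hints->addr_format = FI_ADDR_STR;
	ret = fi_getinfo(FI_VERSION(1, 9), "fi_sockaddr://10.31.6.12:7471",
			 NULL, FI_SOURCE, hints, info);
	fi_freeinfo(hints);
	return ret;
}
```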
-*FI_ADDR_GNI* -: Address is a Cray proprietary format that is used with their GNI - protocol. - -*FI_ADDR_STR* -: Address is a formatted character string. The length and content of - the string is address and/or provider specific, but in general follows - a URI model: - -``` -address_format[://[node][:[service][/[field3]...][?[key=value][&k2=v2]...]]] -``` - - Examples: - - fi_sockaddr://10.31.6.12:7471 - - fi_sockaddr_in6://[fe80::6:12]:7471 - - fi_sockaddr://10.31.6.12:7471?qos=3 - - Since the string formatted address does not contain any provider - information, the prov_name field of the fabric attribute structure should - be used to filter by provider if necessary. - # FLAGS The operation of the fi_getinfo call may be controlled through the use of @@ -647,12 +683,6 @@ input flags. Valid flags include the following. of a fabric address, such as a dotted decimal IP address. Use of this flag will suppress any lengthy name resolution protocol. -*FI_SOURCE* -: Indicates that the node and service parameters specify the local - source address to associate with an endpoint. If specified, either - the node and/or service parameter must be non-NULL. This flag is - often used with passive endpoints. - *FI_PROV_ATTR_ONLY* : Indicates that the caller is only querying for what providers are potentially available. All providers will return exactly one @@ -662,6 +692,12 @@ input flags. Valid flags include the following. The fabric_attr member will have the prov_name and prov_version values filled in. +*FI_SOURCE* +: Indicates that the node and service parameters specify the local + source address to associate with an endpoint. If specified, either + the node and/or service parameter must be non-NULL. This flag is + often used with passive endpoints. + # RETURN VALUE fi_getinfo() returns 0 on success. On error, fi_getinfo() returns a @@ -681,13 +717,13 @@ via fi_freeinfo(). : The specified endpoint or domain capability or operation flags are invalid. -*FI_ENOMEM* -: Indicates that there was insufficient memory to complete the operation. - *FI_ENODATA* : Indicates that no providers could be found which support the requested fabric information. +*FI_ENOMEM* +: Indicates that there was insufficient memory to complete the operation. + # NOTES If hints are provided, the operation will be controlled by the values diff --git a/man/fi_info.1.md b/man/fi_info.1.md index 55f66f45e78..92fd64de81d 100644 --- a/man/fi_info.1.md +++ b/man/fi_info.1.md @@ -70,9 +70,13 @@ providers, see the `--list` option. ## Discovery *-e, --env* -: List libfabric related environment levels which can be used to enable extra +: List libfabric related environment variables which can be used to enable extra configuration or tuning. +*-g [filter] +: Same as -e option, with output limited to environment variables containing +filter as a substring. + *-l, --list* : List available libfabric providers. diff --git a/man/fi_mlx.7.md b/man/fi_mlx.7.md index c6688ebadfa..0c382356f88 100644 --- a/man/fi_mlx.7.md +++ b/man/fi_mlx.7.md @@ -11,60 +11,8 @@ fi_mlx \- The MLX Fabric Provider # OVERVIEW -The *mlx* provider runs over the UCX library -that is currently supported by the Mellanox infiniband fabrics. -The *mlx* provider makes use of UCX tag matching API in order to -implement a limited set of the libfabric data transfer APIs, namely, -tagged message queue. - -Supported UCP API version: 1.0 - -# LIMITATIONS - -The *mlx* provider doesn't support all the features defined in the -libfabric API. 
Here are some of the limitations: - -Endpoint types -: Only supported type: *FI_RDM* - -Endpoint capabilities -: Endpoints can support the only data transfer capability - *FI_TAGGED*. - - -Modes -: *FI_CONTEXT* is required. That means, all the requests that generate - completions must have a valid pointer to type *struct fi_context* - passed as the operation context. - -Threading -: The supported mode is FI_THREAD_DOMAIN, i.e. the *mlx* provider is not thread safe. - - -Unsupported features -: These features are unsupported: connection management, event queue, - scalable endpoint, passive endpoint, shared receive context, - rma, atomics. - - -# RUNTIME PARAMETERS - -*FI_MLX_CONFIG* -: The path to the MLX configuration file (default: none). - -*FI_MLX_TINJECT_LIMIT* -: Maximal tinject message size (default: 1024). - -*FI_MLX_NS_ENABLE* -: Enforce usage of name server functionality for MLX provider - (default: disabled). - -*FI_MLX_NS_PORT* -: MLX provider's name server port (default: 12345). - -*FI_MLX_NS_IFACE* -: IPv4 network interface for MLX provider's name server - (default: any). +The mlx provider was deprecated and removed in libfabric 1.9 +due to a lack of a maintainer. # SEE ALSO diff --git a/man/fi_mr.3.md b/man/fi_mr.3.md index 1ac392a2884..d460c679f2b 100644 --- a/man/fi_mr.3.md +++ b/man/fi_mr.3.md @@ -32,7 +32,7 @@ fi_mr_unmap_key : Releases a previously mapped raw memory region key. fi_mr_bind -: Associate a registered memory region with a completion counter. +: Associate a registered memory region with a completion counter or an endpoint. fi_mr_refresh : Updates the memory pages associated with a memory region. @@ -93,10 +93,10 @@ int fi_mr_enable(struct fid_mr *mr); : User specified context associated with the memory region. *buf* -: Memory buffer to register with the fabric hardware +: Memory buffer to register with the fabric hardware. *len* -: Length of memory buffer to register +: Length of memory buffer to register. Must be > 0. *iov* : Vectored memory buffer. @@ -129,29 +129,44 @@ of a remote RMA or atomic data transfer. Additionally, a fabric provider may require that data buffers be registered before being used in local transfers. Memory registration restrictions are controlled using a separate set of mode bits, specified through the domain -attributes (mr_mode field). +attributes (mr_mode field). Each mr_mode bit requires that an +application take specific steps in order to use memory buffers with +libfabric interfaces. The following apply to memory registration. -*Scalable Memory Registration* -: By default, memory registration is considered scalable. (For library versions - 1.4 and earlier, this is indicated by setting mr_mode to FI_MR_SCALABLE, - with the fi_info mode bit FI_LOCAL_MR set to 0). For versions 1.5 and later, - scalable is implied by the lack of any mr_mode bits being set. The setting - of mr_mode bits therefore adjusts application behavior as described below. - Default, scalable registration has several properties. - - In scalable mode, registration occurs on memory address ranges. - Because registration refers to memory regions, versus data buffers, the - address ranges given for a registration request do not need to map to +*Default Memory Registration* +: If no mr_mode bits are set, the default behaviors describe below are + followed. Historically, these defaults were collectively referred to as + scalable memory registration. The default requirements are outlined below, + followed by definitions of how each mr_mode bit alters the definition. 
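A minimal sketch of the mr_mode negotiation described here: the application advertises the registration behaviors it is prepared to handle, and after fi_getinfo it tests which bits the provider kept. The particular bit set is illustrative.

```c
#include <rdma/fabric.h>

/* Sketch: advertise which registration behaviors the application can
 * handle; the provider clears any bits it does not need. */
static void set_mr_modes(struct fi_info *hints)
{
	hints->domain_attr->mr_mode = FI_MR_LOCAL | FI_MR_ALLOCATED |
				      FI_MR_VIRT_ADDR | FI_MR_PROV_KEY;
}

/* After fi_getinfo() returns, test the surviving bits. */
static int needs_local_registration(const struct fi_info *info)
{
	return (info->domain_attr->mr_mode & FI_MR_LOCAL) != 0;
}
```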
+ + Compatibility: For library versions 1.4 and earlier, this was indicated by + setting mr_mode to FI_MR_SCALABLE and the fi_info mode bit FI_LOCAL_MR to 0. + FI_MR_SCALABLE and FI_LOCAL_MR were deprecated in libfabric version 1.5, + though they are supported for backwards compatibility purposes. + + For security, memory registration is required for data buffers that are + accessed directly by a peer process. For example, registration is + required for RMA target buffers (read or written to), and those accessed + by atomic or collective operations. + + By default, registration occurs on virtual address ranges. + Because registration refers to address ranges, rather than allocated + data buffers, the address ranges do not need to map to data buffers allocated by the application at the time the registration call is made. That is, an application can register any range of addresses in their virtual address space, whether or not those addresses are backed by physical pages or have been allocated. - The resulting memory regions are accessible by peers starting at a base - address of 0. That is, the target address that is specified is a byte - offset into the registered region. + Note that physical pages must back addresses prior to the addresses being + accessed as part of a data transfer operation, or the data transfers will + fail. Additionally, depending on the operation, this could result in the + local process receiving a segmentation fault for accessing invalid memory. + + Once registered, the resulting memory regions are accessible by peers starting + at a base address of 0. That is, the target address that is specified is a + byte offset into the registered region. The application also selects the access key associated with the MR. The key size is restricted to a maximum of 8 bytes. @@ -161,14 +176,30 @@ The following apply to memory registration. tagged sends, RMA, and atomics -- as well as buffers posted for receive and tagged receive operations. + Although the default memory registration behavior is convenient for + application developers, it is difficult to implement in hardware. + Attempts to hide the hardware requirements from the application often + results in significant and unacceptable impacts to performance. The + following mr_mode bits are provided as input into fi_getinfo. If a + provider requires the behavior defined for an mr_mode bit, it will leave + the bit set on output to fi_getinfo. Otherwise, the provider can clear + the bit to indicate that the behavior is not needed. + + By setting an mr_mode bit, the application has agreed to adjust its + behavior as indicated. Importantly, applications that choose to support + an mr_mode must be prepared to handle the case where the mr_mode is + not required. A provider will clear an mr_mode bit if it is not needed. + *FI_MR_LOCAL* : When the FI_MR_LOCAL mode bit is set, applications must register all data buffers that will be accessed by the local hardware and provide - a valid mem_desc parameter into applicable data transfer operations. + a valid desc parameter into applicable data transfer operations. When FI_MR_LOCAL is zero, applications are not required to register data buffers before using them for local operations (e.g. send and - receive data buffers), and the mem_desc parameter into data transfer - operations is ignored. + receive data buffers). The desc parameter into data transfer + operations will be ignored in this case, unless otherwise required + (e.g. se FI_MR_HMEM). 
It is recommended that applications pass in + NULL for desc when not required. A provider may hide local registration requirements from applications by making use of an internal registration cache or similar mechanisms. @@ -245,18 +276,36 @@ The following apply to memory registration. MR with an endpoint, the application must use fi_mr_bind(). To enable the memory region, the application must call fi_mr_enable(). +*FI_MR_HMEM* +: This mode bit is associated with the FI_HMEM capability. + If FI_MR_HMEM is set, the application must register buffers that + were allocated using a device call and provide a valid desc + parameter into applicable data transfer operations even if they are + only used for local operations (e.g. send and receive data buffers). + Device memory must be registered using the fi_mr_regattr call, with + the iface and device fields filled out. + + If FI_MR_HMEM is set, but FI_MR_LOCAL is unset, only device buffers + must be registered when used locally. In this case, the desc parameter + passed into data transfer operations must either be valid or NULL. + Similarly, if FI_MR_LOCAL is set, but FI_MR_HMEM is not, the desc + parameter must either be valid or NULL. + *Basic Memory Registration* -: Basic memory registration is indicated by the FI_MR_BASIC mr_mode bit. - FI_MR_BASIC is maintained for backwards compatibility (libfabric version - 1.4 or earlier). The behavior of basic registration is equivalent - to setting the following mr_mode bits to one: FI_MR_VIRT_ADDR, +: Basic memory registration was deprecated in libfabric version 1.5, but + is supported for backwards compatibility. Basic memory registration + is indicated by setting mr_mode equal to FI_MR_BASIC. + FI_MR_BASIC must be set alone and not paired with mr_mode bits. + Unlike other mr_mode bits, if FI_MR_BASIC is set on input to fi_getinfo(), + it will not be cleared by the provider. That is, setting mr_mode equal to + FI_MR_BASIC forces basic registration if the provider supports it. + + The behavior of basic registration is equivalent + to requiring the following mr_mode bits: FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, and FI_MR_PROV_KEY. Additionally, providers that - support basic registration usually required FI_MR_LOCAL. FI_MR_BASIC - must either be set alone, or in conjunction with FI_MR_LOCAL. Other - mr_mode bit pairings are invalid. Unlike other mr_mode bits, if - FI_MR_BASIC is set on input to fi_getinfo(), it will not be cleared - by the provider. That is, setting FI_MR_BASIC - to one requests basic registration. + support basic registration usually require the (deprecated) fi_info mode + bit FI_LOCAL_MR, which was incorporated into the FI_MR_LOCAL mr_mode + bit. The registrations functions -- fi_mr_reg, fi_mr_regv, and fi_mr_regattr -- are used to register one or more memory regions with @@ -452,6 +501,12 @@ struct fi_mr_attr { void *context; size_t auth_key_size; uint8_t *auth_key; + enum fi_hmem_iface iface; + union { + uint64_t reserved; + int cuda; + int ze + } device; }; ``` ## mr_iov @@ -540,10 +595,40 @@ version 1.5 or greater. Indicates the key to associate with this memory registration. Authorization keys are used to limit communication between endpoints. Only peer endpoints that are programmed to use the same authorization key may access the memory -region. The domain authorization key will be used if the auth_key_size -provided is 0. This field is ignored unless the fabric is opened with API +region. The domain authorization key will be used if the auth_key_size +provided is 0. 
This field is ignored unless the fabric is opened with API version 1.5 or greater. +## iface +Indicates the software interfaces used by the application to allocate and +manage the memory region. This field is ignored unless the application has +requested the FI_HMEM capability. + +*FI_HMEM_SYSTEM* +: Uses standard operating system calls and libraries, such as malloc, + calloc, realloc, mmap, and free. + +*FI_HMEM_CUDA* +: Uses Nvidia CUDA interfaces such as cuMemAlloc, cuMemAllocHost, + cuMemAllocManaged, cuMemFree, cudaMalloc, cudaFree. + +*FI_HMEM_ROCR* +: Uses AMD ROCR interfaces such as hsa_memory_allocate and hsa_memory_free. + +*FI_HMEM_ZE* +: Uses Intel L0 ZE interfaces such as zeDriverAllocSharedMem, + zeDriverFreeMem. + +## device +Reserved 64 bits for device identifier if using non-standard HMEM interface. +This field is ignore unless the iface field is valid. + +*cuda* +: For FI_HMEM_CUDA, this is equivalent to CUdevice (int). + +*ze* +: For FI_HMEM_ZE, this is equivalent to the ze_device_handle_t index (int). + # NOTES Direct access to an application's memory by a remote peer requires that @@ -612,13 +697,18 @@ Fabric errno values are defined in Many hardware NICs accessed by libfabric require that data buffers be registered with the hardware while the hardware accesses it. This ensures that the virtual to physical address mappings for those buffers do not change -while the transfer is ocurring. The performance impact of registering +while the transfer is occurring. The performance impact of registering memory regions can be significant. As a result, some providers make use of a registration cache, particularly when working with applications that are unable to manage their own network buffers. A registration cache avoids the overhead of registering and unregistering a data buffer with each transfer. +If a registration cache is going to be used for host and device memory, the +device must support unified virtual addressing. If the device does not +support unified virtual addressing, either an additional registration cache +is required to track this device memory, or device memory cannot be cached. + As a general rule, if hardware requires the FI_MR_LOCAL mode bit described above, but this is not supported by the application, a memory registration cache _may_ be in use. The following environment variables may be used to @@ -639,14 +729,30 @@ configure registration caches. are not actively being used as part of a data transfer. Setting this to zero will disable registration caching. -*FI_MR_CACHE_MERGE_REGIONS* -: If this variable is set to true, yes, or 1, then memory regions that are - adjacent or overlapping will be merged into a single larger region. Merging - regions reduces the total cache size and the number of regions managed by - the cache. However, merging regions can have a negative impact on - performance if a large number of adjacent regions are sent as separate data - transfers (such as sending elements of an array to peer(s)), and the larger - region is access infrequently. By default merging regions is disabled. +*FI_MR_CACHE_MONITOR* +: The cache monitor is responsible for detecting system memory (FI_HMEM_SYSTEM) + changes made between the virtual addresses used by an application and the + underlying physical pages. Valid monitor options are: userfaultfd, memhooks, + and disabled. Selecting disabled will turn off the registration cache. + Userfaultfd is a Linux kernel feature used to report virtual to physical + address mapping changes to user space. 
Memhooks operates by intercepting + relevant memory allocation and deallocation calls which may result in the + mappings changing, such as malloc, mmap, free, etc. Note that memhooks + operates at the elf linker layer, and does not use glibc memory hooks. + +*FI_MR_CUDA_CACHE_MONITOR_ENABLED* +: The CUDA cache monitor is responsible for detecting CUDA device memory + (FI_HMEM_CUDA) changes made between the device virtual addresses used by an + application and the underlying device physical pages. Valid monitor options + are: 0 or 1. Note that the CUDA memory monitor requires a CUDA toolkit version + with unified virtual addressing enabled. + +*FI_MR_ROCR_CACHE_MONITOR_ENABLED* +: The ROCR cache monitor is responsible for detecting ROCR device memory + (FI_HMEM_ROCR) changes made between the device virtual addresses used by an + application and the underlying device physical pages. Valid monitor options + are: 0 or 1. Note that the ROCR memory monitor requires a ROCR version with + unified virtual addressing enabled. # SEE ALSO diff --git a/man/fi_mrail.7.md b/man/fi_mrail.7.md index 6c12cab6e74..97f0ba2fb6a 100644 --- a/man/fi_mrail.7.md +++ b/man/fi_mrail.7.md @@ -13,7 +13,7 @@ fi_mrail \- The Multi-Rail Utility Provider The mrail provider (ofi_mrail) is an utility provider that layers over an underlying provider to enable the use of multiple network ports (rails). This increases -the total available bandwidth of an underlying proivder. The current status of +the total available bandwidth of an underlying provider. The current status of mrail provider is experimental - not all libfabric features are supported and performance is not guaranteed. @@ -34,7 +34,7 @@ capabilities / modes: Applications need to: * Support FI_MR_RAW MR mode bit to make use of FI_RMA capability. - * Set FI_OFI_MRAIL_ADDR_STRC env variable (see RUNTIME PARAMETERS section below). + * Set FI_OFI_MRAIL_ADDR env variable (see RUNTIME PARAMETERS section below). # SUPPORTED FEATURES @@ -69,16 +69,31 @@ feature not listed in "Supported features" can be assumed as unsupported. # FUNCTIONALITY OVERVIEW -For messages (FI_MSG, FI_TAGGED), the provider sends one message per rail in a -round-robin manner. Ordering is guaranteed through the use of sequence numbers. +For messages (FI_MSG, FI_TAGGED), the provider uses different policies to send messages +over one or more rails based on message size (See *FI_OFI_MRIAL_CONFIG* in the RUNTIME +PARAMETERS section). Ordering is guaranteed through the use of sequence numbers. + For RMA, the data is striped equally across all rails. # RUNTIME PARAMETERS The ofi_mrail provider checks for the following environment variables. +*FI_OFI_MRAIL_ADDR* +: Comma delimited list of individual rail addresses. Each address can be an address in + FI_ADDR_STR format, a host name, an IP address, or a netdev interface name. + *FI_OFI_MRAIL_ADDR_STRC* -: Comma delimited list of individual rail addresses in FI_ADDR_STR format. +: Deprecated. Replaced by *FI_OFI_MRAIL_ADDR*. + +*FI_OFI_MRAIL_CONFIG* +: Comma separated list of `:` pairs, sorted in ascending order of + ``. Each pair indicated the rail sharing policy to be used for messages + up to the size `` and not covered by all previous pairs. The value of + `` can be *fixed* (a fixed rail is used), *round-robin* (one rail per + message, selected in round-robin fashion), or *striping* (striping across all the + rails). The default configuration is `16384:fixed,ULONG_MAX:striping`. The value + ULONG_MAX can be input as -1. 
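Since these runtime parameters are plain environment variables, an application can also set them programmatically before libfabric is initialized. The following is a minimal sketch, not part of the mrail man page itself: the helper name `setup_mrail_env`, the rail names `eth0,eth1`, and the exact size/policy pairs are illustrative placeholders, and the variables are assumed to be set before the first call to fi_getinfo(3) so the provider sees them when it loads.

```c
/*
 * Sketch: configure the mrail provider through its environment variables.
 * The interface names and the config string below are placeholder values.
 */
#include <stdlib.h>

static int setup_mrail_env(void)
{
	/* Two rails, identified here by netdev interface names. */
	if (setenv("FI_OFI_MRAIL_ADDR", "eth0,eth1", 1))
		return -1;

	/*
	 * Messages up to 16 KB use a fixed rail; larger messages are striped
	 * across all rails.  -1 is accepted as shorthand for ULONG_MAX.
	 */
	if (setenv("FI_OFI_MRAIL_CONFIG", "16384:fixed,-1:striping", 1))
		return -1;

	return 0;
}
```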
# SEE ALSO diff --git a/man/fi_msg.3.md b/man/fi_msg.3.md index 0879a71faa8..26d806c04ac 100644 --- a/man/fi_msg.3.md +++ b/man/fi_msg.3.md @@ -12,7 +12,7 @@ fi_msg - Message data transfer operations fi_recv / fi_recvv / fi_recvmsg : Post a buffer to receive an incoming message -fi_send / fi_sendv / fi_sendmsg +fi_send / fi_sendv / fi_sendmsg fi_inject / fi_senddata : Initiate an operation to send a message @@ -68,7 +68,7 @@ ssize_t fi_injectdata(struct fid_ep *ep, const void *buf, size_t len, : Count of vectored data entries. *desc* -: Descriptor associated with the data buffer +: Descriptor associated with the data buffer. See [`fi_mr`(3)](fi_mr.3.html). *data* : Remote CQ data to transfer with the sent message. @@ -127,11 +127,7 @@ event details. The call fi_send transfers the data contained in the user-specified data buffer to a remote endpoint, with message boundaries being -maintained. For connection based endpoints (FI_EP_MSG) the local -endpoint must be connected to a remote endpoint or destination before -fi_send is called. Unless the endpoint has been configured -differently, the data buffer passed into fi_send must not be touched -by the application until the fi_send call completes asynchronously. +maintained. ## fi_sendv @@ -143,7 +139,7 @@ message. ## fi_sendmsg The fi_sendmsg call supports data transfers over both connected and -unconnected endpoints, with the ability to control the send operation +connectionless endpoints, with the ability to control the send operation per call through the use of flags. The fi_sendmsg function takes a `struct fi_msg` as input. @@ -192,7 +188,7 @@ corresponding endpoint. Posted receives are searched in the order in which they were posted in order to match sends. Message boundaries are maintained. The order in which the receives complete is dependent on -the endpoint type and protocol. For unconnected endpoints, the +the endpoint type and protocol. For connectionless endpoints, the src_addr parameter can be used to indicate that a buffer should be posted to receive incoming data from a specific remote endpoint. @@ -205,7 +201,7 @@ parameter to a receive incoming data. ## fi_recvmsg The fi_recvmsg call supports posting buffers over both connected and -unconnected endpoints, with the ability to control the receive +connectionless endpoints, with the ability to control the receive operation per call through the use of flags. The fi_recvmsg function takes a struct fi_msg as input. @@ -276,7 +272,7 @@ fi_sendmsg. *FI_INJECT_COMPLETE* : Applies to fi_sendmsg. Indicates that a completion should be generated when the source buffer(s) may be reused. - + *FI_TRANSMIT_COMPLETE* : Applies to fi_sendmsg. Indicates that a completion should not be generated until the operation has been successfully transmitted and @@ -293,7 +289,7 @@ fi_sendmsg. targeting the same peer endpoint have completed. Operations posted after the fencing will see and/or replace the results of any operations initiated prior to the fenced operation. - + The ordering of operations starting at the posting of the fenced operation (inclusive) to the posting of a subsequent fenced operation (exclusive) is controlled by the endpoint's ordering semantics. diff --git a/man/fi_netdir.7.md b/man/fi_netdir.7.md index 634a893502b..dccf4c72ec3 100644 --- a/man/fi_netdir.7.md +++ b/man/fi_netdir.7.md @@ -64,9 +64,6 @@ libfabric API: # LIMITATIONS -The Network Direct is an experimental provider. The full support of the Network -Direct provider will be added to 1.6 release version of libfabric. 
- *Memory Regions* : Only FI_MR_BASIC mode is supported. Adding regions via s/g list is supported only up to a s/g list size of 1. No support for binding memory diff --git a/man/fi_nic.3.md b/man/fi_nic.3.md index dcdfef98bbe..ed8e99e0577 100644 --- a/man/fi_nic.3.md +++ b/man/fi_nic.3.md @@ -140,7 +140,7 @@ into the fabric. Provider attributes reference provider specific details of the device. These attributes are both provider and device specific. The attributes -can be interpretted by [`fi_tostr`(3)](fi_tostr.3.html). Applications +can be interpreted by [`fi_tostr`(3)](fi_tostr.3.html). Applications may also use the other attribute fields, such as related fi_fabric_attr: prov_name field, to determine an appropriate structure to cast the attributes. The format and definition of this field is outside the diff --git a/man/fi_poll.3.md b/man/fi_poll.3.md index 053a958545d..2ab5a27b108 100644 --- a/man/fi_poll.3.md +++ b/man/fi_poll.3.md @@ -174,7 +174,8 @@ struct fi_wait_attr { allow applications to block until the wait object is signaled, indicating that an event is available to be read. The following values may be used to specify the type of wait object associated - with a wait set: FI_WAIT_UNSPEC, FI_WAIT_FD, and FI_WAIT_MUTEX_COND. + with a wait set: FI_WAIT_UNSPEC, FI_WAIT_FD, FI_WAIT_MUTEX_COND, + and FI_WAIT_YIELD. - *FI_WAIT_UNSPEC* : Specifies that the user will only wait on the wait set using @@ -185,21 +186,30 @@ struct fi_wait_attr { retrieve the underlying wait object. - *FI_WAIT_FD* -: Indicates that the wait set should use file descriptor(s) as its wait - mechanism. It may not always be possible for a wait set to be implemented - using a single underlying file descriptor, but all wait objects will be file - descriptors. File descriptor wait objects must be usable in the - POSIX select(2), poll(2), and epoll(7) routines (if - available). However, a provider may signal an FD wait object by - marking it as readable or with an error. +: Indicates that the wait set should use a single file descriptor as + its wait mechanism, as exposed to the application. Internally, this + may require the use of epoll in order to support waiting on a single + file descriptor. File descriptor wait objects must be usable in the + POSIX select(2) and poll(2), and Linux epoll(7) routines (if + available). Provider signal an FD wait object by marking it as + readable or with an error. - *FI_WAIT_MUTEX_COND* : Specifies that the wait set should use a pthread mutex and cond variable as a wait object. -- *FI_WAIT_CRITSEC_COND* -: Windows specific. Specifies that the EQ should use a critical - section and condition variable as a wait object. +- *FI_WAIT_POLLFD* +: This option is similar to FI_WAIT_FD, but allows the wait mechanism to use + multiple file descriptors as its wait mechanism, as viewed by the + application. The use of FI_WAIT_POLLFD can eliminate the need to use + epoll to abstract away needing to check multiple file descriptors when + waiting for events. The file descriptors must be usable in the POSIX + select(2) and poll(2) routines, and match directly to being used with + poll. See the NOTES section below for details on using pollfd. + +- *FI_WAIT_YIELD* +: Indicates that the wait set will wait without a wait object but instead + yield on every wait. *flags* : Flags that set the default operation of the wait set. The use of @@ -276,20 +286,24 @@ processing. ## fi_control The fi_control call is used to access provider or implementation specific -details of the wait set. 
Access to the wait set should be serialized across -all calls when fi_control is invoked, as it may redirect the implementation -of wait set operations. The following control commands are usable with a -wait set. +details of a fids that support blocking calls, such as wait sets, completion +queues, counters, and event queues. Access to the wait set or fid should be +serialized across all calls when fi_control is invoked, as it may redirect +the implementation of wait set operations. The following control commands +are usable with a wait set or fid. *FI_GETWAIT (void \*\*)* : This command allows the user to retrieve the low-level wait object - associated with the wait set. The format of the wait set is specified + associated with a wait set or fid. The format of the wait set is specified during wait set creation, through the wait set attributes. The fi_control arg parameter should be an address where a pointer to the returned wait object will be written. This should be an 'int *' for FI_WAIT_FD, - or 'struct fi_mutex_cond' for FI_WAIT_MUTEX_COND. Support for FI_GETWAIT - is provider specific and may fail if not supported or if the wait set is - implemented using more than one wait object. + 'struct fi_mutex_cond' for FI_WAIT_MUTEX_COND, or 'struct fi_wait_pollfd' + for FI_WAIT_POLLFD. Support for FI_GETWAIT is provider specific. + +*FI_GETWAITOBJ (enum fi_wait_obj \*)* +: This command returns the type of wait object associated with a wait set + or fid. # RETURN VALUES @@ -305,6 +319,65 @@ fi_poll # NOTES +In many situations, blocking calls may need to wait on signals sent +to a number of file descriptors. For example, this is the case for +socket based providers, such as tcp and udp, as well as utility providers +such as multi-rail. For simplicity, when epoll is available, it can +be used to limit the number of file descriptors that an application +must monitor. The use of epoll may also be required in order +to support FI_WAIT_FD. + +However, in order to support waiting on multiple file descriptors on systems +where epoll support is not available, or where epoll performance may +negatively impact performance, FI_WAIT_POLLFD provides this mechanism. +A significant different between using POLLFD versus FD wait objects +is that with FI_WAIT_POLLFD, the file descriptors may change dynamically. +As an example, the file descriptors associated with a completion queues' +wait set may change as endpoint associations with the CQ are added and +removed. + +Struct fi_wait_pollfd is used to retrieve all file descriptors for fids +using FI_WAIT_POLLFD to support blocking calls. + +```c +struct fi_wait_pollfd { + uint64_t change_index; + size_t nfds; + struct pollfd *fd; +}; +``` + +*change_index* +: The change_index may be used to determine if there have been any changes + to the file descriptor list. Anytime a file descriptor is added, removed, + or its events are updated, this field is incremented by the provider. + Applications wishing to wait on file descriptors directly should cache + the change_index value. Before blocking on file descriptor events, the + app should use fi_control() to retrieve the current change_index and + compare that against its cached value. If the values differ, then the + app should update its file descriptor list prior to blocking. + +*nfds* +: On input to fi_control(), this indicates the number of entries in the + struct pollfd * array. On output, this will be set to the number of + entries needed to store the current number of file descriptors. 
If + the input value is smaller than the output value, fi_control() will + return the error -FI_ETOOSMALL. Note that setting nfds = 0 allows + an efficient way of checking the change_index. + +*fd* +: This points to an array of struct pollfd entries. The number of entries + is specified through the nfds field. If the number of needed entries + is less than or equal to the number of entries available, the struct + pollfd array will be filled out with a list of file descriptors and + corresponding events that can be used in the select(2) and poll(2) + calls. + +The change_index is updated only when the file descriptors associated with +the pollfd file set has changed. Checking the change_index is an additional +step needed when working with FI_WAIT_POLLFD wait objects directly. The use +of the fi_trywait() function is still required if accessing wait objects +directly. # SEE ALSO diff --git a/man/fi_provider.7.md b/man/fi_provider.7.md index 1689a585fd9..adb0f796001 100644 --- a/man/fi_provider.7.md +++ b/man/fi_provider.7.md @@ -33,6 +33,14 @@ This distribution of libfabric contains the following providers : High-speed InfiniBand networking from Intel. See [`fi_psm`(7)](fi_psm.7.html) for more information. +*PSM2* +: High-speed Omni-Path networking from Intel. See + [`fi_psm2`(7)](fi_psm2.7.html) for more information. + +*PSM3* +: High-speed Ethernet networking from Intel. See + [`fi_psm3`(7)](fi_psm3.7.html) for more information. + *Sockets* : A general purpose provider that can be used on any network that supports TCP/UDP sockets. This provider is not intended to provide @@ -64,6 +72,13 @@ This distribution of libfabric contains the following providers hardware interface for inter-instance communication on EC2. See [`fi_efa`(7)](fi_efa.7.html) for more information. +*SHM* +: A provider for intranode communication using shared memory. + The provider makes use of the Linux kernel feature Cross Memory + Attach (CMA) which allows processes to have full access to another + process' address space. + See [`fi_shm`(7)](fi_shm.7.html) for more information. + ## Utility providers *RxM* @@ -71,6 +86,11 @@ This distribution of libfabric contains the following providers endpoints emulated over MSG endpoints of a core provider. See [`fi_rxm`(7)](fi_rxm.7.html) for more information. +*RxD* +: The RxD provider (ofi_rxd) is a utility provider that supports RDM + endpoints emulated over DGRAM endpoints of a core provider. + See [`fi_rxd`(7)](fi_rxd.7.html) for more information. + ## Special providers *Hook* diff --git a/man/fi_psm.7.md b/man/fi_psm.7.md index 80514769f31..961125a11db 100644 --- a/man/fi_psm.7.md +++ b/man/fi_psm.7.md @@ -165,3 +165,4 @@ The *psm* provider checks for the following environment variables: [`fabric`(7)](fabric.7.html), [`fi_provider`(7)](fi_provider.7.html), [`fi_psm2`(7)](fi_psm2.7.html), +[`fi_psm3`(7)](fi_psm3.7.html), diff --git a/man/fi_psm2.7.md b/man/fi_psm2.7.md index 7de5b213900..686122527c6 100644 --- a/man/fi_psm2.7.md +++ b/man/fi_psm2.7.md @@ -115,6 +115,12 @@ The *psm2* provider checks for the following environment variables: The default UUID is 00FF00FF-0000-0000-0000-00FF0F0F00FF. + It is possible to create endpoints with UUID different from the one + set here. To achieve that, set 'info->ep_attr->auth_key' to the uuid + value and 'info->ep_attr->auth_key_size' to its size (16 bytes) when + calling fi_endpoint() or fi_scalable_ep(). It is still true that an + endpoint can only communicate with endpoints with the same UUID. 
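  As an illustration of the paragraph above, the following sketch shows one
  way an application might supply a per-endpoint UUID through
  'info->ep_attr->auth_key'. It is not taken from the provider sources: the
  helper name, the 'info'/'domain' variables (assumed to come from earlier
  fi_getinfo()/fi_domain() calls), and the UUID bytes are all placeholders.

```c
/*
 * Sketch: open an endpoint whose job UUID differs from FI_PSM2_UUID.
 * Assumes 'info' and 'domain' were obtained from prior fi_getinfo() and
 * fi_domain() calls.  The UUID value is a placeholder.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <rdma/fabric.h>
#include <rdma/fi_errno.h>
#include <rdma/fi_endpoint.h>

static int open_ep_with_uuid(struct fid_domain *domain, struct fi_info *info,
			     struct fid_ep **ep)
{
	static const uint8_t uuid[16] = {
		0x00, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0x00, 0x00,
		0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x0F, 0x0F
	};
	struct fi_info *ep_info;
	int ret;

	ep_info = fi_dupinfo(info);
	if (!ep_info)
		return -FI_ENOMEM;

	ep_info->ep_attr->auth_key = malloc(sizeof(uuid));
	if (!ep_info->ep_attr->auth_key) {
		fi_freeinfo(ep_info);
		return -FI_ENOMEM;
	}
	memcpy(ep_info->ep_attr->auth_key, uuid, sizeof(uuid));
	ep_info->ep_attr->auth_key_size = sizeof(uuid);	/* 16 bytes */

	ret = fi_endpoint(domain, ep_info, ep, NULL);
	fi_freeinfo(ep_info);
	return ret;
}
```

  The endpoint created this way can only communicate with endpoints that
  were opened with the same UUID.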
+ *FI_PSM2_NAME_SERVER* : The *psm2* provider has a simple built-in name server that can be used to resolve an IP address or host name into a transport address needed @@ -256,8 +262,21 @@ The *psm2* provider checks for the following environment variables: to 1 (means *tag60*) or 2 (means *tag64*), the choice is fixed at compile time and this runtime option will be disabled. +# PSM2 EXTENSIONS + +The *psm2* provider supports limited low level parameter setting through the +fi_set_val() and fi_get_val() functions. Currently the following parameters +can be set via the domain fid: + +* FI_PSM2_DISCONNECT * +: Overwite the global runtime parameter *FI_PSM2_DISCONNECT* for this domain. + See the *RUNTIME PARAMETERS* section for details. + +Valid parameter names are defined in the header file *rdma/fi_ext_psm2.h*. + # SEE ALSO [`fabric`(7)](fabric.7.html), [`fi_provider`(7)](fi_provider.7.html), [`fi_psm`(7)](fi_psm.7.html), +[`fi_psm3`(7)](fi_psm3.7.html), diff --git a/man/fi_psm3.7.md b/man/fi_psm3.7.md new file mode 100644 index 00000000000..23256bb8eff --- /dev/null +++ b/man/fi_psm3.7.md @@ -0,0 +1,265 @@ +--- +layout: page +title: fi_psm3(7) +tagline: Libfabric Programmer's Manual +--- +{% include JB/setup %} + +# NAME + +fi_psm3 \- The PSM3 Fabric Provider + +# OVERVIEW + +The *psm3* provider implements a Performance Scaled Messaging +capability which supports Intel RoCEv2 capable NICs. PSM3 represents +an Ethernet and standard RoCEv2 enhancement of previous PSM +implementations. + +# SUPPORTED FEATURES + +The *psm3* provider supports a subset of all the features defined in the +libfabric API. + +Endpoint types +: Supports non-connection based types *FI_DGRAM* and *FI_RDM*. + +Endpoint capabilities +: Endpoints can support any combination of data transfer capabilities + *FI_TAGGED*, *FI_MSG*, *FI_ATOMICS*, and *FI_RMA*. These capabilities + can be further refined by *FI_SEND*, *FI_RECV*, *FI_READ*, *FI_WRITE*, + *FI_REMOTE_READ*, and *FI_REMOTE_WRITE* to limit the direction of + operations. + + *FI_MULTI_RECV* is supported for non-tagged message queue only. + + Scalable endpoints are supported if the underlying PSM3 library supports + multiple endpoints. This condition must be satisfied both when the + provider is built and when the provider is used. See the *Scalable + endpoints* section for more information. + + Other supported capabilities include *FI_TRIGGER*, *FI_REMOTE_CQ_DATA*, + *FI_RMA_EVENT*, *FI_SOURCE*, and *FI_SOURCE_ERR*. Furthermore, + *FI_NAMED_RX_CTX* is supported when scalable endpoints are enabled. + +Modes +: *FI_CONTEXT* is required for the *FI_TAGGED* and *FI_MSG* + capabilities. That means, any request belonging to these two + categories that generates a completion must pass as the operation + context a valid pointer to type *struct fi_context*, and the space + referenced by the pointer must remain untouched until the request + has completed. If none of *FI_TAGGED* and *FI_MSG* is asked for, + the *FI_CONTEXT* mode is not required. + +Progress +: The *psm3* provider performs optimal with manual progress. By default, the + application is expected to call *fi_cq_read* or *fi_cntr_read* function + from time to time when no other libfabric function is called to ensure + progress is made in a timely manner. The provider does support auto + progress mode. However, the performance can be significantly impacted if + the application purely depends on the provider to make auto progress. 
+ +Scalable endpoints +: Scalable endpoints support depends on the multi-EP feature of the *PSM3* + library. If the *PSM3* library supports this feature, the availability is + further controlled by an environment variable *PSM3_MULTI_EP*. The *psm3* + provider automatically sets this variable to 1 if it is not set. The + feature can be disabled explicitly by setting *PSM3_MULTI_EP* to 0. + + When creating a scalable endpoint, the exact number of contexts requested + should be set in the "fi_info" structure passed to the *fi_scalable_ep* + function. This number should be set in "fi_info->ep_attr->tx_ctx_cnt" or + "fi_info->ep_attr->rx_ctx_cnt" or both, whichever greater is used. The + *psm3* provider allocates all requested contexts upfront when the scalable + endpoint is created. The same context is used for both Tx and Rx. + + For optimal performance, it is advised to avoid having multiple threads + accessing the same context, either directly by posting send/recv/read/write + request, or indirectly by polling associated completion queues or counters. + + Using the scalable endpoint as a whole in communication functions is not + supported. Instead, individual tx context or rx context of the scalable + endpoint should be used. Similarly, using the address of the scalable + endpoint as the source address or destination address doesn't collectively + address all the tx/rx contexts. It addresses only the first tx/rx context, + instead. + +# LIMITATIONS + +The *psm3* provider doesn't support all the features defined in the +libfabric API. Here are some of the limitations not listed above: + +Unsupported features +: These features are unsupported: connection management, passive endpoint, + and shared receive context. + +# RUNTIME PARAMETERS + +The *psm3* provider checks for the following environment variables: + +*FI_PSM3_UUID* +: PSM requires that each job has a unique ID (UUID). All the processes + in the same job need to use the same UUID in order to be able to + talk to each other. The PSM reference manual advises to keep UUID + unique to each job. In practice, it generally works fine to reuse + UUID as long as (1) no two jobs with the same UUID are running at + the same time; and (2) previous jobs with the same UUID have exited + normally. If running into "resource busy" or "connection failure" + issues with unknown reason, it is advisable to manually set the UUID + to a value different from the default. + + The default UUID is 00FF00FF-0000-0000-0000-00FF0F0F00FF. + + It is possible to create endpoints with UUID different from the one + set here. To achieve that, set 'info->ep_attr->auth_key' to the uuid + value and 'info->ep_attr->auth_key_size' to its size (16 bytes) when + calling fi_endpoint() or fi_scalable_ep(). It is still true that an + endpoint can only communicate with endpoints with the same UUID. + +*FI_PSM3_NAME_SERVER* +: The *psm3* provider has a simple built-in name server that can be used + to resolve an IP address or host name into a transport address needed + by the *fi_av_insert* call. The main purpose of this name server is to + allow simple client-server type applications (such as those in *fabtests*) + to be written purely with libfabric, without using any out-of-band + communication mechanism. For such applications, the server would run first + to allow endpoints be created and registered with the name server, and + then the client would call *fi_getinfo* with the *node* parameter set to + the IP address or host name of the server. 
The resulting *fi_info* + structure would have the transport address of the endpoint created by the + server in the *dest_addr* field. Optionally the *service* parameter can + be used in addition to *node*. Notice that the *service* number is + interpreted by the provider and is not a TCP/IP port number. + + The name server is on by default. It can be turned off by setting the + variable to 0. This may save a small amount of resource since a separate + thread is created when the name server is on. + + The provider detects OpenMPI and MPICH runs and changes the default setting + to off. + +*FI_PSM3_TAGGED_RMA* +: The RMA functions are implemented on top of the PSM Active Message functions. + The Active Message functions have limit on the size of data can be transferred + in a single message. Large transfers can be divided into small chunks and + be pipe-lined. However, the bandwidth is sub-optimal by doing this way. + + The *psm3* provider use PSM tag-matching message queue functions to achieve + higher bandwidth for large size RMA. It takes advantage of the extra tag bits + available in PSM3 to separate the RMA traffic from the regular tagged message + queue. + + The option is on by default. To turn it off set the variable to 0. + +*FI_PSM3_DELAY* +: Time (seconds) to sleep before closing PSM endpoints. This is a workaround + for a bug in some versions of PSM library. + + The default setting is 0. + +*FI_PSM3_TIMEOUT* +: Timeout (seconds) for gracefully closing PSM endpoints. A forced closing + will be issued if timeout expires. + + The default setting is 5. + +*FI_PSM3_CONN_TIMEOUT* +: Timeout (seconds) for establishing connection between two PSM endpoints. + + The default setting is 5. + +*FI_PSM3_PROG_INTERVAL* +: When auto progress is enabled (asked via the hints to *fi_getinfo*), + a progress thread is created to make progress calls from time to time. + This option set the interval (microseconds) between progress calls. + + The default setting is 1 if affinity is set, or 1000 if not. See + *FI_PSM3_PROG_AFFINITY*. + +*FI_PSM3_PROG_AFFINITY* +: When set, specify the set of CPU cores to set the progress thread + affinity to. The format is + `[:[:]][,[:[:]]]*`, + where each triplet `::` defines a block of + core_ids. Both `` and `` can be either the `core_id` + (when >=0) or `core_id - num_cores` (when <0). + + By default affinity is not set. + +*FI_PSM3_INJECT_SIZE* +: Maximum message size allowed for fi_inject and fi_tinject calls. This is + an experimental feature to allow some applications to override default + inject size limitation. When the inject size is larger than the default + value, some inject calls might block. + + The default setting is 64. + +*FI_PSM3_LOCK_LEVEL* +: When set, dictate the level of locking being used by the provider. Level + 2 means all locks are enabled. Level 1 disables some locks and is suitable + for runs that limit the access to each PSM3 context to a single thread. + Level 0 disables all locks and thus is only suitable for single threaded + runs. + + To use level 0 or level 1, wait object and auto progress mode cannot be + used because they introduce internal threads that may break the conditions + needed for these levels. + + The default setting is 2. + +*FI_PSM3_LAZY_CONN* +: There are two strategies on when to establish connections between the PSM3 + endpoints that OFI endpoints are built on top of. In eager connection mode, + connections are established when addresses are inserted into the address + vector. 
In lazy connection mode, connections are established when addresses + are used the first time in communication. Eager connection mode has slightly + lower critical path overhead but lazy connection mode scales better. + + This option controls how the two connection modes are used. When set to 1, + lazy connection mode is always used. When set to 0, eager connection mode + is used when required conditions are all met and lazy connection mode is + used otherwise. The conditions for eager connection mode are: (1) multiple + endpoint (and scalable endpoint) support is disabled by explicitly setting + PSM3_MULTI_EP=0; and (2) the address vector type is FI_AV_MAP. + + The default setting is 0. + +*FI_PSM3_DISCONNECT* +: The provider has a mechanism to automatically send disconnection notifications + to all connected peers before the local endpoint is closed. As the response, + the peers call *psm3_ep_disconnect* to clean up the connection state at their + side. This allows the same PSM3 epid be used by different dynamically started + processes (clients) to communicate with the same peer (server). This mechanism, + however, introduce extra overhead to the finalization phase. For applications + that never reuse epids within the same session such overhead is unnecessary. + + This option controls whether the automatic disconnection notification mechanism + should be enabled. For client-server application mentioned above, the client + side should set this option to 1, but the server should set it to 0. + + The default setting is 0. + +*FI_PSM3_TAG_LAYOUT* +: Select how the 96-bit PSM3 tag bits are organized. Currently three choices are + available: *tag60* means 32-4-60 partitioning for CQ data, internal protocol + flags, and application tag. *tag64* means 4-28-64 partitioning for internal + protocol flags, CQ data, and application tag. *auto* means to choose either + *tag60* or *tag64* based on the hints passed to fi_getinfo -- *tag60* is used + if remote CQ data support is requested explicitly, either by passing non-zero value + via *hints->domain_attr->cq_data_size* or by including *FI_REMOTE_CQ_DATA* in + *hints->caps*, otherwise *tag64* is used. If *tag64* is the result of automatic + selection, *fi_getinfo* also returns a second instance of the provider with + *tag60* layout. + + The default setting is *auto*. + + Notice that if the provider is compiled with macro *PSMX3_TAG_LAYOUT* defined + to 1 (means *tag60*) or 2 (means *tag64*), the choice is fixed at compile time + and this runtime option will be disabled. + +# SEE ALSO + +[`fabric`(7)](fabric.7.html), +[`fi_provider`(7)](fi_provider.7.html), +[`fi_psm`(7)](fi_psm.7.html), +[`fi_psm2`(7)](fi_psm2.7.html), diff --git a/man/fi_rma.3.md b/man/fi_rma.3.md index 38f684f0e10..156780834c3 100644 --- a/man/fi_rma.3.md +++ b/man/fi_rma.3.md @@ -12,7 +12,7 @@ fi_rma - Remote memory access operations fi_read / fi_readv / fi_readmsg : Initiates a read from remote memory -fi_write / fi_writev / fi_writemsg +fi_write / fi_writev / fi_writemsg fi_inject_write / fi_writedata : Initiate a write to remote memory @@ -82,6 +82,7 @@ ssize_t fi_inject_writedata(struct fid_ep *ep, const void *buf, size_t len, *desc* : Descriptor associated with the local data buffer + See [`fi_mr`(3)](fi_mr.3.html). *data* : Remote CQ data to transfer with the operation. @@ -140,11 +141,7 @@ may be delivered. ## fi_write The call fi_write transfers the data contained in the user-specified -data buffer to a remote memory region. 
The local endpoint must be -connected to a remote endpoint or destination before fi_write is -called. Unless the endpoint has been configured differently, the data -buffer passed into fi_write must not be touched by the application -until the fi_write call completes asynchronously. +data buffer to a remote memory region. ## fi_writev @@ -155,7 +152,7 @@ referenced by the iov parameter to the remote memory region. ## fi_writemsg The fi_writemsg call supports data transfers over both connected and -unconnected endpoints, with the ability to control the write operation +connectionless endpoints, with the ability to control the write operation per call through the use of flags. The fi_writemsg function takes a struct fi_msg_rma as input. @@ -198,9 +195,7 @@ transfer. ## fi_read The fi_read call requests that the remote endpoint transfer data from -the remote memory region into the local data buffer. The local -endpoint must be connected to a remote endpoint or destination before -fi_read is called. +the remote memory region into the local data buffer. ## fi_readv @@ -211,7 +206,7 @@ the set of data buffers referenced by the iov parameter. ## fi_readmsg The fi_readmsg call supports data transfers over both connected and -unconnected endpoints, with the ability to control the read operation +connectionless endpoints, with the ability to control the read operation per call through the use of flags. The fi_readmsg function takes a struct fi_msg_rma as input. @@ -253,7 +248,7 @@ fi_writemsg. *FI_INJECT_COMPLETE* : Applies to fi_writemsg. Indicates that a completion should be generated when the source buffer(s) may be reused. - + *FI_TRANSMIT_COMPLETE* : Applies to fi_writemsg. Indicates that a completion should not be generated until the operation has been successfully transmitted and @@ -275,7 +270,7 @@ fi_writemsg. targeting the same peer endpoint have completed. Operations posted after the fencing will see and/or replace the results of any operations initiated prior to the fenced operation. - + The ordering of operations starting at the posting of the fenced operation (inclusive) to the posting of a subsequent fenced operation (exclusive) is controlled by the endpoint's ordering semantics. diff --git a/man/fi_rstream.7.md b/man/fi_rstream.7.md index 9b58e60fdcd..ee77ddeb3c1 100644 --- a/man/fi_rstream.7.md +++ b/man/fi_rstream.7.md @@ -42,8 +42,8 @@ supported: *fi_msg*. : The provider supports FI_THREAD_SAFE *Verbs-iWarp* -: The provider has added features to enable iWarp. To use this feature, the ep protocol - IWARP must be requested in a getinfo call. +: The provider has added features to enable iWarp. To use this feature, the + ep protocol iWarp must be requested in an fi_getinfo call. # LIMITATIONS @@ -54,8 +54,6 @@ The rstream provider is experimental and lacks performance validation and protocol. There are default settings that limit the message stream (provider memory region size and CQ size). These can be modified by fi_setopt. - - # SETTINGS The *rstream* provider settings can be modified via fi_setopt on the @@ -76,8 +74,8 @@ The *rstream* provider settings can be modified via fi_setopt on the # OFI EXTENSIONS The rstream provider has extended the current OFI API set in order to enable a - user implemenation of Poll. Specifically sendmsg(FI_PEEK) is supported which replicates - the behavior of the recvmsg(FI_PEEK) feature. + user implementation of Poll. Specifically sendmsg(FI_PEEK) is supported + which replicates the behavior of the recvmsg(FI_PEEK) feature. 
# SEE ALSO diff --git a/man/fi_rxm.7.md b/man/fi_rxm.7.md index 52798612c68..5d6190f2df2 100644 --- a/man/fi_rxm.7.md +++ b/man/fi_rxm.7.md @@ -131,14 +131,23 @@ The ofi_rxm provider checks for the following environment variables. : Defines the maximum number of MSG provider CQ entries (default: 1) that would be read per progress (RxM CQ read). +*FI_OFI_RXM_ENABLE_DYN_RBUF* +: Enables support for dynamic receive buffering, if available by the message + endpoint provider. This feature allows direct placement of received + message data into application buffers, bypassing RxM bounce buffers. + This feature targets providers that provide internal network buffering, + such as the tcp provider. (default: false) + *FI_OFI_RXM_SAR_LIMIT* : Set this environment variable to control the RxM SAR (Segmentation And Reassembly) - protocol. Messages of size greater than this (default: 256 Kb) would be transmitted + protocol. Messages of size greater than this (default: 128 Kb) would be transmitted via rendezvous protocol. *FI_OFI_RXM_USE_SRX* -: Set this to 1 to use shared receive context from MSG provider. This reduces - overall memory usage but there may be a slight increase in latency (default: 0). +: Set this to 1 to use shared receive context from MSG provider, or 0 to + disable using shared receive context. Shared receive contexts reduce overall + memory usage, but may increase in message latency. If not set, verbs will + not use shared receive contexts by default, but the tcp provider will. *FI_OFI_RXM_TX_SIZE* : Defines default TX context size (default: 1024) @@ -158,9 +167,14 @@ with (default: 256). *FI_OFI_RXM_CM_PROGRESS_INTERVAL* : Defines the duration of time in microseconds between calls to RxM CM progression - functions when using manual progress. Higher values may provide less noise for + functions when using manual progress. Higher values may provide less noise for calls to fi_cq read functions, but may increase connection setup time (default: 10000) +*FI_OFI_RXM_CQ_EQ_FAIRNESS* +: Defines the maximum number of message provider CQ entries that can be + consecutively read across progress calls without checking to see if the + CM progress interval has been reached (default: 128) + # Tuning ## Bandwidth @@ -196,7 +210,6 @@ of memory. The workaround is to use shared receive contexts for the MSG provider (FI_OFI_RXM_USE_SRX=1) or reduce eager message size (FI_OFI_RXM_BUFFER_SIZE) and MSG provider TX/RX queue sizes (FI_OFI_RXM_MSG_TX_SIZE / FI_OFI_RXM_MSG_RX_SIZE). - # SEE ALSO [`fabric`(7)](fabric.7.html), diff --git a/man/fi_shm.7.md b/man/fi_shm.7.md index 242a3ebed55..8e00a5907bf 100644 --- a/man/fi_shm.7.md +++ b/man/fi_shm.7.md @@ -74,7 +74,7 @@ of operations. provided (and in the case of setting the src address without FI_SOURCE and no hints), the process ID will be used as a default address. On endpoint creation, if the src_addr has the "fi_shm://" prefix, the provider - will append ":[dom_idx]:[ep_idx]" as a unique endpoint name (essentially, + will append ":[uid]:[dom_idx]:[ep_idx]" as a unique endpoint name (essentially, in place of a service). In the case of the "fi_ns://" prefix (or any other prefix if one was provided by the application), no supplemental information is required to make it unique and it will remain with only the @@ -109,7 +109,18 @@ No support for counters. # RUNTIME PARAMETERS -No runtime parameters are currently defined. 
+The *shm* provider checks for the following environment variables: + +*FI_SHM_SAR_THRESHOLD* +: Maximum message size to use segmentation protocol before switching + to mmap (only valid when CMA is not available). Default: SIZE_MAX + (18446744073709551615) + +*FI_SHM_TX_SIZE* +: Maximum number of outstanding tx operations. Default 1024 + +*FI_SHM_RX_SIZE* +: Maximum number of outstanding rx operations. Default 1024 # SEE ALSO diff --git a/man/fi_tagged.3.md b/man/fi_tagged.3.md index d1b1371e62d..badcfce252e 100644 --- a/man/fi_tagged.3.md +++ b/man/fi_tagged.3.md @@ -75,7 +75,8 @@ ssize_t fi_tinjectdata(struct fid_ep *ep, const void *buf, size_t len, : Mask of bits to ignore applied to the tag for receive operations. *desc* -: Memory descriptor associated with the data buffer +: Memory descriptor associated with the data buffer. + See [`fi_mr`(3)](fi_mr.3.html). *data* : Remote CQ data to transfer with the sent data. @@ -164,7 +165,7 @@ message. ## fi_tsendmsg The fi_tsendmsg call supports data transfers over both connected and -unconnected endpoints, with the ability to control the send operation +connectionless endpoints, with the ability to control the send operation per call through the use of flags. The fi_tsendmsg function takes a struct fi_msg_tagged as input. @@ -215,7 +216,7 @@ parameter to a receive incoming data. ## fi_trecvmsg The fi_trecvmsg call supports posting buffers over both connected and -unconnected endpoints, with the ability to control the receive +connectionless endpoints, with the ability to control the receive operation per call through the use of flags. The fi_trecvmsg function takes a struct fi_msg_tagged as input. @@ -257,7 +258,7 @@ and/or fi_tsendmsg. *FI_INJECT_COMPLETE* : Applies to fi_tsendmsg. Indicates that a completion should be generated when the source buffer(s) may be reused. - + *FI_TRANSMIT_COMPLETE* : Applies to fi_tsendmsg. Indicates that a completion should not be generated until the operation has been successfully transmitted and @@ -275,7 +276,7 @@ and/or fi_tsendmsg. targeting the same peer endpoint have completed. Operations posted after the fencing will see and/or replace the results of any operations initiated prior to the fenced operation. - + The ordering of operations starting at the posting of the fenced operation (inclusive) to the posting of a subsequent fenced operation (exclusive) is controlled by the endpoint's ordering semantics. @@ -288,7 +289,7 @@ The following flags may be used with fi_trecvmsg. allocated buffering enabled (see fi_rx_attr total_buffered_recv). Unlike standard receive operations, a receive operation with the FI_PEEK flag set does not remain queued with the provider after the peek completes - successfully. The peek operation operates asynchronously, and the results + successfully. The peek operation operates asynchronously, and the results of the peek operation are available in the completion queue associated with the endpoint. If no message is found matching the tags specified in the peek request, then a completion queue error entry with err field set to FI_ENOMSG @@ -367,7 +368,7 @@ receiving endpoint. For discussion purposes, the completion queue is assumed to be configured for FI_CQ_FORMAT_TAGGED. The op_context field will point to a struct -fi_recv_contex. +fi_recv_context. 
{% highlight c %} struct fi_recv_context { diff --git a/man/fi_tcp.7.md b/man/fi_tcp.7.md index 87a417b57de..4eb3e44aa14 100644 --- a/man/fi_tcp.7.md +++ b/man/fi_tcp.7.md @@ -23,10 +23,11 @@ The following features are supported *Endpoint types* : *FI_EP_MSG* is the only supported endpoint type. Reliable -datagram endpoint over TCP sockets can be achieved by layering RxM over -tcp provider. + datagram endpoint over TCP sockets can be achieved by layering RxM over + tcp provider. -: *FI_EP_RDM* is supported by layering ofi_rxm provider on top of the tcp provider. +: *FI_EP_RDM* is supported by layering ofi_rxm provider on top of the + tcp provider. *Endpoint capabilities* : The tcp provider currently supports *FI_MSG*, *FI_RMA* @@ -40,7 +41,6 @@ tcp provider. *Multi recv buffers* : The tcp provider supports multi recv buffers - # RUNTIME PARAMETERS The tcp provider check for the following enviroment variables - @@ -49,14 +49,16 @@ The tcp provider check for the following enviroment variables - : A specific can be requested with this variable *FI_TCP_PORT_LOW_RANGE/FI_TCP_PORT_HIGH_RANGE* -: These variables are used to set the range of ports to be used by the tcp provider for its passive endpoint creation. This is useful where only a range of ports are allowed by firewall for tcp connections. - +: These variables are used to set the range of ports to be used by the + tcp provider for its passive endpoint creation. This is useful where + only a range of ports are allowed by firewall for tcp connections. # LIMITATIONS -tcp provider is implemented over TCP sockets to emulate libfabric API. Hence -the performance is lower than what an application might see implementing to -sockets directly. +The tcp provider is implemented over TCP sockets to emulate libfabric API. +Hence the performance may be lower than what an application might see +implementing to sockets directly, depending on the types of data transfers +the application is trying to achieve. # SEE ALSO diff --git a/man/fi_trigger.3.md b/man/fi_trigger.3.md index 5623e2539d3..637d0a894e2 100644 --- a/man/fi_trigger.3.md +++ b/man/fi_trigger.3.md @@ -94,12 +94,10 @@ struct fi_trigger_threshold { they will be triggered in the order in which they were submitted to the endpoint. -# EXPERIMENTAL DEFERRED WORK QUEUES +# DEFERRED WORK QUEUES The following feature and description are enhancements to triggered -operation support, but should be considered experimental. Until the -experimental tag is removed, the interfaces, semantics, and data -structures defined below may change between library versions. +operation support. The deferred work queue interface is designed as primitive constructs that can be used to implement application-level collective operations. diff --git a/man/fi_verbs.7.md b/man/fi_verbs.7.md index 4c239bea4b9..595d3dca196 100644 --- a/man/fi_verbs.7.md +++ b/man/fi_verbs.7.md @@ -153,6 +153,8 @@ The support for fork in the provider has the following limitations: ### XRC Transport The XRC transport is intended to be used when layered with the RXM provider and requires the use of shared receive contexts. See [`fi_rxm`(7)](fi_rxm.7.thml). +To enable XRC, the following environment variables must usually be set: +FI_VERBS_PREFER_XRC and FI_OFI_RXM_USE_SRX. # RUNTIME PARAMETERS @@ -167,39 +169,50 @@ The verbs provider checks for the following environment variables. : Default maximum rx context size (default: 384) *FI_VERBS_TX_IOV_LIMIT* -: Default maximum tx iov_limit (default: 4). 
Note: RDM (internal - deprecated) EP type supports only 1 +: Default maximum tx iov_limit (default: 4). Note: RDM (internal - + deprecated) EP type supports only 1 *FI_VERBS_RX_IOV_LIMIT* -: Default maximum rx iov_limit (default: 4). Note: RDM (internal - deprecated) EP type supports only 1 +: Default maximum rx iov_limit (default: 4). Note: RDM (internal - + deprecated) EP type supports only 1 *FI_VERBS_INLINE_SIZE* -: Default maximum inline size. Actual inject size returned in fi_info may be greater (default: 64) +: Default maximum inline size. Actual inject size returned in fi_info + may be greater (default: 64) *FI_VERBS_MIN_RNR_TIMER* : Set min_rnr_timer QP attribute (0 - 31) (default: 12) *FI_VERBS_CQREAD_BUNCH_SIZE* -: The number of entries to be read from the verbs completion queue at a time (default: 8). +: The number of entries to be read from the verbs completion queue + at a time (default: 8). + +*FI_VERBS_PREFER_XRC* +: Prioritize XRC transport fi_info before RC transport fi_info (default: + 0, RC fi_info will be before XRC fi_info) + +*FI_VERBS_GID_IDX* +: The GID index to use (default: 0) + +*FI_VERBS_DEVICE_NAME* +: Specify a specific verbs device to use by name + +### Variables specific to MSG endpoints *FI_VERBS_IFACE* : The prefix or the full name of the network interface associated with the verbs device (default: ib) -*FI_VERBS_PREFER_XRC* -: Prioritize XRC transport fi_info before RC transport fi_info (default: 0, RC fi_info will be before XRC fi_info) - ### Variables specific to DGRAM endpoints *FI_VERBS_DGRAM_USE_NAME_SERVER* -: The option that enables/disables OFI Name Server thread. The NS thread is used to - resolve IP-addresses to provider specific addresses (default: 1, if "OMPI_COMM_WORLD_RANK" - and "PMI_RANK" environment variables aren't defined) +: The option that enables/disables OFI Name Server thread. The NS thread is + used to resolve IP-addresses to provider specific addresses (default: 1, + if "OMPI_COMM_WORLD_RANK" and "PMI_RANK" environment variables aren't defined) *FI_VERBS_NAME_SERVER_PORT* -: The port on which Name Server thread listens incoming connections and requests (default: 5678) - -*FI_VERBS_GID_IDX* -: The GID index to use (default: 0) +: The port on which Name Server thread listens incoming connections and + requests (default: 5678) ### Environment variables notes The fi_info utility would give the up-to-date information on environment variables: @@ -212,7 +225,7 @@ fi_info -p verbs -e - Set FI_LOG_LEVEL=info or FI_LOG_LEVEL=debug (if debug build of libfabric is available) and check if there any errors because of incorrect input parameters to fi_getinfo. -- Check if "fi_info -p verbs" is successful. If that fails the following chkecklist +- Check if "fi_info -p verbs" is successful. If that fails the following checklist may help in ensuring that the RDMA verbs stack is functional: - If libfabric was compiled, check if verbs provider was built. Building verbs provider would be skipped if its dependencies (listed in requirements) aren't diff --git a/man/man1/fi_info.1 b/man/man1/fi_info.1 index f878e474237..90c75c14dbe 100644 --- a/man/man1/fi_info.1 +++ b/man/man1/fi_info.1 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_info" "1" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_info" "1" "2020\-01\-30" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -84,11 +84,17 @@ Filter interfaces to only those with the given fabric name. 
.SS Discovery .TP .B \f[I]\-e, \-\-env\f[] -List libfabric related environment levels which can be used to enable +List libfabric related environment variables which can be used to enable extra configuration or tuning. .RS .RE .TP +.B *\-g [filter] +Same as \-e option, with output limited to environment variables +containing filter as a substring. +.RS +.RE +.TP .B \f[I]\-l, \-\-list\f[] List available libfabric providers. .RS diff --git a/man/man3/fi_atomic.3 b/man/man3/fi_atomic.3 index 36ed67a9198..3570a1875c5 100644 --- a/man/man3/fi_atomic.3 +++ b/man/man3/fi_atomic.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_atomic" "3" "2019\-07\-17" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_atomic" "3" "2020\-10\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -155,6 +155,7 @@ Local data buffer to store initial value of remote buffer .B \f[I]desc / compare_desc / result_desc\f[] Data descriptor associated with the local data buffer, local compare buffer, and local result buffer, respectively. +See \f[C]fi_mr\f[](3). .RS .RE .TP @@ -522,8 +523,8 @@ Otherwise, they perform the same general function. .PP The call fi_atomic transfers the data contained in the user\-specified data buffer to a remote node. -For unconnected endpoints, the destination endpoint is specified through -the dest_addr parameter. +For connectionless endpoints, the destination endpoint is specified +through the dest_addr parameter. Unless the endpoint has been configured differently, the data buffer passed into fi_atomic must not be touched by the application until the fi_atomic call completes asynchronously. @@ -548,8 +549,8 @@ The requested message size that can be used with fi_inject_atomic is limited by inject_size. .PP The fi_atomicmsg call supports atomic functions over both connected and -unconnected endpoints, with the ability to control the atomic operation -per call through the use of flags. +connectionless endpoints, with the ability to control the atomic +operation per call through the use of flags. The fi_atomicmsg function takes a struct fi_msg_atomic as input. 
.IP .nf diff --git a/man/man3/fi_av_set.3 b/man/man3/fi_av_set.3 index 8ba0dcb5a8a..51548a6e626 100644 --- a/man/man3/fi_av_set.3 +++ b/man/man3/fi_av_set.3 @@ -1,23 +1,65 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_av_set" "3" "2019\-07\-17" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_av_set" "3" "2020\-03\-20" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP fi_av_set \- Address vector set operations .TP -.B fi_av_open / fi_close -Open or close an address vector +.B fi_av_set / fi_close +Open or close an address vector set +.RS +.RE +.TP +.B fi_av_set_union +Perform a set union operation on two AV sets +.RS +.RE +.TP +.B fi_av_set_intersect +Perform a set intersect operation on two AV sets +.RS +.RE +.TP +.B fi_av_set_diff +Perform a set difference operation on two AV sets +.RS +.RE +.TP +.B fi_av_set_insert +Add an address to an AV set +.RS +.RE +.TP +.B fi_av_set_remove +Remove an address from an AV set +.RS +.RE +.TP +.B fi_av_set_addr +Obtain a collective address for current addresses in an AV set .RS .RE .SH SYNOPSIS .IP .nf \f[C] -#include\ +#include\ + +int\ fi_av_set(struct\ fid_av\ *av,\ struct\ fi_av_set_attr\ *attr, +\ \ \ \ \ \ struct\ fid_av_set\ **set,\ void\ *\ context); + +int\ fi_av_set_union(struct\ fid_av_set\ *dst,\ const\ struct\ fid_av_set\ *src); + +int\ fi_av_set_intersect(struct\ fid_av_set\ *dst,\ const\ struct\ fid_av_set\ *src); -int\ fi_av_open(struct\ fid_domain\ *domain,\ struct\ fi_av_attr\ *attr, -\ \ \ \ struct\ fid_av\ **av,\ void\ *context); +int\ fi_av_set_diff(struct\ fid_av_set\ *dst,\ const\ struct\ fid_av_set\ *src); + +int\ fi_av_set_insert(struct\ fid_av_set\ *set,\ fi_addr_t\ addr); + +int\ fi_av_set_remove(struct\ fid_av_set\ *set,\ fi_addr_t\ addr); + +int\ fi_av_set_addr(struct\ fid_av_set\ *set,\ fi_addr_t\ *coll_addr); int\ fi_close(struct\ fid\ *av_set); \f[] @@ -29,6 +71,21 @@ Address vector .RS .RE .TP +.B \f[I]set\f[] +Address vector set +.RS +.RE +.TP +.B \f[I]dst\f[] +Address vector set updated by set operation +.RS +.RE +.TP +.B \f[I]src\f[] +Address vector set providing input to a set operation +.RS +.RE +.TP .B \f[I]attr\f[] Address vector set attributes .RS @@ -43,6 +100,16 @@ User specified context associated with the address vector set Additional flags to apply to the operation. .RS .RE +.TP +.B \f[I]addr\f[] +Destination address to insert to remove from AV set. +.RS +.RE +.TP +.B \f[I]coll_addr\f[] +Address identifying collective group. +.RS +.RE .SH DESCRIPTION .PP An address vector set (AV set) represents an ordered subset of addresses @@ -87,6 +154,8 @@ struct\ fi_av_set_attr\ { Indicates the expected the number of members that will be a part of the AV set. The provider uses this to optimize resource allocations. +If count is 0, the provider will select a size based on available system +configuration data or underlying limitations. .RS .RE .TP @@ -103,6 +172,9 @@ an empty AV set, a communication key is being provided, or the AV is of type FI_AV_MAP. .RS .RE +.PP +The number of addresses between start_addr and end_addr must be less +than or equal to the specified count value. .TP .B \f[I]stride\f[] The number of entries between successive addresses included in the AV @@ -164,6 +236,22 @@ AV set. The AV set remove call removes the specified address from the given AV set. The order of the remaining addresses in the AV set is unchanged. +.SS fi_av_set_addr +.PP +Returns an address that may be used to communicate with all current +members of an AV set. 
+This is a local operation only that does not involve network +communication. +The returned address may be used as input into fi_join_collective. +Note that attempting to use the address returned from fi_av_set_addr +(e.g. +passing it to fi_join_collective) while simultaneously modifying the +addresses stored in an AV set results in undefined behavior. +.SS fi_close +.PP +Closes an AV set and releases all resources associated with it. +Any operations active at the time an AV set is closed will be aborted, +with the result of the collective undefined. .SH NOTES .PP Developers who are familiar with MPI will find that AV sets are similar diff --git a/man/man3/fi_cntr.3 b/man/man3/fi_cntr.3 index 0633c4992f6..e9587184445 100644 --- a/man/man3/fi_cntr.3 +++ b/man/man3/fi_cntr.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_cntr" "3" "2019\-02\-04" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_cntr" "3" "2019\-12\-13" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -162,7 +162,7 @@ Users may use fi_control to retrieve the underlying wait object associated with a counter, in order to use it in other system calls. The following values may be used to specify the type of wait object associated with a counter: FI_WAIT_NONE, FI_WAIT_UNSPEC, FI_WAIT_SET, -FI_WAIT_FD, and FI_WAIT_MUTEX_COND. +FI_WAIT_FD, FI_WAIT_MUTEX_COND, and FI_WAIT_YIELD. The default is FI_WAIT_NONE. .RS .RE @@ -208,6 +208,13 @@ as a wait object. .RS .RE .TP +.B \- \f[I]FI_WAIT_YIELD\f[] +Indicates that the counter will wait without a wait object but instead +yield on every wait. +Allows usage of fi_cntr_wait through a spin. +.RS +.RE +.TP .B \f[I]wait_set\f[] If wait_obj is FI_WAIT_SET, this field references a wait object to which the event counter should attach. diff --git a/man/man3/fi_collective.3 b/man/man3/fi_collective.3 index ba4bfb1e24d..3d000b09064 100644 --- a/man/man3/fi_collective.3 +++ b/man/man3/fi_collective.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_collective" "3" "2019\-07\-17" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_collective" "3" "2020\-04\-13" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .TP @@ -16,7 +16,12 @@ the barrier call. .RE .TP .B fi_broadcast -A single sender transmits data to all receiver peers. +A single sender transmits data to all peers, including itself. +.RS +.RE +.TP +.B fi_alltoall +Each peer distributes a slice of its local data to all peers. .RS .RE .TP @@ -26,6 +31,11 @@ all other peers. .RS .RE .TP +.B fi_allgather +Each peer sends a complete copy of its local data to all peers. +.RS +.RE +.TP .B fi_reduce_scatter Collective call where data is collected from all peers and merged (reduced). @@ -34,13 +44,20 @@ peer receiving a slice of the results. .RS .RE .TP -.B fi_alltoall -Each peer distributes a slice of its local data to all peers. +.B fi_reduce +Collective call where data is collected from all peers to a root peer +and merged (reduced). .RS .RE .TP -.B fi_allgather -Each peer sends a complete copy of its local data to all peers. +.B fi_scatter +A single sender distributes (scatters) a slice of its local data to all +peers. +.RS +.RE +.TP +.B fi_gather +All peers send their data to a root peer. 
.RS .RE .TP @@ -63,32 +80,46 @@ ssize_t\ fi_barrier(struct\ fid_ep\ *ep,\ fi_addr_t\ coll_addr, \ \ \ \ void\ *context); ssize_t\ fi_broadcast(struct\ fid_ep\ *ep,\ void\ *buf,\ size_t\ count,\ void\ *desc, -\ \ \ \ fi_addr_t\ coll_addr,\ enum\ fi_datatype\ datatype,\ enum\ fi_op\ op, +\ \ \ \ fi_addr_t\ coll_addr,\ fi_addr_t\ root_addr,\ enum\ fi_datatype\ datatype, \ \ \ \ uint64_t\ flags,\ void\ *context); -ssize_t\ fi_allreduce(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count, +ssize_t\ fi_alltoall(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count, \ \ \ \ void\ *desc,\ void\ *result,\ void\ *result_desc, -\ \ \ \ fi_addr_t\ coll_addr,\ enum\ fi_datatype\ datatype,\ enum\ fi_op\ op, +\ \ \ \ fi_addr_t\ coll_addr,\ enum\ fi_datatype\ datatype, \ \ \ \ uint64_t\ flags,\ void\ *context); -ssize_t\ fi_reduce_scatter(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count, +ssize_t\ fi_allreduce(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count, \ \ \ \ void\ *desc,\ void\ *result,\ void\ *result_desc, \ \ \ \ fi_addr_t\ coll_addr,\ enum\ fi_datatype\ datatype,\ enum\ fi_op\ op, \ \ \ \ uint64_t\ flags,\ void\ *context); -ssize_t\ fi_alltoall(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count, +ssize_t\ fi_allgather(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count, \ \ \ \ void\ *desc,\ void\ *result,\ void\ *result_desc, \ \ \ \ fi_addr_t\ coll_addr,\ enum\ fi_datatype\ datatype, \ \ \ \ uint64_t\ flags,\ void\ *context); -ssize_t\ fi_allgather(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count, +ssize_t\ fi_reduce_scatter(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count, \ \ \ \ void\ *desc,\ void\ *result,\ void\ *result_desc, -\ \ \ \ fi_addr_t\ coll_addr,\ enum\ fi_datatype\ datatype, +\ \ \ \ fi_addr_t\ coll_addr,\ enum\ fi_datatype\ datatype,\ enum\ fi_op\ op, +\ \ \ \ uint64_t\ flags,\ void\ *context); + +ssize_t\ fi_reduce(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count, +\ \ \ \ void\ *desc,\ void\ *result,\ void\ *result_desc,\ fi_addr_t\ coll_addr, +\ \ \ \ fi_addr_t\ root_addr,\ enum\ fi_datatype\ datatype,\ enum\ fi_op\ op, +\ \ \ \ uint64_t\ flags,\ void\ *context); + +ssize_t\ fi_scatter(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count, +\ \ \ \ void\ *desc,\ void\ *result,\ void\ *result_desc,\ fi_addr_t\ coll_addr, +\ \ \ \ fi_addr_t\ root_addr,\ enum\ fi_datatype\ datatype, +\ \ \ \ uint64_t\ flags,\ void\ *context); + +ssize_t\ fi_gather(struct\ fid_ep\ *ep,\ const\ void\ *buf,\ size_t\ count, +\ \ \ \ void\ *desc,\ void\ *result,\ void\ *result_desc,\ fi_addr_t\ coll_addr, +\ \ \ \ fi_addr_t\ root_addr,\ enum\ fi_datatype\ datatype, \ \ \ \ uint64_t\ flags,\ void\ *context); int\ fi_query_collective(struct\ fid_domain\ *domain, -\ \ \ \ enum\ fi_datatype\ datatype,\ enum\ fi_op\ op, -\ \ \ \ struct\ fi_collective_attr\ *attr,\ uint64_t\ flags); +\ \ \ \ fi_collective_op\ coll,\ struct\ fi_collective_attr\ *attr,\ uint64_t\ flags); \f[] .fi .SH ARGUMENTS @@ -139,6 +170,11 @@ Address referring to the collective group of endpoints. .RS .RE .TP +.B \f[I]root_addr\f[] +Single endpoint that is the source or destination of collective data. +.RS +.RE +.TP .B \f[I]flags\f[] Additional flags to apply for the atomic operation .RS @@ -151,7 +187,13 @@ successful completion, unless an op flag specifies the context parameter be used for required input. .RS .RE -.SH DESCRIPTION +.SH DESCRIPTION (EXPERIMENTAL APIs) +.PP +The collective APIs are new to the 1.9 libfabric release. 
+Although, efforts have been made to design the APIs such that they align +well with applications and are implementable by the providers, the APIs +should be considered experimental and may be subject to change in future +versions of the library until the experimental tag has been removed. .PP In general collective operations can be thought of as coordinated atomic operations between a set of peer endpoints. @@ -214,7 +256,6 @@ provided that is used as the target address when invoking a collective operation. .PP For developer convenience, a set of collective APIs are defined. -However, these are inline wrappers around the atomic interfaces. Collective APIs differ from message and RMA interfaces in that the format of the data is known to the provider, and the collective may perform an operation on that data. @@ -251,13 +292,12 @@ Application managed collective memberships are an exception. With application managed memberships, the fi_join_collective call may be completed locally without fabric communication. For provider managed memberships, the join collective call requires as -input a coll_addr that refers to an existing collective group. +input a coll_addr that refers to either an address associated with an AV +set (see fi_av_set_addr) or an existing collective group (obtained +through a previous call to fi_join_collective). The fi_join_collective call will create a new collective subgroup. -If there is no existing collective group (e.g. -this is the first group being created), or if application managed -memberships are used, coll_addr should be set to FI_ADDR_UNAVAIL. -For provider managed memberships, this will result in using all entries -in the associated AV as the base. +If application managed memberships are used, coll_addr should be set to +FI_ADDR_UNAVAIL. .PP Applications must call fi_close on the collective group to disconnect the endpoint from the group. @@ -277,11 +317,8 @@ completed prior to them calling barrier has finished. .PP fi_broadcast transfers an array of data from a single sender to all other members of the collective group. -The sender of the broadcast data must specify the FI_SEND flag, while -receivers use the FI_RECV flag. -The input buf parameter is treated as either the transmit buffer, if -FI_SEND is set, or the receive buffer, if FI_RECV is set. -Either the FI_SEND or FI_RECV flag must be set. +The input buf parameter is treated as the transmit buffer if the local +rank is the root, otherwise it is the receive buffer. The broadcast operation acts as an atomic write or read to a data array. As a result, the format of the data in buf is specified through the datatype parameter. @@ -300,6 +337,33 @@ transfer an array of integers to a group of peers. \ broadcast \f[] .fi +.SS All to All (fi_alltoall) +.PP +The fi_alltoall collective involves distributing (or scattering) +different portions of an array of data to peers. +It is best explained using an example. +Here three peers perform an all to all collective to exchange different +entries in an integer array. +.IP +.nf +\f[C] +[1]\ \ \ [2]\ \ \ [3] +[5]\ \ \ [6]\ \ \ [7] +[9]\ \ [10]\ \ [11] +\ \ \ \\\ \ \ |\ \ \ / +\ \ \ All\ to\ all +\ \ \ /\ \ \ |\ \ \ \\ +[1]\ \ \ [5]\ \ \ [9] +[2]\ \ \ [6]\ \ [10] +[3]\ \ \ [7]\ \ [11] +\f[] +.fi +.PP +Each peer sends a piece of its data to the other peers. +.PP +All to all operations may be performed on any non\-void datatype. +However, all to all does not perform an operation on the data itself, so +no operation is specified. 
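+.PP
+As a brief illustration only (not part of the formal interface
+description), the following sketch shows one way an application might
+invoke fi_alltoall once an endpoint has joined a collective group.
+The ep and coll_addr values are assumed to come from prior setup
+(fi_endpoint, fi_join_collective, and its completion event), the group
+size of three mirrors the diagram above, and the count is assumed to be
+the total number of elements in the local buffer, one destined for each
+member.
+.IP
+.nf
+\f[C]
+#include <stdint.h>
+#include <rdma/fabric.h>
+#include <rdma/fi_collective.h>
+
+#define GROUP_SIZE 3  /* illustrative; matches the three-peer diagram */
+
+/* ep and coll_addr are assumed to exist from earlier initialization
+ * and a completed fi_join_collective. */
+static int do_alltoall(struct fid_ep *ep, fi_addr_t coll_addr)
+{
+    uint64_t send_data[GROUP_SIZE]; /* one element destined for each peer */
+    uint64_t recv_data[GROUP_SIZE]; /* one element received from each peer */
+    struct fi_context ctx;
+    ssize_t ret;
+    int i;
+
+    for (i = 0; i < GROUP_SIZE; i++)
+        send_data[i] = (uint64_t) i;  /* value intended for peer i */
+
+    ret = fi_alltoall(ep, send_data, GROUP_SIZE, NULL, recv_data, NULL,
+                      coll_addr, FI_UINT64, 0, &ctx);
+    if (ret)
+        return (int) ret;  /* -FI_EAGAIN means the request should be retried */
+
+    /* Wait for a completion on the endpoint's bound CQ before reusing
+     * send_data or recv_data. */
+    return 0;
+}
+\f[]
+.fi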
.SS All Reduce (fi_allreduce) .PP fi_allreduce can be described as all peers providing input into an @@ -331,30 +395,27 @@ involving summing an array of integers between three peers. \ \ All\ Reduce \f[] .fi -.SS All to All (fi_alltoall) +.SS All Gather (fi_allgather) .PP -The fi_alltoall collective involves distributing (or scattering) -different portions of an array of data to peers. -It is best explained using an example. -Here three peers perform an all to all collective to exchange different -entries in an integer array. +Conceptually, all gather can be viewed as the opposite of the scatter +component from reduce\-scatter. +All gather collects data from all peers into a single array, then copies +that array back to each peer. .IP .nf \f[C] -[1]\ \ \ [2]\ \ \ [3] -[5]\ \ \ [6]\ \ \ [7] -[9]\ \ [10]\ \ [11] -\ \ \ \\\ \ \ |\ \ \ / -\ \ \ All\ to\ all -\ \ \ /\ \ \ |\ \ \ \\ -[1]\ \ \ [5]\ \ \ [9] -[5]\ \ \ [6]\ \ \ [7] -[9]\ \ [10]\ \ [11] +[1]\ \ [5]\ \ [9] +\ \ \\\ \ \ |\ \ \ / +\ All\ gather +\ \ /\ \ \ |\ \ \ \\ +[1]\ \ [1]\ \ [1] +[5]\ \ [5]\ \ [5] +[9]\ \ [9]\ \ [9] \f[] .fi .PP -All to all operations may be performed on any non\-void datatype. -However, all to all does not perform an operation on the data itself, so +All gather may be performed on any non\-void datatype. +However, all gather does not perform an operation on the data itself, so no operation is specified. .SS Reduce\-Scatter (fi_reduce_scatter) .PP @@ -387,28 +448,75 @@ This is shown by the following example: .PP The reduce scatter call supports the same datatype and atomic operation as fi_allreduce. -.SS All Gather (fi_allgather) +.SS Reduce (fi_reduce) .PP -Conceptually, all gather can be viewed as the opposite of the scatter -component from reduce\-scatter. -All gather collects data from all peers into a single array, then copies -that array back to each peer. +The fi_reduce collective is the first half of an fi_allreduce operation. +With reduce, all peers provide input into an atomic operation, with the +the results collected by a single \[aq]root\[aq] endpoint. +.PP +This is shown by the following example, with the leftmost peer +identified as the root: .IP .nf \f[C] -[1]\ \ [5]\ \ [9] -\ \ \\\ \ \ |\ \ \ / -\ All\ gather -\ \ /\ \ \ |\ \ \ \\ [1]\ \ [1]\ \ [1] [5]\ \ [5]\ \ [5] [9]\ \ [9]\ \ [9] +\ \ \\\ \ \ |\ \ \ / +\ \ \ \ \ sum\ (reduce) +\ \ \ \ / +\ [3] +[15] +[27] \f[] .fi .PP -All gather may be performed on any non\-void datatype. -However, all gather does not perform an operation on the data itself, so -no operation is specified. +The reduce call supports the same datatype and atomic operation as +fi_allreduce. +.SS Scatter (fi_scatter) +.PP +The fi_scatter collective is the second half of an fi_reduce_scatter +operation. +The data from a single \[aq]root\[aq] endpoint is split and distributed +to all peers. +.PP +This is shown by the following example: +.IP +.nf +\f[C] +\ [3] +[15] +[27] +\ \ \ \ \\ +\ \ \ scatter +\ \ /\ \ \ |\ \ \ \\ +[3]\ [15]\ [27] +\f[] +.fi +.PP +The scatter operation is used to distribute results to the peers. +No atomic operation is performed on the data. +.SS Gather (fi_gather) +.PP +The fi_gather operation is used to collect (gather) the results from all +peers and store them at a \[aq]root\[aq] peer. +.PP +This is shown by the following example, with the leftmost peer +identified as the root. 
+.IP +.nf +\f[C] +[1]\ \ [5]\ \ [9] +\ \ \\\ \ \ |\ \ \ / +\ \ \ \ gather +\ \ \ / +[1] +[5] +[9] +\f[] +.fi +.PP +The gather operation does not perform any operation on the data itself. .SS Query Collective Attributes (fi_query_collective) .PP The fi_query_collective call reports which collective operations are @@ -418,37 +526,55 @@ the provider must be implemented by the application. The query call checks whether a provider supports a specific collective operation for a given datatype and operation, if applicable. .PP -The datatype and operation of the collective are provided as input into +The name of the collective, as well as the datatype and associated +operation, if applicable, are provided as input into fi_query_collective. -For operations that do not exchange application data, such as -fi_barrier, the datatype should be set to FI_VOID. -The op parameter may reference one of these atomic opcodes: FI_MIN, -FI_MAX, FI_SUM, FI_PROD, FI_LOR, FI_LAND, FI_BOR, FI_BAND, FI_LXOR, -FI_BXOR, or a collective operation: FI_BARRIER, FI_BROADCAST, -FI_ALLTOALL, FI_ALLGATHER. -The use of an atomic opcode will indicate if the provider supports the -fi_allreduce() call for the given operation and datatype, unless the -FI_SCATTER flag has been specified. -If FI_SCATTER has been set, query will return if the provider supports -the fi_reduce_scatter() call for the given operation and datatype. -Specifying a collective operation for the op parameter queries support -for the corresponding collective. -.PP -On success, fi_query_collective will provide information about the -supported limits through the struct fi_collective_attr parameter. +.PP +The coll parameter may reference one of these collectives: FI_BARRIER, +FI_BROADCAST, FI_ALLTOALL, FI_ALLREDUCE, FI_ALLGATHER, +FI_REDUCE_SCATTER, FI_REDUCE, FI_SCATTER, or FI_GATHER. +Additional details on the collective operation are specified through the +struct fi_collective_attr parameter. +For collectives that act on data, the operation and related data type +must be specified through the given attributes. .IP .nf \f[C] struct\ fi_collective_attr\ { +\ \ \ \ enum\ fi_op\ op; +\ \ \ \ enum\ fi_datatype\ datatype; \ \ \ \ struct\ fi_atomic_attr\ datatype_attr; \ \ \ \ size_t\ max_members; -\ \ \ \ uint64_t\ mode; +\ \ \ \ \ \ uint64_t\ mode; }; \f[] .fi .PP For a description of struct fi_atomic_attr, see \f[C]fi_atomic\f[](3). .TP +.B \f[I]op\f[] +On input, this specifies the atomic operation involved with the +collective call. +This should be set to one of the following values: FI_MIN, FI_MAX, +FI_SUM, FI_PROD, FI_LOR, FI_LAND, FI_BOR, FI_BAND, FI_LXOR, FI_BXOR, +FI_ATOMIC_READ, FI_ATOMIC_WRITE, or FI_NOOP. +For collectives that do not exchange application data (fi_barrier), this +should be set to FI_NOOP. +.RS +.RE +.TP +.B \f[I]datatype\f[] +On input, specifies the datatype of the data being modified by the +collective. +This should be set to one of the following values: FI_INT8, FI_UINT8, +FI_INT16, FI_UINT16, FI_INT32, FI_UINT32, FI_INT64, FI_UINT64, FI_FLOAT, +FI_DOUBLE, FI_FLOAT_COMPLEX, FI_DOUBLE_COMPLEX, FI_LONG_DOUBLE, +FI_LONG_DOUBLE_COMPLEX, or FI_VOID. +For collectives that do not exchange application data (fi_barrier), this +should be set to FI_VOID. +.RS +.RE +.TP .B \f[I]datatype_attr.count\f[] The maximum number of elements that may be used with the collective. .RS @@ -472,9 +598,9 @@ This field is reserved and should be 0.
.RS .RE .PP -If a collective operation is supported, the query call will return 0, -along with attributes on the limits for using that collective operation -through the provider. +If a collective operation is supported, the query call will return +FI_SUCCESS, along with attributes on the limits for using that +collective operation through the provider. .SS Completions .PP Collective operations map to underlying fi_atomic operations. @@ -486,20 +612,6 @@ those defined for point to point atomic operations. .PP The following flags are defined for the specified operations. .TP -.B \f[I]FI_SEND\f[] -Applies to fi_broadcast() operations. -This indicates that the caller is the transmitter of the broadcast data. -There should only be a single transmitter for each broadcast collective -operation. -.RS -.RE -.TP -.B \f[I]FI_RECV\f[] -Applies to fi_broadcast() operation. -This indicates that the caller is the receiver of broadcase data. -.RS -.RE -.TP .B \f[I]FI_SCATTER\f[] Applies to fi_query_collective. When set, requests attribute information on the reduce\-scatter diff --git a/man/man3/fi_control.3 b/man/man3/fi_control.3 index c6bc5b3bb0c..0ae4b524b77 100644 --- a/man/man3/fi_control.3 +++ b/man/man3/fi_control.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_control" "3" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_control" "3" "2020\-11\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -12,6 +12,9 @@ fi_control \- Perform an operation on a fabric resource. #include\ int\ fi_control(struct\ fid\ *fid,\ int\ command,\ void\ *arg); +int\ fi_alias(struct\ fid\ *fid,\ struct\ fid\ **alias_fid,\ uint64_t\ flags); +int\ fi_get_val(struct\ fid\ *fid,\ int\ name,\ void\ *val); +int\ fi_set_val(struct\ fid\ *fid,\ int\ name,\ void\ *val); \f[] .fi .SH ARGUMENTS @@ -40,6 +43,17 @@ being operated on, the specified command, and any provided arguments for the command. For specific details, see the fabric resource specific help pages noted below. +.PP +fi_alias, fi_get_val, and fi_set_val are wrappers for fi_control with +commands FI_ALIAS, FI_GET_VAL, FI_SET_VAL, respectively. +fi_alias creates an alias of the specified fabric resource. +fi_get_val reads the value of the named parameter associated with the +fabric resource, while fi_set_val updates that value. +Available parameter names depend on the type of the fabric resource and +the provider in use. +Providers may define provider specific names in the provider extension +header files (\[aq]rdma/fi_ext_*.h\[aq]). +Please refer to the provider man pages for details. .SH SEE ALSO .PP \f[C]fi_endpoint\f[](3), \f[C]fi_cm\f[](3), \f[C]fi_cntr\f[](3), diff --git a/man/man3/fi_cq.3 b/man/man3/fi_cq.3 index 9ad3243a8ef..b84308d82f7 100644 --- a/man/man3/fi_cq.3 +++ b/man/man3/fi_cq.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_cq" "3" "2019\-02\-27" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_cq" "3" "2019\-12\-13" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -298,7 +298,7 @@ Users may use fi_control to retrieve the underlying wait object associated with a CQ, in order to use it in other system calls. The following values may be used to specify the type of wait object associated with a CQ: FI_WAIT_NONE, FI_WAIT_UNSPEC, FI_WAIT_SET, -FI_WAIT_FD, and FI_WAIT_MUTEX_COND. +FI_WAIT_FD, FI_WAIT_MUTEX_COND, and FI_WAIT_YIELD. The default is FI_WAIT_NONE. .RS .RE @@ -346,10 +346,10 @@ wait object. 
.RS .RE .TP -.B \- \f[I]FI_WAIT_CRITSEC_COND\f[] -Windows specific. -Specifies that the CQ should use a critical section and condition -variable as a wait object. +.B \- \f[I]FI_WAIT_YIELD\f[] +Indicates that the CQ will wait without a wait object but instead yield +on every wait. +Allows usage of fi_cq_sread and fi_cq_sreadfrom through a spin. .RS .RE .TP @@ -663,12 +663,17 @@ into a human readable string. .RE .TP .B \f[I]err_data\f[] -On an error, err_data may reference a provider specific amount of data -associated with an error. -The use of this field and its meaning is provider specific. +The err_data field is used to return provider specific information, if +available, about the error. +On input, err_data should reference a data buffer of size err_data_size. +On output, the provider will fill in this buffer with any provider +specific data which may help identify the cause of the error. +The contents of the err_data field and its meaning are provider specific. It is intended to be used as a debugging aid. See fi_cq_strerror for additional details on converting this error data into a human readable string. +See the compatibility note below on how this field is used for older +libfabric releases. .RS .RE .TP @@ -682,9 +687,12 @@ provide details about the type of error that occurred. .RS .RE .PP -For compatibility purposes, if err_data_size is 0 on input, or the -fabric was opened with release < 1.5, err_data will be set to a data -buffer owned by the provider. +For compatibility purposes, the behavior of the err_data and +err_data_size fields may be modified from that listed above. +If err_data_size is 0 on input, or the fabric was opened with release < +1.5, then any buffer referenced by err_data will be ignored on input. +In this situation, on output err_data will be set to a data buffer owned +by the provider. The contents of the buffer will remain valid until a subsequent read call against the CQ. Applications must serialize access to the CQ when processing errors to @@ -943,6 +951,7 @@ As a result, match complete may involve additional provider level acknowledgements or lengthy delays. However, this completion model enables peer applications to synchronize their execution. +Many providers may not support this semantic. .RS .RE .TP @@ -957,6 +966,29 @@ in the case of power failure. This completion mode applies only to operations that target persistent memory regions over reliable endpoints. This completion mode is experimental. +.TP +.B \f[I]FI_FENCE\f[] +This is not a completion level, but plays a role in the completion +ordering between operations that would not normally be ordered. +An operation that is marked with the FI_FENCE flag and all operations +posted after the fenced operation are deferred until all previous +operations targeting the same peer endpoint have completed. +Additionally, the completion of the fenced operation indicates that +prior operations have met the same completion level as the fenced +operation. +For example, if an operation is posted as FI_DELIVERY_COMPLETE | +FI_FENCE, then its completion indicates prior operations have met the +semantic required for FI_DELIVERY_COMPLETE. +This is true even if the prior operation was posted with a lower +completion level, such as FI_TRANSMIT_COMPLETE or FI_INJECT_COMPLETE. +.RS +.RE +.PP +Note that a completion generated for an operation posted prior to the +fenced operation only guarantees that the completion level that was +originally requested has been met.
+It is the completion of the fenced operation that guarantees that the +additional semantics have been met. .SH NOTES .PP A completion queue must be bound to at least one enabled endpoint before diff --git a/man/man3/fi_domain.3 b/man/man3/fi_domain.3 index a358698f812..bc42f00583c 100644 --- a/man/man3/fi_domain.3 +++ b/man/man3/fi_domain.3 @@ -1,7 +1,7 @@ .\"t .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_domain" "3" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_domain" "3" "2020\-10\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -24,6 +24,9 @@ int\ fi_domain_bind(struct\ fid_domain\ *domain,\ struct\ fid\ *eq, int\ fi_open_ops(struct\ fid\ *domain,\ const\ char\ *name,\ uint64_t\ flags, \ \ \ \ void\ **ops,\ void\ *context); + +int\ fi_set_ops(struct\ fid\ *domain,\ const\ char\ *name,\ uint64_t\ flags, +\ \ \ \ void\ *ops,\ void\ *context); \f[] .fi .SH ARGUMENTS @@ -85,6 +88,84 @@ Provider interfaces may be used to access low\-level resources and operations that are specific to the opened resource domain. The details of domain interfaces are outside the scope of this documentation. +.SS fi_set_ops +.PP +fi_set_ops assigns callbacks that a provider should invoke in place of +performing selected tasks. +This allows users to modify or control a provider\[aq]s default +behavior. +Conceptually, it allows the user to hook specific functions used by a +provider and replace it with their own. +.PP +The operations being modified are identified using a well\-known +character string, passed as the name parameter. +The format of the ops parameter is dependent upon the name value. +The ops parameter will reference a structure containing the callbacks +and other fields needed by the provider to invoke the user\[aq]s +functions. +.PP +If a provider accepts the override, it will return FI_SUCCESS. +If the override is unknown or not supported, the provider will return +\-FI_ENOSYS. +Overrides should be set prior to allocating resources on the domain. +.PP +The following fi_set_ops operations and corresponding callback +structures are defined. +.PP +\f[B]FI_SET_OPS_HMEM_OVERRIDE \-\- Heterogeneous Memory Overrides\f[] +.PP +HMEM override allows users to override HMEM related operations a +provider may perform. +Currently, the scope of the HMEM override is to allow a user to define +the memory movement functions a provider should use when accessing a +user buffer. +The user\-defined memory movement functions need to account for all the +different HMEM iface types a provider may encounter. +.PP +All objects allocated against a domain will inherit this override. +.PP +The following is the HMEM override operation name and structure. +.IP +.nf +\f[C] +#define\ FI_SET_OPS_HMEM_OVERRIDE\ "hmem_override_ops" + +struct\ fi_hmem_override_ops\ { +\ \ \ \ size_t\ \ size; + +\ \ \ \ ssize_t\ (*copy_from_hmem_iov)(void\ *dest,\ size_t\ size, +\ \ \ \ \ \ \ \ enum\ fi_hmem_iface\ iface,\ uint64_t\ device,\ const\ struct\ iovec\ *hmem_iov, +\ \ \ \ \ \ \ \ size_t\ hmem_iov_count,\ uint64_t\ hmem_iov_offset); + +\ \ \ \ ssize_t\ (*copy_to_hmem_iov)(enum\ fi_hmem_iface\ iface,\ uint64_t\ device, +\ \ \ \ const\ struct\ iovec\ *hmem_iov,\ size_t\ hmem_iov_count, +\ \ \ \ \ \ \ \ uint64_t\ hmem_iov_offset,\ const\ void\ *src,\ size_t\ size); +}; +\f[] +.fi +.PP +All fields in struct fi_hmem_override_ops must be set (non\-null) to a +valid value. +.TP +.B \f[I]size\f[] +This should be set to the sizeof(struct fi_hmem_override_ops). 
+The size field is used for forward and backward compatibility purposes. +.RS +.RE +.TP +.B \f[I]copy_from_hmem_iov\f[] +Copy data from the device/hmem to host memory. +This function should return a negative fi_errno on error, or the number +of bytes copied on success. +.RS +.RE +.TP +.B \f[I]copy_to_hmem_iov\f[] +Copy data from host memory to the device/hmem. +This function should return a negative fi_errno on error, or the number +of bytes copied on success. +.RS +.RE .SS fi_domain_bind .PP Associates an event queue with the domain. @@ -144,6 +225,7 @@ struct\ fi_domain_attr\ { \ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ auth_key_size; \ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ max_err_data; \ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ mr_cnt; +\ \ \ \ uint32_t\ \ \ \ \ \ \ \ \ \ \ \ \ \ tclass; }; \f[] .fi @@ -155,6 +237,10 @@ On output from fi_getinfo, if no domain was specified, but the user has an opened instance of the named domain, this will reference the first opened instance. If no instance has been opened, this field will be NULL. +.PP +The domain instance returned by fi_getinfo should only be considered +valid if the application does not close any domain instances from +another thread while fi_getinfo is being processed. .SS Name .PP The name of the access domain. @@ -168,19 +254,37 @@ Applications which can guarantee serialization in their access of provider allocated resources and interfaces enables a provider to eliminate lower\-level locks. .TP -.B \f[I]FI_THREAD_UNSPEC\f[] -This value indicates that no threading model has been defined. -It may be used on input hints to the fi_getinfo call. -When specified, providers will return a threading model that allows for -the greatest level of parallelism. +.B \f[I]FI_THREAD_COMPLETION\f[] +The completion threading model is intended for providers that make use +of manual progress. +Applications must serialize access to all objects that are associated +through the use of having a shared completion structure. +This includes endpoint, transmit context, receive context, completion +queue, counter, wait set, and poll set objects. .RS .RE +.PP +For example, threads must serialize access to an endpoint and its bound +completion queue(s) and/or counters. +Access to endpoints that share the same completion queue must also be +serialized. +.PP +The use of FI_THREAD_COMPLETION can increase parallelism over +FI_THREAD_SAFE, but requires the use of isolated resources. .TP -.B \f[I]FI_THREAD_SAFE\f[] -A thread safe serialization model allows a multi\-threaded application -to access any allocated resources through any interface without -restriction. -All providers are required to support FI_THREAD_SAFE. +.B \f[I]FI_THREAD_DOMAIN\f[] +A domain serialization model requires applications to serialize access +to all objects belonging to a domain. +.RS +.RE +.TP +.B \f[I]FI_THREAD_ENDPOINT\f[] +The endpoint threading model is similar to FI_THREAD_FID, but with the +added restriction that serialization is required when accessing the same +endpoint, even if multiple transmit and receive contexts are used. +Conceptually, FI_THREAD_ENDPOINT maps well to providers that implement +fabric services in hardware but use a single command queue to access +different data flows. .RS .RE .TP @@ -212,37 +316,19 @@ Conceptually, FI_THREAD_FID maps well to providers that implement fabric services in hardware and provide separate command queues to different data flows. 
.TP -.B \f[I]FI_THREAD_ENDPOINT\f[] -The endpoint threading model is similar to FI_THREAD_FID, but with the -added restriction that serialization is required when accessing the same -endpoint, even if multiple transmit and receive contexts are used. -Conceptually, FI_THREAD_ENDPOINT maps well to providers that implement -fabric services in hardware but use a single command queue to access -different data flows. -.RS -.RE -.TP -.B \f[I]FI_THREAD_COMPLETION\f[] -The completion threading model is intended for providers that make use -of manual progress. -Applications must serialize access to all objects that are associated -through the use of having a shared completion structure. -This includes endpoint, transmit context, receive context, completion -queue, counter, wait set, and poll set objects. +.B \f[I]FI_THREAD_SAFE\f[] +A thread safe serialization model allows a multi\-threaded application +to access any allocated resources through any interface without +restriction. +All providers are required to support FI_THREAD_SAFE. .RS .RE -.PP -For example, threads must serialize access to an endpoint and its bound -completion queue(s) and/or counters. -Access to endpoints that share the same completion queue must also be -serialized. -.PP -The use of FI_THREAD_COMPLETION can increase parallelism over -FI_THREAD_SAFE, but requires the use of isolated resources. .TP -.B \f[I]FI_THREAD_DOMAIN\f[] -A domain serialization model requires applications to serialize access -to all objects belonging to a domain. +.B \f[I]FI_THREAD_UNSPEC\f[] +This value indicates that no threading model has been defined. +It may be used on input hints to the fi_getinfo call. +When specified, providers will return a threading model that allows for +the greatest level of parallelism. .RS .RE .SS Progress Models (control_progress / data_progress) @@ -277,12 +363,6 @@ and acknowledgement processing. To balance between performance and ease of use, two progress models are defined. .TP -.B \f[I]FI_PROGRESS_UNSPEC\f[] -This value indicates that no progress model has been defined. -It may be used on input hints to the fi_getinfo call. -.RS -.RE -.TP .B \f[I]FI_PROGRESS_AUTO\f[] This progress model indicates that the provider will make forward progress on an asynchronous operation without further intervention by @@ -324,6 +404,12 @@ events for the operations. For example, an endpoint that acts purely as the target of RMA or atomic operations that uses manual progress may still need application assistance to process received operations. +.TP +.B \f[I]FI_PROGRESS_UNSPEC\f[] +This value indicates that no progress model has been defined. +It may be used on input hints to the fi_getinfo call. +.RS +.RE .SS Resource Management (resource_mgmt) .PP Resource management (RM) is provider and protocol support to protect @@ -347,12 +433,6 @@ protection against overruns. However, such protection is not guaranteed. The following values for resource management are defined. .TP -.B \f[I]FI_RM_UNSPEC\f[] -This value indicates that no resource management model has been defined. -It may be used on input hints to the fi_getinfo call. -.RS -.RE -.TP .B \f[I]FI_RM_DISABLED\f[] The provider is free to select an implementation and protocol that does not protect against resource overruns. @@ -364,6 +444,12 @@ The application is responsible for resource protection. Resource management is enabled for this provider domain. .RS .RE +.TP +.B \f[I]FI_RM_UNSPEC\f[] +This value indicates that no resource management model has been defined. 
+It may be used on input hints to the fi_getinfo call. +.RS +.RE .PP The behavior of the various resource management options depends on whether the endpoint is reliable or unreliable, as well as provider and @@ -569,7 +655,7 @@ When a resource management error occurs on an endpoint, the endpoint is transitioned into a disabled state. Any operations which have not already completed will fail and be discarded. -For unconnected endpoints, the endpoint must be re\-enabled before it +For connectionless endpoints, the endpoint must be re\-enabled before it will accept new data transfer operations. For connected endpoints, the connection is torn down and must be re\-established. @@ -599,11 +685,6 @@ Specifies the type of address vectors that are usable with this domain. For additional details on AV type, see \f[C]fi_av\f[](3). The following values may be specified. .TP -.B \f[I]FI_AV_UNSPEC\f[] -Any address vector format is requested and supported. -.RS -.RE -.TP .B \f[I]FI_AV_MAP\f[] Only address vectors of type AV map are requested or supported. .RS @@ -613,6 +694,11 @@ Only address vectors of type AV map are requested or supported. Only address vectors of type AV index are requested or supported. .RS .RE +.TP +.B \f[I]FI_AV_UNSPEC\f[] +Any address vector format is requested and supported. +.RS +.RE .PP Address vectors are only used by connectionless endpoints. Applications that require the use of a specific type of address vector @@ -629,32 +715,30 @@ Defines memory registration specific mode bits used with this domain. Full details on MR mode options are available in \f[C]fi_mr\f[](3). The following values may be specified. .TP -.B \f[I]FI_MR_LOCAL\f[] -The provider is optimized around having applications register memory for -locally accessed data buffers. -Data buffers used in send and receive operations and as the source -buffer for RMA and atomic operations must be registered by the -application for access domains opened with this capability. +.B \f[I]FI_MR_ALLOCATED\f[] +Indicates that memory registration occurs on allocated data buffers, and +physical pages must back all virtual addresses being registered. .RS .RE .TP -.B \f[I]FI_MR_RAW\f[] -The provider requires additional setup as part of their memory -registration process. -This mode is required by providers that use a memory key that is larger -than 64\-bits. +.B \f[I]FI_MR_ENDPOINT\f[] +Memory registration occurs at the endpoint level, rather than domain. .RS .RE .TP -.B \f[I]FI_MR_VIRT_ADDR\f[] -Registered memory regions are referenced by peers using the virtual -address of the registered memory region, rather than a 0\-based offset. +.B \f[I]FI_MR_LOCAL\f[] +The provider is optimized around having applications register memory for +locally accessed data buffers. +Data buffers used in send and receive operations and as the source +buffer for RMA and atomic operations must be registered by the +application for access domains opened with this capability. .RS .RE .TP -.B \f[I]FI_MR_ALLOCATED\f[] -Indicates that memory registration occurs on allocated data buffers, and -physical pages must back all virtual addresses being registered. +.B \f[I]FI_MR_MMU_NOTIFY\f[] +Indicates that the application is responsible for notifying the provider +when the page tables referencing a registered memory region may have +been updated. .RS .RE .TP @@ -663,10 +747,11 @@ Memory registration keys are selected and returned by the provider. 
.RS .RE .TP -.B \f[I]FI_MR_MMU_NOTIFY\f[] -Indicates that the application is responsible for notifying the provider -when the page tables referencing a registered memory region may have -been updated. +.B \f[I]FI_MR_RAW\f[] +The provider requires additional setup as part of their memory +registration process. +This mode is required by providers that use a memory key that is larger +than 64\-bits. .RS .RE .TP @@ -676,11 +761,6 @@ must be explicitly enabled after being bound to any counter. .RS .RE .TP -.B \f[I]FI_MR_ENDPOINT\f[] -Memory registration occurs at the endpoint level, rather than domain. -.RS -.RE -.TP .B \f[I]FI_MR_UNSPEC\f[] Defined for compatibility \-\- library versions 1.4 and earlier. Setting mr_mode to 0 indicates that FI_MR_BASIC or FI_MR_SCALABLE are @@ -688,6 +768,12 @@ requested and supported. .RS .RE .TP +.B \f[I]FI_MR_VIRT_ADDR\f[] +Registered memory regions are referenced by peers using the virtual +address of the registered memory region, rather than a 0\-based offset. +.RS +.RE +.TP .B \f[I]FI_MR_BASIC\f[] Defined for compatibility \-\- library versions 1.4 and earlier. Only basic memory registration operations are requested or supported. @@ -752,6 +838,12 @@ on the default attributes of an allocated endpoint, such as the endpoint capabilities and size. The endpoint count is the number of addressable endpoints supported by the provider. +Providers return capability limits based on configured hardware maximum +capabilities. +Providers cannot predict all possible system limitations without +a posteriori knowledge acquired during runtime that will further limit +these hardware maximums (e.g. +application memory consumption, FD usage, etc.). .SS Transmit Context Count (tx_ctx_cnt) .PP The number of outbound command queues optimally supported by the @@ -879,6 +971,12 @@ Applications can set the mr_cnt on input to fi_getinfo, in order to indicate their memory registration requirements. Doing so may allow the provider to optimize any memory registration cache or lookup tables. +.SS Traffic Class (tclass) +.PP +This specifies the default traffic class that will be associated with +any endpoints created within the domain. +See \f[C]fi_endpoint\f[](3) for additional +information. .SH RETURN VALUE .PP Returns 0 on success. diff --git a/man/man3/fi_endpoint.3 b/man/man3/fi_endpoint.3 index 03d78dbb76e..334743d854d 100644 --- a/man/man3/fi_endpoint.3 +++ b/man/man3/fi_endpoint.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_endpoint" "3" "2019\-05\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_endpoint" "3" "2021\-02\-10" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -58,6 +58,11 @@ Open a transmit or receive context. .RS .RE .TP +.B fi_tc_dscp_set / fi_tc_dscp_get +Convert between a DSCP value and a network traffic class +.RS +.RE +.TP .B fi_rx_size_left / fi_tx_size_left (DEPRECATED) Query the lower bound on how many RX/TX operations may be posted without an operation returning \-FI_EAGAIN. @@ -120,6 +125,10 @@ int\ fi_getopt(struct\ fid\ *ep,\ int\ level,\ int\ optname, int\ fi_setopt(struct\ fid\ *ep,\ int\ level,\ int\ optname, \ \ \ \ const\ void\ *optval,\ size_t\ optlen); +uint32_t\ fi_tc_dscp_set(uint8_t\ dscp); + +uint8_t\ fi_tc_dscp_get(uint32_t\ tclass); + DEPRECATED\ ssize_t\ fi_rx_size_left(struct\ fid_ep\ *ep); DEPRECATED\ ssize_t\ fi_tx_size_left(struct\ fid_ep\ *ep); @@ -249,7 +258,7 @@ relevant completion queues or event queues in order to drive progress.
For endpoints that are only used as the target of RMA or atomic operations, this means binding the endpoint to a completion queue associated with receive processing. -Unconnected endpoints must be bound to an address vector. +Connectionless endpoints must be bound to an address vector. .PP Once an endpoint has been activated, it may be associated with an address vector. @@ -347,13 +356,6 @@ This is specified using fi_ep_bind flags. The following flags may be OR\[aq]ed together when binding an endpoint to a completion domain CQ. .TP -.B \f[I]FI_TRANSMIT\f[] -Directs the completion of outbound data transfer requests to the -specified completion queue. -This includes send message, RMA, and atomic operations. -.RS -.RE -.TP .B \f[I]FI_RECV\f[] Directs the notification of inbound data transfers to the specified completion queue. @@ -387,6 +389,13 @@ avoid writing a CQ completion entry for every operation. .PP See Notes section below for additional information on how this flag interacts with the FI_CONTEXT and FI_CONTEXT2 mode bits. +.TP +.B \f[I]FI_TRANSMIT\f[] +Directs the completion of outbound data transfer requests to the +specified completion queue. +This includes send message, RMA, and atomic operations. +.RS +.RE .PP An endpoint may optionally be bound to a completion counter. Associating an endpoint with a counter is in addition to binding the EP @@ -394,20 +403,6 @@ with a CQ. When binding an endpoint to a counter, the following flags may be specified. .TP -.B \f[I]FI_SEND\f[] -Increments the specified counter whenever a message transfer initiated -over the endpoint has completed successfully or in error. -Sent messages include both tagged and normal message operations. -.RS -.RE -.TP -.B \f[I]FI_RECV\f[] -Increments the specified counter whenever a message is received over the -endpoint. -Received messages include both tagged and normal message operations. -.RS -.RE -.TP .B \f[I]FI_READ\f[] Increments the specified counter whenever an RMA read, atomic fetch, or atomic compare operation initiated from the endpoint has completed @@ -415,10 +410,10 @@ successfully or in error. .RS .RE .TP -.B \f[I]FI_WRITE\f[] -Increments the specified counter whenever an RMA write or base atomic -operation initiated from the endpoint has completed successfully or in -error. +.B \f[I]FI_RECV\f[] +Increments the specified counter whenever a message is received over the +endpoint. +Received messages include both tagged and normal message operations. .RS .RE .TP @@ -439,6 +434,20 @@ Use of this flag requires that the endpoint be created using FI_RMA_EVENT. .RS .RE +.TP +.B \f[I]FI_SEND\f[] +Increments the specified counter whenever a message transfer initiated +over the endpoint has completed successfully or in error. +Sent messages include both tagged and normal message operations. +.RS +.RE +.TP +.B \f[I]FI_WRITE\f[] +Increments the specified counter whenever an RMA write or base atomic +operation initiated from the endpoint has completed successfully or in +error. +.RS +.RE .PP An endpoint may only be bound to a single CQ or counter for a given type of operation. @@ -533,6 +542,13 @@ struct fi_info. The following control commands and arguments may be assigned to an endpoint. .TP +.B **FI_BACKLOG \- int *value** +This option only applies to passive endpoints. +It is used to set the connection request backlog for listening +endpoints. +.RS +.RE +.TP .B **FI_GETOPSFLAG \-\- uint64_t *flags** Used to retrieve the current value of flags associated with the data transfer operations initiated on the endpoint. 
@@ -542,6 +558,17 @@ See below for a list of control flags. .RS .RE .TP +.B \f[B]FI_GETWAIT \-\- void **\f[] +This command allows the user to retrieve the file descriptor associated +with a socket endpoint. +The fi_control arg parameter should be an address where a pointer to the +returned file descriptor will be written. +See fi_eq.3 for addition details using fi_control with FI_GETWAIT. +The file descriptor may be used for notification that the endpoint is +ready to send or receive data. +.RS +.RE +.TP .B **FI_SETOPSFLAG \-\- uint64_t *flags** Used to change the data transfer operation flags associated with an endpoint. @@ -553,24 +580,6 @@ attributes that were set when the endpoint was created. Valid control flags are defined below. .RS .RE -.TP -.B **FI_BACKLOG \- int *value** -This option only applies to passive endpoints. -It is used to set the connection request backlog for listening -endpoints. -.RS -.RE -.TP -.B \f[I]FI_GETWAIT (void **)\f[] -This command allows the user to retrieve the file descriptor associated -with a socket endpoint. -The fi_control arg parameter should be an address where a pointer to the -returned file descriptor will be written. -See fi_eq.3 for addition details using fi_control with FI_GETWAIT. -The file descriptor may be used for notification that the endpoint is -ready to send or receive data. -.RS -.RE .SS fi_getopt / fi_setopt .PP Endpoint protocol operations may be retrieved using fi_getopt or set @@ -585,35 +594,6 @@ The following option levels and option names and parameters are defined. \f[I]FI_OPT_ENDPOINT\f[] \[bu] .RS 2 .TP -.B \f[I]FI_OPT_MIN_MULTI_RECV \- size_t\f[] -Defines the minimum receive buffer space available when the receive -buffer is released by the provider (see FI_MULTI_RECV). -Modifying this value is only guaranteed to set the minimum buffer space -needed on receives posted after the value has been changed. -It is recommended that applications that want to override the default -MIN_MULTI_RECV value set this option before enabling the corresponding -endpoint. -.RS -.RE -.RE -\[bu] .RS 2 -.TP -.B \f[I]FI_OPT_CM_DATA_SIZE \- size_t\f[] -Defines the size of available space in CM messages for user\-defined -data. -This value limits the amount of data that applications can exchange -between peer endpoints using the fi_connect, fi_accept, and fi_reject -operations. -The size returned is dependent upon the properties of the endpoint, -except in the case of passive endpoints, in which the size reflects the -maximum size of the data that may be present as part of a connection -request event. -This option is read only. -.RS -.RE -.RE -\[bu] .RS 2 -.TP .B \f[I]FI_OPT_BUFFERED_LIMIT \- size_t\f[] Defines the maximum size of a buffered message that will be reported to users as part of a receive completion when the FI_BUFFERED_RECV mode is @@ -643,13 +623,52 @@ Applications would set this to a size that\[aq]s big enough to decide whether to discard or claim a buffered receive or when to claim a buffered receive on getting a buffered receive completion. The value is typically used by a provider when sending a rendezvous -protocol request where it would send atleast FI_OPT_BUFFERED_MIN bytes +protocol request where it would send at least FI_OPT_BUFFERED_MIN bytes of application data along with it. -A smaller sized renedezvous protocol message usually results in better +A smaller sized rendezvous protocol message usually results in better latency for the overall transfer of a large message. 
.RS .RE .RE +\[bu] .RS 2 +.TP +.B \f[I]FI_OPT_CM_DATA_SIZE \- size_t\f[] +Defines the size of available space in CM messages for user\-defined +data. +This value limits the amount of data that applications can exchange +between peer endpoints using the fi_connect, fi_accept, and fi_reject +operations. +The size returned is dependent upon the properties of the endpoint, +except in the case of passive endpoints, in which the size reflects the +maximum size of the data that may be present as part of a connection +request event. +This option is read only. +.RS +.RE +.RE +\[bu] .RS 2 +.TP +.B \f[I]FI_OPT_MIN_MULTI_RECV \- size_t\f[] +Defines the minimum receive buffer space available when the receive +buffer is released by the provider (see FI_MULTI_RECV). +Modifying this value is only guaranteed to set the minimum buffer space +needed on receives posted after the value has been changed. +It is recommended that applications that want to override the default +MIN_MULTI_RECV value set this option before enabling the corresponding +endpoint. +.RS +.RE +.RE +.SS fi_tc_dscp_set +.PP +This call converts a DSCP defined value into a libfabric traffic class +value. +It should be used when assigning a DSCP value when setting the tclass +field in either domain or endpoint attributes +.SS fi_tc_dscp_get +.PP +This call returns the DSCP value associated with the tclass field for +the domain or endpoint attributes. .SS fi_rx_size_left (DEPRECATED) .PP This function has been deprecated and will be removed in a future @@ -708,10 +727,11 @@ If specified, indicates the type of fabric interface communication desired. Supported types are: .TP -.B \f[I]FI_EP_UNSPEC\f[] -The type of endpoint is not specified. -This is usually provided as input, with other attributes of the endpoint -or the provider selecting the type. +.B \f[I]FI_EP_DGRAM\f[] +Supports a connectionless, unreliable datagram communication. +Message boundaries are maintained, but the maximum message size may be +limited to the fabric MTU. +Flow control is not guaranteed. .RS .RE .TP @@ -721,18 +741,20 @@ flow control that maintains message boundaries. .RS .RE .TP -.B \f[I]FI_EP_DGRAM\f[] -Supports a connectionless, unreliable datagram communication. -Message boundaries are maintained, but the maximum message size may be -limited to the fabric MTU. -Flow control is not guaranteed. +.B \f[I]FI_EP_RDM\f[] +Reliable datagram message. +Provides a reliable, connectionless data transfer service with flow +control that maintains message boundaries. .RS .RE .TP -.B \f[I]FI_EP_RDM\f[] -Reliable datagram message. -Provides a reliable, unconnected data transfer service with flow control -that maintains message boundaries. +.B \f[I]FI_EP_SOCK_DGRAM\f[] +A connectionless, unreliable datagram endpoint with UDP socket\-like +semantics. +FI_EP_SOCK_DGRAM is most useful for applications designed around using +UDP sockets. +See the SOCKET ENDPOINT section for additional details and restrictions +that apply to datagram socket endpoints. .RS .RE .TP @@ -747,13 +769,10 @@ that apply to stream endpoints. .RS .RE .TP -.B \f[I]FI_EP_SOCK_DGRAM\f[] -A connectionless, unreliable datagram endpoint with UDP socket\-like -semantics. -FI_EP_SOCK_DGRAM is most useful for applications designed around using -UDP sockets. -See the SOCKET ENDPOINT section for additional details and restrictions -that apply to datagram socket endpoints. +.B \f[I]FI_EP_UNSPEC\f[] +The type of endpoint is not specified. 
+This is usually provided as input, with other attributes of the endpoint +or the provider selecting the type. .RS .RE .SS Protocol @@ -766,16 +785,19 @@ Provider specific protocols are also allowed. Provider specific protocols will be indicated by having the upper bit of the protocol value set to one. .TP -.B \f[I]FI_PROTO_UNSPEC\f[] -The protocol is not specified. -This is usually provided as input, with other attributes of the socket -or the provider selecting the actual protocol. +.B \f[I]FI_PROTO_GNI\f[] +Protocol runs over Cray GNI low\-level interface. .RS .RE .TP -.B \f[I]FI_PROTO_RDMA_CM_IB_RC\f[] -The protocol runs over Infiniband reliable\-connected queue pairs, using -the RDMA CM protocol for connection establishment. +.B \f[I]FI_PROTO_IB_RDM\f[] +Reliable\-datagram protocol implemented over InfiniBand +reliable\-connected queue pairs. +.RS +.RE +.TP +.B \f[I]FI_PROTO_IB_UD\f[] +The protocol runs over Infiniband unreliable datagram queue pairs. .RS .RE .TP @@ -784,8 +806,16 @@ The protocol runs over the Internet wide area RDMA protocol transport. .RS .RE .TP -.B \f[I]FI_PROTO_IB_UD\f[] -The protocol runs over Infiniband unreliable datagram queue pairs. +.B \f[I]FI_PROTO_IWARP_RDM\f[] +Reliable\-datagram protocol implemented over iWarp reliable\-connected +queue pairs. +.RS +.RE +.TP +.B \f[I]FI_PROTO_NETWORKDIRECT\f[] +Protocol runs over Microsoft NetworkDirect service provider interface. +This adds reliable\-datagram semantics over the NetworkDirect +connection\- oriented endpoint semantics. .RS .RE .TP @@ -797,33 +827,31 @@ interfaces. .RS .RE .TP -.B \f[I]FI_PROTO_UDP\f[] -The protocol sends and receives UDP datagrams. -For example, an endpoint using \f[I]FI_PROTO_UDP\f[] will be able to -communicate with a remote peer that is using Berkeley -\f[I]SOCK_DGRAM\f[] sockets using \f[I]IPPROTO_UDP\f[]. -.RS -.RE -.TP -.B \f[I]FI_PROTO_SOCK_TCP\f[] -The protocol is layered over TCP packets. +.B \f[I]FI_PROTO_PSMX2\f[] +The protocol is based on an Intel proprietary protocol known as PSM2, +performance scaled messaging version 2. +PSMX2 is an extended version of the PSM2 protocol to support the +libfabric interfaces. .RS .RE .TP -.B \f[I]FI_PROTO_IWARP_RDM\f[] -Reliable\-datagram protocol implemented over iWarp reliable\-connected -queue pairs. +.B \f[I]FI_PROTO_PSMX3\f[] +The protocol is Intel\[aq]s protocol known as PSM3, performance scaled +messaging version 3. +PSMX3 is implemented over RoCEv2 and verbs. .RS .RE .TP -.B \f[I]FI_PROTO_IB_RDM\f[] -Reliable\-datagram protocol implemented over InfiniBand -reliable\-connected queue pairs. +.B \f[I]FI_PROTO_RDMA_CM_IB_RC\f[] +The protocol runs over Infiniband reliable\-connected queue pairs, using +the RDMA CM protocol for connection establishment. .RS .RE .TP -.B \f[I]FI_PROTO_GNI\f[] -Protocol runs over Cray GNI low\-level interface. +.B \f[I]FI_PROTO_RXD\f[] +Reliable\-datagram protocol implemented over datagram endpoints. +RXD is a libfabric utility component that adds RDM endpoint semantics +over DGRAM endpoint semantics. .RS .RE .TP @@ -834,25 +862,23 @@ over MSG endpoint semantics. .RS .RE .TP -.B \f[I]FI_PROTO_RXD\f[] -Reliable\-datagram protocol implemented over datagram endpoints. -RXD is a libfabric utility component that adds RDM endpoint semantics -over DGRAM endpoint semantics. +.B \f[I]FI_PROTO_SOCK_TCP\f[] +The protocol is layered over TCP packets. .RS .RE .TP -.B \f[I]FI_PROTO_NETWORKDIRECT\f[] -Protocol runs over Microsoft NetworkDirect service provider interface. 
-This adds reliable\-datagram semantics over the NetworkDirect -connection\- oriented endpoint semantics. -.RS +.B \f[I]FI_PROTO_UDP\f[] +The protocol sends and receives UDP datagrams. +For example, an endpoint using \f[I]FI_PROTO_UDP\f[] will be able to +communicate with a remote peer that is using Berkeley +\f[I]SOCK_DGRAM\f[] sockets using \f[I]IPPROTO_UDP\f[]. +.RS .RE .TP -.B \f[I]FI_PROTO_PSMX2\f[] -The protocol is based on an Intel proprietary protocol known as PSM2, -performance scaled messaging version 2. -PSMX2 is an extended version of the PSM2 protocol to support the -libfabric interfaces. +.B \f[I]FI_PROTO_UNSPEC\f[] +The protocol is not specified. +This is usually provided as input, with other attributes of the socket +or the provider selecting the actual protocol. .RS .RE .SS protocol_version \- Protocol Version @@ -1054,6 +1080,7 @@ struct\ fi_tx_attr\ { \ \ \ \ size_t\ \ \ \ size; \ \ \ \ size_t\ \ \ \ iov_limit; \ \ \ \ size_t\ \ \ \ rma_iov_limit; +\ \ \ \ uint32_t\ \ tclass; }; \f[] .fi @@ -1063,8 +1090,22 @@ The requested capabilities of the context. The capabilities must be a subset of those requested of the associated endpoint. See the CAPABILITIES section of fi_getinfo(3) for capability details. -If the caps field is 0 on input to fi_getinfo(3), the caps value from -the fi_info structure will be used. +If the caps field is 0 on input to fi_getinfo(3), the applicable +capability bits from the fi_info structure will be used. +.PP +The following capabilities apply to the transmit attributes: FI_MSG, +FI_RMA, FI_TAGGED, FI_ATOMIC, FI_READ, FI_WRITE, FI_SEND, FI_HMEM, +FI_TRIGGER, FI_FENCE, FI_MULTICAST, FI_RMA_PMEM, FI_NAMED_RX_CTX, and +FI_COLLECTIVE. +.PP +Many applications will be able to ignore this field and rely solely on +the fi_info::caps field. +Use of this field provides fine grained control over the transmit +capabilities associated with an endpoint. +It is useful when handling scalable endpoints, with multiple transmit +contexts, for example, and allows configuring a specific transmit +context with fewer capabilities than that supported by the endpoint or +other transmit contexts. .SS mode .PP The operational mode bits of the context. @@ -1104,6 +1145,40 @@ Message ordering requires matching ordering semantics on the receiving side of a data transfer operation in order to guarantee that ordering is met. .TP +.B \f[I]FI_ORDER_ATOMIC_RAR\f[] +Atomic read after read. +If set, atomic fetch operations are transmitted in the order submitted +relative to other atomic fetch operations. +If not set, atomic fetches may be transmitted out of order from their +submission. +.RS +.RE +.TP +.B \f[I]FI_ORDER_ATOMIC_RAW\f[] +Atomic read after write. +If set, atomic fetch operations are transmitted in the order submitted +relative to atomic update operations. +If not set, atomic fetches may be transmitted ahead of atomic updates. +.RS +.RE +.TP +.B \f[I]FI_ORDER_ATOMIC_WAR\f[] +RMA write after read. +If set, atomic update operations are transmitted in the order submitted +relative to atomic fetch operations. +If not set, atomic updates may be transmitted ahead of atomic fetches. +.RS +.RE +.TP +.B \f[I]FI_ORDER_ATOMIC_WAW\f[] +RMA write after write. +If set, atomic update operations are transmitted in the order submitted +relative to other atomic update operations. +If not atomic updates may be transmitted out of order from their +submission. +.RS +.RE +.TP .B \f[I]FI_ORDER_NONE\f[] No ordering is specified. 
This value may be used as input in order to obtain the default message @@ -1121,15 +1196,6 @@ their submission. .RS .RE .TP -.B \f[I]FI_ORDER_RAW\f[] -Read after write. -If set, RMA and atomic read operations are transmitted in the order -submitted relative to RMA and atomic write operations. -If not set, RMA and atomic reads may be transmitted ahead of RMA and -atomic writes. -.RS -.RE -.TP .B \f[I]FI_ORDER_RAS\f[] Read after send. If set, RMA and atomic read operations are transmitted in the order @@ -1138,56 +1204,12 @@ If not set, RMA and atomic reads may be transmitted ahead of sends. .RS .RE .TP -.B \f[I]FI_ORDER_WAR\f[] -Write after read. -If set, RMA and atomic write operations are transmitted in the order -submitted relative to RMA and atomic read operations. -If not set, RMA and atomic writes may be transmitted ahead of RMA and -atomic reads. -.RS -.RE -.TP -.B \f[I]FI_ORDER_WAW\f[] -Write after write. -If set, RMA and atomic write operations are transmitted in the order -submitted relative to other RMA and atomic write operations. -If not set, RMA and atomic writes may be transmitted out of order from -their submission. -.RS -.RE -.TP -.B \f[I]FI_ORDER_WAS\f[] -Write after send. -If set, RMA and atomic write operations are transmitted in the order -submitted relative to message send operations, including tagged sends. -If not set, RMA and atomic writes may be transmitted ahead of sends. -.RS -.RE -.TP -.B \f[I]FI_ORDER_SAR\f[] -Send after read. -If set, message send operations, including tagged sends, are transmitted -in order submitted relative to RMA and atomic read operations. -If not set, message sends may be transmitted ahead of RMA and atomic -reads. -.RS -.RE -.TP -.B \f[I]FI_ORDER_SAW\f[] -Send after write. -If set, message send operations, including tagged sends, are transmitted -in order submitted relative to RMA and atomic write operations. -If not set, message sends may be transmitted ahead of RMA and atomic -writes. -.RS -.RE -.TP -.B \f[I]FI_ORDER_SAS\f[] -Send after send. -If set, message send operations, including tagged sends, are transmitted -in the order submitted relative to other message send. -If not set, message sends may be transmitted out of order from their -submission. +.B \f[I]FI_ORDER_RAW\f[] +Read after write. +If set, RMA and atomic read operations are transmitted in the order +submitted relative to RMA and atomic write operations. +If not set, RMA and atomic reads may be transmitted ahead of RMA and +atomic writes. .RS .RE .TP @@ -1225,37 +1247,56 @@ submission. .RS .RE .TP -.B \f[I]FI_ORDER_ATOMIC_RAR\f[] -Atomic read after read. -If set, atomic fetch operations are transmitted in the order submitted -relative to other atomic fetch operations. -If not set, atomic fetches may be transmitted out of order from their +.B \f[I]FI_ORDER_SAR\f[] +Send after read. +If set, message send operations, including tagged sends, are transmitted +in order submitted relative to RMA and atomic read operations. +If not set, message sends may be transmitted ahead of RMA and atomic +reads. +.RS +.RE +.TP +.B \f[I]FI_ORDER_SAS\f[] +Send after send. +If set, message send operations, including tagged sends, are transmitted +in the order submitted relative to other message send. +If not set, message sends may be transmitted out of order from their submission. .RS .RE .TP -.B \f[I]FI_ORDER_ATOMIC_RAW\f[] -Atomic read after write. -If set, atomic fetch operations are transmitted in the order submitted -relative to atomic update operations. 
-If not set, atomic fetches may be transmitted ahead of atomic updates.
+.B \f[I]FI_ORDER_SAW\f[]
+Send after write.
+If set, message send operations, including tagged sends, are transmitted
+in order submitted relative to RMA and atomic write operations.
+If not set, message sends may be transmitted ahead of RMA and atomic
+writes.
.RS
.RE
.TP
-.B \f[I]FI_ORDER_ATOMIC_WAR\f[]
-RMA write after read.
-If set, atomic update operations are transmitted in the order submitted
-relative to atomic fetch operations.
-If not set, atomic updates may be transmitted ahead of atomic fetches.
+.B \f[I]FI_ORDER_WAR\f[]
+Write after read.
+If set, RMA and atomic write operations are transmitted in the order
+submitted relative to RMA and atomic read operations.
+If not set, RMA and atomic writes may be transmitted ahead of RMA and
+atomic reads.
.RS
.RE
.TP
-.B \f[I]FI_ORDER_ATOMIC_WAW\f[]
-RMA write after write.
-If set, atomic update operations are transmitted in the order submitted
-relative to other atomic update operations.
-If not atomic updates may be transmitted out of order from their
-submission.
+.B \f[I]FI_ORDER_WAS\f[]
+Write after send.
+If set, RMA and atomic write operations are transmitted in the order
+submitted relative to message send operations, including tagged sends.
+If not set, RMA and atomic writes may be transmitted ahead of sends.
+.RS
+.RE
+.TP
+.B \f[I]FI_ORDER_WAW\f[]
+Write after write.
+If set, RMA and atomic write operations are transmitted in the order
+submitted relative to other RMA and atomic write operations.
+If not set, RMA and atomic writes may be transmitted out of order from
+their submission.
.RS
.RE
.SS comp_order \- Completion Ordering
@@ -1267,7 +1308,7 @@ Relaxed completion order may enable faster reporting of completed
transfers, allow acknowledgments to be sent over different fabric paths,
and support more sophisticated retry mechanisms.
This can result in lower\-latency completions, particularly when using
-unconnected endpoints.
+connectionless endpoints.
Strict completion ordering may require that providers queue completed
operations or limit available optimizations.
.PP
@@ -1307,10 +1348,25 @@ inject operation (such as fi_inject) or may be used with the FI_INJECT
data transfer flag.
.SS size
.PP
-The size of the context.
-The size is specified as the minimum number of transmit operations that
-may be posted to the endpoint without the operation returning
-\-FI_EAGAIN.
+The size of the transmit context.
+The mapping of the size value to resources is provider specific, but it
+is directly related to the number of command entries allocated for the
+endpoint.
+A smaller size value consumes fewer hardware and software resources,
+while a larger size allows queuing more transmit requests.
+.PP
+While the size attribute guides the size of the underlying endpoint
+transmit queue, there is not necessarily a one\-to\-one mapping between
+a transmit operation and a queue entry.
+A single transmit operation may consume multiple queue entries; for
+example, one per scatter\-gather entry.
+Additionally, the size field is intended to guide the allocation of the
+endpoint\[aq]s transmit context.
+Specifically, for connectionless endpoints, there may be lower\-level
+queues used to track communication on a per peer basis.
+The sizes of any lower\-level queues may be significantly smaller
+than the endpoint\[aq]s transmit size, in order to reduce resource
+utilization.
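To make the transmit attribute discussion above concrete, here is a minimal C sketch of how an application might request a transmit context size and ordering through fi_getinfo hints and then check what the provider actually granted. The endpoint type, capability bits, API version, and the size value 256 are illustrative assumptions, not requirements of any provider.

    #include <stdio.h>
    #include <rdma/fabric.h>
    #include <rdma/fi_endpoint.h>

    /* Ask for a transmit context sized for at least 256 outstanding
     * operations with send-after-send and read-after-write ordering.
     * The provider may grant different values; check the output. */
    static struct fi_info *get_tx_sized_info(void)
    {
        struct fi_info *hints, *info = NULL;

        hints = fi_allocinfo();
        if (!hints)
            return NULL;

        hints->ep_attr->type = FI_EP_RDM;
        hints->caps = FI_MSG | FI_RMA;
        hints->tx_attr->size = 256;                     /* illustrative */
        hints->tx_attr->msg_order = FI_ORDER_SAS | FI_ORDER_RAW;

        if (fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, &info) == 0)
            printf("granted tx size %zu, iov_limit %zu\n",
                   info->tx_attr->size, info->tx_attr->iov_limit);

        fi_freeinfo(hints);
        return info;    /* caller releases with fi_freeinfo() */
    }

Providers are free to return a larger size or to clear ordering bits they do not require, so the returned attributes, not the hints, are authoritative.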
.SS iov_limit .PP This is the maximum number of IO vectors (scatter\-gather elements) that @@ -1327,6 +1383,75 @@ This limit applies to both the number of RMA IO vectors that may be specified when initiating an operation from the local endpoint, as well as the maximum number of IO vectors that may be carried in a single request from a remote endpoint. +.SS Traffic Class (tclass) +.PP +Traffic classes can be a differentiated services code point (DSCP) +value, one of the following defined labels, or a provider\-specific +definition. +If tclass is unset or set to FI_TC_UNSPEC, the endpoint will use the +default traffic class associated with the domain. +.TP +.B \f[I]FI_TC_BEST_EFFORT\f[] +This is the default in the absence of any other local or fabric +configuration. +This class carries the traffic for a number of applications executing +concurrently over the same network infrastructure. +Even though it is shared, network capacity and resource allocation are +distributed fairly across the applications. +.RS +.RE +.TP +.B \f[I]FI_TC_BULK_DATA\f[] +This class is intended for large data transfers associated with I/O and +is present to separate sustained I/O transfers from other application +inter\-process communications. +.RS +.RE +.TP +.B \f[I]FI_TC_DEDICATED_ACCESS\f[] +This class operates at the highest priority, except the management +class. +It carries a high bandwidth allocation, minimum latency targets, and the +highest scheduling and arbitration priority. +.RS +.RE +.TP +.B \f[I]FI_TC_LOW_LATENCY\f[] +This class supports low latency, low jitter data patterns typically +caused by transactional data exchanges, barrier synchronizations, and +collective operations that are typical of HPC applications. +This class often requires maximum tolerable latencies that data +transfers must achieve for correct or performance operations. +Fulfillment of such requests in this class will typically require +accompanying bandwidth and message size limitations so as not to consume +excessive bandwidth at high priority. +.RS +.RE +.TP +.B \f[I]FI_TC_NETWORK_CTRL\f[] +This class is intended for traffic directly related to fabric (network) +management, which is critical to the correct operation of the network. +Its use is typically restricted to privileged network management +applications. +.RS +.RE +.TP +.B \f[I]FI_TC_SCAVENGER\f[] +This class is used for data that is desired but does not have strict +delivery requirements, such as in\-band network or application level +monitoring data. +Use of this class indicates that the traffic is considered lower +priority and should not interfere with higher priority workflows. +.RS +.RE +.TP +.B \f[I]fi_tc_dscp_set / fi_tc_dscp_get\f[] +DSCP values are supported via the DSCP get and set functions. +The definitions for DSCP values are outside the scope of libfabric. +See the fi_tc_dscp_set and fi_tc_dscp_get function definitions for +details on their use. +.RS +.RE .SH RECEIVE CONTEXT ATTRIBUTES .PP Attributes specific to the receive capabilities of an endpoint are @@ -1352,8 +1477,23 @@ The requested capabilities of the context. The capabilities must be a subset of those requested of the associated endpoint. See the CAPABILITIES section if fi_getinfo(3) for capability details. -If the caps field is 0 on input to fi_getinfo(3), the caps value from -the fi_info structure will be used. +If the caps field is 0 on input to fi_getinfo(3), the applicable +capability bits from the fi_info structure will be used. 
+.PP
+The following capabilities apply to the receive attributes: FI_MSG,
+FI_RMA, FI_TAGGED, FI_ATOMIC, FI_REMOTE_READ, FI_REMOTE_WRITE, FI_RECV,
+FI_HMEM, FI_TRIGGER, FI_RMA_PMEM, FI_DIRECTED_RECV, FI_VARIABLE_MSG,
+FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, FI_SOURCE_ERR, and
+FI_COLLECTIVE.
+.PP
+Many applications will be able to ignore this field and rely solely on
+the fi_info::caps field.
+Use of this field provides fine grained control over the receive
+capabilities associated with an endpoint.
+It is useful when handling scalable endpoints, with multiple receive
+contexts, for example, and allows configuring a specific receive context
+with fewer capabilities than that supported by the endpoint or other
+receive contexts.
.SS mode
.PP
The operational mode bits of the context.
@@ -1392,6 +1532,14 @@ FI_ORDER_ATOMIC_WAW.
For a description of completion ordering, see the comp_order field in
the \f[I]Transmit Context Attribute\f[] section.
.TP
+.B \f[I]FI_ORDER_DATA\f[]
+When set, this bit indicates that received data is written into memory
+in order.
+Data ordering applies to memory accessed as part of a single operation
+and between operations if message ordering is guaranteed.
+.RS
+.RE
+.TP
.B \f[I]FI_ORDER_NONE\f[]
No ordering is defined for completed operations.
Receive operations may complete in any order, regardless of their
@@ -1404,14 +1552,6 @@ Receive operations complete in the order in which they are processed by
the receive context, based on the receive side msg_order attribute.
.RS
.RE
-.TP
-.B \f[I]FI_ORDER_DATA\f[]
-When set, this bit indicates that received data is written into memory
-in order.
-Data ordering applies to memory accessed as part of a single operation
-and between operations if message ordering is guaranteed.
-.RS
-.RE
.SS total_buffered_recv
.PP
This field is supported for backwards compatibility purposes.
@@ -1435,10 +1575,25 @@ anticipate receiving unexpected messages, rather than modifying this
value.
.SS size
.PP
-The size of the context.
-The size is specified as the minimum number of receive operations that
-may be posted to the endpoint without the operation returning
-\-FI_EAGAIN.
+The size of the receive context.
+The mapping of the size value to resources is provider specific, but it
+is directly related to the number of command entries allocated for the
+endpoint.
+A smaller size value consumes fewer hardware and software resources,
+while a larger size allows queuing more receive requests.
+.PP
+While the size attribute guides the size of the underlying endpoint
+receive queue, there is not necessarily a one\-to\-one mapping between
+a receive operation and a queue entry.
+A single receive operation may consume multiple queue entries; for
+example, one per scatter\-gather entry.
+Additionally, the size field is intended to guide the allocation of the
+endpoint\[aq]s receive context.
+Specifically, for connectionless endpoints, there may be lower\-level
+queues used to track communication on a per peer basis.
+The sizes of any lower\-level queues may be significantly smaller
+than the endpoint\[aq]s receive size, in order to reduce resource
+utilization.
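Building on the receive attribute description above, the following is a small hypothetical sketch of narrowing one receive context of a scalable endpoint to a subset of the endpoint's capabilities. The capability subset, the context index, and the size value are assumptions chosen only for illustration.

    #include <rdma/fabric.h>
    #include <rdma/fi_endpoint.h>

    /* Open receive context 0 of a scalable endpoint with tagged-receive
     * capabilities only and a smaller queue than the endpoint default.
     * Return codes should be checked in real code. */
    static int open_narrow_rx_ctx(struct fid_ep *sep, struct fi_info *info,
                                  struct fid_ep **rx_ctx)
    {
        struct fi_rx_attr rx_attr = *info->rx_attr;

        rx_attr.caps = FI_TAGGED | FI_RECV;   /* subset of endpoint caps */
        rx_attr.size = 128;                   /* smaller than the default */

        return fi_rx_context(sep, 0, &rx_attr, rx_ctx, NULL);
    }

A context configured this way may consume fewer hardware and software resources than one that inherits the full endpoint capability set.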
.TP +.B \f[I]FI_COMMIT_COMPLETE\f[] +Indicates that a completion should not be generated (locally or at the +peer) until the result of an operation have been made persistent. +See \f[C]fi_cq\f[](3) for additional details on completion semantics. +.RS +.RE +.TP +.B \f[I]FI_COMPLETION\f[] +Indicates that a completion queue entry should be written for data +transfer operations. +This flag only applies to operations issued on an endpoint that was +bound to a completion queue with the FI_SELECTIVE_COMPLETION flag set, +otherwise, it is ignored. +See the fi_ep_bind section above for more detail. +.RS +.RE +.TP +.B \f[I]FI_DELIVERY_COMPLETE\f[] +Indicates that a completion should be generated when the operation has +been processed by the destination endpoint(s). +See \f[C]fi_cq\f[](3) for additional details on completion semantics. +.RS +.RE +.TP .B \f[I]FI_INJECT\f[] Indicates that all outbound data buffers should be returned to the user\[aq]s control immediately after a data transfer call returns, even @@ -1646,6 +1825,21 @@ This limit is indicated using inject_size (see inject_size above). .RS .RE .TP +.B \f[I]FI_INJECT_COMPLETE\f[] +Indicates that a completion should be generated when the source +buffer(s) may be reused. +See \f[C]fi_cq\f[](3) for additional details on completion semantics. +.RS +.RE +.TP +.B \f[I]FI_MULTICAST\f[] +Indicates that data transfers will target multicast addresses by +default. +Any fi_addr_t passed into a data transfer operation will be treated as a +multicast address. +.RS +.RE +.TP .B \f[I]FI_MULTI_RECV\f[] Applies to posted receive operations. This flag allows the user to post a single buffer that will receive @@ -1661,51 +1855,12 @@ space falls below the specified minimum (see FI_OPT_MIN_MULTI_RECV). .RS .RE .TP -.B \f[I]FI_COMPLETION\f[] -Indicates that a completion queue entry should be written for data -transfer operations. -This flag only applies to operations issued on an endpoint that was -bound to a completion queue with the FI_SELECTIVE_COMPLETION flag set, -otherwise, it is ignored. -See the fi_ep_bind section above for more detail. -.RS -.RE -.TP -.B \f[I]FI_INJECT_COMPLETE\f[] -Indicates that a completion should be generated when the source -buffer(s) may be reused. -See \f[C]fi_cq\f[](3) for additional details on completion semantics. -.RS -.RE -.TP .B \f[I]FI_TRANSMIT_COMPLETE\f[] Indicates that a completion should be generated when the transmit operation has completed relative to the local provider. See \f[C]fi_cq\f[](3) for additional details on completion semantics. .RS .RE -.TP -.B \f[I]FI_DELIVERY_COMPLETE\f[] -Indicates that a completion should be generated when the operation has -been processed by the destination endpoint(s). -See \f[C]fi_cq\f[](3) for additional details on completion semantics. -.RS -.RE -.TP -.B \f[I]FI_COMMIT_COMPLETE\f[] -Indicates that a completion should not be generated (locally or at the -peer) until the result of an operation have been made persistent. -See \f[C]fi_cq\f[](3) for additional details on completion semantics. -.RS -.RE -.TP -.B \f[I]FI_MULTICAST\f[] -Indicates that data transfers will target multicast addresses by -default. -Any fi_addr_t passed into a data transfer operation will be treated as a -multicast address. 
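As a sketch of the selective completion behavior described above (FI_SELECTIVE_COMPLETION at bind time, FI_COMPLETION per operation), the following hypothetical C fragment suppresses transmit completions by default and requests one only when the caller asks for it. Endpoint setup and error handling are omitted, and the helper names are invented for this example.

    #include <sys/uio.h>
    #include <rdma/fabric.h>
    #include <rdma/fi_endpoint.h>
    #include <rdma/fi_cq.h>

    /* Setup: bind the CQ so that successful transmits do not generate
     * completions unless FI_COMPLETION is passed with the operation. */
    static int bind_tx_cq_selective(struct fid_ep *ep, struct fid_cq *cq)
    {
        return fi_ep_bind(ep, &cq->fid, FI_TRANSMIT | FI_SELECTIVE_COMPLETION);
    }

    /* Send one buffer; generate a CQ entry only if want_completion is set. */
    static ssize_t send_maybe_complete(struct fid_ep *ep, void *buf, size_t len,
                                       void *desc, fi_addr_t dest, void *ctx,
                                       int want_completion)
    {
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        struct fi_msg msg = {
            .msg_iov = &iov,
            .desc = &desc,
            .iov_count = 1,
            .addr = dest,
            .context = ctx,
            .data = 0,
        };

        return fi_sendmsg(ep, &msg, want_completion ? FI_COMPLETION : 0);
    }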
-.RS -.RE .SH NOTES .PP Users should call fi_close to release all resources allocated to the diff --git a/man/man3/fi_eq.3 b/man/man3/fi_eq.3 index d4d56758844..47b1bc2880a 100644 --- a/man/man3/fi_eq.3 +++ b/man/man3/fi_eq.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_eq" "3" "2019\-02\-19" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_eq" "3" "2019\-12\-13" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -241,10 +241,10 @@ wait object. .RS .RE .TP -.B \- \f[I]FI_WAIT_CRITSEC_COND\f[] -Windows specific. -Specifies that the EQ should use a critical section and condition -variable as a wait object. +.B \- \f[I]FI_WAIT_YIELD\f[] +Indicates that the EQ will wait without a wait object but instead yield +on every wait. +Allows usage of fi_eq_sread through a spin. .RS .RE .TP diff --git a/man/man3/fi_fabric.3 b/man/man3/fi_fabric.3 index 18d4ceb91e1..40de16cc8e8 100644 --- a/man/man3/fi_fabric.3 +++ b/man/man3/fi_fabric.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_fabric" "3" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_fabric" "3" "2020\-10\-20" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -11,7 +11,7 @@ Open / close a fabric domain .RS .RE .TP -.B fi_tostr +.B fi_tostr / fi_tostr_r Convert fabric attributes, flags, and capabilities to printable string .RS .RE @@ -27,6 +27,9 @@ int\ fi_fabric(struct\ fi_fabric_attr\ *attr, int\ fi_close(struct\ fid\ *fabric); char\ *\ fi_tostr(const\ void\ *data,\ enum\ fi_type\ datatype); + +char\ *\ fi_tostr(char\ *buf,\ size_t\ len,\ const\ void\ *data, +\ \ \ \ enum\ fi_type\ datatype); \f[] .fi .SH ARGUMENTS @@ -46,6 +49,27 @@ User specified context associated with the opened object. This context is returned as part of any associated asynchronous event. .RS .RE +.TP +.B \f[I]buf\f[] +Output buffer to write string. +.RS +.RE +.TP +.B \f[I]len\f[] +Size in bytes of memory referenced by buf. +.RS +.RE +.TP +.B \f[I]data\f[] +Input data to convert into a string. +The format of data is determined by the datatype parameter. +.RS +.RE +.TP +.B \f[I]datatype\f[] +Indicates the data to convert to a printable string. +.RS +.RE .SH DESCRIPTION .PP A fabric domain represents a collection of hardware and software @@ -65,7 +89,7 @@ The fi_close call is used to release all resources associated with a fabric domain or interface. All items associated with the opened fabric must be released prior to calling fi_close. -.SS fi_tostr +.SS fi_tostr / fi_tostr_r .PP Converts fabric interface attributes, capabilities, flags, and enum values into a printable string. @@ -197,11 +221,21 @@ enum fi_op_type struct fid * .RS .RE +.TP +.B \f[I]FI_TYPE_HMEM_IFACE\f[] +enum fi_hmem_iface * +.RS +.RE .PP fi_tostr() will return a pointer to an internal libfabric buffer that should not be modified, and will be overwritten the next time fi_tostr() is invoked. fi_tostr() is not thread safe. +.PP +The fi_tostr_r() function is a re\-entrant and thread safe version of +fi_tostr(). +It writes the string into a buffer provided by the caller. +fi_tostr_r() returns the start of the caller\[aq]s buffer. .SH NOTES .PP The following resources are associated with fabric domains: access @@ -230,6 +264,10 @@ On output from fi_getinfo, if no fabric was specified, but the user has an opened instance of the named fabric, this will reference the first opened instance. If no instance has been opened, this field will be NULL. 
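Tying back to the fi_tostr_r description above, here is a small hypothetical example that prints the capability bits of each returned fi_info entry through the re-entrant variant; the buffer size is an arbitrary assumption.

    #include <stdio.h>
    #include <rdma/fabric.h>

    /* Print provider name and capabilities for every fi_info entry using
     * fi_tostr_r(), which writes into a caller-supplied buffer and is
     * therefore safe to call from multiple threads. */
    static void print_caps(const struct fi_info *info)
    {
        char buf[1024];

        for (; info; info = info->next)
            printf("%s: %s\n", info->fabric_attr->prov_name,
                   fi_tostr_r(buf, sizeof(buf), &info->caps, FI_TYPE_CAPS));
    }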
+.PP +The fabric instance returned by fi_getinfo should only be considered +valid if the application does not close any fabric instances from +another thread while fi_getinfo is being processed. .SS name .PP A fabric identifier. diff --git a/man/man3/fi_getinfo.3 b/man/man3/fi_getinfo.3 index 5a1e6bd2be9..20698a66503 100644 --- a/man/man3/fi_getinfo.3 +++ b/man/man3/fi_getinfo.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_getinfo" "3" "2019\-02\-04" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_getinfo" "3" "2021\-02\-10" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -306,7 +306,7 @@ Optional attributes related to the hardware NIC associated with the specified fabric, domain, and endpoint data. This field is only valid for providers where the corresponding attributes are closely associated with a hardware NIC. -See [\f[C]fi_nic\f[](3)] (fi_nic.3.html) for details. +See \f[C]fi_nic\f[](3) for details. .RS .RE .SH CAPABILITIES @@ -325,48 +325,6 @@ Applications may use this feature to request a minimal set of requirements, then check the returned capabilities to enable additional optimizations. .TP -.B \f[I]FI_MSG\f[] -Specifies that an endpoint should support sending and receiving messages -or datagrams. -Message capabilities imply support for send and/or receive queues. -Endpoints supporting this capability support operations defined by -struct fi_ops_msg. -.RS -.RE -.PP -The caps may be used to specify or restrict the type of messaging -operations that are supported. -In the absence of any relevant flags, FI_MSG implies the ability to send -and receive messages. -Applications can use the FI_SEND and FI_RECV flags to optimize an -endpoint as send\-only or receive\-only. -.TP -.B \f[I]FI_RMA\f[] -Specifies that the endpoint should support RMA read and write -operations. -Endpoints supporting this capability support operations defined by -struct fi_ops_rma. -In the absence of any relevant flags, FI_RMA implies the ability to -initiate and be the target of remote memory reads and writes. -Applications can use the FI_READ, FI_WRITE, FI_REMOTE_READ, and -FI_REMOTE_WRITE flags to restrict the types of RMA operations supported -by an endpoint. -.RS -.RE -.TP -.B \f[I]FI_TAGGED\f[] -Specifies that the endpoint should handle tagged message transfers. -Tagged message transfers associate a user\-specified key or tag with -each message that is used for matching purposes at the remote side. -Endpoints supporting this capability support operations defined by -struct fi_ops_tagged. -In the absence of any relevant flags, FI_TAGGED implies the ability to -send and receive tagged messages. -Applications can use the FI_SEND and FI_RECV flags to optimize an -endpoint as send\-only or receive\-only. -.RS -.RE -.TP .B \f[I]FI_ATOMIC\f[] Specifies that the endpoint supports some set of atomic operations. Endpoints supporting this capability support operations defined by @@ -379,18 +337,10 @@ supported by an endpoint. .RS .RE .TP -.B \f[I]FI_MULTICAST\f[] -Indicates that the endpoint support multicast data transfers. -This capability must be paired with at least one other data transfer -capability, (e.g. -FI_MSG, FI_SEND, FI_RECV, ...). -.RS -.RE -.TP -.B \f[I]FI_NAMED_RX_CTX\f[] -Requests that endpoints which support multiple receive contexts allow an -initiator to target (or name) a specific receive context as part of a -data transfer operation. +.B \f[I]FI_COLLECTIVE\f[] +Requests support for collective operations. 
+Endpoints that support this capability support the collective operations +defined in \f[C]fi_collective\f[](3). .RS .RE .TP @@ -402,22 +352,71 @@ tagged receive operations is ignored. .RS .RE .TP +.B \f[I]FI_FENCE\f[] +Indicates that the endpoint support the FI_FENCE flag on data transfer +operations. +Support requires tracking that all previous transmit requests to a +specified remote endpoint complete prior to initiating the fenced +operation. +Fenced operations are often used to enforce ordering between operations +that are not otherwise guaranteed by the underlying provider or +protocol. +.RS +.RE +.TP +.B \f[I]FI_HMEM\f[] +Specifies that the endpoint should support transfers to and from device +memory. +.RS +.RE +.TP +.B \f[I]FI_LOCAL_COMM\f[] +Indicates that the endpoint support host local communication. +This flag may be used in conjunction with FI_REMOTE_COMM to indicate +that local and remote communication are required. +If neither FI_LOCAL_COMM or FI_REMOTE_COMM are specified, then the +provider will indicate support for the configuration that minimally +affects performance. +Providers that set FI_LOCAL_COMM but not FI_REMOTE_COMM, for example a +shared memory provider, may only be used to communication between +processes on the same system. +.RS +.RE +.TP +.B \f[I]FI_MSG\f[] +Specifies that an endpoint should support sending and receiving messages +or datagrams. +Message capabilities imply support for send and/or receive queues. +Endpoints supporting this capability support operations defined by +struct fi_ops_msg. +.RS +.RE +.PP +The caps may be used to specify or restrict the type of messaging +operations that are supported. +In the absence of any relevant flags, FI_MSG implies the ability to send +and receive messages. +Applications can use the FI_SEND and FI_RECV flags to optimize an +endpoint as send\-only or receive\-only. +.TP +.B \f[I]FI_MULTICAST\f[] +Indicates that the endpoint support multicast data transfers. +This capability must be paired with FI_MSG. +Applications can use FI_SEND and FI_RECV to optimize multicast as +send\-only or receive\-only. +.RS +.RE +.TP .B \f[I]FI_MULTI_RECV\f[] Specifies that the endpoint must support the FI_MULTI_RECV flag when posting receive buffers. .RS .RE .TP -.B \f[I]FI_SOURCE\f[] -Requests that the endpoint return source addressing data as part of its -completion data. -This capability only applies to connectionless endpoints. -Note that returning source address information may require that the -provider perform address translation and/or look\-up based on data -available in the underlying protocol in order to provide the requested -data, which may adversely affect performance. -The performance impact may be greater for address vectors of type -FI_AV_TABLE. +.B \f[I]FI_NAMED_RX_CTX\f[] +Requests that endpoints which support multiple receive contexts allow an +initiator to target (or name) a specific receive context as part of a +data transfer operation. .RS .RE .TP @@ -428,21 +427,6 @@ This flag requires that FI_RMA and/or FI_ATOMIC be set. .RS .RE .TP -.B \f[I]FI_WRITE\f[] -Indicates that the user requires an endpoint capable of initiating -writes against remote memory regions. -This flag requires that FI_RMA and/or FI_ATOMIC be set. -.RS -.RE -.TP -.B \f[I]FI_SEND\f[] -Indicates that the user requires an endpoint capable of sending message -data transfers. -Message transfers include base message operations as well as tagged -message functionality. 
-.RS -.RE -.TP .B \f[I]FI_RECV\f[] Indicates that the user requires an endpoint capable of receiving message data transfers. @@ -451,6 +435,16 @@ message functionality. .RS .RE .TP +.B \f[I]FI_REMOTE_COMM\f[] +Indicates that the endpoint support communication with endpoints located +at remote nodes (across the fabric). +See FI_LOCAL_COMM for additional details. +Providers that set FI_REMOTE_COMM but not FI_LOCAL_COMM, for example +NICs that lack loopback support, cannot be used to communicate with +processes on the same system. +.RS +.RE +.TP .B \f[I]FI_REMOTE_READ\f[] Indicates that the user requires an endpoint capable of receiving read memory operations from remote endpoints. @@ -465,6 +459,19 @@ This flag requires that FI_RMA and/or FI_ATOMIC be set. .RS .RE .TP +.B \f[I]FI_RMA\f[] +Specifies that the endpoint should support RMA read and write +operations. +Endpoints supporting this capability support operations defined by +struct fi_ops_rma. +In the absence of any relevant flags, FI_RMA implies the ability to +initiate and be the target of remote memory reads and writes. +Applications can use the FI_READ, FI_WRITE, FI_REMOTE_READ, and +FI_REMOTE_WRITE flags to restrict the types of RMA operations supported +by an endpoint. +.RS +.RE +.TP .B \f[I]FI_RMA_EVENT\f[] Requests that an endpoint support the generation of completion events when it is the target of an RMA and/or atomic operation. @@ -473,51 +480,41 @@ on the endpoint. .RS .RE .TP -.B \f[I]FI_SHARED_AV\f[] -Requests or indicates support for address vectors which may be shared -among multiple processes. -.RS -.RE -.TP -.B \f[I]FI_TRIGGER\f[] -Indicates that the endpoint should support triggered operations. -Endpoints support this capability must meet the usage model as described -by fi_trigger.3. +.B \f[I]FI_RMA_PMEM\f[] +Indicates that the provider is \[aq]persistent memory aware\[aq] and +supports RMA operations to and from persistent memory. +Persistent memory aware providers must support registration of memory +that is backed by non\- volatile memory, RMA transfers to/from +persistent memory, and enhanced completion semantics. +This flag requires that FI_RMA be set. +This capability is experimental. .RS .RE .TP -.B \f[I]FI_FENCE\f[] -Indicates that the endpoint support the FI_FENCE flag on data transfer -operations. -Support requires tracking that all previous transmit requests to a -specified remote endpoint complete prior to initiating the fenced -operation. -Fenced operations are often used to enforce ordering between operations -that are not otherwise guaranteed by the underlying provider or -protocol. +.B \f[I]FI_SEND\f[] +Indicates that the user requires an endpoint capable of sending message +data transfers. +Message transfers include base message operations as well as tagged +message functionality. .RS .RE .TP -.B \f[I]FI_LOCAL_COMM\f[] -Indicates that the endpoint support host local communication. -This flag may be used in conjunction with FI_REMOTE_COMM to indicate -that local and remote communication are required. -If neither FI_LOCAL_COMM or FI_REMOTE_COMM are specified, then the -provider will indicate support for the configuration that minimally -affects performance. -Providers that set FI_LOCAL_COMM but not FI_REMOTE_COMM, for example a -shared memory provider, may only be used to communication between -processes on the same system. +.B \f[I]FI_SHARED_AV\f[] +Requests or indicates support for address vectors which may be shared +among multiple processes. 
.RS .RE .TP -.B \f[I]FI_REMOTE_COMM\f[] -Indicates that the endpoint support communication with endpoints located -at remote nodes (across the fabric). -See FI_LOCAL_COMM for additional details. -Providers that set FI_REMOTE_COMM but not FI_LOCAL_COMM, for example -NICs that lack loopback support, cannot be used to communicate with -processes on the same system. +.B \f[I]FI_SOURCE\f[] +Requests that the endpoint return source addressing data as part of its +completion data. +This capability only applies to connectionless endpoints. +Note that returning source address information may require that the +provider perform address translation and/or look\-up based on data +available in the underlying protocol in order to provide the requested +data, which may adversely affect performance. +The performance impact may be greater for address vectors of type +FI_AV_TABLE. .RS .RE .TP @@ -532,14 +529,23 @@ vector, which may adversely affect performance. .RS .RE .TP -.B \f[I]FI_RMA_PMEM\f[] -Indicates that the provider is \[aq]persistent memory aware\[aq] and -supports RMA operations to and from persistent memory. -Persistent memory aware providers must support registration of memory -that is backed by non\- volatile memory, RMA transfers to/from -persistent memory, and enhanced completion semantics. -This flag requires that FI_RMA be set. -This capability is experimental. +.B \f[I]FI_TAGGED\f[] +Specifies that the endpoint should handle tagged message transfers. +Tagged message transfers associate a user\-specified key or tag with +each message that is used for matching purposes at the remote side. +Endpoints supporting this capability support operations defined by +struct fi_ops_tagged. +In the absence of any relevant flags, FI_TAGGED implies the ability to +send and receive tagged messages. +Applications can use the FI_SEND and FI_RECV flags to optimize an +endpoint as send\-only or receive\-only. +.RS +.RE +.TP +.B \f[I]FI_TRIGGER\f[] +Indicates that the endpoint should support triggered operations. +Endpoints support this capability must meet the usage model as described +by fi_trigger.3. .RS .RE .TP @@ -554,12 +560,25 @@ configurable size. This flag requires that FI_MSG and/or FI_TAGGED be set. .RS .RE +.TP +.B \f[I]FI_WRITE\f[] +Indicates that the user requires an endpoint capable of initiating +writes against remote memory regions. +This flag requires that FI_RMA and/or FI_ATOMIC be set. +.RS +.RE .PP -Capabilities may be grouped into two general categories: primary and -secondary. +Capabilities may be grouped into three general categories: primary, +secondary, and primary modifiers. Primary capabilities must explicitly be requested by an application, and a provider must enable support for only those primary capabilities which were selected. +Primary modifiers are used to limit a primary capability, such as +restricting an endpoint to being send\-only. +If no modifiers are specified for an applicable capability, all relevant +modifiers are assumed. +See above definitions for details. +.PP Secondary capabilities may optionally be requested by an application. If requested, a provider must support the capability or fail the fi_getinfo request (FI_ENODATA). @@ -567,8 +586,11 @@ A provider may optionally report non\-selected secondary capabilities if doing so would not compromise performance or security. 
.PP Primary capabilities: FI_MSG, FI_RMA, FI_TAGGED, FI_ATOMIC, -FI_MULTICAST, FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_READ, FI_WRITE, -FI_RECV, FI_SEND, FI_REMOTE_READ, FI_REMOTE_WRITE, and FI_VARIABLE_MSG. +FI_MULTICAST, FI_NAMED_RX_CTX, FI_DIRECTED_RECV, FI_VARIABLE_MSG, +FI_HMEM, FI_COLLECTIVE +.PP +Primary modifiers: FI_READ, FI_WRITE, FI_RECV, FI_SEND, FI_REMOTE_READ, +FI_REMOTE_WRITE .PP Secondary capabilities: FI_MULTI_RECV, FI_SOURCE, FI_RMA_EVENT, FI_SHARED_AV, FI_TRIGGER, FI_FENCE, FI_LOCAL_COMM, FI_REMOTE_COMM, @@ -593,6 +615,35 @@ The set of modes are listed below. If a NULL hints structure is provided, then the provider\[aq]s supported set of modes will be returned in the info structure(s). .TP +.B \f[I]FI_ASYNC_IOV\f[] +Applications can reference multiple data buffers as part of a single +operation through the use of IO vectors (SGEs). +Typically, the contents of an IO vector are copied by the provider into +an internal buffer area, or directly to the underlying hardware. +However, when a large number of IOV entries are supported, IOV buffering +may have a negative impact on performance and memory consumption. +The FI_ASYNC_IOV mode indicates that the application must provide the +buffering needed for the IO vectors. +When set, an application must not modify an IO vector of length > 1, +including any related memory descriptor array, until the associated +operation has completed. +.RS +.RE +.TP +.B \f[I]FI_BUFFERED_RECV\f[] +The buffered receive mode bit indicates that the provider owns the data +buffer(s) that are accessed by the networking layer for received +messages. +Typically, this implies that data must be copied from the provider +buffer into the application buffer. +Applications that can handle message processing from network allocated +data buffers can set this mode bit to avoid copies. +For full details on application requirements to support this mode, see +the \[aq]Buffered Receives\[aq] section in \f[C]fi_msg\f[](3). +This mode bit applies to FI_MSG and FI_TAGGED receive operations. +.RS +.RE +.TP .B \f[I]FI_CONTEXT\f[] Specifies that the provider requires that applications use struct fi_context as their per operation context parameter for operations that @@ -685,30 +736,6 @@ For scatter\-gather send/recv operations, the prefix buffer must be a contiguous region, though it may or may not be directly adjacent to the payload portion of the buffer. .TP -.B \f[I]FI_ASYNC_IOV\f[] -Applications can reference multiple data buffers as part of a single -operation through the use of IO vectors (SGEs). -Typically, the contents of an IO vector are copied by the provider into -an internal buffer area, or directly to the underlying hardware. -However, when a large number of IOV entries are supported, IOV buffering -may have a negative impact on performance and memory consumption. -The FI_ASYNC_IOV mode indicates that the application must provide the -buffering needed for the IO vectors. -When set, an application must not modify an IO vector of length > 1, -including any related memory descriptor array, until the associated -operation has completed. -.RS -.RE -.TP -.B \f[I]FI_RX_CQ_DATA\f[] -This mode bit only applies to data transfers that set FI_REMOTE_CQ_DATA. -When set, a data transfer that carries remote CQ data will consume a -receive buffer at the target. -This is true even for operations that would normally not consume posted -receive buffers, such as RMA write operations. 
-.RS -.RE -.TP .B \f[I]FI_NOTIFY_FLAGS_ONLY\f[] This bit indicates that general completion flags may not be set by the provider, and are not needed by the application. @@ -729,17 +756,12 @@ contexts that have the same set of capability flags. .RS .RE .TP -.B \f[I]FI_BUFFERED_RECV\f[] -The buffered receive mode bit indicates that the provider owns the data -buffer(s) that are accessed by the networking layer for received -messages. -Typically, this implies that data must be copied from the provider -buffer into the application buffer. -Applications that can handle message processing from network allocated -data buffers can set this mode bit to avoid copies. -For full details on application requirements to support this mode, see -the \[aq]Buffered Receives\[aq] section in \f[C]fi_msg\f[](3). -This mode bit applies to FI_MSG and FI_TAGGED receive operations. +.B \f[I]FI_RX_CQ_DATA\f[] +This mode bit only applies to data transfers that set FI_REMOTE_CQ_DATA. +When set, a data transfer that carries remote CQ data will consume a +receive buffer at the target. +This is true even for operations that would normally not consume posted +receive buffers, such as RMA write operations. .RS .RE .SH ADDRESSING FORMATS @@ -757,51 +779,38 @@ In some cases, a selected addressing format may need to be translated or mapped into an address which is native to the fabric. See \f[C]fi_av\f[](3). .TP -.B \f[I]FI_FORMAT_UNSPEC\f[] -FI_FORMAT_UNSPEC indicates that a provider specific address format -should be selected. -Provider specific addresses may be protocol specific or a vendor -proprietary format. -Applications that select FI_FORMAT_UNSPEC should be prepared to treat -returned addressing data as opaque. -FI_FORMAT_UNSPEC targets apps which make use of an out of band address -exchange. -Applications which use FI_FORMAT_UNSPEC may use fi_getname() to obtain a -provider specific address assigned to an allocated endpoint. +.B \f[I]FI_ADDR_BGQ\f[] +Address is an IBM proprietary format that is used with their Blue Gene Q +systems. .RS .RE .TP -.B \f[I]FI_SOCKADDR\f[] -Address is of type sockaddr. -The specific socket address format will be determined at run time by -interfaces examining the sa_family field. -.RS -.RE -.TP -.B \f[I]FI_SOCKADDR_IN\f[] -Address is of type sockaddr_in (IPv4). +.B \f[I]FI_ADDR_EFA\f[] +Address is an Amazon Elastic Fabric Adapter (EFA) proprietary format. .RS .RE .TP -.B \f[I]FI_SOCKADDR_IN6\f[] -Address is of type sockaddr_in6 (IPv6). +.B \f[I]FI_ADDR_GNI\f[] +Address is a Cray proprietary format that is used with their GNI +protocol. .RS .RE .TP -.B \f[I]FI_SOCKADDR_IB\f[] -Address is of type sockaddr_ib (defined in Linux kernel source) +.B \f[I]FI_ADDR_PSMX\f[] +Address is an Intel proprietary format used with their Performance +Scaled Messaging protocol. .RS .RE .TP -.B \f[I]FI_ADDR_PSMX\f[] -Address is an Intel proprietary format that is used with their PSMX -(extended performance scaled messaging) protocol. +.B \f[I]FI_ADDR_PSMX2\f[] +Address is an Intel proprietary format used with their Performance +Scaled Messaging protocol version 2. .RS .RE .TP -.B \f[I]FI_ADDR_GNI\f[] -Address is a Cray proprietary format that is used with their GNI -protocol. +.B \f[I]FI_ADDR_PSMX3\f[] +Address is an Intel proprietary format used with their Performance +Scaled Messaging protocol version 3. 
.RS .RE .TP @@ -825,6 +834,48 @@ fi_sockaddr://10.31.6.12:7471?qos=3 Since the string formatted address does not contain any provider information, the prov_name field of the fabric attribute structure should be used to filter by provider if necessary. +.TP +.B \f[I]FI_FORMAT_UNSPEC\f[] +FI_FORMAT_UNSPEC indicates that a provider specific address format +should be selected. +Provider specific addresses may be protocol specific or a vendor +proprietary format. +Applications that select FI_FORMAT_UNSPEC should be prepared to treat +returned addressing data as opaque. +FI_FORMAT_UNSPEC targets apps which make use of an out of band address +exchange. +Applications which use FI_FORMAT_UNSPEC may use fi_getname() to obtain a +provider specific address assigned to an allocated endpoint. +.RS +.RE +.TP +.B \f[I]FI_SOCKADDR\f[] +Address is of type sockaddr. +The specific socket address format will be determined at run time by +interfaces examining the sa_family field. +.RS +.RE +.TP +.B \f[I]FI_SOCKADDR_IB\f[] +Address is of type sockaddr_ib (defined in Linux kernel source) +.RS +.RE +.TP +.B \f[I]FI_SOCKADDR_IN\f[] +Address is of type sockaddr_in (IPv4). +.RS +.RE +.TP +.B \f[I]FI_SOCKADDR_IN6\f[] +Address is of type sockaddr_in6 (IPv6). +.RS +.RE +.TP +.B \f[I]FI_ADDR_PSMX\f[] +Address is an Intel proprietary format that is used with their PSMX +(extended performance scaled messaging) protocol. +.RS +.RE .SH FLAGS .PP The operation of the fi_getinfo call may be controlled through the use @@ -838,15 +889,6 @@ Use of this flag will suppress any lengthy name resolution protocol. .RS .RE .TP -.B \f[I]FI_SOURCE\f[] -Indicates that the node and service parameters specify the local source -address to associate with an endpoint. -If specified, either the node and/or service parameter must be -non\-NULL. -This flag is often used with passive endpoints. -.RS -.RE -.TP .B \f[I]FI_PROV_ATTR_ONLY\f[] Indicates that the caller is only querying for what providers are potentially available. @@ -858,6 +900,15 @@ The fabric_attr member will have the prov_name and prov_version values filled in. .RS .RE +.TP +.B \f[I]FI_SOURCE\f[] +Indicates that the node and service parameters specify the local source +address to associate with an endpoint. +If specified, either the node and/or service parameter must be +non\-NULL. +This flag is often used with passive endpoints. +.RS +.RE .SH RETURN VALUE .PP fi_getinfo() returns 0 on success. @@ -880,16 +931,16 @@ invalid. .RS .RE .TP -.B \f[I]FI_ENOMEM\f[] -Indicates that there was insufficient memory to complete the operation. -.RS -.RE -.TP .B \f[I]FI_ENODATA\f[] Indicates that no providers could be found which support the requested fabric information. .RS .RE +.TP +.B \f[I]FI_ENOMEM\f[] +Indicates that there was insufficient memory to complete the operation. +.RS +.RE .SH NOTES .PP If hints are provided, the operation will be controlled by the values diff --git a/man/man3/fi_mr.3 b/man/man3/fi_mr.3 index a228df9211e..c790188c4de 100644 --- a/man/man3/fi_mr.3 +++ b/man/man3/fi_mr.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_mr" "3" "2019\-05\-04" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_mr" "3" "2020\-10\-01" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -43,7 +43,8 @@ Releases a previously mapped raw memory region key. .RE .TP .B fi_mr_bind -Associate a registered memory region with a completion counter. +Associate a registered memory region with a completion counter or an +endpoint. 
.RS .RE .TP @@ -118,12 +119,13 @@ User specified context associated with the memory region. .RE .TP .B \f[I]buf\f[] -Memory buffer to register with the fabric hardware +Memory buffer to register with the fabric hardware. .RS .RE .TP .B \f[I]len\f[] -Length of memory buffer to register +Length of memory buffer to register. +Must be > 0. .RS .RE .TP @@ -172,33 +174,48 @@ Additionally, a fabric provider may require that data buffers be registered before being used in local transfers. Memory registration restrictions are controlled using a separate set of mode bits, specified through the domain attributes (mr_mode field). +Each mr_mode bit requires that an application take specific steps in +order to use memory buffers with libfabric interfaces. .PP The following apply to memory registration. .TP -.B \f[I]Scalable Memory Registration\f[] -By default, memory registration is considered scalable. -(For library versions 1.4 and earlier, this is indicated by setting -mr_mode to FI_MR_SCALABLE, with the fi_info mode bit FI_LOCAL_MR set to -0). -For versions 1.5 and later, scalable is implied by the lack of any -mr_mode bits being set. -The setting of mr_mode bits therefore adjusts application behavior as -described below. -Default, scalable registration has several properties. +.B \f[I]Default Memory Registration\f[] +If no mr_mode bits are set, the default behaviors describe below are +followed. +Historically, these defaults were collectively referred to as scalable +memory registration. +The default requirements are outlined below, followed by definitions of +how each mr_mode bit alters the definition. .RS .RE .PP -In scalable mode, registration occurs on memory address ranges. -Because registration refers to memory regions, versus data buffers, the -address ranges given for a registration request do not need to map to -data buffers allocated by the application at the time the registration -call is made. +Compatibility: For library versions 1.4 and earlier, this was indicated +by setting mr_mode to FI_MR_SCALABLE and the fi_info mode bit +FI_LOCAL_MR to 0. +FI_MR_SCALABLE and FI_LOCAL_MR were deprecated in libfabric version 1.5, +though they are supported for backwards compatibility purposes. +.PP +For security, memory registration is required for data buffers that are +accessed directly by a peer process. +For example, registration is required for RMA target buffers (read or +written to), and those accessed by atomic or collective operations. +.PP +By default, registration occurs on virtual address ranges. +Because registration refers to address ranges, rather than allocated +data buffers, the address ranges do not need to map to data buffers +allocated by the application at the time the registration call is made. That is, an application can register any range of addresses in their virtual address space, whether or not those addresses are backed by physical pages or have been allocated. .PP -The resulting memory regions are accessible by peers starting at a base -address of 0. +Note that physical pages must back addresses prior to the addresses +being accessed as part of a data transfer operation, or the data +transfers will fail. +Additionally, depending on the operation, this could result in the local +process receiving a segmentation fault for accessing invalid memory. +.PP +Once registered, the resulting memory regions are accessible by peers +starting at a base address of 0. That is, the target address that is specified is a byte offset into the registered region. 
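As an illustration of the registration model described above, the following hypothetical sketch registers a buffer as an RMA target and retrieves the key a peer would use. The access flags, the requested key of 0, and the out-of-band key exchange are assumptions made for the example.

    #include <rdma/fabric.h>
    #include <rdma/fi_domain.h>

    /* Register a buffer for remote read/write and expose its key.  With
     * default (offset-based) registration, peers address the region
     * starting at offset 0; if the provider requires FI_MR_VIRT_ADDR,
     * they use virtual addresses instead. */
    static int expose_rma_buffer(struct fid_domain *domain, void *buf,
                                 size_t len, struct fid_mr **mr,
                                 uint64_t *rkey)
    {
        int ret;

        ret = fi_mr_reg(domain, buf, len,
                        FI_REMOTE_READ | FI_REMOTE_WRITE,
                        0 /* offset */, 0 /* requested key */, 0 /* flags */,
                        mr, NULL);
        if (ret)
            return ret;

        *rkey = fi_mr_key(*mr);   /* send this key to peers out of band */
        return 0;
    }

If the provider reports FI_MR_PROV_KEY, the requested key is ignored and fi_mr_key() returns the provider-chosen value instead.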
.PP @@ -210,15 +227,35 @@ registered. This includes source buffers for all transmit operations \-\- sends, tagged sends, RMA, and atomics \-\- as well as buffers posted for receive and tagged receive operations. +.PP +Although the default memory registration behavior is convenient for +application developers, it is difficult to implement in hardware. +Attempts to hide the hardware requirements from the application often +results in significant and unacceptable impacts to performance. +The following mr_mode bits are provided as input into fi_getinfo. +If a provider requires the behavior defined for an mr_mode bit, it will +leave the bit set on output to fi_getinfo. +Otherwise, the provider can clear the bit to indicate that the behavior +is not needed. +.PP +By setting an mr_mode bit, the application has agreed to adjust its +behavior as indicated. +Importantly, applications that choose to support an mr_mode must be +prepared to handle the case where the mr_mode is not required. +A provider will clear an mr_mode bit if it is not needed. .TP .B \f[I]FI_MR_LOCAL\f[] When the FI_MR_LOCAL mode bit is set, applications must register all data buffers that will be accessed by the local hardware and provide a -valid mem_desc parameter into applicable data transfer operations. +valid desc parameter into applicable data transfer operations. When FI_MR_LOCAL is zero, applications are not required to register data buffers before using them for local operations (e.g. -send and receive data buffers), and the mem_desc parameter into data -transfer operations is ignored. +send and receive data buffers). +The desc parameter into data transfer operations will be ignored in this +case, unless otherwise required (e.g. +se FI_MR_HMEM). +It is recommended that applications pass in NULL for desc when not +required. .RS .RE .PP @@ -316,24 +353,45 @@ To enable the memory region, the application must call fi_mr_enable(). .RS .RE .TP +.B \f[I]FI_MR_HMEM\f[] +This mode bit is associated with the FI_HMEM capability. +If FI_MR_HMEM is set, the application must register buffers that were +allocated using a device call and provide a valid desc parameter into +applicable data transfer operations even if they are only used for local +operations (e.g. +send and receive data buffers). +Device memory must be registered using the fi_mr_regattr call, with the +iface and device fields filled out. +.RS +.RE +.PP +If FI_MR_HMEM is set, but FI_MR_LOCAL is unset, only device buffers must +be registered when used locally. +In this case, the desc parameter passed into data transfer operations +must either be valid or NULL. +Similarly, if FI_MR_LOCAL is set, but FI_MR_HMEM is not, the desc +parameter must either be valid or NULL. +.TP .B \f[I]Basic Memory Registration\f[] -Basic memory registration is indicated by the FI_MR_BASIC mr_mode bit. -FI_MR_BASIC is maintained for backwards compatibility (libfabric version -1.4 or earlier). -The behavior of basic registration is equivalent to setting the -following mr_mode bits to one: FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, and -FI_MR_PROV_KEY. -Additionally, providers that support basic registration usually required -FI_MR_LOCAL. -FI_MR_BASIC must either be set alone, or in conjunction with -FI_MR_LOCAL. -Other mr_mode bit pairings are invalid. +Basic memory registration was deprecated in libfabric version 1.5, but +is supported for backwards compatibility. +Basic memory registration is indicated by setting mr_mode equal to +FI_MR_BASIC. 
+FI_MR_BASIC must be set alone and not paired with mr_mode bits. Unlike other mr_mode bits, if FI_MR_BASIC is set on input to fi_getinfo(), it will not be cleared by the provider. -That is, setting FI_MR_BASIC to one requests basic registration. +That is, setting mr_mode equal to FI_MR_BASIC forces basic registration +if the provider supports it. .RS .RE .PP +The behavior of basic registration is equivalent to requiring the +following mr_mode bits: FI_MR_VIRT_ADDR, FI_MR_ALLOCATED, and +FI_MR_PROV_KEY. +Additionally, providers that support basic registration usually require +the (deprecated) fi_info mode bit FI_LOCAL_MR, which was incorporated +into the FI_MR_LOCAL mr_mode bit. +.PP The registrations functions \-\- fi_mr_reg, fi_mr_regv, and fi_mr_regattr \-\- are used to register one or more memory regions with fabric resources. @@ -546,6 +604,12 @@ struct\ fi_mr_attr\ { \ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ *context; \ \ \ \ size_t\ \ \ \ \ \ \ \ \ \ \ \ \ auth_key_size; \ \ \ \ uint8_t\ \ \ \ \ \ \ \ \ \ \ \ *auth_key; +\ \ \ \ enum\ fi_hmem_iface\ iface; +\ \ \ \ union\ { +\ \ \ \ \ \ \ \ uint64_t\ \ \ \ \ \ \ \ \ reserved; +\ \ \ \ \ \ \ \ int\ \ \ \ \ \ \ \ \ \ \ \ \ \ cuda; +\ \ \ \ \ \ \ \ int\ \ \ \ \ \ ze +\ \ \ \ }\ device; }; \f[] .fi @@ -652,6 +716,52 @@ The domain authorization key will be used if the auth_key_size provided is 0. This field is ignored unless the fabric is opened with API version 1.5 or greater. +.SS iface +.PP +Indicates the software interfaces used by the application to allocate +and manage the memory region. +This field is ignored unless the application has requested the FI_HMEM +capability. +.TP +.B \f[I]FI_HMEM_SYSTEM\f[] +Uses standard operating system calls and libraries, such as malloc, +calloc, realloc, mmap, and free. +.RS +.RE +.TP +.B \f[I]FI_HMEM_CUDA\f[] +Uses Nvidia CUDA interfaces such as cuMemAlloc, cuMemAllocHost, +cuMemAllocManaged, cuMemFree, cudaMalloc, cudaFree. +.RS +.RE +.TP +.B \f[I]FI_HMEM_ROCR\f[] +Uses AMD ROCR interfaces such as hsa_memory_allocate and +hsa_memory_free. +.RS +.RE +.TP +.B \f[I]FI_HMEM_ZE\f[] +Uses Intel L0 ZE interfaces such as zeDriverAllocSharedMem, +zeDriverFreeMem. +.RS +.RE +.SS device +.PP +Reserved 64 bits for device identifier if using non\-standard HMEM +interface. +This field is ignore unless the iface field is valid. +.TP +.B \f[I]cuda\f[] +For FI_HMEM_CUDA, this is equivalent to CUdevice (int). +.RS +.RE +.TP +.B \f[I]ze\f[] +For FI_HMEM_ZE, this is equivalent to the ze_device_handle_t index +(int). +.RS +.RE .SH NOTES .PP Direct access to an application\[aq]s memory by a remote peer requires @@ -730,7 +840,7 @@ Returned if the specified flags are not supported by the provider. Many hardware NICs accessed by libfabric require that data buffers be registered with the hardware while the hardware accesses it. This ensures that the virtual to physical address mappings for those -buffers do not change while the transfer is ocurring. +buffers do not change while the transfer is occurring. The performance impact of registering memory regions can be significant. As a result, some providers make use of a registration cache, particularly when working with applications that are unable to manage @@ -738,6 +848,12 @@ their own network buffers. A registration cache avoids the overhead of registering and unregistering a data buffer with each transfer. .PP +If a registration cache is going to be used for host and device memory, +the device must support unified virtual addressing. 
+If the device does not support unified virtual addressing, either an +additional registration cache is required to track this device memory, +or device memory cannot be cached. +.PP As a general rule, if hardware requires the FI_MR_LOCAL mode bit described above, but this is not supported by the application, a memory registration cache \f[I]may\f[] be in use. @@ -766,16 +882,39 @@ Setting this to zero will disable registration caching. .RS .RE .TP -.B \f[I]FI_MR_CACHE_MERGE_REGIONS\f[] -If this variable is set to true, yes, or 1, then memory regions that are -adjacent or overlapping will be merged into a single larger region. -Merging regions reduces the total cache size and the number of regions -managed by the cache. -However, merging regions can have a negative impact on performance if a -large number of adjacent regions are sent as separate data transfers -(such as sending elements of an array to peer(s)), and the larger region -is access infrequently. -By default merging regions is disabled. +.B \f[I]FI_MR_CACHE_MONITOR\f[] +The cache monitor is responsible for detecting system memory +(FI_HMEM_SYSTEM) changes made between the virtual addresses used by an +application and the underlying physical pages. +Valid monitor options are: userfaultfd, memhooks, and disabled. +Selecting disabled will turn off the registration cache. +Userfaultfd is a Linux kernel feature used to report virtual to physical +address mapping changes to user space. +Memhooks operates by intercepting relevant memory allocation and +deallocation calls which may result in the mappings changing, such as +malloc, mmap, free, etc. +Note that memhooks operates at the elf linker layer, and does not use +glibc memory hooks. +.RS +.RE +.TP +.B \f[I]FI_MR_CUDA_CACHE_MONITOR_ENABLED\f[] +The CUDA cache monitor is responsible for detecting CUDA device memory +(FI_HMEM_CUDA) changes made between the device virtual addresses used by +an application and the underlying device physical pages. +Valid monitor options are: 0 or 1. +Note that the CUDA memory monitor requires a CUDA toolkit version with +unified virtual addressing enabled. +.RS +.RE +.TP +.B \f[I]FI_MR_ROCR_CACHE_MONITOR_ENABLED\f[] +The ROCR cache monitor is responsible for detecting ROCR device memory +(FI_HMEM_ROCR) changes made between the device virtual addresses used by +an application and the underlying device physical pages. +Valid monitor options are: 0 or 1. +Note that the ROCR memory monitor requires a ROCR version with unified +virtual addressing enabled. .RS .RE .SH SEE ALSO diff --git a/man/man3/fi_msg.3 b/man/man3/fi_msg.3 index c4fa5fedb09..237c2a07e00 100644 --- a/man/man3/fi_msg.3 +++ b/man/man3/fi_msg.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_msg" "3" "2019\-02\-04" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_msg" "3" "2020\-10\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -11,11 +11,8 @@ Post a buffer to receive an incoming message .RS .RE .PP -fi_send / fi_sendv / fi_sendmsg -.PD 0 -.P -.PD -fi_inject / fi_senddata : Initiate an operation to send a message +fi_send / fi_sendv / fi_sendmsg fi_inject / fi_senddata : Initiate an +operation to send a message .SH SYNOPSIS .IP .nf @@ -79,7 +76,8 @@ Count of vectored data entries. .RE .TP .B \f[I]desc\f[] -Descriptor associated with the data buffer +Descriptor associated with the data buffer. +See \f[C]fi_mr\f[](3). .RS .RE .TP @@ -154,11 +152,6 @@ See fi_cq for completion event details. 
The call fi_send transfers the data contained in the user\-specified data buffer to a remote endpoint, with message boundaries being maintained. -For connection based endpoints (FI_EP_MSG) the local endpoint must be -connected to a remote endpoint or destination before fi_send is called. -Unless the endpoint has been configured differently, the data buffer -passed into fi_send must not be touched by the application until the -fi_send call completes asynchronously. .SS fi_sendv .PP The fi_sendv call adds support for a scatter\-gather list to fi_send. @@ -167,7 +160,7 @@ parameter to a remote endpoint as a single message. .SS fi_sendmsg .PP The fi_sendmsg call supports data transfers over both connected and -unconnected endpoints, with the ability to control the send operation +connectionless endpoints, with the ability to control the send operation per call through the use of flags. The fi_sendmsg function takes a \f[C]struct\ fi_msg\f[] as input. .IP @@ -216,7 +209,7 @@ order to match sends. Message boundaries are maintained. The order in which the receives complete is dependent on the endpoint type and protocol. -For unconnected endpoints, the src_addr parameter can be used to +For connectionless endpoints, the src_addr parameter can be used to indicate that a buffer should be posted to receive incoming data from a specific remote endpoint. .SS fi_recvv @@ -227,8 +220,8 @@ parameter to a receive incoming data. .SS fi_recvmsg .PP The fi_recvmsg call supports posting buffers over both connected and -unconnected endpoints, with the ability to control the receive operation -per call through the use of flags. +connectionless endpoints, with the ability to control the receive +operation per call through the use of flags. The fi_recvmsg function takes a struct fi_msg as input. .SH FLAGS .PP diff --git a/man/man3/fi_nic.3 b/man/man3/fi_nic.3 index 4f21557f08c..8e4bfc2a9a3 100644 --- a/man/man3/fi_nic.3 +++ b/man/man3/fi_nic.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_nic" "3" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_nic" "3" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -167,7 +167,7 @@ Ethernet or InfiniBand. .PP Provider attributes reference provider specific details of the device. These attributes are both provider and device specific. -The attributes can be interpretted by \f[C]fi_tostr\f[](3). +The attributes can be interpreted by \f[C]fi_tostr\f[](3). Applications may also use the other attribute fields, such as related fi_fabric_attr: prov_name field, to determine an appropriate structure to cast the attributes. diff --git a/man/man3/fi_poll.3 b/man/man3/fi_poll.3 index 30594f1428a..787c1395010 100644 --- a/man/man3/fi_poll.3 +++ b/man/man3/fi_poll.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_poll" "3" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_poll" "3" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -211,8 +211,8 @@ Wait sets are associated with specific wait object(s). Wait objects allow applications to block until the wait object is signaled, indicating that an event is available to be read. The following values may be used to specify the type of wait object -associated with a wait set: FI_WAIT_UNSPEC, FI_WAIT_FD, and -FI_WAIT_MUTEX_COND. +associated with a wait set: FI_WAIT_UNSPEC, FI_WAIT_FD, +FI_WAIT_MUTEX_COND, and FI_WAIT_YIELD. 
.RS .RE .TP @@ -228,15 +228,14 @@ the underlying wait object. .RE .TP .B \- \f[I]FI_WAIT_FD\f[] -Indicates that the wait set should use file descriptor(s) as its wait -mechanism. -It may not always be possible for a wait set to be implemented using a -single underlying file descriptor, but all wait objects will be file -descriptors. -File descriptor wait objects must be usable in the POSIX select(2), -poll(2), and epoll(7) routines (if available). -However, a provider may signal an FD wait object by marking it as -readable or with an error. +Indicates that the wait set should use a single file descriptor as its +wait mechanism, as exposed to the application. +Internally, this may require the use of epoll in order to support +waiting on a single file descriptor. +File descriptor wait objects must be usable in the POSIX select(2) and +poll(2) routines, and in Linux epoll(7) (if available). +Providers signal an FD wait object by marking it as readable or with an +error. .RS .RE .TP @@ -246,10 +245,22 @@ as a wait object. .RS .RE .TP -.B \- \f[I]FI_WAIT_CRITSEC_COND\f[] -Windows specific. -Specifies that the EQ should use a critical section and condition -variable as a wait object. +.B \- \f[I]FI_WAIT_POLLFD\f[] +This option is similar to FI_WAIT_FD, but allows the wait object to +consist of multiple file descriptors, as viewed by the +application. +The use of FI_WAIT_POLLFD can eliminate the need to use epoll to +hide multiple file descriptors behind a single one when waiting +for events. +The file descriptors must be usable in the POSIX select(2) and poll(2) +routines, and map directly to use with poll. +See the NOTES section below for details on using pollfd. +.RS +.RE +.TP +.B \- \f[I]FI_WAIT_YIELD\f[] +Indicates that the wait set will wait without a wait object but instead +yield on every wait. .RS .RE .TP @@ -329,24 +340,30 @@ queues and counters that may require processing. .SS fi_control .PP The fi_control call is used to access provider or implementation -specific details of the wait set. -Access to the wait set should be serialized across all calls when +specific details of fids that support blocking calls, such as wait +sets, completion queues, counters, and event queues. +Access to the wait set or fid should be serialized across all calls when fi_control is invoked, as it may redirect the implementation of wait set operations. -The following control commands are usable with a wait set. +The following control commands are usable with a wait set or fid. .TP .B \f[I]FI_GETWAIT (void **)\f[] This command allows the user to retrieve the low\-level wait object -associated with the wait set. +associated with a wait set or fid. The format of the wait set is specified during wait set creation, through the wait set attributes. The fi_control arg parameter should be an address where a pointer to the returned wait object will be written. -This should be an \[aq]int *\[aq] for FI_WAIT_FD, or \[aq]struct -fi_mutex_cond\[aq] for FI_WAIT_MUTEX_COND. -Support for FI_GETWAIT is provider specific and may fail if not -supported or if the wait set is implemented using more than one wait -object. +This should be an \[aq]int *\[aq] for FI_WAIT_FD, \[aq]struct +fi_mutex_cond\[aq] for FI_WAIT_MUTEX_COND, or \[aq]struct +fi_wait_pollfd\[aq] for FI_WAIT_POLLFD. +Support for FI_GETWAIT is provider specific. +.RS +.RE +.TP +.B \f[I]FI_GETWAITOBJ (enum fi_wait_obj *)\f[] +This command returns the type of wait object associated with a wait set +or fid. 
.RS .RE .SH RETURN VALUES @@ -362,6 +379,81 @@ written to the context array. .RS .RE .SH NOTES +.PP +In many situations, blocking calls may need to wait on signals sent to a +number of file descriptors. +For example, this is the case for socket based providers, such as tcp +and udp, as well as utility providers such as multi\-rail. +For simplicity, when epoll is available, it can be used to limit the +number of file descriptors that an application must monitor. +The use of epoll may also be required in order to support FI_WAIT_FD. +.PP +However, in order to support waiting on multiple file descriptors on +systems where epoll support is not available, or where epoll itself +may negatively impact performance, FI_WAIT_POLLFD provides an alternative +mechanism. +A significant difference between using POLLFD and FD wait objects is +that with FI_WAIT_POLLFD, the file descriptors may change dynamically. +As an example, the file descriptors associated with a completion +queue\[aq]s wait set may change as endpoint associations with the CQ are +added and removed. +.PP +Struct fi_wait_pollfd is used to retrieve all file descriptors for fids +using FI_WAIT_POLLFD to support blocking calls. +.IP +.nf +\f[C] +struct\ fi_wait_pollfd\ { +\ \ \ \ uint64_t\ \ \ \ \ \ change_index; +\ \ \ \ size_t\ \ \ \ \ \ \ \ nfds; +\ \ \ \ struct\ pollfd\ *fd; +}; +\f[] +.fi +.TP +.B \f[I]change_index\f[] +The change_index may be used to determine if there have been any changes +to the file descriptor list. +Anytime a file descriptor is added, removed, or its events are updated, +this field is incremented by the provider. +Applications wishing to wait on file descriptors directly should cache +the change_index value. +Before blocking on file descriptor events, the application should use +fi_control() to retrieve the current change_index and compare that +against its cached value. +If the values differ, then the application should update its file descriptor +list prior to blocking. +.RS +.RE +.TP +.B \f[I]nfds\f[] +On input to fi_control(), this indicates the number of entries in the +struct pollfd * array. +On output, this will be set to the number of entries needed to store the +current number of file descriptors. +If the input value is smaller than the output value, fi_control() will +return the error \-FI_ETOOSMALL. +Note that setting nfds = 0 allows an efficient way of checking the +change_index. +.RS +.RE +.TP +.B \f[I]fd\f[] +This points to an array of struct pollfd entries. +The number of entries is specified through the nfds field. +If the number of needed entries is less than or equal to the number of +entries available, the struct pollfd array will be filled out with a +list of file descriptors and corresponding events that can be used in +the select(2) and poll(2) calls. +.RS +.RE +.PP +The change_index is updated only when the file descriptors associated +with the pollfd file set have changed. +Checking the change_index is an additional step needed when working with +FI_WAIT_POLLFD wait objects directly. +The use of the fi_trywait() function is still required if accessing wait +objects directly. 
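+.PP
+A minimal sketch of the query sequence described above (the completion
+queue \f[C]cq\f[] is an illustrative assumption; any fid supporting
+FI_WAIT_POLLFD could be used, and error handling and includes are
+omitted):
+.IP
+.nf
+\f[C]
+struct fi_wait_pollfd wait;
+uint64_t cached_index;
+int ret;
+
+/* Query how many pollfd entries are currently needed (nfds = 0) */
+memset(&wait, 0, sizeof wait);
+ret = fi_control(&cq->fid, FI_GETWAIT, &wait);
+if (ret == -FI_ETOOSMALL) {
+    wait.fd = calloc(wait.nfds, sizeof(*wait.fd));
+    ret = fi_control(&cq->fid, FI_GETWAIT, &wait);
+}
+cached_index = wait.change_index;
+
+/* fi_trywait() must still succeed before blocking on the fds */
+poll(wait.fd, wait.nfds, -1);
+
+/* Later: re-query with nfds = 0 and compare change_index against
+   cached_index; refresh the fd list if they differ */
+\f[]
+.fi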
.SH SEE ALSO .PP \f[C]fi_getinfo\f[](3), \f[C]fi_domain\f[](3), \f[C]fi_cntr\f[](3), diff --git a/man/man3/fi_rma.3 b/man/man3/fi_rma.3 index 89049633f49..4988c9e83b6 100644 --- a/man/man3/fi_rma.3 +++ b/man/man3/fi_rma.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_rma" "3" "2019\-02\-04" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_rma" "3" "2020\-10\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -11,11 +11,8 @@ Initiates a read from remote memory .RS .RE .PP -fi_write / fi_writev / fi_writemsg -.PD 0 -.P -.PD -fi_inject_write / fi_writedata : Initiate a write to remote memory +fi_write / fi_writev / fi_writemsg fi_inject_write / fi_writedata : +Initiate a write to remote memory .SH SYNOPSIS .IP .nf @@ -97,7 +94,7 @@ Protection key associated with the remote memory. .RE .TP .B \f[I]desc\f[] -Descriptor associated with the local data buffer +Descriptor associated with the local data buffer See \f[C]fi_mr\f[](3). .RS .RE .TP @@ -171,11 +168,6 @@ remote endpoint, so that the immediate data may be delivered. .PP The call fi_write transfers the data contained in the user\-specified data buffer to a remote memory region. -The local endpoint must be connected to a remote endpoint or destination -before fi_write is called. -Unless the endpoint has been configured differently, the data buffer -passed into fi_write must not be touched by the application until the -fi_write call completes asynchronously. .SS fi_writev .PP The fi_writev call adds support for a scatter\-gather list to fi_write. @@ -184,8 +176,8 @@ parameter to the remote memory region. .SS fi_writemsg .PP The fi_writemsg call supports data transfers over both connected and -unconnected endpoints, with the ability to control the write operation -per call through the use of flags. +connectionless endpoints, with the ability to control the write +operation per call through the use of flags. The fi_writemsg function takes a struct fi_msg_rma as input. .IP .nf @@ -226,8 +218,6 @@ the transfer. .PP The fi_read call requests that the remote endpoint transfer data from the remote memory region into the local data buffer. -The local endpoint must be connected to a remote endpoint or destination -before fi_read is called. .SS fi_readv .PP The fi_readv call adds support for a scatter\-gather list to fi_read. @@ -236,7 +226,7 @@ of data buffers referenced by the iov parameter. .SS fi_readmsg .PP The fi_readmsg call supports data transfers over both connected and -unconnected endpoints, with the ability to control the read operation +connectionless endpoints, with the ability to control the read operation per call through the use of flags. The fi_readmsg function takes a struct fi_msg_rma as input. .SH FLAGS diff --git a/man/man3/fi_tagged.3 b/man/man3/fi_tagged.3 index a150b7e5e36..d865714c901 100644 --- a/man/man3/fi_tagged.3 +++ b/man/man3/fi_tagged.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_tagged" "3" "2019\-02\-04" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_tagged" "3" "2020\-10\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -90,7 +90,8 @@ Mask of bits to ignore applied to the tag for receive operations. .RE .TP .B \f[I]desc\f[] -Memory descriptor associated with the data buffer +Memory descriptor associated with the data buffer. +See \f[C]fi_mr\f[](3). .RS .RE .TP @@ -196,7 +197,7 @@ parameter to a remote endpoint as a single message. 
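+.PP
+For illustration only (the endpoint, registered memory regions,
+destination address, and tag below are assumptions, and error handling
+beyond the usual \-FI_EAGAIN retry is omitted), a two\-element gather
+send might look like:
+.IP
+.nf
+\f[C]
+struct iovec iov[2];
+void *desc[2] = { fi_mr_desc(mr_hdr), fi_mr_desc(mr_data) };
+struct fi_context ctx;
+ssize_t ret;
+
+iov[0].iov_base = hdr;  iov[0].iov_len = hdr_len;
+iov[1].iov_base = data; iov[1].iov_len = data_len;
+
+/* ctx must remain valid until the completion is read from the CQ */
+do {
+    ret = fi_tsendv(ep, iov, desc, 2, dest_addr, tag, &ctx);
+} while (ret == -FI_EAGAIN);
+\f[]
+.fi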
.SS fi_tsendmsg .PP The fi_tsendmsg call supports data transfers over both connected and -unconnected endpoints, with the ability to control the send operation +connectionless endpoints, with the ability to control the send operation per call through the use of flags. The fi_tsendmsg function takes a struct fi_msg_tagged as input. .IP @@ -246,8 +247,8 @@ parameter to a receive incoming data. .SS fi_trecvmsg .PP The fi_trecvmsg call supports posting buffers over both connected and -unconnected endpoints, with the ability to control the receive operation -per call through the use of flags. +connectionless endpoints, with the ability to control the receive +operation per call through the use of flags. The fi_trecvmsg function takes a struct fi_msg_tagged as input. .SH FLAGS .PP @@ -440,7 +441,7 @@ receiving endpoint. .PP For discussion purposes, the completion queue is assumed to be configured for FI_CQ_FORMAT_TAGGED. -The op_context field will point to a struct fi_recv_contex. +The op_context field will point to a struct fi_recv_context. .IP .nf \f[C] diff --git a/man/man3/fi_trigger.3 b/man/man3/fi_trigger.3 index 701569913b6..590b077c0b8 100644 --- a/man/man3/fi_trigger.3 +++ b/man/man3/fi_trigger.3 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_trigger" "3" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_trigger" "3" "2019\-09\-17" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -95,12 +95,10 @@ Threshold operations are triggered in the order of the threshold values. This is true even if the counter increments by a value greater than 1. If two triggered operations have the same threshold, they will be triggered in the order in which they were submitted to the endpoint. -.SH EXPERIMENTAL DEFERRED WORK QUEUES +.SH DEFERRED WORK QUEUES .PP The following feature and description are enhancements to triggered -operation support, but should be considered experimental. -Until the experimental tag is removed, the interfaces, semantics, and -data structures defined below may change between library versions. +operation support. .PP The deferred work queue interface is designed as primitive constructs that can be used to implement application\-level collective operations. diff --git a/man/man7/fabric.7 b/man/man7/fabric.7 index f1440627255..c4b812bfc72 100644 --- a/man/man7/fabric.7 +++ b/man/man7/fabric.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fabric" "7" "2019\-05\-04" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fabric" "7" "2020\-07\-21" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -383,6 +383,80 @@ may not be available in a child process because of copy on write restrictions. .RS .RE +.SH ABI CHANGES +.PP +libfabric releases maintain compatibility with older releases, so that +compiled applications can continue to work as\-is, and previously +written applications will compile against newer versions of the library +without needing source code changes. +The changes below describe ABI updates that have occurred and which +libfabric release corresponds to the changes. +.PP +Note that because most functions called by applications actually call +static inline functions, which in turn reference function pointers in +order to call directly into providers, libfabric only exports a handful +of functions directly. +ABI changes are limited to those functions, most notably the fi_getinfo +call and its returned attribute structures. 
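+.PP
+As a sketch of this compatibility contract (the requested capability is
+an arbitrary assumption), an application written against the 1.9 API
+typically passes that version to fi_getinfo explicitly, and newer
+libraries continue to honor it:
+.IP
+.nf
+\f[C]
+struct fi_info *hints, *info;
+int ret;
+
+hints = fi_allocinfo();
+hints->caps = FI_MSG;
+
+/* FI_VERSION(1, 9) declares the API version the application was built
+   against */
+ret = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, &info);
+
+fi_freeinfo(hints);
+/* ... use info, then fi_freeinfo(info) ... */
+\f[]
+.fi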
+.PP +The ABI version is independent from the libfabric release version. +.SS ABI 1.0 +.PP +The initial libfabric release (1.0.0) also corresponds to ABI version +1.0. +The 1.0 ABI was unchanged for libfabric major.minor versions 1.0, 1.1, +1.2, 1.3, and 1.4. +.SS ABI 1.1 +.PP +A number of external data structures were appended starting with +libfabric version 1.5. +These changes included adding the fields to the following data +structures. +The 1.1 ABI was exported by libfabric versions 1.5 and 1.6. +.TP +.B \f[I]fi_fabric_attr\f[] +Added api_version +.RS +.RE +.TP +.B \f[I]fi_domain_attr\f[] +Added cntr_cnt, mr_iov_limit, caps, mode, auth_key, auth_key_size, +max_err_data, and mr_cnt fields. +The mr_mode field was also changed from an enum to an integer flag +field. +.RS +.RE +.TP +.B \f[I]fi_ep_attr\f[] +Added auth_key_size and auth_key fields. +.RS +.RE +.SS ABI 1.2 +.PP +The 1.2 ABI version was exported by libfabric versions 1.7 and 1.8, and +expanded the following structure. +.TP +.B \f[I]fi_info\f[] +The fi_info structure was expanded to reference a new fabric object, +fid_nic. +When available, the fid_nic references a new set of attributes related +to network hardware details. +.RS +.RE +.SS ABI 1.3 +.PP +The 1.3 ABI is also the current ABI version. +All libfabric releases starting at 1.9 export this ABI. +.TP +.B \f[I]fi_domain_attr\f[] +Added tclass +.RS +.RE +.TP +.B \f[I]fi_tx_attr\f[] +Added tclass +.RS +.RE .SH SEE ALSO .PP \f[C]fi_info\f[](1), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3), diff --git a/man/man7/fi_efa.7 b/man/man7/fi_efa.7 index dc6b13b3de4..d5f3499eb17 100644 --- a/man/man7/fi_efa.7 +++ b/man/man7/fi_efa.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_efa" "7" "2019\-06\-15" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_efa" "7" "2020\-09\-01" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -70,8 +70,12 @@ memory registrations on the DGRAM endpoint. .RE .TP .B \f[I]Memory registration modes\f[] -The RDM endpoint does not require memory registration and the -\f[I]FI_EP_DGRAM\f[] endpoint only supports \f[I]FI_MR_LOCAL\f[]. +The RDM endpoint does not require memory registration for send and +receive operations, i.e. +it does not require \f[I]FI_MR_LOCAL\f[]. +Applications may specify \f[I]FI_MR_LOCAL\f[] in the MR mode flags in +order to use descriptors provided by the application. +The \f[I]FI_EP_DGRAM\f[] endpoint only supports \f[I]FI_MR_LOCAL\f[]. .RS .RE .TP @@ -92,7 +96,7 @@ the provider is not thread safe when using the DGRAM endpoint. .RE .SH LIMITATIONS .PP -The provider does not support \f[I]FI_ATOMIC\f[] interfaces. +The DGRAM endpoint does not support \f[I]FI_ATOMIC\f[] interfaces. For RMA operations, completion events for RMA targets (\f[I]FI_RMA_EVENT\f[]) is not supported. The DGRAM endpoint does not fully protect against resource overruns, so @@ -101,7 +105,7 @@ resource management is disabled for this endpoint .PP No support for selective completions. .PP -No support for counters. +No support for counters for the DGRAM endpoint. .PP No support for inject. .SH RUNTIME PARAMETERS @@ -165,12 +169,6 @@ When disabled, only uses a bounce buffer .RS .RE .TP -.B \f[I]FI_EFA_MR_CACHE_MERGE_REGIONS\f[] -Enables merging overlapping and adjacent memory registration regions. -Defaults to true. -.RS -.RE -.TP .B \f[I]FI_EFA_MR_MAX_CACHED_COUNT\f[] Sets the maximum number of memory registrations that can be cached at any time. 
@@ -217,6 +215,38 @@ Time interval (us) for the base timeout to use for exponential backoff to a peer after a receiver not ready error. .RS .RE +.TP +.B \f[I]FI_EFA_ENABLE_SHM_TRANSFER\f[] +Enables the SHM provider to handle communication across all intra\-node +processes. +SHM transfer will be disabled in the case where +\f[C]ptrace\ protection\f[] is turned on. +Turning ptrace protection off allows SHM transfer to be used. +.RS +.RE +.TP +.B \f[I]FI_EFA_SHM_AV_SIZE\f[] +Defines the maximum number of entries in the SHM provider\[aq]s address +vector. +.RS +.RE +.TP +.B \f[I]FI_EFA_SHM_MAX_MEDIUM_SIZE\f[] +Defines the switch point between small/medium messages and large +messages. +Messages larger than this switch point will be transferred with the +large message protocol. +.RS +.RE +.TP +.B \f[I]FI_EFA_INTER_MAX_MEDIUM_MESSAGE_SIZE\f[] +The maximum size for inter\-node EFA messages sent using the medium +message protocol. +Messages that fit in one packet will be sent as eager messages. +Messages smaller than this value will be sent using the +medium message protocol. +Other messages will be sent using the CTS\-based long message protocol. +.RS +.RE .SH SEE ALSO .PP \f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3) diff --git a/man/man7/fi_mlx.7 b/man/man7/fi_mlx.7 index d38ee44bc69..1a3df046f40 100644 --- a/man/man7/fi_mlx.7 +++ b/man/man7/fi_mlx.7 @@ -1,84 +1,14 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_mlx" "7" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_mlx" "7" "2019\-09\-17" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP fi_mlx \- The MLX Fabric Provider .SH OVERVIEW .PP -The \f[I]mlx\f[] provider runs over the UCX library that is currently -supported by the Mellanox infiniband fabrics. -The \f[I]mlx\f[] provider makes use of UCX tag matching API in order to -implement a limited set of the libfabric data transfer APIs, namely, -tagged message queue. -.PP -Supported UCP API version: 1.0 -.SH LIMITATIONS -.PP -The \f[I]mlx\f[] provider doesn\[aq]t support all the features defined -in the libfabric API. -Here are some of the limitations: -.TP -.B Endpoint types -Only supported type: \f[I]FI_RDM\f[] -.RS -.RE -.TP -.B Endpoint capabilities -Endpoints can support the only data transfer capability -\f[I]FI_TAGGED\f[]. -.RS -.RE -.TP -.B Modes -\f[I]FI_CONTEXT\f[] is required. -That means, all the requests that generate completions must have a valid -pointer to type \f[I]struct fi_context\f[] passed as the operation -context. -.RS -.RE -.TP -.B Threading -The supported mode is FI_THREAD_DOMAIN, i.e. -the \f[I]mlx\f[] provider is not thread safe. -.RS -.RE -.TP -.B Unsupported features -These features are unsupported: connection management, event queue, -scalable endpoint, passive endpoint, shared receive context, rma, -atomics. -.RS -.RE -.SH RUNTIME PARAMETERS -.TP -.B \f[I]FI_MLX_CONFIG\f[] -The path to the MLX configuration file (default: none). -.RS -.RE -.TP -.B \f[I]FI_MLX_TINJECT_LIMIT\f[] -Maximal tinject message size (default: 1024). -.RS -.RE -.TP -.B \f[I]FI_MLX_NS_ENABLE\f[] -Enforce usage of name server functionality for MLX provider (default: -disabled). -.RS -.RE -.TP -.B \f[I]FI_MLX_NS_PORT\f[] -MLX provider\[aq]s name server port (default: 12345). -.RS -.RE -.TP -.B \f[I]FI_MLX_NS_IFACE\f[] -IPv4 network interface for MLX provider\[aq]s name server (default: -any). -.RS -.RE +The mlx provider was deprecated and removed in libfabric 1.9 due to the +lack of a maintainer. 
.SH SEE ALSO .PP \f[C]fabric\f[](7), \f[C]fi_provider\f[](7), diff --git a/man/man7/fi_mrail.7 b/man/man7/fi_mrail.7 index 1b6e2396d7b..4b82177bf3a 100644 --- a/man/man7/fi_mrail.7 +++ b/man/man7/fi_mrail.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_mrail" "7" "2018\-12\-27" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_mrail" "7" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -10,7 +10,7 @@ fi_mrail \- The Multi\-Rail Utility Provider The mrail provider (ofi_mrail) is an utility provider that layers over an underlying provider to enable the use of multiple network ports (rails). -This increases the total available bandwidth of an underlying proivder. +This increases the total available bandwidth of an underlying provider. The current status of mrail provider is experimental \- not all libfabric features are supported and performance is not guaranteed. .SH REQUIREMENTS @@ -28,8 +28,8 @@ FI_AV_TABLE .PP Applications need to: * Support FI_MR_RAW MR mode bit to make use of FI_RMA capability. -* Set FI_OFI_MRAIL_ADDR_STRC env variable (see RUNTIME PARAMETERS -section below). +* Set FI_OFI_MRAIL_ADDR env variable (see RUNTIME PARAMETERS section +below). .SH SUPPORTED FEATURES .TP .B \f[I]Endpoint types\f[] @@ -71,16 +71,40 @@ Multicast Triggered operations .SH FUNCTIONALITY OVERVIEW .PP -For messages (FI_MSG, FI_TAGGED), the provider sends one message per -rail in a round\-robin manner. +For messages (FI_MSG, FI_TAGGED), the provider uses different policies +to send messages over one or more rails based on message size (see +\f[I]FI_OFI_MRAIL_CONFIG\f[] in the RUNTIME PARAMETERS section). Ordering is guaranteed through the use of sequence numbers. +.PP For RMA, the data is striped equally across all rails. .SH RUNTIME PARAMETERS .PP The ofi_mrail provider checks for the following environment variables. .TP +.B \f[I]FI_OFI_MRAIL_ADDR\f[] +Comma delimited list of individual rail addresses. +Each address can be an address in FI_ADDR_STR format, a host name, an IP +address, or a netdev interface name. +.RS +.RE +.TP .B \f[I]FI_OFI_MRAIL_ADDR_STRC\f[] -Comma delimited list of individual rail addresses in FI_ADDR_STR format. +Deprecated. +Replaced by \f[I]FI_OFI_MRAIL_ADDR\f[]. +.RS +.RE +.TP +.B \f[I]FI_OFI_MRAIL_CONFIG\f[] +Comma separated list of \f[C]<max_size>:<policy>\f[] pairs, sorted in +ascending order of \f[C]<max_size>\f[]. +Each pair indicates the rail sharing policy to be used for messages up +to the size \f[C]<max_size>\f[] and not covered by all previous pairs. +The value of \f[C]<policy>\f[] can be \f[I]fixed\f[] (a fixed rail is +used), \f[I]round\-robin\f[] (one rail per message, selected in +round\-robin fashion), or \f[I]striping\f[] (striping across all the +rails). +The default configuration is \f[C]16384:fixed,ULONG_MAX:striping\f[]. +The value ULONG_MAX can be input as \-1. .RS .RE .SH SEE ALSO diff --git a/man/man7/fi_netdir.7 b/man/man7/fi_netdir.7 index 3785465a4df..15031b25536 100644 --- a/man/man7/fi_netdir.7 +++ b/man/man7/fi_netdir.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_netdir" "7" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_netdir" "7" "2019\-11\-20" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -78,10 +78,6 @@ and FI_MSG receive/transmit operations. .RS .RE .SH LIMITATIONS -.PP -The Network Direct is an experimental provider. -The full support of the Network Direct provider will be added to 1.6 -release version of libfabric. 
.TP .B \f[I]Memory Regions\f[] Only FI_MR_BASIC mode is supported. diff --git a/man/man7/fi_provider.7 b/man/man7/fi_provider.7 index 73a71906978..326b11e58f9 100644 --- a/man/man7/fi_provider.7 +++ b/man/man7/fi_provider.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_provider" "7" "2019\-06\-15" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_provider" "7" "2021\-02\-10" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -33,6 +33,18 @@ See \f[C]fi_psm\f[](7) for more information. .RS .RE .TP +.B \f[I]PSM2\f[] +High\-speed Omni\-Path networking from Intel. +See \f[C]fi_psm2\f[](7) for more information. +.RS +.RE +.TP +.B \f[I]PSM3\f[] +High\-speed Ethernet networking from Intel. +See \f[C]fi_psm3\f[](7) for more information. +.RS +.RE +.TP .B \f[I]Sockets\f[] A general purpose provider that can be used on any network that supports TCP/UDP sockets. @@ -73,6 +85,15 @@ hardware interface for inter\-instance communication on EC2. See \f[C]fi_efa\f[](7) for more information. .RS .RE +.TP +.B \f[I]SHM\f[] +A provider for intranode communication using shared memory. +The provider makes use of the Linux kernel feature Cross Memory Attach +(CMA) which allows processes to have full access to another process\[aq] +address space. +See \f[C]fi_shm\f[](7) for more information. +.RS +.RE .SS Utility providers .TP .B \f[I]RxM\f[] @@ -81,6 +102,13 @@ endpoints emulated over MSG endpoints of a core provider. See \f[C]fi_rxm\f[](7) for more information. .RS .RE +.TP +.B \f[I]RxD\f[] +The RxD provider (ofi_rxd) is a utility provider that supports RDM +endpoints emulated over DGRAM endpoints of a core provider. +See \f[C]fi_rxd\f[](7) for more information. +.RS +.RE .SS Special providers .TP .B \f[I]Hook\f[] diff --git a/man/man7/fi_psm.7 b/man/man7/fi_psm.7 index 513355278be..d63b6c795af 100644 --- a/man/man7/fi_psm.7 +++ b/man/man7/fi_psm.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_psm" "7" "2018\-10\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_psm" "7" "2021\-02\-10" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -211,5 +211,6 @@ By default affinity is not set. .SH SEE ALSO .PP \f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_psm2\f[](7), +\f[C]fi_psm3\f[](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_psm2.7 b/man/man7/fi_psm2.7 index 0cee4e13c65..4644f29b419 100644 --- a/man/man7/fi_psm2.7 +++ b/man/man7/fi_psm2.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_psm2" "7" "2019\-04\-09" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_psm2" "7" "2021\-02\-10" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -145,6 +145,14 @@ different from the default. .RE .PP The default UUID is 00FF00FF\-0000\-0000\-0000\-00FF0F0F00FF. +.PP +It is possible to create endpoints with UUID different from the one set +here. +To achieve that, set \[aq]info\->ep_attr\->auth_key\[aq] to the uuid +value and \[aq]info\->ep_attr\->auth_key_size\[aq] to its size (16 +bytes) when calling fi_endpoint() or fi_scalable_ep(). +It is still true that an endpoint can only communicate with endpoints +with the same UUID. 
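+.PP
+A minimal sketch of the per\-endpoint UUID override described above (the
+UUID value, domain, info, and ep objects are assumptions; the key is
+heap\-allocated so that fi_freeinfo() can later release it):
+.IP
+.nf
+\f[C]
+uint8_t uuid[16] = { 0 };  /* replace with a 16-byte job-unique value */
+int ret;
+
+info->ep_attr->auth_key = malloc(sizeof(uuid));
+memcpy(info->ep_attr->auth_key, uuid, sizeof(uuid));
+info->ep_attr->auth_key_size = sizeof(uuid);
+
+ret = fi_endpoint(domain, info, &ep, NULL);
+\f[]
+.fi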
.TP .B \f[I]FI_PSM2_NAME_SERVER\f[] The \f[I]psm2\f[] provider has a simple built\-in name server that can @@ -338,8 +346,26 @@ Notice that if the provider is compiled with macro \f[I]PSMX2_TAG_LAYOUT\f[] defined to 1 (means \f[I]tag60\f[]) or 2 (means \f[I]tag64\f[]), the choice is fixed at compile time and this runtime option will be disabled. +.SH PSM2 EXTENSIONS +.PP +The \f[I]psm2\f[] provider supports limited low\-level parameter setting +through the fi_set_val() and fi_get_val() functions. +Currently the following parameters can be set via the domain fid: +\[bu] .RS 2 +.TP +.B FI_PSM2_DISCONNECT * +Overwrite the global runtime parameter \f[I]FI_PSM2_DISCONNECT\f[] for +this domain. +See the \f[I]RUNTIME PARAMETERS\f[] section for details. +.RS +.RE +.RE +.PP +Valid parameter names are defined in the header file +\f[I]rdma/fi_ext_psm2.h\f[]. .SH SEE ALSO .PP \f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_psm\f[](7), +\f[C]fi_psm3\f[](7), .SH AUTHORS OpenFabrics. diff --git a/man/man7/fi_psm3.7 b/man/man7/fi_psm3.7 new file mode 100644 index 00000000000..86e4ce0f16b --- /dev/null +++ b/man/man7/fi_psm3.7 @@ -0,0 +1,344 @@ +.\" Automatically generated by Pandoc 1.19.2.4 +.\" +.TH "fi_psm3" "7" "2021\-02\-10" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.hy +.SH NAME +.PP +fi_psm3 \- The PSM3 Fabric Provider +.SH OVERVIEW +.PP +The \f[I]psm3\f[] provider implements a Performance Scaled Messaging +capability which supports Intel RoCEv2 capable NICs. +PSM3 represents an Ethernet and standard RoCEv2 enhancement of previous +PSM implementations. +.SH SUPPORTED FEATURES +.PP +The \f[I]psm3\f[] provider supports a subset of all the features defined +in the libfabric API. +.TP +.B Endpoint types +Supports non\-connection based types \f[I]FI_DGRAM\f[] and +\f[I]FI_RDM\f[]. +.RS +.RE +.TP +.B Endpoint capabilities +Endpoints can support any combination of data transfer capabilities +\f[I]FI_TAGGED\f[], \f[I]FI_MSG\f[], \f[I]FI_ATOMICS\f[], and +\f[I]FI_RMA\f[]. +These capabilities can be further refined by \f[I]FI_SEND\f[], +\f[I]FI_RECV\f[], \f[I]FI_READ\f[], \f[I]FI_WRITE\f[], +\f[I]FI_REMOTE_READ\f[], and \f[I]FI_REMOTE_WRITE\f[] to limit the +direction of operations. +.RS +.RE +.PP +\f[I]FI_MULTI_RECV\f[] is supported for non\-tagged message queue only. +.PP +Scalable endpoints are supported if the underlying PSM3 library supports +multiple endpoints. +This condition must be satisfied both when the provider is built and +when the provider is used. +See the \f[I]Scalable endpoints\f[] section for more information. +.PP +Other supported capabilities include \f[I]FI_TRIGGER\f[], +\f[I]FI_REMOTE_CQ_DATA\f[], \f[I]FI_RMA_EVENT\f[], \f[I]FI_SOURCE\f[], +and \f[I]FI_SOURCE_ERR\f[]. +Furthermore, \f[I]FI_NAMED_RX_CTX\f[] is supported when scalable +endpoints are enabled. +.TP +.B Modes +\f[I]FI_CONTEXT\f[] is required for the \f[I]FI_TAGGED\f[] and +\f[I]FI_MSG\f[] capabilities. +That means, any request belonging to these two categories that generates +a completion must pass as the operation context a valid pointer to type +\f[I]struct fi_context\f[], and the space referenced by the pointer must +remain untouched until the request has completed. +If none of \f[I]FI_TAGGED\f[] and \f[I]FI_MSG\f[] is asked for, the +\f[I]FI_CONTEXT\f[] mode is not required. +.RS +.RE +.TP +.B Progress +The \f[I]psm3\f[] provider performs optimally with manual progress. 
+By default, the application is expected to call \f[I]fi_cq_read\f[] or +\f[I]fi_cntr_read\f[] function from time to time when no other libfabric +function is called to ensure progress is made in a timely manner. +The provider does support auto progress mode. +However, the performance can be significantly impacted if the +application purely depends on the provider to make auto progress. +.RS +.RE +.TP +.B Scalable endpoints +Scalable endpoints support depends on the multi\-EP feature of the +\f[I]PSM3\f[] library. +If the \f[I]PSM3\f[] library supports this feature, the availability is +further controlled by an environment variable \f[I]PSM3_MULTI_EP\f[]. +The \f[I]psm3\f[] provider automatically sets this variable to 1 if it +is not set. +The feature can be disabled explicitly by setting \f[I]PSM3_MULTI_EP\f[] +to 0. +.RS +.RE +.PP +When creating a scalable endpoint, the exact number of contexts +requested should be set in the "fi_info" structure passed to the +\f[I]fi_scalable_ep\f[] function. +This number should be set in "fi_info\->ep_attr\->tx_ctx_cnt" or +"fi_info\->ep_attr\->rx_ctx_cnt" or both, whichever greater is used. +The \f[I]psm3\f[] provider allocates all requested contexts upfront when +the scalable endpoint is created. +The same context is used for both Tx and Rx. +.PP +For optimal performance, it is advised to avoid having multiple threads +accessing the same context, either directly by posting +send/recv/read/write request, or indirectly by polling associated +completion queues or counters. +.PP +Using the scalable endpoint as a whole in communication functions is not +supported. +Instead, individual tx context or rx context of the scalable endpoint +should be used. +Similarly, using the address of the scalable endpoint as the source +address or destination address doesn\[aq]t collectively address all the +tx/rx contexts. +It addresses only the first tx/rx context, instead. +.SH LIMITATIONS +.PP +The \f[I]psm3\f[] provider doesn\[aq]t support all the features defined +in the libfabric API. +Here are some of the limitations not listed above: +.TP +.B Unsupported features +These features are unsupported: connection management, passive endpoint, +and shared receive context. +.RS +.RE +.SH RUNTIME PARAMETERS +.PP +The \f[I]psm3\f[] provider checks for the following environment +variables: +.TP +.B \f[I]FI_PSM3_UUID\f[] +PSM requires that each job has a unique ID (UUID). +All the processes in the same job need to use the same UUID in order to +be able to talk to each other. +The PSM reference manual advises to keep UUID unique to each job. +In practice, it generally works fine to reuse UUID as long as (1) no two +jobs with the same UUID are running at the same time; and (2) previous +jobs with the same UUID have exited normally. +If running into "resource busy" or "connection failure" issues with +unknown reason, it is advisable to manually set the UUID to a value +different from the default. +.RS +.RE +.PP +The default UUID is 00FF00FF\-0000\-0000\-0000\-00FF0F0F00FF. +.PP +It is possible to create endpoints with UUID different from the one set +here. +To achieve that, set \[aq]info\->ep_attr\->auth_key\[aq] to the uuid +value and \[aq]info\->ep_attr\->auth_key_size\[aq] to its size (16 +bytes) when calling fi_endpoint() or fi_scalable_ep(). +It is still true that an endpoint can only communicate with endpoints +with the same UUID. 
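+.PP
+Referring back to the \f[I]Scalable endpoints\f[] section, a minimal
+sketch of requesting contexts through fi_info (the count of 4, the
+domain, and the info objects are assumptions; error handling is
+omitted):
+.IP
+.nf
+\f[C]
+struct fid_ep *sep, *tx_ctx, *rx_ctx;
+int ret;
+
+/* Request 4 Tx and 4 Rx contexts; the larger of the two counts is used */
+info->ep_attr->tx_ctx_cnt = 4;
+info->ep_attr->rx_ctx_cnt = 4;
+
+ret = fi_scalable_ep(domain, info, &sep, NULL);
+
+/* Use individual Tx/Rx contexts, not the scalable endpoint as a whole */
+ret = fi_tx_context(sep, 0, NULL, &tx_ctx, NULL);
+ret = fi_rx_context(sep, 0, NULL, &rx_ctx, NULL);
+\f[]
+.fi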
+.TP +.B \f[I]FI_PSM3_NAME_SERVER\f[] +The \f[I]psm3\f[] provider has a simple built\-in name server that can +be used to resolve an IP address or host name into a transport address +needed by the \f[I]fi_av_insert\f[] call. +The main purpose of this name server is to allow simple client\-server +type applications (such as those in \f[I]fabtests\f[]) to be written +purely with libfabric, without using any out\-of\-band communication +mechanism. +For such applications, the server would run first to allow endpoints to +be created and registered with the name server, and then the client would +call \f[I]fi_getinfo\f[] with the \f[I]node\f[] parameter set to the IP +address or host name of the server. +The resulting \f[I]fi_info\f[] structure would have the transport +address of the endpoint created by the server in the \f[I]dest_addr\f[] +field. +Optionally the \f[I]service\f[] parameter can be used in addition to +\f[I]node\f[]. +Notice that the \f[I]service\f[] number is interpreted by the provider +and is not a TCP/IP port number. +.RS +.RE +.PP +The name server is on by default. +It can be turned off by setting the variable to 0. +This may save a small amount of resources since a separate thread is +created when the name server is on. +.PP +The provider detects OpenMPI and MPICH runs and changes the default +setting to off. +.TP +.B \f[I]FI_PSM3_TAGGED_RMA\f[] +The RMA functions are implemented on top of the PSM Active Message +functions. +The Active Message functions have a limit on the size of data that can +be transferred in a single message. +Large transfers can be divided into small chunks and be pipe\-lined. +However, the bandwidth is sub\-optimal when done this way. +.RS +.RE +.PP +The \f[I]psm3\f[] provider uses PSM tag\-matching message queue functions +to achieve higher bandwidth for large size RMA. +It takes advantage of the extra tag bits available in PSM3 to separate +the RMA traffic from the regular tagged message queue. +.PP +The option is on by default. +To turn it off, set the variable to 0. +.TP +.B \f[I]FI_PSM3_DELAY\f[] +Time (seconds) to sleep before closing PSM endpoints. +This is a workaround for a bug in some versions of the PSM library. +.RS +.RE +.PP +The default setting is 0. +.TP +.B \f[I]FI_PSM3_TIMEOUT\f[] +Timeout (seconds) for gracefully closing PSM endpoints. +A forced closing will be issued if the timeout expires. +.RS +.RE +.PP +The default setting is 5. +.TP +.B \f[I]FI_PSM3_CONN_TIMEOUT\f[] +Timeout (seconds) for establishing a connection between two PSM endpoints. +.RS +.RE +.PP +The default setting is 5. +.TP +.B \f[I]FI_PSM3_PROG_INTERVAL\f[] +When auto progress is enabled (asked via the hints to +\f[I]fi_getinfo\f[]), a progress thread is created to make progress +calls from time to time. +This option sets the interval (microseconds) between progress calls. +.RS +.RE +.PP +The default setting is 1 if affinity is set, or 1000 if not. +See \f[I]FI_PSM3_PROG_AFFINITY\f[]. +.TP +.B \f[I]FI_PSM3_PROG_AFFINITY\f[] +When set, specifies the set of CPU cores to set the progress thread +affinity to. +The format is +\f[C]<start>[:<end>[:<stride>]][,<start>[:<end>[:<stride>]]]*\f[], where +each triplet \f[C]<start>:<end>:<stride>\f[] defines a block of +core_ids. +Both \f[C]<start>\f[] and \f[C]<end>\f[] can be either the +\f[C]core_id\f[] (when >=0) or \f[C]core_id\ \-\ num_cores\f[] (when +<0). +.RS +.RE +.PP +By default affinity is not set. +.TP +.B \f[I]FI_PSM3_INJECT_SIZE\f[] +Maximum message size allowed for fi_inject and fi_tinject calls. +This is an experimental feature to allow some applications to override +the default inject size limitation. 
+When the inject size is larger than the default value, some inject calls +might block. +.RS +.RE +.PP +The default setting is 64. +.TP +.B \f[I]FI_PSM3_LOCK_LEVEL\f[] +When set, dictates the level of locking being used by the provider. +Level 2 means all locks are enabled. +Level 1 disables some locks and is suitable for runs that limit the +access to each PSM3 context to a single thread. +Level 0 disables all locks and thus is only suitable for single threaded +runs. +.RS +.RE +.PP +To use level 0 or level 1, wait objects and auto progress mode cannot be +used because they introduce internal threads that may break the +conditions needed for these levels. +.PP +The default setting is 2. +.TP +.B \f[I]FI_PSM3_LAZY_CONN\f[] +There are two strategies for when to establish connections between the +PSM3 endpoints that OFI endpoints are built on top of. +In eager connection mode, connections are established when addresses are +inserted into the address vector. +In lazy connection mode, connections are established when addresses are +used the first time in communication. +Eager connection mode has slightly lower critical path overhead but lazy +connection mode scales better. +.RS +.RE +.PP +This option controls how the two connection modes are used. +When set to 1, lazy connection mode is always used. +When set to 0, eager connection mode is used when the required conditions +are all met and lazy connection mode is used otherwise. +The conditions for eager connection mode are: (1) multiple endpoint (and +scalable endpoint) support is disabled by explicitly setting +PSM3_MULTI_EP=0; and (2) the address vector type is FI_AV_MAP. +.PP +The default setting is 0. +.TP +.B \f[I]FI_PSM3_DISCONNECT\f[] +The provider has a mechanism to automatically send disconnection +notifications to all connected peers before the local endpoint is +closed. +In response, the peers call \f[I]psm3_ep_disconnect\f[] to clean up +the connection state at their side. +This allows the same PSM3 epid to be used by different dynamically started +processes (clients) to communicate with the same peer (server). +This mechanism, however, introduces extra overhead to the finalization +phase. +For applications that never reuse epids within the same session, such +overhead is unnecessary. +.RS +.RE +.PP +This option controls whether the automatic disconnection notification +mechanism should be enabled. +For the client\-server applications mentioned above, the client side should +set this option to 1, but the server should set it to 0. +.PP +The default setting is 0. +.TP +.B \f[I]FI_PSM3_TAG_LAYOUT\f[] +Selects how the 96\-bit PSM3 tag bits are organized. +Currently three choices are available: \f[I]tag60\f[] means 32\-4\-60 +partitioning for CQ data, internal protocol flags, and application tag. +\f[I]tag64\f[] means 4\-28\-64 partitioning for internal protocol flags, +CQ data, and application tag. +\f[I]auto\f[] means to choose either \f[I]tag60\f[] or \f[I]tag64\f[] +based on the hints passed to fi_getinfo \-\- \f[I]tag60\f[] is used if +remote CQ data support is requested explicitly, either by passing a +non\-zero value via \f[I]hints\->domain_attr\->cq_data_size\f[] or by +including \f[I]FI_REMOTE_CQ_DATA\f[] in \f[I]hints\->caps\f[], otherwise +\f[I]tag64\f[] is used. +If \f[I]tag64\f[] is the result of automatic selection, +\f[I]fi_getinfo\f[] also returns a second instance of the provider with +\f[I]tag60\f[] layout. +.RS +.RE +.PP +The default setting is \f[I]auto\f[]. 
+.PP +Notice that if the provider is compiled with macro +\f[I]PSMX3_TAG_LAYOUT\f[] defined to 1 (means \f[I]tag60\f[]) or 2 +(means \f[I]tag64\f[]), the choice is fixed at compile time and this +runtime option will be disabled. +.SH SEE ALSO +.PP +\f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_psm\f[](7), +\f[C]fi_psm2\f[](7), +.SH AUTHORS +OpenFabrics. diff --git a/man/man7/fi_rstream.7 b/man/man7/fi_rstream.7 index 60ec49c8d43..ac3f94cd5ea 100644 --- a/man/man7/fi_rstream.7 +++ b/man/man7/fi_rstream.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_rstream" "7" "2018\-11\-21" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_rstream" "7" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -47,8 +47,8 @@ The provider supports FI_THREAD_SAFE .TP .B \f[I]Verbs\-iWarp\f[] The provider has added features to enable iWarp. -To use this feature, the ep protocol IWARP must be requested in a -getinfo call. +To use this feature, the ep protocol iWarp must be requested in an +fi_getinfo call. .RS .RE .SH LIMITATIONS @@ -93,7 +93,7 @@ Default is 384. .SH OFI EXTENSIONS .PP The rstream provider has extended the current OFI API set in order to -enable a user implemenation of Poll. +enable a user implementation of Poll. Specifically sendmsg(FI_PEEK) is supported which replicates the behavior of the recvmsg(FI_PEEK) feature. .SH SEE ALSO diff --git a/man/man7/fi_rxm.7 b/man/man7/fi_rxm.7 index 4e6cd131347..72bb3910885 100644 --- a/man/man7/fi_rxm.7 +++ b/man/man7/fi_rxm.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_rxm" "7" "2019\-06\-21" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_rxm" "7" "2021\-01\-25" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -148,18 +148,32 @@ would be read per progress (RxM CQ read). .RS .RE .TP +.B \f[I]FI_OFI_RXM_ENABLE_DYN_RBUF\f[] +Enables support for dynamic receive buffering, if available by the +message endpoint provider. +This feature allows direct placement of received message data into +application buffers, bypassing RxM bounce buffers. +This feature targets providers that provide internal network buffering, +such as the tcp provider. +(default: false) +.RS +.RE +.TP .B \f[I]FI_OFI_RXM_SAR_LIMIT\f[] Set this environment variable to control the RxM SAR (Segmentation And Reassembly) protocol. -Messages of size greater than this (default: 256 Kb) would be +Messages of size greater than this (default: 128 Kb) would be transmitted via rendezvous protocol. .RS .RE .TP .B \f[I]FI_OFI_RXM_USE_SRX\f[] -Set this to 1 to use shared receive context from MSG provider. -This reduces overall memory usage but there may be a slight increase in -latency (default: 0). +Set this to 1 to use shared receive context from MSG provider, or 0 to +disable using shared receive context. +Shared receive contexts reduce overall memory usage, but may increase in +message latency. +If not set, verbs will not use shared receive contexts by default, but +the tcp provider will. 
.RS .RE .TP @@ -196,6 +210,13 @@ Higher values may provide less noise for calls to fi_cq read functions, but may increase connection setup time (default: 10000) .RS .RE +.TP +.B \f[I]FI_OFI_RXM_CQ_EQ_FAIRNESS\f[] +Defines the maximum number of message provider CQ entries that can be +consecutively read across progress calls without checking to see if the +CM progress interval has been reached (default: 128) +.RS +.RE .SH Tuning .SS Bandwidth .PP diff --git a/man/man7/fi_shm.7 b/man/man7/fi_shm.7 index 910387eca27..ed04e5a7b2d 100644 --- a/man/man7/fi_shm.7 +++ b/man/man7/fi_shm.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_shm" "7" "2019\-02\-05" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_shm" "7" "2020\-04\-17" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -89,8 +89,8 @@ If no node or service are provided (and in the case of setting the src address without FI_SOURCE and no hints), the process ID will be used as a default address. On endpoint creation, if the src_addr has the "fi_shm://" prefix, the -provider will append ":[dom_idx]:[ep_idx]" as a unique endpoint name -(essentially, in place of a service). +provider will append ":[uid]:[dom_idx]:[ep_idx]" as a unique endpoint +name (essentially, in place of a service). In the case of the "fi_ns://" prefix (or any other prefix if one was provided by the application), no supplemental information is required to make it unique and it will remain with only the application\-defined @@ -125,7 +125,27 @@ EPs must be bound to both RX and TX CQs. No support for counters. .SH RUNTIME PARAMETERS .PP -No runtime parameters are currently defined. +The \f[I]shm\f[] provider checks for the following environment +variables: +.TP +.B \f[I]FI_SHM_SAR_THRESHOLD\f[] +Maximum message size to use segmentation protocol before switching to +mmap (only valid when CMA is not available). +Default: SIZE_MAX (18446744073709551615) +.RS +.RE +.TP +.B \f[I]FI_SHM_TX_SIZE\f[] +Maximum number of outstanding tx operations. +Default 1024 +.RS +.RE +.TP +.B \f[I]FI_SHM_RX_SIZE\f[] +Maximum number of outstanding rx operations. +Default 1024 +.RS +.RE .SH SEE ALSO .PP \f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3) diff --git a/man/man7/fi_tcp.7 b/man/man7/fi_tcp.7 index 346b0aaf060..cc8b07285bc 100644 --- a/man/man7/fi_tcp.7 +++ b/man/man7/fi_tcp.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_tcp" "7" "2019\-06\-08" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_tcp" "7" "2020\-04\-14" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -64,9 +64,11 @@ tcp connections. .RE .SH LIMITATIONS .PP -tcp provider is implemented over TCP sockets to emulate libfabric API. -Hence the performance is lower than what an application might see -implementing to sockets directly. +The tcp provider is implemented over TCP sockets to emulate libfabric +API. +Hence the performance may be lower than what an application might see +implementing to sockets directly, depending on the types of data +transfers the application is trying to achieve. 
.SH SEE ALSO .PP \f[C]fabric\f[](7), \f[C]fi_provider\f[](7), \f[C]fi_getinfo\f[](3) diff --git a/man/man7/fi_verbs.7 b/man/man7/fi_verbs.7 index 545307bdeee..0793136008e 100644 --- a/man/man7/fi_verbs.7 +++ b/man/man7/fi_verbs.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 1.19.2.4 .\" -.TH "fi_verbs" "7" "2019\-07\-18" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" +.TH "fi_verbs" "7" "2020\-11\-12" "Libfabric Programmer\[aq]s Manual" "\@VERSION\@" .hy .SH NAME .PP @@ -155,6 +155,8 @@ to be re\-mapped when the process is forked (MADV_DONTFORK). The XRC transport is intended to be used when layered with the RXM provider and requires the use of shared receive contexts. See \f[C]fi_rxm\f[](7). +To enable XRC, the following environment variables must usually be set: +FI_VERBS_PREFER_XRC and FI_OFI_RXM_USE_SRX. .SH RUNTIME PARAMETERS .PP The verbs provider checks for the following environment variables. @@ -199,17 +201,28 @@ time (default: 8). .RS .RE .TP -.B \f[I]FI_VERBS_IFACE\f[] -The prefix or the full name of the network interface associated with the -verbs device (default: ib) -.RS -.RE -.TP .B \f[I]FI_VERBS_PREFER_XRC\f[] Prioritize XRC transport fi_info before RC transport fi_info (default: 0, RC fi_info will be before XRC fi_info) .RS .RE +.TP +.B \f[I]FI_VERBS_GID_IDX\f[] +The GID index to use (default: 0) +.RS +.RE +.TP +.B \f[I]FI_VERBS_DEVICE_NAME\f[] +Specify a specific verbs device to use by name +.RS +.RE +.SS Variables specific to MSG endpoints +.TP +.B \f[I]FI_VERBS_IFACE\f[] +The prefix or the full name of the network interface associated with the +verbs device (default: ib) +.RS +.RE .SS Variables specific to DGRAM endpoints .TP .B \f[I]FI_VERBS_DGRAM_USE_NAME_SERVER\f[] @@ -225,11 +238,6 @@ The port on which Name Server thread listens incoming connections and requests (default: 5678) .RS .RE -.TP -.B \f[I]FI_VERBS_GID_IDX\f[] -The GID index to use (default: 0) -.RS -.RE .SS Environment variables notes .PP The fi_info utility would give the up\-to\-date information on @@ -242,8 +250,8 @@ is available) and check if there any errors because of incorrect input parameters to fi_getinfo. .IP \[bu] 2 Check if "fi_info \-p verbs" is successful. -If that fails the following chkecklist may help in ensuring that the -RDMA verbs stack is functional: +If that fails the following checklist may help in ensuring that the RDMA +verbs stack is functional: .IP \[bu] 2 If libfabric was compiled, check if verbs provider was built. 
Building verbs provider would be skipped if its dependencies (listed in diff --git a/pingpong.vcxproj b/pingpong.vcxproj index 2e65c22b83f..8b6846d7c3a 100755 --- a/pingpong.vcxproj +++ b/pingpong.vcxproj @@ -13,6 +13,10 @@ Debug-v140 x64 + + Debug-v142 + x64 + Release-ICC x64 @@ -25,6 +29,10 @@ Release-v140 x64 + + Release-v142 + x64 + {DBBD5F92-1E78-40ED-8D64-F958D0EF12B2} @@ -45,6 +53,12 @@ v141 Unicode + + Application + true + v142 + Unicode + Application true @@ -65,6 +79,13 @@ true Unicode + + Application + false + v142 + true + Unicode + Application false @@ -83,6 +104,9 @@ + + + @@ -92,6 +116,9 @@ + + + @@ -106,6 +133,11 @@ $(Platform)\$(Configuration)\pingpong\ fi_$(ProjectName) + + true + $(Platform)\$(Configuration)\pingpong\ + fi_$(ProjectName) + true $(Platform)\$(Configuration)\pingpong\ @@ -121,6 +153,11 @@ $(Platform)\$(Configuration)\pingpong\ fi_$(ProjectName) + + true + $(Platform)\$(Configuration)\pingpong\ + fi_$(ProjectName) + true $(Platform)\$(Configuration)\pingpong\ @@ -158,6 +195,22 @@ Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + + + Level3 + Disabled + WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;%(PreprocessorDefinitions) + $(SoludionDir)util\windows\getopt;$(SolutionDir)include;$(SolutionDir)include\windows;%(AdditionalIncludeDirectories) + MultiThreadedDebug + + + Console + true + Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + @@ -214,6 +267,26 @@ Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + Level3 + + + MaxSpeed + false + true + WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;%(PreprocessorDefinitions) + $(SoludionDir)util\windows\getopt;$(SolutionDir)include;$(SolutionDir)include\windows;%(AdditionalIncludeDirectories) + MultiThreaded + + + Console + true + true + true + Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + Level3 @@ -238,9 +311,11 @@ true true + true true true true + true true @@ -256,4 +331,4 @@ - \ No newline at end of file + diff --git a/prov/bgq/src/fi_bgq_init.c b/prov/bgq/src/fi_bgq_init.c index b734bcf4def..9680d2f9677 100644 --- a/prov/bgq/src/fi_bgq_init.c +++ b/prov/bgq/src/fi_bgq_init.c @@ -330,7 +330,7 @@ static void fi_bgq_fini() static struct fi_provider fi_bgq_provider = { .name = FI_BGQ_PROVIDER_NAME, .version = FI_VERSION(0, 1), - .fi_version = FI_VERSION(1, 6), + .fi_version = OFI_VERSION_LATEST, .getinfo = fi_bgq_getinfo, .fabric = fi_bgq_fabric, .cleanup = fi_bgq_fini diff --git a/prov/efa/Makefile.include b/prov/efa/Makefile.include index 0cb5abcd8be..a4d027f0e77 100644 --- a/prov/efa/Makefile.include +++ b/prov/efa/Makefile.include @@ -1,5 +1,5 @@ # -# Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. +# Copyright (c) 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. # # This software is available to you under a choice of one of two # licenses. 
You may choose to be licensed under the terms of the GNU @@ -31,10 +31,7 @@ # if HAVE_EFA _efa_files = \ - prov/efa/src/efa_verbs/efa_ib_cmd.c \ - prov/efa/src/efa_verbs/efa_cmd.c \ - prov/efa/src/efa_verbs/efa_device.c \ - prov/efa/src/efa_verbs/efa_init.c \ + prov/efa/src/efa_device.c \ prov/efa/src/efa_av.c \ prov/efa/src/efa_domain.c \ prov/efa/src/efa_cm.c \ @@ -43,36 +40,40 @@ _efa_files = \ prov/efa/src/efa_fabric.c \ prov/efa/src/efa_msg.c \ prov/efa/src/efa_mr.c \ + prov/efa/src/efa_rma.c \ prov/efa/src/rxr/rxr_attr.c \ prov/efa/src/rxr/rxr_init.c \ prov/efa/src/rxr/rxr_fabric.c \ prov/efa/src/rxr/rxr_domain.c \ prov/efa/src/rxr/rxr_cq.c \ prov/efa/src/rxr/rxr_ep.c \ - prov/efa/src/rxr/rxr_av.c \ prov/efa/src/rxr/rxr_cntr.c \ - prov/efa/src/rxr/rxr_rma.c + prov/efa/src/rxr/rxr_rma.c \ + prov/efa/src/rxr/rxr_msg.c \ + prov/efa/src/rxr/rxr_pkt_entry.c \ + prov/efa/src/rxr/rxr_pkt_type_req.c \ + prov/efa/src/rxr/rxr_pkt_type_data.c \ + prov/efa/src/rxr/rxr_pkt_type_misc.c \ + prov/efa/src/rxr/rxr_pkt_cmd.c \ + prov/efa/src/rxr/rxr_read.c \ + prov/efa/src/rxr/rxr_atomic.c _efa_headers = \ prov/efa/src/efa.h \ - prov/efa/src/efa_verbs/efa-abi.h \ - prov/efa/src/efa_verbs/efa_cmd.h \ - prov/efa/src/efa_verbs/efa_ib_cmd.h \ - prov/efa/src/efa_verbs/efa_ib.h \ - prov/efa/src/efa_verbs/efa_io_defs.h \ - prov/efa/src/efa_verbs/efa_verbs.h \ - prov/efa/include/infiniband/efa_arch.h \ - prov/efa/include/infiniband/efa_kern-abi.h \ - prov/efa/include/infiniband/efa_verbs.h \ prov/efa/src/rxr/rxr.h \ prov/efa/src/rxr/rxr_cntr.h \ - prov/efa/src/rxr/rxr_rma.h + prov/efa/src/rxr/rxr_rma.h \ + prov/efa/src/rxr/rxr_msg.h \ + prov/efa/src/rxr/rxr_pkt_entry.h \ + prov/efa/src/rxr/rxr_pkt_type.h \ + prov/efa/src/rxr/rxr_pkt_type_req.h \ + prov/efa/src/rxr/rxr_pkt_cmd.h \ + prov/efa/src/rxr/rxr_read.h \ + prov/efa/src/rxr/rxr_atomic.h -efa_CPPFLAGS = \ - -I$(top_srcdir)/prov/efa/include \ - -I$(top_srcdir)/prov/efa/src/efa_verbs \ - -I$(top_srcdir)/prov/efa/src/ \ - -I$(top_srcdir)/prov/efa/src/rxr/ +efa_CPPFLAGS += \ + -I$(top_srcdir)/prov/efa/src/ \ + -I$(top_srcdir)/prov/efa/src/rxr/ if HAVE_EFA_DL pkglib_LTLIBRARIES += libefa-fi.la diff --git a/prov/efa/configure.m4 b/prov/efa/configure.m4 index da8452d2bef..8d1693f8504 100644 --- a/prov/efa/configure.m4 +++ b/prov/efa/configure.m4 @@ -1,4 +1,4 @@ -dnl Configury specific to the libfabric Amazon provider +dnl Configury specific to the libfabric Amazon EFA provider dnl Called to configure this provider dnl @@ -12,7 +12,16 @@ AC_DEFUN([FI_EFA_CONFIGURE],[ efa_happy=0 efa_h_enable_poisoning=0 AS_IF([test x"$enable_efa" != x"no"], - [efa_happy=1]) + [FI_CHECK_PACKAGE([efa_ibverbs], + [infiniband/verbs.h], + [ibverbs], + [ibv_open_device], + [], + [$efa_PREFIX], + [$efa_LIBDIR], + [FI_EFA_DOUBLE_CHECK_LIBIBVERBS], + [efa_happy=0]) + ]) AC_ARG_ENABLE([efa-mem-poisoning], [AS_HELP_STRING([--enable-efa-mem-poisoning], @@ -31,9 +40,90 @@ AC_DEFUN([FI_EFA_CONFIGURE],[ [AC_MSG_RESULT([no]) efa_happy=0]) - # verbs definitions file depends on linux/types.h - AC_CHECK_HEADER([linux/types.h], [], [efa_happy=0]) + AS_IF([test x"$enable_efa" != x"no"], + [FI_CHECK_PACKAGE([efadv], + [infiniband/efadv.h], + [efa], + [efadv_query_ah], + [-libverbs], + [$efa_PREFIX], + [$efa_LIBDIR], + [efa_happy=1], + [ + efa_happy=0 + AC_MSG_WARN([The EFA provider requires rdma-core v27 or newer.]) + ]) + ]) + + AS_IF([test x"$enable_efa" != x"no"], + [FI_CHECK_PACKAGE([efadv], + [infiniband/efadv.h], + [efa], + [efadv_query_device], + [-libverbs], + [$efa_PREFIX], + [$efa_LIBDIR], + 
[efa_happy=1], + [efa_happy=0]) + ]) + save_CPPFLAGS=$CPPFLAGS + CPPFLAGS=-I$efa_PREFIX/include + AS_IF([test x"$enable_efa" != x"no"], + [AC_CHECK_MEMBER(struct efadv_device_attr.max_rdma_size, + [AC_DEFINE([HAVE_RDMA_SIZE], [1], [efadv_device_attr has max_rdma_size])], + [], + [[#include ]]) + ]) + + AS_IF([test x"$enable_efa" != x"no"], + [AC_CHECK_DECL(EFADV_DEVICE_ATTR_CAPS_RNR_RETRY, + [AC_DEFINE([HAVE_CAPS_RNR_RETRY], [1], [EFADV_DEVICE_ATTR_CAPS_RNR_RETRY is defined])], + [], + [[#include ]]) + ]) + CPPFLAGS=$save_CPPFLAGS AS_IF([test $efa_happy -eq 1 ], [$1], [$2]) + + efa_CPPFLAGS="$efa_ibverbs_CPPFLAGS $efadv_CPPFLAGS" + efa_LDFLAGS="$efa_ibverbs_LDFLAGS $efadv_LDFLAGS" + efa_LIBS="$efa_ibverbs_LIBS $efadv_LIBS" + AC_SUBST(efa_CPPFLAGS) + AC_SUBST(efa_LDFLAGS) + AC_SUBST(efa_LIBS) +]) + +dnl +dnl Per https://github.com/ofiwg/libfabric/issues/2070, it is possible +dnl that the AC_CHECK_LIB test for libibverbs is not sufficient -- +dnl i.e., AC_CHECK_LIB may succeed, but then linking with libtool may +dnl fail. This test therefore double checks that we can successfully +dnl use libtool to link against libibverbs. NOTE: this test is +dnl contingent upon LT_OUTPUT having already been invoked (i.e., so that +dnl the libtool script exists). +dnl +AC_DEFUN([FI_EFA_DOUBLE_CHECK_LIBIBVERBS],[ + AC_MSG_CHECKING(if libibverbs is linkable by libtool) + file=conftemp.$$.c + rm -f $file conftemp + cat > $file <<-EOF +char ibv_open_device (); +int main () +{ return ibv_open_device (); } +EOF + + cmd="./libtool --mode=link --tag=CC $CC $CPPFLAGS $CFLAGS $file -o conftemp $LDFLAGS -libverbs" + echo "configure:$LINENO: $cmd" >> config.log 2>&1 + eval $cmd >> config.log 2>&1 + status=$? + AS_IF([test $status -eq 0 && test -x conftemp], + [AC_MSG_RESULT(yes) + efa_happy=1], + [AC_MSG_RESULT(no) + echo "configure: failed program was" >> config.log + cat $file >> config.log + efa_happy=0]) + + rm -f $file conftemp ]) diff --git a/prov/efa/docs/building.md b/prov/efa/docs/building.md new file mode 100644 index 00000000000..2a3f7ce9e4e --- /dev/null +++ b/prov/efa/docs/building.md @@ -0,0 +1,55 @@ +## Building the EFA Libfabric Provider + +This document describes how to build the Libfabric provider once you've +followed the prerequisite steps to install required software, see the overview +doc if you are unsure what's needed. + +An example of building and installing Libfabric and verifying that the EFA +device is available via libfabric: +``` +$ ./autogen.sh +$ ./configure --enable-efa= --prefix=$PWD/install +$ make -j install +$ ./install/bin/fi_info -p efa +provider: efa + fabric: EFA-fe80::df:57ff:fe1a:beb3 + domain: efa_0-rdm + version: 112.0 + type: FI_EP_RDM + protocol: FI_PROTO_EFA +provider: efa + fabric: EFA-fe80::df:57ff:fe1a:beb3 + domain: efa_0-dgrm + version: 112.0 + type: FI_EP_DGRAM + protocol: FI_PROTO_EFA +``` + +Configure flags that may be useful in the context of the EFA provider: + +* `--enable-debug`: will turn on `FI_LOG_LEVEL=debug`, add `-g` among others to +CFLAGS (see configure.ac for full list), and compile in some extra data +structures that may be helpful for debugging. Note that debug will likely +impact performance. See `ENABLE_DEBUG` in the code. +* `--enable-efa`: allows you to specify the rdma-core install path which is +needed if rdma-core is not in the default paths. Also allows you to compile the +provider as a shared library. +* `--enable-efa-mem-poisoning`: Write a poison value into memory structures after +they are freed. 
This has a performance overhead like debug. See +`ENABLE_EFA_POISONING` in the code. +* `--with-cuda`: Build Libfabric with cuda support (if cuda libraries are not in +the default path). The EFA provider supports sends/RDMA reads with GPUDirect +via FI_HMEM when Libfabric has CUDA support enabled. +* `--with-gdrcopy`: Build Libfabric with the NVIDIA GDRCopy library enabled. If +not enabled the EFA provider will have to utilize the EFA device (via a +loopback read) to copy receives in the bounce buffers (host memory) matched to +GPU memory. + +CFLAGS that might be useful: + +* `RXR_PERF_ENABLED`: enable the perf hooks to determine cycle/instruction count +for functions in the send/receive/completion paths. See fi_hook(7) and the +Linux perf documentation for more information. +* `ENABLE_RXR_PKT_DUMP`: turn on packet dump prints, very verbose. These +functions haven't been kept up to date with recent protocol changes so this +might not be useful until fixed. diff --git a/prov/efa/docs/overview.md b/prov/efa/docs/overview.md new file mode 100644 index 00000000000..5dc4588624d --- /dev/null +++ b/prov/efa/docs/overview.md @@ -0,0 +1,69 @@ +## EFA Libfabric Provider Documentation + +The EFA Libfabric provider supports the Amazon Elastic Fabric Adapter (EFA), an +OS bypass network interface available on Amazon EC2 instances. The EFA device +supports both reliable and unreliable datagram send and receive semantics, the +EFA Libfabric provider adds additional functionality in software such as tag +matching, reordering, and software emulation for features the hardware does not +support natively. EFA provides lower and more consistent latency and higher +throughput compared to TCP transports which provides better application +performance for HPC and Machine Learning applications on Amazon EC2. + +Please see the [fi_efa(7) man +page](https://ofiwg.github.io/libfabric/master/man/fi_efa.7.html) for more +information on the features and capabilities of the EFA Libfabric provider. + +### Background information + +The EFA developer documentation assumes a working knowledge of OS bypass +networking and the Libfabric API. The [OFI Programmer's +Guide](https://github.com/ofiwg/ofi-guide/blob/master/OFIGuide.md) provides +motivation for Libfabric and defines the API and structures used by Libfabric +applications. + +For more information on EFA, SRD and the [AWS Nitro +System](https://aws.amazon.com/ec2/nitro/), please refer to these resources: + +* [A Cloud-Optimized Transport Protocol for Elastic and Scalable + HPC](https://ieeexplore.ieee.org/document/91673990) whitepaper +* [AWS re:Invent 2019 - Monday Night Live with Peter + DeSantis](https://www.youtube.com/watch?v=GPUWATKe15E&feature=youtu.be&t=228) + keynote +* [HPC Application Scaling with Elastic Fabric Adapter (EFA) and Scalable + Reliable Datagram + (SRD)](https://pages.awscloud.com/HPC-Application-Scaling-with-Elastic-Fabric-Adapter-EFA-and-Scalable-Reliable-Datagram-SRD_2020_0004-CMP_OD.html) + tech talk + +### Getting started with EFA Libfabric provider development + +You will need an Amazon EC2 instance which has EFA support. The [EFA getting +started guide](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html) in +the AWS documentation is a good reference on which Amazon EC2 instances support +EFA and how to setup an EFA enabled instance. 
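+
+As a quick check beyond running `fi_info -p efa`, an application can query for
+the EFA provider directly through `fi_getinfo`. The sketch below is
+illustrative only (it is not part of the provider sources) and assumes a
+libfabric installation built with EFA support; it simply prints the fabrics
+and domains the provider reports:
+
+```
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <rdma/fabric.h>
+
+int main(void)
+{
+	struct fi_info *hints, *info, *cur;
+	int ret;
+
+	hints = fi_allocinfo();
+	if (!hints)
+		return EXIT_FAILURE;
+
+	/* Restrict discovery to the EFA provider and RDM endpoints. */
+	hints->fabric_attr->prov_name = strdup("efa");
+	hints->ep_attr->type = FI_EP_RDM;
+
+	ret = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, &info);
+	if (ret) {
+		fprintf(stderr, "fi_getinfo: %s\n", fi_strerror(-ret));
+		fi_freeinfo(hints);
+		return EXIT_FAILURE;
+	}
+
+	for (cur = info; cur; cur = cur->next)
+		printf("provider: %s, fabric: %s, domain: %s\n",
+		       cur->fabric_attr->prov_name,
+		       cur->fabric_attr->name,
+		       cur->domain_attr->name);
+
+	fi_freeinfo(info);
+	fi_freeinfo(hints);
+	return 0;
+}
+```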
+ +To get started with EFA Libfabric provider development, you will need to either +install a pre-packaged version or build the: + +* [EFA kernel + driver](https://github.com/amzn/amzn-drivers/tree/master/kernel/linux/efa) - The + driver is required to enable the EFA device and is utilized by libibverbs and + Libfabric to setup and teardown device resources such as queue pairs, + completion queues, memory registration, and address handles. Some OS + distributions provide an up-to-date version of the EFA kernel driver such as + Amazon Linux 2 and Ubuntu. +* [rdma-core](https://github.com/linux-rdma/rdma-core) - The EFA Libfabric + provider utilizes the libibverbs library which provides an abstraction layer + for the Linux kernel verbs interface. This avoids tightly coupling the + Libfabric provider to the EFA kernel driver and simplifies the Libfabric + provider. Similar to the driver, there are OS distributions that pre-package + rdma-core. EFA device support was added to rdma-core version 24.0. However, + it's best to use the latest rdma-core release for bugfixes and to support the + latest device features. +* Ensure you have configured your instance to increase the locked memory limits + (unlimited is fine) and set aside [huge + pages](https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt) for the best + performance. The EFA Libfabric provider will utilize these for the bounce + buffers used for sends and matching receives. +* `FI_HMEM` and CUDA support for NVIDIA GPUDirect + EFA is available, see the + `--with-cuda` configure flag. GPUDirect will be enabled by default if CUDA is + installed in the default system paths. diff --git a/prov/efa/docs/pkt-processing.md b/prov/efa/docs/pkt-processing.md new file mode 100644 index 00000000000..5995b28e979 --- /dev/null +++ b/prov/efa/docs/pkt-processing.md @@ -0,0 +1,86 @@ +## EFA Libfabric Send/Receive/Completion Paths + +### Overview + +The EFA provider supports two different endpoint types, `FI_EP_RDM` and +`FI_EP_DGRAM`. This document covers `FI_EP_RDM` as it implements a wire +protocol and software support for some of the Libfabric API such as tag +matching, send after send ordering guarantees, segmentation and reassembly for +large messages, emulation for RMA and atomics, and more. + +There are a couple key data structures that are used to implement these +software-level features. The wire protocol that we implement is covered in a +separate document. + +### Relevant data structures and functions + +`rxr_ep` contains device information and structures for the endpoint including +the device/shm endpoints and completion queues and their state, the packet +pools for recv/send, outstanding app receives to be matched, outstanding sends +in progress, sends and receives queued due to resource exhaustion, unexpected +messages, and structures to track out of order packets and remote peer +capabilities and status. + +`rxr_tx_entry` contains information and structures for a send posted either +directly by the app or indirectly such as an emulated read/write. When the send +is completed a send completion will be written and the tx_entry will be +released. + +`rxr_rx_entry` contains information and structures for a receive posted by the +app. This structure is used for tag matching, to queue unexpected messages to +be matched later, and to keep track of whether long message receives are +complete. Just like the tx_entry, when done a receive completion is written to +the app and the rx_entry is freed. 
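+
+The tag matching step mentioned above follows the standard Libfabric tagged
+message rule: a posted receive matches an incoming message when the tag bits
+agree everywhere outside the receive's ignore mask. The helper below is a
+hedged illustration of that rule only; the structure and function names are
+hypothetical and are not the provider's actual definitions:
+
+```
+#include <stdbool.h>
+#include <stdint.h>
+
+/* Hypothetical, simplified stand-in for the matching fields carried by an
+ * rx_entry; the real rxr_rx_entry holds much more state. */
+struct pseudo_rx_match {
+	uint64_t tag;     /* tag supplied by the posted receive */
+	uint64_t ignore;  /* bits the receive does not care about */
+};
+
+/* Match if every bit outside the ignore mask is identical. */
+static bool pseudo_rx_matches(const struct pseudo_rx_match *rx, uint64_t msg_tag)
+{
+	return ((rx->tag ^ msg_tag) & ~rx->ignore) == 0;
+}
+```
+
+When no posted receive matches, the incoming message is queued as unexpected
+and the same comparison is re-run later against newly posted receives.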
+ +`rxr_ep_progress` is the progress handler we register when the completion queue +is created and is called via the util completion queue functions. While the EFA +device will progress sends and receives posted to it, the Libfabric provider +has to process those device completions, potentially copy data out of a bounce +buffer into the application buffer, and write the application completions. This +all happens in this function. The progress handler also progresses long +messages and queued messages. + +### Dealing with device resource exhaustion + +The EFA device has fixed send and receive queue sizes which the Libfabric +provider has to manage. In general, we try to write an error to the app when +resources are exhausted as the app can manage resource exhaustion better than +the provider. However, there are some cases where we have to queue packets or +store state about a send or receive to be acted on later. + +The first case is control messages that have to be queued, for example, we may +send parts of a message and then hit the device limit when sending a segmented, +medium message, or fail to send a control packet containing information that +can't be reconstructed in the future. `rxr_pkt_post_ctrl_or_queue` handles +those cases. + +We also may queue an rx/tx entry if we're unable to continue sending segments +or if we fail to post a control message for that entry. You'll find the lists +where those are queued and progressed in `rxr_ep_progress_internal`. + +### Dealing with receiver not ready errors (RNR) + +Note: this functionality is currently turned off. We configure the device to do +infinite retries as there are known bugs in the queuing/RNR logic that need to +be resolved first. + +Finally, the EFA device may write an error completion for RNR, meaning there is +no receive buffer available for the device to place the payload. This can +happen when the application is not posting receive buffers fast enough, but for +the `FI_EP_RDM` receive buffers are pre posted as packets are processed. When +we get RNR in that case, this means that a peer is overloaded. This can happen +for any control or data packet we post, so to handle this we queue these +packets to be sent later after we backoff for the remote peer. + +The occasional RNR is expected so we configure the device to retransmit a +handful of times without writing an error to the host. This is to avoid the +latency penalty of the device writing an error completion, the provider +processing that completion, and trying the send again. However, once the +Libfabric provider receives an RNR for the same packet that we already tried to +retransmit we start random exponential backoff for that peer. We stop sending +to that peer until the peer exits backoff, meaning we either received a +successful send completion for that peer or the backoff timer expires. + +See `rxr_cq_queue_pkt` for where the packets are queued and backoff timers are +set, and see `rxr_ep_check_peer_backoff_timer` for where those timers are +checked and we allow sends to that remote peer again. diff --git a/prov/efa/include/infiniband/efa_arch.h b/prov/efa/include/infiniband/efa_arch.h deleted file mode 100644 index 9734db7c5ed..00000000000 --- a/prov/efa/include/infiniband/efa_arch.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef INFINIBAND_ARCH_H -#define INFINIBAND_ARCH_H - -#include -#include -#include - -/* - * Architecture-specific defines. Currently, an architecture is - * required to implement the following operations: - * - * mb() - memory barrier. No loads or stores may be reordered across - * this macro by either the compiler or the CPU. - * rmb() - read memory barrier. No loads may be reordered across this - * macro by either the compiler or the CPU. - * wmb() - write memory barrier. No stores may be reordered across - * this macro by either the compiler or the CPU. - * wc_wmb() - flush write combine buffers. No write-combined writes - * will be reordered across this macro by either the compiler or - * the CPU. - */ - -#if defined(__x86_64__) -/* - * Only use lfence for mb() and rmb() because we don't care about - * ordering against non-temporal stores (for now at least). - */ -#define mb() asm volatile("lfence" ::: "memory") -#define rmb() mb() -#define wmb() asm volatile("" ::: "memory") -#define wc_wmb() asm volatile("sfence" ::: "memory") -#else -#warning No architecture specific defines found. Using generic implementation. -#define mb() asm volatile("" ::: "memory") -#define rmb() mb() -#define wmb() mb() -#define wc_wmb() wmb() -#endif - -#endif /* INFINIBAND_ARCH_H */ diff --git a/prov/efa/include/infiniband/efa_kern-abi.h b/prov/efa/include/infiniband/efa_kern-abi.h deleted file mode 100644 index 78471bc0c3e..00000000000 --- a/prov/efa/include/infiniband/efa_kern-abi.h +++ /dev/null @@ -1,1280 +0,0 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ -/* - * Copyright (c) 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. - * Copyright (c) 2005 PathScale, Inc. All rights reserved. - * Copyright (c) 2006 Mellanox Technologies. All rights reserved. - * Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef IB_USER_VERBS_H -#define IB_USER_VERBS_H - -#include - -/* - * This file must be kept in sync with the kernel's version of - * include/uapi/rdma/ib_user_verbs.h - */ - -/* - * The minimum and maximum kernel ABI that we can handle. - */ -#define IB_USER_VERBS_MIN_ABI_VERSION 6 -#define IB_USER_VERBS_MAX_ABI_VERSION 6 - -#define IB_USER_VERBS_CMD_THRESHOLD 50 - -enum { - IB_USER_VERBS_CMD_GET_CONTEXT, - IB_USER_VERBS_CMD_QUERY_DEVICE, - IB_USER_VERBS_CMD_QUERY_PORT, - IB_USER_VERBS_CMD_ALLOC_PD, - IB_USER_VERBS_CMD_DEALLOC_PD, - IB_USER_VERBS_CMD_CREATE_AH, - IB_USER_VERBS_CMD_MODIFY_AH, - IB_USER_VERBS_CMD_QUERY_AH, - IB_USER_VERBS_CMD_DESTROY_AH, - IB_USER_VERBS_CMD_REG_MR, - IB_USER_VERBS_CMD_REG_SMR, - IB_USER_VERBS_CMD_REREG_MR, - IB_USER_VERBS_CMD_QUERY_MR, - IB_USER_VERBS_CMD_DEREG_MR, - IB_USER_VERBS_CMD_ALLOC_MW, - IB_USER_VERBS_CMD_BIND_MW, - IB_USER_VERBS_CMD_DEALLOC_MW, - IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL, - IB_USER_VERBS_CMD_CREATE_CQ, - IB_USER_VERBS_CMD_RESIZE_CQ, - IB_USER_VERBS_CMD_DESTROY_CQ, - IB_USER_VERBS_CMD_POLL_CQ, - IB_USER_VERBS_CMD_PEEK_CQ, - IB_USER_VERBS_CMD_REQ_NOTIFY_CQ, - IB_USER_VERBS_CMD_CREATE_QP, - IB_USER_VERBS_CMD_QUERY_QP, - IB_USER_VERBS_CMD_MODIFY_QP, - IB_USER_VERBS_CMD_DESTROY_QP, - IB_USER_VERBS_CMD_POST_SEND, - IB_USER_VERBS_CMD_POST_RECV, - IB_USER_VERBS_CMD_ATTACH_MCAST, - IB_USER_VERBS_CMD_DETACH_MCAST, - IB_USER_VERBS_CMD_CREATE_SRQ, - IB_USER_VERBS_CMD_MODIFY_SRQ, - IB_USER_VERBS_CMD_QUERY_SRQ, - IB_USER_VERBS_CMD_DESTROY_SRQ, - IB_USER_VERBS_CMD_POST_SRQ_RECV, - IB_USER_VERBS_CMD_OPEN_XRCD, - IB_USER_VERBS_CMD_CLOSE_XRCD, - IB_USER_VERBS_CMD_CREATE_XSRQ, - IB_USER_VERBS_CMD_OPEN_QP, -}; - -enum { - IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE, - IB_USER_VERBS_EX_CMD_CREATE_CQ = IB_USER_VERBS_CMD_CREATE_CQ, - IB_USER_VERBS_EX_CMD_CREATE_QP = IB_USER_VERBS_CMD_CREATE_QP, - IB_USER_VERBS_EX_CMD_MODIFY_QP = IB_USER_VERBS_CMD_MODIFY_QP, - IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, - IB_USER_VERBS_EX_CMD_DESTROY_FLOW, - IB_USER_VERBS_EX_CMD_CREATE_WQ, - IB_USER_VERBS_EX_CMD_MODIFY_WQ, - IB_USER_VERBS_EX_CMD_DESTROY_WQ, - IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL, - IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL, - IB_USER_VERBS_EX_CMD_MODIFY_CQ -}; - -/* - * Make sure that 
all structs defined in this file remain laid out so - * that they pack the same way on 32-bit and 64-bit architectures (to - * avoid incompatibility between 32-bit userspace and 64-bit kernels). - * Specifically: - * - Do not use pointer types -- pass pointers in __u64 instead. - * - Make sure that any structure larger than 4 bytes is padded to a - * multiple of 8 bytes. Otherwise the structure size will be - * different between 32-bit and 64-bit architectures. - */ - -struct ib_uverbs_async_event_desc { - __aligned_u64 element; - __u32 event_type; /* enum ib_event_type */ - __u32 reserved; -}; - -struct ib_uverbs_comp_event_desc { - __aligned_u64 cq_handle; -}; - -struct ib_uverbs_cq_moderation_caps { - __u16 max_cq_moderation_count; - __u16 max_cq_moderation_period; - __u32 reserved; -}; - -/* - * All commands from userspace should start with a __u32 command field - * followed by __u16 in_words and out_words fields (which give the - * length of the command block and response buffer if any in 32-bit - * words). The kernel driver will read these fields first and read - * the rest of the command struct based on these value. - */ - -#define IB_USER_VERBS_CMD_COMMAND_MASK 0xff -#define IB_USER_VERBS_CMD_FLAG_EXTENDED 0x80000000u - -struct ib_uverbs_cmd_hdr { - __u32 command; - __u16 in_words; - __u16 out_words; -}; - -struct ib_uverbs_ex_cmd_hdr { - __aligned_u64 response; - __u16 provider_in_words; - __u16 provider_out_words; - __u32 cmd_hdr_reserved; -}; - -struct ib_uverbs_get_context { - __aligned_u64 response; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_get_context_resp { - __u32 async_fd; - __u32 num_comp_vectors; -}; - -struct ib_uverbs_query_device { - __aligned_u64 response; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_query_device_resp { - __aligned_u64 fw_ver; - __be64 node_guid; - __be64 sys_image_guid; - __aligned_u64 max_mr_size; - __aligned_u64 page_size_cap; - __u32 vendor_id; - __u32 vendor_part_id; - __u32 hw_ver; - __u32 max_qp; - __u32 max_qp_wr; - __u32 device_cap_flags; - __u32 max_sge; - __u32 max_sge_rd; - __u32 max_cq; - __u32 max_cqe; - __u32 max_mr; - __u32 max_pd; - __u32 max_qp_rd_atom; - __u32 max_ee_rd_atom; - __u32 max_res_rd_atom; - __u32 max_qp_init_rd_atom; - __u32 max_ee_init_rd_atom; - __u32 atomic_cap; - __u32 max_ee; - __u32 max_rdd; - __u32 max_mw; - __u32 max_raw_ipv6_qp; - __u32 max_raw_ethy_qp; - __u32 max_mcast_grp; - __u32 max_mcast_qp_attach; - __u32 max_total_mcast_qp_attach; - __u32 max_ah; - __u32 max_fmr; - __u32 max_map_per_fmr; - __u32 max_srq; - __u32 max_srq_wr; - __u32 max_srq_sge; - __u16 max_pkeys; - __u8 local_ca_ack_delay; - __u8 phys_port_cnt; - __u8 reserved[4]; -}; - -struct ib_uverbs_ex_query_device { - __u32 comp_mask; - __u32 reserved; -}; - -struct ib_uverbs_odp_caps { - __aligned_u64 general_caps; - struct { - __u32 rc_odp_caps; - __u32 uc_odp_caps; - __u32 ud_odp_caps; - } per_transport_caps; - __u32 reserved; -}; - -struct ib_uverbs_rss_caps { - /* Corresponding bit will be set if qp type from - * 'enum ib_qp_type' is supported, e.g. 
- * supported_qpts |= 1 << IB_QPT_UD - */ - __u32 supported_qpts; - __u32 max_rwq_indirection_tables; - __u32 max_rwq_indirection_table_size; - __u32 reserved; -}; - -struct ib_uverbs_tm_caps { - /* Max size of rendezvous request message */ - __u32 max_rndv_hdr_size; - /* Max number of entries in tag matching list */ - __u32 max_num_tags; - /* TM flags */ - __u32 flags; - /* Max number of outstanding list operations */ - __u32 max_ops; - /* Max number of SGE in tag matching entry */ - __u32 max_sge; - __u32 reserved; -}; - -struct ib_uverbs_ex_query_device_resp { - struct ib_uverbs_query_device_resp base; - __u32 comp_mask; - __u32 response_length; - struct ib_uverbs_odp_caps odp_caps; - __aligned_u64 timestamp_mask; - __aligned_u64 hca_core_clock; /* in KHZ */ - __aligned_u64 device_cap_flags_ex; - struct ib_uverbs_rss_caps rss_caps; - __u32 max_wq_type_rq; - __u32 raw_packet_caps; - struct ib_uverbs_tm_caps tm_caps; - struct ib_uverbs_cq_moderation_caps cq_moderation_caps; - __aligned_u64 max_dm_size; -}; - -struct ib_uverbs_query_port { - __aligned_u64 response; - __u8 port_num; - __u8 reserved[7]; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_query_port_resp { - __u32 port_cap_flags; - __u32 max_msg_sz; - __u32 bad_pkey_cntr; - __u32 qkey_viol_cntr; - __u32 gid_tbl_len; - __u16 pkey_tbl_len; - __u16 lid; - __u16 sm_lid; - __u8 state; - __u8 max_mtu; - __u8 active_mtu; - __u8 lmc; - __u8 max_vl_num; - __u8 sm_sl; - __u8 subnet_timeout; - __u8 init_type_reply; - __u8 active_width; - __u8 active_speed; - __u8 phys_state; - __u8 link_layer; - __u8 reserved[2]; -}; - -struct ib_uverbs_alloc_pd { - __aligned_u64 response; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_alloc_pd_resp { - __u32 pd_handle; -}; - -struct ib_uverbs_dealloc_pd { - __u32 pd_handle; -}; - -struct ib_uverbs_open_xrcd { - __aligned_u64 response; - __u32 fd; - __u32 oflags; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_open_xrcd_resp { - __u32 xrcd_handle; -}; - -struct ib_uverbs_close_xrcd { - __u32 xrcd_handle; -}; - -struct ib_uverbs_reg_mr { - __aligned_u64 response; - __aligned_u64 start; - __aligned_u64 length; - __aligned_u64 hca_va; - __u32 pd_handle; - __u32 access_flags; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_reg_mr_resp { - __u32 mr_handle; - __u32 lkey; - __u32 rkey; -}; - -struct ib_uverbs_rereg_mr { - __aligned_u64 response; - __u32 mr_handle; - __u32 flags; - __aligned_u64 start; - __aligned_u64 length; - __aligned_u64 hca_va; - __u32 pd_handle; - __u32 access_flags; -}; - -struct ib_uverbs_rereg_mr_resp { - __u32 lkey; - __u32 rkey; -}; - -struct ib_uverbs_dereg_mr { - __u32 mr_handle; -}; - -struct ib_uverbs_alloc_mw { - __aligned_u64 response; - __u32 pd_handle; - __u8 mw_type; - __u8 reserved[3]; -}; - -struct ib_uverbs_alloc_mw_resp { - __u32 mw_handle; - __u32 rkey; -}; - -struct ib_uverbs_dealloc_mw { - __u32 mw_handle; -}; - -struct ib_uverbs_create_comp_channel { - __aligned_u64 response; -}; - -struct ib_uverbs_create_comp_channel_resp { - __u32 fd; -}; - -struct ib_uverbs_create_cq { - __aligned_u64 response; - __aligned_u64 user_handle; - __u32 cqe; - __u32 comp_vector; - __s32 comp_channel; - __u32 reserved; - __aligned_u64 driver_data[0]; -}; - -enum ib_uverbs_ex_create_cq_flags { - IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, - IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1, -}; - -struct ib_uverbs_ex_create_cq { - __aligned_u64 user_handle; - __u32 cqe; - __u32 comp_vector; - __s32 comp_channel; - __u32 comp_mask; - __u32 flags; /* bitmask of 
ib_uverbs_ex_create_cq_flags */ - __u32 reserved; -}; - -struct ib_uverbs_create_cq_resp { - __u32 cq_handle; - __u32 cqe; -}; - -struct ib_uverbs_ex_create_cq_resp { - struct ib_uverbs_create_cq_resp base; - __u32 comp_mask; - __u32 response_length; -}; - -struct ib_uverbs_resize_cq { - __aligned_u64 response; - __u32 cq_handle; - __u32 cqe; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_resize_cq_resp { - __u32 cqe; - __u32 reserved; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_poll_cq { - __aligned_u64 response; - __u32 cq_handle; - __u32 ne; -}; - -struct ib_uverbs_wc { - __aligned_u64 wr_id; - __u32 status; - __u32 opcode; - __u32 vendor_err; - __u32 byte_len; - union { - __be32 imm_data; - __u32 invalidate_rkey; - } ex; - __u32 qp_num; - __u32 src_qp; - __u32 wc_flags; - __u16 pkey_index; - __u16 slid; - __u8 sl; - __u8 dlid_path_bits; - __u8 port_num; - __u8 reserved; -}; - -struct ib_uverbs_poll_cq_resp { - __u32 count; - __u32 reserved; - struct ib_uverbs_wc wc[0]; -}; - -struct ib_uverbs_req_notify_cq { - __u32 cq_handle; - __u32 solicited_only; -}; - -struct ib_uverbs_destroy_cq { - __aligned_u64 response; - __u32 cq_handle; - __u32 reserved; -}; - -struct ib_uverbs_destroy_cq_resp { - __u32 comp_events_reported; - __u32 async_events_reported; -}; - -struct ib_uverbs_global_route { - __u8 dgid[16]; - __u32 flow_label; - __u8 sgid_index; - __u8 hop_limit; - __u8 traffic_class; - __u8 reserved; -}; - -struct ib_uverbs_ah_attr { - struct ib_uverbs_global_route grh; - __u16 dlid; - __u8 sl; - __u8 src_path_bits; - __u8 static_rate; - __u8 is_global; - __u8 port_num; - __u8 reserved; -}; - -struct ib_uverbs_qp_attr { - __u32 qp_attr_mask; - __u32 qp_state; - __u32 cur_qp_state; - __u32 path_mtu; - __u32 path_mig_state; - __u32 qkey; - __u32 rq_psn; - __u32 sq_psn; - __u32 dest_qp_num; - __u32 qp_access_flags; - - struct ib_uverbs_ah_attr ah_attr; - struct ib_uverbs_ah_attr alt_ah_attr; - - /* ib_qp_cap */ - __u32 max_send_wr; - __u32 max_recv_wr; - __u32 max_send_sge; - __u32 max_recv_sge; - __u32 max_inline_data; - - __u16 pkey_index; - __u16 alt_pkey_index; - __u8 en_sqd_async_notify; - __u8 sq_draining; - __u8 max_rd_atomic; - __u8 max_dest_rd_atomic; - __u8 min_rnr_timer; - __u8 port_num; - __u8 timeout; - __u8 retry_cnt; - __u8 rnr_retry; - __u8 alt_port_num; - __u8 alt_timeout; - __u8 reserved[5]; -}; - -struct ib_uverbs_create_qp { - __aligned_u64 response; - __aligned_u64 user_handle; - __u32 pd_handle; - __u32 send_cq_handle; - __u32 recv_cq_handle; - __u32 srq_handle; - __u32 max_send_wr; - __u32 max_recv_wr; - __u32 max_send_sge; - __u32 max_recv_sge; - __u32 max_inline_data; - __u8 sq_sig_all; - __u8 qp_type; - __u8 is_srq; - __u8 reserved; - __aligned_u64 driver_data[0]; -}; - -enum ib_uverbs_create_qp_mask { - IB_UVERBS_CREATE_QP_MASK_IND_TABLE = 1UL << 0, -}; - -enum { - IB_UVERBS_CREATE_QP_SUP_COMP_MASK = IB_UVERBS_CREATE_QP_MASK_IND_TABLE, -}; - -enum { - /* - * This value is equal to IB_QP_DEST_QPN. - */ - IB_USER_LEGACY_LAST_QP_ATTR_MASK = 1ULL << 20, -}; - -enum { - /* - * This value is equal to IB_QP_RATE_LIMIT. 
- */ - IB_USER_LAST_QP_ATTR_MASK = 1ULL << 25, -}; - -struct ib_uverbs_ex_create_qp { - __aligned_u64 user_handle; - __u32 pd_handle; - __u32 send_cq_handle; - __u32 recv_cq_handle; - __u32 srq_handle; - __u32 max_send_wr; - __u32 max_recv_wr; - __u32 max_send_sge; - __u32 max_recv_sge; - __u32 max_inline_data; - __u8 sq_sig_all; - __u8 qp_type; - __u8 is_srq; - __u8 reserved; - __u32 comp_mask; - __u32 create_flags; - __u32 rwq_ind_tbl_handle; - __u32 source_qpn; -}; - -struct ib_uverbs_open_qp { - __aligned_u64 response; - __aligned_u64 user_handle; - __u32 pd_handle; - __u32 qpn; - __u8 qp_type; - __u8 reserved[7]; - __aligned_u64 driver_data[0]; -}; - -/* also used for open response */ -struct ib_uverbs_create_qp_resp { - __u32 qp_handle; - __u32 qpn; - __u32 max_send_wr; - __u32 max_recv_wr; - __u32 max_send_sge; - __u32 max_recv_sge; - __u32 max_inline_data; - __u32 reserved; -}; - -struct ib_uverbs_ex_create_qp_resp { - struct ib_uverbs_create_qp_resp base; - __u32 comp_mask; - __u32 response_length; -}; - -/* - * This struct needs to remain a multiple of 8 bytes to keep the - * alignment of the modify QP parameters. - */ -struct ib_uverbs_qp_dest { - __u8 dgid[16]; - __u32 flow_label; - __u16 dlid; - __u16 reserved; - __u8 sgid_index; - __u8 hop_limit; - __u8 traffic_class; - __u8 sl; - __u8 src_path_bits; - __u8 static_rate; - __u8 is_global; - __u8 port_num; -}; - -struct ib_uverbs_query_qp { - __aligned_u64 response; - __u32 qp_handle; - __u32 attr_mask; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_query_qp_resp { - struct ib_uverbs_qp_dest dest; - struct ib_uverbs_qp_dest alt_dest; - __u32 max_send_wr; - __u32 max_recv_wr; - __u32 max_send_sge; - __u32 max_recv_sge; - __u32 max_inline_data; - __u32 qkey; - __u32 rq_psn; - __u32 sq_psn; - __u32 dest_qp_num; - __u32 qp_access_flags; - __u16 pkey_index; - __u16 alt_pkey_index; - __u8 qp_state; - __u8 cur_qp_state; - __u8 path_mtu; - __u8 path_mig_state; - __u8 sq_draining; - __u8 max_rd_atomic; - __u8 max_dest_rd_atomic; - __u8 min_rnr_timer; - __u8 port_num; - __u8 timeout; - __u8 retry_cnt; - __u8 rnr_retry; - __u8 alt_port_num; - __u8 alt_timeout; - __u8 sq_sig_all; - __u8 reserved[5]; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_modify_qp { - struct ib_uverbs_qp_dest dest; - struct ib_uverbs_qp_dest alt_dest; - __u32 qp_handle; - __u32 attr_mask; - __u32 qkey; - __u32 rq_psn; - __u32 sq_psn; - __u32 dest_qp_num; - __u32 qp_access_flags; - __u16 pkey_index; - __u16 alt_pkey_index; - __u8 qp_state; - __u8 cur_qp_state; - __u8 path_mtu; - __u8 path_mig_state; - __u8 en_sqd_async_notify; - __u8 max_rd_atomic; - __u8 max_dest_rd_atomic; - __u8 min_rnr_timer; - __u8 port_num; - __u8 timeout; - __u8 retry_cnt; - __u8 rnr_retry; - __u8 alt_port_num; - __u8 alt_timeout; - __u8 reserved[2]; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_ex_modify_qp { - struct ib_uverbs_modify_qp base; - __u32 rate_limit; - __u32 reserved; -}; - -struct ib_uverbs_modify_qp_resp { -}; - -struct ib_uverbs_ex_modify_qp_resp { - __u32 comp_mask; - __u32 response_length; -}; - -struct ib_uverbs_destroy_qp { - __aligned_u64 response; - __u32 qp_handle; - __u32 reserved; -}; - -struct ib_uverbs_destroy_qp_resp { - __u32 events_reported; -}; - -/* - * The ib_uverbs_sge structure isn't used anywhere, since we assume - * the ib_sge structure is packed the same way on 32-bit and 64-bit - * architectures in both kernel and user space. It's just here to - * document the ABI. 
- */ -struct ib_uverbs_sge { - __aligned_u64 addr; - __u32 length; - __u32 lkey; -}; - -struct ib_uverbs_send_wr { - __aligned_u64 wr_id; - __u32 num_sge; - __u32 opcode; - __u32 send_flags; - union { - __be32 imm_data; - __u32 invalidate_rkey; - } ex; - union { - struct { - __aligned_u64 remote_addr; - __u32 rkey; - __u32 reserved; - } rdma; - struct { - __aligned_u64 remote_addr; - __aligned_u64 compare_add; - __aligned_u64 swap; - __u32 rkey; - __u32 reserved; - } atomic; - struct { - __u32 ah; - __u32 remote_qpn; - __u32 remote_qkey; - __u32 reserved; - } ud; - } wr; -}; - -struct ib_uverbs_post_send { - __aligned_u64 response; - __u32 qp_handle; - __u32 wr_count; - __u32 sge_count; - __u32 wqe_size; - struct ib_uverbs_send_wr send_wr[0]; -}; - -struct ib_uverbs_post_send_resp { - __u32 bad_wr; -}; - -struct ib_uverbs_recv_wr { - __aligned_u64 wr_id; - __u32 num_sge; - __u32 reserved; -}; - -struct ib_uverbs_post_recv { - __aligned_u64 response; - __u32 qp_handle; - __u32 wr_count; - __u32 sge_count; - __u32 wqe_size; - struct ib_uverbs_recv_wr recv_wr[0]; -}; - -struct ib_uverbs_post_recv_resp { - __u32 bad_wr; -}; - -struct ib_uverbs_post_srq_recv { - __aligned_u64 response; - __u32 srq_handle; - __u32 wr_count; - __u32 sge_count; - __u32 wqe_size; - struct ib_uverbs_recv_wr recv[0]; -}; - -struct ib_uverbs_post_srq_recv_resp { - __u32 bad_wr; -}; - -struct ib_uverbs_create_ah { - __aligned_u64 response; - __aligned_u64 user_handle; - __u32 pd_handle; - __u32 reserved; - struct ib_uverbs_ah_attr attr; -}; - -struct ib_uverbs_create_ah_resp { - __u32 ah_handle; -}; - -struct ib_uverbs_destroy_ah { - __u32 ah_handle; -}; - -struct ib_uverbs_attach_mcast { - __u8 gid[16]; - __u32 qp_handle; - __u16 mlid; - __u16 reserved; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_detach_mcast { - __u8 gid[16]; - __u32 qp_handle; - __u16 mlid; - __u16 reserved; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_flow_spec_hdr { - __u32 type; - __u16 size; - __u16 reserved; - /* followed by flow_spec */ - __aligned_u64 flow_spec_data[0]; -}; - -struct ib_uverbs_flow_eth_filter { - __u8 dst_mac[6]; - __u8 src_mac[6]; - __be16 ether_type; - __be16 vlan_tag; -}; - -struct ib_uverbs_flow_spec_eth { - union { - struct ib_uverbs_flow_spec_hdr hdr; - struct { - __u32 type; - __u16 size; - __u16 reserved; - }; - }; - struct ib_uverbs_flow_eth_filter val; - struct ib_uverbs_flow_eth_filter mask; -}; - -struct ib_uverbs_flow_ipv4_filter { - __be32 src_ip; - __be32 dst_ip; - __u8 proto; - __u8 tos; - __u8 ttl; - __u8 flags; -}; - -struct ib_uverbs_flow_spec_ipv4 { - union { - struct ib_uverbs_flow_spec_hdr hdr; - struct { - __u32 type; - __u16 size; - __u16 reserved; - }; - }; - struct ib_uverbs_flow_ipv4_filter val; - struct ib_uverbs_flow_ipv4_filter mask; -}; - -struct ib_uverbs_flow_tcp_udp_filter { - __be16 dst_port; - __be16 src_port; -}; - -struct ib_uverbs_flow_spec_tcp_udp { - union { - struct ib_uverbs_flow_spec_hdr hdr; - struct { - __u32 type; - __u16 size; - __u16 reserved; - }; - }; - struct ib_uverbs_flow_tcp_udp_filter val; - struct ib_uverbs_flow_tcp_udp_filter mask; -}; - -struct ib_uverbs_flow_ipv6_filter { - __u8 src_ip[16]; - __u8 dst_ip[16]; - __be32 flow_label; - __u8 next_hdr; - __u8 traffic_class; - __u8 hop_limit; - __u8 reserved; -}; - -struct ib_uverbs_flow_spec_ipv6 { - union { - struct ib_uverbs_flow_spec_hdr hdr; - struct { - __u32 type; - __u16 size; - __u16 reserved; - }; - }; - struct ib_uverbs_flow_ipv6_filter val; - struct ib_uverbs_flow_ipv6_filter mask; -}; - 
-struct ib_uverbs_flow_spec_action_tag { - union { - struct ib_uverbs_flow_spec_hdr hdr; - struct { - __u32 type; - __u16 size; - __u16 reserved; - }; - }; - __u32 tag_id; - __u32 reserved1; -}; - -struct ib_uverbs_flow_spec_action_drop { - union { - struct ib_uverbs_flow_spec_hdr hdr; - struct { - __u32 type; - __u16 size; - __u16 reserved; - }; - }; -}; - -struct ib_uverbs_flow_spec_action_handle { - union { - struct ib_uverbs_flow_spec_hdr hdr; - struct { - __u32 type; - __u16 size; - __u16 reserved; - }; - }; - __u32 handle; - __u32 reserved1; -}; - -struct ib_uverbs_flow_spec_action_count { - union { - struct ib_uverbs_flow_spec_hdr hdr; - struct { - __u32 type; - __u16 size; - __u16 reserved; - }; - }; - __u32 handle; - __u32 reserved1; -}; - -struct ib_uverbs_flow_tunnel_filter { - __be32 tunnel_id; -}; - -struct ib_uverbs_flow_spec_tunnel { - union { - struct ib_uverbs_flow_spec_hdr hdr; - struct { - __u32 type; - __u16 size; - __u16 reserved; - }; - }; - struct ib_uverbs_flow_tunnel_filter val; - struct ib_uverbs_flow_tunnel_filter mask; -}; - -struct ib_uverbs_flow_spec_esp_filter { - __u32 spi; - __u32 seq; -}; - -struct ib_uverbs_flow_spec_esp { - union { - struct ib_uverbs_flow_spec_hdr hdr; - struct { - __u32 type; - __u16 size; - __u16 reserved; - }; - }; - struct ib_uverbs_flow_spec_esp_filter val; - struct ib_uverbs_flow_spec_esp_filter mask; -}; - -struct ib_uverbs_flow_gre_filter { - /* c_ks_res0_ver field is bits 0-15 in offset 0 of a standard GRE header: - * bit 0 - C - checksum bit. - * bit 1 - reserved. set to 0. - * bit 2 - key bit. - * bit 3 - sequence number bit. - * bits 4:12 - reserved. set to 0. - * bits 13:15 - GRE version. - */ - __be16 c_ks_res0_ver; - __be16 protocol; - __be32 key; -}; - -struct ib_uverbs_flow_spec_gre { - union { - struct ib_uverbs_flow_spec_hdr hdr; - struct { - __u32 type; - __u16 size; - __u16 reserved; - }; - }; - struct ib_uverbs_flow_gre_filter val; - struct ib_uverbs_flow_gre_filter mask; -}; - -struct ib_uverbs_flow_mpls_filter { - /* The field includes the entire MPLS label: - * bits 0:19 - label field. - * bits 20:22 - traffic class field. - * bits 23 - bottom of stack bit. - * bits 24:31 - ttl field. 
- */ - __be32 label; -}; - -struct ib_uverbs_flow_spec_mpls { - union { - struct ib_uverbs_flow_spec_hdr hdr; - struct { - __u32 type; - __u16 size; - __u16 reserved; - }; - }; - struct ib_uverbs_flow_mpls_filter val; - struct ib_uverbs_flow_mpls_filter mask; -}; - -struct ib_uverbs_flow_attr { - __u32 type; - __u16 size; - __u16 priority; - __u8 num_of_specs; - __u8 reserved[2]; - __u8 port; - __u32 flags; - /* Following are the optional layers according to user request - * struct ib_flow_spec_xxx - * struct ib_flow_spec_yyy - */ - struct ib_uverbs_flow_spec_hdr flow_specs[0]; -}; - -struct ib_uverbs_create_flow { - __u32 comp_mask; - __u32 qp_handle; - struct ib_uverbs_flow_attr flow_attr; -}; - -struct ib_uverbs_create_flow_resp { - __u32 comp_mask; - __u32 flow_handle; -}; - -struct ib_uverbs_destroy_flow { - __u32 comp_mask; - __u32 flow_handle; -}; - -struct ib_uverbs_create_srq { - __aligned_u64 response; - __aligned_u64 user_handle; - __u32 pd_handle; - __u32 max_wr; - __u32 max_sge; - __u32 srq_limit; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_create_xsrq { - __aligned_u64 response; - __aligned_u64 user_handle; - __u32 srq_type; - __u32 pd_handle; - __u32 max_wr; - __u32 max_sge; - __u32 srq_limit; - __u32 max_num_tags; - __u32 xrcd_handle; - __u32 cq_handle; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_create_srq_resp { - __u32 srq_handle; - __u32 max_wr; - __u32 max_sge; - __u32 srqn; -}; - -struct ib_uverbs_modify_srq { - __u32 srq_handle; - __u32 attr_mask; - __u32 max_wr; - __u32 srq_limit; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_query_srq { - __aligned_u64 response; - __u32 srq_handle; - __u32 reserved; - __aligned_u64 driver_data[0]; -}; - -struct ib_uverbs_query_srq_resp { - __u32 max_wr; - __u32 max_sge; - __u32 srq_limit; - __u32 reserved; -}; - -struct ib_uverbs_destroy_srq { - __aligned_u64 response; - __u32 srq_handle; - __u32 reserved; -}; - -struct ib_uverbs_destroy_srq_resp { - __u32 events_reported; -}; - -struct ib_uverbs_ex_create_wq { - __u32 comp_mask; - __u32 wq_type; - __aligned_u64 user_handle; - __u32 pd_handle; - __u32 cq_handle; - __u32 max_wr; - __u32 max_sge; - __u32 create_flags; /* Use enum ib_wq_flags */ - __u32 reserved; -}; - -struct ib_uverbs_ex_create_wq_resp { - __u32 comp_mask; - __u32 response_length; - __u32 wq_handle; - __u32 max_wr; - __u32 max_sge; - __u32 wqn; -}; - -struct ib_uverbs_ex_destroy_wq { - __u32 comp_mask; - __u32 wq_handle; -}; - -struct ib_uverbs_ex_destroy_wq_resp { - __u32 comp_mask; - __u32 response_length; - __u32 events_reported; - __u32 reserved; -}; - -struct ib_uverbs_ex_modify_wq { - __u32 attr_mask; - __u32 wq_handle; - __u32 wq_state; - __u32 curr_wq_state; - __u32 flags; /* Use enum ib_wq_flags */ - __u32 flags_mask; /* Use enum ib_wq_flags */ -}; - -/* Prevent memory allocation rather than max expected size */ -#define IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE 0x0d -struct ib_uverbs_ex_create_rwq_ind_table { - __u32 comp_mask; - __u32 log_ind_tbl_size; - /* Following are the wq handles according to log_ind_tbl_size - * wq_handle1 - * wq_handle2 - */ - __u32 wq_handles[0]; -}; - -struct ib_uverbs_ex_create_rwq_ind_table_resp { - __u32 comp_mask; - __u32 response_length; - __u32 ind_tbl_handle; - __u32 ind_tbl_num; -}; - -struct ib_uverbs_ex_destroy_rwq_ind_table { - __u32 comp_mask; - __u32 ind_tbl_handle; -}; - -struct ib_uverbs_cq_moderation { - __u16 cq_count; - __u16 cq_period; -}; - -struct ib_uverbs_ex_modify_cq { - __u32 cq_handle; - __u32 attr_mask; - struct 
ib_uverbs_cq_moderation attr; - __u32 reserved; -}; - -#define IB_DEVICE_NAME_MAX 64 - -#endif /* IB_USER_VERBS_H */ diff --git a/prov/efa/include/infiniband/efa_verbs.h b/prov/efa/include/infiniband/efa_verbs.h deleted file mode 100644 index 8f3f7df5472..00000000000 --- a/prov/efa/include/infiniband/efa_verbs.h +++ /dev/null @@ -1,373 +0,0 @@ -/* - * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. - * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2005 PathScale, Inc. All rights reserved. - * Copyright (c) 2017-2019 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef INFINIBAND_VERBS_H -#define INFINIBAND_VERBS_H - -#include -#include -#include -#include - -#ifdef __cplusplus -# define BEGIN_C_DECLS extern "C" { -# define END_C_DECLS } -#else /* !__cplusplus */ -# define BEGIN_C_DECLS -# define END_C_DECLS -#endif /* __cplusplus */ - -#if __GNUC__ >= 3 -# define __attribute_const __attribute__((const)) -#else -# define __attribute_const -#endif - -BEGIN_C_DECLS - -union ibv_gid { - uint8_t raw[16]; - struct { - uint64_t subnet_prefix; - uint64_t interface_id; - } global; -}; - -#ifndef container_of -/** - * container_of - cast a member of a structure out to the containing structure - * @ptr: the pointer to the member. - * @type: the type of the container struct this is embedded in. - * @member: the name of the member within the struct. 
- * - */ -#define container_of(ptr, type, member) \ - ((type *) ((uint8_t *)(ptr) - offsetof(type, member))) -#endif - -enum ibv_node_type { - IBV_NODE_UNKNOWN = -1, - IBV_NODE_CA = 1, - IBV_NODE_SWITCH, - IBV_NODE_ROUTER, - IBV_NODE_RNIC, - IBV_NODE_USNIC, - IBV_NODE_USNIC_UDP, -}; - -enum ibv_transport_type { - IBV_TRANSPORT_UNKNOWN = -1, - IBV_TRANSPORT_IB = 0, - IBV_TRANSPORT_IWARP, - IBV_TRANSPORT_USNIC, - IBV_TRANSPORT_USNIC_UDP, -}; - -enum ibv_atomic_cap { - IBV_ATOMIC_NONE, - IBV_ATOMIC_HCA, - IBV_ATOMIC_GLOB -}; - -struct ibv_device_attr { - char fw_ver[64]; - uint64_t node_guid; - uint64_t sys_image_guid; - uint64_t max_mr_size; - uint64_t page_size_cap; - uint32_t vendor_id; - uint32_t vendor_part_id; - uint32_t hw_ver; - int max_qp; - int max_qp_wr; - int device_cap_flags; - int max_sge; - int max_sge_rd; - int max_cq; - int max_cqe; - int max_mr; - int max_pd; - int max_qp_rd_atom; - int max_ee_rd_atom; - int max_res_rd_atom; - int max_qp_init_rd_atom; - int max_ee_init_rd_atom; - enum ibv_atomic_cap atomic_cap; - int max_ee; - int max_rdd; - int max_mw; - int max_raw_ipv6_qp; - int max_raw_ethy_qp; - int max_mcast_grp; - int max_mcast_qp_attach; - int max_total_mcast_qp_attach; - int max_ah; - int max_fmr; - int max_map_per_fmr; - int max_srq; - int max_srq_wr; - int max_srq_sge; - uint16_t max_pkeys; - uint8_t local_ca_ack_delay; - uint8_t phys_port_cnt; -}; - -enum ibv_mtu { - IBV_MTU_256 = 1, - IBV_MTU_512 = 2, - IBV_MTU_1024 = 3, - IBV_MTU_2048 = 4, - IBV_MTU_4096 = 5 -}; - -enum ibv_port_state { - IBV_PORT_NOP = 0, - IBV_PORT_DOWN = 1, - IBV_PORT_INIT = 2, - IBV_PORT_ARMED = 3, - IBV_PORT_ACTIVE = 4, - IBV_PORT_ACTIVE_DEFER = 5 -}; - -struct ibv_port_attr { - enum ibv_port_state state; - enum ibv_mtu max_mtu; - enum ibv_mtu active_mtu; - int gid_tbl_len; - uint32_t port_cap_flags; - uint32_t max_msg_sz; - uint32_t bad_pkey_cntr; - uint32_t qkey_viol_cntr; - uint16_t pkey_tbl_len; - uint16_t lid; - uint16_t sm_lid; - uint8_t lmc; - uint8_t max_vl_num; - uint8_t sm_sl; - uint8_t subnet_timeout; - uint8_t init_type_reply; - uint8_t active_width; - uint8_t active_speed; - uint8_t phys_state; - uint8_t link_layer; - uint8_t reserved; -}; - -enum ibv_access_flags { - IBV_ACCESS_LOCAL_WRITE = 1, - IBV_ACCESS_REMOTE_WRITE = (1<<1), - IBV_ACCESS_REMOTE_READ = (1<<2), - IBV_ACCESS_REMOTE_ATOMIC = (1<<3), - IBV_ACCESS_MW_BIND = (1<<4) -}; - -struct ibv_pd { - struct ibv_context *context; - uint32_t handle; -}; - -struct ibv_mr { - struct ibv_context *context; - struct ibv_pd *pd; - void *addr; - size_t length; - uint32_t handle; - uint32_t lkey; - uint32_t rkey; -}; - -struct ibv_global_route { - union ibv_gid dgid; - uint32_t flow_label; - uint8_t sgid_index; - uint8_t hop_limit; - uint8_t traffic_class; -}; - -struct ibv_ah_attr { - struct ibv_global_route grh; - uint16_t dlid; - uint8_t sl; - uint8_t src_path_bits; - uint8_t static_rate; - uint8_t is_global; - uint8_t port_num; -}; - -enum ibv_qp_type { - IBV_QPT_RC = 2, - IBV_QPT_UC, - IBV_QPT_UD, - IBV_QPT_RAW_PACKET = 8, - IBV_QPT_XRC_SEND = 9, - IBV_QPT_XRC_RECV, - IBV_QPT_DRIVER = 0xff, -}; - -struct ibv_qp_cap { - uint32_t max_send_wr; - uint32_t max_recv_wr; - uint32_t max_send_sge; - uint32_t max_recv_sge; - uint32_t max_inline_data; -}; - -struct ibv_qp_init_attr { - void *qp_context; - struct ibv_cq *send_cq; - struct ibv_cq *recv_cq; - struct ibv_srq *srq; - struct ibv_qp_cap cap; - enum ibv_qp_type qp_type; - int sq_sig_all; -}; - -enum ibv_qp_state { - IBV_QPS_RESET, - IBV_QPS_INIT, - IBV_QPS_RTR, - IBV_QPS_RTS, - 
IBV_QPS_SQD, - IBV_QPS_SQE, - IBV_QPS_ERR, - IBV_QPS_UNKNOWN -}; - -struct ibv_srq { - struct ibv_context *context; - void *srq_context; - struct ibv_pd *pd; - uint32_t handle; - - pthread_mutex_t mutex; - pthread_cond_t cond; - uint32_t events_completed; -}; - -struct ibv_qp { - struct ibv_context *context; - void *qp_context; - struct ibv_pd *pd; - struct ibv_cq *send_cq; - struct ibv_cq *recv_cq; - struct ibv_srq *srq; - uint32_t handle; - uint32_t qp_num; - enum ibv_qp_state state; - enum ibv_qp_type qp_type; - - pthread_mutex_t mutex; - pthread_cond_t cond; - uint32_t events_completed; -}; - -struct ibv_comp_channel { - struct ibv_context *context; - int fd; - int refcnt; -}; - -struct ibv_cq { - struct ibv_context *context; - struct ibv_comp_channel *channel; - void *cq_context; - uint32_t handle; - int cqe; - - pthread_mutex_t mutex; - pthread_cond_t cond; - uint32_t comp_events_completed; - uint32_t async_events_completed; -}; - -struct ibv_ah { - struct ibv_context *context; - struct ibv_pd *pd; - uint32_t handle; -}; - -struct ibv_device; -struct ibv_context; - -struct ibv_device_ops { - struct ibv_context * (*alloc_context)(struct ibv_device *device, int cmd_fd); - void (*free_context)(struct ibv_context *context); -}; - -enum { - IBV_SYSFS_NAME_MAX = 64, - IBV_SYSFS_PATH_MAX = 256 -}; - -struct ibv_device { - struct ibv_device_ops ops; - enum ibv_node_type node_type; - enum ibv_transport_type transport_type; - /* Name of underlying kernel IB device, eg "mthca0" */ - char name[IBV_SYSFS_NAME_MAX]; - /* Name of uverbs device, eg "uverbs0" */ - char dev_name[IBV_SYSFS_NAME_MAX]; - /* Path to infiniband_verbs class device in sysfs */ - char dev_path[IBV_SYSFS_PATH_MAX]; - /* Path to infiniband class device in sysfs */ - char ibdev_path[IBV_SYSFS_PATH_MAX]; -}; - -struct verbs_device { - struct ibv_device device; /* Must be first */ - size_t sz; - size_t size_of_context; - int (*init_context)(struct verbs_device *device, - struct ibv_context *ctx, int cmd_fd); - void (*uninit_context)(struct verbs_device *device, - struct ibv_context *ctx); - /* future fields added here */ -}; - -struct ibv_context { - struct ibv_device *device; - int cmd_fd; - int async_fd; - int num_comp_vectors; - pthread_mutex_t mutex; - void *abi_compat; -}; - -END_C_DECLS - -# undef __attribute_const - -#endif /* INFINIBAND_VERBS_H */ diff --git a/prov/efa/src/efa.h b/prov/efa/src/efa.h index 914fdf40c1a..7e6e6733a6f 100644 --- a/prov/efa/src/efa.h +++ b/prov/efa/src/efa.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright (c) 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -48,24 +48,24 @@ #include #include #include -#include -#include "infiniband/efa_arch.h" -#include "infiniband/efa_verbs.h" #include #include #include #include #include +#include +#include + #include "ofi.h" #include "ofi_enosys.h" #include "ofi_list.h" #include "ofi_util.h" #include "ofi_file.h" +#include "rxr.h" #define EFA_PROV_NAME "efa" -#define EFA_PROV_VERS FI_VERSION(3, 0) #define EFA_WARN(subsys, ...) FI_WARN(&efa_prov, subsys, __VA_ARGS__) #define EFA_TRACE(subsys, ...) 
FI_TRACE(&efa_prov, subsys, __VA_ARGS__) @@ -87,14 +87,24 @@ #define EFA_DEF_CQ_SIZE 1024 #define EFA_MR_IOV_LIMIT 1 -#define EFA_MR_SUPPORTED_PERMISSIONS (FI_SEND | FI_RECV) +#define EFA_MR_SUPPORTED_PERMISSIONS (FI_SEND | FI_RECV | FI_REMOTE_READ) + +/* + * Multiplier to give some room in the device memory registration limits + * to allow processes added to a running job to bootstrap. + */ +#define EFA_MR_CACHE_LIMIT_MULT (.9) -#define EFA_DEF_NUM_MR_CACHE 36 +#define EFA_MIN_AV_SIZE (16384) + +/* + * Specific flags and attributes for shm provider + */ +#define EFA_SHM_MAX_AV_COUNT (256) extern int efa_mr_cache_enable; extern size_t efa_mr_max_cached_count; extern size_t efa_mr_max_cached_size; -extern int efa_mr_cache_merge_regions; extern struct fi_provider efa_prov; extern struct util_prov efa_util_prov; @@ -106,45 +116,60 @@ struct efa_fabric { struct efa_ep_addr { uint8_t raw[16]; uint16_t qpn; + uint16_t pad; + uint32_t qkey; + struct efa_ep_addr *next; }; #define EFA_EP_ADDR_LEN sizeof(struct efa_ep_addr) +struct efa_ah { + struct ibv_ah *ibv_ah; + uint16_t ahn; +}; + struct efa_conn { - struct efa_ah *ah; + struct efa_ah ah; struct efa_ep_addr ep_addr; }; +/* + * Common fields for the beginning of the efa_domain and rxr_domain structures. + * This structure must be kept in sync with rxr_domain and efa_domain. This + * will be removed when the rxr and efa domain structures are combined. + */ +struct efa_domain_base { + struct util_domain util_domain; + enum efa_domain_type type; +}; + struct efa_domain { struct util_domain util_domain; + enum efa_domain_type type; + struct fid_domain *shm_domain; struct efa_context *ctx; - struct efa_pd *pd; + struct ibv_pd *ibv_pd; struct fi_info *info; struct efa_fabric *fab; - int rdm; - struct ofi_mr_cache cache; + struct ofi_mr_cache *cache; + struct efa_qp **qp_table; + size_t qp_table_sz_m1; }; -struct fi_ops_mr efa_domain_mr_ops; -struct fi_ops_mr efa_domain_mr_cache_ops; +extern struct fi_ops_mr efa_domain_mr_ops; +extern struct fi_ops_mr efa_domain_mr_cache_ops; int efa_mr_cache_entry_reg(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry); void efa_mr_cache_entry_dereg(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry); +int efa_mr_reg_shm(struct fid_domain *domain_fid, struct iovec *iov, + uint64_t access, struct fid_mr **mr_fid); + struct efa_wc { - uint64_t wr_id; - /* Completion flags */ - uint64_t flags; - /* Immediate data in network byte order */ - uint64_t imm_data; - /* Size of received data */ - uint32_t byte_len; - uint32_t comp_status; - struct efa_qp *qp; + struct ibv_wc ibv_wc; /* Source address */ uint16_t efa_ah; - uint16_t src_qp; }; struct efa_wce { @@ -154,124 +179,65 @@ struct efa_wce { typedef void (*efa_cq_read_entry)(struct efa_wc *wc, int index, void *buf); -struct efa_sub_cq { - uint16_t consumed_cnt; - int phase; - uint8_t *buf; - int qmask; - int cqe_size; - uint32_t ref_cnt; -}; - struct efa_cq { - struct fid_cq cq_fid; + struct util_cq util_cq; struct efa_domain *domain; size_t entry_size; efa_cq_read_entry read_entry; struct slist wcq; - fastlock_t outer_lock; + fastlock_t lock; struct ofi_bufpool *wce_pool; - struct ibv_cq ibv_cq; - uint8_t *buf; - size_t buf_size; - fastlock_t inner_lock; - uint32_t cqn; - int cqe_size; - struct efa_sub_cq *sub_cq_arr; - uint16_t num_sub_cqs; - /* Index of next sub cq idx to poll. 
This is used to guarantee fairness for sub cqs */ - uint16_t next_poll_idx; -}; - -struct efa_device { - struct verbs_device verbs_dev; - int page_size; - int abi_version; + struct ibv_cq *ibv_cq; }; struct efa_context { - struct ibv_context ibv_ctx; - int efa_everbs_cmd_fd; - struct efa_qp **qp_table; - pthread_mutex_t qp_table_mutex; - - int cqe_size; - uint16_t sub_cqs_per_cq; - uint16_t inject_size; - uint32_t cmds_supp_udata; - uint32_t max_llq_size; + struct ibv_context *ibv_ctx; + int dev_idx; uint64_t max_mr_size; + uint16_t inline_buf_size; + uint16_t max_wr_rdma_sge; + uint32_t max_rdma_size; + uint32_t device_caps; }; struct efa_pd { - struct ibv_pd ibv_pd; - struct efa_context *context; - uint16_t pdn; -}; - -struct efa_wq { - uint64_t *wrid; - /* wrid_idx_pool: Pool of free indexes in the wrid array, used to select the - * wrid entry to be used to hold the next tx packet's context. - * At init time, entry N will hold value N, as OOO tx-completions arrive, - * the value stored in a given entry might not equal the entry's index. - */ - uint32_t *wrid_idx_pool; - uint32_t wqe_cnt; - uint32_t wqe_posted; - uint32_t wqe_completed; - uint16_t desc_idx; - uint16_t desc_mask; - /* wrid_idx_pool_next: Index of the next entry to use in wrid_idx_pool. */ - uint16_t wrid_idx_pool_next; - int max_sge; - int phase; -}; - -struct efa_sq { - struct efa_wq wq; - uint32_t *db; - uint8_t *desc; - uint32_t desc_offset; - size_t desc_ring_mmap_size; - size_t max_inline_data; - size_t immediate_data_width; - uint16_t sub_cq_idx; -}; - -struct efa_rq { - struct efa_wq wq; - uint32_t *db; - uint8_t *buf; - size_t buf_size; - uint16_t sub_cq_idx; + struct ibv_pd *ibv_pd; + int use_cnt; }; struct efa_qp { - struct ibv_qp ibv_qp; + struct ibv_qp *ibv_qp; + struct ibv_qp_ex *ibv_qp_ex; struct efa_ep *ep; - struct efa_sq sq; - struct efa_rq rq; uint32_t qp_num; - int page_size; + uint32_t qkey; }; -struct efa_ah { - struct ibv_ah ibv_ah; - uint16_t efa_address_handle; +/* + * Descriptor returned for FI_HMEM peer memory registrations + */ +struct efa_mr_peer { + enum fi_hmem_iface iface; + union { + uint64_t reserved; + int cuda; + } device; }; -struct efa_mem_desc { +struct efa_mr { struct fid_mr mr_fid; - struct ibv_mr *mr; + struct ibv_mr *ibv_mr; struct efa_domain *domain; /* Used only in MR cache */ struct ofi_mr_entry *entry; + /* Used only in rdm */ + struct fid_mr *shm_mr; + struct efa_mr_peer peer; }; struct efa_ep { - struct fid_ep ep_fid; + struct util_ep util_ep; struct efa_domain *domain; struct efa_qp *qp; struct efa_cq *rcq; @@ -279,6 +245,23 @@ struct efa_ep { struct efa_av *av; struct fi_info *info; void *src_addr; + struct ibv_send_wr xmit_more_wr_head; + struct ibv_send_wr *xmit_more_wr_tail; + struct ibv_recv_wr recv_more_wr_head; + struct ibv_recv_wr *recv_more_wr_tail; + struct ofi_bufpool *send_wr_pool; + struct ofi_bufpool *recv_wr_pool; + struct ibv_ah *self_ah; +}; + +struct efa_send_wr { + struct ibv_send_wr wr; + struct ibv_sge sge[]; +}; + +struct efa_recv_wr { + struct ibv_recv_wr wr; + struct ibv_sge sge[]; }; typedef struct efa_conn * @@ -286,22 +269,32 @@ typedef struct efa_conn * (struct efa_av *av, fi_addr_t addr); struct efa_av { - struct fid_av av_fid; - struct efa_domain *domain; - struct efa_ep *ep; - size_t count; + struct fid_av *shm_rdm_av; + fi_addr_t shm_rdm_addr_map[EFA_SHM_MAX_AV_COUNT]; + struct efa_domain *domain; + struct efa_ep *ep; size_t used; size_t next; - uint64_t flags; + size_t shm_used; enum fi_av_type type; efa_addr_to_conn_func addr_to_conn; 
struct efa_reverse_av *reverse_av; + struct util_av util_av; + size_t count; + enum fi_ep_type ep_type; /* Used only for FI_AV_TABLE */ - struct efa_conn **conn_table; + struct efa_conn **conn_table; +}; + +struct efa_av_entry { + uint8_t ep_addr[EFA_EP_ADDR_LEN]; + fi_addr_t rdm_addr; + fi_addr_t shm_rdm_addr; + bool local_mapping; }; struct efa_ah_qpn { - uint16_t efa_ah; + uint16_t ahn; uint16_t qpn; }; @@ -325,54 +318,12 @@ struct efa_device_attr { uint16_t max_rq_sge; }; -static inline struct efa_device *to_efa_dev(struct ibv_device *ibdev) -{ - return container_of(ibdev, struct efa_device, verbs_dev); -} - -static inline struct efa_context *to_efa_ctx(struct ibv_context *ibctx) -{ - return container_of(ibctx, struct efa_context, ibv_ctx); -} - -static inline struct efa_pd *to_efa_pd(struct ibv_pd *ibpd) -{ - return container_of(ibpd, struct efa_pd, ibv_pd); -} - -static inline struct efa_cq *to_efa_cq(struct ibv_cq *ibcq) -{ - return container_of(ibcq, struct efa_cq, ibv_cq); -} - -static inline struct efa_qp *to_efa_qp(struct ibv_qp *ibqp) -{ - return container_of(ibqp, struct efa_qp, ibv_qp); -} - -static inline struct efa_ah *to_efa_ah(struct ibv_ah *ibah) -{ - return container_of(ibah, struct efa_ah, ibv_ah); -} - -static inline unsigned long align(unsigned long val, unsigned long align) -{ - return (val + align - 1) & ~(align - 1); -} -static inline uint32_t align_up_queue_size(uint32_t req) +static inline struct efa_av *rxr_ep_av(struct rxr_ep *ep) { - req--; - req |= req >> 1; - req |= req >> 2; - req |= req >> 4; - req |= req >> 8; - req |= req >> 16; - req++; - return req; + return container_of(ep->util_ep.av, struct efa_av, util_av); } -#define is_power_of_2(x) (!(x == 0) && !(x & (x - 1))) #define align_down_to_power_of_2(x) \ ({ \ __typeof__(x) n = (x); \ @@ -384,8 +335,22 @@ static inline uint32_t align_up_queue_size(uint32_t req) extern const struct efa_ep_domain efa_rdm_domain; extern const struct efa_ep_domain efa_dgrm_domain; -struct fi_ops_cm efa_ep_cm_ops; -struct fi_ops_msg efa_ep_msg_ops; +extern struct fi_ops_cm efa_ep_cm_ops; +extern struct fi_ops_msg efa_ep_msg_ops; +extern struct fi_ops_rma efa_ep_rma_ops; + +ssize_t efa_rma_post_read(struct efa_ep *ep, const struct fi_msg_rma *msg, + uint64_t flags, bool self_comm); + +extern fastlock_t pd_list_lock; +// This list has the same indices as ctx_list. +extern struct efa_pd *pd_list; + +int efa_device_init(void); +void efa_device_free(void); + +struct efa_context **efa_device_get_context_list(int *num_ctx); +void efa_device_free_context_list(struct efa_context **list); const struct fi_info *efa_get_efa_info(const char *domain_name); int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info, @@ -397,13 +362,153 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, int efa_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, struct fid_cq **cq_fid, void *context); +/* AV sub-functions */ +int efa_av_insert_addr(struct efa_av *av, struct efa_ep_addr *addr, + fi_addr_t *fi_addr, uint64_t flags, void *context); + /* Caller must hold cq->inner_lock. */ void efa_cq_inc_ref_cnt(struct efa_cq *cq, uint8_t sub_cq_idx); /* Caller must hold cq->inner_lock.
*/ void efa_cq_dec_ref_cnt(struct efa_cq *cq, uint8_t sub_cq_idx); -fi_addr_t efa_ah_qpn_to_addr(struct efa_ep *ep, uint16_t ah, uint16_t qpn); +fi_addr_t efa_ahn_qpn_to_addr(struct efa_av *av, uint16_t ahn, uint16_t qpn); struct fi_provider *init_lower_efa_prov(); +ssize_t efa_post_flush(struct efa_ep *ep, struct ibv_send_wr **bad_wr); + +ssize_t efa_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count, fi_addr_t *src_addr); + +ssize_t efa_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry, uint64_t flags); + +bool efa_device_support_rdma_read(void); + +static inline +bool efa_ep_support_rdma_read(struct fid_ep *ep_fid) +{ + struct efa_ep *efa_ep; + + efa_ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid); + return efa_ep->domain->ctx->device_caps & EFADV_DEVICE_ATTR_CAPS_RDMA_READ; +} + +static inline +bool efa_ep_support_rnr_retry_modify(struct fid_ep *ep_fid) +{ +#ifdef HAVE_CAPS_RNR_RETRY + struct efa_ep *efa_ep; + + efa_ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid); + return efa_ep->domain->ctx->device_caps & EFADV_DEVICE_ATTR_CAPS_RNR_RETRY; +#else + return false; +#endif +} + +static inline +bool efa_peer_support_rdma_read(struct rxr_peer *peer) +{ + /* RDMA READ is an extra feature defined in version 4 (the base version). + * Because it is an extra feature, an EP will assume the peer does not support + * it before a handshake packet was received. + */ + return (peer->flags & RXR_PEER_HANDSHAKE_RECEIVED) && + (peer->features[0] & RXR_REQ_FEATURE_RDMA_READ); +} + +static inline +bool rxr_peer_support_delivery_complete(struct rxr_peer *peer) +{ + /* FI_DELIVERY_COMPLETE is an extra feature defined + * in version 4 (the base version). + * Because it is an extra feature, + * an EP will assume the peer does not support + * it before a handshake packet was received. 
+ */ + return (peer->flags & RXR_PEER_HANDSHAKE_RECEIVED) && + (peer->features[0] & RXR_REQ_FEATURE_DELIVERY_COMPLETE); +} + +static inline +bool efa_both_support_rdma_read(struct rxr_ep *ep, struct rxr_peer *peer) +{ + if (!rxr_env.use_device_rdma) + return 0; + + return efa_ep_support_rdma_read(ep->rdm_ep) && + (peer->is_self || efa_peer_support_rdma_read(peer)); +} + +static inline +size_t efa_max_rdma_size(struct fid_ep *ep_fid) +{ + struct efa_ep *efa_ep; + + efa_ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid); + return efa_ep->domain->ctx->max_rdma_size; +} + +static inline +struct rxr_peer *efa_ep_get_peer(struct dlist_entry *ep_list_entry, + fi_addr_t addr) +{ + struct util_ep *util_ep; + struct rxr_ep *rxr_ep; + + util_ep = container_of(ep_list_entry, struct util_ep, + av_entry); + rxr_ep = container_of(util_ep, struct rxr_ep, util_ep); + return rxr_ep_get_peer(rxr_ep, addr); +} + +static inline +int efa_peer_in_use(struct rxr_peer *peer) +{ + struct rxr_pkt_entry *pending_pkt; + + if ((peer->tx_pending) || (peer->flags & RXR_PEER_IN_BACKOFF)) + return -FI_EBUSY; + if (peer->rx_init) { + pending_pkt = *ofi_recvwin_peek(peer->robuf); + if (pending_pkt && pending_pkt->pkt) + return -FI_EBUSY; + } + return 0; +} +static inline +void efa_free_robuf(struct rxr_peer *peer) +{ + ofi_recvwin_free(peer->robuf); + ofi_buf_free(peer->robuf); +} + +static inline +void efa_peer_reset(struct rxr_peer *peer) +{ + efa_free_robuf(peer); +#ifdef ENABLE_EFA_POISONING + rxr_poison_mem_region((uint32_t *)peer, sizeof(struct rxr_peer)); +#endif + memset(peer, 0, sizeof(struct rxr_peer)); + dlist_init(&peer->rnr_entry); +} + +static inline bool efa_ep_is_cuda_mr(struct efa_mr *efa_mr) +{ + return efa_mr ? (efa_mr->peer.iface == FI_HMEM_CUDA): false; +} + +/* + * efa_is_cache_available() is a check to see whether a memory registration + * cache is available to be used by this domain. + * + * Return value: + * return true if a memory registration cache exists in this domain. + * return false if a memory registration cache does not exist in this domain. + */ +static inline bool efa_is_cache_available(struct efa_domain *efa_domain) +{ + return efa_domain->cache; +} + #endif /* EFA_H */ diff --git a/prov/efa/src/efa_av.c b/prov/efa/src/efa_av.c index 7d2409d3836..74192f19e13 100644 --- a/prov/efa/src/efa_av.c +++ b/prov/efa/src/efa_av.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2016, Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. - * Copyright (c) 2017-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -35,9 +35,44 @@ #include #include +#include + #include #include "efa.h" -#include "efa_verbs.h" +#include "rxr.h" + +/* + * Local/remote peer detection by comparing peer GID with stored local GIDs + */ +static bool efa_is_local_peer(struct efa_av *av, const void *addr) +{ + struct efa_ep_addr *cur_efa_addr = local_efa_addr; + +#if ENABLE_DEBUG + char peer_gid[INET6_ADDRSTRLEN] = { 0 }; + + if (!inet_ntop(AF_INET6, ((struct efa_ep_addr *)addr)->raw, peer_gid, INET6_ADDRSTRLEN)) { + EFA_WARN(FI_LOG_AV, "Failed to get current EFA's GID, errno: %d\n", errno); + return 0; + } + EFA_INFO(FI_LOG_AV, "The peer's GID is %s.\n", peer_gid); +#endif + while (cur_efa_addr) { + if (!memcmp(((struct efa_ep_addr *)addr)->raw, cur_efa_addr->raw, 16)) { + EFA_INFO(FI_LOG_AV, "The peer is local.\n"); + return 1; + } + cur_efa_addr = cur_efa_addr->next; + } + + return 0; +} + +static bool efa_is_same_addr(struct efa_ep_addr *lhs, struct efa_ep_addr *rhs) +{ + return !memcmp(lhs->raw, rhs->raw, sizeof(lhs->raw)) && + lhs->qpn == rhs->qpn && lhs->qkey == rhs->qkey; +} static inline struct efa_conn *efa_av_tbl_idx_to_conn(struct efa_av *av, fi_addr_t addr) { @@ -53,12 +88,11 @@ static inline struct efa_conn *efa_av_map_addr_to_conn(struct efa_av *av, fi_add return (struct efa_conn *)(void *)addr; } -fi_addr_t efa_ah_qpn_to_addr(struct efa_ep *ep, uint16_t ah, uint16_t qpn) +fi_addr_t efa_ahn_qpn_to_addr(struct efa_av *av, uint16_t ahn, uint16_t qpn) { struct efa_reverse_av *reverse_av; - struct efa_av *av = ep->av; struct efa_ah_qpn key = { - .efa_ah = ah, + .ahn = ahn, .qpn = qpn, }; @@ -90,6 +124,24 @@ static size_t efa_av_tbl_find_first_empty(struct efa_av *av, size_t hint) return -1; } +static int efa_peer_resize(struct rxr_ep *ep, size_t current_count, + size_t new_count) +{ + void *p = realloc(&ep->peer[0], (new_count * sizeof(struct rxr_peer))); + + if (p) + ep->peer = p; + else + return -FI_ENOMEM; +#ifdef ENABLE_EFA_POISONING + rxr_poison_mem_region((uint32_t *)&ep->peer[current_count], (new_count - + current_count) * sizeof(struct rxr_peer)); +#endif + memset(&ep->peer[current_count], 0, + (new_count - current_count) * sizeof(struct rxr_peer)); + return 0; +} + static int efa_av_resize(struct efa_av *av, size_t new_av_count) { if (av->type == FI_AV_TABLE) { @@ -102,6 +154,12 @@ static int efa_av_resize(struct efa_av *av, size_t new_av_count) else return -FI_ENOMEM; +#ifdef ENABLE_EFA_POISONING + rxr_poison_mem_region((uint32_t *)av->conn_table + av->count, + (new_av_count - av->count) * + sizeof(*av->conn_table)); +#endif + memset(av->conn_table + av->count, 0, (new_av_count - av->count) * sizeof(*av->conn_table)); } @@ -112,21 +170,31 @@ static int efa_av_resize(struct efa_av *av, size_t new_av_count) } /* Inserts a single AH to AV. 
*/ -static int efa_av_insert_ah(struct efa_av *av, struct efa_ep_addr *addr, fi_addr_t *fi_addr) +static int efa_av_insert_ah(struct efa_av *av, struct efa_ep_addr *addr, + fi_addr_t *fi_addr, uint64_t flags, void *context) { - struct efa_pd *pd = container_of(av->domain->pd, struct efa_pd, ibv_pd); - struct ibv_ah_attr ah_attr; + struct ibv_pd *ibv_pd = av->domain->ibv_pd; + struct ibv_ah_attr ah_attr = { 0 }; + char str[INET6_ADDRSTRLEN] = { 0 }; + struct efadv_ah_attr attr = { 0 }; struct efa_reverse_av *reverse_av; struct efa_ah_qpn key; struct efa_conn *conn; int err; + if (av->util_av.flags & FI_EVENT) + return -FI_ENOEQ; + if ((flags & FI_SYNC_ERR) && (!context || (flags & FI_EVENT))) + return -FI_EINVAL; + else if (flags & FI_SYNC_ERR) + memset(context, 0, sizeof(int)); + memset(&ah_attr, 0, sizeof(struct ibv_ah_attr)); inet_ntop(AF_INET6, addr->raw, str, INET6_ADDRSTRLEN); - EFA_INFO(FI_LOG_AV, "Insert address: GID[%s] QP[%u]\n", str, addr->qpn); + EFA_INFO(FI_LOG_AV, "Insert address: GID[%s] QP[%u] QKEY[%u]\n", str, addr->qpn, addr->qkey); if (!efa_av_is_valid_address(addr)) { - EFA_INFO(FI_LOG_AV, "Failed to insert bad addr"); + EFA_WARN(FI_LOG_AV, "Failed to insert bad addr"); err = -FI_EADDRNOTAVAIL; goto err_invalid; } @@ -138,9 +206,10 @@ static int efa_av_insert_ah(struct efa_av *av, struct efa_ep_addr *addr, fi_addr } ah_attr.port_num = 1; + ah_attr.is_global = 1; memcpy(ah_attr.grh.dgid.raw, addr->raw, sizeof(addr->raw)); - conn->ah = efa_cmd_create_ah(pd, &ah_attr); - if (!conn->ah) { + conn->ah.ibv_ah = ibv_create_ah(ibv_pd, &ah_attr); + if (!conn->ah.ibv_ah) { err = -FI_EINVAL; goto err_free_conn; } @@ -152,11 +221,13 @@ static int efa_av_insert_ah(struct efa_av *av, struct efa_ep_addr *addr, fi_addr break; case FI_AV_TABLE: - av->next = efa_av_tbl_find_first_empty(av, av->next); - assert(av->next != -1); - *fi_addr = av->next; + if (av->ep_type == FI_EP_DGRAM) { + av->next = efa_av_tbl_find_first_empty(av, av->next); + assert(av->next != -1); + *fi_addr = av->next; + } - av->conn_table[av->next] = conn; + av->conn_table[*fi_addr] = conn; av->next++; break; default: @@ -164,7 +235,12 @@ static int efa_av_insert_ah(struct efa_av *av, struct efa_ep_addr *addr, fi_addr break; } - key.efa_ah = conn->ah->efa_address_handle; + err = -efadv_query_ah(conn->ah.ibv_ah, &attr, sizeof(attr)); + if (err) + goto err_destroy_ah; + + conn->ah.ahn = attr.ahn; + key.ahn = conn->ah.ahn; key.qpn = addr->qpn; /* This is correct since the same address should be mapped to the same ah. */ HASH_FIND(hh, av->reverse_av, &key, sizeof(key), reverse_av); @@ -188,111 +264,221 @@ static int efa_av_insert_ah(struct efa_av *av, struct efa_ep_addr *addr, fi_addr return FI_SUCCESS; err_destroy_ah: - efa_cmd_destroy_ah(conn->ah); + ibv_destroy_ah(conn->ah.ibv_ah); err_free_conn: - free(conn); + ofi_freealign(conn); err_invalid: *fi_addr = FI_ADDR_NOTAVAIL; return err; } -static int efa_av_insert(struct fid_av *av_fid, const void *addr, - size_t count, fi_addr_t *fi_addr, - uint64_t flags, void *context) +/* + * Insert address translation in core av & in hash. + * + * If shm transfer is enabled and the addr comes from local peer, + * 1. convert addr to format 'gid_qpn', which will be set as shm's ep name later. + * 2. insert gid_qpn into shm's av + * 3. 
store returned fi_addr from shm into the hash table + */ +int efa_av_insert_addr(struct efa_av *av, struct efa_ep_addr *addr, + fi_addr_t *fi_addr, uint64_t flags, + void *context) { - struct efa_av *av = container_of(av_fid, struct efa_av, av_fid); - struct efa_ep_addr *addr_i; - int *fi_errors = context; - fi_addr_t fi_addr_res = FI_ADDR_UNSPEC; - int failed; - size_t i; - int err; - - if (av->flags & FI_EVENT) - return -FI_ENOEQ; - - if ((flags & FI_SYNC_ERR) && (!context || (flags & FI_EVENT))) - return -FI_EINVAL; - else if (flags & FI_SYNC_ERR) - memset(context, 0, sizeof(int) * count); + struct efa_av_entry *av_entry; + struct util_av_entry *util_av_entry; + int ret = 0; + struct rxr_peer *peer; + struct rxr_ep *rxr_ep; + struct util_ep *util_ep; + struct dlist_entry *ep_list_entry; + fi_addr_t shm_fiaddr; + char smr_name[NAME_MAX]; + + fastlock_acquire(&av->util_av.lock); + ret = ofi_av_insert_addr(&av->util_av, addr, fi_addr); + + if (ret) { + EFA_WARN(FI_LOG_AV, "Error in inserting address: %s\n", + fi_strerror(ret)); + goto out; + } + util_av_entry = ofi_bufpool_get_ibuf(av->util_av.av_entry_pool, + *fi_addr); + /* + * If the entry already exists then calling ofi_av_insert_addr would + * increase the use_cnt by 1. For a new entry use_cnt will be 1, whereas + * for a duplicate entry, use_cnt will be more that 1. + */ + if (ofi_atomic_get32(&util_av_entry->use_cnt) > 1) + goto find_out; + + av_entry = (struct efa_av_entry *)util_av_entry->data; + av_entry->rdm_addr = *fi_addr; + av_entry->local_mapping = 0; + + if (av->used + 1 > av->count) { + ret = efa_av_resize(av, av->count * 2); + if (ret) + goto out; + dlist_foreach(&av->util_av.ep_list, ep_list_entry) { + util_ep = container_of(ep_list_entry, struct util_ep, + av_entry); + rxr_ep = container_of(util_ep, struct rxr_ep, util_ep); + ret = efa_peer_resize(rxr_ep, av->used, + av->count); + if (ret) + goto out; + } + } - if (av->used + count > av->count) { - err = efa_av_resize(av, av->used + count); - if (err) - return err; + /* + * Walk through all the EPs that bound to the AV, + * update is_self flag corresponding peer structure + */ + dlist_foreach(&av->util_av.ep_list, ep_list_entry) { + util_ep = container_of(ep_list_entry, struct util_ep, av_entry); + rxr_ep = container_of(util_ep, struct rxr_ep, util_ep); + peer = rxr_ep_get_peer(rxr_ep, *fi_addr); + peer->is_self = efa_is_same_addr((struct efa_ep_addr *)rxr_ep->core_addr, + addr); } - failed = 0; - for (i = 0; i < count; i++) { - addr_i = (struct efa_ep_addr *)((uint8_t *)addr + i * EFA_EP_ADDR_LEN); - err = efa_av_insert_ah(av, addr_i, &fi_addr_res); - if (err) - failed++; - if (flags & FI_SYNC_ERR) - fi_errors[i] = err; - if (fi_addr) - fi_addr[i] = fi_addr_res; + /* If peer is local, insert the address into shm provider's av */ + if (rxr_env.enable_shm_transfer && efa_is_local_peer(av, addr)) { + if (av->shm_used >= rxr_env.shm_av_size) { + ret = -FI_ENOMEM; + EFA_WARN(FI_LOG_AV, + "Max number of shm AV entry %d has been reached.\n", + rxr_env.shm_av_size); + goto err_free_av_entry; + } + ret = rxr_ep_efa_addr_to_str(addr, smr_name); + if (ret != FI_SUCCESS) + goto err_free_av_entry; + + ret = fi_av_insert(av->shm_rdm_av, smr_name, 1, &shm_fiaddr, + flags, context); + if (OFI_UNLIKELY(ret != 1)) { + EFA_WARN(FI_LOG_AV, + "Failed to insert address to shm provider's av: %s\n", + fi_strerror(-ret)); + goto err_free_av_entry; + } else { + ret = 0; + } + EFA_INFO(FI_LOG_AV, + "Insert %s to shm provider's av. 
addr = %" PRIu64 + " rdm_fiaddr = %" PRIu64 " shm_rdm_fiaddr = %" PRIu64 + "\n", smr_name, *(uint64_t *)addr, *fi_addr, shm_fiaddr); + + assert(shm_fiaddr < rxr_env.shm_av_size); + av->shm_used++; + av_entry->local_mapping = 1; + av_entry->shm_rdm_addr = shm_fiaddr; + av->shm_rdm_addr_map[shm_fiaddr] = av_entry->rdm_addr; + + /* + * Walk through all the EPs that bound to the AV, + * update is_local flag and shm fi_addr_t in corresponding peer structure + */ + dlist_foreach(&av->util_av.ep_list, ep_list_entry) { + util_ep = container_of(ep_list_entry, struct util_ep, av_entry); + rxr_ep = container_of(util_ep, struct rxr_ep, util_ep); + if (rxr_ep->use_shm) { + peer = rxr_ep_get_peer(rxr_ep, *fi_addr); + peer->shm_fiaddr = shm_fiaddr; + peer->is_local = 1; + } + } + } + ret = efa_av_insert_ah(av, addr, fi_addr, + flags, context); + if (ret) { + EFA_WARN(FI_LOG_AV, "Error in inserting address: %s\n", + fi_strerror(ret)); + goto err_free_av_entry; } - return count - failed; +find_out: + EFA_INFO(FI_LOG_AV, + "addr = %" PRIu64 " rdm_fiaddr = %" PRIu64 "\n", + *(uint64_t *)addr, *fi_addr); + goto out; +err_free_av_entry: + ofi_ibuf_free(util_av_entry); +out: + fastlock_release(&av->util_av.lock); + return ret; } -static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, - size_t count, uint64_t flags) +int efa_av_insert(struct fid_av *av_fid, const void *addr, + size_t count, fi_addr_t *fi_addr, + uint64_t flags, void *context) { - struct efa_av *av = container_of(av_fid, struct efa_av, av_fid); - struct efa_conn *conn = NULL; - char str[INET6_ADDRSTRLEN]; - int ret = 0; - int i; - - if (!fi_addr || (av->type != FI_AV_MAP && av->type != FI_AV_TABLE)) - return -FI_EINVAL; - - for (i = 0; i < count; i++) { - struct efa_reverse_av *reverse_av; - struct efa_ah_qpn key; - - if (fi_addr[i] == FI_ADDR_NOTAVAIL) - continue; - - if (av->type == FI_AV_MAP) { - conn = (struct efa_conn *)fi_addr[i]; - } else { /* (av->type == FI_AV_TABLE) */ - conn = av->conn_table[fi_addr[i]]; - av->conn_table[fi_addr[i]] = NULL; - av->next = MIN(av->next, fi_addr[i]); + struct efa_av *av = container_of(av_fid, struct efa_av, util_av.av_fid); + int ret = 0, success_cnt = 0; + size_t i = 0; + struct efa_ep_addr *addr_i; + fi_addr_t fi_addr_res; + + /* + * Providers are allowed to ignore FI_MORE. 
+ */ + + flags &= ~FI_MORE; + if (flags) + return -FI_ENOSYS; + + if (av->ep_type == FI_EP_RDM) { + for (i = 0; i < count; i++) { + addr_i = (struct efa_ep_addr *) ((uint8_t *)addr + i * EFA_EP_ADDR_LEN); + ret = efa_av_insert_addr(av, addr_i, &fi_addr_res, + flags, context); + if (ret) + break; + if (fi_addr) + fi_addr[i] = fi_addr_res; + success_cnt++; } - if (!conn) - continue; - - key.efa_ah = conn->ah->efa_address_handle; - key.qpn = conn->ep_addr.qpn; - HASH_FIND(hh, av->reverse_av, &key, sizeof(key), reverse_av); - if (OFI_LIKELY(!!reverse_av)) { - HASH_DEL(av->reverse_av, reverse_av); - free(reverse_av); + } else { + if (av->used + count > av->count) { + ret = efa_av_resize(av, av->used + count); + if (ret) + goto out; } + for (i = 0; i < count; i++) { + addr_i = (struct efa_ep_addr *) ((uint8_t *)addr + i * EFA_EP_ADDR_LEN); + ret = efa_av_insert_ah(av, addr_i, &fi_addr_res, + flags, context); + if (ret) + break; + if (fi_addr) + fi_addr[i] = fi_addr_res; + success_cnt++; + } + } +out: + /* cancel remaining request and log to event queue */ + for (; i < count ; i++) { + if (av->util_av.eq) + ofi_av_write_event(&av->util_av, i, FI_ECANCELED, + context); + if (fi_addr) + fi_addr[i] = FI_ADDR_NOTAVAIL; + } - ret = efa_cmd_destroy_ah(conn->ah); - if (ret) - return ret; - - memset(str, 0, sizeof(str)); - inet_ntop(AF_INET6, conn->ep_addr.raw, str, INET6_ADDRSTRLEN); - EFA_INFO(FI_LOG_AV, "av_remove conn[%p] with GID[%s] QP[%u]\n", conn, - str, conn->ep_addr.qpn); + /* update success to event queue */ + if (av->util_av.eq) + ofi_av_write_event(&av->util_av, success_cnt, 0, context); - free(conn); - av->used--; - } - return ret; + return success_cnt; } static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr, + void *addr, size_t *addrlen) { - struct efa_av *av = container_of(av_fid, struct efa_av, av_fid); + struct efa_av *av = container_of(av_fid, struct efa_av, util_av.av_fid); struct efa_conn *conn = NULL; if (av->type != FI_AV_MAP && av->type != FI_AV_TABLE) @@ -305,18 +491,154 @@ static int efa_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr, conn = (struct efa_conn *)fi_addr; } else { /* (av->type == FI_AV_TABLE) */ if (fi_addr >= av->count) - return -EINVAL; + return -FI_EINVAL; conn = av->conn_table[fi_addr]; } if (!conn) - return -EINVAL; + return -FI_EINVAL; memcpy(addr, (void *)&conn->ep_addr, MIN(sizeof(conn->ep_addr), *addrlen)); *addrlen = sizeof(conn->ep_addr); return 0; } +static int efa_av_remove_ah(struct fid_av *av_fid, fi_addr_t *fi_addr, + size_t count, uint64_t flags) +{ + struct efa_av *av = container_of(av_fid, struct efa_av, util_av.av_fid); + struct efa_conn *conn = NULL; + struct efa_reverse_av *reverse_av; + struct efa_ah_qpn key; + char str[INET6_ADDRSTRLEN]; + int ret = 0; + + if (!fi_addr || (av->type != FI_AV_MAP && av->type != FI_AV_TABLE)) + return -FI_EINVAL; + + if (*fi_addr == FI_ADDR_NOTAVAIL) + return ret; + + if (av->type == FI_AV_MAP) { + conn = (struct efa_conn *)fi_addr; + } else { /* (av->type == FI_AV_TABLE) */ + conn = av->conn_table[*fi_addr]; + av->conn_table[*fi_addr] = NULL; + av->next = MIN(av->next, *fi_addr); + } + if (!conn) + return ret; + + key.ahn = conn->ah.ahn; + key.qpn = conn->ep_addr.qpn; + HASH_FIND(hh, av->reverse_av, &key, sizeof(key), reverse_av); + if (OFI_LIKELY(!!reverse_av)) { + HASH_DEL(av->reverse_av, reverse_av); + free(reverse_av); + } + + ret = -ibv_destroy_ah(conn->ah.ibv_ah); + if (ret) + goto err_free_conn; + + memset(str, 0, sizeof(str)); + inet_ntop(AF_INET6, conn->ep_addr.raw, str, 
INET6_ADDRSTRLEN); + EFA_INFO(FI_LOG_AV, "av_remove conn[%p] with GID[%s] QP[%u]\n", conn, + str, conn->ep_addr.qpn); + av->used--; + +err_free_conn: + ofi_freealign(conn); + return ret; +} + +static int efa_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, + size_t count, uint64_t flags) +{ + int ret = 0; + size_t i; + struct efa_av *av; + struct util_av_entry *util_av_entry; + struct efa_av_entry *av_entry; + struct rxr_peer *peer; + struct dlist_entry *ep_list_entry; + + av = container_of(av_fid, struct efa_av, util_av.av_fid); + if (av->ep_type == FI_EP_DGRAM) { + for (i = 0; i < count; i++) { + ret = efa_av_remove_ah(&av->util_av.av_fid, &fi_addr[i], + 1, flags); + if (ret) + goto out; + } + goto out; + } + fastlock_acquire(&av->util_av.lock); + for (i = 0; i < count; i++) { + if (fi_addr[i] == FI_ADDR_NOTAVAIL || + fi_addr[i] > av->count) { + ret = -FI_ENOENT; + goto release_lock; + } + util_av_entry = ofi_bufpool_get_ibuf( + av->util_av.av_entry_pool, + fi_addr[i]); + if (!util_av_entry) { + ret = -FI_ENOENT; + goto release_lock; + } + /* + * If use_cnt is greater than 1, then just decrement + * the count by 1, without removing the entry. + */ + if (ofi_atomic_get32(&util_av_entry->use_cnt) > 1) { + ret = ofi_av_remove_addr(&av->util_av, fi_addr[i]); + goto release_lock; + } + av_entry = (struct efa_av_entry *)util_av_entry->data; + + /* Check if the peer is in use if it is then return */ + dlist_foreach(&av->util_av.ep_list, ep_list_entry) { + peer = efa_ep_get_peer(ep_list_entry, fi_addr[i]); + ret = efa_peer_in_use(peer); + if (ret) + goto release_lock; + } + + /* Only if the peer is not in use reset the peer */ + dlist_foreach(&av->util_av.ep_list, ep_list_entry) { + peer = efa_ep_get_peer(ep_list_entry, fi_addr[i]); + if (peer->rx_init) + efa_peer_reset(peer); + } + ret = efa_av_remove_ah(&av->util_av.av_fid, &fi_addr[i], 1, + flags); + if (ret) + goto release_lock; + /* remove an address from shm provider's av */ + if (rxr_env.enable_shm_transfer && av_entry->local_mapping) { + ret = fi_av_remove(av->shm_rdm_av, &av_entry->shm_rdm_addr, 1, flags); + if (ret) + goto err_free_av_entry; + + av->shm_used--; + assert(av_entry->shm_rdm_addr < rxr_env.shm_av_size); + av->shm_rdm_addr_map[av_entry->shm_rdm_addr] = FI_ADDR_UNSPEC; + } + ret = ofi_av_remove_addr(&av->util_av, *fi_addr); + if (ret) + goto err_free_av_entry; + } + fastlock_release(&av->util_av.lock); + goto out; +err_free_av_entry: + ofi_ibuf_free(util_av_entry); +release_lock: + fastlock_release(&av->util_av.lock); +out: + return ret; +} + static const char *efa_av_straddr(struct fid_av *av_fid, const void *addr, char *buf, size_t *len) { @@ -337,25 +659,51 @@ static int efa_av_close(struct fid *fid) { struct efa_av *av; int ret = 0; + int err = 0; int i; - av = container_of(fid, struct efa_av, av_fid.fid); + av = container_of(fid, struct efa_av, util_av.av_fid.fid); for (i = 0; i < av->count; i++) { fi_addr_t addr = i; - ret = efa_av_remove(&av->av_fid, &addr, 1, 0); - if (ret) - return ret; + ret = efa_av_remove_ah(&av->util_av.av_fid, &addr, 1, 0); + if (ret) { + err = ret; + EFA_WARN(FI_LOG_AV, "Failed to remove ah: %s\n", + fi_strerror(ret)); + } } free(av->conn_table); + if (av->ep_type == FI_EP_RDM) { + if (rxr_env.enable_shm_transfer && av->shm_rdm_av && + &av->shm_rdm_av->fid) { + ret = fi_close(&av->shm_rdm_av->fid); + if (ret) { + err = ret; + EFA_WARN(FI_LOG_AV, "Failed to close shm av: %s\n", + fi_strerror(ret)); + } + } + ret = ofi_av_close(&av->util_av); + if (ret) { + err = ret; + EFA_WARN(FI_LOG_AV, 
"Failed to close av: %s\n", + fi_strerror(ret)); + } + } free(av); - return 0; + return err; +} + +static int efa_av_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + return ofi_av_bind(fid, bfid, flags); } static struct fi_ops efa_av_fi_ops = { .size = sizeof(struct fi_ops), .close = efa_av_close, - .bind = fi_no_bind, + .bind = efa_av_bind, .control = fi_no_control, .ops_open = fi_no_ops_open, }; @@ -363,48 +711,113 @@ static struct fi_ops efa_av_fi_ops = { int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **av_fid, void *context) { - struct efa_domain *domain; + struct efa_domain *efa_domain; + struct util_domain *util_domain; + struct rxr_domain *rxr_domain; + struct efa_domain_base *efa_domain_base; struct efa_av *av; - size_t count = 64; - int err; - - domain = container_of(domain_fid, struct efa_domain, - util_domain.domain_fid); + struct util_av_attr util_attr; + size_t universe_size; + struct fi_av_attr av_attr; + int i, ret, retv; if (!attr) return -FI_EINVAL; - if (attr->flags) - return -FI_EBADFLAGS; + if (attr->name) + return -FI_ENOSYS; - switch (attr->type) { - case FI_AV_UNSPEC: - case FI_AV_TABLE: - attr->type = FI_AV_TABLE; - break; - case FI_AV_MAP: - default: - return -EINVAL; - } + /* FI_EVENT, FI_READ, and FI_SYMMETRIC are not supported */ + if (attr->flags) + return -FI_ENOSYS; - if (attr->count) - count = attr->count; + /* + * TODO: remove me once RxR supports resizing members tied to the AV + * size. + */ + if (!attr->count) + attr->count = EFA_MIN_AV_SIZE; + else + attr->count = MAX(attr->count, EFA_MIN_AV_SIZE); av = calloc(1, sizeof(*av)); if (!av) - return -ENOMEM; + return -FI_ENOMEM; + + util_domain = container_of(domain_fid, struct util_domain, + domain_fid); + efa_domain_base = container_of(util_domain, struct efa_domain_base, + util_domain.domain_fid); + attr->type = FI_AV_TABLE; + /* + * An rxr_domain fid was passed to the user if this is an RDM + * endpoint, otherwise it is an efa_domain fid. This will be + * removed once the rxr and efa domain structures are combined. + */ + if (efa_domain_base->type == EFA_DOMAIN_RDM) { + rxr_domain = (struct rxr_domain *)efa_domain_base; + efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain, + util_domain.domain_fid); + av->ep_type = FI_EP_RDM; + + if (fi_param_get_size_t(NULL, "universe_size", + &universe_size) == FI_SUCCESS) + attr->count = MAX(attr->count, universe_size); + + util_attr.addrlen = EFA_EP_ADDR_LEN; + util_attr.context_len = sizeof(struct efa_av_entry) - EFA_EP_ADDR_LEN; + util_attr.flags = 0; + ret = ofi_av_init(&efa_domain->util_domain, attr, &util_attr, + &av->util_av, context); + if (ret) + goto err; + av_attr = *attr; + if (rxr_env.enable_shm_transfer) { + /* + * shm av supports maximum 256 entries + * Reset the count to 128 to reduce memory footprint and satisfy + * the need of the instances with more CPUs. 
+ */ + if (rxr_env.shm_av_size > EFA_SHM_MAX_AV_COUNT) { + ret = -FI_ENOSYS; + EFA_WARN(FI_LOG_AV, "The requested av size is beyond" + " shm supported maximum av size: %s\n", + fi_strerror(-ret)); + goto err_close_util_av; + } + av_attr.count = rxr_env.shm_av_size; + assert(av_attr.type == FI_AV_TABLE); + ret = fi_av_open(efa_domain->shm_domain, &av_attr, + &av->shm_rdm_av, context); + if (ret) + goto err_close_util_av; + + for (i = 0; i < EFA_SHM_MAX_AV_COUNT; ++i) + av->shm_rdm_addr_map[i] = FI_ADDR_UNSPEC; + } + } else { + efa_domain = (struct efa_domain *)efa_domain_base; + av->ep_type = FI_EP_DGRAM; + } + + EFA_INFO(FI_LOG_AV, "fi_av_attr:%" PRId64 "\n", + av_attr.flags); - av->domain = domain; + av->domain = efa_domain; av->type = attr->type; - av->count = count; av->used = 0; av->next = 0; + av->shm_used = 0; + av->count = attr->count; if (av->type == FI_AV_TABLE && av->count > 0) { av->conn_table = calloc(av->count, sizeof(*av->conn_table)); if (!av->conn_table) { - err = -ENOMEM; - goto err_free_av; + ret = -FI_ENOMEM; + if (av->ep_type == FI_EP_DGRAM) + goto err_close_util_av; + else + goto err_close_shm_av; } } @@ -413,16 +826,27 @@ int efa_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, else /* if (av->type == FI_AV_TABLE) */ av->addr_to_conn = efa_av_tbl_idx_to_conn; - av->av_fid.fid.fclass = FI_CLASS_AV; - av->av_fid.fid.context = context; - av->av_fid.fid.ops = &efa_av_fi_ops; + *av_fid = &av->util_av.av_fid; + (*av_fid)->fid.fclass = FI_CLASS_AV; + (*av_fid)->fid.context = context; + (*av_fid)->fid.ops = &efa_av_fi_ops; + (*av_fid)->ops = &efa_av_ops; - av->av_fid.ops = &efa_av_ops; - - *av_fid = &av->av_fid; return 0; -err_free_av: +err_close_shm_av: + if (rxr_env.enable_shm_transfer) { + retv = fi_close(&av->shm_rdm_av->fid); + if (retv) + EFA_WARN(FI_LOG_AV, "Unable to close shm av: %s\n", + fi_strerror(ret)); + } +err_close_util_av: + retv = ofi_av_close(&av->util_av); + if (retv) + EFA_WARN(FI_LOG_AV, + "Unable to close util_av: %s\n", fi_strerror(-retv)); +err: free(av); - return err; + return ret; } diff --git a/prov/efa/src/efa_cm.c b/prov/efa/src/efa_cm.c index 6a443d3dcdd..80559a23f95 100644 --- a/prov/efa/src/efa_cm.c +++ b/prov/efa/src/efa_cm.c @@ -50,15 +50,17 @@ static int efa_ep_getname(fid_t ep_fid, void *addr, size_t *addrlen) struct efa_ep *ep; char str[INET6_ADDRSTRLEN] = {}; - ep = container_of(ep_fid, struct efa_ep, ep_fid); + ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid); ep_addr = (struct efa_ep_addr *)ep->src_addr; ep_addr->qpn = ep->qp->qp_num; + ep_addr->pad = 0; + ep_addr->qkey = ep->qp->qkey; inet_ntop(AF_INET6, ep_addr->raw, str, INET6_ADDRSTRLEN); - EFA_INFO(FI_LOG_EP_CTRL, "EP addr: GID[%s] QP[%d] (length %zu)\n", - str, ep_addr->qpn, *addrlen); + EFA_INFO(FI_LOG_EP_CTRL, "EP addr: GID[%s] QP[%d] QKEY[%d] (length %zu)\n", + str, ep_addr->qpn, ep_addr->qkey, *addrlen); return efa_copy_addr(addr, addrlen, ep_addr); } diff --git a/prov/efa/src/efa_cq.c b/prov/efa/src/efa_cq.c index c6871a2d93d..2b8fe67a9a0 100644 --- a/prov/efa/src/efa_cq.c +++ b/prov/efa/src/efa_cq.c @@ -36,152 +36,50 @@ #include #include "efa.h" -#include "efa_cmd.h" -#include "efa_ib.h" -#include "efa_io_defs.h" -static __u32 efa_cq_sub_cq_get_current_index(struct efa_sub_cq *sub_cq) +static uint64_t efa_cq_wc_to_fi_flags(struct efa_wc *wc) { - return sub_cq->consumed_cnt & sub_cq->qmask; -} - -static int efa_cq_cqe_is_pending(struct efa_io_cdesc_common *cqe_common, int phase) -{ - return (cqe_common->flags & EFA_IO_CDESC_COMMON_PHASE_MASK) == phase; 
-} - -static struct efa_io_cdesc_common *efa_cq_sub_cq_get_cqe(struct efa_sub_cq *sub_cq, int entry) -{ - return (struct efa_io_cdesc_common *)(sub_cq->buf + (entry * sub_cq->cqe_size)); -} - -static void efa_cq_sub_cq_initialize(struct efa_sub_cq *sub_cq, uint8_t *buf, - int sub_cq_size, int cqe_size) -{ - sub_cq->consumed_cnt = 0; - sub_cq->phase = 1; - sub_cq->buf = buf; - sub_cq->qmask = sub_cq_size - 1; - sub_cq->cqe_size = cqe_size; - sub_cq->ref_cnt = 0; -} - -static int efa_cq_create(struct efa_cq *cq, struct efa_context *ctx, unsigned int cq_size) -{ - struct ibv_context *ibctx = &ctx->ibv_ctx; - int err, sub_cq_size, sub_buf_size; - uint64_t q_mmap_key, q_mmap_size; - uint16_t i, num_sub_cqs; - int fd = ibctx->cmd_fd; - uint8_t *buf; - uint32_t cqn; - - pthread_mutex_lock(&ibctx->mutex); - - cq->num_sub_cqs = ctx->sub_cqs_per_cq; - cq->cqe_size = ctx->cqe_size; - - cq_size = align_up_queue_size(cq_size); - err = efa_cmd_create_cq(cq, cq_size, &q_mmap_key, &q_mmap_size, &cqn); - if (err) { - EFA_WARN(FI_LOG_CQ, "efa_cmd_create_cq failed[%u].\n", err); - goto err_unlock; - } - - cq->cqn = cqn; - cq->buf_size = q_mmap_size; - num_sub_cqs = cq->num_sub_cqs; - sub_cq_size = cq->ibv_cq.cqe; - - err = fastlock_init(&cq->inner_lock); - if (err) { - err = -err; - EFA_WARN(FI_LOG_CQ, "cq spin lock init failed[%d]!\n", err); - goto err_destroy_cq; - } - - cq->buf = mmap(NULL, cq->buf_size, PROT_WRITE, MAP_SHARED, fd, q_mmap_key); - if (cq->buf == MAP_FAILED) { - EFA_WARN(FI_LOG_CQ, "cq buffer mmap failed[%d]!\n", errno); - err = -EINVAL; - goto err_destroy_lock; - } - - cq->sub_cq_arr = calloc(num_sub_cqs, sizeof(*cq->sub_cq_arr)); - if (!cq->sub_cq_arr) { - err = -ENOMEM; - EFA_WARN(FI_LOG_CQ, "sub cq allocation failed.\n"); - goto err_unmap_buf; - } - - buf = cq->buf; - sub_buf_size = cq->cqe_size * sub_cq_size; - for (i = 0; i < num_sub_cqs; i++) { - efa_cq_sub_cq_initialize(&cq->sub_cq_arr[i], buf, sub_cq_size, cq->cqe_size); - buf += sub_buf_size; + switch (wc->ibv_wc.opcode) { + case IBV_WC_SEND: + return FI_SEND | FI_MSG; + case IBV_WC_RECV: + return FI_RECV | FI_MSG; + default: + assert(0); + return 0; } - - pthread_mutex_unlock(&ibctx->mutex); - return 0; - -err_unmap_buf: - munmap(cq->buf, cq->buf_size); -err_destroy_lock: - fastlock_destroy(&cq->inner_lock); -err_destroy_cq: - efa_cmd_destroy_cq(cq); -err_unlock: - pthread_mutex_unlock(&ibctx->mutex); - return err; -} - -static int efa_cq_destroy(struct efa_cq *cq) -{ - int err; - - pthread_mutex_lock(&cq->domain->ctx->ibv_ctx.mutex); - - free(cq->sub_cq_arr); - if (munmap(cq->buf, cq->buf_size)) - EFA_WARN(FI_LOG_CQ, "cq[%u]: buffer unmap failed!\n", cq->cqn); - - fastlock_destroy(&cq->inner_lock); - err = efa_cmd_destroy_cq(cq); - - pthread_mutex_unlock(&cq->domain->ctx->ibv_ctx.mutex); - - return err; } -static ssize_t efa_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry, - uint64_t flags) +ssize_t efa_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry, + uint64_t flags) { struct efa_cq *cq; struct efa_wce *wce; struct slist_entry *slist_entry; uint32_t api_version; - cq = container_of(cq_fid, struct efa_cq, cq_fid); + cq = container_of(cq_fid, struct efa_cq, util_cq.cq_fid); - fastlock_acquire(&cq->outer_lock); + fastlock_acquire(&cq->lock); if (slist_empty(&cq->wcq)) goto err; wce = container_of(cq->wcq.head, struct efa_wce, entry); - if (!wce->wc.comp_status) + if (!wce->wc.ibv_wc.status) goto err; api_version = cq->domain->fab->util_fabric.fabric_fid.api_version; slist_entry = 
slist_remove_head(&cq->wcq); - fastlock_release(&cq->outer_lock); + fastlock_release(&cq->lock); wce = container_of(slist_entry, struct efa_wce, entry); - entry->op_context = (void *)(uintptr_t)wce->wc.wr_id; - entry->flags = wce->wc.flags; + entry->op_context = (void *)(uintptr_t)wce->wc.ibv_wc.wr_id; + entry->flags = efa_cq_wc_to_fi_flags(&wce->wc); entry->err = EIO; - entry->prov_errno = wce->wc.comp_status; + entry->prov_errno = wce->wc.ibv_wc.status; + EFA_WARN(FI_LOG_CQ, "Work completion status: %s\n", ibv_wc_status_str(wce->wc.ibv_wc.status)); /* We currently don't have err_data to give back to the user. */ if (FI_VERSION_GE(api_version, FI_VERSION(1, 5))) @@ -190,7 +88,7 @@ static ssize_t efa_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *ent ofi_buf_free(wce); return sizeof(*entry); err: - fastlock_release(&cq->outer_lock); + fastlock_release(&cq->lock); return -FI_EAGAIN; } @@ -198,154 +96,46 @@ static void efa_cq_read_context_entry(struct efa_wc *wc, int i, void *buf) { struct fi_cq_entry *entry = buf; - entry[i].op_context = (void *)(uintptr_t)wc->wr_id; + entry[i].op_context = (void *)(uintptr_t)wc->ibv_wc.wr_id; } static void efa_cq_read_msg_entry(struct efa_wc *wc, int i, void *buf) { struct fi_cq_msg_entry *entry = buf; - entry[i].op_context = (void *)(uintptr_t)wc->wr_id; - entry[i].flags = wc->flags; - entry[i].len = (uint64_t)wc->byte_len; + entry[i].op_context = (void *)(uintptr_t)wc->ibv_wc.wr_id; + entry[i].flags = efa_cq_wc_to_fi_flags(wc); + entry[i].len = (uint64_t)wc->ibv_wc.byte_len; } static void efa_cq_read_data_entry(struct efa_wc *wc, int i, void *buf) { struct fi_cq_data_entry *entry = buf; - entry[i].op_context = (void *)(uintptr_t)wc->wr_id; - entry[i].flags = wc->flags; - - entry[i].data = (wc->flags & FI_REMOTE_CQ_DATA) ? ntohl(wc->imm_data) : 0; - - entry->len = (wc->flags & FI_RECV) ? wc->byte_len : 0; + entry[i].op_context = (void *)(uintptr_t)wc->ibv_wc.wr_id; + entry[i].flags = efa_cq_wc_to_fi_flags(wc); + entry[i].data = 0; + entry[i].len = (uint64_t)wc->ibv_wc.byte_len; } -static struct efa_io_cdesc_common *cq_next_sub_cqe_get(struct efa_sub_cq *sub_cq) -{ - struct efa_io_cdesc_common *cqe; - __u32 current_index; - int is_pending; - - current_index = efa_cq_sub_cq_get_current_index(sub_cq); - cqe = efa_cq_sub_cq_get_cqe(sub_cq, current_index); - is_pending = efa_cq_cqe_is_pending(cqe, sub_cq->phase); - /* We need the rmb() to ensure that the rest of the completion - * entry is only read after the phase bit has been validated. - * We unconditionally call rmb rather than leave it in the for - * loop to prevent the compiler from optimizing out loads of - * the flag if the caller is in a tight loop. - */ - rmb(); - if (is_pending) { - sub_cq->consumed_cnt++; - if (efa_cq_sub_cq_get_current_index(sub_cq) == 0) - sub_cq->phase = 1 - sub_cq->phase; - return cqe; - } - - return NULL; -} - -static int efa_cq_poll_sub_cq(struct efa_cq *cq, struct efa_sub_cq *sub_cq, - struct efa_qp **cur_qp, struct efa_wc *wc) -{ - struct efa_context *ctx = to_efa_ctx(cq->ibv_cq.context); - struct efa_io_cdesc_common *cqe; - struct efa_wq *wq; - uint32_t qpn, wrid_idx; - - cqe = cq_next_sub_cqe_get(sub_cq); - if (!cqe) - return -FI_EAGAIN; - - qpn = cqe->qp_num; - if (!*cur_qp || (qpn != (*cur_qp)->qp_num)) { - /* We do not have to take the QP table lock here, - * because CQs will be locked while QPs are removed - * from the table. 
- */ - *cur_qp = ctx->qp_table[qpn]; - if (!*cur_qp) - return -FI_EOTHER; - } - - wrid_idx = cqe->req_id; - wc->comp_status = cqe->status; - wc->flags = 0; - if (get_efa_io_cdesc_common_q_type(cqe) == EFA_IO_SEND_QUEUE) { - wq = &(*cur_qp)->sq.wq; - wc->flags = FI_SEND | FI_MSG; - wc->efa_ah = 0; /* AH report is valid for RX only */ - wc->src_qp = 0; - } else { - struct efa_io_rx_cdesc *rcqe = - container_of(cqe, struct efa_io_rx_cdesc, common); - - wq = &(*cur_qp)->rq.wq; - wc->byte_len = cqe->length; - wc->flags = FI_RECV | FI_MSG; - if (get_efa_io_cdesc_common_has_imm(cqe)) { - wc->flags |= FI_REMOTE_CQ_DATA; - wc->imm_data = rcqe->imm; - } - wc->efa_ah = rcqe->ah; - wc->src_qp = rcqe->src_qp_num; - } - - wc->qp = *cur_qp; - wq->wrid_idx_pool_next--; - wq->wrid_idx_pool[wq->wrid_idx_pool_next] = wrid_idx; - wc->wr_id = wq->wrid[wrid_idx]; - wq->wqe_completed++; - - return FI_SUCCESS; -} - -/* Must call with cq->outer_lock held */ -ssize_t efa_poll_cq(struct efa_cq *cq, struct efa_wc *wc) -{ - uint16_t num_sub_cqs = cq->num_sub_cqs; - struct efa_sub_cq *sub_cq; - struct efa_qp *qp = NULL; - int err = FI_SUCCESS; - uint16_t sub_cq_idx; - - fastlock_acquire(&cq->inner_lock); - for (sub_cq_idx = 0; sub_cq_idx < num_sub_cqs; ++sub_cq_idx) { - sub_cq = &cq->sub_cq_arr[cq->next_poll_idx++]; - cq->next_poll_idx %= num_sub_cqs; - - if (!sub_cq->ref_cnt) - continue; - - err = efa_cq_poll_sub_cq(cq, sub_cq, &qp, wc); - if (err != -FI_EAGAIN) - break; - } - fastlock_release(&cq->inner_lock); - - return err; -} - -static ssize_t efa_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count, - fi_addr_t *src_addr) +ssize_t efa_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count, + fi_addr_t *src_addr) { struct efa_cq *cq; struct efa_wce *wce; struct slist_entry *entry; + struct efa_av *av; struct efa_wc wc; ssize_t ret = 0, i; - cq = container_of(cq_fid, struct efa_cq, cq_fid); + cq = container_of(cq_fid, struct efa_cq, util_cq.cq_fid); - fastlock_acquire(&cq->outer_lock); + fastlock_acquire(&cq->lock); for (i = 0; i < count; i++) { if (!slist_empty(&cq->wcq)) { wce = container_of(cq->wcq.head, struct efa_wce, entry); - if (wce->wc.comp_status) { + if (wce->wc.ibv_wc.status) { ret = -FI_EAVAIL; break; } @@ -356,15 +146,18 @@ static ssize_t efa_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count, continue; } - ret = efa_poll_cq(cq, &wc); - if (ret) + ret = ibv_poll_cq(cq->ibv_cq, 1, &wc.ibv_wc); + if (ret != 1) { + if (!ret) + ret = -FI_EAGAIN; break; + } /* Insert error entry into wcq */ - if (wc.comp_status) { + if (wc.ibv_wc.status) { wce = ofi_buf_alloc(cq->wce_pool); if (!wce) { - fastlock_release(&cq->outer_lock); + fastlock_release(&cq->lock); return -FI_ENOMEM; } memset(wce, 0, sizeof(*wce)); @@ -374,60 +167,35 @@ static ssize_t efa_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count, break; } - if (src_addr) - src_addr[i] = efa_ah_qpn_to_addr(wc.qp->ep, wc.efa_ah, - wc.src_qp); + if (src_addr) { + av = cq->domain->qp_table[wc.ibv_wc.qp_num & + cq->domain->qp_table_sz_m1]->ep->av; + + src_addr[i] = efa_ahn_qpn_to_addr(av, + wc.ibv_wc.slid, + wc.ibv_wc.src_qp); + } cq->read_entry(&wc, i, buf); } - fastlock_release(&cq->outer_lock); + fastlock_release(&cq->lock); return i ? 
i : ret; } -static ssize_t efa_cq_read(struct fid_cq *cq_fid, void *buf, size_t count) -{ - return efa_cq_readfrom(cq_fid, buf, count, NULL); -} - static const char *efa_cq_strerror(struct fid_cq *cq_fid, int prov_errno, const void *err_data, char *buf, size_t len) { - static const char *const status_str[] = { - [EFA_IO_COMP_STATUS_OK] = "Success", - [EFA_IO_COMP_STATUS_FLUSHED] = "Flushed during qp destroy", - [EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR] = "Internal qp error", - [EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE] = "Invalid op type", - [EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH] = "Invalid ah", - [EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY] = "Invalid lkey", - [EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH] = "Local message too long", - [EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS] = "Bad remote address", - [EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT] = "Remote aborted", - [EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN] = "Bad dest qpn", - [EFA_IO_COMP_STATUS_REMOTE_ERROR_RNR] = "Destination rnr", - [EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH] = "Remote message too long", - [EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS] = "Unexpected status by responder", - }; - const char *strerr; - - if (prov_errno < EFA_IO_COMP_STATUS_OK || - prov_errno > EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS || - !status_str[prov_errno]) - strerr = "unknown error"; - else - strerr = status_str[prov_errno]; - - if (buf && len) - strncpy(buf, strerr, len); - return strerr; + /* XXX use vendor_error */ + return "unknown error"; } static struct fi_ops_cq efa_cq_ops = { .size = sizeof(struct fi_ops_cq), - .read = efa_cq_read, - .readfrom = efa_cq_readfrom, - .readerr = efa_cq_readerr, + .read = ofi_cq_read, + .readfrom = ofi_cq_readfrom, + .readerr = ofi_cq_readerr, .sread = fi_no_cq_sread, .sreadfrom = fi_no_cq_sreadfrom, .signal = fi_no_cq_signal, @@ -454,25 +222,30 @@ static int efa_cq_close(fid_t fid) struct slist_entry *entry; int ret; - cq = container_of(fid, struct efa_cq, cq_fid.fid); + cq = container_of(fid, struct efa_cq, util_cq.cq_fid.fid); - fastlock_acquire(&cq->outer_lock); + fastlock_acquire(&cq->lock); while (!slist_empty(&cq->wcq)) { entry = slist_remove_head(&cq->wcq); wce = container_of(entry, struct efa_wce, entry); ofi_buf_free(wce); } - fastlock_release(&cq->outer_lock); + fastlock_release(&cq->lock); ofi_bufpool_destroy(cq->wce_pool); - fastlock_destroy(&cq->outer_lock); + fastlock_destroy(&cq->lock); + + ret = -ibv_destroy_cq(cq->ibv_cq); + if (ret) + return ret; - ret = efa_cq_destroy(cq); + ret = ofi_cq_cleanup(&cq->util_cq); if (ret) return ret; free(cq); + return 0; } @@ -491,26 +264,29 @@ int efa_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, size_t size; int ret; + if (attr->wait_obj != FI_WAIT_NONE) + return -FI_ENOSYS; + cq = calloc(1, sizeof(*cq)); if (!cq) return -FI_ENOMEM; - cq->domain = container_of(domain_fid, struct efa_domain, - util_domain.domain_fid); - - switch (attr->wait_obj) { - case FI_WAIT_NONE: - break; - default: - ret = -FI_ENOSYS; + ret = ofi_cq_init(&efa_prov, domain_fid, attr, &cq->util_cq, + &ofi_cq_progress, context); + if (ret) { + EFA_WARN(FI_LOG_CQ, "Unable to create UTIL_CQ\n"); goto err_free_cq; } + cq->domain = container_of(domain_fid, struct efa_domain, + util_domain.domain_fid); + size = attr->size ? 
attr->size : EFA_DEF_CQ_SIZE; - ret = efa_cq_create(cq, cq->domain->ctx, size); - if (ret) { + cq->ibv_cq = ibv_create_cq(cq->domain->ctx->ibv_ctx, size, NULL, NULL, 0); + if (!cq->ibv_cq) { EFA_WARN(FI_LOG_CQ, "Unable to create CQ\n"); - goto err_free_cq; + ret = -FI_EINVAL; + goto err_free_util_cq; } ret = ofi_bufpool_create(&cq->wce_pool, sizeof(struct efa_wce), 16, 0, @@ -520,12 +296,6 @@ int efa_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, goto err_destroy_cq; } - cq->next_poll_idx = 0; - cq->cq_fid.fid.fclass = FI_CLASS_CQ; - cq->cq_fid.fid.context = context; - cq->cq_fid.fid.ops = &efa_cq_fi_ops; - cq->cq_fid.ops = &efa_cq_ops; - switch (attr->format) { case FI_CQ_FORMAT_UNSPEC: case FI_CQ_FORMAT_CONTEXT: @@ -546,28 +316,26 @@ int efa_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, goto err_destroy_pool; } - fastlock_init(&cq->outer_lock); + fastlock_init(&cq->lock); slist_init(&cq->wcq); - *cq_fid = &cq->cq_fid; + *cq_fid = &cq->util_cq.cq_fid; + (*cq_fid)->fid.fclass = FI_CLASS_CQ; + (*cq_fid)->fid.context = context; + (*cq_fid)->fid.ops = &efa_cq_fi_ops; + (*cq_fid)->ops = &efa_cq_ops; + return 0; err_destroy_pool: ofi_bufpool_destroy(cq->wce_pool); err_destroy_cq: - efa_cq_destroy(cq); + ibv_destroy_cq(cq->ibv_cq); +err_free_util_cq: + ofi_cq_cleanup(&cq->util_cq); err_free_cq: free(cq); return ret; } -void efa_cq_inc_ref_cnt(struct efa_cq *cq, uint8_t sub_cq_idx) -{ - cq->sub_cq_arr[sub_cq_idx].ref_cnt++; -} - -void efa_cq_dec_ref_cnt(struct efa_cq *cq, uint8_t sub_cq_idx) -{ - cq->sub_cq_arr[sub_cq_idx].ref_cnt--; -} diff --git a/prov/efa/src/efa_verbs/efa_device.c b/prov/efa/src/efa_device.c similarity index 60% rename from prov/efa/src/efa_verbs/efa_device.c rename to prov/efa/src/efa_device.c index 9528017e8ef..d60da55e281 100644 --- a/prov/efa/src/efa_verbs/efa_device.c +++ b/prov/efa/src/efa_device.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2017-2018 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -47,113 +47,36 @@ #include -#include "efa_ib.h" -#include "efa_io_defs.h" -#include "efa_cmd.h" +#include "efa.h" static struct efa_context **ctx_list; static int dev_cnt; -#define EFA_UVERBS_DEV_PATH "/dev/infiniband/" -#define EFA_EVERBS_DEV_NAME "efa_everbs" - -static int efa_everbs_init_cmd_file(struct efa_context *context, int devnum) -{ - int exp_mask = (EFA_USER_CMDS_SUPP_UDATA_CREATE_AH | - EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE); - char *efa_everbs_dev_path; - int efa_everbs_cmd_fd; - - /* everbs cmd file is not created/needed on newer kernels */ - if ((context->cmds_supp_udata & exp_mask) == exp_mask) - return 0; - - if (asprintf(&efa_everbs_dev_path, EFA_UVERBS_DEV_PATH EFA_EVERBS_DEV_NAME "%d", devnum) < 0) - return -errno; - - efa_everbs_cmd_fd = open(efa_everbs_dev_path, O_RDWR | O_CLOEXEC); - free(efa_everbs_dev_path); - if (efa_everbs_cmd_fd < 0) { - EFA_WARN(FI_LOG_FABRIC, "fail to open efa_everbs cmd file [%d]\n", - efa_everbs_cmd_fd); - return -errno; - } - context->efa_everbs_cmd_fd = efa_everbs_cmd_fd; - - return 0; -} - -static inline int efa_device_parse_everbs_idx(struct ibv_device *device) -{ - int devnum; - - if (sscanf(device->dev_name, "uverbs%d", &devnum) != 1) - return -EINVAL; - - return devnum; -} - static struct efa_context *efa_device_open(struct ibv_device *device) { struct efa_context *ctx; - char *devpath; - int cmd_fd; - int devnum; - int ret; - - if (asprintf(&devpath, EFA_UVERBS_DEV_PATH "%s", device->dev_name) < 0) - return NULL; - - /* - * We'll only be doing writes, but we need O_RDWR in case the - * provider needs to mmap() the file. - */ - cmd_fd = open(devpath, O_RDWR | O_CLOEXEC); - free(devpath); - - if (cmd_fd < 0) - return NULL; ctx = calloc(1, sizeof(struct efa_context)); if (!ctx) { errno = ENOMEM; - goto err_close_fd; + return NULL; } - ret = efa_cmd_alloc_ucontext(device, ctx, cmd_fd); - if (ret) - goto err_free_ctx; - - ctx->cqe_size = sizeof(struct efa_io_rx_cdesc); - if (ctx->cqe_size <= 0) - goto err_free_ctx; - - devnum = efa_device_parse_everbs_idx(device); - if (efa_everbs_init_cmd_file(ctx, devnum)) + ctx->ibv_ctx = ibv_open_device(device); + if (!ctx->ibv_ctx) goto err_free_ctx; - pthread_mutex_init(&ctx->ibv_ctx.mutex, NULL); - return ctx; err_free_ctx: free(ctx); -err_close_fd: - close(cmd_fd); return NULL; } static int efa_device_close(struct efa_context *ctx) { - int cmd_fd; - - pthread_mutex_destroy(&ctx->ibv_ctx.mutex); - cmd_fd = ctx->ibv_ctx.cmd_fd; - if (ctx->efa_everbs_cmd_fd) - close(ctx->efa_everbs_cmd_fd); - free(ctx->ibv_ctx.device); + ibv_close_device(ctx->ibv_ctx); free(ctx); - close(cmd_fd); return 0; } @@ -164,7 +87,9 @@ int efa_device_init(void) int ctx_idx; int ret; - dev_cnt = efa_ib_init(&device_list); + fastlock_init(&pd_list_lock); + + device_list = ibv_get_device_list(&dev_cnt); if (dev_cnt <= 0) return -ENODEV; @@ -174,28 +99,57 @@ int efa_device_init(void) goto err_free_dev_list; } + pd_list = calloc(dev_cnt, sizeof(*pd_list)); + if (!pd_list) { + ret = -ENOMEM; + goto err_free_ctx_list; + } + for (ctx_idx = 0; ctx_idx < dev_cnt; ctx_idx++) { ctx_list[ctx_idx] = efa_device_open(device_list[ctx_idx]); if (!ctx_list[ctx_idx]) { ret = -ENODEV; goto err_close_devs; } + ctx_list[ctx_idx]->dev_idx = ctx_idx; } - free(device_list); + ibv_free_device_list(device_list); return 0; err_close_devs: for (ctx_idx--; ctx_idx >= 0; ctx_idx--) efa_device_close(ctx_list[ctx_idx]); + free(pd_list); +err_free_ctx_list: free(ctx_list); err_free_dev_list: - 
free(device_list); + ibv_free_device_list(device_list); dev_cnt = 0; return ret; } +bool efa_device_support_rdma_read(void) +{ +#ifdef HAVE_RDMA_SIZE + int err; + struct efadv_device_attr efadv_attr; + + if (dev_cnt <=0) + return false; + + assert(dev_cnt > 0); + err = efadv_query_device(ctx_list[0]->ibv_ctx, &efadv_attr, sizeof(efadv_attr)); + if (err) + return false; + + return efadv_attr.device_caps & EFADV_DEVICE_ATTR_CAPS_RDMA_READ; +#else + return false; +#endif +} + void efa_device_free(void) { int i; @@ -203,8 +157,10 @@ void efa_device_free(void) for (i = 0; i < dev_cnt; i++) efa_device_close(ctx_list[i]); + free(pd_list); free(ctx_list); dev_cnt = 0; + fastlock_destroy(&pd_list_lock); } struct efa_context **efa_device_get_context_list(int *num_ctx) diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c index c8c117a0df3..2250411bdff 100644 --- a/prov/efa/src/efa_domain.c +++ b/prov/efa/src/efa_domain.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. - * Copyright (c) 2017-2018 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -35,33 +35,56 @@ #include #include "efa.h" -#include "efa_verbs.h" +#include "rxr_cntr.h" + +fastlock_t pd_list_lock; +struct efa_pd *pd_list = NULL; static int efa_domain_close(fid_t fid) { struct efa_domain *domain; + struct efa_pd *efa_pd; int ret; domain = container_of(fid, struct efa_domain, util_domain.domain_fid.fid); - if (efa_mr_cache_enable) - ofi_mr_cache_cleanup(&domain->cache); + if (efa_is_cache_available(domain)) { + ofi_mr_cache_cleanup(domain->cache); + free(domain->cache); + domain->cache = NULL; + } - if (domain->pd) { - ret = efa_cmd_dealloc_pd(domain->pd); - if (ret) { - EFA_INFO_ERRNO(FI_LOG_DOMAIN, "efa_cmd_dealloc_pd", ret); - return ret; + if (domain->ibv_pd) { + fastlock_acquire(&pd_list_lock); + efa_pd = &pd_list[domain->ctx->dev_idx]; + if (efa_pd->use_cnt == 1) { + ret = -ibv_dealloc_pd(domain->ibv_pd); + if (ret) { + fastlock_release(&pd_list_lock); + EFA_INFO_ERRNO(FI_LOG_DOMAIN, "ibv_dealloc_pd", + ret); + return ret; + } + efa_pd->ibv_pd = NULL; } - domain->pd = NULL; + efa_pd->use_cnt--; + domain->ibv_pd = NULL; + fastlock_release(&pd_list_lock); } ret = ofi_domain_close(&domain->util_domain); if (ret) return ret; + if (domain->shm_domain) { + ret = fi_close(&domain->shm_domain->fid); + if (ret) + return ret; + } + fi_freeinfo(domain->info); + free(domain->qp_table); free(domain); return 0; } @@ -80,23 +103,83 @@ static int efa_open_device_by_name(struct efa_domain *domain, const char *name) if (!ctx_list) return -errno; - if (domain->rdm) + if (domain->type == EFA_DOMAIN_RDM) name_len = strlen(name) - strlen(efa_rdm_domain.suffix); else name_len = strlen(name) - strlen(efa_dgrm_domain.suffix); for (i = 0; i < num_ctx; i++) { - ret = strncmp(name, ctx_list[i]->ibv_ctx.device->name, name_len); + ret = strncmp(name, ctx_list[i]->ibv_ctx->device->name, name_len); if (!ret) { domain->ctx = ctx_list[i]; break; } } + /* + * Check if a PD has already been allocated for this device and reuse + * it if this is the case. 
+ */ + fastlock_acquire(&pd_list_lock); + if (pd_list[i].ibv_pd) { + domain->ibv_pd = pd_list[i].ibv_pd; + pd_list[i].use_cnt++; + } else { + domain->ibv_pd = ibv_alloc_pd(domain->ctx->ibv_ctx); + if (!domain->ibv_pd) { + ret = -errno; + } else { + pd_list[i].ibv_pd = domain->ibv_pd; + pd_list[i].use_cnt++; + } + } + fastlock_release(&pd_list_lock); + efa_device_free_context_list(ctx_list); return ret; } +/* + * Register a temporary buffer and call ibv_fork_init() to determine if fork + * support is enabled. + * + * This relies on internal behavior in rdma-core and is a temporary workaround. + */ +static int efa_check_fork_enabled(struct fid_domain *domain_fid) +{ + struct fid_mr *mr; + char *buf; + int ret; + + buf = malloc(ofi_get_page_size()); + if (!buf) + return -FI_ENOMEM; + + ret = fi_mr_reg(domain_fid, buf, ofi_get_page_size(), + FI_SEND, 0, 0, 0, &mr, NULL); + if (ret) { + free(buf); + return ret; + } + + /* + * libibverbs maintains a global variable to determine if any + * registrations have occurred before ibv_fork_init() is called. + * EINVAL is returned if a memory region was registered before + * ibv_fork_init() was called and returns 0 if fork support is + * initialized already. + */ + ret = ibv_fork_init(); + + fi_close(&mr->fid); + free(buf); + + if (ret == EINVAL) + return 0; + + return 1; +} + static struct fi_ops efa_fid_ops = { .size = sizeof(struct fi_ops), .close = efa_domain_close, @@ -111,11 +194,12 @@ static struct fi_ops_domain efa_domain_ops = { .cq_open = efa_cq_open, .endpoint = efa_ep_open, .scalable_ep = fi_no_scalable_ep, - .cntr_open = fi_no_cntr_open, + .cntr_open = efa_cntr_open, .poll_open = fi_no_poll_open, .stx_ctx = fi_no_stx_context, .srx_ctx = fi_no_srx_context, .query_atomic = fi_no_query_atomic, + .query_collective = fi_no_query_collective, }; int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info, @@ -124,7 +208,12 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info, struct efa_domain *domain; struct efa_fabric *fabric; const struct fi_info *fi; + size_t qp_table_size; + bool app_mr_local; int ret; + struct ofi_mem_monitor *memory_monitors[OFI_HMEM_MAX] = { + [FI_HMEM_SYSTEM] = uffd_monitor, + }; fi = efa_get_efa_info(info->domain_attr->name); if (!fi) @@ -141,10 +230,18 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info, if (!domain) return -FI_ENOMEM; + qp_table_size = roundup_power_of_two(info->domain_attr->ep_cnt); + domain->qp_table_sz_m1 = qp_table_size - 1; + domain->qp_table = calloc(qp_table_size, sizeof(*domain->qp_table)); + if (!domain->qp_table) { + ret = -FI_ENOMEM; + goto err_free_domain; + } + ret = ofi_domain_init(fabric_fid, info, &domain->util_domain, context); if (ret) - goto err_free_domain; + goto err_free_qp_table; domain->info = fi_dupinfo(info); if (!domain->info) { @@ -152,56 +249,99 @@ int efa_domain_open(struct fid_fabric *fabric_fid, struct fi_info *info, goto err_close_domain; } - domain->rdm = EFA_EP_TYPE_IS_RDM(info); + if (EFA_EP_TYPE_IS_RDM(info)) { + struct rxr_domain *rxr_domain; + domain->type = EFA_DOMAIN_RDM; + rxr_domain = container_of(domain_fid, struct rxr_domain, + rdm_domain); + app_mr_local = rxr_domain->rxr_mr_local; + } else { + domain->type = EFA_DOMAIN_DGRAM; + /* DGRAM always requires FI_MR_LOCAL */ + app_mr_local = true; + } ret = efa_open_device_by_name(domain, info->domain_attr->name); if (ret) goto err_free_info; - domain->pd = efa_cmd_alloc_pd(domain->ctx); - if (!domain->pd) { - ret = -errno; - goto err_free_info; - } - - 
EFA_INFO(FI_LOG_DOMAIN, "Allocated pd[%u].\n", domain->pd->pdn); - domain->util_domain.domain_fid.fid.ops = &efa_fid_ops; domain->util_domain.domain_fid.ops = &efa_domain_ops; - + /* RMA mr_modes are being removed, since EFA layer + * does not have RMA capabilities. Hence, adding FI_MR_VIRT_ADDR + * until RMA capabilities are added to EFA layer + */ + domain->util_domain.mr_map.mode |= FI_MR_VIRT_ADDR; + /* + * ofi_domain_init() would have stored the EFA mr_modes in the mr_map, + * but we need the rbtree insertions and lookups to use EFA provider's + * specific key, so unset the FI_MR_PROV_KEY bit for mr_map. + */ + domain->util_domain.mr_map.mode &= ~FI_MR_PROV_KEY; domain->fab = fabric; + domain->util_domain.domain_fid.mr = &efa_domain_mr_ops; + *domain_fid = &domain->util_domain.domain_fid; - if (efa_mr_cache_enable) { + domain->cache = NULL; + + /* + * Check whether fork support is enabled when app does not request + * FI_MR_LOCAL even if the cache is disabled. + */ + if (!app_mr_local && efa_check_fork_enabled(*domain_fid)) { + fprintf(stderr, + "\nlibibverbs fork support is not supported by the EFA Libfabric\n" + "provider when memory registrations are handled by the provider.\n" + "\nFork support may currently be enabled via the RDMAV_FORK_SAFE\n" + "or IBV_FORK_SAFE environment variable or another library in your\n" + "application may be calling ibv_fork_init().\n" + "\nPlease refer to https://github.com/ofiwg/libfabric/issues/6332\n" + "for more information. Your job will now abort.\n"); + abort(); + } + + /* + * If FI_MR_LOCAL is set, we do not want to use the MR cache. + */ + if (!app_mr_local && efa_mr_cache_enable) { + domain->cache = (struct ofi_mr_cache *)calloc(1, sizeof(struct ofi_mr_cache)); + if (!domain->cache) { + ret = -FI_ENOMEM; + goto err_free_info; + } + if (!efa_mr_max_cached_count) - efa_mr_max_cached_count = info->domain_attr->mr_cnt / - EFA_DEF_NUM_MR_CACHE; + efa_mr_max_cached_count = info->domain_attr->mr_cnt * + EFA_MR_CACHE_LIMIT_MULT; if (!efa_mr_max_cached_size) - efa_mr_max_cached_size = domain->ctx->max_mr_size / - EFA_DEF_NUM_MR_CACHE; + efa_mr_max_cached_size = domain->ctx->max_mr_size * + EFA_MR_CACHE_LIMIT_MULT; cache_params.max_cnt = efa_mr_max_cached_count; cache_params.max_size = efa_mr_max_cached_size; - cache_params.merge_regions = efa_mr_cache_merge_regions; - domain->cache.entry_data_size = sizeof(struct efa_mem_desc); - domain->cache.add_region = efa_mr_cache_entry_reg; - domain->cache.delete_region = efa_mr_cache_entry_dereg; - ret = ofi_mr_cache_init(&domain->util_domain, uffd_monitor, - &domain->cache); + domain->cache->entry_data_size = sizeof(struct efa_mr); + domain->cache->add_region = efa_mr_cache_entry_reg; + domain->cache->delete_region = efa_mr_cache_entry_dereg; + ret = ofi_mr_cache_init(&domain->util_domain, memory_monitors, + domain->cache); if (!ret) { domain->util_domain.domain_fid.mr = &efa_domain_mr_cache_ops; - return 0; + EFA_INFO(FI_LOG_DOMAIN, "EFA MR cache enabled, max_cnt: %zu max_size: %zu\n", + cache_params.max_cnt, cache_params.max_size); + } else { + free(domain->cache); + domain->cache = NULL; } } - domain->util_domain.domain_fid.mr = &efa_domain_mr_ops; - efa_mr_cache_enable = 0; - return 0; err_free_info: fi_freeinfo(domain->info); err_close_domain: ofi_domain_close(&domain->util_domain); +err_free_qp_table: + free(domain->qp_table); err_free_domain: free(domain); return ret; diff --git a/prov/efa/src/efa_ep.c b/prov/efa/src/efa_ep.c index 9f6ef6d4791..76ae3228c7a 100644 --- a/prov/efa/src/efa_ep.c +++ 
b/prov/efa/src/efa_ep.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. - * Copyright (c) 2017-2018 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -32,313 +32,148 @@ */ #include "config.h" - #include "efa.h" -#include "efa_verbs.h" -#include "efa_ib.h" -#include "efa_io_defs.h" - -static void efa_ep_init_qp_indices(struct efa_qp *qp) -{ - qp->sq.wq.wqe_posted = 0; - qp->sq.wq.wqe_completed = 0; - qp->sq.wq.desc_idx = 0; - qp->sq.wq.wrid_idx_pool_next = 0; - - qp->rq.wq.wqe_posted = 0; - qp->rq.wq.wqe_completed = 0; - qp->rq.wq.desc_idx = 0; - qp->rq.wq.wrid_idx_pool_next = 0; -} - -static void efa_ep_setup_qp(struct efa_qp *qp, - struct ibv_qp_cap *cap, - size_t page_size) -{ - uint16_t rq_desc_cnt; - - efa_ep_init_qp_indices(qp); - - qp->sq.wq.wqe_cnt = align_up_queue_size(cap->max_send_wr); - qp->sq.wq.max_sge = cap->max_send_sge; - qp->sq.wq.desc_mask = qp->sq.wq.wqe_cnt - 1; - qp->rq.wq.max_sge = cap->max_recv_sge; - rq_desc_cnt = align_up_queue_size(cap->max_recv_sge * cap->max_recv_wr); - qp->rq.wq.desc_mask = rq_desc_cnt - 1; - qp->rq.wq.wqe_cnt = rq_desc_cnt / qp->rq.wq.max_sge; - - qp->page_size = page_size; -} +#include +#include +#define EFA_CQ_PROGRESS_ENTRIES 500 -static void efa_ep_wq_terminate(struct efa_wq *wq) +static int efa_generate_qkey() { - free(wq->wrid_idx_pool); - free(wq->wrid); -} - -static int efa_ep_wq_initialize(struct efa_wq *wq) -{ - int i, err; - - wq->wrid = malloc(wq->wqe_cnt * sizeof(*wq->wrid)); - if (!wq->wrid) - return -ENOMEM; + struct timeval tv; + struct timezone tz; + uint32_t val; + int err; - wq->wrid_idx_pool = malloc(wq->wqe_cnt * sizeof(__u32)); - if (!wq->wrid_idx_pool) { - err = -ENOMEM; - goto err_free_wrid; + err = gettimeofday(&tv, &tz); + if (err) { + EFA_WARN(FI_LOG_EP_CTRL, "Cannot gettimeofday, err=%d.\n", err); + return 0; } - /* Initialize the wrid free indexes pool. */ - for (i = 0; i < wq->wqe_cnt; i++) - wq->wrid_idx_pool[i] = i; + /* tv_usec is in range [0,1,000,000), shift it by 12 to [0,4,096,000,000 */ + val = (tv.tv_usec << 12) + tv.tv_sec; - return 0; + val = ofi_xorshift_random(val); -err_free_wrid: - free(wq->wrid); + /* 0x80000000 and up is privileged Q Key range. 
*/ + val &= 0x7fffffff; - return err; + return val; } -static int efa_ep_sq_initialize(struct efa_qp *qp, struct efa_create_qp_resp *resp, int fd) +static int efa_ep_destroy_qp(struct efa_qp *qp) { - size_t desc_ring_size; - uint8_t *db_base; + struct efa_domain *domain; int err; - if (!qp->sq.wq.wqe_cnt) + if (!qp) return 0; - err = efa_ep_wq_initialize(&qp->sq.wq); + domain = qp->ep->domain; + domain->qp_table[qp->qp_num & domain->qp_table_sz_m1] = NULL; + err = -ibv_destroy_qp(qp->ibv_qp); if (err) - return err; - - qp->sq.immediate_data_width = 8; - qp->sq.desc_offset = resp->efa_resp.llq_desc_offset; - desc_ring_size = qp->sq.wq.wqe_cnt * sizeof(struct efa_io_tx_wqe); - qp->sq.desc_ring_mmap_size = align(desc_ring_size + qp->sq.desc_offset, qp->page_size); - qp->sq.max_inline_data = resp->ibv_resp.max_inline_data; - - qp->sq.desc = mmap(NULL, qp->sq.desc_ring_mmap_size, PROT_WRITE, - MAP_SHARED, fd, resp->efa_resp.llq_desc_mmap_key); - if (qp->sq.desc == MAP_FAILED) - goto err_terminate_wq; - qp->sq.desc += qp->sq.desc_offset; - - db_base = mmap(NULL, qp->page_size, PROT_WRITE, MAP_SHARED, fd, resp->efa_resp.sq_db_mmap_key); - if (db_base == MAP_FAILED) - goto err_unmap_desc_ring; - qp->sq.db = (uint32_t *)(db_base + resp->efa_resp.sq_db_offset); - qp->sq.sub_cq_idx = resp->efa_resp.send_sub_cq_idx; - - return 0; + EFA_INFO(FI_LOG_CORE, "destroy qp[%u] failed!\n", qp->qp_num); -err_unmap_desc_ring: - if (munmap(qp->sq.desc - qp->sq.desc_offset, qp->sq.desc_ring_mmap_size)) - EFA_WARN(FI_LOG_EP_CTRL, "qp[%u]: desc unmap failed!\n", qp->qp_num); -err_terminate_wq: - efa_ep_wq_terminate(&qp->sq.wq); - return -EINVAL; + free(qp); + return err; } -static void efa_ep_sq_terminate(struct efa_qp *qp) +static int efa_ep_modify_qp_state(struct efa_qp *qp, enum ibv_qp_state qp_state, + int attr_mask) { - void *db_aligned; + struct ibv_qp_attr attr = {}; - if (!qp->sq.wq.wrid) - return; + attr.qp_state = qp_state; - db_aligned = (void *)((__u64)qp->sq.db & ~(qp->page_size - 1)); - if (munmap(db_aligned, qp->page_size)) - EFA_WARN(FI_LOG_EP_CTRL, "qp[%u]: sq db unmap failed!\n", qp->qp_num); - if (munmap(qp->sq.desc - qp->sq.desc_offset, qp->sq.desc_ring_mmap_size)) - EFA_WARN(FI_LOG_EP_CTRL, "qp[%u]: desc data unmap failed!\n", qp->qp_num); + if (attr_mask & IBV_QP_PORT) + attr.port_num = 1; - efa_ep_wq_terminate(&qp->sq.wq); -} + if (attr_mask & IBV_QP_QKEY) + attr.qkey = qp->qkey; -static void efa_ep_rq_terminate(struct efa_qp *qp) -{ - void *db_aligned; - - if (!qp->rq.wq.wrid) - return; + /* + * You can set how many times the firmware retries here. + * Valid values are from 0(included) to 7(included). + * 0 stands for no firmware level retries. + * 7 means firmware retries infinitely. 
+ */ + if (attr_mask & IBV_QP_RNR_RETRY) + attr.rnr_retry = 7; - db_aligned = (void *)((__u64)qp->rq.db & ~(qp->page_size - 1)); - if (munmap(db_aligned, qp->page_size)) - EFA_WARN(FI_LOG_EP_CTRL, "qp[%u]: rq db unmap failed!\n", qp->qp_num); - if (munmap(qp->rq.buf, qp->rq.buf_size)) - EFA_WARN(FI_LOG_EP_CTRL, "qp[%u]: rq buffer unmap failed!\n", qp->qp_num); + return -ibv_modify_qp(qp->ibv_qp, &attr, attr_mask); - efa_ep_wq_terminate(&qp->rq.wq); } -static int efa_ep_rq_initialize(struct efa_qp *qp, struct efa_create_qp_resp *resp, int fd) +static int efa_ep_modify_qp_rst2rts(struct efa_ep *ep, struct efa_qp *qp) { - uint8_t *db_base; int err; - if (!qp->rq.wq.wqe_cnt) - return 0; - - err = efa_ep_wq_initialize(&qp->rq.wq); + err = efa_ep_modify_qp_state(qp, IBV_QPS_INIT, + IBV_QP_STATE | IBV_QP_PKEY_INDEX | + IBV_QP_PORT | IBV_QP_QKEY); if (err) return err; - qp->rq.buf_size = resp->efa_resp.rq_mmap_size; - qp->rq.buf = mmap(NULL, qp->rq.buf_size, PROT_WRITE, MAP_SHARED, fd, resp->efa_resp.rq_mmap_key); - if (qp->rq.buf == MAP_FAILED) - goto err_terminate_wq; - - db_base = mmap(NULL, qp->page_size, PROT_WRITE, MAP_SHARED, fd, resp->efa_resp.rq_db_mmap_key); - if (db_base == MAP_FAILED) - goto err_unmap_rq_buf; - qp->rq.db = (uint32_t *)(db_base + resp->efa_resp.rq_db_offset); - qp->rq.sub_cq_idx = resp->efa_resp.recv_sub_cq_idx; - - return 0; - -err_unmap_rq_buf: - if (munmap(qp->rq.buf, qp->rq.buf_size)) - EFA_WARN(FI_LOG_EP_CTRL, "qp[%u]: rq buf unmap failed!\n", qp->qp_num); -err_terminate_wq: - efa_ep_wq_terminate(&qp->rq.wq); - return -EINVAL; -} - -static void efa_ep_lock_cqs(struct ibv_qp *ibqp) -{ - struct efa_cq *send_cq = to_efa_cq(ibqp->send_cq); - struct efa_cq *recv_cq = to_efa_cq(ibqp->recv_cq); - - if (recv_cq == send_cq && recv_cq) { - fastlock_acquire(&recv_cq->inner_lock); - } else { - if (recv_cq) - fastlock_acquire(&recv_cq->inner_lock); - if (send_cq) - fastlock_acquire(&send_cq->inner_lock); - } -} - -static void efa_ep_unlock_cqs(struct ibv_qp *ibqp) -{ - struct efa_cq *send_cq = to_efa_cq(ibqp->send_cq); - struct efa_cq *recv_cq = to_efa_cq(ibqp->recv_cq); - - if (recv_cq == send_cq && recv_cq) { - fastlock_release(&recv_cq->inner_lock); - } else { - if (recv_cq) - fastlock_release(&recv_cq->inner_lock); - if (send_cq) - fastlock_release(&send_cq->inner_lock); - } -} - -static int efa_ep_destroy_qp(struct efa_qp *qp) -{ - struct efa_context *ctx; - struct efa_cq *send_cq; - struct efa_cq *recv_cq; - struct ibv_qp *ibqp; - int err; - - if (!qp) - return 0; - - ibqp = &qp->ibv_qp; - ctx = to_efa_ctx(ibqp->context); - - pthread_mutex_lock(&ctx->qp_table_mutex); - efa_ep_lock_cqs(ibqp); - - if (ibqp->send_cq) { - send_cq = to_efa_cq(ibqp->send_cq); - efa_cq_dec_ref_cnt(send_cq, qp->sq.sub_cq_idx); - } - if (ibqp->recv_cq) { - recv_cq = to_efa_cq(ibqp->recv_cq); - efa_cq_dec_ref_cnt(recv_cq, qp->rq.sub_cq_idx); - } - ctx->qp_table[ibqp->qp_num] = NULL; - - efa_ep_unlock_cqs(ibqp); - pthread_mutex_unlock(&ctx->qp_table_mutex); - - err = efa_cmd_destroy_qp(qp); + err = efa_ep_modify_qp_state(qp, IBV_QPS_RTR, IBV_QP_STATE); if (err) - EFA_INFO(FI_LOG_CORE, "destroy qp[%u] failed!\n", qp->qp_num); - efa_ep_sq_terminate(qp); - efa_ep_rq_terminate(qp); + return err; - free(qp); - return err; + if (ep->util_ep.type != FI_EP_DGRAM && + efa_ep_support_rnr_retry_modify(&ep->util_ep.ep_fid)) + return efa_ep_modify_qp_state(qp, IBV_QPS_RTS, + IBV_QP_STATE | IBV_QP_SQ_PSN | IBV_QP_RNR_RETRY); + + return efa_ep_modify_qp_state(qp, IBV_QPS_RTS, + IBV_QP_STATE | IBV_QP_SQ_PSN); } 
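/*
 * Illustrative sketch (not taken verbatim from this patch): the
 * RESET -> INIT -> RTR -> RTS sequence that efa_ep_modify_qp_rst2rts()
 * drives above, written against plain libibverbs. Assumptions: a UD/SRD
 * QP on port 1 with pkey index 0 (the defaults used by the code above);
 * the qkey value and starting PSN are arbitrary example inputs.
 */
#include <string.h>
#include <infiniband/verbs.h>

static int example_ud_qp_to_rts(struct ibv_qp *qp, uint32_t qkey)
{
	struct ibv_qp_attr attr;
	int ret;

	/* RESET -> INIT: select port, pkey index and Q_Key. */
	memset(&attr, 0, sizeof(attr));
	attr.qp_state = IBV_QPS_INIT;
	attr.pkey_index = 0;
	attr.port_num = 1;
	attr.qkey = qkey;
	ret = ibv_modify_qp(qp, &attr,
			    IBV_QP_STATE | IBV_QP_PKEY_INDEX |
			    IBV_QP_PORT | IBV_QP_QKEY);
	if (ret)
		return ret;

	/* INIT -> RTR: no additional attributes are required for UD. */
	memset(&attr, 0, sizeof(attr));
	attr.qp_state = IBV_QPS_RTR;
	ret = ibv_modify_qp(qp, &attr, IBV_QP_STATE);
	if (ret)
		return ret;

	/* RTR -> RTS: set the initial send PSN. */
	memset(&attr, 0, sizeof(attr));
	attr.qp_state = IBV_QPS_RTS;
	attr.sq_psn = 0;
	return ibv_modify_qp(qp, &attr, IBV_QP_STATE | IBV_QP_SQ_PSN);
}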
-static int efa_ep_create_qp(struct efa_ep *ep, - struct efa_pd *pd, - struct ibv_qp_init_attr *init_attr) +static int efa_ep_create_qp_ex(struct efa_ep *ep, + struct ibv_pd *ibv_pd, + struct ibv_qp_init_attr_ex *init_attr_ex) { - struct ibv_pd *ibpd = &pd->ibv_pd; - struct efa_device *dev = to_efa_dev(ibpd->context->device); - struct efa_create_qp_resp resp; - struct efa_cq *send_cq; - struct efa_cq *recv_cq; + struct efa_domain *domain; struct efa_qp *qp; + struct efadv_qp_init_attr efa_attr = {}; int err; + domain = ep->domain; qp = calloc(1, sizeof(*qp)); if (!qp) return -FI_ENOMEM; - efa_ep_setup_qp(qp, &init_attr->cap, dev->page_size); + if (init_attr_ex->qp_type == IBV_QPT_UD) { + qp->ibv_qp = ibv_create_qp_ex(ibv_pd->context, init_attr_ex); + } else { + assert(init_attr_ex->qp_type == IBV_QPT_DRIVER); + efa_attr.driver_qp_type = EFADV_QP_DRIVER_TYPE_SRD; + qp->ibv_qp = efadv_create_qp_ex(ibv_pd->context, init_attr_ex, &efa_attr, + sizeof(struct efadv_qp_init_attr)); + } - err = efa_cmd_create_qp(qp, pd, init_attr, ep->domain->rdm, &resp); - if (err) { - EFA_WARN(FI_LOG_EP_CTRL, "efa_cmd_create_qp failed [%u]!\n", err); + if (!qp->ibv_qp) { + EFA_WARN(FI_LOG_EP_CTRL, "ibv_create_qp failed\n"); + err = -EINVAL; goto err_free_qp; } - qp->qp_num = qp->ibv_qp.qp_num; - err = efa_ep_rq_initialize(qp, &resp, ibpd->context->cmd_fd); + qp->ibv_qp_ex = ibv_qp_to_qp_ex(qp->ibv_qp); + qp->qkey = efa_generate_qkey(); + err = efa_ep_modify_qp_rst2rts(ep, qp); if (err) goto err_destroy_qp; - err = efa_ep_sq_initialize(qp, &resp, ibpd->context->cmd_fd); - if (err) - goto err_terminate_rq; - - pthread_mutex_lock(&pd->context->qp_table_mutex); - pd->context->qp_table[qp->qp_num] = qp; - pthread_mutex_unlock(&pd->context->qp_table_mutex); - - if (init_attr->send_cq) { - send_cq = to_efa_cq(init_attr->send_cq); - fastlock_acquire(&send_cq->inner_lock); - efa_cq_inc_ref_cnt(send_cq, resp.efa_resp.send_sub_cq_idx); - fastlock_release(&send_cq->inner_lock); - } - if (init_attr->recv_cq) { - recv_cq = to_efa_cq(init_attr->recv_cq); - fastlock_acquire(&recv_cq->inner_lock); - efa_cq_inc_ref_cnt(recv_cq, resp.efa_resp.recv_sub_cq_idx); - fastlock_release(&recv_cq->inner_lock); - } - + qp->qp_num = qp->ibv_qp->qp_num; ep->qp = qp; qp->ep = ep; - EFA_INFO(FI_LOG_EP_CTRL, "%s(): create QP %d\n", __func__, qp->qp_num); + domain->qp_table[ep->qp->qp_num & domain->qp_table_sz_m1] = ep->qp; + EFA_INFO(FI_LOG_EP_CTRL, "%s(): create QP %d qkey: %d\n", __func__, qp->qp_num, qp->qkey); return 0; -err_terminate_rq: - efa_ep_rq_terminate(qp); err_destroy_qp: - efa_cmd_destroy_qp(qp); + ibv_destroy_qp(qp->ibv_qp); err_free_qp: free(qp); @@ -400,9 +235,14 @@ static struct efa_ep *efa_ep_alloc(struct fi_info *info) static void efa_ep_destroy(struct efa_ep *ep) { + if (ep->self_ah) + ibv_destroy_ah(ep->self_ah); + efa_ep_destroy_qp(ep->qp); fi_freeinfo(ep->info); free(ep->src_addr); + if (ofi_endpoint_close(&ep->util_ep)) + FI_WARN(&efa_prov, FI_LOG_EP_CTRL, "Unable to close util EP\n"); free(ep); } @@ -410,7 +250,10 @@ static int efa_ep_close(fid_t fid) { struct efa_ep *ep; - ep = container_of(fid, struct efa_ep, ep_fid.fid); + ep = container_of(fid, struct efa_ep, util_ep.ep_fid.fid); + + ofi_bufpool_destroy(ep->recv_wr_pool); + ofi_bufpool_destroy(ep->send_wr_pool); efa_ep_destroy(ep); return 0; @@ -421,9 +264,11 @@ static int efa_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) struct efa_ep *ep; struct efa_cq *cq; struct efa_av *av; + struct util_eq *eq; + struct util_cntr *cntr; int ret; - ep = 
container_of(fid, struct efa_ep, ep_fid.fid); + ep = container_of(fid, struct efa_ep, util_ep.ep_fid.fid); ret = ofi_ep_bind_valid(&efa_prov, bfid, flags); if (ret) return ret; @@ -440,10 +285,14 @@ static int efa_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) if (!(flags & (FI_RECV | FI_TRANSMIT))) return -FI_EBADFLAGS; - cq = container_of(bfid, struct efa_cq, cq_fid); + cq = container_of(bfid, struct efa_cq, util_cq.cq_fid); if (ep->domain != cq->domain) return -FI_EINVAL; + ret = ofi_ep_bind_cq(&ep->util_ep, &cq->util_cq, flags); + if (ret) + return ret; + if (flags & FI_RECV) { if (ep->rcq) return -EINVAL; @@ -456,7 +305,7 @@ static int efa_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) } break; case FI_CLASS_AV: - av = container_of(bfid, struct efa_av, av_fid.fid); + av = container_of(bfid, struct efa_av, util_av.av_fid.fid); if (ep->domain != av->domain) { EFA_WARN(FI_LOG_EP_CTRL, "Address vector doesn't belong to same domain as EP.\n"); @@ -471,6 +320,20 @@ static int efa_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) ep->av->ep = ep; break; + case FI_CLASS_CNTR: + cntr = container_of(bfid, struct util_cntr, cntr_fid.fid); + + ret = ofi_ep_bind_cntr(&ep->util_ep, cntr, flags); + if (ret) + return ret; + break; + case FI_CLASS_EQ: + eq = container_of(bfid, struct util_eq, eq_fid.fid); + + ret = ofi_ep_bind_eq(&ep->util_ep, eq); + if (ret) + return ret; + break; default: return -EINVAL; } @@ -480,7 +343,7 @@ static int efa_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) static int efa_ep_getflags(struct fid_ep *ep_fid, uint64_t *flags) { - struct efa_ep *ep = container_of(ep_fid, struct efa_ep, ep_fid); + struct efa_ep *ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid); struct fi_tx_attr *tx_attr = ep->info->tx_attr; struct fi_rx_attr *rx_attr = ep->info->rx_attr; @@ -500,7 +363,7 @@ static int efa_ep_getflags(struct fid_ep *ep_fid, uint64_t *flags) static int efa_ep_setflags(struct fid_ep *ep_fid, uint64_t flags) { - struct efa_ep *ep = container_of(ep_fid, struct efa_ep, ep_fid); + struct efa_ep *ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid); struct fi_tx_attr *tx_attr = ep->info->tx_attr; struct fi_rx_attr *rx_attr = ep->info->rx_attr; @@ -521,14 +384,35 @@ static int efa_ep_setflags(struct fid_ep *ep_fid, uint64_t flags) return 0; } +/* efa_ep_create_self_ah() create an address handler for + * an EP's own address. The address handler is used by + * an EP to read from itself. It is used to + * copy data from host memory to GPU memory. + */ +static inline +int efa_ep_create_self_ah(struct efa_ep *ep, struct ibv_pd *ibv_pd) +{ + struct ibv_ah_attr ah_attr; + struct efa_ep_addr *self_addr; + + self_addr = (struct efa_ep_addr *)ep->src_addr; + + memset(&ah_attr, 0, sizeof(ah_attr)); + ah_attr.port_num = 1; + ah_attr.is_global = 1; + memcpy(ah_attr.grh.dgid.raw, self_addr->raw, sizeof(self_addr->raw)); + ep->self_ah = ibv_create_ah(ibv_pd, &ah_attr); + return ep->self_ah ? 
0 : -FI_EINVAL; +} + static int efa_ep_enable(struct fid_ep *ep_fid) { - struct ibv_qp_init_attr attr = { 0 }; + struct ibv_qp_init_attr_ex attr_ex = { 0 }; const struct fi_info *efa_info; + struct ibv_pd *ibv_pd; struct efa_ep *ep; - struct efa_pd *pd; - - ep = container_of(ep_fid, struct efa_ep, ep_fid); + int err; + ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid); if (!ep->scq && !ep->rcq) { EFA_WARN(FI_LOG_EP_CTRL, @@ -555,29 +439,53 @@ static int efa_ep_enable(struct fid_ep *ep_fid) } if (ep->scq) { - attr.cap.max_send_wr = ep->info->tx_attr->size; - attr.cap.max_send_sge = ep->info->tx_attr->iov_limit; - attr.send_cq = &ep->scq->ibv_cq; - pd = ep->scq->domain->pd; + attr_ex.cap.max_send_wr = ep->info->tx_attr->size; + attr_ex.cap.max_send_sge = ep->info->tx_attr->iov_limit; + attr_ex.send_cq = ep->scq->ibv_cq; + ibv_pd = ep->scq->domain->ibv_pd; } else { - attr.send_cq = &ep->rcq->ibv_cq; - pd = ep->rcq->domain->pd; + attr_ex.send_cq = ep->rcq->ibv_cq; + ibv_pd = ep->rcq->domain->ibv_pd; } if (ep->rcq) { - attr.cap.max_recv_wr = ep->info->rx_attr->size; - attr.cap.max_recv_sge = ep->info->rx_attr->iov_limit; - attr.recv_cq = &ep->rcq->ibv_cq; + attr_ex.cap.max_recv_wr = ep->info->rx_attr->size; + attr_ex.cap.max_recv_sge = ep->info->rx_attr->iov_limit; + attr_ex.recv_cq = ep->rcq->ibv_cq; } else { - attr.recv_cq = &ep->scq->ibv_cq; + attr_ex.recv_cq = ep->scq->ibv_cq; } - attr.cap.max_inline_data = pd->context->inject_size; - attr.qp_type = ep->domain->rdm ? IBV_QPT_DRIVER : IBV_QPT_UD; - attr.sq_sig_all = 0; - attr.qp_context = ep; + attr_ex.cap.max_inline_data = ep->domain->ctx->inline_buf_size; - return efa_ep_create_qp(ep, pd, &attr); + if (ep->domain->type == EFA_DOMAIN_RDM) { + attr_ex.qp_type = IBV_QPT_DRIVER; + attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS; + attr_ex.send_ops_flags = IBV_QP_EX_WITH_SEND; + if (efa_ep_support_rdma_read(&ep->util_ep.ep_fid)) + attr_ex.send_ops_flags |= IBV_QP_EX_WITH_RDMA_READ; + attr_ex.pd = ibv_pd; + } else { + attr_ex.qp_type = IBV_QPT_UD; + attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; + attr_ex.pd = ibv_pd; + } + + attr_ex.qp_context = ep; + attr_ex.sq_sig_all = 1; + + err = efa_ep_create_qp_ex(ep, ibv_pd, &attr_ex); + if (err) + return err; + + err = efa_ep_create_self_ah(ep, ibv_pd); + if (err) { + EFA_WARN(FI_LOG_EP_CTRL, + "Endpoint cannot create ah for its own address\n"); + efa_ep_destroy_qp(ep->qp); + } + + return err; } static int efa_ep_control(struct fid *fid, int command, void *arg) @@ -611,6 +519,106 @@ static struct fi_ops efa_ep_ops = { .ops_open = fi_no_ops_open, }; +static void efa_ep_progress_internal(struct efa_ep *ep, struct efa_cq *efa_cq) +{ + struct util_cq *cq; + struct fi_cq_tagged_entry cq_entry[EFA_CQ_PROGRESS_ENTRIES]; + struct fi_cq_tagged_entry *temp_cq_entry; + struct fi_cq_err_entry cq_err_entry; + fi_addr_t src_addr[EFA_CQ_PROGRESS_ENTRIES]; + uint64_t flags; + int i; + ssize_t ret, err; + + cq = &efa_cq->util_cq; + flags = ep->util_ep.caps; + + VALGRIND_MAKE_MEM_DEFINED(&cq_entry, sizeof(cq_entry)); + + ret = efa_cq_readfrom(&cq->cq_fid, cq_entry, EFA_CQ_PROGRESS_ENTRIES, + (flags & FI_SOURCE) ? 
src_addr : NULL); + if (ret == -FI_EAGAIN) + return; + + if (OFI_UNLIKELY(ret < 0)) { + if (OFI_UNLIKELY(ret != -FI_EAVAIL)) { + EFA_WARN(FI_LOG_CQ, "no error available errno: %ld\n", ret); + efa_eq_write_error(&ep->util_ep, FI_EOTHER, ret); + return; + } + + err = efa_cq_readerr(&cq->cq_fid, &cq_err_entry, flags); + if (OFI_UNLIKELY(err < 0)) { + EFA_WARN(FI_LOG_CQ, "unable to read error entry errno: %ld\n", err); + efa_eq_write_error(&ep->util_ep, FI_EOTHER, err); + return; + } + + ofi_cq_write_error(cq, &cq_err_entry); + return; + } + + temp_cq_entry = (struct fi_cq_tagged_entry *)cq_entry; + for (i = 0; i < ret; i++) { + (flags & FI_SOURCE) ? + ofi_cq_write_src(cq, temp_cq_entry->op_context, + temp_cq_entry->flags, + temp_cq_entry->len, + temp_cq_entry->buf, + temp_cq_entry->data, + temp_cq_entry->tag, + src_addr[i]) : + ofi_cq_write(cq, temp_cq_entry->op_context, + temp_cq_entry->flags, + temp_cq_entry->len, + temp_cq_entry->buf, + temp_cq_entry->data, + temp_cq_entry->tag); + + temp_cq_entry = (struct fi_cq_tagged_entry *) + ((uint8_t *)temp_cq_entry + efa_cq->entry_size); + } + return; +} + +void efa_ep_progress(struct util_ep *ep) +{ + struct efa_ep *efa_ep; + struct efa_cq *rcq; + struct efa_cq *scq; + + efa_ep = container_of(ep, struct efa_ep, util_ep); + rcq = efa_ep->rcq; + scq = efa_ep->scq; + + fastlock_acquire(&ep->lock); + + if (rcq) + efa_ep_progress_internal(efa_ep, rcq); + + if (scq && scq != rcq) + efa_ep_progress_internal(efa_ep, scq); + + fastlock_release(&ep->lock); +} + +static struct fi_ops_atomic efa_ep_atomic_ops = { + .size = sizeof(struct fi_ops_atomic), + .write = fi_no_atomic_write, + .writev = fi_no_atomic_writev, + .writemsg = fi_no_atomic_writemsg, + .inject = fi_no_atomic_inject, + .readwrite = fi_no_atomic_readwrite, + .readwritev = fi_no_atomic_readwritev, + .readwritemsg = fi_no_atomic_readwritemsg, + .compwrite = fi_no_atomic_compwrite, + .compwritev = fi_no_atomic_compwritev, + .compwritemsg = fi_no_atomic_compwritemsg, + .writevalid = fi_no_atomic_writevalid, + .readwritevalid = fi_no_atomic_readwritevalid, + .compwritevalid = fi_no_atomic_compwritevalid, +}; + int efa_ep_open(struct fid_domain *domain_fid, struct fi_info *info, struct fid_ep **ep_fid, void *context) { @@ -623,8 +631,8 @@ int efa_ep_open(struct fid_domain *domain_fid, struct fi_info *info, util_domain.domain_fid); if (!info || !info->ep_attr || !info->domain_attr || - strncmp(domain->ctx->ibv_ctx.device->name, info->domain_attr->name, - strlen(domain->ctx->ibv_ctx.device->name))) { + strncmp(domain->ctx->ibv_ctx->device->name, info->domain_attr->name, + strlen(domain->ctx->ibv_ctx->device->name))) { EFA_INFO(FI_LOG_DOMAIN, "Invalid info->domain_attr->name\n"); return -FI_EINVAL; } @@ -658,30 +666,55 @@ int efa_ep_open(struct fid_domain *domain_fid, struct fi_info *info, if (!ep) return -FI_ENOMEM; + ret = ofi_endpoint_init(domain_fid, &efa_util_prov, info, &ep->util_ep, + context, efa_ep_progress); + if (ret) + goto err_ep_destroy; + + ret = ofi_bufpool_create(&ep->send_wr_pool, + sizeof(struct efa_send_wr) + + info->tx_attr->iov_limit * sizeof(struct ibv_sge), + 16, 0, 1024, 0); + if (ret) + goto err_ep_destroy; + + ret = ofi_bufpool_create(&ep->recv_wr_pool, + sizeof(struct efa_recv_wr) + + info->rx_attr->iov_limit * sizeof(struct ibv_sge), + 16, 0, 1024, 0); + if (ret) + goto err_send_wr_destroy; + ep->domain = domain; - ep->ep_fid.fid.fclass = FI_CLASS_EP; - ep->ep_fid.fid.context = context; - ep->ep_fid.fid.ops = &efa_ep_ops; - ep->ep_fid.ops = &efa_ep_base_ops; - 
ep->ep_fid.msg = &efa_ep_msg_ops; - ep->ep_fid.cm = &efa_ep_cm_ops; - ep->ep_fid.rma = NULL; - ep->ep_fid.atomic = NULL; + ep->xmit_more_wr_tail = &ep->xmit_more_wr_head; + ep->recv_more_wr_tail = &ep->recv_more_wr_head; if (info->src_addr) { ep->src_addr = (void *)calloc(1, EFA_EP_ADDR_LEN); if (!ep->src_addr) { ret = -FI_ENOMEM; - goto err; + goto err_recv_wr_destroy; } memcpy(ep->src_addr, info->src_addr, info->src_addrlen); } - *ep_fid = &ep->ep_fid; + *ep_fid = &ep->util_ep.ep_fid; + (*ep_fid)->fid.fclass = FI_CLASS_EP; + (*ep_fid)->fid.context = context; + (*ep_fid)->fid.ops = &efa_ep_ops; + (*ep_fid)->ops = &efa_ep_base_ops; + (*ep_fid)->msg = &efa_ep_msg_ops; + (*ep_fid)->cm = &efa_ep_cm_ops; + (*ep_fid)->rma = &efa_ep_rma_ops; + (*ep_fid)->atomic = &efa_ep_atomic_ops; return 0; -err: +err_recv_wr_destroy: + ofi_bufpool_destroy(ep->recv_wr_pool); +err_send_wr_destroy: + ofi_bufpool_destroy(ep->send_wr_pool); +err_ep_destroy: efa_ep_destroy(ep); return ret; } diff --git a/prov/efa/src/efa_fabric.c b/prov/efa/src/efa_fabric.c index ed4cc449e1a..b8233343386 100644 --- a/prov/efa/src/efa_fabric.c +++ b/prov/efa/src/efa_fabric.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2014-2016, Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2017-2018 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -40,6 +40,8 @@ #include #include +#include + #include #include #include @@ -50,16 +52,20 @@ #include #include "efa.h" -#include "efa_ib.h" -#include "efa_io_defs.h" -#include "efa_verbs.h" +#if HAVE_EFA_DL +#include +#endif #define EFA_FABRIC_PREFIX "EFA-" #define EFA_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM) -#define EFA_RDM_CAPS (FI_MSG | FI_RECV | FI_SEND | FI_SOURCE | EFA_DOMAIN_CAPS) -#define EFA_DGRM_CAPS (FI_MSG | FI_RECV | FI_SEND | FI_SOURCE | EFA_DOMAIN_CAPS) +#define EFA_RDM_TX_CAPS (OFI_TX_MSG_CAPS) +#define EFA_RDM_RX_CAPS (OFI_RX_MSG_CAPS | FI_SOURCE) +#define EFA_DGRM_TX_CAPS (OFI_TX_MSG_CAPS) +#define EFA_DGRM_RX_CAPS (OFI_RX_MSG_CAPS | FI_SOURCE) +#define EFA_RDM_CAPS (EFA_RDM_TX_CAPS | EFA_RDM_RX_CAPS | EFA_DOMAIN_CAPS) +#define EFA_DGRM_CAPS (EFA_DGRM_TX_CAPS | EFA_DGRM_RX_CAPS | EFA_DOMAIN_CAPS) #define EFA_TX_OP_FLAGS (FI_TRANSMIT_COMPLETE) @@ -72,13 +78,12 @@ #define EFA_NO_DEFAULT -1 -#define EFA_DEF_MR_CACHE_ENABLE 0 -#define EFA_DEF_MR_CACHE_MERGE_REGIONS 1 +#define EFA_DEF_MR_CACHE_ENABLE 1 int efa_mr_cache_enable = EFA_DEF_MR_CACHE_ENABLE; -int efa_mr_cache_merge_regions = EFA_DEF_MR_CACHE_MERGE_REGIONS; size_t efa_mr_max_cached_count; size_t efa_mr_max_cached_size; +int efa_set_rdmav_hugepages_safe = 0; static void efa_addr_to_str(const uint8_t *raw_addr, char *str); static int efa_get_addr(struct efa_context *ctx, void *src_addr); @@ -87,7 +92,7 @@ const struct fi_fabric_attr efa_fabric_attr = { .fabric = NULL, .name = NULL, .prov_name = NULL, - .prov_version = EFA_PROV_VERS, + .prov_version = OFI_VERSION_DEF_PROV, }; const struct fi_domain_attr efa_domain_attr = { @@ -96,9 +101,8 @@ const struct fi_domain_attr efa_domain_attr = { .control_progress = FI_PROGRESS_AUTO, .data_progress = FI_PROGRESS_AUTO, .resource_mgmt = FI_RM_DISABLED, - .mr_mode = OFI_MR_BASIC_MAP | FI_MR_LOCAL | FI_MR_BASIC, - .mr_key_size = sizeof_field(struct efa_io_tx_buf_desc, lkey), + .mr_key_size = sizeof_field(struct ibv_sge, lkey), .cq_data_size = 0, .tx_ctx_cnt = 
1024, .rx_ctx_cnt = 1024, @@ -118,6 +122,7 @@ const struct fi_ep_attr efa_ep_attr = { }; const struct fi_rx_attr efa_dgrm_rx_attr = { + .caps = EFA_DGRM_RX_CAPS, .mode = FI_MSG_PREFIX | EFA_RX_MODE, .op_flags = EFA_RX_DGRM_OP_FLAGS, .msg_order = EFA_MSG_ORDER, @@ -127,6 +132,7 @@ const struct fi_rx_attr efa_dgrm_rx_attr = { }; const struct fi_rx_attr efa_rdm_rx_attr = { + .caps = EFA_RDM_RX_CAPS, .mode = EFA_RX_MODE, .op_flags = EFA_RX_RDM_OP_FLAGS, .msg_order = EFA_MSG_ORDER, @@ -136,6 +142,7 @@ const struct fi_rx_attr efa_rdm_rx_attr = { }; const struct fi_tx_attr efa_dgrm_tx_attr = { + .caps = EFA_DGRM_TX_CAPS, .mode = FI_MSG_PREFIX, .op_flags = EFA_TX_OP_FLAGS, .msg_order = EFA_MSG_ORDER, @@ -145,12 +152,13 @@ const struct fi_tx_attr efa_dgrm_tx_attr = { }; const struct fi_tx_attr efa_rdm_tx_attr = { + .caps = EFA_RDM_TX_CAPS, .mode = 0, .op_flags = EFA_TX_OP_FLAGS, .msg_order = EFA_MSG_ORDER, .comp_order = FI_ORDER_NONE, .inject_size = 0, - .rma_iov_limit = 0, + .rma_iov_limit = 1, }; const struct efa_ep_domain efa_rdm_domain = { @@ -239,20 +247,31 @@ static int efa_check_hints(uint32_t version, const struct fi_info *hints, return 0; } -static int efa_alloc_qp_table(struct efa_context *ctx, size_t ep_cnt) +static char *get_sysfs_path(void) { - ctx->qp_table = calloc(ep_cnt, sizeof(*ctx->qp_table)); - if (!ctx->qp_table) - return -FI_ENOMEM; - pthread_mutex_init(&ctx->qp_table_mutex, NULL); + char *env = NULL; + char *sysfs_path = NULL; + int len; + + /* + * Only follow use path passed in through the calling user's + * environment if we're not running SUID. + */ + if (getuid() == geteuid()) + env = getenv("SYSFS_PATH"); + + if (env) { + sysfs_path = strndup(env, IBV_SYSFS_PATH_MAX); + len = strlen(sysfs_path); + while (len > 0 && sysfs_path[len - 1] == '/') { + --len; + sysfs_path[len] = '\0'; + } + } else { + sysfs_path = strndup("/sys", IBV_SYSFS_PATH_MAX); + } - return FI_SUCCESS; -} - -static void efa_free_qp_table(struct efa_context *ctx) -{ - pthread_mutex_destroy(&ctx->qp_table_mutex); - free(ctx->qp_table); + return sysfs_path; } static int efa_alloc_fid_nic(struct fi_info *fi, struct efa_context *ctx, @@ -285,7 +304,7 @@ static int efa_alloc_fid_nic(struct fi_info *fi, struct efa_context *ctx, link_attr = fi->nic->link_attr; /* fi_device_attr */ - device_attr->name = strdup(ctx->ibv_ctx.device->name); + device_attr->name = strdup(ctx->ibv_ctx->device->name); if (!device_attr->name) { ret = -FI_ENOMEM; goto err_free_nic; @@ -325,7 +344,7 @@ static int efa_alloc_fid_nic(struct fi_info *fi, struct efa_context *ctx, } ret = asprintf(&driver_sym_path, "%s%s", - ctx->ibv_ctx.device->ibdev_path, "/device/driver"); + ctx->ibv_ctx->device->ibdev_path, "/device/driver"); if (ret < 0) { ret = -FI_ENOMEM; goto err_free_sysfs; @@ -359,7 +378,7 @@ static int efa_alloc_fid_nic(struct fi_info *fi, struct efa_context *ctx, /* fi_pci_attr */ ret = asprintf(&dbdf_sym_path, "%s%s", - ctx->ibv_ctx.device->ibdev_path, "/device"); + ctx->ibv_ctx->device->ibdev_path, "/device"); if (ret < 0) { ret = -FI_ENOMEM; goto err_free_driver_sym; @@ -405,9 +424,9 @@ static int efa_alloc_fid_nic(struct fi_info *fi, struct efa_context *ctx, efa_addr_to_str(src_addr, link_attr->address); - link_attr->mtu = port_attr->max_msg_sz; - - link_attr->speed = 0; + link_attr->mtu = port_attr->max_msg_sz - rxr_pkt_max_header_size(); + link_attr->speed = ofi_vrb_speed(port_attr->active_speed, + port_attr->active_width); switch (port_attr->state) { case IBV_PORT_DOWN: @@ -447,20 +466,91 @@ static int efa_alloc_fid_nic(struct 
fi_info *fi, struct efa_context *ctx, return ret; } +#if HAVE_LIBCUDA +/* + * efa_get_gdr_support() check if GPUDirect RDMA is supported by + * reading from sysfs file "class/infiniband//gdr" + * and set content of gdr_support accordingly. + * + * Return value: + * return 1 if sysfs file exist and has 1 in it. + * return 0 if sysfs file does not exist or has 0 in it. + * return a negatie value if error happened. + */ +static int efa_get_gdr_support(char *device_name) +{ + static const int MAX_GDR_SUPPORT_STRLEN = 8; + char *gdr_path = NULL; + char gdr_support_str[MAX_GDR_SUPPORT_STRLEN]; + int ret, read_len; + + ret = asprintf(&gdr_path, "class/infiniband/%s/device/gdr", device_name); + if (ret < 0) { + EFA_INFO_ERRNO(FI_LOG_FABRIC, "asprintf to build sysfs file name failed", ret); + goto out; + } + + ret = fi_read_file(get_sysfs_path(), gdr_path, + gdr_support_str, MAX_GDR_SUPPORT_STRLEN); + if (ret < 0) { + if (errno == ENOENT) { + /* sysfs file does not exist, gdr is not supported */ + ret = 0; + } + + goto out; + } + + if (ret == 0) { + EFA_WARN(FI_LOG_FABRIC, "Sysfs file %s is empty\n", gdr_path); + ret = -FI_EINVAL; + goto out; + } + + read_len = MIN(ret, MAX_GDR_SUPPORT_STRLEN); + ret = (0 == strncmp(gdr_support_str, "1", read_len)); +out: + free(gdr_path); + return ret; +} +#endif + static int efa_get_device_attrs(struct efa_context *ctx, struct fi_info *info) { + struct efadv_device_attr efadv_attr; struct efa_device_attr device_attr; struct ibv_device_attr *base_attr; struct ibv_port_attr port_attr; int ret; + memset(&efadv_attr, 0, sizeof(efadv_attr)); + memset(&device_attr, 0, sizeof(device_attr)); + base_attr = &device_attr.ibv_attr; - ret = efa_cmd_query_device(ctx, &device_attr); + ret = -ibv_query_device(ctx->ibv_ctx, base_attr); if (ret) { - EFA_INFO_ERRNO(FI_LOG_FABRIC, "efa_verbs_query_device_ex", ret); + EFA_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_device", ret); return ret; } + ret = -efadv_query_device(ctx->ibv_ctx, &efadv_attr, + sizeof(efadv_attr)); + if (ret) { + EFA_INFO_ERRNO(FI_LOG_FABRIC, "efadv_query_device", ret); + return ret; + } + + ctx->inline_buf_size = efadv_attr.inline_buf_size; + ctx->max_wr_rdma_sge = base_attr->max_sge_rd; + +#ifdef HAVE_RDMA_SIZE + ctx->max_rdma_size = efadv_attr.max_rdma_size; + ctx->device_caps = efadv_attr.device_caps; +#else + ctx->max_rdma_size = 0; + ctx->device_caps = 0; +#endif + ctx->max_mr_size = base_attr->max_mr_size; info->domain_attr->cq_cnt = base_attr->max_cq; info->domain_attr->ep_cnt = base_attr->max_qp; @@ -471,6 +561,23 @@ static int efa_get_device_attrs(struct efa_context *ctx, struct fi_info *info) info->domain_attr->resource_mgmt = FI_RM_DISABLED; info->domain_attr->mr_cnt = base_attr->max_mr; +#if HAVE_LIBCUDA + if (info->ep_attr->type == FI_EP_RDM) { + ret = efa_get_gdr_support(ctx->ibv_ctx->device->name); + if (ret < 0) { + EFA_WARN(FI_LOG_FABRIC, "get gdr support failed!\n"); + return ret; + } + + if (ret == 1) { + info->caps |= FI_HMEM; + info->tx_attr->caps |= FI_HMEM; + info->rx_attr->caps |= FI_HMEM; + info->domain_attr->mr_mode |= FI_MR_HMEM; + } + } +#endif + EFA_DBG(FI_LOG_DOMAIN, "Domain attribute :\n" "\t info->domain_attr->cq_cnt = %zu\n" "\t info->domain_attr->ep_cnt = %zu\n" @@ -485,25 +592,39 @@ static int efa_get_device_attrs(struct efa_context *ctx, struct fi_info *info) info->domain_attr->max_ep_tx_ctx, info->domain_attr->max_ep_rx_ctx); - info->tx_attr->iov_limit = device_attr.max_sq_sge; - info->tx_attr->size = align_down_to_power_of_2(MIN(device_attr.max_sq_wr, - ctx->max_llq_size / 
sizeof(struct efa_io_tx_wqe))); - info->rx_attr->iov_limit = device_attr.max_rq_sge; - info->rx_attr->size = align_down_to_power_of_2(device_attr.max_rq_wr / info->rx_attr->iov_limit); + info->tx_attr->iov_limit = efadv_attr.max_sq_sge; + info->tx_attr->size = align_down_to_power_of_2(efadv_attr.max_sq_wr); + if (info->ep_attr->type == FI_EP_RDM) { + info->tx_attr->inject_size = efadv_attr.inline_buf_size; + } else if (info->ep_attr->type == FI_EP_DGRAM) { + /* + * Currently, there is no mechanism for EFA layer (lower layer) + * to discard completions internally and FI_INJECT is not optional, + * it can only be disabled by setting inject_size to 0. RXR + * layer does not have this issue as completions can be read from + * the EFA layer and discarded in the RXR layer. For dgram + * endpoint, inject size needs to be set to 0 + */ + info->tx_attr->inject_size = 0; + } + info->rx_attr->iov_limit = efadv_attr.max_rq_sge; + info->rx_attr->size = align_down_to_power_of_2(efadv_attr.max_rq_wr / info->rx_attr->iov_limit); EFA_DBG(FI_LOG_DOMAIN, "Tx/Rx attribute :\n" "\t info->tx_attr->iov_limit = %zu\n" "\t info->tx_attr->size = %zu\n" + "\t info->tx_attr->inject_size = %zu\n" "\t info->rx_attr->iov_limit = %zu\n" "\t info->rx_attr->size = %zu\n", info->tx_attr->iov_limit, info->tx_attr->size, + info->tx_attr->inject_size, info->rx_attr->iov_limit, info->rx_attr->size); - ret = efa_cmd_query_port(ctx, 1, &port_attr); + ret = -ibv_query_port(ctx->ibv_ctx, 1, &port_attr); if (ret) { - EFA_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_port", errno); + EFA_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_port", ret); return ret; } @@ -511,11 +632,6 @@ static int efa_get_device_attrs(struct efa_context *ctx, struct fi_info *info) info->ep_attr->max_order_raw_size = port_attr.max_msg_sz; info->ep_attr->max_order_waw_size = port_attr.max_msg_sz; - EFA_DBG(FI_LOG_DOMAIN, "Internal attributes:\n" - "\tinject size = %" PRIu16 "\n" - "\tsub_cqs_per_cq = %" PRIu16 "\n", - ctx->inject_size, ctx->sub_cqs_per_cq); - /* Set fid nic attributes. 
*/ ret = efa_alloc_fid_nic(info, ctx, &device_attr, &port_attr); if (ret) { @@ -560,9 +676,9 @@ static int efa_get_addr(struct efa_context *ctx, void *src_addr) union ibv_gid gid; int ret; - ret = efa_cmd_query_gid(ctx, 1, 0, &gid); + ret = ibv_query_gid(ctx->ibv_ctx, 1, 0, &gid); if (ret) { - EFA_INFO_ERRNO(FI_LOG_FABRIC, "efa_cmd_query_gid", errno); + EFA_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_gid", ret); return ret; } @@ -601,17 +717,14 @@ static int efa_alloc_info(struct efa_context *ctx, struct fi_info **info, fi->ep_attr->protocol = FI_PROTO_EFA; fi->ep_attr->type = ep_dom->type; - fi->tx_attr->caps = ep_dom->caps; - fi->rx_attr->caps = ep_dom->caps; ret = efa_get_device_attrs(ctx, fi); if (ret) goto err_free_info; - ret = efa_cmd_query_gid(ctx, 1, 0, &gid); + ret = ibv_query_gid(ctx->ibv_ctx, 1, 0, &gid); if (ret) { - EFA_INFO_ERRNO(FI_LOG_FABRIC, "efa_cmd_query_gid", errno); - ret = -errno; + EFA_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_gid", ret); goto err_free_info; } @@ -624,7 +737,7 @@ static int efa_alloc_info(struct efa_context *ctx, struct fi_info **info, } efa_addr_to_str(gid.raw, fi->fabric_attr->name); - name_len = strlen(ctx->ibv_ctx.device->name) + strlen(ep_dom->suffix); + name_len = strlen(ctx->ibv_ctx->device->name) + strlen(ep_dom->suffix); fi->domain_attr->name = malloc(name_len + 1); if (!fi->domain_attr->name) { ret = -FI_ENOMEM; @@ -632,7 +745,7 @@ static int efa_alloc_info(struct efa_context *ctx, struct fi_info **info, } snprintf(fi->domain_attr->name, name_len + 1, "%s%s", - ctx->ibv_ctx.device->name, ep_dom->suffix); + ctx->ibv_ctx->device->name, ep_dom->suffix); fi->domain_attr->name[name_len] = '\0'; fi->addr_format = FI_ADDR_EFA; @@ -741,6 +854,7 @@ static int efa_set_fi_address(const char *node, const char *service, uint64_t fl struct efa_ep_addr tmp_addr; void *dest_addr = NULL; int ret = FI_SUCCESS; + struct fi_info *cur; if (flags & FI_SOURCE) { if (hints && hints->dest_addr) @@ -757,15 +871,17 @@ static int efa_set_fi_address(const char *node, const char *service, uint64_t fl } if (dest_addr) { - fi->dest_addr = malloc(EFA_EP_ADDR_LEN); - if (!fi->dest_addr) - return -FI_ENOMEM; - memcpy(fi->dest_addr, dest_addr, EFA_EP_ADDR_LEN); + for (cur = fi; cur; cur = cur->next) { + cur->dest_addr = malloc(EFA_EP_ADDR_LEN); + if (!cur->dest_addr) { + for (; fi->dest_addr; fi = fi->next) + free(fi->dest_addr); + return -FI_ENOMEM; + } + memcpy(cur->dest_addr, dest_addr, EFA_EP_ADDR_LEN); + cur->dest_addrlen = EFA_EP_ADDR_LEN; + } } - - if (fi->dest_addr) - fi->dest_addrlen = EFA_EP_ADDR_LEN; - return ret; } @@ -834,6 +950,48 @@ static struct fi_ops_fabric efa_ops_fabric = { .trywait = ofi_trywait }; +static +void efa_atfork_callback() +{ + static int visited = 0; + + if (visited) + return; + + visited = 1; + if (getenv("RDMAV_FORK_SAFE") || getenv("IBV_FORK_SAFE") ) + return; + + fprintf(stderr, + "A process has executed an operation involving a call\n" + "to the fork() system call to create a child process.\n" + "\n" + "As a result, the libfabric EFA provider is operating in\n" + "a condition that could result in memory corruption or\n" + "other system errors.\n" + "\n" + "For the libfabric EFA provider to work safely when fork()\n" + "is called, the application must handle memory registrations\n" + "(FI_MR_LOCAL) and you will need to set the following environment\n" + "variables:\n" + " RDMAV_FORK_SAFE=1\n" + "MPI applications do not support this mode.\n" + "\n" + "However, this setting can result in signficant performance\n" + "impact to your application due to 
increased cost of memory\n" + "registration.\n" + "\n" + "You may want to check with your application vendor to see\n" + "if an application-level alternative (of not using fork)\n" + "exists.\n" + "\n" + "Please refer to https://github.com/ofiwg/libfabric/issues/6332\n" + "for more information.\n" + "\n" + "Your job will now abort.\n"); + abort(); +} + int efa_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric_fid, void *context) { @@ -841,6 +999,14 @@ int efa_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric_fid, struct efa_fabric *fab; int ret = 0; + ret = pthread_atfork(efa_atfork_callback, NULL, NULL); + if (ret) { + EFA_WARN(FI_LOG_FABRIC, + "Unable to register atfork callback: %s\n", + strerror(-ret)); + return -ret; + } + fab = calloc(1, sizeof(*fab)); if (!fab) return -FI_ENOMEM; @@ -865,31 +1031,29 @@ int efa_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric_fid, return 0; } -static void efa_dealloc_ctx(struct efa_context *ctx) -{ - efa_free_qp_table(ctx); -} - static void fi_efa_fini(void) { struct efa_context **ctx_list; int num_devices; - int i; + + if (efa_set_rdmav_hugepages_safe) + unsetenv("RDMAV_HUGEPAGES_SAFE"); fi_freeinfo((void *)efa_util_prov.info); efa_util_prov.info = NULL; ctx_list = efa_device_get_context_list(&num_devices); - for (i = 0; i < num_devices; i++) - efa_dealloc_ctx(ctx_list[i]); efa_device_free_context_list(ctx_list); efa_device_free(); +#if HAVE_EFA_DL + smr_cleanup(); +#endif } struct fi_provider efa_prov = { .name = EFA_PROV_NAME, - .version = EFA_PROV_VERS, - .fi_version = FI_VERSION(1, 8), + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, .getinfo = efa_getinfo, .fabric = efa_fabric, .cleanup = fi_efa_fini @@ -934,9 +1098,7 @@ static int efa_init_info(const struct fi_info **all_infos) continue; } - ret = efa_alloc_qp_table(ctx_list[i], tail->domain_attr->ep_cnt); - if (!ret) - retv = 0; + retv = 0; } efa_device_free_context_list(ctx_list); @@ -947,6 +1109,25 @@ static int efa_init_info(const struct fi_info **all_infos) struct fi_provider *init_lower_efa_prov() { + int err; + + if (!getenv("RDMAV_HUGEPAGES_SAFE")) { + /* + * Setting RDMAV_HUGEPAGES_SAFE alone will not impact + * application performance, because rdma-core will only + * check this environment variable when either + * RDMAV_FORK_SAFE or IBV_FORK_SAFE is set. + */ + err = setenv("RDMAV_HUGEPAGES_SAFE", "1", 1); + if (err) { + EFA_WARN(FI_LOG_FABRIC, + "Unable to set environment variable RDMAV_HUGEPAGES_SAFE\n"); + return NULL; + } + + efa_set_rdmav_hugepages_safe = 1; + } + if (efa_init_info(&efa_util_prov.info)) return NULL; diff --git a/prov/efa/src/efa_mr.c b/prov/efa/src/efa_mr.c index b9f72fd97c1..f89eb34bc34 100644 --- a/prov/efa/src/efa_mr.c +++ b/prov/efa/src/efa_mr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -33,14 +33,18 @@ #include "config.h" #include #include "efa.h" -#include "efa_verbs.h" + +static int efa_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, + uint64_t flags, struct fid_mr **mr_fid); +static int efa_mr_reg_impl(struct efa_mr *efa_mr, uint64_t flags, void *attr); +static int efa_mr_dereg_impl(struct efa_mr *efa_mr); static int efa_mr_cache_close(fid_t fid) { - struct efa_mem_desc *mr = container_of(fid, struct efa_mem_desc, + struct efa_mr *efa_mr = container_of(fid, struct efa_mr, mr_fid.fid); - ofi_mr_cache_delete(&mr->domain->cache, mr->entry); + ofi_mr_cache_delete(efa_mr->domain->cache, efa_mr->entry); return 0; } @@ -56,81 +60,132 @@ static struct fi_ops efa_mr_cache_ops = { int efa_mr_cache_entry_reg(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) { - int fi_ibv_access = IBV_ACCESS_LOCAL_WRITE; - - struct efa_mem_desc *md = (struct efa_mem_desc *)entry->data; - - md->domain = container_of(cache->domain, struct efa_domain, - util_domain); - md->mr_fid.fid.ops = &efa_mr_cache_ops; - - md->mr_fid.fid.fclass = FI_CLASS_MR; - md->mr_fid.fid.context = NULL; - - md->mr = efa_cmd_reg_mr(md->domain->pd, entry->info.iov.iov_base, - entry->info.iov.iov_len, fi_ibv_access); - if (!md->mr) { - EFA_WARN_ERRNO(FI_LOG_MR, "efa_cmd_reg_mr", errno); - return -errno; - } - - md->mr_fid.mem_desc = (void *)(uintptr_t)md->mr->lkey; - md->mr_fid.key = md->mr->rkey; - - return 0; + int ret = 0; + /* TODO + * Since, access is not passed as a parameter to efa_mr_cache_entry_reg, + * for now we will set access to all supported access modes i.e. + * FI_SEND | FI_RECV | FI_REMOTE_READ | FI_REMOTE_WRITE. Once access + * information is available this can be removed. + * Issue: https://github.com/ofiwg/libfabric/issues/5677 + */ + uint64_t access = FI_SEND | FI_RECV | FI_REMOTE_READ | FI_REMOTE_WRITE; + struct fi_mr_attr attr; + struct efa_mr *efa_mr = (struct efa_mr *)entry->data; + + efa_mr->domain = container_of(cache->domain, struct efa_domain, + util_domain); + efa_mr->mr_fid.fid.ops = &efa_mr_cache_ops; + efa_mr->mr_fid.fid.fclass = FI_CLASS_MR; + efa_mr->mr_fid.fid.context = NULL; + + attr.mr_iov = &entry->info.iov; + attr.iov_count = 1; + attr.access = access; + attr.offset = 0; + attr.requested_key = 0; + attr.context = NULL; + attr.iface = FI_HMEM_SYSTEM; + + ret = efa_mr_reg_impl(efa_mr, 0, (void *)&attr); + return ret; } void efa_mr_cache_entry_dereg(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) { - struct efa_mem_desc *md = (struct efa_mem_desc *)entry->data; - int ret = -efa_cmd_dereg_mr(md->mr); + struct efa_mr *efa_mr = (struct efa_mr *)entry->data; + int ret; + + if (!efa_mr->ibv_mr) + return; + ret = efa_mr_dereg_impl(efa_mr); if (ret) EFA_WARN(FI_LOG_MR, "Unable to dereg mr: %d\n", ret); } -static int efa_mr_cache_reg(struct fid *fid, const void *buf, size_t len, - uint64_t access, uint64_t offset, - uint64_t requested_key, uint64_t flags, - struct fid_mr **mr_fid, void *context) +/* + * efa_mr_reg_shm() is called by rxr_read_init_iov to used to generate + * shm only memory registrations. Such memory registrations were used + * when read message protocol was applied to SHM EP. In which case, + * we need to register the send iov as FI_REMOTE_READ. + * + * Note when we open the SHM domain we did not specify FI_MR_PROV_KEY + * therefore the SHM domain require us to proivde a key when calling + * fi_mr_reg on it. 
(rxr_set_shm_hints()) + * + * The reason we did not specify FI_MR_PROV_KEY when opening SHM + * domain is because we want ibv_mr and shm_mr to use the same + * key. For that, we first call ibv_reg_mr() to register memory + * and get a key, and use that key to register shm. (efa_m_reg_impl()) + * + * However, for SHM's read message protocol, we do not want to call + * ibv_reg_mr() because it is expensive, so we use a static variable + * SHM_MR_KEYGEN to generate key. + * + * It is initialized as 0x100000000, and each call to efa_mr_reg_shm() + * will use shm_mr_keygen as current key and increase it by 1. + * + * Note SHM_MR_KEYGEN starts from 0x100000000 because the key + * returned from ibv_reg_mr() is a 32 bits integer, thus is always + * smaller than 0x100000000. By starting from 0x100000000, we avoid + * key collision. + */ +int efa_mr_reg_shm(struct fid_domain *domain_fid, struct iovec *iov, + uint64_t access, struct fid_mr **mr_fid) +{ + static uint64_t SHM_MR_KEYGEN = 0x100000000; + uint64_t requested_key; + struct efa_domain *efa_domain; + + efa_domain = container_of(domain_fid, struct efa_domain, util_domain.domain_fid.fid); + assert(efa_domain->shm_domain); + + requested_key = SHM_MR_KEYGEN++; + return fi_mr_regv(efa_domain->shm_domain, iov, 1, access, 0, requested_key, 0, mr_fid, NULL); +} + +static int efa_mr_cache_regattr(struct fid *fid, const struct fi_mr_attr *attr, + uint64_t flags, struct fid_mr **mr_fid) { struct efa_domain *domain; - struct efa_mem_desc *md; + struct efa_mr *efa_mr; struct ofi_mr_entry *entry; int ret; + static const int EFA_MR_CACHE_FLUSH_CHECK = 512; - struct iovec iov = { - .iov_base = (void *)buf, - .iov_len = len, - }; - - struct fi_mr_attr attr = { - .mr_iov = &iov, - .iov_count = 1, - .access = access, - .offset = offset, - .requested_key = requested_key, - .context = context, - }; - - if (access & ~EFA_MR_SUPPORTED_PERMISSIONS) { - EFA_WARN(FI_LOG_MR, - "Unsupported access permissions. 
requested[0x%" PRIx64 "] supported[0x%" PRIx64 "]\n", - access, (uint64_t)EFA_MR_SUPPORTED_PERMISSIONS); + if (flags & OFI_MR_NOCACHE) { + ret = efa_mr_regattr(fid, attr, flags, mr_fid); + return ret; + } + + if (attr->iov_count > EFA_MR_IOV_LIMIT) { + EFA_WARN(FI_LOG_MR, "iov count > %d not supported\n", + EFA_MR_IOV_LIMIT); return -FI_EINVAL; } domain = container_of(fid, struct efa_domain, util_domain.domain_fid.fid); - ret = ofi_mr_cache_search(&domain->cache, &attr, &entry); + if (domain->cache->cached_cnt > 0 && domain->cache->cached_cnt % EFA_MR_CACHE_FLUSH_CHECK==0) { + ofi_mr_cache_flush(domain->cache, false); + } + + ret = ofi_mr_cache_search(domain->cache, attr, &entry); if (OFI_UNLIKELY(ret)) return ret; - md = (struct efa_mem_desc *)entry->data; - md->entry = entry; + efa_mr = (struct efa_mr *)entry->data; + efa_mr->entry = entry; - *mr_fid = &md->mr_fid; + if (domain->util_domain.info_domain_caps & FI_HMEM) + efa_mr->peer.iface = attr->iface; + else + efa_mr->peer.iface = FI_HMEM_SYSTEM; + if (efa_mr->peer.iface == FI_HMEM_CUDA) + efa_mr->peer.device.cuda = attr->device.cuda; + + *mr_fid = &efa_mr->mr_fid; return 0; } @@ -139,22 +194,30 @@ static int efa_mr_cache_regv(struct fid *fid, const struct iovec *iov, uint64_t requested_key, uint64_t flags, struct fid_mr **mr_fid, void *context) { - if (count > EFA_MR_IOV_LIMIT) { - EFA_WARN(FI_LOG_MR, "iov count > %d not supported\n", - EFA_MR_IOV_LIMIT); - return -FI_EINVAL; - } - return efa_mr_cache_reg(fid, iov->iov_base, iov->iov_len, access, - offset, requested_key, flags, mr_fid, context); + struct fi_mr_attr attr; + + attr.mr_iov = iov; + attr.iov_count = count; + attr.access = access; + attr.offset = offset; + attr.requested_key = requested_key; + attr.context = context; + attr.iface = FI_HMEM_SYSTEM; + + return efa_mr_cache_regattr(fid, &attr, flags, mr_fid); } -static int efa_mr_cache_regattr(struct fid *fid, const struct fi_mr_attr *attr, - uint64_t flags, struct fid_mr **mr_fid) +static int efa_mr_cache_reg(struct fid *fid, const void *buf, size_t len, + uint64_t access, uint64_t offset, + uint64_t requested_key, uint64_t flags, + struct fid_mr **mr_fid, void *context) { - return efa_mr_cache_regv(fid, attr->mr_iov, attr->iov_count, - attr->access, attr->offset, - attr->requested_key, flags, mr_fid, - attr->context); + struct iovec iov; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + return efa_mr_cache_regv(fid, &iov, 1, access, offset, requested_key, + flags, mr_fid, context); } struct fi_ops_mr efa_domain_mr_cache_ops = { @@ -164,19 +227,53 @@ struct fi_ops_mr efa_domain_mr_cache_ops = { .regattr = efa_mr_cache_regattr, }; +static int efa_mr_dereg_impl(struct efa_mr *efa_mr) +{ + struct efa_domain *efa_domain; + int ret = 0; + int err; + + efa_domain = efa_mr->domain; + err = -ibv_dereg_mr(efa_mr->ibv_mr); + if (err) { + EFA_WARN(FI_LOG_MR, + "Unable to deregister memory registration\n"); + ret = err; + } + err = ofi_mr_map_remove(&efa_domain->util_domain.mr_map, + efa_mr->mr_fid.key); + if (err) { + EFA_WARN(FI_LOG_MR, + "Unable to remove MR entry from util map (%s)\n", + fi_strerror(-ret)); + ret = err; + } + if (rxr_env.enable_shm_transfer && efa_mr->shm_mr) { + err = fi_close(&efa_mr->shm_mr->fid); + if (err) { + EFA_WARN(FI_LOG_MR, + "Unable to close shm MR\n"); + ret = err; + } + } + return ret; +} + static int efa_mr_close(fid_t fid) + { - struct efa_mem_desc *mr; + struct efa_mr *efa_mr; int ret; - mr = container_of(fid, struct efa_mem_desc, mr_fid.fid); - ret = -efa_cmd_dereg_mr(mr->mr); - if (!ret) - 
free(mr); + efa_mr = container_of(fid, struct efa_mr, mr_fid.fid); + ret = efa_mr_dereg_impl(efa_mr); + if (ret) + EFA_WARN(FI_LOG_MR, "Unable to close MR\n"); + free(efa_mr); return ret; } -static struct fi_ops efa_mr_ops = { +struct fi_ops efa_mr_ops = { .size = sizeof(struct fi_ops), .close = efa_mr_close, .bind = fi_no_bind, @@ -184,79 +281,170 @@ static struct fi_ops efa_mr_ops = { .ops_open = fi_no_ops_open, }; -static int efa_mr_reg(struct fid *fid, const void *buf, size_t len, - uint64_t access, uint64_t offset, uint64_t requested_key, - uint64_t flags, struct fid_mr **mr_fid, void *context) +/* + * Set core_access to FI_SEND | FI_RECV if not already set, + * set the fi_ibv_access modes and do real registration (ibv_mr_reg) + * Insert the key returned by ibv_mr_reg into efa mr_map and shm mr_map + */ +static int efa_mr_reg_impl(struct efa_mr *efa_mr, uint64_t flags, void *attr) { - struct fid_domain *domain_fid; - struct efa_mem_desc *md; + uint64_t core_access, original_access; + struct fi_mr_attr *mr_attr = (struct fi_mr_attr *)attr; int fi_ibv_access = 0; + int ret = 0; + + /* To support Emulated RMA path, if the access is not supported + * by EFA, modify it to FI_SEND | FI_RECV + */ + core_access = mr_attr->access; + if (!core_access || (core_access & ~EFA_MR_SUPPORTED_PERMISSIONS)) + core_access = FI_SEND | FI_RECV; + + /* Local read access to an MR is enabled by default in verbs */ + if (core_access & FI_RECV) + fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE; + + if (efa_mr->domain->ctx->device_caps & EFADV_DEVICE_ATTR_CAPS_RDMA_READ) + fi_ibv_access |= IBV_ACCESS_REMOTE_READ; + + efa_mr->ibv_mr = ibv_reg_mr(efa_mr->domain->ibv_pd, + (void *)mr_attr->mr_iov->iov_base, + mr_attr->mr_iov->iov_len, fi_ibv_access); + if (!efa_mr->ibv_mr) { + EFA_WARN(FI_LOG_MR, "Unable to register MR: %s\n", + fi_strerror(-errno)); + return -errno; + } - if (flags) + efa_mr->mr_fid.mem_desc = efa_mr; + efa_mr->mr_fid.key = efa_mr->ibv_mr->rkey; + /* + * Skipping the domain type check is okay here since util_domain is at + * the beginning of efa_domain and rxr_domain. + */ + if (efa_mr->domain->util_domain.info_domain_caps & FI_HMEM) + efa_mr->peer.iface = mr_attr->iface; + else + efa_mr->peer.iface = FI_HMEM_SYSTEM; + if (efa_mr->peer.iface == FI_HMEM_CUDA) + efa_mr->peer.device.cuda = mr_attr->device.cuda; + assert(efa_mr->mr_fid.key != FI_KEY_NOTAVAIL); + + mr_attr->requested_key = efa_mr->mr_fid.key; + + ret = ofi_mr_map_insert(&efa_mr->domain->util_domain.mr_map, attr, + &efa_mr->mr_fid.key, &efa_mr->mr_fid); + if (ret) { + EFA_WARN(FI_LOG_MR, + "Unable to add MR to map buf (%s): %p len: %zu\n", + fi_strerror(-ret), mr_attr->mr_iov->iov_base, + mr_attr->mr_iov->iov_len); + return ret; + } + if (efa_mr->domain->shm_domain && rxr_env.enable_shm_transfer) { + /* We need to add FI_REMOTE_READ to allow for Read implemented + * message protocols. 
+ */ + original_access = mr_attr->access; + mr_attr->access |= FI_REMOTE_READ; + ret = fi_mr_regattr(efa_mr->domain->shm_domain, attr, + flags, &efa_mr->shm_mr); + mr_attr->access = original_access; + if (ret) { + EFA_WARN(FI_LOG_MR, + "Unable to register shm MR buf (%s): %p len: %zu\n", + fi_strerror(-ret), mr_attr->mr_iov->iov_base, + mr_attr->mr_iov->iov_len); + fi_close(&efa_mr->mr_fid.fid); + ofi_mr_map_remove(&efa_mr->domain->util_domain.mr_map, + efa_mr->mr_fid.key); + return ret; + } + } + return 0; +} + +static int efa_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, + uint64_t flags, struct fid_mr **mr_fid) +{ + struct fid_domain *domain_fid; + struct efa_mr *efa_mr = NULL; + int ret = 0; + + if (flags && flags != OFI_MR_NOCACHE) { + EFA_WARN(FI_LOG_MR, "Unsupported flag type. requested" + "[0x%" PRIx64 "] supported[0x%" PRIx64 "]\n", + flags, (uint64_t) OFI_MR_NOCACHE); return -FI_EBADFLAGS; + } - if (fid->fclass != FI_CLASS_DOMAIN) + if (fid->fclass != FI_CLASS_DOMAIN) { + EFA_WARN(FI_LOG_MR, "Unsupported domain. requested" + "[0x%" PRIx64 "] supported[0x%" PRIx64 "]\n", + fid->fclass, (uint64_t) FI_CLASS_DOMAIN); return -FI_EINVAL; + } - if (access & ~EFA_MR_SUPPORTED_PERMISSIONS) { - EFA_WARN(FI_LOG_MR, - "Unsupported access permissions. requested[0x%" PRIx64 "] supported[0x%" PRIx64 "]\n", - access, (uint64_t)EFA_MR_SUPPORTED_PERMISSIONS); + if (attr->iov_count > EFA_MR_IOV_LIMIT) { + EFA_WARN(FI_LOG_MR, "iov count > %d not supported\n", + EFA_MR_IOV_LIMIT); return -FI_EINVAL; } domain_fid = container_of(fid, struct fid_domain, fid); - md = calloc(1, sizeof(*md)); - if (!md) + efa_mr = calloc(1, sizeof(*efa_mr)); + if (!efa_mr) { + EFA_WARN(FI_LOG_MR, "Unable to initialize md"); return -FI_ENOMEM; + } - md->domain = container_of(domain_fid, struct efa_domain, - util_domain.domain_fid); - md->mr_fid.fid.fclass = FI_CLASS_MR; - md->mr_fid.fid.context = context; - md->mr_fid.fid.ops = &efa_mr_ops; - - /* Local read access to an MR is enabled by default in verbs */ - if (access & FI_RECV) - fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE; + efa_mr->domain = container_of(domain_fid, struct efa_domain, + util_domain.domain_fid); + efa_mr->mr_fid.fid.fclass = FI_CLASS_MR; + efa_mr->mr_fid.fid.context = attr->context; + efa_mr->mr_fid.fid.ops = &efa_mr_ops; - md->mr = efa_cmd_reg_mr(md->domain->pd, (void *)buf, len, fi_ibv_access); - if (!md->mr) { - EFA_WARN_ERRNO(FI_LOG_MR, "efa_cmd_reg_mr", errno); + ret = efa_mr_reg_impl(efa_mr, flags, (void *)attr); + if (ret) goto err; - } - - md->mr_fid.mem_desc = (void *)(uintptr_t)md->mr->lkey; - md->mr_fid.key = md->mr->rkey; - *mr_fid = &md->mr_fid; + *mr_fid = &efa_mr->mr_fid; return 0; - err: - free(md); - return -errno; + EFA_WARN(FI_LOG_MR, "Unable to register MR: %s\n", + fi_strerror(-ret)); + free(efa_mr); + return ret; } static int efa_mr_regv(struct fid *fid, const struct iovec *iov, size_t count, uint64_t access, uint64_t offset, uint64_t requested_key, uint64_t flags, struct fid_mr **mr_fid, void *context) { - if (count > EFA_MR_IOV_LIMIT) { - EFA_WARN(FI_LOG_MR, "iov count > %d not supported\n", - EFA_MR_IOV_LIMIT); - return -FI_EINVAL; - } - return efa_mr_reg(fid, iov->iov_base, iov->iov_len, access, offset, - requested_key, flags, mr_fid, context); + struct fi_mr_attr attr; + + attr.mr_iov = iov; + attr.iov_count = count; + attr.access = access; + attr.offset = offset; + attr.requested_key = requested_key; + attr.context = context; + attr.iface = FI_HMEM_SYSTEM; + + return efa_mr_regattr(fid, &attr, flags, mr_fid); } 
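The key-generation scheme described in the comment above (device keys come from ibv_reg_mr() and always fit in 32 bits, while SHM read-protocol keys are handed out from a counter starting at 0x100000000) can be illustrated with a small standalone sketch. The toy_* names below are hypothetical and are not part of this patch; they only demonstrate why the two key ranges can never collide.

/*
 * Standalone illustration of the MR key partitioning used above.
 * The toy_* names are hypothetical; they only show why keys generated
 * from 0x100000000 upward cannot collide with the 32-bit rkeys
 * returned by ibv_reg_mr().
 */
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t toy_shm_keygen = 0x100000000ULL;	/* mirrors SHM_MR_KEYGEN */

/* keys handed out for SHM registrations: always >= 2^32 */
static uint64_t toy_shm_next_key(void)
{
	return toy_shm_keygen++;
}

/* keys coming back from the device: a 32-bit rkey, always < 2^32 */
static uint64_t toy_device_key(uint32_t rkey)
{
	return (uint64_t)rkey;
}

int main(void)
{
	uint64_t dev = toy_device_key(UINT32_MAX);	/* largest possible device key */
	uint64_t shm = toy_shm_next_key();		/* smallest SHM key */

	assert(dev < 0x100000000ULL && shm >= 0x100000000ULL);
	printf("device key 0x%" PRIx64 " and shm key 0x%" PRIx64 " never collide\n",
	       dev, shm);
	return 0;
}

Because the SHM domain is opened without FI_MR_PROV_KEY, the provider supplies the requested key itself: the ibv rkey on the normal registration path, or a counter value at or above 0x100000000 when it wants to avoid the extra ibv_reg_mr() call for SHM's read protocol.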
-static int efa_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, - uint64_t flags, struct fid_mr **mr_fid) +static int efa_mr_reg(struct fid *fid, const void *buf, size_t len, + uint64_t access, uint64_t offset, uint64_t requested_key, + uint64_t flags, struct fid_mr **mr_fid, void *context) { - return efa_mr_regv(fid, attr->mr_iov, attr->iov_count, attr->access, - attr->offset, attr->requested_key, flags, mr_fid, - attr->context); + struct iovec iov; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + return efa_mr_regv(fid, &iov, 1, access, offset, requested_key, + flags, mr_fid, context); } struct fi_ops_mr efa_domain_mr_ops = { diff --git a/prov/efa/src/efa_msg.c b/prov/efa/src/efa_msg.c index ce60d4424a8..b0c538523ff 100644 --- a/prov/efa/src/efa_msg.c +++ b/prov/efa/src/efa_msg.c @@ -1,6 +1,6 @@ /* * Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. - * Copyright (c) 2017-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -33,8 +33,6 @@ #include "config.h" -#include "efa_verbs/efa_ib.h" -#include "efa_verbs/efa_io_defs.h" #include "ofi.h" #include "ofi_enosys.h" @@ -78,19 +76,40 @@ static inline void dump_msg(const struct fi_msg *msg, const char *context) } #endif /* EFA_MSG_DUMP */ -static ssize_t efa_post_recv_validate(struct efa_ep *ep, const struct fi_msg *msg) +static void free_send_wr_list(struct ibv_send_wr *head) { - struct efa_qp *qp = ep->qp; - //size_t len; + struct ibv_send_wr *wr = head; + struct ibv_send_wr *tmp; + + while (wr) { + tmp = wr->next; + ofi_buf_free(container_of(wr, struct efa_send_wr, wr)); + wr = tmp; + } +} +static void free_recv_wr_list(struct ibv_recv_wr *head) +{ + struct ibv_recv_wr *wr = head; + struct ibv_recv_wr *tmp; + + while (wr) { + tmp = wr->next; + ofi_buf_free(container_of(wr, struct efa_recv_wr, wr)); + wr = tmp; + } +} + +static ssize_t efa_post_recv_validate(struct efa_ep *ep, const struct fi_msg *msg) +{ if (OFI_UNLIKELY(!ep->rcq)) { EFA_WARN(FI_LOG_EP_DATA, "No receive cq was bound to ep.\n"); return -FI_EINVAL; } - if (OFI_UNLIKELY(msg->iov_count > qp->rq.wq.max_sge)) { - EFA_WARN(FI_LOG_EP_DATA, "requested sge[%zu] is greater than max supported[%d]!\n", - msg->iov_count, qp->rq.wq.max_sge); + if (OFI_UNLIKELY(msg->iov_count > ep->info->rx_attr->iov_limit)) { + EFA_WARN(FI_LOG_EP_DATA, "requested sge[%zu] is greater than max supported[%zu]!\n", + msg->iov_count, ep->info->tx_attr->iov_limit); return -FI_EINVAL; } @@ -101,96 +120,75 @@ static ssize_t efa_post_recv_validate(struct efa_ep *ep, const struct fi_msg *ms return -EINVAL; } -/* XXX: tests pass the prefix twice for some reason and break this check (will be removed when we move to libibverbs) - len = ofi_total_iov_len(msg->msg_iov, msg->iov_count); - - if (OFI_UNLIKELY(len > ep->info->ep_attr->max_msg_size + - ep->msg_prefix_size)) { - EFA_WARN(FI_LOG_EP_DATA, "requested size[%zu] is greater than max[%zu]!\n", - len, ep->info->ep_attr->max_msg_size + ep->msg_prefix_size); - return -FI_EINVAL; - } -*/ - - if (OFI_UNLIKELY((qp->rq.wq.wqe_posted - qp->rq.wq.wqe_completed) == qp->rq.wq.wqe_cnt)) { - EFA_DBG(FI_LOG_EP_DATA, "rq is full! 
posted[%u] completed[%u] wqe_cnt[%u]\n", - qp->rq.wq.wqe_posted, qp->rq.wq.wqe_completed, qp->rq.wq.wqe_cnt); - return -FI_EAGAIN; - } - return 0; } static ssize_t efa_post_recv(struct efa_ep *ep, const struct fi_msg *msg, uint64_t flags) { + struct efa_mr *efa_mr; struct efa_qp *qp = ep->qp; - struct efa_io_rx_desc rx_buf = {}; - uint32_t wqe_index, rq_desc_offset; - size_t i; - ssize_t err; + struct ibv_recv_wr *bad_wr; + struct efa_recv_wr *ewr; + struct ibv_recv_wr *wr; uintptr_t addr; + ssize_t err; + size_t i; + + ewr = ofi_buf_alloc(ep->recv_wr_pool); + if (OFI_UNLIKELY(!ewr)) + return -FI_ENOMEM; + memset(ewr, 0, sizeof(*ewr) + sizeof(*ewr->sge) * msg->iov_count); + wr = &ewr->wr; dump_msg(msg, "recv"); err = efa_post_recv_validate(ep, msg); - if (OFI_UNLIKELY(err)) - return err; - - /* Save wrid */ - /* Get the next wrid to be used from the index pool. */ - wqe_index = qp->rq.wq.wrid_idx_pool[qp->rq.wq.wrid_idx_pool_next]; - qp->rq.wq.wrid[wqe_index] = (uintptr_t)msg->context; - rx_buf.req_id = wqe_index; - qp->rq.wq.wqe_posted++; - - /* Will never overlap, as efa_post_recv_validate() succeeded. */ - qp->rq.wq.wrid_idx_pool_next++; - assert(qp->rq.wq.wrid_idx_pool_next <= qp->rq.wq.wqe_cnt); + if (OFI_UNLIKELY(err)) { + ofi_buf_free(ewr); + goto out_err; + } - /* Default init of the rx buffer */ - set_efa_io_rx_desc_first(&rx_buf, 1); - set_efa_io_rx_desc_last(&rx_buf, 0); + wr->wr_id = (uintptr_t)msg->context; + wr->num_sge = msg->iov_count; + wr->sg_list = ewr->sge; for (i = 0; i < msg->iov_count; i++) { - /* Set last indication if need) */ - if (i == (msg->iov_count - 1)) - set_efa_io_rx_desc_last(&rx_buf, 1); - addr = (uintptr_t)msg->msg_iov[i].iov_base; /* Set RX buffer desc from SGE */ - rx_buf.length = msg->msg_iov[i].iov_len; - set_efa_io_rx_desc_lkey(&rx_buf, (uint32_t)(uintptr_t)msg->desc[i]); - rx_buf.buf_addr_lo = addr; - rx_buf.buf_addr_hi = addr >> 32; - - /* Copy descriptor to RX ring */ - rq_desc_offset = (qp->rq.wq.desc_idx & qp->rq.wq.desc_mask) * sizeof(rx_buf); - memcpy(qp->rq.buf + rq_desc_offset, &rx_buf, sizeof(rx_buf)); - - /* Wrap rx descriptor index */ - qp->rq.wq.desc_idx++; - if ((qp->rq.wq.desc_idx & qp->rq.wq.desc_mask) == 0) - qp->rq.wq.phase++; - - /* reset descriptor for next iov */ - memset(&rx_buf, 0, sizeof(rx_buf)); + wr->sg_list[i].length = msg->msg_iov[i].iov_len; + assert(msg->desc[i]); + efa_mr = (struct efa_mr *)msg->desc[i]; + wr->sg_list[i].lkey = efa_mr->ibv_mr->lkey; + wr->sg_list[i].addr = addr; } + ep->recv_more_wr_tail->next = wr; + ep->recv_more_wr_tail = wr; + if (flags & FI_MORE) return 0; - wmb(); - *qp->rq.db = qp->rq.wq.desc_idx; + err = ibv_post_recv(qp->ibv_qp, ep->recv_more_wr_head.next, &bad_wr); - return 0; + free_recv_wr_list(ep->recv_more_wr_head.next); + ep->recv_more_wr_tail = &ep->recv_more_wr_head; + + return err; + +out_err: + if (ep->recv_more_wr_head.next) + ibv_post_recv(qp->ibv_qp, ep->recv_more_wr_head.next, &bad_wr); + + free_recv_wr_list(ep->recv_more_wr_head.next); + ep->recv_more_wr_tail = &ep->recv_more_wr_head; + + return err; } static ssize_t efa_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) { - struct efa_ep *ep; - - ep = container_of(ep_fid, struct efa_ep, ep_fid); + struct efa_ep *ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid); return efa_post_recv(ep, msg, flags); } @@ -198,12 +196,10 @@ static ssize_t efa_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, u static ssize_t efa_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t 
src_addr, void *context) { - struct efa_ep *ep; + struct efa_ep *ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid); struct iovec iov; struct fi_msg msg; - ep = container_of(ep_fid, struct efa_ep, ep_fid); - EFA_SETUP_IOV(iov, buf, len); EFA_SETUP_MSG(msg, &iov, &desc, 1, src_addr, context, 0); @@ -213,11 +209,9 @@ static ssize_t efa_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, static ssize_t efa_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, void *context) { - struct efa_ep *ep; + struct efa_ep *ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid); struct fi_msg msg; - ep = container_of(ep_fid, struct efa_ep, ep_fid); - EFA_SETUP_MSG(msg, iov, desc, count, src_addr, context, 0); return efa_post_recv(ep, &msg, 0); @@ -226,21 +220,14 @@ static ssize_t efa_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void static ssize_t efa_post_send_validate(struct efa_ep *ep, const struct fi_msg *msg, struct efa_conn *conn, uint64_t flags, size_t *len) { - struct efa_qp *qp = ep->qp; - if (OFI_UNLIKELY(!ep->scq)) { EFA_WARN(FI_LOG_EP_DATA, "No send cq was bound to ep.\n"); return -FI_EINVAL; } - if (OFI_UNLIKELY(msg->iov_count > qp->sq.wq.max_sge)) { - EFA_WARN(FI_LOG_EP_DATA, "requested sge[%zu] is greater than max supported[%d]!\n", - msg->iov_count, qp->sq.wq.max_sge); - return -FI_EINVAL; - } - - if (OFI_UNLIKELY(!conn->ah)) { - EFA_WARN(FI_LOG_EP_DATA, "Invalid fi_addr\n"); + if (OFI_UNLIKELY(msg->iov_count > ep->info->tx_attr->iov_limit)) { + EFA_WARN(FI_LOG_EP_DATA, "requested sge[%zu] is greater than max supported[%zu]!\n", + msg->iov_count, ep->info->tx_attr->iov_limit); return -FI_EINVAL; } @@ -258,71 +245,25 @@ static ssize_t efa_post_send_validate(struct efa_ep *ep, const struct fi_msg *ms return -FI_EINVAL; } - if (OFI_UNLIKELY((qp->sq.wq.wqe_posted - qp->sq.wq.wqe_completed) == qp->sq.wq.wqe_cnt)) { - EFA_DBG(FI_LOG_EP_DATA, "sq is full! 
posted[%u] completed[%u] wqe_cnt[%u]\n", - qp->sq.wq.wqe_posted, qp->sq.wq.wqe_completed, qp->sq.wq.wqe_cnt); - return -FI_EAGAIN; - } - return 0; } -static void efa_post_send_inline_data(struct efa_ep *ep, - const struct fi_msg *msg, - struct efa_io_tx_wqe *tx_wqe, - int *desc_size) -{ - const struct iovec *iov = msg->msg_iov; - uint32_t total_length = 0; - uint32_t length; - uintptr_t addr; - size_t i; - - for (i = 0; i < msg->iov_count; i++) { - length = iov[i].iov_len; - addr = (uintptr_t)iov[i].iov_base; - - /* Whole prefix must be on the first sgl */ - if (!i) { - /* Check if payload exists */ - if (length <= ep->info->ep_attr->msg_prefix_size) - continue; - - addr += ep->info->ep_attr->msg_prefix_size; - length -= ep->info->ep_attr->msg_prefix_size; - } - - memcpy(tx_wqe->data.inline_data + total_length, (void *)addr, length); - total_length += length; - } - *desc_size += total_length; - - set_efa_io_tx_meta_desc_inline_msg(&tx_wqe->common, 1); - tx_wqe->common.length = total_length; -} - -static void efa_post_send_immediate_data(const struct fi_msg *msg, - struct efa_io_tx_meta_desc *meta_desc) -{ - uint32_t imm_data; - - imm_data = htonl((uint32_t)msg->data); - meta_desc->immediate_data = imm_data; - set_efa_io_tx_meta_desc_has_imm(meta_desc, 1); -} - static void efa_post_send_sgl(struct efa_ep *ep, const struct fi_msg *msg, - struct efa_io_tx_wqe *tx_wqe, int *desc_size, - uint16_t *num_descs) + struct efa_send_wr *ewr) { - struct efa_io_tx_buf_desc *tx_buf; + struct efa_mr *efa_mr; + struct ibv_send_wr *wr = &ewr->wr; + struct ibv_sge *sge; size_t sgl_idx = 0; uint32_t length; uintptr_t addr; size_t i; + wr->num_sge = msg->iov_count; + wr->sg_list = ewr->sge; + for (i = 0; i < msg->iov_count; i++) { - tx_buf = &tx_wqe->data.sgl[sgl_idx]; + sge = &wr->sg_list[sgl_idx]; addr = (uintptr_t)msg->msg_iov[i].iov_base; length = msg->msg_iov[i].iov_len; @@ -337,88 +278,85 @@ static void efa_post_send_sgl(struct efa_ep *ep, const struct fi_msg *msg, } /* Set TX buffer desc from SGE */ - tx_buf->length = length; - tx_buf->lkey = (msg->desc ? 
((uint32_t)(uintptr_t)msg->desc[i]) : 0); - tx_buf->buf_addr_lo = addr & 0xFFFFFFFF; - tx_buf->buf_addr_hi = addr >> 32; + sge->length = length; + assert (msg->desc && msg->desc[i]); + efa_mr = (struct efa_mr *)msg->desc[i]; + sge->lkey = efa_mr->ibv_mr->lkey; + sge->addr = addr; sgl_idx++; } +} - *num_descs = sgl_idx; - *desc_size += (sizeof(struct efa_io_tx_buf_desc) * sgl_idx); +ssize_t efa_post_flush(struct efa_ep *ep, struct ibv_send_wr **bad_wr) +{ + ssize_t ret; + + ret = ibv_post_send(ep->qp->ibv_qp, ep->xmit_more_wr_head.next, bad_wr); + free_send_wr_list(ep->xmit_more_wr_head.next); + ep->xmit_more_wr_tail = &ep->xmit_more_wr_head; + return ret; } static ssize_t efa_post_send(struct efa_ep *ep, const struct fi_msg *msg, uint64_t flags) { struct efa_qp *qp = ep->qp; - struct efa_io_tx_meta_desc *meta_desc; - struct efa_io_tx_wqe tx_wqe = {}; - uint32_t sq_desc_offset, wrid_idx; - int desc_size = sizeof(tx_wqe.common) + sizeof(tx_wqe.u); + struct ibv_send_wr *bad_wr; + struct efa_send_wr *ewr; + struct ibv_send_wr *wr; struct efa_conn *conn; size_t len; int ret; dump_msg(msg, "send"); + ewr = ofi_buf_alloc(ep->send_wr_pool); + if (OFI_UNLIKELY(!ewr)) + return -FI_ENOMEM; + + memset(ewr, 0, sizeof(*ewr) + sizeof(*ewr->sge) * msg->iov_count); + wr = &ewr->wr; conn = ep->av->addr_to_conn(ep->av, msg->addr); ret = efa_post_send_validate(ep, msg, conn, flags, &len); - if (OFI_UNLIKELY(ret)) - return ret; - - meta_desc = &tx_wqe.common; - - if (flags & FI_REMOTE_CQ_DATA) - efa_post_send_immediate_data(msg, meta_desc); - - if (len <= qp->sq.max_inline_data) - efa_post_send_inline_data(ep, msg, &tx_wqe, &desc_size); - else - efa_post_send_sgl(ep, msg, &tx_wqe, &desc_size, &meta_desc->length); - - /* Get the next wrid to be used from the index pool. */ - wrid_idx = qp->sq.wq.wrid_idx_pool[qp->sq.wq.wrid_idx_pool_next]; - qp->sq.wq.wrid[wrid_idx] = (uintptr_t)msg->context; - meta_desc->req_id = wrid_idx; - qp->sq.wq.wqe_posted++; - - /* Will never overlap, as efa_post_send_validate() succeeded. */ - qp->sq.wq.wrid_idx_pool_next++; - assert(qp->sq.wq.wrid_idx_pool_next <= qp->sq.wq.wqe_cnt); - - /* Set rest of the descriptor fields. 
*/ - set_efa_io_tx_meta_desc_meta_desc(meta_desc, 1); - set_efa_io_tx_meta_desc_phase(meta_desc, qp->sq.wq.phase); - set_efa_io_tx_meta_desc_first(meta_desc, 1); - set_efa_io_tx_meta_desc_last(meta_desc, 1); - meta_desc->dest_qp_num = conn->ep_addr.qpn; - set_efa_io_tx_meta_desc_comp_req(meta_desc, 1); - meta_desc->ah = conn->ah->efa_address_handle; - - /* Copy descriptor */ - sq_desc_offset = (qp->sq.wq.desc_idx & qp->sq.wq.desc_mask) * sizeof(tx_wqe); - memcpy(qp->sq.desc + sq_desc_offset, &tx_wqe, desc_size); - - /* advance index and change phase */ - qp->sq.wq.desc_idx++; - if ((qp->sq.wq.desc_idx & qp->sq.wq.desc_mask) == 0) - qp->sq.wq.phase++; + if (OFI_UNLIKELY(ret)) { + ofi_buf_free(ewr); + goto out_err; + } + + efa_post_send_sgl(ep, msg, ewr); + + if (flags & FI_INJECT) + wr->send_flags |= IBV_SEND_INLINE; + + wr->opcode = IBV_WR_SEND; + wr->wr_id = (uintptr_t)msg->context; + wr->wr.ud.ah = conn->ah.ibv_ah; + wr->wr.ud.remote_qpn = conn->ep_addr.qpn; + wr->wr.ud.remote_qkey = conn->ep_addr.qkey; + + ep->xmit_more_wr_tail->next = wr; + ep->xmit_more_wr_tail = wr; if (flags & FI_MORE) return 0; - wmb(); - *qp->sq.db = qp->sq.wq.desc_idx; + ret = efa_post_flush(ep, &bad_wr); - return 0; + return ret; + +out_err: + if (ep->xmit_more_wr_head.next) + ibv_post_send(qp->ibv_qp, ep->xmit_more_wr_head.next, &bad_wr); + + free_send_wr_list(ep->xmit_more_wr_head.next); + ep->xmit_more_wr_tail = &ep->xmit_more_wr_head; + + return ret; } static ssize_t efa_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) { - struct efa_ep *ep; - - ep = container_of(ep_fid, struct efa_ep, ep_fid); + struct efa_ep *ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid); return efa_post_send(ep, msg, flags); } @@ -426,13 +364,11 @@ static ssize_t efa_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, u static ssize_t efa_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, void *context) { - struct efa_ep *ep; + struct efa_ep *ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid); struct fi_msg msg; struct iovec iov; uint64_t flags; - ep = container_of(ep_fid, struct efa_ep, ep_fid); - EFA_SETUP_IOV(iov, buf, len); EFA_SETUP_MSG(msg, &iov, &desc, 1, dest_addr, context, 0); flags = ep->info->tx_attr->op_flags; @@ -443,13 +379,11 @@ static ssize_t efa_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len, static ssize_t efa_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, void *context) { - struct efa_ep *ep; + struct efa_ep *ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid); struct fi_msg msg; struct iovec iov; uint64_t flags; - ep = container_of(ep_fid, struct efa_ep, ep_fid); - EFA_SETUP_IOV(iov, buf, len); EFA_SETUP_MSG(msg, &iov, &desc, 1, dest_addr, context, data); @@ -461,12 +395,10 @@ static ssize_t efa_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t le static ssize_t efa_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, void *context) { - struct efa_ep *ep; + struct efa_ep *ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid); struct fi_msg msg; uint64_t flags; - ep = container_of(ep_fid, struct efa_ep, ep_fid); - EFA_SETUP_MSG(msg, iov, desc, count, dest_addr, context, 0); flags = ep->info->tx_attr->op_flags; diff --git a/prov/efa/src/efa_rma.c b/prov/efa/src/efa_rma.c new file mode 100644 index 00000000000..97c681311c7 --- /dev/null +++ b/prov/efa/src/efa_rma.c @@ -0,0 +1,168 @@ +/* 
+ * Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
+ * All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *     - Redistributions of source code must retain the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer.
+ *
+ *     - Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include
+#include
+#include
+#include
+#include "efa.h"
+
+
+/*
+ * efa_rma_post_read() will post a read request.
+ *
+ * Input:
+ *     ep: endpoint
+ *     msg: read operation information
+ *     flags: currently no flags are supported
+ *     self_comm: indicates whether the read targets the
+ *                endpoint itself. If self_comm is true, the
+ *                caller must set msg->addr to FI_ADDR_NOTAVAIL. 
+ *
+ * On success, return 0.
+ * If the iov or rma_iov count exceeds the device limit, return -FI_EINVAL.
+ * If the read fails, return the error from the read operation.
+ */
+ssize_t efa_rma_post_read(struct efa_ep *ep, const struct fi_msg_rma *msg,
+			  uint64_t flags, bool self_comm)
+{
+	struct efa_qp *qp;
+	struct efa_mr *efa_mr;
+	struct efa_conn *conn;
+	struct ibv_sge sge_list[msg->iov_count];
+	int i;
+
+	if (OFI_UNLIKELY(msg->iov_count > ep->domain->ctx->max_wr_rdma_sge)) {
+		EFA_WARN(FI_LOG_CQ, "invalid iov_count!\n");
+		return -FI_EINVAL;
+	}
+
+	if (OFI_UNLIKELY(msg->rma_iov_count > ep->domain->info->tx_attr->rma_iov_limit)) {
+		EFA_WARN(FI_LOG_CQ, "invalid rma_iov_count!\n");
+		return -FI_EINVAL;
+	}
+
+	if (OFI_UNLIKELY(ofi_total_iov_len(msg->msg_iov, msg->iov_count)
+			 > ep->domain->ctx->max_rdma_size)) {
+		EFA_WARN(FI_LOG_CQ, "maximum rdma_size exceeded!\n");
+		return -FI_EINVAL;
+	}
+
+	/* the caller must provide desc because EFA requires FI_MR_LOCAL */
+	assert(msg->desc);
+
+	/* ep->domain->info->tx_attr->rma_iov_limit is set to 1 */
+	qp = ep->qp;
+	ibv_wr_start(qp->ibv_qp_ex);
+	qp->ibv_qp_ex->wr_id = (uintptr_t)msg->context;
+	ibv_wr_rdma_read(qp->ibv_qp_ex, msg->rma_iov[0].key, msg->rma_iov[0].addr);
+
+	for (i = 0; i < msg->iov_count; ++i) {
+		sge_list[i].addr = (uint64_t)msg->msg_iov[i].iov_base;
+		sge_list[i].length = msg->msg_iov[i].iov_len;
+		assert(msg->desc[i]);
+		efa_mr = (struct efa_mr *)msg->desc[i];
+		sge_list[i].lkey = efa_mr->ibv_mr->lkey;
+	}
+
+	ibv_wr_set_sge_list(qp->ibv_qp_ex, msg->iov_count, sge_list);
+	if (self_comm) {
+		assert(msg->addr == FI_ADDR_NOTAVAIL);
+		ibv_wr_set_ud_addr(qp->ibv_qp_ex, ep->self_ah,
+				   qp->qp_num, qp->qkey);
+	} else {
+		conn = ep->av->addr_to_conn(ep->av, msg->addr);
+		ibv_wr_set_ud_addr(qp->ibv_qp_ex, conn->ah.ibv_ah,
+				   conn->ep_addr.qpn, conn->ep_addr.qkey);
+	}
+
+	return ibv_wr_complete(qp->ibv_qp_ex);
+}
+
+static
+ssize_t efa_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags)
+{
+	struct efa_ep *ep = container_of(ep_fid, struct efa_ep, util_ep.ep_fid);
+
+	return efa_rma_post_read(ep, msg, flags, false);
+}
+
+static
+ssize_t efa_rma_readv(struct fid_ep *ep, const struct iovec *iov, void **desc,
+		      size_t iov_count, fi_addr_t src_addr, uint64_t addr,
+		      uint64_t key, void *context)
+{
+	struct fi_rma_iov rma_iov;
+	struct fi_msg_rma msg;
+
+	rma_iov.addr = addr;
+	rma_iov.len = ofi_total_iov_len(iov, iov_count);
+	rma_iov.key = key;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_iov = iov;
+	msg.desc = desc;
+	msg.iov_count = iov_count;
+	msg.addr = src_addr;
+	msg.context = context;
+	msg.rma_iov = &rma_iov;
+	msg.rma_iov_count = 1;
+
+	return efa_rma_readmsg(ep, &msg, 0);
+}
+
+static
+ssize_t efa_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc,
+		     fi_addr_t src_addr, uint64_t addr, uint64_t key,
+		     void *context)
+{
+	struct iovec iov;
+
+	iov.iov_base = (void *)buf;
+	iov.iov_len = len;
+	return efa_rma_readv(ep, &iov, &desc, 1, src_addr, addr, key, context);
+}
+
+struct fi_ops_rma efa_ep_rma_ops = {
+	.size = sizeof(struct fi_ops_rma),
+	.read = efa_rma_read,
+	.readv = efa_rma_readv,
+	.readmsg = efa_rma_readmsg,
+	.write = fi_no_rma_write,
+	.writev = fi_no_rma_writev,
+	.writemsg = fi_no_rma_writemsg,
+	.inject = fi_no_rma_inject,
+	.writedata = fi_no_rma_writedata,
+	.injectdata = fi_no_rma_injectdata,
+};
+
diff --git a/prov/efa/src/efa_verbs/efa-abi.h b/prov/efa/src/efa_verbs/efa-abi.h
deleted file mode 100644
index 445f66ce89b..00000000000
--- a/prov/efa/src/efa_verbs/efa-abi.h
+++ 
/dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2017-2019 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef EFA_ABI_H -#define EFA_ABI_H - -#include "infiniband/efa_kern-abi.h" - -/* - * Increment this value if any changes that break userspace ABI - * compatibility are made. - */ -#define EFA_UVERBS_ABI_VERSION 1 - -enum efa_ibv_user_cmds_supp_udata { - EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE = 1 << 0, - EFA_USER_CMDS_SUPP_UDATA_CREATE_AH = 1 << 1, -}; - -struct efa_ibv_alloc_ucontext_resp { - __u32 comp_mask; - __u32 cmds_supp_udata_mask; - __u16 sub_cqs_per_cq; - __u16 inline_buf_size; - __u32 max_llq_size; /* bytes */ -}; - -struct efa_ibv_alloc_pd_resp { - __u32 comp_mask; - __u16 pdn; - __u8 reserved_30[0x2]; -}; - -struct efa_ibv_create_cq { - __u32 comp_mask; - __u32 cq_entry_size; - __u16 num_sub_cqs; - __u8 reserved_50[0x6]; -}; - -struct efa_ibv_create_cq_resp { - __u32 comp_mask; - __u8 reserved_20[0x4]; - __aligned_u64 q_mmap_key; - __aligned_u64 q_mmap_size; - __u16 cq_idx; - __u8 reserved_d0[0x6]; -}; - -enum { - EFA_QP_DRIVER_TYPE_SRD = 0, -}; - -struct efa_ibv_create_qp { - __u32 comp_mask; - __u32 rq_ring_size; /* bytes */ - __u32 sq_ring_size; /* bytes */ - __u32 driver_qp_type; -}; - -struct efa_ibv_create_qp_resp { - __u32 comp_mask; - /* the offset inside the page of the rq db */ - __u32 rq_db_offset; - /* the offset inside the page of the sq db */ - __u32 sq_db_offset; - /* the offset inside the page of descriptors buffer */ - __u32 llq_desc_offset; - __aligned_u64 rq_mmap_key; - __aligned_u64 rq_mmap_size; - __aligned_u64 rq_db_mmap_key; - __aligned_u64 sq_db_mmap_key; - __aligned_u64 llq_desc_mmap_key; - __u16 send_sub_cq_idx; - __u16 recv_sub_cq_idx; - __u8 reserved_1e0[0x4]; -}; - -struct efa_ibv_create_ah_resp { - __u32 comp_mask; - __u16 efa_address_handle; - __u8 reserved_30[0x2]; -}; - -struct efa_ibv_ex_query_device_resp { - __u32 comp_mask; - __u32 max_sq_wr; - __u32 max_rq_wr; - __u16 max_sq_sge; - __u16 max_rq_sge; -}; - -/**************************************************************************************************/ -/* EFA CUSTOM COMMANDS */ 
-/**************************************************************************************************/ -enum efa_everbs_commands { - EFA_EVERBS_CMD_GET_AH = 1, - EFA_EVERBS_CMD_GET_EX_DEV_ATTRS, - EFA_EVERBS_CMD_MAX, -}; - -struct efa_everbs_get_ah { - __u32 command; - __u16 in_words; - __u16 out_words; - __u32 comp_mask; - __u16 pdn; - __u8 reserved_30[0x2]; - __aligned_u64 response; - __aligned_u64 user_handle; - __u8 gid[16]; -}; - -struct efa_everbs_get_ah_resp { - __u32 comp_mask; - __u16 efa_address_handle; - __u8 reserved_30[0x2]; -}; - -struct efa_everbs_get_ex_dev_attrs { - __u32 command; - __u16 in_words; - __u16 out_words; - __u32 comp_mask; - __u8 reserved_20[0x4]; - __aligned_u64 response; -}; - -struct efa_everbs_get_ex_dev_attrs_resp { - __u32 comp_mask; - __u32 max_sq_wr; - __u32 max_rq_wr; - __u16 max_sq_sge; - __u16 max_rq_sge; -}; - -#endif /* EFA_ABI_H */ diff --git a/prov/efa/src/efa_verbs/efa_cmd.c b/prov/efa/src/efa_verbs/efa_cmd.c deleted file mode 100644 index 2c291b84252..00000000000 --- a/prov/efa/src/efa_verbs/efa_cmd.c +++ /dev/null @@ -1,396 +0,0 @@ -/* - * Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "infiniband/efa_verbs.h" - -#include "efa_cmd.h" -#include "efa_ib_cmd.h" -#include "efa_io_defs.h" /* entry sizes */ - -int efa_cmd_alloc_ucontext(struct ibv_device *device, struct efa_context *ctx, int cmd_fd) -{ - struct efa_alloc_ucontext_resp resp; - struct ibv_get_context cmd = {}; - struct ibv_context *ibctx; - int ret; - - ibctx = &ctx->ibv_ctx; - ibctx->device = device; - ibctx->cmd_fd = cmd_fd; - - ret = efa_ib_cmd_get_context(ibctx, &cmd, sizeof(cmd), - &resp.ibv_resp, sizeof(resp)); - if (ret) - return ret; - - ctx->cmds_supp_udata = resp.efa_resp.cmds_supp_udata_mask; - ctx->sub_cqs_per_cq = resp.efa_resp.sub_cqs_per_cq; - ctx->inject_size = resp.efa_resp.inline_buf_size; - ctx->max_llq_size = resp.efa_resp.max_llq_size; - - return 0; -} - -static int efa_everbs_cmd_get_ex_query_dev(struct efa_context *ctx, - struct efa_device_attr *attr) -{ - struct efa_everbs_get_ex_dev_attrs_resp resp; - struct efa_everbs_get_ex_dev_attrs cmd = {}; - - cmd.command = EFA_EVERBS_CMD_GET_EX_DEV_ATTRS; - cmd.in_words = sizeof(cmd) / 4; - cmd.out_words = sizeof(resp) / 4; - cmd.response = (uintptr_t)&resp; - - if (write(ctx->efa_everbs_cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) - return -errno; - - VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof(resp)); - - attr->max_sq_wr = resp.max_sq_wr; - attr->max_rq_wr = resp.max_rq_wr; - attr->max_sq_sge = resp.max_sq_sge; - attr->max_rq_sge = resp.max_rq_sge; - - return 0; -} - -int efa_cmd_query_device(struct efa_context *ctx, struct efa_device_attr *attr) -{ - struct efa_ex_query_device_resp resp; - unsigned int major, minor, sub_minor; - struct ibv_ex_query_device cmd_ex; - struct ibv_query_device cmd; - uint64_t raw_fw_ver; - int ret; - - if (ctx->cmds_supp_udata & EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE) { - ret = efa_ib_cmd_query_device_ex(&ctx->ibv_ctx, &attr->ibv_attr, &raw_fw_ver, - &cmd_ex, sizeof(cmd_ex), sizeof(cmd_ex), - &resp.ibv_resp, sizeof(resp.ibv_resp), sizeof(resp)); - if (ret) - return ret; - - attr->max_sq_wr = resp.efa_resp.max_sq_wr; - attr->max_rq_wr = resp.efa_resp.max_rq_wr; - attr->max_sq_sge = resp.efa_resp.max_sq_sge; - attr->max_rq_sge = resp.efa_resp.max_rq_sge; - } else { - ret = efa_ib_cmd_query_device(&ctx->ibv_ctx, &attr->ibv_attr, &raw_fw_ver, &cmd, sizeof(cmd)); - if (ret) - return ret; - - ret = efa_everbs_cmd_get_ex_query_dev(ctx, attr); - if (ret) - return ret; - } - - major = (raw_fw_ver >> 32) & 0xffff; - minor = (raw_fw_ver >> 16) & 0xffff; - sub_minor = raw_fw_ver & 0xffff; - - snprintf(attr->ibv_attr.fw_ver, sizeof(attr->ibv_attr.fw_ver), - "%u.%u.%03u", major, minor, sub_minor); - - return 0; -} - -int efa_cmd_query_port(struct efa_context *ctx, uint8_t port, struct ibv_port_attr *attr) -{ - struct ibv_query_port cmd; - - return efa_ib_cmd_query_port(&ctx->ibv_ctx, port, attr, &cmd, sizeof(cmd)); -} - -struct efa_pd *efa_cmd_alloc_pd(struct efa_context *ctx) -{ - struct efa_alloc_pd_resp resp; - struct ibv_alloc_pd cmd; - struct efa_pd *pd; - - pd = malloc(sizeof(*pd)); - if (!pd) - return NULL; - - if (efa_ib_cmd_alloc_pd(&ctx->ibv_ctx, &pd->ibv_pd, &cmd, sizeof(cmd), - &resp.ibv_resp, sizeof(resp))) { - free(pd); - return NULL; - } - - pd->context = ctx; - pd->pdn = resp.efa_resp.pdn; - - return pd; -} - -int efa_cmd_dealloc_pd(struct efa_pd *pd) -{ - int ret; - - ret = efa_ib_cmd_dealloc_pd(&pd->ibv_pd); - if (ret) - return ret; - - free(pd); - return 0; -} - -struct ibv_mr *efa_cmd_reg_mr(struct efa_pd *pd, void *addr, - size_t length, int access) -{ - struct ib_uverbs_reg_mr_resp resp; - struct 
ibv_reg_mr cmd; - struct ibv_mr *mr; - int ret; - - mr = malloc(sizeof(*mr)); - if (!mr) - return NULL; - - ret = efa_ib_cmd_reg_mr(&pd->ibv_pd, addr, length, (uintptr_t)addr, - access, mr, &cmd, sizeof(cmd), - &resp, sizeof(resp)); - if (ret) { - free(mr); - return NULL; - } - - mr->context = pd->ibv_pd.context; - mr->pd = &pd->ibv_pd; - mr->addr = addr; - mr->length = length; - - return mr; -} - -int efa_cmd_dereg_mr(struct ibv_mr *mr) -{ - int ret; - - ret = efa_ib_cmd_dereg_mr(mr); - if (ret) - return ret; - - free(mr); - - return ret; -} - -/* context->mutex must be held */ -int efa_cmd_create_cq(struct efa_cq *cq, int cq_size, uint64_t *q_mmap_key, - uint64_t *q_mmap_size, uint32_t *cqn) -{ - struct efa_context *ctx = container_of(cq->domain->ctx, struct efa_context, ibv_ctx); - struct efa_create_cq cmd; - struct efa_create_cq_resp resp; - int err; - - memset(&cmd, 0, sizeof(struct efa_create_cq)); - cmd.efa_cmd.num_sub_cqs = ctx->sub_cqs_per_cq; - cmd.efa_cmd.cq_entry_size = ctx->cqe_size; - err = efa_ib_cmd_create_cq(&ctx->ibv_ctx, cq_size, - &cq->ibv_cq, &cmd.ibv_cmd, sizeof(cmd), - &resp.ibv_resp, sizeof(resp)); - if (err) { - EFA_WARN_ERRNO(FI_LOG_CQ, "Command failed to create cq", err); - return err; - } - - *q_mmap_size = resp.efa_resp.q_mmap_size; - *q_mmap_key = resp.efa_resp.q_mmap_key; - *cqn = resp.efa_resp.cq_idx; - - cq->ibv_cq.context = &ctx->ibv_ctx; - cq->ibv_cq.cq_context = cq; - cq->ibv_cq.comp_events_completed = 0; - cq->ibv_cq.async_events_completed = 0; - pthread_mutex_init(&cq->ibv_cq.mutex, NULL); - pthread_cond_init(&cq->ibv_cq.cond, NULL); - - return 0; -} - -/* context->mutex must be held */ -int efa_cmd_destroy_cq(struct efa_cq *cq) -{ - return efa_ib_cmd_destroy_cq(&cq->ibv_cq); -} - -int efa_cmd_create_qp(struct efa_qp *qp, struct efa_pd *pd, struct ibv_qp_init_attr *init_attr, - uint32_t srd_qp, struct efa_create_qp_resp *resp) -{ - struct ibv_pd *ibpd = &pd->ibv_pd; - struct efa_create_qp cmd; - int err; - - init_attr->cap.max_send_wr = qp->sq.wq.wqe_cnt; - init_attr->cap.max_recv_wr = qp->rq.wq.wqe_cnt; - - memset(&cmd, 0, sizeof(struct efa_create_qp)); - cmd.efa_cmd.rq_ring_size = (qp->rq.wq.desc_mask + 1) * - sizeof(struct efa_io_rx_desc); - cmd.efa_cmd.sq_ring_size = (qp->sq.wq.desc_mask + 1) * - sizeof(struct efa_io_tx_wqe); - cmd.efa_cmd.driver_qp_type = EFA_QP_DRIVER_TYPE_SRD; /* ignored on UD */ - err = efa_ib_cmd_create_qp(ibpd, &qp->ibv_qp, init_attr, - &cmd.ibv_cmd, sizeof(cmd), - &resp->ibv_resp, sizeof(*resp)); - if (err) - return err; - - qp->ibv_qp.context = ibpd->context; - qp->ibv_qp.qp_context = init_attr->qp_context; - qp->ibv_qp.pd = ibpd; - qp->ibv_qp.send_cq = init_attr->send_cq; - qp->ibv_qp.recv_cq = init_attr->recv_cq; - qp->ibv_qp.srq = init_attr->srq; - qp->ibv_qp.qp_type = init_attr->qp_type; - qp->ibv_qp.state = IBV_QPS_RESET; - qp->ibv_qp.events_completed = 0; - pthread_mutex_init(&qp->ibv_qp.mutex, NULL); - pthread_cond_init(&qp->ibv_qp.cond, NULL); - - return 0; -} - -int efa_cmd_destroy_qp(struct efa_qp *qp) -{ - return efa_ib_cmd_destroy_qp(&qp->ibv_qp); -} - -int efa_cmd_query_gid(struct efa_context *ctx, uint8_t port_num, - int index, union ibv_gid *gid) -{ - struct ibv_context *context = &ctx->ibv_ctx; - char name[24]; - char attr[41]; - uint16_t val; - int i; - - snprintf(name, sizeof(name), "ports/%d/gids/%d", port_num, index); - - if (fi_read_file(context->device->ibdev_path, name, - attr, sizeof(attr)) < 0) - return -1; - - for (i = 0; i < 8; ++i) { - if (sscanf(attr + i * 5, "%hx", &val) != 1) - return -1; - 
gid->raw[i * 2] = val >> 8; - gid->raw[i * 2 + 1] = val & 0xff; - } - - return 0; -} - -static int efa_everbs_cmd_get_ah(struct efa_context *ctx, struct efa_ah *efa_ah, struct ibv_pd *pd, - struct ibv_ah_attr *attr) -{ - struct efa_everbs_get_ah_resp resp; - struct efa_everbs_get_ah cmd = {}; - - cmd.command = EFA_EVERBS_CMD_GET_AH; - cmd.in_words = sizeof(cmd) / 4; - cmd.out_words = sizeof(resp) / 4; - cmd.response = (uintptr_t)&resp; - - cmd.user_handle = (uintptr_t)&efa_ah->ibv_ah; - cmd.pdn = to_efa_pd(pd)->pdn; - memcpy(cmd.gid, attr->grh.dgid.raw, 16); - - if (write(ctx->efa_everbs_cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) - return -errno; - - VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof(resp)); - efa_ah->efa_address_handle = resp.efa_address_handle; - - return 0; -} - -struct efa_ah *efa_cmd_create_ah(struct efa_pd *pd, struct ibv_ah_attr *attr) -{ - struct efa_context *ctx = pd->context; - struct efa_create_ah_resp resp = {}; - struct ibv_port_attr port_attr; - struct efa_ah *ah; - int err; - - err = efa_cmd_query_port(ctx, attr->port_num, &port_attr); - if (err) { - EFA_WARN_ERRNO(FI_LOG_AV, "Command failed to query port", err); - return NULL; - } - - ah = malloc(sizeof(*ah)); - if (!ah) { - EFA_WARN(FI_LOG_AV, "Failed to allocate memory for AH\n"); - return NULL; - } - - attr->is_global = 1; - - err = efa_ib_cmd_create_ah(&pd->ibv_pd, &ah->ibv_ah, attr, - &resp.ibv_resp, sizeof(resp)); - if (err) { - EFA_WARN_ERRNO(FI_LOG_AV, "Command failed to create ah", err); - goto err_free_ah; - } - - if (ctx->cmds_supp_udata & EFA_USER_CMDS_SUPP_UDATA_CREATE_AH) { - ah->efa_address_handle = resp.efa_resp.efa_address_handle; - } else { - err = efa_everbs_cmd_get_ah(ctx, ah, &pd->ibv_pd, attr); - if (err) { - EFA_WARN_ERRNO(FI_LOG_AV, "Command failed to get ah attrs", err); - goto err_destroy_ah; - } - } - - return ah; - -err_destroy_ah: - efa_ib_cmd_destroy_ah(&ah->ibv_ah); -err_free_ah: - free(ah); - return NULL; -} - -int efa_cmd_destroy_ah(struct efa_ah *ah) -{ - int ret; - - ret = efa_ib_cmd_destroy_ah(&ah->ibv_ah); - free(ah); - - return ret; -} diff --git a/prov/efa/src/efa_verbs/efa_cmd.h b/prov/efa/src/efa_verbs/efa_cmd.h deleted file mode 100644 index 214f4cc2a21..00000000000 --- a/prov/efa/src/efa_verbs/efa_cmd.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2017-2018 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef _EFA_CMD_H_ -#define _EFA_CMD_H_ - -#include "efa-abi.h" -#include "efa_ib_cmd.h" -#include "efa.h" - -struct efa_alloc_ucontext_resp { - struct ib_uverbs_get_context_resp ibv_resp; - struct efa_ibv_alloc_ucontext_resp efa_resp; -}; - -struct efa_ex_query_device_resp { - struct ib_uverbs_ex_query_device_resp ibv_resp; - struct efa_ibv_ex_query_device_resp efa_resp; -}; - -struct efa_alloc_pd_resp { - struct ib_uverbs_alloc_pd_resp ibv_resp; - struct efa_ibv_alloc_pd_resp efa_resp; -}; - -struct efa_create_cq { - struct ibv_create_cq ibv_cmd; - struct efa_ibv_create_cq efa_cmd; -}; - -struct efa_create_cq_resp { - struct ib_uverbs_create_cq_resp ibv_resp; - struct efa_ibv_create_cq_resp efa_resp; -}; - -struct efa_create_qp { - struct ibv_create_qp ibv_cmd; - struct efa_ibv_create_qp efa_cmd; -}; - -struct efa_create_qp_resp { - struct ib_uverbs_create_qp_resp ibv_resp; - struct efa_ibv_create_qp_resp efa_resp; -}; - -struct efa_create_ah_resp { - struct ib_uverbs_create_ah_resp ibv_resp; - struct efa_ibv_create_ah_resp efa_resp; -}; - -int efa_cmd_alloc_ucontext(struct ibv_device *device, struct efa_context *ctx, int cmd_fd); -int efa_cmd_query_device(struct efa_context *ctx, struct efa_device_attr *attr); -int efa_cmd_query_port(struct efa_context *ctx, uint8_t port, struct ibv_port_attr *attr); -struct efa_pd *efa_cmd_alloc_pd(struct efa_context *ctx); -int efa_cmd_dealloc_pd(struct efa_pd *pd); -struct ibv_mr *efa_cmd_reg_mr(struct efa_pd *pd, void *addr, - size_t length, int access); -int efa_cmd_dereg_mr(struct ibv_mr *mr); -int efa_cmd_create_cq(struct efa_cq *cq, int cq_size, uint64_t *q_mmap_key, - uint64_t *q_mmap_size, uint32_t *cqn); -int efa_cmd_destroy_cq(struct efa_cq *cq); -int efa_cmd_create_qp(struct efa_qp *qp, struct efa_pd *pd, struct ibv_qp_init_attr *init_attr, - uint32_t srd_qp, struct efa_create_qp_resp *resp); -int efa_cmd_destroy_qp(struct efa_qp *qp); -int efa_cmd_query_gid(struct efa_context *ctx, uint8_t port_num, - int index, union ibv_gid *gid); -struct efa_ah *efa_cmd_create_ah(struct efa_pd *pd, struct ibv_ah_attr *attr); -int efa_cmd_destroy_ah(struct efa_ah *ah); - -#endif /* _EFA_CMD_H_ */ diff --git a/prov/efa/src/efa_verbs/efa_ib_cmd.c b/prov/efa/src/efa_verbs/efa_ib_cmd.c deleted file mode 100644 index b73224e3c4c..00000000000 --- a/prov/efa/src/efa_verbs/efa_ib_cmd.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * Copyright (c) 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005 PathScale, Inc. All rights reserved. - * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2017-2018 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#if HAVE_CONFIG_H -# include -#endif /* HAVE_CONFIG_H */ - -#include -#include -#include -#include -#include -#include - -#include "efa_ib.h" -#include "efa_ib_cmd.h" - -#define IBV_INIT_CMD(cmd, size, opcode) \ - do { \ - (cmd)->hdr.command = IB_USER_VERBS_CMD_##opcode; \ - (cmd)->hdr.in_words = (size) / 4; \ - (cmd)->hdr.out_words = 0; \ - } while (0) - -#define IBV_INIT_CMD_RESP(cmd, size, opcode, out, outsize) \ - do { \ - (cmd)->hdr.command = IB_USER_VERBS_CMD_##opcode; \ - (cmd)->hdr.in_words = (size) / 4; \ - (cmd)->hdr.out_words = (outsize) / 4; \ - (cmd)->ibcmd.response = (uintptr_t)(out); \ - } while (0) - -static inline uint32_t _cmd_ex(uint32_t cmd) -{ - return IB_USER_VERBS_CMD_FLAG_EXTENDED | cmd; -} - -#define IBV_INIT_CMD_RESP_EX_V(cmd, cmd_size, size, opcode, out, resp_size, \ - outsize) \ - do { \ - size_t c_size = cmd_size - sizeof(struct ib_uverbs_cmd_hdr) \ - - sizeof(struct ib_uverbs_ex_cmd_hdr); \ - (cmd)->hdr.command = \ - _cmd_ex(IB_USER_VERBS_EX_CMD_##opcode); \ - (cmd)->hdr.in_words = ((c_size) / 8); \ - (cmd)->hdr.out_words = ((resp_size) / 8); \ - (cmd)->ex_hdr.response = (uintptr_t)(out); \ - (cmd)->ex_hdr.provider_in_words = (((size) - (cmd_size)) / 8); \ - (cmd)->ex_hdr.provider_out_words = \ - (((outsize) - (resp_size)) / 8); \ - (cmd)->ex_hdr.cmd_hdr_reserved = 0; \ - } while (0) - -int efa_ib_cmd_get_context(struct ibv_context *context, struct ibv_get_context *cmd, - size_t cmd_size, struct ib_uverbs_get_context_resp *resp, - size_t resp_size) -{ - if (abi_ver < IB_USER_VERBS_MIN_ABI_VERSION) - return -ENOSYS; - - IBV_INIT_CMD_RESP(cmd, cmd_size, GET_CONTEXT, resp, resp_size); - - if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) - return -errno; - - VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); - - context->async_fd = resp->async_fd; - context->num_comp_vectors = resp->num_comp_vectors; - - return 0; -} - -static void copy_query_dev_fields(struct ibv_device_attr *device_attr, - struct ib_uverbs_query_device_resp *resp, - uint64_t *raw_fw_ver) -{ - *raw_fw_ver = resp->fw_ver; - device_attr->node_guid = resp->node_guid; - device_attr->sys_image_guid = resp->sys_image_guid; - device_attr->max_mr_size = resp->max_mr_size; - device_attr->page_size_cap = resp->page_size_cap; - device_attr->vendor_id = resp->vendor_id; - 
device_attr->vendor_part_id = resp->vendor_part_id; - device_attr->hw_ver = resp->hw_ver; - device_attr->max_qp = resp->max_qp; - device_attr->max_qp_wr = resp->max_qp_wr; - device_attr->device_cap_flags = resp->device_cap_flags; - device_attr->max_sge = resp->max_sge; - device_attr->max_sge_rd = resp->max_sge_rd; - device_attr->max_cq = resp->max_cq; - device_attr->max_cqe = resp->max_cqe; - device_attr->max_mr = resp->max_mr; - device_attr->max_pd = resp->max_pd; - device_attr->max_qp_rd_atom = resp->max_qp_rd_atom; - device_attr->max_ee_rd_atom = resp->max_ee_rd_atom; - device_attr->max_res_rd_atom = resp->max_res_rd_atom; - device_attr->max_qp_init_rd_atom = resp->max_qp_init_rd_atom; - device_attr->max_ee_init_rd_atom = resp->max_ee_init_rd_atom; - device_attr->atomic_cap = resp->atomic_cap; - device_attr->max_ee = resp->max_ee; - device_attr->max_rdd = resp->max_rdd; - device_attr->max_mw = resp->max_mw; - device_attr->max_raw_ipv6_qp = resp->max_raw_ipv6_qp; - device_attr->max_raw_ethy_qp = resp->max_raw_ethy_qp; - device_attr->max_mcast_grp = resp->max_mcast_grp; - device_attr->max_mcast_qp_attach = resp->max_mcast_qp_attach; - device_attr->max_total_mcast_qp_attach = resp->max_total_mcast_qp_attach; - device_attr->max_ah = resp->max_ah; - device_attr->max_fmr = resp->max_fmr; - device_attr->max_map_per_fmr = resp->max_map_per_fmr; - device_attr->max_srq = resp->max_srq; - device_attr->max_srq_wr = resp->max_srq_wr; - device_attr->max_srq_sge = resp->max_srq_sge; - device_attr->max_pkeys = resp->max_pkeys; - device_attr->local_ca_ack_delay = resp->local_ca_ack_delay; - device_attr->phys_port_cnt = resp->phys_port_cnt; -} - -int efa_ib_cmd_query_device(struct ibv_context *context, - struct ibv_device_attr *device_attr, - uint64_t *raw_fw_ver, - struct ibv_query_device *cmd, size_t cmd_size) -{ - struct ib_uverbs_query_device_resp resp; - - IBV_INIT_CMD_RESP(cmd, cmd_size, QUERY_DEVICE, &resp, sizeof(resp)); - - if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) - return -errno; - - VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof(resp)); - - memset(device_attr->fw_ver, 0, sizeof(device_attr->fw_ver)); - copy_query_dev_fields(device_attr, &resp, raw_fw_ver); - - return 0; -} - -int efa_ib_cmd_query_device_ex(struct ibv_context *context, - struct ibv_device_attr *device_attr, - uint64_t *raw_fw_ver, - struct ibv_ex_query_device *cmd, - size_t cmd_core_size, - size_t cmd_size, - struct ib_uverbs_ex_query_device_resp *resp, - size_t resp_core_size, - size_t resp_size) -{ - if (resp_core_size < offsetof(struct ib_uverbs_ex_query_device_resp, - response_length) + - sizeof(resp->response_length)) - return -EINVAL; - - IBV_INIT_CMD_RESP_EX_V(cmd, cmd_core_size, cmd_size, - QUERY_DEVICE, resp, resp_core_size, - resp_size); - cmd->ibcmd.comp_mask = 0; - cmd->ibcmd.reserved = 0; - - if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) - return -errno; - - VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); - - memset(device_attr->fw_ver, 0, sizeof(device_attr->fw_ver)); - copy_query_dev_fields(device_attr, &resp->base, raw_fw_ver); - - return 0; -} - -int efa_ib_cmd_query_port(struct ibv_context *context, uint8_t port_num, - struct ibv_port_attr *port_attr, - struct ibv_query_port *cmd, size_t cmd_size) -{ - struct ib_uverbs_query_port_resp resp; - - IBV_INIT_CMD_RESP(cmd, cmd_size, QUERY_PORT, &resp, sizeof(resp)); - cmd->ibcmd.port_num = port_num; - memset(cmd->ibcmd.reserved, 0, sizeof(cmd->ibcmd.reserved)); - - if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) - return -errno; - - 
VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof(resp)); - - port_attr->state = resp.state; - port_attr->max_mtu = resp.max_mtu; - port_attr->active_mtu = resp.active_mtu; - port_attr->gid_tbl_len = resp.gid_tbl_len; - port_attr->port_cap_flags = resp.port_cap_flags; - port_attr->max_msg_sz = resp.max_msg_sz; - port_attr->bad_pkey_cntr = resp.bad_pkey_cntr; - port_attr->qkey_viol_cntr = resp.qkey_viol_cntr; - port_attr->pkey_tbl_len = resp.pkey_tbl_len; - port_attr->lid = resp.lid; - port_attr->sm_lid = resp.sm_lid; - port_attr->lmc = resp.lmc; - port_attr->max_vl_num = resp.max_vl_num; - port_attr->sm_sl = resp.sm_sl; - port_attr->subnet_timeout = resp.subnet_timeout; - port_attr->init_type_reply = resp.init_type_reply; - port_attr->active_width = resp.active_width; - port_attr->active_speed = resp.active_speed; - port_attr->phys_state = resp.phys_state; - port_attr->link_layer = resp.link_layer; - - return 0; -} - -int efa_ib_cmd_alloc_pd(struct ibv_context *context, struct ibv_pd *pd, - struct ibv_alloc_pd *cmd, size_t cmd_size, - struct ib_uverbs_alloc_pd_resp *resp, size_t resp_size) -{ - IBV_INIT_CMD_RESP(cmd, cmd_size, ALLOC_PD, resp, resp_size); - - if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) - return -errno; - - VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); - - pd->handle = resp->pd_handle; - pd->context = context; - - return 0; -} - -int efa_ib_cmd_dealloc_pd(struct ibv_pd *pd) -{ - struct ibv_dealloc_pd cmd; - - IBV_INIT_CMD(&cmd, sizeof(cmd), DEALLOC_PD); - cmd.ibcmd.pd_handle = pd->handle; - - if (write(pd->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) - return -errno; - - return 0; -} - -int efa_ib_cmd_reg_mr(struct ibv_pd *pd, void *addr, size_t length, - uint64_t hca_va, int access, - struct ibv_mr *mr, struct ibv_reg_mr *cmd, - size_t cmd_size, - struct ib_uverbs_reg_mr_resp *resp, size_t resp_size) -{ - IBV_INIT_CMD_RESP(cmd, cmd_size, REG_MR, resp, resp_size); - - cmd->ibcmd.start = (uintptr_t)addr; - cmd->ibcmd.length = length; - cmd->ibcmd.hca_va = hca_va; - cmd->ibcmd.pd_handle = pd->handle; - cmd->ibcmd.access_flags = access; - - if (write(pd->context->cmd_fd, cmd, cmd_size) != cmd_size) - return -errno; - - VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); - - mr->handle = resp->mr_handle; - mr->lkey = resp->lkey; - mr->rkey = resp->rkey; - mr->context = pd->context; - - return 0; -} - -int efa_ib_cmd_dereg_mr(struct ibv_mr *mr) -{ - struct ibv_dereg_mr cmd; - - IBV_INIT_CMD(&cmd, sizeof(cmd), DEREG_MR); - cmd.ibcmd.mr_handle = mr->handle; - - if (write(mr->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) - return -errno; - - return 0; -} - -int efa_ib_cmd_create_cq(struct ibv_context *context, int cqe, - struct ibv_cq *cq, - struct ibv_create_cq *cmd, size_t cmd_size, - struct ib_uverbs_create_cq_resp *resp, size_t resp_size) -{ - IBV_INIT_CMD_RESP(cmd, cmd_size, CREATE_CQ, resp, resp_size); - cmd->ibcmd.user_handle = (uintptr_t)cq; - cmd->ibcmd.cqe = cqe; - cmd->ibcmd.comp_vector = 0; - cmd->ibcmd.comp_channel = -1; - cmd->ibcmd.reserved = 0; - - if (write(context->cmd_fd, cmd, cmd_size) != cmd_size) - return -errno; - - VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); - - cq->handle = resp->cq_handle; - cq->cqe = resp->cqe; - cq->context = context; - - return 0; -} - -int efa_ib_cmd_destroy_cq(struct ibv_cq *cq) -{ - struct ibv_destroy_cq cmd; - struct ib_uverbs_destroy_cq_resp resp; - - IBV_INIT_CMD_RESP(&cmd, sizeof(cmd), DESTROY_CQ, &resp, sizeof(resp)); - cmd.ibcmd.cq_handle = cq->handle; - cmd.ibcmd.reserved = 0; - - if (write(cq->context->cmd_fd, &cmd, 
sizeof(cmd)) != sizeof(cmd)) - return -errno; - - VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof(resp)); - - pthread_mutex_lock(&cq->mutex); - while (cq->comp_events_completed != resp.comp_events_reported || - cq->async_events_completed != resp.async_events_reported) - pthread_cond_wait(&cq->cond, &cq->mutex); - pthread_mutex_unlock(&cq->mutex); - - return 0; -} - -int efa_ib_cmd_create_qp(struct ibv_pd *pd, - struct ibv_qp *qp, struct ibv_qp_init_attr *attr, - struct ibv_create_qp *cmd, size_t cmd_size, - struct ib_uverbs_create_qp_resp *resp, size_t resp_size) -{ - IBV_INIT_CMD_RESP(cmd, cmd_size, CREATE_QP, resp, resp_size); - - cmd->ibcmd.user_handle = (uintptr_t)qp; - cmd->ibcmd.pd_handle = pd->handle; - cmd->ibcmd.send_cq_handle = attr->send_cq->handle; - cmd->ibcmd.recv_cq_handle = attr->recv_cq->handle; - cmd->ibcmd.srq_handle = attr->srq ? attr->srq->handle : 0; - cmd->ibcmd.max_send_wr = attr->cap.max_send_wr; - cmd->ibcmd.max_recv_wr = attr->cap.max_recv_wr; - cmd->ibcmd.max_send_sge = attr->cap.max_send_sge; - cmd->ibcmd.max_recv_sge = attr->cap.max_recv_sge; - cmd->ibcmd.max_inline_data = attr->cap.max_inline_data; - cmd->ibcmd.sq_sig_all = attr->sq_sig_all; - cmd->ibcmd.qp_type = attr->qp_type; - cmd->ibcmd.is_srq = !!attr->srq; - cmd->ibcmd.reserved = 0; - - if (write(pd->context->cmd_fd, cmd, cmd_size) != cmd_size) - return -errno; - - VALGRIND_MAKE_MEM_DEFINED(resp, resp_size); - - qp->handle = resp->qp_handle; - qp->qp_num = resp->qpn; - qp->context = pd->context; - - attr->cap.max_recv_sge = resp->max_recv_sge; - attr->cap.max_send_sge = resp->max_send_sge; - attr->cap.max_recv_wr = resp->max_recv_wr; - attr->cap.max_send_wr = resp->max_send_wr; - attr->cap.max_inline_data = resp->max_inline_data; - - return 0; -} - -int efa_ib_cmd_destroy_qp(struct ibv_qp *qp) -{ - struct ibv_destroy_qp cmd; - struct ib_uverbs_destroy_qp_resp resp; - - IBV_INIT_CMD_RESP(&cmd, sizeof(cmd), DESTROY_QP, &resp, sizeof(resp)); - cmd.ibcmd.qp_handle = qp->handle; - cmd.ibcmd.reserved = 0; - - if (write(qp->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) - return -errno; - - VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof(resp)); - - pthread_mutex_lock(&qp->mutex); - while (qp->events_completed != resp.events_reported) - pthread_cond_wait(&qp->cond, &qp->mutex); - pthread_mutex_unlock(&qp->mutex); - - return 0; -} - -int efa_ib_cmd_create_ah(struct ibv_pd *pd, struct ibv_ah *ah, - struct ibv_ah_attr *attr, - struct ib_uverbs_create_ah_resp *resp, - size_t resp_size) -{ - struct ibv_create_ah cmd; - - IBV_INIT_CMD_RESP(&cmd, sizeof(cmd), CREATE_AH, resp, resp_size); - cmd.ibcmd.user_handle = (uintptr_t)ah; - cmd.ibcmd.pd_handle = pd->handle; - cmd.ibcmd.reserved = 0; - cmd.ibcmd.attr.dlid = attr->dlid; - cmd.ibcmd.attr.sl = attr->sl; - cmd.ibcmd.attr.src_path_bits = attr->src_path_bits; - cmd.ibcmd.attr.static_rate = attr->static_rate; - cmd.ibcmd.attr.is_global = attr->is_global; - cmd.ibcmd.attr.port_num = attr->port_num; - cmd.ibcmd.attr.reserved = 0; - cmd.ibcmd.attr.grh.flow_label = attr->grh.flow_label; - cmd.ibcmd.attr.grh.sgid_index = attr->grh.sgid_index; - cmd.ibcmd.attr.grh.hop_limit = attr->grh.hop_limit; - cmd.ibcmd.attr.grh.traffic_class = attr->grh.traffic_class; - cmd.ibcmd.attr.grh.reserved = 0; - memcpy(cmd.ibcmd.attr.grh.dgid, attr->grh.dgid.raw, 16); - - if (write(pd->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) - return -errno; - - VALGRIND_MAKE_MEM_DEFINED(&resp, resp_size); - - ah->handle = resp->ah_handle; - ah->context = pd->context; - - return 0; -} - -int 
efa_ib_cmd_destroy_ah(struct ibv_ah *ah) -{ - struct ibv_destroy_ah cmd; - - IBV_INIT_CMD(&cmd, sizeof(cmd), DESTROY_AH); - cmd.ibcmd.ah_handle = ah->handle; - - if (write(ah->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) - return -errno; - - return 0; -} diff --git a/prov/efa/src/efa_verbs/efa_ib_cmd.h b/prov/efa/src/efa_verbs/efa_ib_cmd.h deleted file mode 100644 index 7e0132fcf9e..00000000000 --- a/prov/efa/src/efa_verbs/efa_ib_cmd.h +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2005 PathScale, Inc. All rights reserved. - * Copyright (c) 2017-2019 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef EFA_IB_CMD_H_ -#define EFA_IB_CMD_H_ - -#include "infiniband/efa_kern-abi.h" - -struct ibv_get_context { - struct ib_uverbs_cmd_hdr hdr; - struct ib_uverbs_get_context ibcmd; -}; - -struct ibv_query_device { - struct ib_uverbs_cmd_hdr hdr; - struct ib_uverbs_query_device ibcmd; -}; - -struct ibv_ex_query_device { - struct ib_uverbs_cmd_hdr hdr; - struct ib_uverbs_ex_cmd_hdr ex_hdr; - struct ib_uverbs_ex_query_device ibcmd; -}; - -struct ibv_query_port { - struct ib_uverbs_cmd_hdr hdr; - struct ib_uverbs_query_port ibcmd; -}; - -struct ibv_alloc_pd { - struct ib_uverbs_cmd_hdr hdr; - struct ib_uverbs_alloc_pd ibcmd; -}; - -struct ibv_dealloc_pd { - struct ib_uverbs_cmd_hdr hdr; - struct ib_uverbs_dealloc_pd ibcmd; -}; - -struct ibv_reg_mr { - struct ib_uverbs_cmd_hdr hdr; - struct ib_uverbs_reg_mr ibcmd; -}; - -struct ibv_dereg_mr { - struct ib_uverbs_cmd_hdr hdr; - struct ib_uverbs_dereg_mr ibcmd; -}; - -struct ibv_create_cq { - struct ib_uverbs_cmd_hdr hdr; - struct ib_uverbs_create_cq ibcmd; -}; - -struct ibv_destroy_cq { - struct ib_uverbs_cmd_hdr hdr; - struct ib_uverbs_destroy_cq ibcmd; -}; - -struct ibv_create_qp { - struct ib_uverbs_cmd_hdr hdr; - struct ib_uverbs_create_qp ibcmd; -}; - -struct ibv_destroy_qp { - struct ib_uverbs_cmd_hdr hdr; - struct ib_uverbs_destroy_qp ibcmd; -}; - -struct ibv_create_ah { - struct ib_uverbs_cmd_hdr hdr; - struct ib_uverbs_create_ah ibcmd; -}; - -struct ibv_destroy_ah { - struct ib_uverbs_cmd_hdr hdr; - struct ib_uverbs_destroy_ah ibcmd; -}; - -int efa_ib_cmd_get_context(struct ibv_context *context, struct ibv_get_context *cmd, - size_t cmd_size, struct ib_uverbs_get_context_resp *resp, - size_t resp_size); -int efa_ib_cmd_query_device(struct ibv_context *context, - struct ibv_device_attr *device_attr, - uint64_t *raw_fw_ver, - struct ibv_query_device *cmd, size_t cmd_size); -int efa_ib_cmd_query_device_ex(struct ibv_context *context, - struct ibv_device_attr *device_attr, - uint64_t *raw_fw_ver, - struct ibv_ex_query_device *cmd, - size_t cmd_core_size, - size_t cmd_size, - struct ib_uverbs_ex_query_device_resp *resp, - size_t resp_core_size, - size_t resp_size); -int efa_ib_cmd_query_port(struct ibv_context *context, uint8_t port_num, - struct ibv_port_attr *port_attr, - struct ibv_query_port *cmd, size_t cmd_size); -int efa_ib_cmd_alloc_pd(struct ibv_context *context, struct ibv_pd *pd, - struct ibv_alloc_pd *cmd, size_t cmd_size, - struct ib_uverbs_alloc_pd_resp *resp, size_t resp_size); -int efa_ib_cmd_dealloc_pd(struct ibv_pd *pd); -int efa_ib_cmd_reg_mr(struct ibv_pd *pd, void *addr, size_t length, - uint64_t hca_va, int access, - struct ibv_mr *mr, struct ibv_reg_mr *cmd, - size_t cmd_size, - struct ib_uverbs_reg_mr_resp *resp, size_t resp_size); -int efa_ib_cmd_dereg_mr(struct ibv_mr *mr); -int efa_ib_cmd_create_cq(struct ibv_context *context, int cqe, - struct ibv_cq *cq, - struct ibv_create_cq *cmd, size_t cmd_size, - struct ib_uverbs_create_cq_resp *resp, size_t resp_size); -int efa_ib_cmd_destroy_cq(struct ibv_cq *cq); -int efa_ib_cmd_create_qp(struct ibv_pd *pd, - struct ibv_qp *qp, struct ibv_qp_init_attr *attr, - struct ibv_create_qp *cmd, size_t cmd_size, - struct ib_uverbs_create_qp_resp *resp, size_t resp_size); -int efa_ib_cmd_destroy_qp(struct ibv_qp *qp); -int efa_ib_cmd_create_ah(struct ibv_pd *pd, struct ibv_ah *ah, - struct ibv_ah_attr *attr, - struct ib_uverbs_create_ah_resp *resp, - size_t resp_size); -int efa_ib_cmd_destroy_ah(struct ibv_ah *ah); - -#endif /* EFA_IB_CMD_H_ */ diff --git 
a/prov/efa/src/efa_verbs/efa_init.c b/prov/efa/src/efa_verbs/efa_init.c deleted file mode 100644 index 40407063825..00000000000 --- a/prov/efa/src/efa_verbs/efa_init.c +++ /dev/null @@ -1,385 +0,0 @@ -/* - * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2006 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2017-2018 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#if HAVE_CONFIG_H -# include -#endif /* HAVE_CONFIG_H */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "efa_ib.h" - -#ifndef PCI_VENDOR_ID_AMAZON -#define PCI_VENDOR_ID_AMAZON 0x1d0f -#endif /* PCI_VENDOR_ID_AMAZON */ - -#ifndef PCI_DEV_ID_EFA_VF -#define PCI_DEV_ID_EFA_VF 0xefa0 -#endif - -#define HCA(v, d) { .vendor = PCI_VENDOR_ID_##v, .device = d } - -struct { - unsigned vendor; - unsigned device; -} hca_table[] = { - HCA(AMAZON, PCI_DEV_ID_EFA_VF), -}; - -HIDDEN int abi_ver; - -struct ibv_sysfs_dev { - char sysfs_name[IBV_SYSFS_NAME_MAX]; - char ibdev_name[IBV_SYSFS_NAME_MAX]; - char sysfs_path[IBV_SYSFS_PATH_MAX]; - char ibdev_path[IBV_SYSFS_PATH_MAX]; - struct ibv_sysfs_dev *next; - int abi_ver; - int have_driver; -}; - -char *get_sysfs_path(void) -{ - char *env = NULL; - char *sysfs_path = NULL; - int len; - - /* - * Only follow use path passed in through the calling user's - * environment if we're not running SUID. - */ - if (getuid() == geteuid()) - env = getenv("SYSFS_PATH"); - - if (env) { - sysfs_path = strndup(env, IBV_SYSFS_PATH_MAX); - len = strlen(sysfs_path); - while (len > 0 && sysfs_path[len - 1] == '/') { - --len; - sysfs_path[len] = '\0'; - } - } else { - sysfs_path = strndup("/sys", IBV_SYSFS_PATH_MAX); - } - - return sysfs_path; -} - -/* Return true if the snprintf succeeded, false if there was truncation or - * error. - */ -static inline bool __good_snprintf(size_t len, int rc) -{ - return (rc < len && rc >= 0); -} - -#define check_snprintf(buf, len, fmt, ...) 
\ - __good_snprintf(len, snprintf(buf, len, fmt, ##__VA_ARGS__)) - -static int efa_find_sysfs_devs(struct ibv_sysfs_dev **sysfs_dev_list) -{ - char class_path[IBV_SYSFS_PATH_MAX]; - DIR *class_dir; - struct dirent *dent; - struct ibv_sysfs_dev *sysfs_dev = NULL; - char *sysfs_path; - char value[8]; - int ret = 0; - - sysfs_path = get_sysfs_path(); - if (!sysfs_path) - return -ENOMEM; - if (!check_snprintf(class_path, sizeof(class_path), - "%s/class/infiniband_verbs", sysfs_path)) { - ret = -ENOMEM; - goto sysfs_path_free; - } - - class_dir = opendir(class_path); - if (!class_dir) { - EFA_DBG(FI_LOG_CORE, "Opendir error: %d (%s)\n", errno, - strerror(errno)); - ret = errno; - goto sysfs_path_free; - } - - *sysfs_dev_list = NULL; - while ((dent = readdir(class_dir))) { - struct stat buf; - - if (dent->d_name[0] == '.') - continue; - - if (!sysfs_dev) - sysfs_dev = malloc(sizeof(*sysfs_dev)); - if (!sysfs_dev) { - ret = -ENOMEM; - goto class_dir_close; - } - - if (!check_snprintf(sysfs_dev->sysfs_path, sizeof(sysfs_dev->sysfs_path), - "%s/%s", class_path, dent->d_name)) - continue; - - if (stat(sysfs_dev->sysfs_path, &buf)) { - EFA_INFO(FI_LOG_FABRIC, "couldn't stat '%s'.\n", - sysfs_dev->sysfs_path); - continue; - } - - if (!S_ISDIR(buf.st_mode)) - continue; - - if (!check_snprintf(sysfs_dev->sysfs_name, sizeof(sysfs_dev->sysfs_name), - "%s", dent->d_name)) - continue; - - if (fi_read_file(sysfs_dev->sysfs_path, "ibdev", - sysfs_dev->ibdev_name, - sizeof(sysfs_dev->ibdev_name)) < 0) { - EFA_INFO(FI_LOG_FABRIC, "No ibdev class attr for '%s'.\n", - dent->d_name); - continue; - } - - sysfs_dev->ibdev_name[sizeof(sysfs_dev->ibdev_name) - 1] = '\0'; - - if (strncmp(sysfs_dev->ibdev_name, "efa_", 4) != 0) - continue; - - if (!check_snprintf(sysfs_dev->ibdev_path, - sizeof(sysfs_dev->ibdev_path), - "%s/class/infiniband/%s", sysfs_path, - sysfs_dev->ibdev_name)) - continue; - - sysfs_dev->next = *sysfs_dev_list; - sysfs_dev->have_driver = 0; - if (fi_read_file(sysfs_dev->sysfs_path, "abi_version", - value, sizeof(value)) > 0) - sysfs_dev->abi_ver = strtol(value, NULL, 10); - else - sysfs_dev->abi_ver = 0; - - *sysfs_dev_list = sysfs_dev; - sysfs_dev = NULL; - } - - if (sysfs_dev) - free(sysfs_dev); - -class_dir_close: - closedir(class_dir); -sysfs_path_free: - free(sysfs_path); - return ret; -} - -static struct verbs_device *driver_init(const char *uverbs_sys_path, int abi_version) -{ - char value[8]; - struct efa_device *dev; - unsigned vendor, device; - int i; - - if (fi_read_file(uverbs_sys_path, "device/vendor", value, - sizeof(value)) < 0) - return NULL; - vendor = strtol(value, NULL, 16); - - if (fi_read_file(uverbs_sys_path, "device/device", value, - sizeof(value)) < 0) - return NULL; - device = strtol(value, NULL, 16); - - for (i = 0; i < ARRAY_SIZE(hca_table); ++i) - if (vendor == hca_table[i].vendor && - device == hca_table[i].device) - goto found; - - return NULL; - -found: - dev = calloc(1, sizeof(*dev)); - if (!dev) { - EFA_WARN(FI_LOG_FABRIC, "Couldn't allocate device for %s\n", - uverbs_sys_path); - return NULL; - } - - dev->page_size = sysconf(_SC_PAGESIZE); - dev->abi_version = abi_version; - - return &dev->verbs_dev; -} - -static struct ibv_device *device_init(struct ibv_sysfs_dev *sysfs_dev) -{ - struct verbs_device *vdev; - struct ibv_device *dev; - - vdev = driver_init(sysfs_dev->sysfs_path, sysfs_dev->abi_ver); - if (!vdev) - return NULL; - - dev = &vdev->device; - - strcpy(dev->dev_name, sysfs_dev->sysfs_name); - strcpy(dev->dev_path, sysfs_dev->sysfs_path); - 
strcpy(dev->name, sysfs_dev->ibdev_name); - strcpy(dev->ibdev_path, sysfs_dev->ibdev_path); - - return dev; -} - -static int check_abi_version(const char *path) -{ - char value[8]; - - if (fi_read_file(path, "class/infiniband_verbs/abi_version", - value, sizeof(value)) < 0) { - return -ENOSYS; - } - - abi_ver = strtol(value, NULL, 10); - - if (abi_ver < IB_USER_VERBS_MIN_ABI_VERSION || - abi_ver > IB_USER_VERBS_MAX_ABI_VERSION) { - EFA_WARN(FI_LOG_FABRIC, "Kernel ABI version %d doesn't match library version %d.\n", - abi_ver, IB_USER_VERBS_MAX_ABI_VERSION); - return -ENOSYS; - } - - return 0; -} - -static void check_memlock_limit(void) -{ - struct rlimit rlim; - - if (!geteuid()) - return; - - if (getrlimit(RLIMIT_MEMLOCK, &rlim)) { - EFA_INFO(FI_LOG_FABRIC, "getrlimit(RLIMIT_MEMLOCK) failed.\n"); - return; - } - - if (rlim.rlim_cur <= 32768) - EFA_INFO(FI_LOG_FABRIC, - "RLIMIT_MEMLOCK is %lu bytes. This will severely limit memory registrations.\n", - rlim.rlim_cur); -} - -static void add_device(struct ibv_device *dev, - struct ibv_device ***dev_list, - int *num_devices, - int *list_size) -{ - struct ibv_device **new_list; - - if (*list_size <= *num_devices) { - *list_size = *list_size ? *list_size * 2 : 1; - new_list = realloc(*dev_list, *list_size * sizeof(*new_list)); - if (!new_list) - return; - *dev_list = new_list; - } - - (*dev_list)[(*num_devices)++] = dev; -} - -HIDDEN int efa_ib_init(struct ibv_device ***list) -{ - struct ibv_sysfs_dev *sysfs_dev_list; - struct ibv_sysfs_dev *sysfs_dev; - struct ibv_sysfs_dev *next_dev; - struct ibv_device *device; - int num_devices = 0; - int list_size = 0; - char *sysfs_path; - int ret; - - *list = NULL; - - sysfs_path = get_sysfs_path(); - if (!sysfs_path) - return -ENOSYS; - - ret = check_abi_version(sysfs_path); - if (ret) - goto err_free_path; - - check_memlock_limit(); - - ret = efa_find_sysfs_devs(&sysfs_dev_list); - if (ret) - goto err_free_path; - - sysfs_dev = sysfs_dev_list; - while (sysfs_dev) { - device = device_init(sysfs_dev); - if (device) { - add_device(device, list, &num_devices, &list_size); - sysfs_dev->have_driver = 1; - } - sysfs_dev = sysfs_dev->next; - } - - sysfs_dev = sysfs_dev_list; - while (sysfs_dev) { - next_dev = sysfs_dev->next; - free(sysfs_dev); - sysfs_dev = next_dev; - } - - free(sysfs_path); - - return num_devices; - -err_free_path: - free(sysfs_path); - return ret; -} diff --git a/prov/efa/src/efa_verbs/efa_io_defs.h b/prov/efa/src/efa_verbs/efa_io_defs.h deleted file mode 100644 index b2ee54250d2..00000000000 --- a/prov/efa/src/efa_verbs/efa_io_defs.h +++ /dev/null @@ -1,654 +0,0 @@ -/* - * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef _EFA_IO_H_ -#define _EFA_IO_H_ - -#define EFA_IO_TX_DESC_NUM_BUFS 2 -#define EFA_IO_TX_DESC_INLINE_MAX_SIZE 32 -#define EFA_IO_TX_DESC_IMM_DATA_SIZE 4 - -enum efa_io_queue_type { - /* send queue (of a QP) */ - EFA_IO_SEND_QUEUE = 1, - /* recv queue (of a QP) */ - EFA_IO_RECV_QUEUE = 2, -}; - -enum efa_io_send_op_type { - /* invalid op */ - EFA_IO_INVALID_OP = 0, - /* send message */ - EFA_IO_SEND = 1, - /* RDMA read, future, not supported yet */ - EFA_IO_RDMA_READ = 2, - /* RDMA write, future, not supported yet */ - EFA_IO_RDMA_WRITE = 3, -}; - -enum efa_io_comp_status { - /* Successful completion */ - EFA_IO_COMP_STATUS_OK = 0, - /* Flushed during QP destroy */ - EFA_IO_COMP_STATUS_FLUSHED = 1, - /* Internal QP error */ - EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR = 2, - /* Bad operation type */ - EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE = 3, - /* Bad AH */ - EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH = 4, - /* LKEY not registered or does not match IOVA */ - EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY = 5, - /* Message too long */ - EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH = 6, - /* Destination ENI is down or does not run EFA */ - EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS = 7, - /* Connection was reset by remote side */ - EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT = 8, - /* Bad dest QP number (QP does not exist or is in error state) */ - EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN = 9, - /* Destination resource not ready (no WQEs posted on RQ) */ - EFA_IO_COMP_STATUS_REMOTE_ERROR_RNR = 10, - /* Receiver SGL too short */ - EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH = 11, - /* Unexpected status returned by responder */ - EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS = 12, -}; - -struct efa_io_tx_meta_desc { - /* Verbs-generated Request ID */ - uint16_t req_id; - - /* - * control flags - * 3:0 : op_type - operation type: send/rdma/fast mem - * ops/etc - * 4 : has_imm - immediate_data field carries valid - * data. - * 5 : inline_msg - inline mode - inline message data - * follows this descriptor (no buffer descriptors). - * Note that it is different from immediate data - * 6 : meta_extension - Extended metadata. MBZ - * 7 : meta_desc - Indicates metadata descriptor. - * Must be set. - */ - uint8_t ctrl1; - - /* - * control flags - * 0 : phase - * 1 : reserved25 - MBZ - * 2 : first - Indicates first descriptor in - * transaction. Must be set. - * 3 : last - Indicates last descriptor in - * transaction. Must be set. - * 4 : comp_req - Indicates whether completion should - * be posted, after packet is transmitted. Valid only - * for the first descriptor - * 7:5 : reserved29 - MBZ - */ - uint8_t ctrl2; - - uint16_t dest_qp_num; - - /* - * If inline_msg bit is set, length of inline message in bytes, - * otherwise length of SGL (number of buffers). - */ - uint16_t length; - - /* - * immediate data: if has_imm is set, then this field is included - * within Tx message and reported in remote Rx completion. 
- */ - uint32_t immediate_data; - - uint16_t ah; - - uint16_t reserved; -}; - -/* - * Tx buffer descriptor, for any transport type. Preceded by metadata - * descriptor. - */ -struct efa_io_tx_buf_desc { - /* length in bytes */ - uint16_t length; - - /* - * control flags - * 6:0 : reserved16 - * 7 : meta_desc - MBZ - */ - uint8_t ctrl1; - - /* - * control flags - * 0 : phase - phase bit - * 1 : reserved25 - MBZ - * 2 : first - Indicates first descriptor in - * transaction. MBZ - * 3 : last - Indicates last descriptor in transaction - * 7:4 : reserved28 - MBZ - */ - uint8_t ctrl; - - /* memory translation key */ - uint32_t lkey; - - /* Buffer address bits[31:0] */ - uint32_t buf_addr_lo; - - /* Buffer address bits[63:32] */ - uint32_t buf_addr_hi; -}; - -/* Tx meta descriptor for UD */ -struct efa_io_tx_ud_meta { - /* Queue key */ - uint32_t qkey; - - uint8_t reserved[12]; -}; - -struct efa_io_remote_mem_addr { - /* length in bytes */ - uint16_t length; - - /* - * control flags - * 5:0 : reserved16 - * 6 : meta_extension - Must be set - * 7 : meta_desc - Must be set - */ - uint8_t ctrl1; - - /* - * control flags - * 0 : phase - phase bit - * 1 : reserved25 - MBZ - * 2 : first - Indicates first descriptor in - * transaction. MBZ - * 3 : last - Indicates last descriptor in transaction - * 7:4 : reserved28 - MBZ - */ - uint8_t ctrl; - - /* remote memory translation key */ - uint32_t rkey; - - /* Buffer address bits[31:0] */ - uint32_t buf_addr_lo; - - /* Buffer address bits[63:32] */ - uint32_t buf_addr_hi; -}; - -/* - * Tx WQE, composed of tx meta descriptors followed by either tx buffer - * descriptors or inline data - */ -struct efa_io_tx_wqe { - /* TX meta */ - struct efa_io_tx_meta_desc common; - - union { - /* Tx meta for UD */ - struct efa_io_tx_ud_meta ud; - - /* Reserved Tx meta for SRD */ - uint8_t srd_padding[16]; - - /* RDMA memory address */ - struct efa_io_remote_mem_addr rdma_mem_addr; - } u; - - union { - /* buffer descriptors */ - struct efa_io_tx_buf_desc sgl[2]; - - uint8_t inline_data[32]; - } data; -}; - -/* - * Rx buffer descriptor; RX WQE is composed of one or more RX buffer - * descriptors. - */ -struct efa_io_rx_desc { - /* Buffer address bits[31:0] */ - uint32_t buf_addr_lo; - - /* Buffer Pointer[63:32] */ - uint32_t buf_addr_hi; - - /* Verbs-generated request id. */ - uint16_t req_id; - - /* Length in bytes. */ - uint16_t length; - - /* - * LKey and control flags - * 23:0 : lkey - * 29:24 : reserved - MBZ - * 30 : first - Indicates first descriptor in WQE - * 31 : last - Indicates last descriptor in WQE - */ - uint32_t lkey_ctrl; -}; - -/* Common IO completion descriptor */ -struct efa_io_cdesc_common { - /* - * verbs-generated request ID, as provided in the completed tx or rx - * descriptor. 
- */ - uint16_t req_id; - - uint8_t status; - - /* - * flags - * 0 : phase - Phase bit - * 2:1 : q_type - enum efa_io_queue_type: send/recv - * 3 : has_imm - indicates that immediate data is - * present - for RX completions only - * 4 : wide_completion - indicates that wide - * completion format is used - * 7:5 : reserved29 - */ - uint8_t flags; - - /* local QP number */ - uint16_t qp_num; - - /* Transferred length */ - uint16_t length; -}; - -/* Tx completion descriptor */ -struct efa_io_tx_cdesc { - /* Common completion info */ - struct efa_io_cdesc_common common; -}; - -/* Rx Completion Descriptor */ -struct efa_io_rx_cdesc { - /* Common completion info */ - struct efa_io_cdesc_common common; - - /* Remote Address Handle FW index, 0xFFFF indicates invalid ah */ - uint16_t ah; - - uint16_t src_qp_num; - - /* Immediate data */ - uint32_t imm; -}; - -/* Extended Rx Completion Descriptor */ -struct efa_io_rx_cdesc_wide { - /* Base RX completion info */ - struct efa_io_rx_cdesc rx_cdesc_base; - - /* - * Word 0 of remote (source) address, needed only for in-band - * ad-hoc AH support - */ - uint32_t src_addr_0; - - /* - * Word 1 of remote (source) address, needed only for in-band - * ad-hoc AH support - */ - uint32_t src_addr_1; - - /* - * Word 2 of remote (source) address, needed only for in-band - * ad-hoc AH support - */ - uint32_t src_addr_2; - - /* - * Word 3 of remote (source) address, needed only for in-band - * ad-hoc AH support - */ - uint32_t src_addr_3; -}; - -/* tx_meta_desc */ -#define EFA_IO_TX_META_DESC_OP_TYPE_MASK GENMASK(3, 0) -#define EFA_IO_TX_META_DESC_HAS_IMM_SHIFT 4 -#define EFA_IO_TX_META_DESC_HAS_IMM_MASK BIT(4) -#define EFA_IO_TX_META_DESC_INLINE_MSG_SHIFT 5 -#define EFA_IO_TX_META_DESC_INLINE_MSG_MASK BIT(5) -#define EFA_IO_TX_META_DESC_META_EXTENSION_SHIFT 6 -#define EFA_IO_TX_META_DESC_META_EXTENSION_MASK BIT(6) -#define EFA_IO_TX_META_DESC_META_DESC_SHIFT 7 -#define EFA_IO_TX_META_DESC_META_DESC_MASK BIT(7) -#define EFA_IO_TX_META_DESC_PHASE_MASK BIT(0) -#define EFA_IO_TX_META_DESC_FIRST_SHIFT 2 -#define EFA_IO_TX_META_DESC_FIRST_MASK BIT(2) -#define EFA_IO_TX_META_DESC_LAST_SHIFT 3 -#define EFA_IO_TX_META_DESC_LAST_MASK BIT(3) -#define EFA_IO_TX_META_DESC_COMP_REQ_SHIFT 4 -#define EFA_IO_TX_META_DESC_COMP_REQ_MASK BIT(4) - -/* tx_buf_desc */ -#define EFA_IO_TX_BUF_DESC_META_DESC_SHIFT 7 -#define EFA_IO_TX_BUF_DESC_META_DESC_MASK BIT(7) -#define EFA_IO_TX_BUF_DESC_PHASE_MASK BIT(0) -#define EFA_IO_TX_BUF_DESC_FIRST_SHIFT 2 -#define EFA_IO_TX_BUF_DESC_FIRST_MASK BIT(2) -#define EFA_IO_TX_BUF_DESC_LAST_SHIFT 3 -#define EFA_IO_TX_BUF_DESC_LAST_MASK BIT(3) - -/* remote_mem_addr */ -#define EFA_IO_REMOTE_MEM_ADDR_META_EXTENSION_SHIFT 6 -#define EFA_IO_REMOTE_MEM_ADDR_META_EXTENSION_MASK BIT(6) -#define EFA_IO_REMOTE_MEM_ADDR_META_DESC_SHIFT 7 -#define EFA_IO_REMOTE_MEM_ADDR_META_DESC_MASK BIT(7) -#define EFA_IO_REMOTE_MEM_ADDR_PHASE_MASK BIT(0) -#define EFA_IO_REMOTE_MEM_ADDR_FIRST_SHIFT 2 -#define EFA_IO_REMOTE_MEM_ADDR_FIRST_MASK BIT(2) -#define EFA_IO_REMOTE_MEM_ADDR_LAST_SHIFT 3 -#define EFA_IO_REMOTE_MEM_ADDR_LAST_MASK BIT(3) - -/* rx_desc */ -#define EFA_IO_RX_DESC_LKEY_MASK GENMASK(23, 0) -#define EFA_IO_RX_DESC_FIRST_SHIFT 30 -#define EFA_IO_RX_DESC_FIRST_MASK BIT(30) -#define EFA_IO_RX_DESC_LAST_SHIFT 31 -#define EFA_IO_RX_DESC_LAST_MASK BIT(31) - -/* cdesc_common */ -#define EFA_IO_CDESC_COMMON_PHASE_MASK BIT(0) -#define EFA_IO_CDESC_COMMON_Q_TYPE_SHIFT 1 -#define EFA_IO_CDESC_COMMON_Q_TYPE_MASK GENMASK(2, 1) -#define EFA_IO_CDESC_COMMON_HAS_IMM_SHIFT 3 
-#define EFA_IO_CDESC_COMMON_HAS_IMM_MASK BIT(3) -#define EFA_IO_CDESC_COMMON_WIDE_COMPLETION_SHIFT 4 -#define EFA_IO_CDESC_COMMON_WIDE_COMPLETION_MASK BIT(4) - -static inline uint8_t get_efa_io_tx_meta_desc_op_type(const struct efa_io_tx_meta_desc *p) -{ - return p->ctrl1 & EFA_IO_TX_META_DESC_OP_TYPE_MASK; -} - -static inline void set_efa_io_tx_meta_desc_op_type(struct efa_io_tx_meta_desc *p, uint8_t val) -{ - p->ctrl1 |= val & EFA_IO_TX_META_DESC_OP_TYPE_MASK; -} - -static inline uint8_t get_efa_io_tx_meta_desc_has_imm(const struct efa_io_tx_meta_desc *p) -{ - return (p->ctrl1 & EFA_IO_TX_META_DESC_HAS_IMM_MASK) >> EFA_IO_TX_META_DESC_HAS_IMM_SHIFT; -} - -static inline void set_efa_io_tx_meta_desc_has_imm(struct efa_io_tx_meta_desc *p, uint8_t val) -{ - p->ctrl1 |= (val << EFA_IO_TX_META_DESC_HAS_IMM_SHIFT) & EFA_IO_TX_META_DESC_HAS_IMM_MASK; -} - -static inline uint8_t get_efa_io_tx_meta_desc_inline_msg(const struct efa_io_tx_meta_desc *p) -{ - return (p->ctrl1 & EFA_IO_TX_META_DESC_INLINE_MSG_MASK) >> EFA_IO_TX_META_DESC_INLINE_MSG_SHIFT; -} - -static inline void set_efa_io_tx_meta_desc_inline_msg(struct efa_io_tx_meta_desc *p, uint8_t val) -{ - p->ctrl1 |= (val << EFA_IO_TX_META_DESC_INLINE_MSG_SHIFT) & EFA_IO_TX_META_DESC_INLINE_MSG_MASK; -} - -static inline uint8_t get_efa_io_tx_meta_desc_meta_extension(const struct efa_io_tx_meta_desc *p) -{ - return (p->ctrl1 & EFA_IO_TX_META_DESC_META_EXTENSION_MASK) >> EFA_IO_TX_META_DESC_META_EXTENSION_SHIFT; -} - -static inline void set_efa_io_tx_meta_desc_meta_extension(struct efa_io_tx_meta_desc *p, uint8_t val) -{ - p->ctrl1 |= (val << EFA_IO_TX_META_DESC_META_EXTENSION_SHIFT) & EFA_IO_TX_META_DESC_META_EXTENSION_MASK; -} - -static inline uint8_t get_efa_io_tx_meta_desc_meta_desc(const struct efa_io_tx_meta_desc *p) -{ - return (p->ctrl1 & EFA_IO_TX_META_DESC_META_DESC_MASK) >> EFA_IO_TX_META_DESC_META_DESC_SHIFT; -} - -static inline void set_efa_io_tx_meta_desc_meta_desc(struct efa_io_tx_meta_desc *p, uint8_t val) -{ - p->ctrl1 |= (val << EFA_IO_TX_META_DESC_META_DESC_SHIFT) & EFA_IO_TX_META_DESC_META_DESC_MASK; -} - -static inline uint8_t get_efa_io_tx_meta_desc_phase(const struct efa_io_tx_meta_desc *p) -{ - return p->ctrl2 & EFA_IO_TX_META_DESC_PHASE_MASK; -} - -static inline void set_efa_io_tx_meta_desc_phase(struct efa_io_tx_meta_desc *p, uint8_t val) -{ - p->ctrl2 |= val & EFA_IO_TX_META_DESC_PHASE_MASK; -} - -static inline uint8_t get_efa_io_tx_meta_desc_first(const struct efa_io_tx_meta_desc *p) -{ - return (p->ctrl2 & EFA_IO_TX_META_DESC_FIRST_MASK) >> EFA_IO_TX_META_DESC_FIRST_SHIFT; -} - -static inline void set_efa_io_tx_meta_desc_first(struct efa_io_tx_meta_desc *p, uint8_t val) -{ - p->ctrl2 |= (val << EFA_IO_TX_META_DESC_FIRST_SHIFT) & EFA_IO_TX_META_DESC_FIRST_MASK; -} - -static inline uint8_t get_efa_io_tx_meta_desc_last(const struct efa_io_tx_meta_desc *p) -{ - return (p->ctrl2 & EFA_IO_TX_META_DESC_LAST_MASK) >> EFA_IO_TX_META_DESC_LAST_SHIFT; -} - -static inline void set_efa_io_tx_meta_desc_last(struct efa_io_tx_meta_desc *p, uint8_t val) -{ - p->ctrl2 |= (val << EFA_IO_TX_META_DESC_LAST_SHIFT) & EFA_IO_TX_META_DESC_LAST_MASK; -} - -static inline uint8_t get_efa_io_tx_meta_desc_comp_req(const struct efa_io_tx_meta_desc *p) -{ - return (p->ctrl2 & EFA_IO_TX_META_DESC_COMP_REQ_MASK) >> EFA_IO_TX_META_DESC_COMP_REQ_SHIFT; -} - -static inline void set_efa_io_tx_meta_desc_comp_req(struct efa_io_tx_meta_desc *p, uint8_t val) -{ - p->ctrl2 |= (val << EFA_IO_TX_META_DESC_COMP_REQ_SHIFT) & EFA_IO_TX_META_DESC_COMP_REQ_MASK; -} - 
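The deleted accessors above all follow one shift/mask idiom: each flag gets a *_SHIFT and *_MASK pair, the getter masks and shifts the control byte, and the setter ORs the shifted value back in. Because the setters never clear bits, they rely on the descriptor being zero-initialized before any flag is written. The standalone sketch below reproduces that idiom for a single hypothetical flag; the BIT() helper is written out locally since its real definition comes from headers outside this diff.

/* Minimal sketch of the shift/mask accessor pattern used by the deleted
 * efa_io_defs.h helpers. SKETCH_BIT() stands in for the BIT() macro whose
 * definition is not part of this patch. */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_BIT(b)           (1u << (b))

/* Hypothetical flag occupying bit 5 of a control byte, mirroring
 * EFA_IO_TX_META_DESC_INLINE_MSG_{SHIFT,MASK}. */
#define SKETCH_INLINE_MSG_SHIFT 5
#define SKETCH_INLINE_MSG_MASK  SKETCH_BIT(5)

static inline uint8_t get_inline_msg(uint8_t ctrl)
{
	return (ctrl & SKETCH_INLINE_MSG_MASK) >> SKETCH_INLINE_MSG_SHIFT;
}

static inline void set_inline_msg(uint8_t *ctrl, uint8_t val)
{
	/* OR-only, like the original set_* helpers: the descriptor is
	 * expected to be zeroed before any flag is set. */
	*ctrl |= (val << SKETCH_INLINE_MSG_SHIFT) & SKETCH_INLINE_MSG_MASK;
}

int main(void)
{
	uint8_t ctrl = 0;

	set_inline_msg(&ctrl, 1);
	printf("ctrl=0x%02x inline_msg=%u\n", (unsigned)ctrl,
	       (unsigned)get_inline_msg(ctrl));
	return 0;
}

An OR-only setter keeps the fast path to a single read-modify-write and moves the cost of clearing bits to descriptor initialization.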
-static inline uint8_t get_efa_io_tx_buf_desc_meta_desc(const struct efa_io_tx_buf_desc *p) -{ - return (p->ctrl1 & EFA_IO_TX_BUF_DESC_META_DESC_MASK) >> EFA_IO_TX_BUF_DESC_META_DESC_SHIFT; -} - -static inline void set_efa_io_tx_buf_desc_meta_desc(struct efa_io_tx_buf_desc *p, uint8_t val) -{ - p->ctrl1 |= (val << EFA_IO_TX_BUF_DESC_META_DESC_SHIFT) & EFA_IO_TX_BUF_DESC_META_DESC_MASK; -} - -static inline uint8_t get_efa_io_tx_buf_desc_phase(const struct efa_io_tx_buf_desc *p) -{ - return p->ctrl & EFA_IO_TX_BUF_DESC_PHASE_MASK; -} - -static inline void set_efa_io_tx_buf_desc_phase(struct efa_io_tx_buf_desc *p, uint8_t val) -{ - p->ctrl |= val & EFA_IO_TX_BUF_DESC_PHASE_MASK; -} - -static inline uint8_t get_efa_io_tx_buf_desc_first(const struct efa_io_tx_buf_desc *p) -{ - return (p->ctrl & EFA_IO_TX_BUF_DESC_FIRST_MASK) >> EFA_IO_TX_BUF_DESC_FIRST_SHIFT; -} - -static inline void set_efa_io_tx_buf_desc_first(struct efa_io_tx_buf_desc *p, uint8_t val) -{ - p->ctrl |= (val << EFA_IO_TX_BUF_DESC_FIRST_SHIFT) & EFA_IO_TX_BUF_DESC_FIRST_MASK; -} - -static inline uint8_t get_efa_io_tx_buf_desc_last(const struct efa_io_tx_buf_desc *p) -{ - return (p->ctrl & EFA_IO_TX_BUF_DESC_LAST_MASK) >> EFA_IO_TX_BUF_DESC_LAST_SHIFT; -} - -static inline void set_efa_io_tx_buf_desc_last(struct efa_io_tx_buf_desc *p, uint8_t val) -{ - p->ctrl |= (val << EFA_IO_TX_BUF_DESC_LAST_SHIFT) & EFA_IO_TX_BUF_DESC_LAST_MASK; -} - -static inline uint8_t get_efa_io_remote_mem_addr_meta_extension(const struct efa_io_remote_mem_addr *p) -{ - return (p->ctrl1 & EFA_IO_REMOTE_MEM_ADDR_META_EXTENSION_MASK) >> EFA_IO_REMOTE_MEM_ADDR_META_EXTENSION_SHIFT; -} - -static inline void set_efa_io_remote_mem_addr_meta_extension(struct efa_io_remote_mem_addr *p, uint8_t val) -{ - p->ctrl1 |= (val << EFA_IO_REMOTE_MEM_ADDR_META_EXTENSION_SHIFT) & EFA_IO_REMOTE_MEM_ADDR_META_EXTENSION_MASK; -} - -static inline uint8_t get_efa_io_remote_mem_addr_meta_desc(const struct efa_io_remote_mem_addr *p) -{ - return (p->ctrl1 & EFA_IO_REMOTE_MEM_ADDR_META_DESC_MASK) >> EFA_IO_REMOTE_MEM_ADDR_META_DESC_SHIFT; -} - -static inline void set_efa_io_remote_mem_addr_meta_desc(struct efa_io_remote_mem_addr *p, uint8_t val) -{ - p->ctrl1 |= (val << EFA_IO_REMOTE_MEM_ADDR_META_DESC_SHIFT) & EFA_IO_REMOTE_MEM_ADDR_META_DESC_MASK; -} - -static inline uint8_t get_efa_io_remote_mem_addr_phase(const struct efa_io_remote_mem_addr *p) -{ - return p->ctrl & EFA_IO_REMOTE_MEM_ADDR_PHASE_MASK; -} - -static inline void set_efa_io_remote_mem_addr_phase(struct efa_io_remote_mem_addr *p, uint8_t val) -{ - p->ctrl |= val & EFA_IO_REMOTE_MEM_ADDR_PHASE_MASK; -} - -static inline uint8_t get_efa_io_remote_mem_addr_first(const struct efa_io_remote_mem_addr *p) -{ - return (p->ctrl & EFA_IO_REMOTE_MEM_ADDR_FIRST_MASK) >> EFA_IO_REMOTE_MEM_ADDR_FIRST_SHIFT; -} - -static inline void set_efa_io_remote_mem_addr_first(struct efa_io_remote_mem_addr *p, uint8_t val) -{ - p->ctrl |= (val << EFA_IO_REMOTE_MEM_ADDR_FIRST_SHIFT) & EFA_IO_REMOTE_MEM_ADDR_FIRST_MASK; -} - -static inline uint8_t get_efa_io_remote_mem_addr_last(const struct efa_io_remote_mem_addr *p) -{ - return (p->ctrl & EFA_IO_REMOTE_MEM_ADDR_LAST_MASK) >> EFA_IO_REMOTE_MEM_ADDR_LAST_SHIFT; -} - -static inline void set_efa_io_remote_mem_addr_last(struct efa_io_remote_mem_addr *p, uint8_t val) -{ - p->ctrl |= (val << EFA_IO_REMOTE_MEM_ADDR_LAST_SHIFT) & EFA_IO_REMOTE_MEM_ADDR_LAST_MASK; -} - -static inline uint32_t get_efa_io_rx_desc_lkey(const struct efa_io_rx_desc *p) -{ - return p->lkey_ctrl & EFA_IO_RX_DESC_LKEY_MASK; -} - 
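Both the deleted tx buffer descriptor and rx descriptor layouts carry buffer addresses as a buf_addr_lo/buf_addr_hi pair of 32-bit words rather than one 64-bit field. A minimal sketch of the split-and-recombine step is shown below; the struct and helper names are illustrative, not the provider's own.

/* Sketch of splitting a 64-bit buffer address into the buf_addr_lo /
 * buf_addr_hi pair used by the deleted efa_io_tx_buf_desc and
 * efa_io_rx_desc layouts. Names here are hypothetical. */
#include <stdint.h>
#include <assert.h>

struct sketch_buf_desc {
	uint32_t buf_addr_lo;	/* address bits[31:0] */
	uint32_t buf_addr_hi;	/* address bits[63:32] */
};

static void sketch_set_addr(struct sketch_buf_desc *d, uint64_t addr)
{
	d->buf_addr_lo = (uint32_t)(addr & 0xffffffffu);
	d->buf_addr_hi = (uint32_t)(addr >> 32);
}

static uint64_t sketch_get_addr(const struct sketch_buf_desc *d)
{
	return ((uint64_t)d->buf_addr_hi << 32) | d->buf_addr_lo;
}

int main(void)
{
	struct sketch_buf_desc d;
	uint64_t addr = 0x123456789abcdef0ULL;

	sketch_set_addr(&d, addr);
	assert(sketch_get_addr(&d) == addr);
	return 0;
}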
-static inline void set_efa_io_rx_desc_lkey(struct efa_io_rx_desc *p, uint32_t val) -{ - p->lkey_ctrl |= val & EFA_IO_RX_DESC_LKEY_MASK; -} - -static inline uint32_t get_efa_io_rx_desc_first(const struct efa_io_rx_desc *p) -{ - return (p->lkey_ctrl & EFA_IO_RX_DESC_FIRST_MASK) >> EFA_IO_RX_DESC_FIRST_SHIFT; -} - -static inline void set_efa_io_rx_desc_first(struct efa_io_rx_desc *p, uint32_t val) -{ - p->lkey_ctrl |= (val << EFA_IO_RX_DESC_FIRST_SHIFT) & EFA_IO_RX_DESC_FIRST_MASK; -} - -static inline uint32_t get_efa_io_rx_desc_last(const struct efa_io_rx_desc *p) -{ - return (p->lkey_ctrl & EFA_IO_RX_DESC_LAST_MASK) >> EFA_IO_RX_DESC_LAST_SHIFT; -} - -static inline void set_efa_io_rx_desc_last(struct efa_io_rx_desc *p, uint32_t val) -{ - p->lkey_ctrl |= (val << EFA_IO_RX_DESC_LAST_SHIFT) & EFA_IO_RX_DESC_LAST_MASK; -} - -static inline uint8_t get_efa_io_cdesc_common_phase(const struct efa_io_cdesc_common *p) -{ - return p->flags & EFA_IO_CDESC_COMMON_PHASE_MASK; -} - -static inline void set_efa_io_cdesc_common_phase(struct efa_io_cdesc_common *p, uint8_t val) -{ - p->flags |= val & EFA_IO_CDESC_COMMON_PHASE_MASK; -} - -static inline uint8_t get_efa_io_cdesc_common_q_type(const struct efa_io_cdesc_common *p) -{ - return (p->flags & EFA_IO_CDESC_COMMON_Q_TYPE_MASK) >> EFA_IO_CDESC_COMMON_Q_TYPE_SHIFT; -} - -static inline void set_efa_io_cdesc_common_q_type(struct efa_io_cdesc_common *p, uint8_t val) -{ - p->flags |= (val << EFA_IO_CDESC_COMMON_Q_TYPE_SHIFT) & EFA_IO_CDESC_COMMON_Q_TYPE_MASK; -} - -static inline uint8_t get_efa_io_cdesc_common_has_imm(const struct efa_io_cdesc_common *p) -{ - return (p->flags & EFA_IO_CDESC_COMMON_HAS_IMM_MASK) >> EFA_IO_CDESC_COMMON_HAS_IMM_SHIFT; -} - -static inline void set_efa_io_cdesc_common_has_imm(struct efa_io_cdesc_common *p, uint8_t val) -{ - p->flags |= (val << EFA_IO_CDESC_COMMON_HAS_IMM_SHIFT) & EFA_IO_CDESC_COMMON_HAS_IMM_MASK; -} - -static inline uint8_t get_efa_io_cdesc_common_wide_completion(const struct efa_io_cdesc_common *p) -{ - return (p->flags & EFA_IO_CDESC_COMMON_WIDE_COMPLETION_MASK) >> EFA_IO_CDESC_COMMON_WIDE_COMPLETION_SHIFT; -} - -static inline void set_efa_io_cdesc_common_wide_completion(struct efa_io_cdesc_common *p, uint8_t val) -{ - p->flags |= (val << EFA_IO_CDESC_COMMON_WIDE_COMPLETION_SHIFT) & EFA_IO_CDESC_COMMON_WIDE_COMPLETION_MASK; -} - -#endif /* _EFA_IO_H_ */ diff --git a/prov/efa/src/rxr/rxr.h b/prov/efa/src/rxr/rxr.h index 8201b12ec9d..32873a096a5 100644 --- a/prov/efa/src/rxr/rxr.h +++ b/prov/efa/src/rxr/rxr.h @@ -61,16 +61,33 @@ #include #include #include +#include -#define RXR_MAJOR_VERSION (2) -#define RXR_MINOR_VERSION (0) -#define RXR_PROTOCOL_VERSION (2) -#define RXR_FI_VERSION FI_VERSION(1, 8) +#include "rxr_pkt_entry.h" +#include "rxr_pkt_type.h" + +/* + * EFA support interoperability between protocol version 4 and above, + * and version 4 is considered the base version. 
+ */ +#define RXR_BASE_PROTOCOL_VERSION (4) +#define RXR_CUR_PROTOCOL_VERSION (4) +#define RXR_NUM_PROTOCOL_VERSION (RXR_CUR_PROTOCOL_VERSION - RXR_BASE_PROTOCOL_VERSION + 1) +#define RXR_MAX_PROTOCOL_VERSION (100) + +#define RXR_FI_VERSION OFI_VERSION_LATEST #define RXR_IOV_LIMIT (4) #ifdef ENABLE_EFA_POISONING extern const uint32_t rxr_poison_value; +static inline void rxr_poison_mem_region(uint32_t *ptr, size_t size) +{ + int i; + + for (i = 0; i < size / sizeof(rxr_poison_value); i++) + memcpy(ptr + i, &rxr_poison_value, sizeof(rxr_poison_value)); +} #endif /* @@ -84,7 +101,7 @@ extern const uint32_t rxr_poison_value; #define RXR_RECVWIN_SIZE (16384) #define RXR_DEF_CQ_SIZE (8192) #define RXR_REMOTE_CQ_DATA_LEN (8) -#define RXR_MIN_AV_SIZE (8192) + /* maximum timeout for RNR backoff (microseconds) */ #define RXR_DEF_RNR_MAX_TIMEOUT (1000000) /* bounds for random RNR backoff timeout */ @@ -140,17 +157,23 @@ extern const uint32_t rxr_poison_value; #define RXR_MULTI_RECV_CONSUMER BIT_ULL(5) /* - * for RMA + * Flag to tell if the transmission is using FI_DELIVERY_COMPLETE + * protocols */ -#define RXR_WRITE (1 << 6) -#define RXR_READ_REQ (1 << 7) -#define RXR_READ_DATA (1 << 8) + +#define RXR_DELIVERY_COMPLETE_REQUESTED BIT_ULL(6) /* - * Used to provide protocol compatibility across versions that include a - * credit request along with the RTS and those that do not + * Flag to tell if the sender + * receives the receipt packet for the tx_entry. */ -#define RXR_CREDIT_REQUEST BIT_ULL(9) +#define RXR_RECEIPT_RECEIVED BIT_ULL(7) + +/* + * Flag to tell that + * long message protocol is used + */ +#define RXR_LONGCTS_PROTOCOL BIT_ULL(8) /* * OFI flags @@ -160,6 +183,7 @@ extern const uint32_t rxr_poison_value; * 60 - 63 provider specific */ #define RXR_NO_COMPLETION BIT_ULL(60) +#define RXR_NO_COUNTER BIT_ULL(61) /* * RM flags @@ -169,20 +193,32 @@ extern const uint32_t rxr_poison_value; #define RXR_MTU_MAX_LIMIT BIT_ULL(15) + + +extern struct fi_info *shm_info; + extern struct fi_provider *lower_efa_prov; extern struct fi_provider rxr_prov; extern struct fi_info rxr_info; extern struct rxr_env rxr_env; extern struct fi_fabric_attr rxr_fabric_attr; extern struct util_prov rxr_util_prov; +extern struct efa_ep_addr *local_efa_addr; struct rxr_env { int rx_window_size; int tx_min_credits; int tx_max_credits; int tx_queue_size; - int enable_sas_ordering; + int use_device_rdma; + int use_zcpy_rx; + int zcpy_rx_seed; + int enable_shm_transfer; + int shm_av_size; + int shm_max_medium_size; int recvwin_size; + int readcopy_pool_size; + int atomrsp_pool_size; int cq_size; size_t max_memcpy_size; size_t mtu_size; @@ -194,50 +230,33 @@ struct rxr_env { int rx_copy_ooo; int max_timeout; int timeout_interval; + size_t efa_cq_read_size; + size_t shm_cq_read_size; + size_t efa_max_medium_msg_size; + size_t efa_min_read_msg_size; + size_t efa_min_read_write_size; + size_t efa_read_segment_size; }; -enum rxr_pkt_type { - RXR_RTS_PKT = 1, - RXR_CONNACK_PKT, - /* Large message types */ - RXR_CTS_PKT, - RXR_DATA_PKT, - RXR_READRSP_PKT, -}; - -/* pkt_entry types for rx pkts */ -enum rxr_pkt_entry_type { - RXR_PKT_ENTRY_POSTED = 1, /* entries that are posted to the core */ - RXR_PKT_ENTRY_UNEXP, /* entries used to stage unexpected msgs */ - RXR_PKT_ENTRY_OOO /* entries used to stage out-of-order RTS */ -}; - -/* pkt_entry state for retransmit tracking */ -enum rxr_pkt_entry_state { - RXR_PKT_ENTRY_FREE = 0, - RXR_PKT_ENTRY_IN_USE, - RXR_PKT_ENTRY_RNR_RETRANSMIT, +enum rxr_lower_ep_type { + EFA_EP = 1, + SHM_EP, }; 
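The RXR_BASE/CUR/NUM_PROTOCOL_VERSION macros added above size the per-version feature tables that appear later in this patch (features[RXR_NUM_PROTOCOL_VERSION] in struct rxr_ep and features[RXR_MAX_NUM_PROTOCOLS] in struct rxr_peer). One plausible way to index such a table relative to the base version is sketched below; the helper name and the out-of-range handling are assumptions made for the example, not code from this patch.

/* Illustrative sketch only: indexing a per-version feature table sized by
 * RXR_NUM_PROTOCOL_VERSION relative to RXR_BASE_PROTOCOL_VERSION. */
#include <stdint.h>
#include <stdio.h>

#define RXR_BASE_PROTOCOL_VERSION (4)
#define RXR_CUR_PROTOCOL_VERSION  (4)
#define RXR_NUM_PROTOCOL_VERSION  \
	(RXR_CUR_PROTOCOL_VERSION - RXR_BASE_PROTOCOL_VERSION + 1)

static uint64_t features[RXR_NUM_PROTOCOL_VERSION];

/* Hypothetical helper: returns the feature flags slot for 'version', or
 * NULL when the version falls outside the supported window. */
static uint64_t *feature_flags_for_version(int version)
{
	if (version < RXR_BASE_PROTOCOL_VERSION ||
	    version > RXR_CUR_PROTOCOL_VERSION)
		return NULL;
	return &features[version - RXR_BASE_PROTOCOL_VERSION];
}

int main(void)
{
	uint64_t *flags = feature_flags_for_version(4);

	if (flags)
		printf("version 4 feature flags: 0x%llx\n",
		       (unsigned long long)*flags);
	return 0;
}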
enum rxr_x_entry_type { RXR_TX_ENTRY = 1, RXR_RX_ENTRY, + RXR_READ_ENTRY, }; enum rxr_tx_comm_type { RXR_TX_FREE = 0, /* tx_entry free state */ - RXR_TX_RTS, /* tx_entry sending RTS message */ + RXR_TX_REQ, /* tx_entry sending REQ packet */ RXR_TX_SEND, /* tx_entry sending data in progress */ - RXR_TX_QUEUED_RTS, /* tx_entry was unable to send RTS */ - RXR_TX_QUEUED_RTS_RNR, /* tx_entry RNR sending RTS packet */ + RXR_TX_QUEUED_SHM_RMA, /* tx_entry was unable to send RMA operations over shm provider */ + RXR_TX_QUEUED_CTRL, /* tx_entry was unable to send ctrl packet */ + RXR_TX_QUEUED_REQ_RNR, /* tx_entry RNR sending REQ packet */ RXR_TX_QUEUED_DATA_RNR, /* tx_entry RNR sending data packets */ - RXR_TX_SENT_READRSP, /* tx_entry (on remote EP) sent - * read response (FI_READ only) - */ - RXR_TX_QUEUED_READRSP, /* tx_entry (on remote EP) was - * unable to send read response - * (FI_READ only) - */ RXR_TX_WAIT_READ_FINISH, /* tx_entry (on initiating EP) wait * for rx_entry to finish receiving * (FI_READ only) @@ -246,62 +265,50 @@ enum rxr_tx_comm_type { enum rxr_rx_comm_type { RXR_RX_FREE = 0, /* rx_entry free state */ - RXR_RX_INIT, /* rx_entry ready to recv RTS */ + RXR_RX_INIT, /* rx_entry ready to recv RTM */ RXR_RX_UNEXP, /* rx_entry unexp msg waiting for post recv */ - RXR_RX_MATCHED, /* rx_entry matched with RTS msg */ + RXR_RX_MATCHED, /* rx_entry matched with RTM */ RXR_RX_RECV, /* rx_entry large msg recv data pkts */ - RXR_RX_QUEUED_CTS, /* rx_entry was unable to send CTS */ + RXR_RX_QUEUED_CTRL, /* rx_entry was unable to send ctrl packet */ + RXR_RX_QUEUED_EOR, /* rx_entry was unable to send EOR over shm */ RXR_RX_QUEUED_CTS_RNR, /* rx_entry RNR sending CTS */ RXR_RX_WAIT_READ_FINISH, /* rx_entry wait for send to finish, FI_READ */ + RXR_RX_WAIT_ATOMRSP_SENT, /* rx_entry wait for atomrsp packet sent completion */ }; -enum rxr_peer_state { - RXR_PEER_FREE = 0, /* rxr_peer free state */ - RXR_PEER_CONNREQ, /* RTS with endpoint address sent to peer */ - RXR_PEER_ACKED, /* RXR_CONNACK_PKT received from peer */ +enum rxr_rx_buf_owner { + RXR_RX_PROV_BUF = 0, /* Bounce buffers allocated and owned by provider */ + RXR_RX_USER_BUF, /* Recv buffers posted by applications */ }; -/* peer is in backoff, not allowed to send */ -#define RXR_PEER_IN_BACKOFF (1ULL << 0) -/* peer backoff was increased during this loop of the progress engine */ -#define RXR_PEER_BACKED_OFF (1ULL << 1) +#define RXR_PEER_REQ_SENT BIT_ULL(0) /* sent a REQ to the peer, peer should send a handshake back */ +#define RXR_PEER_HANDSHAKE_SENT BIT_ULL(1) +#define RXR_PEER_HANDSHAKE_RECEIVED BIT_ULL(2) +#define RXR_PEER_IN_BACKOFF BIT_ULL(3) /* peer is in backoff, not allowed to send */ +#define RXR_PEER_BACKED_OFF BIT_ULL(4) /* peer backoff was increased during this loop of the progress engine */ struct rxr_fabric { struct util_fabric util_fabric; struct fid_fabric *lower_fabric; + struct fid_fabric *shm_fabric; #ifdef RXR_PERF_ENABLED struct ofi_perfset perf_set; #endif }; -struct rxr_mr { - struct fid_mr mr_fid; - struct fid_mr *msg_mr; - struct rxr_domain *domain; -}; - -struct rxr_av_entry { - uint8_t addr[RXR_MAX_NAME_LENGTH]; - fi_addr_t rdm_addr; - UT_hash_handle hh; -}; - -struct rxr_av { - struct util_av util_av; - struct fid_av *rdm_av; - struct rxr_av_entry *av_map; - - int rdm_av_used; - size_t rdm_addrlen; -}; +#define RXR_MAX_NUM_PROTOCOLS (RXR_MAX_PROTOCOL_VERSION - RXR_BASE_PROTOCOL_VERSION + 1) struct rxr_peer { bool tx_init; /* tracks initialization of tx state */ bool rx_init; /* tracks initialization of 
rx state */ + bool is_self; /* self flag */ + bool is_local; /* local/remote peer flag */ + fi_addr_t shm_fiaddr; /* fi_addr_t addr from shm provider */ struct rxr_robuf *robuf; /* tracks expected msg_id on rx */ uint32_t next_msg_id; /* sender's view of msg_id */ - enum rxr_peer_state state; /* state of CM protocol with peer */ - unsigned int rnr_state; /* tracks RNR backoff for peer */ + uint32_t flags; + uint32_t maxproto; /* maximum supported protocol version by this peer */ + uint64_t features[RXR_MAX_NUM_PROTOCOLS]; /* the feature flag for each version */ size_t tx_pending; /* tracks pending tx ops to this peer */ uint16_t tx_credits; /* available send credits */ uint16_t rx_credits; /* available credits to allocate */ @@ -310,7 +317,29 @@ struct rxr_peer { int timeout_interval; /* initial RNR timeout value */ int rnr_timeout_exp; /* RNR timeout exponentation calc val */ struct dlist_entry rnr_entry; /* linked to rxr_ep peer_backoff_list */ - struct dlist_entry entry; /* linked to rxr_ep peer_list */ +}; + +struct rxr_queued_ctrl_info { + int type; + int inject; +}; + +struct rxr_atomic_hdr { + /* atomic_op is different from tx_op */ + uint32_t atomic_op; + uint32_t datatype; +}; + +/* extra information that is not included in fi_msg_atomic + * used by fetch atomic and compare atomic. + * resp stands for response + * comp stands for compare + */ +struct rxr_atomic_ex { + struct iovec resp_iov[RXR_IOV_LIMIT]; + int resp_iov_count; + struct iovec comp_iov[RXR_IOV_LIMIT]; + int comp_iov_count; }; struct rxr_rx_entry { @@ -324,6 +353,7 @@ struct rxr_rx_entry { */ uint32_t tx_id; uint32_t rx_id; + uint32_t op; /* * The following two varibales are for emulated RMA fi_read only @@ -331,18 +361,23 @@ struct rxr_rx_entry { uint32_t rma_loc_tx_id; uint32_t rma_initiator_rx_id; + struct rxr_atomic_hdr atomic_hdr; + uint32_t msg_id; uint64_t tag; uint64_t ignore; - uint64_t bytes_done; + uint64_t bytes_received; + uint64_t bytes_copied; int64_t window; uint16_t credit_request; + int credit_cts; uint64_t total_len; enum rxr_rx_comm_type state; + struct rxr_queued_ctrl_info queued_ctrl; uint64_t fi_flags; uint16_t rxr_flags; @@ -350,6 +385,15 @@ struct rxr_rx_entry { size_t iov_count; struct iovec iov[RXR_IOV_LIMIT]; + /* App-provided buffers and descriptors */ + void *desc[RXR_IOV_LIMIT]; + enum rxr_rx_buf_owner owner; + struct fi_msg *posted_recv; + + /* iov_count on sender side, used for large message READ over shm */ + size_t rma_iov_count; + struct fi_rma_iov rma_iov[RXR_IOV_LIMIT]; + struct fi_cq_tagged_entry cq_entry; /* entry is linked with rx entry lists in rxr_ep */ @@ -372,7 +416,8 @@ struct rxr_rx_entry { struct dlist_entry multi_recv_entry; struct rxr_rx_entry *master_entry; - struct rxr_pkt_entry *unexp_rts_pkt; + struct rxr_pkt_entry *unexp_pkt; + char *atomrsp_data; #if ENABLE_DEBUG /* linked with rx_pending_list in rxr_ep */ @@ -386,6 +431,7 @@ struct rxr_tx_entry { /* Must remain at the top */ enum rxr_x_entry_type type; + uint32_t op; fi_addr_t addr; /* @@ -407,8 +453,11 @@ struct rxr_tx_entry { uint64_t total_len; enum rxr_tx_comm_type state; + struct rxr_queued_ctrl_info queued_ctrl; uint64_t fi_flags; + uint64_t rxr_flags; + uint64_t send_flags; size_t iov_count; size_t iov_index; @@ -420,6 +469,13 @@ struct rxr_tx_entry { size_t rma_iov_count; struct fi_rma_iov rma_iov[RXR_IOV_LIMIT]; + /* App-provided reg descriptor */ + void *desc[RXR_IOV_LIMIT]; + + /* atomic related variables */ + struct rxr_atomic_hdr atomic_hdr; + struct rxr_atomic_ex atomic_ex; + /* Only used with mr 
threshold switch from memcpy */ size_t iov_mr_start; struct fid_mr *mr[RXR_IOV_LIMIT]; @@ -445,16 +501,21 @@ struct rxr_tx_entry { (*((enum rxr_x_entry_type *) \ ((unsigned char *)((pkt_entry)->x_entry)))) +enum efa_domain_type { + EFA_DOMAIN_DGRAM = 0, + EFA_DOMAIN_RDM, +}; + struct rxr_domain { struct util_domain util_domain; + enum efa_domain_type type; struct fid_domain *rdm_domain; - + size_t mtu_size; size_t addrlen; - uint8_t mr_local; + uint8_t rxr_mr_local; uint64_t rdm_mode; int do_progress; size_t cq_size; - enum fi_resource_mgmt resource_mgmt; }; struct rxr_ep { @@ -463,16 +524,24 @@ struct rxr_ep { uint8_t core_addr[RXR_MAX_NAME_LENGTH]; size_t core_addrlen; + /* per-version feature flag */ + uint64_t features[RXR_NUM_PROTOCOL_VERSION]; + /* per-peer information */ struct rxr_peer *peer; - /* free stack for reorder buffer */ - struct rxr_robuf_fs *robuf_fs; + /* bufpool for reorder buffer */ + struct ofi_bufpool *robuf_pool; /* core provider fid */ struct fid_ep *rdm_ep; struct fid_cq *rdm_cq; + /* shm provider fid */ + bool use_shm; + struct fid_ep *shm_ep; + struct fid_cq *shm_cq; + /* * RxR rx/tx queue sizes. These may be different from the core * provider's rx/tx size and will either limit the number of possible @@ -487,6 +556,12 @@ struct rxr_ep { /* core's capabilities */ uint64_t core_caps; + /* Endpoint's capability to support zero-copy rx */ + bool use_zcpy_rx; + + /* Application requested resource management support */ + int handle_resource_management; + /* rx/tx queue size of core provider */ size_t core_rx_size; size_t max_outstanding_tx; @@ -501,6 +576,12 @@ struct rxr_ep { /* core's supported tx/rx msg_order */ uint64_t core_msg_order; + /* Application's maximum msg size hint */ + size_t max_msg_size; + + /* RxR protocol's max header size */ + size_t max_proto_hdr_size; + /* tx iov limit of core provider */ size_t core_iov_limit; @@ -508,13 +589,25 @@ struct rxr_ep { size_t min_multi_recv_size; /* buffer pool for send & recv */ - struct ofi_bufpool *tx_pkt_pool; - struct ofi_bufpool *rx_pkt_pool; + struct ofi_bufpool *tx_pkt_efa_pool; + struct ofi_bufpool *rx_pkt_efa_pool; + + /* + * buffer pool for send & recv for shm as mtu size is different from + * the one of efa, and do not require local memory registration + */ + struct ofi_bufpool *tx_pkt_shm_pool; + struct ofi_bufpool *rx_pkt_shm_pool; /* staging area for unexpected and out-of-order packets */ struct ofi_bufpool *rx_unexp_pkt_pool; struct ofi_bufpool *rx_ooo_pkt_pool; + /* staging area for read copy */ + struct ofi_bufpool *rx_readcopy_pkt_pool; + int rx_readcopy_pkt_pool_used; + int rx_readcopy_pkt_pool_max_used; + #ifdef ENABLE_EFA_POISONING size_t tx_pkt_pool_entry_sz; size_t rx_pkt_pool_entry_sz; @@ -525,7 +618,17 @@ struct rxr_ep { struct ofi_bufpool *rx_entry_pool; /* datastructure to maintain read response */ struct ofi_bufpool *readrsp_tx_entry_pool; - + /* data structure to maintain read */ + struct ofi_bufpool *read_entry_pool; + /* data structure to maintain pkt rx map */ + struct ofi_bufpool *map_entry_pool; + /* rxr medium message pkt_entry to rx_entry map */ + struct rxr_pkt_rx_map *pkt_rx_map; + /* + * buffer pool for atomic response data, used by + * emulated fetch and compare atomic. 
+ */ + struct ofi_bufpool *rx_atomrsp_pool; /* rx_entries with recv buf */ struct dlist_entry rx_list; /* rx_entries without recv buf (unexpected message) */ @@ -536,16 +639,18 @@ struct rxr_ep { struct dlist_entry rx_unexp_tagged_list; /* list of pre-posted recv buffers */ struct dlist_entry rx_posted_buf_list; + /* list of pre-posted recv buffers for shm */ + struct dlist_entry rx_posted_buf_shm_list; /* tx entries with queued messages */ struct dlist_entry tx_entry_queued_list; /* rx entries with queued messages */ struct dlist_entry rx_entry_queued_list; /* tx_entries with data to be sent (large messages) */ struct dlist_entry tx_pending_list; + /* read entries with data to be read */ + struct dlist_entry read_pending_list; /* rxr_peer entries that are in backoff due to RNR */ struct dlist_entry peer_backoff_list; - /* rxr_peer entries with an allocated robuf */ - struct dlist_entry peer_list; #if ENABLE_DEBUG /* rx_entries waiting for data to arrive (large messages) */ @@ -568,10 +673,13 @@ struct rxr_ep { size_t failed_send_comps; size_t recv_comps; #endif + /* number of posted buffer for shm */ + size_t posted_bufs_shm; + size_t rx_bufs_shm_to_post; /* number of posted buffers */ - size_t posted_bufs; - size_t rx_bufs_to_post; + size_t posted_bufs_efa; + size_t rx_bufs_efa_to_post; /* number of buffers available for large messages */ size_t available_data_bufs; /* Timestamp of when available_data_bufs was exhausted. */ @@ -584,381 +692,91 @@ struct rxr_ep { #define rxr_rx_flags(rxr_ep) ((rxr_ep)->util_ep.rx_op_flags) #define rxr_tx_flags(rxr_ep) ((rxr_ep)->util_ep.tx_op_flags) -/* - * Packet fields common to all rxr packets. The other packet headers below must - * be changed if this is updated. - */ -struct rxr_base_hdr { - uint8_t type; - uint8_t version; - uint16_t flags; -}; - -#if defined(static_assert) && defined(__x86_64__) -static_assert(sizeof(struct rxr_base_hdr) == 4, "rxr_base_hdr check"); -#endif - -/* - * RTS packet structure: rts_hdr, cq_data (optional), src_addr(optional), data. 
- */ -struct rxr_rts_hdr { - uint8_t type; - uint8_t version; - uint16_t flags; - /* end of rxr_base_hdr */ - /* TODO: need to add msg_id -> tx_id mapping to remove tx_id */ - uint16_t credit_request; - uint8_t addrlen; - uint8_t rma_iov_count; - uint32_t tx_id; - uint32_t msg_id; - uint64_t tag; - uint64_t data_len; -}; - -#if defined(static_assert) && defined(__x86_64__) -static_assert(sizeof(struct rxr_rts_hdr) == 32, "rxr_rts_hdr check"); -#endif - -struct rxr_connack_hdr { - uint8_t type; - uint8_t version; - uint16_t flags; - /* end of rxr_base_hdr */ -}; /* 4 bytes */ - -#if defined(static_assert) && defined(__x86_64__) -static_assert(sizeof(struct rxr_base_hdr) == 4, "rxr_connack_hdr check"); -#endif - -struct rxr_cts_hdr { - uint8_t type; - uint8_t version; - uint16_t flags; - /* end of rxr_base_hdr */ - uint8_t pad[4]; - /* TODO: need to add msg_id -> tx_id/rx_id mapping */ - uint32_t tx_id; - uint32_t rx_id; - uint64_t window; -}; - -#if defined(static_assert) && defined(__x86_64__) -static_assert(sizeof(struct rxr_cts_hdr) == 24, "rxr_cts_hdr check"); -#endif - -struct rxr_data_hdr { - uint8_t type; - uint8_t version; - uint16_t flags; - /* end of rxr_base_hdr */ - /* TODO: need to add msg_id -> tx_id/rx_id mapping */ - uint32_t rx_id; - uint64_t seg_size; - uint64_t seg_offset; -}; - -#if defined(static_assert) && defined(__x86_64__) -static_assert(sizeof(struct rxr_data_hdr) == 24, "rxr_data_hdr check"); -#endif - -struct rxr_readrsp_hdr { - uint8_t type; - uint8_t version; - uint16_t flags; - /* end of rxr_base_hdr */ - uint8_t pad[4]; - uint32_t rx_id; - uint32_t tx_id; - uint64_t seg_size; -}; - -#if defined(static_assert) && defined(__x86_64__) -static_assert(sizeof(struct rxr_readrsp_hdr) == sizeof(struct rxr_data_hdr), "rxr_readrsp_hdr check"); -#endif - -/* - * Control header without completion data. We will send more data with the RTS - * packet if RXR_REMOTE_CQ_DATA is not set. - */ -struct rxr_ctrl_hdr { - union { - struct rxr_base_hdr base_hdr; - struct rxr_rts_hdr rts_hdr; - struct rxr_connack_hdr connack_hdr; - struct rxr_cts_hdr cts_hdr; - }; -}; - /* * Control header with completion data. CQ data length is static. */ #define RXR_CQ_DATA_SIZE (8) -struct rxr_ctrl_cq_hdr { - union { - struct rxr_base_hdr base_hdr; - struct rxr_rts_hdr rts_hdr; - struct rxr_connack_hdr connack_hdr; - struct rxr_cts_hdr cts_hdr; - }; - uint64_t cq_data; -}; -/* - * There are three packet types: - * - Control packet with completion queue data - * - Control packet without completion queue data - * - Data packet - * - * All start with rxr_base_hdr so it is safe to cast between them to check - * values in that structure. 
- */ -struct rxr_ctrl_cq_pkt { - struct rxr_ctrl_cq_hdr hdr; - char data[]; -}; - -struct rxr_ctrl_pkt { - struct rxr_ctrl_hdr hdr; - char data[]; -}; - -struct rxr_data_pkt { - struct rxr_data_hdr hdr; - char data[]; -}; - -struct rxr_readrsp_pkt { - struct rxr_readrsp_hdr hdr; - char data[]; -}; - -struct rxr_pkt_entry { - /* for rx/tx_entry queued_pkts list */ - struct dlist_entry entry; -#if ENABLE_DEBUG - /* for tx/rx debug list or posted buf list */ - struct dlist_entry dbg_entry; -#endif - void *x_entry; /* pointer to rxr rx/tx entry */ - size_t pkt_size; - struct fid_mr *mr; - fi_addr_t addr; - void *pkt; /* rxr_ctrl_*_pkt, or rxr_data_pkt */ - enum rxr_pkt_entry_type type; - enum rxr_pkt_entry_state state; -#if ENABLE_DEBUG -/* pad to cache line size of 64 bytes */ - uint8_t pad[48]; -#endif -}; - -#if defined(static_assert) && defined(__x86_64__) -#if ENABLE_DEBUG -static_assert(sizeof(struct rxr_pkt_entry) == 128, "rxr_pkt_entry check"); -#else -static_assert(sizeof(struct rxr_pkt_entry) == 64, "rxr_pkt_entry check"); -#endif -#endif - -OFI_DECL_RECVWIN_BUF(struct rxr_pkt_entry*, rxr_robuf); -DECLARE_FREESTACK(struct rxr_robuf, rxr_robuf_fs); - -#define RXR_CTRL_HDR_SIZE (sizeof(struct rxr_ctrl_cq_hdr)) - -#define RXR_CTRL_HDR_SIZE_NO_CQ (sizeof(struct rxr_ctrl_hdr)) - -#define RXR_CONNACK_HDR_SIZE (sizeof(struct rxr_connack_hdr)) - -#define RXR_CTS_HDR_SIZE (sizeof(struct rxr_cts_hdr)) - -#define RXR_DATA_HDR_SIZE (sizeof(struct rxr_data_hdr)) - -#define RXR_READRSP_HDR_SIZE (sizeof(struct rxr_readrsp_hdr)) +static inline void rxr_copy_shm_cq_entry(struct fi_cq_tagged_entry *cq_tagged_entry, + struct fi_cq_data_entry *shm_cq_entry) +{ + cq_tagged_entry->op_context = shm_cq_entry->op_context; + cq_tagged_entry->flags = shm_cq_entry->flags; + cq_tagged_entry->len = shm_cq_entry->len; + cq_tagged_entry->buf = shm_cq_entry->buf; + cq_tagged_entry->data = shm_cq_entry->data; + cq_tagged_entry->tag = 0; // No tag for RMA; +} static inline struct rxr_peer *rxr_ep_get_peer(struct rxr_ep *ep, fi_addr_t addr) { return &ep->peer[addr]; } -static inline void rxr_ep_peer_init(struct rxr_ep *ep, struct rxr_peer *peer) +static inline void rxr_setup_msg(struct fi_msg *msg, const struct iovec *iov, void **desc, + size_t count, fi_addr_t addr, void *context, uint32_t data) +{ + msg->msg_iov = iov; + msg->desc = desc; + msg->iov_count = count; + msg->addr = addr; + msg->context = context; + msg->data = data; +} + +static inline void rxr_ep_peer_init_rx(struct rxr_ep *ep, struct rxr_peer *peer) { assert(!peer->rx_init); - peer->robuf = freestack_pop(ep->robuf_fs); + + peer->robuf = ofi_buf_alloc(ep->robuf_pool); + assert(peer->robuf); peer->robuf = ofi_recvwin_buf_alloc(peer->robuf, rxr_env.recvwin_size); - assert(peer->robuf); - dlist_insert_tail(&peer->entry, &ep->peer_list); peer->rx_credits = rxr_env.rx_window_size; peer->rx_init = 1; - - /* - * If the endpoint has never sent a message to this peer thus far, - * initialize tx state as well. 
- */ - if (!peer->tx_init) { - peer->tx_credits = rxr_env.tx_max_credits; - peer->tx_init = 1; - } } -struct rxr_rx_entry *rxr_ep_get_rx_entry(struct rxr_ep *ep, - const struct iovec *iov, - size_t iov_count, uint64_t tag, - uint64_t ignore, void *context, - fi_addr_t addr, uint32_t op, - uint64_t flags); - -struct rxr_rx_entry *rxr_ep_rx_entry_init(struct rxr_ep *ep, - struct rxr_rx_entry *rx_entry, - const struct iovec *iov, - size_t iov_count, uint64_t tag, - uint64_t ignore, void *context, - fi_addr_t addr, uint32_t op, - uint64_t flags); - -void rxr_generic_tx_entry_init(struct rxr_ep *ep, - struct rxr_tx_entry *tx_entry, - const struct iovec *iov, - size_t iov_count, - const struct fi_rma_iov *rma_iov, - size_t rma_iov_count, - fi_addr_t addr, uint64_t tag, - uint64_t data, void *context, - uint32_t op, uint64_t flags); - -struct rxr_tx_entry *rxr_ep_tx_entry_init(struct rxr_ep *rxr_ep, - const struct iovec *iov, - size_t iov_count, - const struct fi_rma_iov *rma_iov, - size_t rma_iov_count, - fi_addr_t addr, uint64_t tag, - uint64_t data, void *context, - uint32_t op, uint64_t flags); - -ssize_t rxr_tx(struct fid_ep *ep, const struct iovec *iov, size_t iov_count, - const struct fi_rma_iov *rma_iov, size_t rma_iov_count, - fi_addr_t addr, uint64_t tag, uint64_t data, void *context, - uint32_t op, uint64_t flags); - -static inline void -rxr_copy_pkt_entry(struct rxr_ep *ep, - struct rxr_pkt_entry *dest, - struct rxr_pkt_entry *src, - enum rxr_pkt_entry_type type) +static inline void rxr_ep_peer_init_tx(struct rxr_peer *peer) { - FI_DBG(&rxr_prov, FI_LOG_EP_CTRL, - "Copying packet (type %d) out of posted buffer\n", type); - assert(src->type == RXR_PKT_ENTRY_POSTED); - memcpy(dest, src, sizeof(struct rxr_pkt_entry)); - dest->pkt = (struct rxr_pkt *)((char *)dest + sizeof(*dest)); - memcpy(dest->pkt, src->pkt, ep->mtu_size); - dest->type = type; - dlist_init(&dest->entry); -#if ENABLE_DEBUG - dlist_init(&dest->dbg_entry); -#endif - dest->state = RXR_PKT_ENTRY_IN_USE; + assert(!peer->tx_init); + peer->tx_credits = rxr_env.tx_max_credits; + peer->tx_init = 1; } -static inline struct rxr_pkt_entry* -rxr_get_pkt_entry(struct rxr_ep *ep, struct ofi_bufpool *pkt_pool) -{ - struct rxr_pkt_entry *pkt_entry; - void *mr = NULL; +struct efa_ep_addr *rxr_ep_raw_addr(struct rxr_ep *ep); - pkt_entry = ofi_buf_alloc_ex(pkt_pool, &mr); - if (!pkt_entry) - return NULL; -#ifdef ENABLE_EFA_POISONING - memset(pkt_entry, 0, sizeof(*pkt_entry)); -#endif - dlist_init(&pkt_entry->entry); -#if ENABLE_DEBUG - dlist_init(&pkt_entry->dbg_entry); -#endif - pkt_entry->mr = (struct fid_mr *)mr; - pkt_entry->pkt = (struct rxr_pkt *)((char *)pkt_entry + - sizeof(*pkt_entry)); -#ifdef ENABLE_EFA_POISONING - memset(pkt_entry->pkt, 0, ep->mtu_size); -#endif - pkt_entry->state = RXR_PKT_ENTRY_IN_USE; +const char *rxr_ep_raw_addr_str(struct rxr_ep *ep, char *buf, size_t *buflen); - return pkt_entry; -} +struct efa_ep_addr *rxr_peer_raw_addr(struct rxr_ep *ep, fi_addr_t addr); -#ifdef ENABLE_EFA_POISONING -static inline void rxr_poison_mem_region(uint32_t *ptr, size_t size) -{ - int i; +const char *rxr_peer_raw_addr_str(struct rxr_ep *ep, fi_addr_t addr, char *buf, size_t *buflen); - for (i = 0; i < size / sizeof(rxr_poison_value); i++) - memcpy(ptr + i, &rxr_poison_value, sizeof(rxr_poison_value)); -} -#endif +struct rxr_rx_entry *rxr_ep_get_rx_entry(struct rxr_ep *ep, + const struct fi_msg *msg, + uint64_t tag, + uint64_t ignore, + uint32_t op, + uint64_t flags); -static inline void rxr_release_tx_pkt_entry(struct rxr_ep 
*ep, - struct rxr_pkt_entry *pkt) -{ - struct rxr_peer *peer; +struct rxr_rx_entry *rxr_ep_rx_entry_init(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + const struct fi_msg *msg, + uint64_t tag, + uint64_t ignore, + uint32_t op, + uint64_t flags); -#if ENABLE_DEBUG - dlist_remove(&pkt->dbg_entry); -#endif - /* - * Decrement rnr_queued_pkts counter and reset backoff for this peer if - * we get a send completion for a retransmitted packet. - */ - if (OFI_UNLIKELY(pkt->state == RXR_PKT_ENTRY_RNR_RETRANSMIT)) { - peer = rxr_ep_get_peer(ep, pkt->addr); - peer->rnr_queued_pkt_cnt--; - peer->timeout_interval = 0; - peer->rnr_timeout_exp = 0; - if (peer->rnr_state & RXR_PEER_IN_BACKOFF) - dlist_remove(&peer->rnr_entry); - peer->rnr_state = 0; - FI_DBG(&rxr_prov, FI_LOG_EP_DATA, - "reset backoff timer for peer: %" PRIu64 "\n", - pkt->addr); - } -#ifdef ENABLE_EFA_POISONING - rxr_poison_mem_region((uint32_t *)pkt, ep->tx_pkt_pool_entry_sz); -#endif - pkt->state = RXR_PKT_ENTRY_FREE; - ofi_buf_free(pkt); -} +void rxr_tx_entry_init(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry, + const struct fi_msg *msg, uint32_t op, uint64_t flags); -static inline void rxr_release_rx_pkt_entry(struct rxr_ep *ep, - struct rxr_pkt_entry *pkt) -{ -#if ENABLE_DEBUG - dlist_remove(&pkt->dbg_entry); -#endif -#ifdef ENABLE_EFA_POISONING - /* the same pool size is used for all types of rx pkt_entries */ - rxr_poison_mem_region((uint32_t *)pkt, ep->rx_pkt_pool_entry_sz); -#endif - pkt->state = RXR_PKT_ENTRY_FREE; - ofi_buf_free(pkt); -} +struct rxr_tx_entry *rxr_ep_alloc_tx_entry(struct rxr_ep *rxr_ep, + const struct fi_msg *msg, + uint32_t op, + uint64_t tag, + uint64_t flags); -static inline void rxr_release_tx_entry(struct rxr_ep *ep, - struct rxr_tx_entry *tx_entry) -{ -#if ENABLE_DEBUG - dlist_remove(&tx_entry->tx_entry_entry); -#endif - assert(dlist_empty(&tx_entry->queued_pkts)); -#ifdef ENABLE_EFA_POISONING - rxr_poison_mem_region((uint32_t *)tx_entry, - sizeof(struct rxr_tx_entry)); -#endif - tx_entry->state = RXR_TX_FREE; - tx_entry->msg_id = ~0; - ofi_buf_free(tx_entry); -} +void rxr_release_tx_entry(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry); static inline void rxr_release_rx_entry(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry) @@ -972,55 +790,9 @@ static inline void rxr_release_rx_entry(struct rxr_ep *ep, sizeof(struct rxr_rx_entry)); #endif rx_entry->state = RXR_RX_FREE; - rx_entry->msg_id = ~0; ofi_buf_free(rx_entry); } -static inline void *rxr_pkt_start(struct rxr_pkt_entry *pkt_entry) -{ - return (void *)((char *)pkt_entry + sizeof(*pkt_entry)); -} - -static inline struct rxr_base_hdr *rxr_get_base_hdr(void *pkt) -{ - return (struct rxr_base_hdr *)pkt; -} - -static inline struct rxr_rts_hdr *rxr_get_rts_hdr(void *pkt) -{ - return (struct rxr_rts_hdr *)pkt; -} - -static inline struct rxr_connack_hdr *rxr_get_connack_hdr(void *pkt) -{ - return (struct rxr_connack_hdr *)pkt; -} - -static inline struct rxr_cts_hdr *rxr_get_cts_hdr(void *pkt) -{ - return (struct rxr_cts_hdr *)pkt; -} - -static inline struct rxr_readrsp_hdr *rxr_get_readrsp_hdr(void *pkt) -{ - return (struct rxr_readrsp_hdr *)pkt; -} - -static inline struct rxr_ctrl_cq_pkt *rxr_get_ctrl_cq_pkt(void *pkt) -{ - return (struct rxr_ctrl_cq_pkt *)pkt; -} - -static inline struct rxr_ctrl_pkt *rxr_get_ctrl_pkt(void *pkt) -{ - return (struct rxr_ctrl_pkt *)pkt; -} - -static inline struct rxr_data_pkt *rxr_get_data_pkt(void *pkt) -{ - return (struct rxr_data_pkt *)pkt; -} - static inline int rxr_match_addr(fi_addr_t addr, fi_addr_t 
match_addr) { return (addr == FI_ADDR_UNSPEC || addr == match_addr); @@ -1054,39 +826,6 @@ static inline void rxr_ep_dec_tx_pending(struct rxr_ep *ep, #endif } -/* - * Helper function to compute the maximum payload of the RTS header based on - * the RTS header flags. The header may have a length greater than the possible - * RTS payload size if it is a large message. - */ -static inline uint64_t rxr_get_rts_data_size(struct rxr_ep *ep, - struct rxr_rts_hdr *rts_hdr) -{ - /* - * for read request, rts packet contain no data - * because data is on remote host - */ - if (rts_hdr->flags & RXR_READ_REQ) - return 0; - - size_t max_payload_size; - - if (rts_hdr->flags & RXR_REMOTE_CQ_DATA) - max_payload_size = ep->mtu_size - RXR_CTRL_HDR_SIZE; - else - max_payload_size = ep->mtu_size - RXR_CTRL_HDR_SIZE_NO_CQ; - - if (rts_hdr->flags & RXR_REMOTE_SRC_ADDR) - max_payload_size -= rts_hdr->addrlen; - - if (rts_hdr->flags & RXR_WRITE) - max_payload_size -= rts_hdr->rma_iov_count * - sizeof(struct fi_rma_iov); - - return (rts_hdr->data_len > max_payload_size) - ? max_payload_size : rts_hdr->data_len; -} - static inline size_t rxr_get_rx_pool_chunk_cnt(struct rxr_ep *ep) { return MIN(ep->core_rx_size, ep->rx_size); @@ -1099,14 +838,18 @@ static inline size_t rxr_get_tx_pool_chunk_cnt(struct rxr_ep *ep) static inline int rxr_need_sas_ordering(struct rxr_ep *ep) { - /* - * RxR needs to reorder RTS packets for send-after-send guarantees - * only if the application requested it and the core provider does not - * support it. - */ - return ((ep->msg_order & FI_ORDER_SAS) && - !(ep->core_msg_order & FI_ORDER_SAS) && - rxr_env.enable_sas_ordering); + return ep->msg_order & FI_ORDER_SAS; +} + +static inline int rxr_ep_use_zcpy_rx(struct rxr_ep *ep, struct fi_info *info) +{ + return !(ep->util_ep.caps & FI_DIRECTED_RECV) && + !(ep->util_ep.caps & FI_TAGGED) && + !(ep->util_ep.caps & FI_ATOMIC) && + (ep->max_msg_size <= ep->mtu_size - ep->max_proto_hdr_size) && + !rxr_need_sas_ordering(ep) && + info->mode & FI_MSG_PREFIX && + rxr_env.use_zcpy_rx; } /* Initialization functions */ @@ -1125,46 +868,40 @@ int rxr_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, int rxr_endpoint(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); -/* AV sub-functions */ -int rxr_av_insert_rdm_addr(struct rxr_av *av, const void *addr, - fi_addr_t *rdm_fiaddr, uint64_t flags, - void *context); -int rxr_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, - struct fid_av **av_fid, void *context); - /* EP sub-functions */ void rxr_ep_progress(struct util_ep *util_ep); -struct rxr_pkt_entry *rxr_ep_get_pkt_entry(struct rxr_ep *rxr_ep, - struct ofi_bufpool *pkt_pool); -int rxr_ep_post_buf(struct rxr_ep *ep, uint64_t flags); -ssize_t rxr_ep_send_msg(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry, - const struct fi_msg *msg, uint64_t flags); -ssize_t rxr_ep_post_data(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry); -ssize_t rxr_ep_post_readrsp(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry); -void rxr_ep_init_connack_pkt_entry(struct rxr_ep *ep, - struct rxr_pkt_entry *pkt_entry, - fi_addr_t addr); -void rxr_ep_calc_cts_window_credits(struct rxr_ep *ep, struct rxr_peer *peer, - uint64_t size, int request, - int *window, int *credits); -void rxr_ep_init_cts_pkt_entry(struct rxr_ep *ep, - struct rxr_rx_entry *rx_entry, - struct rxr_pkt_entry *pkt_entry, - uint64_t size, - int *credits); -void rxr_ep_init_readrsp_pkt_entry(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, - 
struct rxr_pkt_entry *pkt_entry); -struct rxr_rx_entry *rxr_ep_get_new_unexp_rx_entry(struct rxr_ep *ep, - struct rxr_pkt_entry *unexp_entry); +void rxr_ep_progress_internal(struct rxr_ep *rxr_ep); +int rxr_ep_post_buf(struct rxr_ep *ep, const struct fi_msg *posted_recv, + uint64_t flags, enum rxr_lower_ep_type lower_ep); + +int rxr_ep_set_tx_credit_request(struct rxr_ep *rxr_ep, + struct rxr_tx_entry *tx_entry); + +int rxr_ep_tx_init_mr_desc(struct rxr_domain *rxr_domain, + struct rxr_tx_entry *tx_entry, + int mr_iov_start, uint64_t access); + +void rxr_prepare_desc_send(struct rxr_domain *rxr_domain, + struct rxr_tx_entry *tx_entry); + +struct rxr_rx_entry *rxr_ep_lookup_mediumrtm_rx_entry(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_ep_record_mediumrtm_rx_entry(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + struct rxr_rx_entry *rx_entry); + +struct rxr_rx_entry *rxr_ep_alloc_unexp_rx_entry_for_msgrtm(struct rxr_ep *ep, + struct rxr_pkt_entry **pkt_entry); + +struct rxr_rx_entry *rxr_ep_alloc_unexp_rx_entry_for_tagrtm(struct rxr_ep *ep, + struct rxr_pkt_entry **pkt_entry); + struct rxr_rx_entry *rxr_ep_split_rx_entry(struct rxr_ep *ep, struct rxr_rx_entry *posted_entry, struct rxr_rx_entry *consumer_entry, struct rxr_pkt_entry *pkt_entry); -#if ENABLE_DEBUG -void rxr_ep_print_pkt(char *prefix, - struct rxr_ep *ep, - struct rxr_base_hdr *hdr); -#endif +int rxr_ep_efa_addr_to_str(const void *addr, char *temp_name); /* CQ sub-functions */ int rxr_cq_handle_rx_error(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, @@ -1172,32 +909,37 @@ int rxr_cq_handle_rx_error(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, int rxr_cq_handle_tx_error(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, ssize_t prov_errno); int rxr_cq_handle_cq_error(struct rxr_ep *ep, ssize_t err); -ssize_t rxr_cq_post_cts(struct rxr_ep *ep, - struct rxr_rx_entry *rx_entry, - uint64_t size); -int rxr_cq_handle_rx_completion(struct rxr_ep *ep, - struct fi_cq_msg_entry *comp, - struct rxr_pkt_entry *pkt_entry, +void rxr_cq_write_rx_completion(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry); +void rxr_cq_handle_rx_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + struct rxr_rx_entry *rx_entry); + void rxr_cq_write_tx_completion(struct rxr_ep *ep, - struct fi_cq_msg_entry *comp, struct rxr_tx_entry *tx_entry); -void rxr_cq_recv_rts_data(struct rxr_ep *ep, - struct rxr_rx_entry *rx_entry, - struct rxr_rts_hdr *rts_hdr); +void rxr_cq_handle_tx_completion(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry); + +void rxr_cq_handle_shm_completion(struct rxr_ep *ep, + struct fi_cq_data_entry *cq_entry, + fi_addr_t src_addr); -void rxr_cq_handle_pkt_recv_completion(struct rxr_ep *ep, - struct fi_cq_msg_entry *comp, - fi_addr_t src_addr); +int rxr_cq_reorder_msg(struct rxr_ep *ep, + struct rxr_peer *peer, + struct rxr_pkt_entry *pkt_entry); -void rxr_cq_handle_pkt_send_completion(struct rxr_ep *rxr_ep, - struct fi_cq_msg_entry *comp); +void rxr_cq_proc_pending_items_in_recvwin(struct rxr_ep *ep, + struct rxr_peer *peer); + +void rxr_cq_handle_shm_rma_write_data(struct rxr_ep *ep, + struct fi_cq_data_entry *shm_comp, + fi_addr_t src_addr); /* Aborts if unable to write to the eq */ -static inline void rxr_eq_write_error(struct rxr_ep *ep, ssize_t err, +static inline void efa_eq_write_error(struct util_ep *ep, ssize_t err, ssize_t prov_errno) { struct fi_eq_err_entry err_entry; @@ -1205,11 +947,11 @@ static inline void rxr_eq_write_error(struct rxr_ep *ep, ssize_t err, FI_WARN(&rxr_prov, 
FI_LOG_EQ, "Writing error %s to EQ.\n", fi_strerror(err)); - if (ep->util_ep.eq) { + if (ep->eq) { memset(&err_entry, 0, sizeof(err_entry)); err_entry.err = err; err_entry.prov_errno = prov_errno; - ret = fi_eq_write(&ep->util_ep.eq->eq_fid, FI_NOTIFY, + ret = fi_eq_write(&ep->eq->eq_fid, FI_NOTIFY, &err_entry, sizeof(err_entry), UTIL_FLAG_ERROR); @@ -1228,24 +970,11 @@ static inline void rxr_eq_write_error(struct rxr_ep *ep, ssize_t err, abort(); } -static inline struct rxr_av *rxr_ep_av(struct rxr_ep *ep) -{ - return container_of(ep->util_ep.av, struct rxr_av, util_av); -} - static inline struct rxr_domain *rxr_ep_domain(struct rxr_ep *ep) { return container_of(ep->util_ep.domain, struct rxr_domain, util_domain); } -static inline uint8_t rxr_ep_mr_local(struct rxr_ep *ep) -{ - struct rxr_domain *domain = container_of(ep->util_ep.domain, - struct rxr_domain, - util_domain); - return domain->mr_local; -} - /* * today we have only cq res check, in future we will have ctx, and other * resource check as well. @@ -1280,71 +1009,6 @@ static inline void rxr_rm_tx_cq_check(struct rxr_ep *ep, struct util_cq *tx_cq) fastlock_release(&tx_cq->cq_lock); } -static inline ssize_t rxr_ep_sendv_pkt(struct rxr_ep *ep, - struct rxr_pkt_entry *pkt_entry, - fi_addr_t addr, const struct iovec *iov, - void **desc, size_t count, - uint64_t flags) -{ - struct fi_msg msg; - - msg.msg_iov = iov; - msg.desc = desc; - msg.iov_count = count; - msg.addr = addr; - msg.context = pkt_entry; - msg.data = 0; - - return rxr_ep_send_msg(ep, pkt_entry, &msg, flags); -} - -/* rxr_pkt_start currently expects data pkt right after pkt hdr */ -static inline ssize_t rxr_ep_send_pkt_flags(struct rxr_ep *ep, - struct rxr_pkt_entry *pkt_entry, - fi_addr_t addr, uint64_t flags) -{ - struct iovec iov; - void *desc; - - iov.iov_base = rxr_pkt_start(pkt_entry); - iov.iov_len = pkt_entry->pkt_size; - - desc = rxr_ep_mr_local(ep) ? 
fi_mr_desc(pkt_entry->mr) : NULL; - - return rxr_ep_sendv_pkt(ep, pkt_entry, addr, &iov, &desc, 1, flags); -} - -static inline ssize_t rxr_ep_send_pkt(struct rxr_ep *ep, - struct rxr_pkt_entry *pkt_entry, - fi_addr_t addr) -{ - return rxr_ep_send_pkt_flags(ep, pkt_entry, addr, 0); -} - -static inline int rxr_ep_post_cts_or_queue(struct rxr_ep *ep, - struct rxr_rx_entry *rx_entry, - uint64_t bytes_left) -{ - int ret; - - if (rx_entry->state == RXR_RX_QUEUED_CTS) - return 0; - - ret = rxr_cq_post_cts(ep, rx_entry, bytes_left); - if (OFI_UNLIKELY(ret)) { - if (ret == -FI_EAGAIN) { - rx_entry->state = RXR_RX_QUEUED_CTS; - dlist_insert_tail(&rx_entry->queued_entry, - &ep->rx_entry_queued_list); - ret = 0; - } else { - if (rxr_cq_handle_rx_error(ep, rx_entry, ret)) - assert(0 && "failed to write err cq entry"); - } - } - return ret; -} - static inline bool rxr_peer_timeout_expired(struct rxr_ep *ep, struct rxr_peer *peer, uint64_t ts) @@ -1354,59 +1018,6 @@ static inline bool rxr_peer_timeout_expired(struct rxr_ep *ep, (1 << peer->rnr_timeout_exp)))); } -static inline bool -rxr_multi_recv_buffer_available(struct rxr_ep *ep, - struct rxr_rx_entry *rx_entry) -{ - assert(rx_entry->fi_flags & FI_MULTI_RECV); - assert(rx_entry->rxr_flags & RXR_MULTI_RECV_POSTED); - - return (ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count) - >= ep->min_multi_recv_size); -} - -static inline bool -rxr_multi_recv_buffer_complete(struct rxr_ep *ep, - struct rxr_rx_entry *rx_entry) -{ - assert(rx_entry->fi_flags & FI_MULTI_RECV); - assert(rx_entry->rxr_flags & RXR_MULTI_RECV_POSTED); - - return (!rxr_multi_recv_buffer_available(ep, rx_entry) && - dlist_empty(&rx_entry->multi_recv_consumers)); -} - -static inline void -rxr_multi_recv_free_posted_entry(struct rxr_ep *ep, - struct rxr_rx_entry *rx_entry) -{ - assert(!(rx_entry->rxr_flags & RXR_MULTI_RECV_POSTED)); - - if ((rx_entry->rxr_flags & RXR_MULTI_RECV_CONSUMER) && - rxr_multi_recv_buffer_complete(ep, rx_entry->master_entry)) - rxr_release_rx_entry(ep, rx_entry->master_entry); -} - -static inline void -rxr_cq_handle_multi_recv_completion(struct rxr_ep *ep, - struct rxr_rx_entry *rx_entry) -{ - assert(!(rx_entry->rxr_flags & RXR_MULTI_RECV_POSTED) && - (rx_entry->rxr_flags & RXR_MULTI_RECV_CONSUMER)); - - dlist_remove(&rx_entry->multi_recv_entry); - rx_entry->rxr_flags &= ~RXR_MULTI_RECV_CONSUMER; - - if (!rxr_multi_recv_buffer_complete(ep, rx_entry->master_entry)) - return; - - /* - * Buffer is consumed and all messages have been received. Update the - * last message to release the application buffer. - */ - rx_entry->cq_entry.flags |= FI_MULTI_RECV; -} - /* Performance counter declarations */ #ifdef RXR_PERF_ENABLED #define RXR_PERF_FOREACH(DECL) \ diff --git a/prov/efa/src/rxr/rxr_atomic.c b/prov/efa/src/rxr/rxr_atomic.c new file mode 100644 index 00000000000..c1c16ee7baf --- /dev/null +++ b/prov/efa/src/rxr/rxr_atomic.c @@ -0,0 +1,585 @@ +/* + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "efa.h" +#include "rxr.h" +#include "rxr_rma.h" +#include "rxr_cntr.h" +#include "rxr_atomic.h" +#include "rxr_pkt_cmd.h" + +static void rxr_atomic_copy_shm_msg(struct fi_msg_atomic *shm_msg, + const struct fi_msg_atomic *msg, + struct fi_rma_ioc *rma_iov) +{ + int i; + + assert(msg->rma_iov_count <= RXR_IOV_LIMIT); + memcpy(shm_msg, msg, sizeof(*msg)); + if (!(shm_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR)) { + memcpy(rma_iov, msg->rma_iov, + sizeof(*msg->rma_iov) * msg->rma_iov_count); + for (i = 0; i < msg->rma_iov_count; i++) + rma_iov[i].addr = 0; + shm_msg->rma_iov = rma_iov; + } +} + +static +struct rxr_tx_entry * +rxr_atomic_alloc_tx_entry(struct rxr_ep *rxr_ep, + const struct fi_msg_atomic *msg_atomic, + const struct rxr_atomic_ex *atomic_ex, + uint32_t op, uint64_t flags) +{ + struct rxr_tx_entry *tx_entry; + struct fi_msg msg; + struct iovec iov[RXR_IOV_LIMIT]; + size_t datatype_size = ofi_datatype_size(msg_atomic->datatype); + + tx_entry = ofi_buf_alloc(rxr_ep->tx_entry_pool); + if (OFI_UNLIKELY(!tx_entry)) { + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "TX entries exhausted.\n"); + return NULL; + } + +#if ENABLE_DEBUG + dlist_insert_tail(&tx_entry->tx_entry_entry, &rxr_ep->tx_entry_list); +#endif + ofi_ioc_to_iov(msg_atomic->msg_iov, iov, msg_atomic->iov_count, datatype_size); + msg.addr = msg_atomic->addr; + msg.msg_iov = iov; + msg.context = msg_atomic->context; + msg.iov_count = msg_atomic->iov_count; + msg.data = msg_atomic->data; + msg.desc = msg_atomic->desc; + rxr_tx_entry_init(rxr_ep, tx_entry, &msg, op, flags); + + assert(msg_atomic->rma_iov_count > 0); + assert(msg_atomic->rma_iov); + tx_entry->rma_iov_count = msg_atomic->rma_iov_count; + ofi_rma_ioc_to_iov(msg_atomic->rma_iov, + tx_entry->rma_iov, + msg_atomic->rma_iov_count, + datatype_size); + + tx_entry->atomic_hdr.atomic_op = msg_atomic->op; + tx_entry->atomic_hdr.datatype = msg_atomic->datatype; + + if (op == ofi_op_atomic_fetch || op == ofi_op_atomic_compare) { + assert(atomic_ex); + memcpy(&tx_entry->atomic_ex, atomic_ex, sizeof(struct rxr_atomic_ex)); + } + + return tx_entry; +} + +static +ssize_t rxr_atomic_generic_efa(struct rxr_ep *rxr_ep, + const struct fi_msg_atomic *msg, + const struct rxr_atomic_ex *atomic_ex, + uint32_t op, uint64_t flags) +{ + struct rxr_tx_entry 
*tx_entry; + struct rxr_peer *peer; + bool delivery_complete_requested; + ssize_t err; + static int req_pkt_type_list[] = { + [ofi_op_atomic] = RXR_WRITE_RTA_PKT, + [ofi_op_atomic_fetch] = RXR_FETCH_RTA_PKT, + [ofi_op_atomic_compare] = RXR_COMPARE_RTA_PKT + }; + + assert(msg->iov_count <= rxr_ep->tx_iov_limit); + rxr_perfset_start(rxr_ep, perf_rxr_tx); + fastlock_acquire(&rxr_ep->util_ep.lock); + + if (OFI_UNLIKELY(is_tx_res_full(rxr_ep))) { + err = -FI_EAGAIN; + goto out; + } + + peer = rxr_ep_get_peer(rxr_ep, msg->addr); + + if (peer->flags & RXR_PEER_IN_BACKOFF) { + err = -FI_EAGAIN; + goto out; + } + + tx_entry = rxr_atomic_alloc_tx_entry(rxr_ep, msg, atomic_ex, op, flags); + if (OFI_UNLIKELY(!tx_entry)) { + err = -FI_EAGAIN; + rxr_ep_progress_internal(rxr_ep); + goto out; + } + + delivery_complete_requested = tx_entry->fi_flags & FI_DELIVERY_COMPLETE; + if (delivery_complete_requested && !(peer->is_local)) { + tx_entry->rxr_flags |= RXR_DELIVERY_COMPLETE_REQUESTED; + /* + * Because delivery complete is defined as an extra + * feature, the receiver might not support it. + * + * The sender cannot send with FI_DELIVERY_COMPLETE + * if the peer is not able to handle it. + * + * If the sender does not know whether the peer + * can handle it, it needs to trigger + * a handshake packet from the peer. + * + * The handshake packet contains + * the information whether the peer + * support it or not. + */ + err = rxr_pkt_trigger_handshake(rxr_ep, tx_entry->addr, peer); + if (OFI_UNLIKELY(err)) + goto out; + + if (!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)) { + err = -FI_EAGAIN; + goto out; + } else if (!rxr_peer_support_delivery_complete(peer)) { + err = -FI_EOPNOTSUPP; + goto out; + } + } + + tx_entry->msg_id = (peer->next_msg_id != ~0) ? + peer->next_msg_id++ : ++peer->next_msg_id; + + if (delivery_complete_requested && op == ofi_op_atomic) { + err = rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY, + tx_entry, + RXR_DC_WRITE_RTA_PKT, + 0); + } else { + /* + * Fetch atomic and compare atomic + * support DELIVERY_COMPLETE + * by nature + */ + err = rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY, + tx_entry, + req_pkt_type_list[op], + 0); + } + + if (OFI_UNLIKELY(err)) { + rxr_release_tx_entry(rxr_ep, tx_entry); + peer->next_msg_id--; + } + +out: + fastlock_release(&rxr_ep->util_ep.lock); + rxr_perfset_end(rxr_ep, perf_rxr_tx); + return err; +} + +static ssize_t +rxr_atomic_inject(struct fid_ep *ep, + const void *buf, size_t count, + fi_addr_t dest_addr, uint64_t remote_addr, uint64_t remote_key, + enum fi_datatype datatype, enum fi_op op) +{ + struct fi_ioc iov; + struct fi_rma_ioc rma_iov; + struct fi_msg_atomic msg; + + struct rxr_ep *rxr_ep; + struct rxr_peer *peer; + + rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); + peer = rxr_ep_get_peer(rxr_ep, dest_addr); + if (peer->is_local) { + assert(rxr_ep->use_shm); + if (!(shm_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR)) + remote_addr = 0; + return fi_inject_atomic(rxr_ep->shm_ep, buf, count, peer->shm_fiaddr, + remote_addr, remote_key, datatype, op); + } + + iov.addr = (void *)buf; + iov.count = count; + + rma_iov.addr = remote_addr; + rma_iov.count = count; + rma_iov.key = remote_key; + + msg.msg_iov = &iov; + msg.iov_count = 1; + msg.desc = NULL; + msg.addr = dest_addr; + msg.rma_iov = &rma_iov; + msg.rma_iov_count = 1; + msg.datatype = datatype; + msg.op = op; + msg.context = NULL; + msg.data = 0; + + return rxr_atomic_generic_efa(rxr_ep, &msg, NULL, ofi_op_atomic, + FI_INJECT | RXR_NO_COMPLETION); +} + +static ssize_t 
+rxr_atomic_writemsg(struct fid_ep *ep, + const struct fi_msg_atomic *msg, + uint64_t flags) +{ + struct fi_msg_atomic shm_msg; + struct rxr_ep *rxr_ep; + struct rxr_peer *peer; + struct fi_rma_ioc rma_iov[RXR_IOV_LIMIT]; + + FI_DBG(&rxr_prov, FI_LOG_EP_DATA, + "%s: iov_len: %lu flags: %lx\n", + __func__, ofi_total_ioc_cnt(msg->msg_iov, msg->iov_count), flags); + + rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); + peer = rxr_ep_get_peer(rxr_ep, msg->addr); + if (peer->is_local) { + assert(rxr_ep->use_shm); + rxr_atomic_copy_shm_msg(&shm_msg, msg, rma_iov); + shm_msg.addr = peer->shm_fiaddr; + return fi_atomicmsg(rxr_ep->shm_ep, &shm_msg, flags); + } + + return rxr_atomic_generic_efa(rxr_ep, msg, NULL, ofi_op_atomic, flags); +} + +static ssize_t +rxr_atomic_writev(struct fid_ep *ep, + const struct fi_ioc *iov, void **desc, size_t count, + fi_addr_t dest_addr, uint64_t remote_addr, uint64_t remote_key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + struct fi_msg_atomic msg; + struct fi_rma_ioc rma_ioc; + + rma_ioc.addr = remote_addr; + rma_ioc.count = ofi_total_ioc_cnt(iov, count); + rma_ioc.key = remote_key; + + msg.msg_iov = iov; + msg.iov_count = count; + msg.desc = desc; + msg.addr = dest_addr; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.datatype = datatype; + msg.op = op; + msg.context = context; + msg.data = 0; + + FI_DBG(&rxr_prov, FI_LOG_EP_DATA, "%s total_count=%ld atomic_op=%d\n", __func__, + ofi_total_ioc_cnt(iov, count), msg.op); + + return rxr_atomic_writemsg(ep, &msg, 0); +} + +static ssize_t +rxr_atomic_write(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + fi_addr_t dest_addr, uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + struct fi_ioc ioc; + + ioc.addr = (void *)buf; + ioc.count = count; + return rxr_atomic_writev(ep, &ioc, &desc, 1, + dest_addr, addr, key, + datatype, op, context); +} + +static ssize_t +rxr_atomic_readwritemsg(struct fid_ep *ep, + const struct fi_msg_atomic *msg, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + uint64_t flags) +{ + struct rxr_ep *rxr_ep; + struct rxr_peer *peer; + struct fi_msg_atomic shm_msg; + struct fi_rma_ioc rma_iov[RXR_IOV_LIMIT]; + struct rxr_atomic_ex atomic_ex; + size_t datatype_size = ofi_datatype_size(msg->datatype); + + FI_DBG(&rxr_prov, FI_LOG_EP_DATA, "%s total_len=%ld atomic_op=%d\n", __func__, + ofi_total_ioc_cnt(msg->msg_iov, msg->iov_count), msg->op); + + rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); + peer = rxr_ep_get_peer(rxr_ep, msg->addr); + if (peer->is_local) { + assert(rxr_ep->use_shm); + rxr_atomic_copy_shm_msg(&shm_msg, msg, rma_iov); + shm_msg.addr = peer->shm_fiaddr; + return fi_fetch_atomicmsg(rxr_ep->shm_ep, &shm_msg, + resultv, result_desc, result_count, + flags); + } + + ofi_ioc_to_iov(resultv, atomic_ex.resp_iov, result_count, datatype_size); + atomic_ex.resp_iov_count = result_count; + return rxr_atomic_generic_efa(rxr_ep, msg, &atomic_ex, ofi_op_atomic_fetch, flags); +} + +static ssize_t +rxr_atomic_readwritev(struct fid_ep *ep, + const struct fi_ioc *iov, void **desc, size_t count, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + fi_addr_t dest_addr, uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + struct fi_msg_atomic msg; + struct fi_rma_ioc rma_ioc; + + rma_ioc.addr = addr; + rma_ioc.count = ofi_total_ioc_cnt(iov, count); + rma_ioc.key = key; + + msg.msg_iov = iov; + msg.iov_count = count; + 
msg.desc = desc; + msg.addr = dest_addr; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.datatype = datatype; + msg.op = op; + msg.context = context; + msg.data = 0; + + return rxr_atomic_readwritemsg(ep, &msg, resultv, result_desc, result_count, 0); +} + +static ssize_t +rxr_atomic_readwrite(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t dest_addr, uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + struct fi_ioc ioc, resp_ioc; + + ioc.addr = (void *)buf; + ioc.count = count; + resp_ioc.addr = result; + resp_ioc.count = count; + + return rxr_atomic_readwritev(ep, &ioc, &desc, 1, + &resp_ioc, &result_desc, 1, + dest_addr, addr, key, + datatype, op, context); +} + +static ssize_t +rxr_atomic_compwritemsg(struct fid_ep *ep, + const struct fi_msg_atomic *msg, + const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + uint64_t flags) +{ + struct rxr_ep *rxr_ep; + struct rxr_peer *peer; + struct fi_msg_atomic shm_msg; + struct fi_rma_ioc rma_iov[RXR_IOV_LIMIT]; + struct rxr_atomic_ex atomic_ex; + size_t datatype_size = ofi_datatype_size(msg->datatype); + + FI_DBG(&rxr_prov, FI_LOG_EP_DATA, + "%s: iov_len: %lu flags: %lx\n", + __func__, ofi_total_ioc_cnt(msg->msg_iov, msg->iov_count), flags); + + rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); + peer = rxr_ep_get_peer(rxr_ep, msg->addr); + if (peer->is_local) { + assert(rxr_ep->use_shm); + rxr_atomic_copy_shm_msg(&shm_msg, msg, rma_iov); + shm_msg.addr = peer->shm_fiaddr; + return fi_compare_atomicmsg(rxr_ep->shm_ep, &shm_msg, + comparev, compare_desc, compare_count, + resultv, result_desc, result_count, + flags); + } + + ofi_ioc_to_iov(resultv, atomic_ex.resp_iov, result_count, datatype_size); + atomic_ex.resp_iov_count = result_count; + + ofi_ioc_to_iov(comparev, atomic_ex.comp_iov, compare_count, datatype_size); + atomic_ex.comp_iov_count = compare_count; + + return rxr_atomic_generic_efa(rxr_ep, msg, &atomic_ex, ofi_op_atomic_compare, flags); +} + +static ssize_t +rxr_atomic_compwritev(struct fid_ep *ep, + const struct fi_ioc *iov, void **desc, size_t count, + const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + fi_addr_t dest_addr, uint64_t rma_addr, uint64_t rma_key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + struct fi_msg_atomic msg; + struct fi_rma_ioc rma_ioc; + + rma_ioc.addr = rma_addr; + rma_ioc.count = ofi_total_ioc_cnt(iov, count); + rma_ioc.key = rma_key; + + msg.msg_iov = iov; + msg.iov_count = count; + msg.desc = desc; + msg.addr = dest_addr; + msg.rma_iov = &rma_ioc; + msg.rma_iov_count = 1; + msg.datatype = datatype; + msg.op = op; + msg.context = context; + msg.data = 0; + + return rxr_atomic_compwritemsg(ep, &msg, + comparev, compare_desc, compare_count, + resultv, result_desc, result_count, + 0); +} + +static ssize_t +rxr_atomic_compwrite(struct fid_ep *ep, + const void *buf, size_t count, void *desc, + const void *compare, void *compare_desc, + void *result, void *result_desc, + fi_addr_t dest_addr, uint64_t addr, uint64_t key, + enum fi_datatype datatype, enum fi_op op, void *context) +{ + struct fi_ioc ioc, resp_ioc, comp_ioc; + + ioc.addr = (void *)buf; + ioc.count = count; + resp_ioc.addr = result; + resp_ioc.count = count; + comp_ioc.addr = (void *)compare; + comp_ioc.count = count; + + return 
rxr_atomic_compwritev(ep, &ioc, &desc, 1, + &comp_ioc, &compare_desc, 1, + &resp_ioc, &result_desc, 1, + dest_addr, addr, key, + datatype, op, context); +} + +int rxr_query_atomic(struct fid_domain *domain, + enum fi_datatype datatype, enum fi_op op, + struct fi_atomic_attr *attr, uint64_t flags) +{ + struct rxr_domain *rxr_domain; + int ret; + size_t max_atomic_size; + + if (flags & FI_TAGGED) { + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, + "tagged atomic op not supported\n"); + return -FI_EINVAL; + } + + ret = ofi_atomic_valid(&rxr_prov, datatype, op, flags); + if (ret || !attr) + return ret; + + rxr_domain = container_of(domain, struct rxr_domain, + util_domain.domain_fid); + + max_atomic_size = rxr_domain->mtu_size - sizeof(struct rxr_rta_hdr) + - rxr_domain->addrlen + - RXR_IOV_LIMIT * sizeof(struct fi_rma_iov); + + if (flags & FI_COMPARE_ATOMIC) + max_atomic_size /= 2; + + attr->size = ofi_datatype_size(datatype); + attr->count = max_atomic_size / attr->size; + return 0; +} + +static int rxr_atomic_valid(struct fid_ep *ep_fid, enum fi_datatype datatype, + enum fi_op op, uint64_t flags, size_t *count) +{ + struct util_ep *ep; + struct fi_atomic_attr attr; + int ret; + + ep = container_of(ep_fid, struct util_ep, ep_fid); + ret = rxr_query_atomic(&ep->domain->domain_fid, + datatype, op, &attr, flags); + if (!ret) + *count = attr.count; + + return ret; +} + +static int rxr_atomic_write_valid(struct fid_ep *ep, enum fi_datatype datatype, + enum fi_op op, size_t *count) +{ + return rxr_atomic_valid(ep, datatype, op, 0, count); +} + +static int rxr_atomic_readwrite_valid(struct fid_ep *ep, + enum fi_datatype datatype, enum fi_op op, + size_t *count) +{ + return rxr_atomic_valid(ep, datatype, op, FI_FETCH_ATOMIC, count); +} + +static int rxr_atomic_compwrite_valid(struct fid_ep *ep, + enum fi_datatype datatype, enum fi_op op, + size_t *count) +{ + return rxr_atomic_valid(ep, datatype, op, FI_COMPARE_ATOMIC, count); +} + +struct fi_ops_atomic rxr_ops_atomic = { + .size = sizeof(struct fi_ops_atomic), + .write = rxr_atomic_write, + .writev = rxr_atomic_writev, + .writemsg = rxr_atomic_writemsg, + .inject = rxr_atomic_inject, + .readwrite = rxr_atomic_readwrite, + .readwritev = rxr_atomic_readwritev, + .readwritemsg = rxr_atomic_readwritemsg, + .compwrite = rxr_atomic_compwrite, + .compwritev = rxr_atomic_compwritev, + .compwritemsg = rxr_atomic_compwritemsg, + .writevalid = rxr_atomic_write_valid, + .readwritevalid = rxr_atomic_readwrite_valid, + .compwritevalid = rxr_atomic_compwrite_valid, +}; + diff --git a/prov/efa/src/rxr/rxr_atomic.h b/prov/efa/src/rxr/rxr_atomic.h new file mode 100644 index 00000000000..282cf2ab3f0 --- /dev/null +++ b/prov/efa/src/rxr/rxr_atomic.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if HAVE_CONFIG_H +#include +#endif /* HAVE_CONFIG_H */ + +#ifndef _RXR_ATOMIC_H_ +#define _RXR_ATOMIC_H_ + +int rxr_query_atomic(struct fid_domain *domain, + enum fi_datatype datatype, enum fi_op op, + struct fi_atomic_attr *attr, uint64_t flags); + +extern struct fi_ops_atomic rxr_ops_atomic; + +#endif diff --git a/prov/efa/src/rxr/rxr_attr.c b/prov/efa/src/rxr/rxr_attr.c index 5ab1b80e81e..77b2a9eb6d9 100644 --- a/prov/efa/src/rxr/rxr_attr.c +++ b/prov/efa/src/rxr/rxr_attr.c @@ -37,19 +37,28 @@ const uint32_t rxr_poison_value = 0xdeadbeef; #endif -#define RXR_EP_CAPS (FI_MSG | FI_TAGGED | FI_RECV | FI_SEND | FI_READ \ - | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE \ - | FI_DIRECTED_RECV | FI_SOURCE | FI_MULTI_RECV \ - | FI_RMA) +#if HAVE_LIBCUDA +#define EFA_HMEM_CAP FI_HMEM +#else +#define EFA_HMEM_CAP 0 +#endif +#define RXR_TX_CAPS (OFI_TX_MSG_CAPS | FI_TAGGED | OFI_TX_RMA_CAPS | \ + FI_ATOMIC | EFA_HMEM_CAP) +#define RXR_RX_CAPS (OFI_RX_MSG_CAPS | FI_TAGGED | OFI_RX_RMA_CAPS | \ + FI_SOURCE | FI_MULTI_RECV | FI_DIRECTED_RECV | \ + FI_ATOMIC | EFA_HMEM_CAP) +#define RXR_DOM_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM) /* TODO: Add support for true FI_DELIVERY_COMPLETE */ #define RXR_TX_OP_FLAGS (FI_INJECT | FI_COMPLETION | FI_TRANSMIT_COMPLETE | \ FI_DELIVERY_COMPLETE) -#define RXR_RX_OP_FLAGS (FI_COMPLETION) +#define RXR_RX_OP_FLAGS (FI_COMPLETION | FI_MULTI_RECV) struct fi_tx_attr rxr_tx_attr = { - .caps = RXR_EP_CAPS, - .msg_order = FI_ORDER_SAS, + .caps = RXR_TX_CAPS, + .msg_order = FI_ORDER_SAS | + FI_ORDER_ATOMIC_RAR | FI_ORDER_ATOMIC_RAW | + FI_ORDER_ATOMIC_WAR | FI_ORDER_ATOMIC_WAW, .op_flags = RXR_TX_OP_FLAGS, .comp_order = FI_ORDER_NONE, .inject_size = 0, @@ -58,8 +67,10 @@ struct fi_tx_attr rxr_tx_attr = { }; struct fi_rx_attr rxr_rx_attr = { - .caps = RXR_EP_CAPS, - .msg_order = FI_ORDER_SAS, + .caps = RXR_RX_CAPS, + .msg_order = FI_ORDER_SAS | + FI_ORDER_ATOMIC_RAR | FI_ORDER_ATOMIC_RAW | + FI_ORDER_ATOMIC_WAR | FI_ORDER_ATOMIC_WAW, .op_flags = RXR_RX_OP_FLAGS, .comp_order = FI_ORDER_NONE, .total_buffered_recv = 0, @@ -71,8 +82,9 @@ struct fi_ep_attr rxr_ep_attr = { .type = FI_EP_RDM, .protocol = FI_PROTO_EFA, .mem_tag_format = FI_TAG_GENERIC, - .protocol_version = RXR_PROTOCOL_VERSION, + .protocol_version = RXR_CUR_PROTOCOL_VERSION, .max_msg_size = UINT64_MAX, + .msg_prefix_size = 0, .tx_ctx_cnt = 1, .rx_ctx_cnt = 1 }; @@ -94,15 +106,16 @@ struct fi_domain_attr rxr_domain_attr = { .rx_ctx_cnt = 1, .max_ep_tx_ctx = 1, .max_ep_rx_ctx = 1, - .cq_data_size = RXR_CQ_DATA_SIZE + .cq_data_size = RXR_CQ_DATA_SIZE, + .caps = RXR_DOM_CAPS }; struct fi_fabric_attr rxr_fabric_attr = { - .prov_version = FI_VERSION(RXR_MAJOR_VERSION, RXR_MINOR_VERSION), + .prov_version = OFI_VERSION_DEF_PROV, }; struct fi_info rxr_info = { - .caps = RXR_EP_CAPS, + .caps = RXR_TX_CAPS | 
RXR_RX_CAPS | RXR_DOM_CAPS, .addr_format = FI_FORMAT_UNSPEC, .tx_attr = &rxr_tx_attr, .rx_attr = &rxr_rx_attr, diff --git a/prov/efa/src/rxr/rxr_av.c b/prov/efa/src/rxr/rxr_av.c deleted file mode 100644 index 6e1cb5a610e..00000000000 --- a/prov/efa/src/rxr/rxr_av.c +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. - * All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "rxr.h" -#include - -/* - * Insert address translation in core av & in hash. Return 1 on successful - * insertion regardless of whether it is in the hash table or not, 0 if the - * lower layer av insert fails. - */ -int rxr_av_insert_rdm_addr(struct rxr_av *av, const void *addr, - fi_addr_t *rdm_fiaddr, uint64_t flags, - void *context) -{ - struct rxr_av_entry *av_entry; - int ret = 1; - - fastlock_acquire(&av->util_av.lock); - - HASH_FIND(hh, av->av_map, addr, av->rdm_addrlen, av_entry); - - if (!av_entry) { - ret = fi_av_insert(av->rdm_av, addr, 1, - rdm_fiaddr, flags, context); - if (OFI_UNLIKELY(ret != 1)) { - FI_DBG(&rxr_prov, FI_LOG_AV, - "Error in inserting address: %s\n", fi_strerror(-ret)); - goto out; - } else { - av_entry = calloc(1, sizeof(*av_entry)); - if (OFI_UNLIKELY(!av_entry)) { - ret = -FI_ENOMEM; - FI_WARN(&rxr_prov, FI_LOG_AV, - "Failed to allocate memory for av_entry\n"); - goto out; - } - memcpy(av_entry->addr, addr, av->rdm_addrlen); - av_entry->rdm_addr = *(uint64_t *)rdm_fiaddr; - HASH_ADD(hh, av->av_map, addr, - av->rdm_addrlen, av_entry); - } - } else { - *rdm_fiaddr = (fi_addr_t)av_entry->rdm_addr; - } - - FI_DBG(&rxr_prov, FI_LOG_AV, - "addr = %" PRIu64 " rdm_fiaddr = %" PRIu64 "\n", - *(uint64_t *)addr, *rdm_fiaddr); -out: - fastlock_release(&av->util_av.lock); - return ret; -} - -static int rxr_av_insert(struct fid_av *av_fid, const void *addr, - size_t count, fi_addr_t *fi_addr, uint64_t flags, - void *context) -{ - struct rxr_av *av; - fi_addr_t fi_addr_res; - int i = 0, ret = 0, success_cnt = 0; - - /* - * Providers are allowed to ignore FI_MORE. FI_SYNC_ERR is not - * supported. 
- */ - flags &= ~FI_MORE; - - if (flags) - return -FI_ENOSYS; - - av = container_of(av_fid, struct rxr_av, util_av.av_fid); - - if (av->util_av.count < av->rdm_av_used + count) { - FI_WARN(&rxr_prov, FI_LOG_AV, - "AV insert failed. Expect inserting %zu AV entries, but only %zu available\n", - count, av->util_av.count - av->rdm_av_used); - if (av->util_av.eq) - ofi_av_write_event(&av->util_av, i, FI_ENOMEM, context); - goto out; - } - - for (; i < count; i++, addr = (uint8_t *)addr + av->rdm_addrlen) { - ret = rxr_av_insert_rdm_addr(av, addr, &fi_addr_res, - flags, context); - if (ret != 1) - break; - - if (fi_addr) - fi_addr[i] = fi_addr_res; - - success_cnt++; - } - - av->rdm_av_used += success_cnt; - -out: - /* cancel remaining request and log to event queue */ - for (; i < count ; i++) { - if (av->util_av.eq) - ofi_av_write_event(&av->util_av, i, FI_ECANCELED, - context); - if (fi_addr) - fi_addr[i] = FI_ADDR_NOTAVAIL; - } - - /* update success to event queue */ - if (av->util_av.eq) - ofi_av_write_event(&av->util_av, success_cnt, 0, context); - - return success_cnt; -} - -static int rxr_av_insertsvc(struct fid_av *av, const char *node, - const char *service, fi_addr_t *fi_addr, - uint64_t flags, void *context) -{ - return -FI_ENOSYS; -} - -static int rxr_av_insertsym(struct fid_av *av_fid, const char *node, - size_t nodecnt, const char *service, size_t svccnt, - fi_addr_t *fi_addr, uint64_t flags, void *context) -{ - return -FI_ENOSYS; -} - -static int rxr_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, - size_t count, uint64_t flags) -{ - int ret = 0; - size_t i; - struct rxr_av *av; - struct rxr_av_entry *av_entry; - void *addr; - - av = container_of(av_fid, struct rxr_av, util_av.av_fid); - addr = calloc(1, av->rdm_addrlen); - if (!addr) { - FI_WARN(&rxr_prov, FI_LOG_AV, - "Failed to allocate memory for av addr\n"); - return -FI_ENOMEM; - } - - fastlock_acquire(&av->util_av.lock); - for (i = 0; i < count; i++) { - ret = fi_av_lookup(av->rdm_av, fi_addr[i], - addr, &av->rdm_addrlen); - if (ret) - break; - - ret = fi_av_remove(av->rdm_av, &fi_addr[i], 1, flags); - if (ret) - break; - - HASH_FIND(hh, av->av_map, addr, av->rdm_addrlen, av_entry); - - if (av_entry) { - HASH_DEL(av->av_map, av_entry); - free(av_entry); - } - - av->rdm_av_used--; - } - fastlock_release(&av->util_av.lock); - free(addr); - return ret; -} - -static const char *rxr_av_straddr(struct fid_av *av, const void *addr, - char *buf, size_t *len) -{ - struct rxr_av *rxr_av; - - rxr_av = container_of(av, struct rxr_av, util_av.av_fid); - return rxr_av->rdm_av->ops->straddr(rxr_av->rdm_av, addr, buf, len); -} - -static int rxr_av_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, - size_t *addrlen) -{ - struct rxr_av *rxr_av; - - rxr_av = container_of(av, struct rxr_av, util_av.av_fid); - return fi_av_lookup(rxr_av->rdm_av, fi_addr, addr, addrlen); -} - -static struct fi_ops_av rxr_av_ops = { - .size = sizeof(struct fi_ops_av), - .insert = rxr_av_insert, - .insertsvc = rxr_av_insertsvc, - .insertsym = rxr_av_insertsym, - .remove = rxr_av_remove, - .lookup = rxr_av_lookup, - .straddr = rxr_av_straddr, -}; - -static int rxr_av_close(struct fid *fid) -{ - struct rxr_av *av; - struct rxr_av_entry *curr_av_entry, *tmp; - int ret = 0; - - av = container_of(fid, struct rxr_av, util_av.av_fid); - ret = fi_close(&av->rdm_av->fid); - if (ret) - goto err; - - ret = ofi_av_close(&av->util_av); - if (ret) - goto err; - -err: - HASH_ITER(hh, av->av_map, curr_av_entry, tmp) { - HASH_DEL(av->av_map, curr_av_entry); - 
free(curr_av_entry); - } - free(av); - return ret; -} - -static int rxr_av_bind(struct fid *fid, struct fid *bfid, uint64_t flags) -{ - return ofi_av_bind(fid, bfid, flags); -} - -static struct fi_ops rxr_av_fi_ops = { - .size = sizeof(struct fi_ops), - .close = rxr_av_close, - .bind = rxr_av_bind, - .control = fi_no_control, - .ops_open = fi_no_ops_open, -}; - -int rxr_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, - struct fid_av **av_fid, void *context) -{ - struct rxr_av *av; - struct rxr_domain *domain; - struct fi_av_attr av_attr; - struct util_av_attr util_attr; - int ret; - - if (!attr) - return -FI_EINVAL; - - if (attr->name) - return -FI_ENOSYS; - - /* FI_EVENT, FI_READ, and FI_SYMMETRIC are not supported */ - if (attr->flags) - return -FI_ENOSYS; - - domain = container_of(domain_fid, struct rxr_domain, - util_domain.domain_fid); - av = calloc(1, sizeof(*av)); - if (!av) - return -FI_ENOMEM; - - /* - * TODO: remove me once RxR supports resizing members tied to the AV - * size. - */ - if (!attr->count) - attr->count = RXR_MIN_AV_SIZE; - else - attr->count = MAX(attr->count, RXR_MIN_AV_SIZE); - - util_attr.addrlen = sizeof(fi_addr_t); - util_attr.flags = 0; - ret = ofi_av_init(&domain->util_domain, attr, &util_attr, - &av->util_av, context); - if (ret) - goto err; - - av_attr = *attr; - - FI_DBG(&rxr_prov, FI_LOG_AV, "fi_av_attr:%" PRId64 "\n", - av_attr.flags); - - av_attr.type = FI_AV_TABLE; - - ret = fi_av_open(domain->rdm_domain, &av_attr, &av->rdm_av, context); - if (ret) - goto err; - - av->rdm_addrlen = domain->addrlen; - - *av_fid = &av->util_av.av_fid; - (*av_fid)->fid.fclass = FI_CLASS_AV; - (*av_fid)->fid.ops = &rxr_av_fi_ops; - (*av_fid)->ops = &rxr_av_ops; - return 0; - -err: - free(av); - return ret; -} diff --git a/prov/efa/src/rxr/rxr_cntr.c b/prov/efa/src/rxr/rxr_cntr.c index 85a0c7abcef..5b7e323df42 100644 --- a/prov/efa/src/rxr/rxr_cntr.c +++ b/prov/efa/src/rxr/rxr_cntr.c @@ -36,7 +36,7 @@ #include "rxr.h" #include "rxr_cntr.h" -static int rxr_cntr_wait(struct fid_cntr *cntr_fid, uint64_t threshold, int timeout) +static int efa_cntr_wait(struct fid_cntr *cntr_fid, uint64_t threshold, int timeout) { struct util_cntr *cntr; uint64_t start, errcnt; @@ -48,7 +48,7 @@ static int rxr_cntr_wait(struct fid_cntr *cntr_fid, uint64_t threshold, int time cntr = container_of(cntr_fid, struct util_cntr, cntr_fid); assert(cntr->wait); errcnt = ofi_atomic_get64(&cntr->err); - start = (timeout >= 0) ? fi_gettime_ms() : 0; + start = (timeout >= 0) ? 
ofi_gettime_ms() : 0; for (tryid = 0; tryid < numtry; ++tryid) { cntr->progress(cntr); @@ -59,7 +59,7 @@ static int rxr_cntr_wait(struct fid_cntr *cntr_fid, uint64_t threshold, int time return -FI_EAVAIL; if (timeout >= 0) { - timeout -= (int)(fi_gettime_ms() - start); + timeout -= (int)(ofi_gettime_ms() - start); if (timeout <= 0) return -FI_ETIMEDOUT; } @@ -74,7 +74,7 @@ static int rxr_cntr_wait(struct fid_cntr *cntr_fid, uint64_t threshold, int time return ret; } -int rxr_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, +int efa_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, struct fid_cntr **cntr_fid, void *context) { int ret; @@ -90,7 +90,7 @@ int rxr_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, goto free; *cntr_fid = &cntr->cntr_fid; - cntr->cntr_fid.ops->wait = rxr_cntr_wait; + cntr->cntr_fid.ops->wait = efa_cntr_wait; return FI_SUCCESS; free: @@ -98,20 +98,19 @@ int rxr_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, return ret; } -void rxr_cntr_report_tx_completion(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry) +void efa_cntr_report_tx_completion(struct util_ep *ep, uint64_t flags) { - uint64_t flags = tx_entry->cq_entry.flags & - (FI_SEND | FI_WRITE | FI_READ); struct util_cntr *cntr; + flags &= (FI_SEND | FI_WRITE | FI_READ); assert(flags == FI_SEND || flags == FI_WRITE || flags == FI_READ); if (flags == FI_SEND) - cntr = ep->util_ep.tx_cntr; + cntr = ep->tx_cntr; else if (flags == FI_WRITE) - cntr = ep->util_ep.wr_cntr; + cntr = ep->wr_cntr; else if (flags == FI_READ) - cntr = ep->util_ep.rd_cntr; + cntr = ep->rd_cntr; else cntr = NULL; @@ -119,21 +118,19 @@ void rxr_cntr_report_tx_completion(struct rxr_ep *ep, struct rxr_tx_entry *tx_en cntr->cntr_fid.ops->add(&cntr->cntr_fid, 1); } -void rxr_cntr_report_rx_completion(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry) +void efa_cntr_report_rx_completion(struct util_ep *ep, uint64_t flags) { - uint64_t flags = rx_entry->cq_entry.flags & - (FI_RECV | FI_REMOTE_WRITE | FI_REMOTE_READ); + struct util_cntr *cntr; + flags &= (FI_RECV | FI_REMOTE_WRITE | FI_REMOTE_READ); assert(flags == FI_RECV || flags == FI_REMOTE_WRITE || flags == FI_REMOTE_READ); - struct util_cntr *cntr; - if (flags == FI_RECV) - cntr = ep->util_ep.rx_cntr; + cntr = ep->rx_cntr; else if (flags == FI_REMOTE_READ) - cntr = ep->util_ep.rem_rd_cntr; + cntr = ep->rem_rd_cntr; else if (flags == FI_REMOTE_WRITE) - cntr = ep->util_ep.rem_wr_cntr; + cntr = ep->rem_wr_cntr; else cntr = NULL; @@ -141,7 +138,7 @@ void rxr_cntr_report_rx_completion(struct rxr_ep *ep, struct rxr_rx_entry *rx_en cntr->cntr_fid.ops->add(&cntr->cntr_fid, 1); } -void rxr_cntr_report_error(struct rxr_ep *ep, uint64_t flags) +void efa_cntr_report_error(struct util_ep *ep, uint64_t flags) { flags = flags & (FI_SEND | FI_READ | FI_WRITE | FI_ATOMIC | FI_RECV | FI_REMOTE_READ | FI_REMOTE_WRITE); @@ -149,17 +146,17 @@ void rxr_cntr_report_error(struct rxr_ep *ep, uint64_t flags) struct util_cntr *cntr; if (flags == FI_WRITE || flags == FI_ATOMIC) - cntr = ep->util_ep.wr_cntr; + cntr = ep->wr_cntr; else if (flags == FI_READ) - cntr = ep->util_ep.rd_cntr; + cntr = ep->rd_cntr; else if (flags == FI_SEND) - cntr = ep->util_ep.tx_cntr; + cntr = ep->tx_cntr; else if (flags == FI_RECV) - cntr = ep->util_ep.rx_cntr; + cntr = ep->rx_cntr; else if (flags == FI_REMOTE_READ) - cntr = ep->util_ep.rem_rd_cntr; + cntr = ep->rem_rd_cntr; else if (flags == FI_REMOTE_WRITE) - cntr = ep->util_ep.rem_wr_cntr; + cntr = ep->rem_wr_cntr; else cntr = NULL; 
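[Aside, not part of the patch: a minimal sketch of how a caller adapts to the counter-helper rename above. The old rxr_cntr_report_tx_completion() took the RxR-specific tx_entry, while the renamed efa_cntr_report_tx_completion() takes the generic util_ep plus the completion flags, so the same helper can be shared by the EFA and RxR endpoint code. The wrapper name example_report_tx below is hypothetical; the struct fields are those used elsewhere in this series.]

/*
 * Hypothetical call site, for illustration only.
 */
static void example_report_tx(struct rxr_ep *ep,
			      struct rxr_tx_entry *tx_entry)
{
	/* before this patch:
	 *     rxr_cntr_report_tx_completion(ep, tx_entry);
	 */

	/* after this patch: pass the util_ep and the completion flags */
	efa_cntr_report_tx_completion(&ep->util_ep,
				      tx_entry->cq_entry.flags);
}
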
diff --git a/prov/efa/src/rxr/rxr_cntr.h b/prov/efa/src/rxr/rxr_cntr.h index 6a824718cce..7bee0fa15c0 100644 --- a/prov/efa/src/rxr/rxr_cntr.h +++ b/prov/efa/src/rxr/rxr_cntr.h @@ -38,14 +38,14 @@ #ifndef _RXR_CNTR_H_ #define _RXR_CNTR_H_ -int rxr_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, +int efa_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, struct fid_cntr **cntr_fid, void *context); -void rxr_cntr_report_tx_completion(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry); +void efa_cntr_report_tx_completion(struct util_ep *ep, uint64_t flags); -void rxr_cntr_report_rx_completion(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry); +void efa_cntr_report_rx_completion(struct util_ep *ep, uint64_t flags); -void rxr_cntr_report_error(struct rxr_ep *ep, uint64_t flags); +void efa_cntr_report_error(struct util_ep *ep, uint64_t flags); #endif diff --git a/prov/efa/src/rxr/rxr_cq.c b/prov/efa/src/rxr/rxr_cq.c index df3d232a9f3..0e03c02bda4 100644 --- a/prov/efa/src/rxr/rxr_cq.c +++ b/prov/efa/src/rxr/rxr_cq.c @@ -38,7 +38,10 @@ #include #include "rxr.h" #include "rxr_rma.h" +#include "rxr_msg.h" #include "rxr_cntr.h" +#include "rxr_read.h" +#include "rxr_atomic.h" #include "efa.h" static const char *rxr_cq_strerror(struct fid_cq *cq_fid, int prov_errno, @@ -66,7 +69,7 @@ static const char *rxr_cq_strerror(struct fid_cq *cq_fid, int prov_errno, /* * Teardown rx_entry and write an error cq entry. With our current protocol we - * will only encounter an RX error when sending a queued RTS or CTS packet or + * will only encounter an RX error when sending a queued REQ or CTS packet or * if we are sending a CTS message. Because of this, the sender will not send * any additional data packets if the receiver encounters an error. If there is * a scenario in the future where the sender will continue to send data packets @@ -103,8 +106,9 @@ int rxr_cq_handle_rx_error(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, dlist_remove(&rx_entry->rx_pending_entry); #endif break; - case RXR_RX_QUEUED_CTS: + case RXR_RX_QUEUED_CTRL: case RXR_RX_QUEUED_CTS_RNR: + case RXR_RX_QUEUED_EOR: dlist_remove(&rx_entry->queued_entry); break; default: @@ -116,17 +120,15 @@ int rxr_cq_handle_rx_error(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, dlist_foreach_container_safe(&rx_entry->queued_pkts, struct rxr_pkt_entry, pkt_entry, entry, tmp) - rxr_release_tx_pkt_entry(ep, pkt_entry); + rxr_pkt_entry_release_tx(ep, pkt_entry); - if (rx_entry->unexp_rts_pkt) { - if (rx_entry->unexp_rts_pkt->type == RXR_PKT_ENTRY_POSTED) - ep->rx_bufs_to_post++; - rxr_release_rx_pkt_entry(ep, rx_entry->unexp_rts_pkt); - rx_entry->unexp_rts_pkt = NULL; + if (rx_entry->unexp_pkt) { + rxr_pkt_entry_release_rx(ep, rx_entry->unexp_pkt); + rx_entry->unexp_pkt = NULL; } if (rx_entry->fi_flags & FI_MULTI_RECV) - rxr_cq_handle_multi_recv_completion(ep, rx_entry); + rxr_msg_multi_recv_handle_completion(ep, rx_entry); err_entry.flags = rx_entry->cq_entry.flags; if (rx_entry->state != RXR_RX_UNEXP) @@ -135,7 +137,7 @@ int rxr_cq_handle_rx_error(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, err_entry.data = rx_entry->cq_entry.data; err_entry.tag = rx_entry->cq_entry.tag; - rxr_multi_recv_free_posted_entry(ep, rx_entry); + rxr_msg_multi_recv_free_posted_entry(ep, rx_entry); FI_WARN(&rxr_prov, FI_LOG_CQ, "rxr_cq_handle_rx_error: err: %d, prov_err: %s (%d)\n", @@ -149,7 +151,7 @@ int rxr_cq_handle_rx_error(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, */ //rxr_release_rx_entry(ep, rx_entry); - rxr_cntr_report_error(ep, 
err_entry.flags); + efa_cntr_report_error(&ep->util_ep, err_entry.flags); return ofi_cq_write_error(util_cq, &err_entry); } @@ -181,16 +183,19 @@ int rxr_cq_handle_tx_error(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, err_entry.prov_errno = (int)prov_errno; switch (tx_entry->state) { - case RXR_TX_RTS: + case RXR_TX_REQ: break; case RXR_TX_SEND: dlist_remove(&tx_entry->entry); break; - case RXR_TX_QUEUED_RTS: - case RXR_TX_QUEUED_RTS_RNR: + case RXR_TX_QUEUED_CTRL: + case RXR_TX_QUEUED_SHM_RMA: + case RXR_TX_QUEUED_REQ_RNR: case RXR_TX_QUEUED_DATA_RNR: dlist_remove(&tx_entry->queued_entry); break; + case RXR_TX_WAIT_READ_FINISH: + break; default: FI_WARN(&rxr_prov, FI_LOG_CQ, "tx_entry unknown state %d\n", tx_entry->state); @@ -200,7 +205,7 @@ int rxr_cq_handle_tx_error(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, dlist_foreach_container_safe(&tx_entry->queued_pkts, struct rxr_pkt_entry, pkt_entry, entry, tmp) - rxr_release_tx_pkt_entry(ep, pkt_entry); + rxr_pkt_entry_release_tx(ep, pkt_entry); err_entry.flags = tx_entry->cq_entry.flags; err_entry.op_context = tx_entry->cq_entry.op_context; @@ -210,7 +215,7 @@ int rxr_cq_handle_tx_error(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, if (FI_VERSION_GE(api_version, FI_VERSION(1, 5))) err_entry.err_data_size = 0; - FI_WARN(&rxr_prov, FI_LOG_CQ, + FI_WARN(&rxr_prov, FI_LOG_CQ, "rxr_cq_handle_tx_error: err: %d, prov_err: %s (%d)\n", err_entry.err, fi_strerror(-err_entry.prov_errno), err_entry.prov_errno); @@ -222,7 +227,7 @@ int rxr_cq_handle_tx_error(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, */ //rxr_release_tx_entry(ep, tx_entry); - rxr_cntr_report_error(ep, tx_entry->cq_entry.flags); + efa_cntr_report_error(&ep->util_ep, tx_entry->cq_entry.flags); return ofi_cq_write_error(util_cq, &err_entry); } @@ -252,11 +257,11 @@ static inline void rxr_cq_queue_pkt(struct rxr_ep *ep, * a retransmitted packet is received while waiting for the timer to * expire. */ - peer->rnr_ts = fi_gettime_us(); - if (peer->rnr_state & RXR_PEER_IN_BACKOFF) + peer->rnr_ts = ofi_gettime_us(); + if (peer->flags & RXR_PEER_IN_BACKOFF) goto queue_pkt; - peer->rnr_state |= RXR_PEER_IN_BACKOFF; + peer->flags |= RXR_PEER_IN_BACKOFF; if (!peer->timeout_interval) { if (rxr_env.timeout_interval) @@ -274,8 +279,8 @@ static inline void rxr_cq_queue_pkt(struct rxr_ep *ep, peer->rnr_queued_pkt_cnt); } else { /* Only backoff once per peer per progress thread loop. */ - if (!(peer->rnr_state & RXR_PEER_BACKED_OFF)) { - peer->rnr_state |= RXR_PEER_BACKED_OFF; + if (!(peer->flags & RXR_PEER_BACKED_OFF)) { + peer->flags |= RXR_PEER_BACKED_OFF; peer->rnr_timeout_exp++; FI_DBG(&rxr_prov, FI_LOG_EP_DATA, "increasing backoff for peer: %" PRIu64 @@ -300,6 +305,7 @@ int rxr_cq_handle_cq_error(struct rxr_ep *ep, ssize_t err) struct rxr_pkt_entry *pkt_entry; struct rxr_rx_entry *rx_entry; struct rxr_tx_entry *tx_entry; + struct rxr_read_entry *read_entry; struct rxr_peer *peer; ssize_t ret; @@ -321,7 +327,7 @@ int rxr_cq_handle_cq_error(struct rxr_ep *ep, ssize_t err) } ret = fi_cq_readerr(ep->rdm_cq, &err_entry, 0); - if (ret != sizeof(err_entry)) { + if (ret != 1) { if (ret < 0) { FI_WARN(&rxr_prov, FI_LOG_CQ, "fi_cq_readerr: %s\n", fi_strerror(-ret)); @@ -343,27 +349,26 @@ int rxr_cq_handle_cq_error(struct rxr_ep *ep, ssize_t err) peer = rxr_ep_get_peer(ep, pkt_entry->addr); /* - * A connack send could fail at the core provider if the peer endpoint - * is shutdown soon after it receives a send completion for the RTS - * packet that included src_address. 
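/*
 * Hedged, illustrative sketch (names assumed, not taken from this patch):
 * the check above compares fi_cq_readerr()'s return value against 1
 * because, per the libfabric CQ API, a successful fi_cq_readerr() returns
 * the number of error entries read (one), not the size of the entry.
 * `cq` is an assumed, already-opened struct fid_cq *.
 */
#include <stdio.h>
#include <rdma/fabric.h>
#include <rdma/fi_eq.h>
#include <rdma/fi_errno.h>

static void drain_cq_example(struct fid_cq *cq)
{
	struct fi_cq_data_entry comp;
	struct fi_cq_err_entry err = {0};
	ssize_t ret;

	for (;;) {
		ret = fi_cq_read(cq, &comp, 1);
		if (ret == 1)
			continue;		/* one completion consumed */
		if (ret == -FI_EAGAIN)
			break;			/* queue is empty */
		if (ret == -FI_EAVAIL) {	/* an error entry is pending */
			ret = fi_cq_readerr(cq, &err, 0);
			if (ret == 1)		/* success: one error entry read */
				fprintf(stderr, "cq error %d, prov_errno %d\n",
					err.err, err.prov_errno);
			continue;
		}
		break;				/* unexpected failure */
	}
}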
The connack itself is irrelevant if + * A handshake send could fail at the core provider if the peer endpoint + * is shutdown soon after it receives a send completion for the REQ + * packet that included src_address. The handshake itself is irrelevant if * that happens, so just squelch this error entry and move on without * writing an error completion or event to the application. */ - if (rxr_get_base_hdr(pkt_entry->pkt)->type == RXR_CONNACK_PKT) { + if (rxr_get_base_hdr(pkt_entry->pkt)->type == RXR_HANDSHAKE_PKT) { FI_WARN(&rxr_prov, FI_LOG_CQ, - "Squelching error CQE for RXR_CONNACK_PKT\n"); + "Squelching error CQE for RXR_HANDSHAKE_PKT\n"); /* - * CONNACK packets do not have an associated rx/tx entry. Use + * HANDSHAKE packets do not have an associated rx/tx entry. Use * the flags instead to determine if this is a send or recv. */ if (err_entry.flags & FI_SEND) { rxr_ep_dec_tx_pending(ep, peer, 1); - rxr_release_tx_pkt_entry(ep, pkt_entry); + rxr_pkt_entry_release_tx(ep, pkt_entry); } else if (err_entry.flags & FI_RECV) { - rxr_release_rx_pkt_entry(ep, pkt_entry); - ep->rx_bufs_to_post++; + rxr_pkt_entry_release_rx(ep, pkt_entry); } else { - assert(0 && "unknown err_entry flags in CONNACK packet"); + assert(0 && "unknown err_entry flags in HANDSHAKE packet"); } return 0; } @@ -374,8 +379,7 @@ int rxr_cq_handle_cq_error(struct rxr_ep *ep, ssize_t err) * Since we don't have any context besides the error code, * we will write to the eq instead. */ - rxr_release_rx_pkt_entry(ep, pkt_entry); - ep->rx_bufs_to_post++; + rxr_pkt_entry_release_rx(ep, pkt_entry); goto write_err; } @@ -384,14 +388,15 @@ int rxr_cq_handle_cq_error(struct rxr_ep *ep, ssize_t err) * packet. Decrement the tx_pending counter and fall through to * the rx or tx entry handlers. */ - rxr_ep_dec_tx_pending(ep, peer, 1); + if (!peer->is_local) + rxr_ep_dec_tx_pending(ep, peer, 1); if (RXR_GET_X_ENTRY_TYPE(pkt_entry) == RXR_TX_ENTRY) { tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; - if (err_entry.err != -FI_EAGAIN || - rxr_ep_domain(ep)->resource_mgmt == FI_RM_ENABLED) { + if (err_entry.prov_errno != IBV_WC_RNR_RETRY_EXC_ERR || + ep->handle_resource_management != FI_RM_ENABLED) { ret = rxr_cq_handle_tx_error(ep, tx_entry, err_entry.prov_errno); - rxr_release_tx_pkt_entry(ep, pkt_entry); + rxr_pkt_entry_release_tx(ep, pkt_entry); return ret; } @@ -401,19 +406,19 @@ int rxr_cq_handle_cq_error(struct rxr_ep *ep, ssize_t err) tx_entry->state = RXR_TX_QUEUED_DATA_RNR; dlist_insert_tail(&tx_entry->queued_entry, &ep->tx_entry_queued_list); - } else if (tx_entry->state == RXR_TX_RTS) { - tx_entry->state = RXR_TX_QUEUED_RTS_RNR; + } else if (tx_entry->state == RXR_TX_REQ) { + tx_entry->state = RXR_TX_QUEUED_REQ_RNR; dlist_insert_tail(&tx_entry->queued_entry, &ep->tx_entry_queued_list); } return 0; } else if (RXR_GET_X_ENTRY_TYPE(pkt_entry) == RXR_RX_ENTRY) { rx_entry = (struct rxr_rx_entry *)pkt_entry->x_entry; - if (err_entry.err != -FI_EAGAIN || - rxr_ep_domain(ep)->resource_mgmt == FI_RM_ENABLED) { + if (err_entry.prov_errno != IBV_WC_RNR_RETRY_EXC_ERR || + ep->handle_resource_management != FI_RM_ENABLED) { ret = rxr_cq_handle_rx_error(ep, rx_entry, err_entry.prov_errno); - rxr_release_tx_pkt_entry(ep, pkt_entry); + rxr_pkt_entry_release_tx(ep, pkt_entry); return ret; } rxr_cq_queue_pkt(ep, &rx_entry->queued_pkts, pkt_entry); @@ -423,6 +428,14 @@ int rxr_cq_handle_cq_error(struct rxr_ep *ep, ssize_t err) &ep->rx_entry_queued_list); } return 0; + } else if (RXR_GET_X_ENTRY_TYPE(pkt_entry) == RXR_READ_ENTRY) { + read_entry = 
(struct rxr_read_entry *)pkt_entry->x_entry; + /* read requests is not expected to get RNR, so we call + * rxr_read_handle_error() to handle general error here. + */ + ret = rxr_read_handle_error(ep, read_entry, err_entry.prov_errno); + rxr_pkt_entry_release_tx(ep, pkt_entry); + return ret; } FI_WARN(&rxr_prov, FI_LOG_CQ, @@ -430,115 +443,12 @@ int rxr_cq_handle_cq_error(struct rxr_ep *ep, ssize_t err) __func__, RXR_GET_X_ENTRY_TYPE(pkt_entry)); assert(0 && "unknown x_entry state"); write_err: - rxr_eq_write_error(ep, err_entry.err, err_entry.prov_errno); + efa_eq_write_error(&ep->util_ep, err_entry.err, err_entry.prov_errno); return 0; } -static int rxr_cq_match_recv(struct dlist_entry *item, const void *arg) -{ - const struct rxr_pkt_entry *pkt_entry = arg; - struct rxr_rx_entry *rx_entry; - - rx_entry = container_of(item, struct rxr_rx_entry, entry); - - return rxr_match_addr(rx_entry->addr, pkt_entry->addr); -} - -static int rxr_cq_match_trecv(struct dlist_entry *item, const void *arg) -{ - struct rxr_pkt_entry *pkt_entry = (struct rxr_pkt_entry *)arg; - struct rxr_rx_entry *rx_entry; - - rx_entry = container_of(item, struct rxr_rx_entry, entry); - - return rxr_match_addr(rx_entry->addr, pkt_entry->addr) && - rxr_match_tag(rx_entry->cq_entry.tag, rx_entry->ignore, - rxr_get_rts_hdr(pkt_entry->pkt)->tag); -} - -static void rxr_cq_post_connack(struct rxr_ep *ep, - struct rxr_peer *peer, - fi_addr_t addr) -{ - struct rxr_pkt_entry *pkt_entry; - ssize_t ret; - - if (peer->state == RXR_PEER_ACKED) - return; - - pkt_entry = rxr_get_pkt_entry(ep, ep->tx_pkt_pool); - if (OFI_UNLIKELY(!pkt_entry)) - return; - - rxr_ep_init_connack_pkt_entry(ep, pkt_entry, addr); - - /* - * TODO: Once we start using a core's selective completion capability, - * post the CONNACK packets without FI_COMPLETION. - */ - ret = rxr_ep_send_pkt(ep, pkt_entry, addr); - - /* - * Skip sending this connack on error and try again when processing the - * next RTS from this peer containing the source information - */ - if (OFI_UNLIKELY(ret)) { - rxr_release_tx_pkt_entry(ep, pkt_entry); - if (ret == -FI_EAGAIN) - return; - FI_WARN(&rxr_prov, FI_LOG_CQ, - "Failed to send a CONNACK packet: ret %zd\n", ret); - } else { - peer->state = RXR_PEER_ACKED; - } - - return; -} - -ssize_t rxr_cq_post_cts(struct rxr_ep *ep, - struct rxr_rx_entry *rx_entry, - uint64_t size) -{ - ssize_t ret; - struct rxr_pkt_entry *pkt_entry; - int credits; - - if (OFI_UNLIKELY(ep->posted_bufs == 0 || ep->available_data_bufs == 0)) - return -FI_EAGAIN; - - pkt_entry = rxr_get_pkt_entry(ep, ep->tx_pkt_pool); - - if (OFI_UNLIKELY(!pkt_entry)) - return -FI_EAGAIN; - - rxr_ep_init_cts_pkt_entry(ep, rx_entry, pkt_entry, size, &credits); - - ret = rxr_ep_send_pkt(ep, pkt_entry, rx_entry->addr); - if (OFI_UNLIKELY(ret)) - goto release_pkt; - - rx_entry->window = rxr_get_cts_hdr(pkt_entry->pkt)->window; - ep->available_data_bufs -= credits; - - /* - * Set a timer if available_bufs is exhausted. We may encounter a - * scenario where a peer has stopped responding so we need a fallback - * to replenish the credits. 
- */ - if (OFI_UNLIKELY(ep->available_data_bufs == 0)) - ep->available_data_bufs_ts = fi_gettime_us(); - - return ret; - -release_pkt: - rxr_release_tx_pkt_entry(ep, pkt_entry); - return ret; -} - -int rxr_cq_write_rx_completion(struct rxr_ep *ep, - struct fi_cq_msg_entry *comp, - struct rxr_pkt_entry *pkt_entry, - struct rxr_rx_entry *rx_entry) +void rxr_cq_write_rx_completion(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry) { struct util_cq *rx_cq = ep->util_ep.rx_cq; int ret = 0; @@ -565,8 +475,8 @@ int rxr_cq_write_rx_completion(struct rxr_ep *ep, "Unable to write recv error cq: %s\n", fi_strerror(-ret)); - rxr_cntr_report_error(ep, rx_entry->cq_entry.flags); - goto out; + efa_cntr_report_error(&ep->util_ep, rx_entry->cq_entry.flags); + return; } if (!(rx_entry->rxr_flags & RXR_RECV_CANCEL) && @@ -576,7 +486,7 @@ int rxr_cq_write_rx_completion(struct rxr_ep *ep, "Writing recv completion for rx_entry from peer: %" PRIu64 " rx_id: %" PRIu32 " msg_id: %" PRIu32 " tag: %lx total_len: %" PRIu64 "\n", - pkt_entry->addr, rx_entry->rx_id, rx_entry->msg_id, + rx_entry->addr, rx_entry->rx_id, rx_entry->msg_id, rx_entry->cq_entry.tag, rx_entry->total_len); if (ep->util_ep.caps & FI_SOURCE) @@ -605,44 +515,28 @@ int rxr_cq_write_rx_completion(struct rxr_ep *ep, fi_strerror(-ret)); if (rxr_cq_handle_rx_error(ep, rx_entry, ret)) assert(0 && "failed to write err cq entry"); - if (pkt_entry->type == RXR_PKT_ENTRY_POSTED) - ep->rx_bufs_to_post++; - rxr_release_rx_pkt_entry(ep, pkt_entry); - return ret; + return; } } - rxr_cntr_report_rx_completion(ep, rx_entry); - -out: - return 0; + efa_cntr_report_rx_completion(&ep->util_ep, rx_entry->cq_entry.flags); } -int rxr_cq_handle_rx_completion(struct rxr_ep *ep, - struct fi_cq_msg_entry *comp, - struct rxr_pkt_entry *pkt_entry, - struct rxr_rx_entry *rx_entry) +void rxr_cq_handle_rx_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + struct rxr_rx_entry *rx_entry) { - int ret = 0; struct rxr_tx_entry *tx_entry = NULL; - if (rx_entry->fi_flags & FI_MULTI_RECV) - rxr_cq_handle_multi_recv_completion(ep, rx_entry); - if (rx_entry->cq_entry.flags & FI_WRITE) { /* - * must be on the remote side, notify cq/counter - * if FI_RMA_EVENT is requested or REMOTE_CQ_DATA is on + * must be on the remote side, notify cq if REMOTE_CQ_DATA is on */ if (rx_entry->cq_entry.flags & FI_REMOTE_CQ_DATA) - ret = rxr_cq_write_rx_completion(ep, comp, pkt_entry, rx_entry); - else if (ep->util_ep.caps & FI_RMA_EVENT) - rxr_cntr_report_rx_completion(ep, rx_entry); + rxr_cq_write_rx_completion(ep, rx_entry); - if (pkt_entry->type == RXR_PKT_ENTRY_POSTED) - ep->rx_bufs_to_post++; - rxr_release_rx_pkt_entry(ep, pkt_entry); - return ret; + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; } if (rx_entry->cq_entry.flags & FI_READ) { @@ -653,19 +547,18 @@ int rxr_cq_handle_rx_completion(struct rxr_ep *ep, * The following shows the sequence of events that * is happening * - * Initiator side Remote side + * Initiator side Remote side * create tx_entry * create rx_entry - * send rts(with rx_id) - * receive rts + * send rtr(with rx_id) + * receive rtr * create rx_entry * create tx_entry * tx_entry sending data * rx_entry receiving data * receive completed send completed * handle_rx_completion() handle_pkt_send_completion() - * |->write_tx_completion() |-> if (FI_RMA_EVENT) - * write_rx_completion() + * |->write_tx_completion() * * As can be seen, although there is a rx_entry on remote side, * the entry will not enter into rxr_cq_handle_rx_completion @@ -677,9 +570,9 @@ int 
rxr_cq_handle_rx_completion(struct rxr_ep *ep, assert(tx_entry->state == RXR_TX_WAIT_READ_FINISH); if (tx_entry->fi_flags & FI_COMPLETION) { /* Note write_tx_completion() will release tx_entry */ - rxr_cq_write_tx_completion(ep, comp, tx_entry); + rxr_cq_write_tx_completion(ep, tx_entry); } else { - rxr_cntr_report_tx_completion(ep, tx_entry); + efa_cntr_report_tx_completion(&ep->util_ep, tx_entry->cq_entry.flags); rxr_release_tx_entry(ep, tx_entry); } @@ -687,578 +580,185 @@ int rxr_cq_handle_rx_completion(struct rxr_ep *ep, * do not call rxr_release_rx_entry here because * caller will release */ - if (pkt_entry->type == RXR_PKT_ENTRY_POSTED) - ep->rx_bufs_to_post++; - rxr_release_rx_pkt_entry(ep, pkt_entry); - return 0; - } - - ret = rxr_cq_write_rx_completion(ep, comp, pkt_entry, rx_entry); - if (pkt_entry->type == RXR_PKT_ENTRY_POSTED) - ep->rx_bufs_to_post++; - rxr_release_rx_pkt_entry(ep, pkt_entry); - return ret; -} - -void rxr_cq_recv_rts_data(struct rxr_ep *ep, - struct rxr_rx_entry *rx_entry, - struct rxr_rts_hdr *rts_hdr) -{ - char *data; - uint32_t emulated_rma_flags = 0; - int ret = 0; - struct fi_rma_iov *rma_iov = NULL; - - /* - * Use the correct header and grab CQ data and data, but ignore the - * source_address since that has been fetched and processed already - */ - if (rts_hdr->flags & RXR_REMOTE_CQ_DATA) { - rx_entry->cq_entry.flags |= FI_REMOTE_CQ_DATA; - data = rxr_get_ctrl_cq_pkt(rts_hdr)->data + rts_hdr->addrlen; - rx_entry->cq_entry.data = - rxr_get_ctrl_cq_pkt(rts_hdr)->hdr.cq_data; - } else { - rx_entry->cq_entry.data = 0; - data = rxr_get_ctrl_pkt(rts_hdr)->data + rts_hdr->addrlen; - } - - if (rts_hdr->flags & (RXR_READ_REQ | RXR_WRITE)) { - rma_iov = (struct fi_rma_iov *)data; - - if (rts_hdr->flags & RXR_READ_REQ) { - emulated_rma_flags = FI_SEND; - rx_entry->cq_entry.flags |= (FI_RMA | FI_READ); - } else { - assert(rts_hdr->flags | RXR_WRITE); - emulated_rma_flags = FI_RECV; - rx_entry->cq_entry.flags |= (FI_RMA | FI_WRITE); - } - - assert(rx_entry->iov_count == 0); - - rx_entry->iov_count = rts_hdr->rma_iov_count; - ret = rxr_rma_verified_copy_iov(ep, rma_iov, rts_hdr->rma_iov_count, emulated_rma_flags, - rx_entry->iov); - if (ret) { - FI_WARN(&rxr_prov, FI_LOG_CQ, "RMA address verify failed!\n"); - rxr_cq_handle_cq_error(ep, -FI_EIO); - } - - rx_entry->cq_entry.len = ofi_total_iov_len(&rx_entry->iov[0], - rx_entry->iov_count); - rx_entry->cq_entry.buf = rx_entry->iov[0].iov_base; - data += rts_hdr->rma_iov_count * sizeof(struct fi_rma_iov); - } - - /* we are sinking message for CANCEL/DISCARD entry */ - if (OFI_UNLIKELY(rx_entry->rxr_flags & RXR_RECV_CANCEL)) { - rx_entry->bytes_done += rxr_get_rts_data_size(ep, rts_hdr); + rxr_pkt_entry_release_rx(ep, pkt_entry); return; } - if (rx_entry->cq_entry.flags & FI_READ) { - uint64_t *ptr = (uint64_t *)data; - - rx_entry->bytes_done = 0; - rx_entry->rma_initiator_rx_id = *ptr; - ptr += 1; - rx_entry->window = *ptr; - assert(rx_entry->window > 0); - } else { - rx_entry->bytes_done += ofi_copy_to_iov(rx_entry->iov, rx_entry->iov_count, - 0, data, rxr_get_rts_data_size(ep, rts_hdr)); - - assert(rx_entry->bytes_done == MIN(rx_entry->cq_entry.len, rxr_get_rts_data_size(ep, rts_hdr))); - } -} - -static int rxr_cq_process_rts(struct rxr_ep *ep, - struct rxr_pkt_entry *pkt_entry) -{ - struct rxr_rts_hdr *rts_hdr; - struct dlist_entry *match; - struct rxr_rx_entry *rx_entry; - struct rxr_tx_entry *tx_entry; - uint64_t bytes_left; - uint64_t tag = 0; - uint32_t op; - int ret = 0; - - rts_hdr = 
rxr_get_rts_hdr(pkt_entry->pkt); - - if (rts_hdr->flags & RXR_TAGGED) { - match = dlist_find_first_match(&ep->rx_tagged_list, - &rxr_cq_match_trecv, - (void *)pkt_entry); - } else if (rts_hdr->flags & (RXR_READ_REQ | RXR_WRITE)) { - /* - * rma is one sided operation, match is not expected - * we need to create a rx entry upon receiving a rts - */ - tag = ~0; // RMA is not tagged - op = (rts_hdr->flags & RXR_READ_REQ) ? ofi_op_read_rsp : ofi_op_write_async; - rx_entry = rxr_ep_get_rx_entry(ep, NULL, 0, tag, 0, NULL, pkt_entry->addr, op, 0); - if (OFI_UNLIKELY(!rx_entry)) { - FI_WARN(&rxr_prov, FI_LOG_CQ, - "RX entries exhausted.\n"); - rxr_eq_write_error(ep, FI_ENOBUFS, -FI_ENOBUFS); - return -FI_ENOBUFS; - } - dlist_insert_tail(&rx_entry->entry, &ep->rx_list); - match = &rx_entry->entry; - } else { - match = dlist_find_first_match(&ep->rx_list, - &rxr_cq_match_recv, - (void *)pkt_entry); - } - - if (OFI_UNLIKELY(!match)) { - rx_entry = rxr_ep_get_new_unexp_rx_entry(ep, pkt_entry); - if (!rx_entry) { - FI_WARN(&rxr_prov, FI_LOG_CQ, - "RX entries exhausted.\n"); - rxr_eq_write_error(ep, FI_ENOBUFS, -FI_ENOBUFS); - return -FI_ENOBUFS; - } - pkt_entry = rx_entry->unexp_rts_pkt; - rts_hdr = rxr_get_rts_hdr(pkt_entry->pkt); - } else { - rx_entry = container_of(match, struct rxr_rx_entry, entry); - if (rx_entry->rxr_flags & RXR_MULTI_RECV_POSTED) { - rx_entry = rxr_ep_split_rx_entry(ep, rx_entry, - NULL, pkt_entry); - if (OFI_UNLIKELY(!rx_entry)) { - FI_WARN(&rxr_prov, FI_LOG_CQ, - "RX entries exhausted.\n"); - rxr_eq_write_error(ep, FI_ENOBUFS, -FI_ENOBUFS); - return -FI_ENOBUFS; - } - } - - rx_entry->state = RXR_RX_MATCHED; - - if (!(rx_entry->fi_flags & FI_MULTI_RECV) || - !rxr_multi_recv_buffer_available(ep, - rx_entry->master_entry)) - dlist_remove(match); - } - - rx_entry->addr = pkt_entry->addr; - rx_entry->tx_id = rts_hdr->tx_id; - rx_entry->msg_id = rts_hdr->msg_id; - rx_entry->total_len = rts_hdr->data_len; - rx_entry->cq_entry.tag = rts_hdr->tag; - - if (OFI_UNLIKELY(!match)) - return 0; - - /* - * TODO: Change protocol to contact sender to stop sending when the - * message is truncated instead of sinking the additional data. - */ - - rxr_cq_recv_rts_data(ep, rx_entry, rts_hdr); - - if (rx_entry->cq_entry.flags & FI_READ) { - /* - * create a tx_entry for sending data back to initiator - */ - tx_entry = rxr_readrsp_tx_entry_init(ep, rx_entry); - - /* the only difference between a read response packet and - * a data packet is that read response packet has remote EP tx_id - * which initiator EP rx_entry need to send CTS back - */ - - ret = rxr_ep_post_readrsp(ep, tx_entry); - if (!ret) { - tx_entry->state = RXR_TX_SENT_READRSP; - if (tx_entry->bytes_sent < tx_entry->total_len) { - /* as long as read response packet has been sent, - * data packets are ready to be sent. it is OK that - * data packets arrive before read response packet, - * because tx_id is needed by the initator EP in order - * to send a CTS, which will not occur until - * all data packets in current window are received, which - * include the data in the read response packet. 
- */ - dlist_insert_tail(&tx_entry->entry, &ep->tx_pending_list); - tx_entry->state = RXR_TX_SEND; - } - } else if (ret == -FI_EAGAIN) { - dlist_insert_tail(&tx_entry->queued_entry, &ep->tx_entry_queued_list); - tx_entry->state = RXR_TX_QUEUED_READRSP; - ret = 0; - } else { - if (rxr_cq_handle_tx_error(ep, tx_entry, ret)) - assert(0 && "failed to write err cq entry"); - } - - rx_entry->state = RXR_RX_WAIT_READ_FINISH; - if (pkt_entry->type == RXR_PKT_ENTRY_POSTED) - ep->rx_bufs_to_post++; - rxr_release_rx_pkt_entry(ep, pkt_entry); - return ret; - } - - bytes_left = rx_entry->total_len - rxr_get_rts_data_size(ep, rts_hdr); - rx_entry->cq_entry.len = MIN(rx_entry->total_len, - rx_entry->cq_entry.len); - - if (!bytes_left) { - ret = rxr_cq_handle_rx_completion(ep, NULL, - pkt_entry, rx_entry); - rxr_multi_recv_free_posted_entry(ep, rx_entry); - if (!ret) - rxr_release_rx_entry(ep, rx_entry); - return ret; - } - -#if ENABLE_DEBUG - dlist_insert_tail(&rx_entry->rx_pending_entry, &ep->rx_pending_list); - ep->rx_pending++; -#endif - rx_entry->state = RXR_RX_RECV; - if (rts_hdr->flags & RXR_CREDIT_REQUEST) - rx_entry->credit_request = rts_hdr->credit_request; - else - rx_entry->credit_request = rxr_env.tx_min_credits; - - ret = rxr_ep_post_cts_or_queue(ep, rx_entry, bytes_left); - if (pkt_entry->type == RXR_PKT_ENTRY_POSTED) - ep->rx_bufs_to_post++; - rxr_release_rx_pkt_entry(ep, pkt_entry); + if (rx_entry->fi_flags & FI_MULTI_RECV) + rxr_msg_multi_recv_handle_completion(ep, rx_entry); - return ret; + rxr_cq_write_rx_completion(ep, rx_entry); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; } -static int rxr_cq_reorder_msg(struct rxr_ep *ep, - struct rxr_peer *peer, - struct rxr_pkt_entry *pkt_entry) +int rxr_cq_reorder_msg(struct rxr_ep *ep, + struct rxr_peer *peer, + struct rxr_pkt_entry *pkt_entry) { - struct rxr_rts_hdr *rts_hdr; struct rxr_pkt_entry *ooo_entry; + struct rxr_pkt_entry *cur_ooo_entry; + uint32_t msg_id; - rts_hdr = rxr_get_rts_hdr(pkt_entry->pkt); + assert(rxr_get_base_hdr(pkt_entry->pkt)->type >= RXR_REQ_PKT_BEGIN); + msg_id = rxr_pkt_msg_id(pkt_entry); /* * TODO: Initialize peer state at the time of AV insertion * where duplicate detection is available. 
*/ if (!peer->rx_init) - rxr_ep_peer_init(ep, peer); + rxr_ep_peer_init_rx(ep, peer); #if ENABLE_DEBUG - if (rts_hdr->msg_id != ofi_recvwin_next_exp_id(peer->robuf)) + if (msg_id != ofi_recvwin_next_exp_id(peer->robuf)) FI_DBG(&rxr_prov, FI_LOG_EP_CTRL, - "msg OOO rts_hdr->msg_id: %" PRIu32 " expected: %" - PRIu64 "\n", rts_hdr->msg_id, + "msg OOO msg_id: %" PRIu32 " expected: %" + PRIu32 "\n", msg_id, ofi_recvwin_next_exp_id(peer->robuf)); #endif - if (ofi_recvwin_is_exp(peer->robuf, rts_hdr->msg_id)) + if (ofi_recvwin_is_exp(peer->robuf, msg_id)) return 0; - else if (ofi_recvwin_is_delayed(peer->robuf, rts_hdr->msg_id)) + else if (!ofi_recvwin_id_valid(peer->robuf, msg_id)) return -FI_EALREADY; if (OFI_LIKELY(rxr_env.rx_copy_ooo)) { assert(pkt_entry->type == RXR_PKT_ENTRY_POSTED); - ooo_entry = rxr_get_pkt_entry(ep, ep->rx_ooo_pkt_pool); + ooo_entry = rxr_pkt_entry_clone(ep, ep->rx_ooo_pkt_pool, pkt_entry, RXR_PKT_ENTRY_OOO); if (OFI_UNLIKELY(!ooo_entry)) { FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "Unable to allocate rx_pkt_entry for OOO msg\n"); return -FI_ENOMEM; } - rxr_copy_pkt_entry(ep, ooo_entry, pkt_entry, RXR_PKT_ENTRY_OOO); - rts_hdr = rxr_get_rts_hdr(ooo_entry->pkt); - rxr_release_rx_pkt_entry(ep, pkt_entry); - ep->rx_bufs_to_post++; + rxr_pkt_entry_release_rx(ep, pkt_entry); } else { ooo_entry = pkt_entry; } - ofi_recvwin_queue_msg(peer->robuf, &ooo_entry, rts_hdr->msg_id); + cur_ooo_entry = *ofi_recvwin_get_msg(peer->robuf, msg_id); + if (cur_ooo_entry) { + assert(rxr_get_base_hdr(cur_ooo_entry->pkt)->type == RXR_MEDIUM_MSGRTM_PKT || + rxr_get_base_hdr(cur_ooo_entry->pkt)->type == RXR_MEDIUM_TAGRTM_PKT || + rxr_get_base_hdr(cur_ooo_entry->pkt)->type == RXR_DC_MEDIUM_MSGRTM_PKT || + rxr_get_base_hdr(cur_ooo_entry->pkt)->type == RXR_DC_MEDIUM_TAGRTM_PKT); + assert(rxr_pkt_msg_id(cur_ooo_entry) == msg_id); + assert(rxr_pkt_rtm_total_len(cur_ooo_entry) == rxr_pkt_rtm_total_len(ooo_entry)); + rxr_pkt_entry_append(cur_ooo_entry, ooo_entry); + } else { + ofi_recvwin_queue_msg(peer->robuf, &ooo_entry, msg_id); + } + return 1; } -static void rxr_cq_proc_pending_items_in_recvwin(struct rxr_ep *ep, - struct rxr_peer *peer) +void rxr_cq_proc_pending_items_in_recvwin(struct rxr_ep *ep, + struct rxr_peer *peer) { struct rxr_pkt_entry *pending_pkt; - struct rxr_rts_hdr *rts_hdr; int ret = 0; + uint32_t msg_id; while (1) { pending_pkt = *ofi_recvwin_peek(peer->robuf); if (!pending_pkt || !pending_pkt->pkt) return; - rts_hdr = rxr_get_rts_hdr(pending_pkt->pkt); - *ofi_recvwin_get_next_msg(peer->robuf) = NULL; - + msg_id = rxr_pkt_msg_id(pending_pkt); FI_DBG(&rxr_prov, FI_LOG_EP_CTRL, - "Processing msg_id %d from robuf\n", rts_hdr->msg_id); - - /* rxr_cq_process_rts will write error cq entry if needed */ - ret = rxr_cq_process_rts(ep, pending_pkt); + "Processing msg_id %d from robuf\n", msg_id); + /* rxr_pkt_proc_rtm_rta will write error cq entry if needed */ + ret = rxr_pkt_proc_rtm_rta(ep, pending_pkt); + *ofi_recvwin_get_next_msg(peer->robuf) = NULL; if (OFI_UNLIKELY(ret)) { FI_WARN(&rxr_prov, FI_LOG_CQ, "Error processing msg_id %d from robuf: %s\n", - rts_hdr->msg_id, fi_strerror(-ret)); + msg_id, fi_strerror(-ret)); return; } } return; } -static void rxr_cq_handle_rts(struct rxr_ep *ep, - struct fi_cq_msg_entry *comp, - struct rxr_pkt_entry *pkt_entry, - fi_addr_t src_addr) +/* Handle two scenarios: + * 1. RMA writes with immediate data at remote endpoint, + * 2. 
atomic completion on the requester + * write completion for both + */ +void rxr_cq_handle_shm_completion(struct rxr_ep *ep, struct fi_cq_data_entry *cq_entry, fi_addr_t src_addr) { - fi_addr_t rdm_addr; - struct rxr_rts_hdr *rts_hdr; - struct rxr_av *av; - struct rxr_peer *peer; - void *raw_address; - int i, ret; - - rts_hdr = rxr_get_rts_hdr(pkt_entry->pkt); - av = rxr_ep_av(ep); - - if (OFI_UNLIKELY(src_addr == FI_ADDR_NOTAVAIL)) { - assert(rts_hdr->flags & RXR_REMOTE_SRC_ADDR); - assert(rts_hdr->addrlen > 0); - if (rxr_get_base_hdr(pkt_entry->pkt)->version != - RXR_PROTOCOL_VERSION) { - char buffer[ep->core_addrlen * 3]; - int length = 0; - - for (i = 0; i < ep->core_addrlen; i++) - length += sprintf(&buffer[length], "%02x ", - ep->core_addr[i]); - FI_WARN(&rxr_prov, FI_LOG_CQ, - "Host %s:Invalid protocol version %d. Expected protocol version %d.\n", - buffer, - rxr_get_base_hdr(pkt_entry->pkt)->version, - RXR_PROTOCOL_VERSION); - rxr_eq_write_error(ep, FI_EIO, -FI_EINVAL); - fprintf(stderr, "Invalid protocol version %d. Expected protocol version %d. %s:%d\n", - rxr_get_base_hdr(pkt_entry->pkt)->version, - RXR_PROTOCOL_VERSION, __FILE__, __LINE__); - abort(); - } - raw_address = (rts_hdr->flags & RXR_REMOTE_CQ_DATA) ? - rxr_get_ctrl_cq_pkt(rts_hdr)->data - : rxr_get_ctrl_pkt(rts_hdr)->data; - - ret = rxr_av_insert_rdm_addr(av, - (void *)raw_address, - &rdm_addr, 0, NULL); - if (OFI_UNLIKELY(ret != 1)) { - rxr_eq_write_error(ep, FI_EINVAL, ret); - return; - } + struct util_cq *target_cq; + int ret; - pkt_entry->addr = rdm_addr; + if (cq_entry->flags & FI_ATOMIC) { + target_cq = ep->util_ep.tx_cq; } else { - pkt_entry->addr = src_addr; - } - - peer = rxr_ep_get_peer(ep, pkt_entry->addr); - assert(peer); - - if (ep->core_caps & FI_SOURCE) - rxr_cq_post_connack(ep, peer, pkt_entry->addr); - - if (rxr_need_sas_ordering(ep)) { - ret = rxr_cq_reorder_msg(ep, peer, pkt_entry); - if (ret == 1) { - /* Packet was queued */ - return; - } else if (OFI_UNLIKELY(ret == -FI_EALREADY)) { - FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, - "Duplicate RTS packet msg_id: %" PRIu32 - " next_msg_id: %" PRIu32 "\n", - rts_hdr->msg_id, peer->next_msg_id); - if (!rts_hdr->addrlen) - rxr_eq_write_error(ep, FI_EIO, ret); - rxr_release_rx_pkt_entry(ep, pkt_entry); - ep->rx_bufs_to_post++; - return; - } else if (OFI_UNLIKELY(ret == -FI_ENOMEM)) { - rxr_eq_write_error(ep, FI_ENOBUFS, -FI_ENOBUFS); - return; - } else if (OFI_UNLIKELY(ret < 0)) { - FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, - "Unknown error %d processing RTS packet msg_id: %" - PRIu32 "\n", ret, rts_hdr->msg_id); - rxr_eq_write_error(ep, FI_EIO, ret); - return; - } - - /* processing the expected packet */ - ofi_recvwin_slide(peer->robuf); - } - - /* rxr_cq_process_rts will write error cq entry if needed */ - ret = rxr_cq_process_rts(ep, pkt_entry); - if (OFI_UNLIKELY(ret)) - return; - - /* process pending items in reorder buff */ - if (rxr_need_sas_ordering(ep)) - rxr_cq_proc_pending_items_in_recvwin(ep, peer); - - return; -} - -static void rxr_cq_handle_connack(struct rxr_ep *ep, - struct fi_cq_msg_entry *comp, - struct rxr_pkt_entry *pkt_entry, - fi_addr_t src_addr) -{ - struct rxr_peer *peer; - - /* - * We don't really need any information from the actual connack packet - * itself, just the src_addr from the CQE - */ - assert(src_addr != FI_ADDR_NOTAVAIL); - peer = rxr_ep_get_peer(ep, src_addr); - peer->state = RXR_PEER_ACKED; - FI_DBG(&rxr_prov, FI_LOG_CQ, - "CONNACK received from %" PRIu64 "\n", src_addr); - rxr_release_rx_pkt_entry(ep, pkt_entry); - 
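/*
 * Hedged application-level sketch of how scenario 1 above is produced:
 * an RMA write carrying immediate data (fi_writedata) completes at the
 * target as an rx CQ entry flagged FI_REMOTE_WRITE | FI_REMOTE_CQ_DATA,
 * which is what this handler forwards.  `ep`, `desc`, `dest_addr`,
 * `remote_addr` and `rkey` are assumed to have been set up already.
 */
#include <rdma/fabric.h>
#include <rdma/fi_rma.h>

static ssize_t write_with_imm_example(struct fid_ep *ep, const void *buf,
				      size_t len, void *desc,
				      fi_addr_t dest_addr,
				      uint64_t remote_addr, uint64_t rkey)
{
	/* 0xCAFE is delivered to the target in fi_cq_data_entry.data */
	return fi_writedata(ep, buf, len, desc, 0xCAFE,
			    dest_addr, remote_addr, rkey, NULL);
}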
ep->rx_bufs_to_post++; -} - -void rxr_cq_handle_pkt_with_data(struct rxr_ep *ep, - struct rxr_rx_entry *rx_entry, - struct fi_cq_msg_entry *comp, - struct rxr_pkt_entry *pkt_entry, - char *data, size_t seg_offset, - size_t seg_size) -{ - struct rxr_peer *peer; - uint64_t bytes; - ssize_t ret; - - peer = rxr_ep_get_peer(ep, rx_entry->addr); - peer->rx_credits += ofi_div_ceil(seg_size, ep->max_data_payload_size); - rx_entry->window -= seg_size; - - if (ep->available_data_bufs < rxr_get_rx_pool_chunk_cnt(ep)) - ep->available_data_bufs++; - - bytes = rx_entry->total_len - rx_entry->bytes_done - - seg_size; + assert(cq_entry->flags & FI_REMOTE_CQ_DATA); + target_cq = ep->util_ep.rx_cq; + } + + if (ep->util_ep.caps & FI_SOURCE) + ret = ofi_cq_write_src(target_cq, + cq_entry->op_context, + cq_entry->flags, + cq_entry->len, + cq_entry->buf, + cq_entry->data, + 0, + src_addr); + else + ret = ofi_cq_write(target_cq, + cq_entry->op_context, + cq_entry->flags, + cq_entry->len, + cq_entry->buf, + cq_entry->data, + 0); - if (!rx_entry->window && bytes > 0) - rxr_ep_post_cts_or_queue(ep, rx_entry, bytes); + rxr_rm_rx_cq_check(ep, target_cq); - /* we are sinking message for CANCEL/DISCARD entry */ - if (OFI_LIKELY(!(rx_entry->rxr_flags & RXR_RECV_CANCEL))) { - ofi_copy_to_iov(rx_entry->iov, rx_entry->iov_count, - seg_offset, data, seg_size); + if (OFI_UNLIKELY(ret)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "Unable to write a cq entry for shm operation: %s\n", + fi_strerror(-ret)); + efa_eq_write_error(&ep->util_ep, FI_EIO, ret); } - rx_entry->bytes_done += seg_size; - if (rx_entry->total_len == rx_entry->bytes_done) { -#if ENABLE_DEBUG - dlist_remove(&rx_entry->rx_pending_entry); - ep->rx_pending--; -#endif - ret = rxr_cq_handle_rx_completion(ep, comp, - pkt_entry, rx_entry); - - rxr_multi_recv_free_posted_entry(ep, rx_entry); - if (OFI_LIKELY(!ret)) - rxr_release_rx_entry(ep, rx_entry); - return; + if (cq_entry->flags & FI_ATOMIC) { + efa_cntr_report_tx_completion(&ep->util_ep, cq_entry->flags); + } else { + assert(cq_entry->flags & FI_REMOTE_CQ_DATA); + efa_cntr_report_rx_completion(&ep->util_ep, cq_entry->flags); } - - rxr_release_rx_pkt_entry(ep, pkt_entry); - ep->rx_bufs_to_post++; } -static void rxr_cq_handle_readrsp(struct rxr_ep *ep, - struct fi_cq_msg_entry *comp, - struct rxr_pkt_entry *pkt_entry) -{ - struct rxr_readrsp_pkt *readrsp_pkt = NULL; - struct rxr_readrsp_hdr *readrsp_hdr = NULL; - struct rxr_rx_entry *rx_entry = NULL; - - readrsp_pkt = (struct rxr_readrsp_pkt *)pkt_entry->pkt; - readrsp_hdr = &readrsp_pkt->hdr; - rx_entry = ofi_bufpool_get_ibuf(ep->rx_entry_pool, readrsp_hdr->rx_id); - assert(rx_entry->cq_entry.flags & FI_READ); - rx_entry->tx_id = readrsp_hdr->tx_id; - rxr_cq_handle_pkt_with_data(ep, rx_entry, comp, pkt_entry, - readrsp_pkt->data, 0, readrsp_hdr->seg_size); -} +static inline +bool rxr_cq_need_tx_completion(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry) -static void rxr_cq_handle_cts(struct rxr_ep *ep, - struct fi_cq_msg_entry *comp, - struct rxr_pkt_entry *pkt_entry) { - struct rxr_peer *peer; - struct rxr_cts_hdr *cts_pkt; - struct rxr_tx_entry *tx_entry; - - cts_pkt = (struct rxr_cts_hdr *)pkt_entry->pkt; - if (cts_pkt->flags & RXR_READ_REQ) - tx_entry = ofi_bufpool_get_ibuf(ep->readrsp_tx_entry_pool, cts_pkt->tx_id); - else - tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool, cts_pkt->tx_id); + if (tx_entry->fi_flags & RXR_NO_COMPLETION) + return false; - tx_entry->rx_id = cts_pkt->rx_id; - tx_entry->window = cts_pkt->window; - - /* Return any excess tx_credits 
that were borrowed for the request */ - peer = rxr_ep_get_peer(ep, tx_entry->addr); - tx_entry->credit_allocated = ofi_div_ceil(cts_pkt->window, ep->max_data_payload_size); - if (tx_entry->credit_allocated < tx_entry->credit_request) - peer->tx_credits += tx_entry->credit_request - tx_entry->credit_allocated; - - rxr_release_rx_pkt_entry(ep, pkt_entry); - ep->rx_bufs_to_post++; - - if (tx_entry->state != RXR_TX_SEND) { - tx_entry->state = RXR_TX_SEND; - dlist_insert_tail(&tx_entry->entry, &ep->tx_pending_list); - } - return; + /* + * ep->util_ep.tx_msg_flags is either 0 or FI_COMPLETION, depend on + * whether app specfied FI_SELECTIVE_COMPLETION when binding CQ. + * (ep->util_ep.tx_msg_flags was set in ofi_ep_bind_cq()) + * + * If tx_msg_flags is 0, we only write completion when app specify + * FI_COMPLETION in flags. + */ + return ep->util_ep.tx_msg_flags == FI_COMPLETION || + tx_entry->fi_flags & FI_COMPLETION; } -static void rxr_cq_handle_data(struct rxr_ep *ep, - struct fi_cq_msg_entry *comp, - struct rxr_pkt_entry *pkt_entry) -{ - struct rxr_data_pkt *data_pkt; - struct rxr_rx_entry *rx_entry; - data_pkt = (struct rxr_data_pkt *)pkt_entry->pkt; - - rx_entry = ofi_bufpool_get_ibuf(ep->rx_entry_pool, - data_pkt->hdr.rx_id); - - rxr_cq_handle_pkt_with_data(ep, rx_entry, - comp, pkt_entry, - data_pkt->data, - data_pkt->hdr.seg_offset, - data_pkt->hdr.seg_size); -} void rxr_cq_write_tx_completion(struct rxr_ep *ep, - struct fi_cq_msg_entry *comp, struct rxr_tx_entry *tx_entry) { struct util_cq *tx_cq = ep->util_ep.tx_cq; int ret; - if (!(tx_entry->fi_flags & RXR_NO_COMPLETION) && - ofi_need_completion(rxr_tx_flags(ep), tx_entry->fi_flags)) { + if (rxr_cq_need_tx_completion(ep, tx_entry)) { FI_DBG(&rxr_prov, FI_LOG_CQ, "Writing send completion for tx_entry to peer: %" PRIu64 " tx_id: %" PRIu32 " msg_id: %" PRIu32 " tag: %lx len: %" @@ -1297,183 +797,47 @@ void rxr_cq_write_tx_completion(struct rxr_ep *ep, } } - rxr_cntr_report_tx_completion(ep, tx_entry); + efa_cntr_report_tx_completion(&ep->util_ep, tx_entry->cq_entry.flags); rxr_release_tx_entry(ep, tx_entry); return; } -void rxr_cq_handle_pkt_recv_completion(struct rxr_ep *ep, - struct fi_cq_msg_entry *cq_entry, - fi_addr_t src_addr) -{ - struct rxr_pkt_entry *pkt_entry; - - pkt_entry = (struct rxr_pkt_entry *)cq_entry->op_context; - ep->posted_bufs--; - - assert(rxr_get_base_hdr(pkt_entry->pkt)->version == - RXR_PROTOCOL_VERSION); - -#if ENABLE_DEBUG - dlist_remove(&pkt_entry->dbg_entry); - dlist_insert_tail(&pkt_entry->dbg_entry, &ep->rx_pkt_list); -#ifdef ENABLE_RXR_PKT_DUMP - rxr_ep_print_pkt("Received", ep, (struct rxr_base_hdr *)pkt_entry->pkt); -#endif -#endif - - switch (rxr_get_base_hdr(pkt_entry->pkt)->type) { - case RXR_RTS_PKT: - rxr_cq_handle_rts(ep, cq_entry, pkt_entry, src_addr); - return; - case RXR_CONNACK_PKT: - rxr_cq_handle_connack(ep, cq_entry, pkt_entry, src_addr); - return; - case RXR_CTS_PKT: - rxr_cq_handle_cts(ep, cq_entry, pkt_entry); - return; - case RXR_DATA_PKT: - rxr_cq_handle_data(ep, cq_entry, pkt_entry); - return; - case RXR_READRSP_PKT: - rxr_cq_handle_readrsp(ep, cq_entry, pkt_entry); - return; - default: - FI_WARN(&rxr_prov, FI_LOG_CQ, - "invalid control pkt type %d\n", - rxr_get_base_hdr(pkt_entry->pkt)->type); - assert(0 && "invalid control pkt type"); - rxr_cq_handle_cq_error(ep, -FI_EIO); - return; - } - return; -} - -static int rxr_send_completion_mr_dereg(struct rxr_tx_entry *tx_entry) -{ - int i, ret = 0; - - for (i = tx_entry->iov_mr_start; i < tx_entry->iov_count; i++) { - if 
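/*
 * Hedged sketch of the application-side behavior described in the
 * tx_msg_flags comment above: with a transmit CQ bound using
 * FI_SELECTIVE_COMPLETION, only operations posted with FI_COMPLETION
 * generate success entries.  `ep`, `cq` and `msg` are assumed to be set
 * up elsewhere; the bind is normally done once at initialization,
 * before fi_enable().
 */
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

static int selective_completion_example(struct fid_ep *ep, struct fid_cq *cq,
					const struct fi_msg *msg)
{
	int ret;

	ret = fi_ep_bind(ep, &cq->fid, FI_TRANSMIT | FI_SELECTIVE_COMPLETION);
	if (ret)
		return ret;

	/* Posted without FI_COMPLETION: no success CQ entry is written. */
	ret = (int)fi_sendmsg(ep, msg, 0);
	if (ret)
		return ret;

	/* Posted with FI_COMPLETION: a success CQ entry is written. */
	return (int)fi_sendmsg(ep, msg, FI_COMPLETION);
}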
(tx_entry->mr[i]) { - ret = fi_close((struct fid *)tx_entry->mr[i]); - if (OFI_UNLIKELY(ret)) - return ret; - } - } - return ret; -} - -void rxr_cq_handle_pkt_send_completion(struct rxr_ep *ep, struct fi_cq_msg_entry *comp) +void rxr_cq_handle_tx_completion(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry) { - struct rxr_pkt_entry *pkt_entry; - struct rxr_tx_entry *tx_entry = NULL; struct rxr_peer *peer; - struct rxr_rts_hdr *rts_hdr = NULL; - struct rxr_readrsp_hdr *readrsp_hdr = NULL; - uint32_t tx_id; - int ret; - pkt_entry = (struct rxr_pkt_entry *)comp->op_context; - assert(rxr_get_base_hdr(pkt_entry->pkt)->version == - RXR_PROTOCOL_VERSION); - peer = rxr_ep_get_peer(ep, pkt_entry->addr); + if (tx_entry->state == RXR_TX_SEND) + dlist_remove(&tx_entry->entry); - switch (rxr_get_base_hdr(pkt_entry->pkt)->type) { - case RXR_RTS_PKT: + peer = rxr_ep_get_peer(ep, tx_entry->addr); + peer->tx_credits += tx_entry->credit_allocated; + + if (tx_entry->cq_entry.flags & FI_READ) { /* - * for FI_READ, it is possible (though does not happen very offen) that at the point - * tx_entry has been released. The reason is, for FI_READ: - * 1. only the initator side will send a RTS. - * 2. the initator side will receive data packet. When all data was received, - * it will release the tx_entry - * Therefore, if it so happens that all data was received before we got the send - * completion notice, we will have a released tx_entry at this point. - * Nonetheless, because for FI_READ tx_entry will be release in rxr_handle_rx_completion, - * we will ignore it here. + * this must be on remote side + * see explaination on rxr_cq_handle_rx_completion */ - rts_hdr = rxr_get_rts_hdr(pkt_entry->pkt); - if (!(rts_hdr->flags & RXR_READ_REQ)) { - tx_id = rts_hdr->tx_id; - tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool, tx_id); - tx_entry->bytes_acked += rxr_get_rts_data_size(ep, rts_hdr); - } - break; - case RXR_CONNACK_PKT: - break; - case RXR_CTS_PKT: - break; - case RXR_DATA_PKT: - tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; - tx_entry->bytes_acked += - rxr_get_data_pkt(pkt_entry->pkt)->hdr.seg_size; - break; - case RXR_READRSP_PKT: - readrsp_hdr = rxr_get_readrsp_hdr(pkt_entry->pkt); - tx_id = readrsp_hdr->tx_id; - tx_entry = ofi_bufpool_get_ibuf(ep->readrsp_tx_entry_pool, tx_id); - assert(tx_entry->cq_entry.flags & FI_READ); - tx_entry->bytes_acked += readrsp_hdr->seg_size; - break; - default: - FI_WARN(&rxr_prov, FI_LOG_CQ, - "invalid control pkt type %d\n", - rxr_get_base_hdr(pkt_entry->pkt)->type); - assert(0 && "invalid control pkt type"); - rxr_cq_handle_cq_error(ep, -FI_EIO); - return; - } - - if (tx_entry && tx_entry->total_len == tx_entry->bytes_acked) { - if (tx_entry->state == RXR_TX_SEND) - dlist_remove(&tx_entry->entry); - if (tx_entry->state == RXR_TX_SEND && - efa_mr_cache_enable && rxr_ep_mr_local(ep)) { - ret = rxr_send_completion_mr_dereg(tx_entry); - if (OFI_UNLIKELY(ret)) { - FI_WARN(&rxr_prov, FI_LOG_MR, - "In-line memory deregistration failed with error: %s.\n", - fi_strerror(-ret)); - } - } + struct rxr_rx_entry *rx_entry = NULL; - peer->tx_credits += tx_entry->credit_allocated; + rx_entry = ofi_bufpool_get_ibuf(ep->rx_entry_pool, tx_entry->rma_loc_rx_id); + assert(rx_entry); + assert(rx_entry->state == RXR_RX_WAIT_READ_FINISH); - if (tx_entry->cq_entry.flags & FI_READ) { - /* - * this must be on remote side - * see explaination on rxr_cq_handle_rx_completion - */ - struct rxr_rx_entry *rx_entry = NULL; - - rx_entry = ofi_bufpool_get_ibuf(ep->rx_entry_pool, 
tx_entry->rma_loc_rx_id); - assert(rx_entry); - assert(rx_entry->state == RXR_RX_WAIT_READ_FINISH); - - if (ep->util_ep.caps & FI_RMA_EVENT) { - rx_entry->cq_entry.len = rx_entry->total_len; - rx_entry->bytes_done = rx_entry->total_len; - rxr_cntr_report_rx_completion(ep, rx_entry); - } - - rxr_release_rx_entry(ep, rx_entry); - /* just release tx, do not write completion */ - rxr_release_tx_entry(ep, tx_entry); - } else if (tx_entry->cq_entry.flags & FI_WRITE) { - if (tx_entry->fi_flags & FI_COMPLETION) { - rxr_cq_write_tx_completion(ep, comp, tx_entry); - } else { - rxr_cntr_report_tx_completion(ep, tx_entry); - rxr_release_tx_entry(ep, tx_entry); - } + rxr_release_rx_entry(ep, rx_entry); + /* just release tx, do not write completion */ + rxr_release_tx_entry(ep, tx_entry); + } else if (tx_entry->cq_entry.flags & FI_WRITE) { + if (tx_entry->fi_flags & FI_COMPLETION) { + rxr_cq_write_tx_completion(ep, tx_entry); } else { - assert(tx_entry->cq_entry.flags & FI_SEND); - rxr_cq_write_tx_completion(ep, comp, tx_entry); + if (!(tx_entry->fi_flags & RXR_NO_COUNTER)) + efa_cntr_report_tx_completion(&ep->util_ep, tx_entry->cq_entry.flags); + rxr_release_tx_entry(ep, tx_entry); } + } else { + assert(tx_entry->cq_entry.flags & FI_SEND); + rxr_cq_write_tx_completion(ep, tx_entry); } - - rxr_release_tx_pkt_entry(ep, pkt_entry); - rxr_ep_dec_tx_pending(ep, peer, 0); - return; } static int rxr_cq_close(struct fid *fid) diff --git a/prov/efa/src/rxr/rxr_domain.c b/prov/efa/src/rxr/rxr_domain.c index ce91c918add..9624dc14ca2 100644 --- a/prov/efa/src/rxr/rxr_domain.c +++ b/prov/efa/src/rxr/rxr_domain.c @@ -38,20 +38,23 @@ #include #include +#include "efa.h" #include "rxr.h" #include "rxr_cntr.h" +#include "rxr_atomic.h" static struct fi_ops_domain rxr_domain_ops = { .size = sizeof(struct fi_ops_domain), - .av_open = rxr_av_open, + .av_open = efa_av_open, .cq_open = rxr_cq_open, .endpoint = rxr_endpoint, .scalable_ep = fi_no_scalable_ep, - .cntr_open = rxr_cntr_open, + .cntr_open = efa_cntr_open, .poll_open = fi_poll_create, .stx_ctx = fi_no_stx_context, .srx_ctx = fi_no_srx_context, - .query_atomic = fi_no_query_atomic, + .query_atomic = rxr_query_atomic, + .query_collective = fi_no_query_collective, }; static int rxr_domain_close(fid_t fid) @@ -61,7 +64,6 @@ static int rxr_domain_close(fid_t fid) rxr_domain = container_of(fid, struct rxr_domain, util_domain.domain_fid.fid); - ret = fi_close(&rxr_domain->rdm_domain->fid); if (ret) return ret; @@ -82,92 +84,31 @@ static struct fi_ops rxr_domain_fi_ops = { .ops_open = fi_no_ops_open, }; -static int rxr_mr_close(fid_t fid) -{ - struct rxr_domain *rxr_domain; - struct rxr_mr *rxr_mr; - int ret; - - rxr_mr = container_of(fid, struct rxr_mr, mr_fid.fid); - rxr_domain = rxr_mr->domain; - - ret = ofi_mr_map_remove(&rxr_domain->util_domain.mr_map, - rxr_mr->mr_fid.key); - if (ret) - FI_WARN(&rxr_prov, FI_LOG_MR, - "Unable to remove MR entry from util map (%s)\n", - fi_strerror(-ret)); - - ret = fi_close(&rxr_mr->msg_mr->fid); - if (ret) - FI_WARN(&rxr_prov, FI_LOG_MR, - "Unable to close MR\n"); - free(rxr_mr); - return ret; -} - -static struct fi_ops rxr_mr_ops = { - .size = sizeof(struct fi_ops), - .close = rxr_mr_close, - .bind = fi_no_bind, - .control = fi_no_control, - .ops_open = fi_no_ops_open, -}; - +/* + * The mr key generated in lower EFA registration will be used in SHM + * registration and mr_map in an unified way + */ int rxr_mr_regattr(struct fid *domain_fid, const struct fi_mr_attr *attr, uint64_t flags, struct fid_mr **mr) { struct rxr_domain 
*rxr_domain; - struct fi_mr_attr *core_attr; - struct rxr_mr *rxr_mr; - int ret; + int ret = 0; rxr_domain = container_of(domain_fid, struct rxr_domain, util_domain.domain_fid.fid); - rxr_mr = calloc(1, sizeof(*rxr_mr)); - if (!rxr_mr) - return -FI_ENOMEM; - - /* discard const qualifier to override access registered with EFA */ - core_attr = (struct fi_mr_attr *)attr; - core_attr->access = FI_SEND | FI_RECV; + if (attr->iface == FI_HMEM_CUDA) + flags |= OFI_MR_NOCACHE; - ret = fi_mr_regattr(rxr_domain->rdm_domain, core_attr, flags, - &rxr_mr->msg_mr); + ret = fi_mr_regattr(rxr_domain->rdm_domain, attr, flags, mr); if (ret) { FI_WARN(&rxr_prov, FI_LOG_MR, "Unable to register MR buf (%s): %p len: %zu\n", fi_strerror(-ret), attr->mr_iov->iov_base, attr->mr_iov->iov_len); - goto err; } - - rxr_mr->mr_fid.fid.fclass = FI_CLASS_MR; - rxr_mr->mr_fid.fid.context = attr->context; - rxr_mr->mr_fid.fid.ops = &rxr_mr_ops; - rxr_mr->mr_fid.mem_desc = rxr_mr->msg_mr; - rxr_mr->mr_fid.key = fi_mr_key(rxr_mr->msg_mr); - rxr_mr->domain = rxr_domain; - *mr = &rxr_mr->mr_fid; - - assert(rxr_mr->mr_fid.key != FI_KEY_NOTAVAIL); - ret = ofi_mr_map_insert(&rxr_domain->util_domain.mr_map, attr, - &rxr_mr->mr_fid.key, mr); - if (ret) { - FI_WARN(&rxr_prov, FI_LOG_MR, - "Unable to add MR to map buf (%s): %p len: %zu\n", - fi_strerror(-ret), attr->mr_iov->iov_base, - attr->mr_iov->iov_len); - goto err; - } - - return 0; -err: - free(rxr_mr); return ret; } - int rxr_mr_regv(struct fid *domain_fid, const struct iovec *iov, size_t count, uint64_t access, uint64_t offset, uint64_t requested_key, uint64_t flags, @@ -181,6 +122,7 @@ int rxr_mr_regv(struct fid *domain_fid, const struct iovec *iov, attr.offset = offset; attr.requested_key = requested_key; attr.context = context; + attr.iface = FI_HMEM_SYSTEM; return rxr_mr_regattr(domain_fid, &attr, flags, mr_fid); } @@ -210,6 +152,7 @@ int rxr_domain_open(struct fid_fabric *fabric, struct fi_info *info, int ret, retv; struct fi_info *rdm_info; struct rxr_domain *rxr_domain; + struct efa_domain *efa_domain; struct rxr_fabric *rxr_fabric; rxr_fabric = container_of(fabric, struct rxr_fabric, @@ -237,6 +180,9 @@ int rxr_domain_open(struct fid_fabric *fabric, struct fi_info *info, if (!rxr_domain) return -FI_ENOMEM; + rxr_domain->rxr_mr_local = ofi_mr_local(info); + rxr_domain->type = EFA_DOMAIN_RDM; + ret = rxr_get_lower_rdm_info(fabric->api_version, NULL, NULL, 0, &rxr_util_prov, info, &rdm_info); if (ret) @@ -247,29 +193,37 @@ int rxr_domain_open(struct fid_fabric *fabric, struct fi_info *info, if (ret) goto err_free_core_info; + efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain, + util_domain.domain_fid); + + /* Open shm provider's access domain */ + if (rxr_env.enable_shm_transfer) { + assert(!strcmp(shm_info->fabric_attr->name, "shm")); + ret = fi_domain(rxr_fabric->shm_fabric, shm_info, + &efa_domain->shm_domain, context); + if (ret) + goto err_close_core_domain; + } + rxr_domain->rdm_mode = rdm_info->mode; + rxr_domain->mtu_size = rdm_info->ep_attr->max_msg_size; rxr_domain->addrlen = (info->src_addr) ? 
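/*
 * Hedged sketch of the kind of registration request rxr_mr_regattr()
 * receives for device memory: the application fills fi_mr_attr with
 * iface = FI_HMEM_CUDA, and the provider forwards the attributes to the
 * lower EFA domain (disabling MR caching for such buffers, as above).
 * `domain`, `cuda_buf` and `len` are assumed to exist already.
 */
#include <sys/uio.h>
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

static int register_cuda_buf_example(struct fid_domain *domain,
				     void *cuda_buf, size_t len,
				     struct fid_mr **mr)
{
	struct iovec iov = {
		.iov_base = cuda_buf,
		.iov_len  = len,
	};
	struct fi_mr_attr attr = {
		.mr_iov    = &iov,
		.iov_count = 1,
		.access    = FI_SEND | FI_RECV,
		.iface     = FI_HMEM_CUDA,	/* buffer resides in GPU memory */
	};

	return fi_mr_regattr(domain, &attr, 0, mr);
}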
info->src_addrlen : info->dest_addrlen; rxr_domain->cq_size = MAX(info->rx_attr->size + info->tx_attr->size, rxr_env.cq_size); - rxr_domain->mr_local = ofi_mr_local(rdm_info); - rxr_domain->resource_mgmt = rdm_info->domain_attr->resource_mgmt; ret = ofi_domain_init(fabric, info, &rxr_domain->util_domain, context); if (ret) - goto err_close_core_domain; + goto err_close_shm_domain; rxr_domain->do_progress = 0; /* - * ofi_domain_init() would have stored the RxR mr_modes in the map, but - * we need the rbtree insertions and lookups to use the lower-provider - * specific key, since the latter can not support application keys - * (FI_MR_PROV_KEY only). Storing the lower provider's mode in the map - * instead. + * ofi_domain_init() would have stored the RxR mr_modes in the mr_map, but + * we need the rbtree insertions and lookups to use EFA provider's + * specific key, so unset the FI_MR_PROV_KEY bit for mr_map. */ - rxr_domain->util_domain.mr_map.mode |= - OFI_MR_BASIC_MAP | FI_MR_LOCAL | FI_MR_BASIC; + rxr_domain->util_domain.mr_map.mode &= ~FI_MR_PROV_KEY; *domain = &rxr_domain->util_domain.domain_fid; (*domain)->fid.ops = &rxr_domain_fi_ops; @@ -278,6 +232,13 @@ int rxr_domain_open(struct fid_fabric *fabric, struct fi_info *info, fi_freeinfo(rdm_info); return 0; +err_close_shm_domain: + if (rxr_env.enable_shm_transfer) { + retv = fi_close(&efa_domain->shm_domain->fid); + if (retv) + FI_WARN(&rxr_prov, FI_LOG_DOMAIN, + "Unable to close shm domain: %s\n", fi_strerror(-retv)); + } err_close_core_domain: retv = fi_close(&rxr_domain->rdm_domain->fid); if (retv) diff --git a/prov/efa/src/rxr/rxr_ep.c b/prov/efa/src/rxr/rxr_ep.c index 3e42d0a5494..4bab034c627 100644 --- a/prov/efa/src/rxr/rxr_ep.c +++ b/prov/efa/src/rxr/rxr_ep.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. * All rights reserved. 
* * This software is available to you under a choice of one of two @@ -40,173 +40,81 @@ #include "rxr.h" #include "efa.h" +#include "rxr_msg.h" #include "rxr_rma.h" +#include "rxr_pkt_cmd.h" +#include "rxr_read.h" +#include "rxr_atomic.h" -#define RXR_PKT_DUMP_DATA_LEN 64 - -struct rxr_match_info { - fi_addr_t addr; - uint64_t tag; - uint64_t ignore; -}; - -static void rxr_ep_progress_internal(struct rxr_ep *ep); - -#if ENABLE_DEBUG -static void rxr_ep_print_rts_pkt(struct rxr_ep *ep, - char *prefix, struct rxr_rts_hdr *rts_hdr) +struct efa_ep_addr *rxr_ep_raw_addr(struct rxr_ep *ep) { - char str[RXR_PKT_DUMP_DATA_LEN * 4]; - size_t str_len = RXR_PKT_DUMP_DATA_LEN * 4, l; - uint8_t *src; - uint8_t *data; - int i; - - str[str_len - 1] = '\0'; - - FI_DBG(&rxr_prov, FI_LOG_EP_DATA, - "%s RxR RTS packet - version: %" PRIu8 - " flags: %" PRIu16 - " tx_id: %" PRIu32 - " msg_id: %" PRIu32 - " tag: %lx data_len: %" PRIu64 "\n", - prefix, rts_hdr->version, rts_hdr->flags, rts_hdr->tx_id, - rts_hdr->msg_id, rts_hdr->tag, rts_hdr->data_len); - - if ((rts_hdr->flags & RXR_REMOTE_CQ_DATA) && - (rts_hdr->flags & RXR_REMOTE_SRC_ADDR)) { - src = (uint8_t *)((struct rxr_ctrl_cq_pkt *)rts_hdr)->data; - data = src + rts_hdr->addrlen; - } else if (!(rts_hdr->flags & RXR_REMOTE_CQ_DATA) && - (rts_hdr->flags & RXR_REMOTE_SRC_ADDR)) { - src = (uint8_t *)((struct rxr_ctrl_pkt *)rts_hdr)->data; - data = src + rts_hdr->addrlen; - } else if ((rts_hdr->flags & RXR_REMOTE_CQ_DATA) && - !(rts_hdr->flags & RXR_REMOTE_SRC_ADDR)) { - data = (uint8_t *)((struct rxr_ctrl_cq_pkt *)rts_hdr)->data; - } else { - data = (uint8_t *)((struct rxr_ctrl_pkt *)rts_hdr)->data; - } - - if (rts_hdr->flags & RXR_REMOTE_CQ_DATA) - FI_DBG(&rxr_prov, FI_LOG_EP_DATA, - "\tcq_data: %08lx\n", - ((struct rxr_ctrl_cq_hdr *)rts_hdr)->cq_data); - - if (rts_hdr->flags & RXR_REMOTE_SRC_ADDR) { - l = snprintf(str, str_len, "\tsrc_addr: "); - for (i = 0; i < rts_hdr->addrlen; i++) - l += snprintf(str + l, str_len - l, "%02x ", src[i]); - FI_DBG(&rxr_prov, FI_LOG_EP_DATA, "%s\n", str); - } - - l = snprintf(str, str_len, ("\tdata: ")); - for (i = 0; i < MIN(rxr_get_rts_data_size(ep, rts_hdr), - RXR_PKT_DUMP_DATA_LEN); i++) - l += snprintf(str + l, str_len - l, "%02x ", data[i]); - FI_DBG(&rxr_prov, FI_LOG_EP_DATA, "%s\n", str); + return (struct efa_ep_addr *)ep->core_addr; } -static void rxr_ep_print_connack_pkt(char *prefix, - struct rxr_connack_hdr *connack_hdr) +const char *rxr_ep_raw_addr_str(struct rxr_ep *ep, char *buf, size_t *buflen) { - FI_DBG(&rxr_prov, FI_LOG_EP_DATA, - "%s RxR CONNACK packet - version: %" PRIu8 - " flags: %x\n", prefix, connack_hdr->version, - connack_hdr->flags); + return ofi_straddr(buf, buflen, FI_ADDR_EFA, rxr_ep_raw_addr(ep)); } -static void rxr_ep_print_cts_pkt(char *prefix, struct rxr_cts_hdr *cts_hdr) +struct efa_ep_addr *rxr_peer_raw_addr(struct rxr_ep *ep, fi_addr_t addr) { - FI_DBG(&rxr_prov, FI_LOG_EP_DATA, - "%s RxR CTS packet - version: %" PRIu8 - " flags: %x tx_id: %" PRIu32 - " rx_id: %" PRIu32 - " window: %" PRIu64 - "\n", prefix, cts_hdr->version, cts_hdr->flags, - cts_hdr->tx_id, cts_hdr->rx_id, cts_hdr->window); -} + struct efa_ep *efa_ep; + struct efa_av *efa_av; + struct efa_conn *efa_conn; -static void rxr_ep_print_data_pkt(char *prefix, struct rxr_data_pkt *data_pkt) -{ - char str[RXR_PKT_DUMP_DATA_LEN * 4]; - size_t str_len = RXR_PKT_DUMP_DATA_LEN * 4, l; - int i; + efa_ep = container_of(ep->rdm_ep, struct efa_ep, util_ep.ep_fid); + efa_av = efa_ep->av; + efa_conn = efa_av->conn_table[(int)addr]; - 
str[str_len - 1] = '\0'; - - FI_DBG(&rxr_prov, FI_LOG_EP_DATA, - "%s RxR DATA packet - version: %" PRIu8 - " flags: %x rx_id: %" PRIu32 - " seg_size: %" PRIu64 - " seg_offset: %" PRIu64 - "\n", prefix, data_pkt->hdr.version, data_pkt->hdr.flags, - data_pkt->hdr.rx_id, data_pkt->hdr.seg_size, - data_pkt->hdr.seg_offset); - - l = snprintf(str, str_len, ("\tdata: ")); - for (i = 0; i < MIN(data_pkt->hdr.seg_size, RXR_PKT_DUMP_DATA_LEN); - i++) - l += snprintf(str + l, str_len - l, "%02x ", - ((uint8_t *)data_pkt->data)[i]); - FI_DBG(&rxr_prov, FI_LOG_EP_DATA, "%s\n", str); + return &efa_conn->ep_addr; } -void rxr_ep_print_pkt(char *prefix, struct rxr_ep *ep, struct rxr_base_hdr *hdr) +const char *rxr_peer_raw_addr_str(struct rxr_ep *ep, fi_addr_t addr, char *buf, size_t *buflen) { - switch (hdr->type) { - case RXR_RTS_PKT: - rxr_ep_print_rts_pkt(ep, prefix, (struct rxr_rts_hdr *)hdr); - break; - case RXR_CONNACK_PKT: - rxr_ep_print_connack_pkt(prefix, (struct rxr_connack_hdr *)hdr); - break; - case RXR_CTS_PKT: - rxr_ep_print_cts_pkt(prefix, (struct rxr_cts_hdr *)hdr); - break; - case RXR_DATA_PKT: - rxr_ep_print_data_pkt(prefix, (struct rxr_data_pkt *)hdr); - break; - default: - FI_WARN(&rxr_prov, FI_LOG_CQ, "invalid ctl pkt type %d\n", - rxr_get_base_hdr(hdr)->type); - assert(0); - return; - } + return ofi_straddr(buf, buflen, FI_ADDR_EFA, rxr_peer_raw_addr(ep, addr)); } -#endif struct rxr_rx_entry *rxr_ep_rx_entry_init(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, - const struct iovec *iov, - size_t iov_count, uint64_t tag, - uint64_t ignore, void *context, - fi_addr_t addr, uint32_t op, + const struct fi_msg *msg, + uint64_t tag, + uint64_t ignore, + uint32_t op, uint64_t flags) { rx_entry->type = RXR_RX_ENTRY; rx_entry->rx_id = ofi_buf_index(rx_entry); - rx_entry->addr = addr; + rx_entry->addr = msg->addr; rx_entry->fi_flags = flags; rx_entry->rxr_flags = 0; - rx_entry->bytes_done = 0; + rx_entry->bytes_received = 0; + rx_entry->bytes_copied = 0; rx_entry->window = 0; - rx_entry->iov_count = iov_count; + rx_entry->iov_count = msg->iov_count; rx_entry->tag = tag; + rx_entry->op = op; rx_entry->ignore = ignore; - rx_entry->unexp_rts_pkt = NULL; + rx_entry->unexp_pkt = NULL; + rx_entry->rma_iov_count = 0; dlist_init(&rx_entry->queued_pkts); memset(&rx_entry->cq_entry, 0, sizeof(rx_entry->cq_entry)); + rx_entry->owner = ep->use_zcpy_rx ? 
RXR_RX_USER_BUF : RXR_RX_PROV_BUF; + /* Handle case where we're allocating an unexpected rx_entry */ - if (iov) { - memcpy(rx_entry->iov, iov, sizeof(*rx_entry->iov) * iov_count); - rx_entry->cq_entry.len = ofi_total_iov_len(iov, iov_count); - rx_entry->cq_entry.buf = iov[0].iov_base; + if (msg->msg_iov) { + memcpy(rx_entry->iov, msg->msg_iov, sizeof(*rx_entry->iov) * msg->iov_count); + rx_entry->cq_entry.len = ofi_total_iov_len(msg->msg_iov, msg->iov_count); + rx_entry->cq_entry.buf = msg->msg_iov[0].iov_base; } - rx_entry->cq_entry.op_context = context; + if (msg->desc) + memcpy(&rx_entry->desc[0], msg->desc, sizeof(*msg->desc) * msg->iov_count); + else + memset(&rx_entry->desc[0], 0, sizeof(rx_entry->desc)); + + rx_entry->cq_entry.op_context = msg->context; rx_entry->cq_entry.tag = 0; rx_entry->ignore = ~0; @@ -220,10 +128,17 @@ struct rxr_rx_entry *rxr_ep_rx_entry_init(struct rxr_ep *ep, rx_entry->cq_entry.flags = (FI_RECV | FI_MSG); break; case ofi_op_read_rsp: - rx_entry->cq_entry.flags = (FI_REMOTE_READ | FI_MSG); + rx_entry->cq_entry.flags = (FI_REMOTE_READ | FI_RMA); + break; + case ofi_op_write: + rx_entry->cq_entry.flags = (FI_REMOTE_WRITE | FI_RMA); break; - case ofi_op_write_async: - rx_entry->cq_entry.flags = (FI_REMOTE_WRITE | FI_MSG); + case ofi_op_atomic: + rx_entry->cq_entry.flags = (FI_REMOTE_WRITE | FI_ATOMIC); + break; + case ofi_op_atomic_fetch: + case ofi_op_atomic_compare: + rx_entry->cq_entry.flags = (FI_REMOTE_READ | FI_ATOMIC); break; default: FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, @@ -235,10 +150,10 @@ struct rxr_rx_entry *rxr_ep_rx_entry_init(struct rxr_ep *ep, } struct rxr_rx_entry *rxr_ep_get_rx_entry(struct rxr_ep *ep, - const struct iovec *iov, - size_t iov_count, uint64_t tag, - uint64_t ignore, void *context, - fi_addr_t addr, uint32_t op, + const struct fi_msg *msg, + uint64_t tag, + uint64_t ignore, + uint32_t op, uint64_t flags) { struct rxr_rx_entry *rx_entry; @@ -252,61 +167,67 @@ struct rxr_rx_entry *rxr_ep_get_rx_entry(struct rxr_ep *ep, #if ENABLE_DEBUG dlist_insert_tail(&rx_entry->rx_entry_entry, &ep->rx_entry_list); #endif - rx_entry = rxr_ep_rx_entry_init(ep, rx_entry, iov, iov_count, tag, - ignore, context, addr, op, flags); + rx_entry = rxr_ep_rx_entry_init(ep, rx_entry, msg, tag, ignore, op, flags); rx_entry->state = RXR_RX_INIT; + rx_entry->op = op; return rx_entry; } -/* - * Create a new rx_entry for an unexpected message. Store the packet for later - * processing and put the rx_entry on the appropriate unexpected list. 
- */ -struct rxr_rx_entry *rxr_ep_get_new_unexp_rx_entry(struct rxr_ep *ep, - struct rxr_pkt_entry *pkt_entry) +struct rxr_rx_entry *rxr_ep_alloc_unexp_rx_entry_for_msgrtm(struct rxr_ep *ep, + struct rxr_pkt_entry **pkt_entry_ptr) { struct rxr_rx_entry *rx_entry; - struct rxr_pkt_entry *unexp_entry; - struct rxr_rts_hdr *rts_pkt; - uint32_t op; + struct rxr_pkt_entry *unexp_pkt_entry; + struct fi_msg msg = {0}; - if (rxr_env.rx_copy_unexp && pkt_entry->type == RXR_PKT_ENTRY_POSTED) { - unexp_entry = rxr_get_pkt_entry(ep, ep->rx_unexp_pkt_pool); - if (OFI_UNLIKELY(!unexp_entry)) { - FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, - "Unable to allocate rx_pkt_entry for unexp msg\n"); - return NULL; - } - rxr_copy_pkt_entry(ep, unexp_entry, pkt_entry, - RXR_PKT_ENTRY_UNEXP); - rxr_release_rx_pkt_entry(ep, pkt_entry); - ep->rx_bufs_to_post++; - } else { - unexp_entry = pkt_entry; + unexp_pkt_entry = rxr_pkt_get_unexp(ep, pkt_entry_ptr); + if (OFI_UNLIKELY(!unexp_pkt_entry)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "packet entries exhausted.\n"); + return NULL; } - rts_pkt = rxr_get_rts_hdr(unexp_entry->pkt); - - if (rts_pkt->flags & RXR_TAGGED) - op = ofi_op_tagged; - else - op = ofi_op_msg; - - rx_entry = rxr_ep_get_rx_entry(ep, NULL, 0, rts_pkt->tag, ~0, NULL, - unexp_entry->addr, op, 0); - if (OFI_UNLIKELY(!rx_entry)) + msg.addr = unexp_pkt_entry->addr; + rx_entry = rxr_ep_get_rx_entry(ep, &msg, 0, ~0, ofi_op_msg, 0); + if (OFI_UNLIKELY(!rx_entry)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "RX entries exhausted.\n"); return NULL; + } + rx_entry->rxr_flags = 0; rx_entry->state = RXR_RX_UNEXP; - rx_entry->total_len = rts_pkt->data_len; - rx_entry->rxr_flags = rts_pkt->flags; - rx_entry->unexp_rts_pkt = unexp_entry; + rx_entry->unexp_pkt = unexp_pkt_entry; + rxr_pkt_rtm_init_rx_entry(unexp_pkt_entry, rx_entry); + dlist_insert_tail(&rx_entry->entry, &ep->rx_unexp_list); + return rx_entry; +} - if (op == ofi_op_tagged) - dlist_insert_tail(&rx_entry->entry, &ep->rx_unexp_tagged_list); - else - dlist_insert_tail(&rx_entry->entry, &ep->rx_unexp_list); +struct rxr_rx_entry *rxr_ep_alloc_unexp_rx_entry_for_tagrtm(struct rxr_ep *ep, + struct rxr_pkt_entry **pkt_entry_ptr) +{ + uint64_t tag; + struct rxr_rx_entry *rx_entry; + struct rxr_pkt_entry *unexp_pkt_entry; + struct fi_msg msg = {0}; + + unexp_pkt_entry = rxr_pkt_get_unexp(ep, pkt_entry_ptr); + if (OFI_UNLIKELY(!unexp_pkt_entry)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "packet entries exhausted.\n"); + return NULL; + } + + tag = rxr_pkt_rtm_tag(unexp_pkt_entry); + msg.addr = unexp_pkt_entry->addr; + rx_entry = rxr_ep_get_rx_entry(ep, &msg, tag, ~0, ofi_op_tagged, 0); + if (OFI_UNLIKELY(!rx_entry)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "RX entries exhausted.\n"); + return NULL; + } + rx_entry->rxr_flags = 0; + rx_entry->state = RXR_RX_UNEXP; + rx_entry->unexp_pkt = unexp_pkt_entry; + rxr_pkt_rtm_init_rx_entry(unexp_pkt_entry, rx_entry); + dlist_insert_tail(&rx_entry->entry, &ep->rx_unexp_tagged_list); return rx_entry; } @@ -316,15 +237,18 @@ struct rxr_rx_entry *rxr_ep_split_rx_entry(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry) { struct rxr_rx_entry *rx_entry; - struct rxr_rts_hdr *rts_pkt; - size_t buf_len, consumed_len; + size_t buf_len, consumed_len, data_len; + uint64_t tag; + struct fi_msg msg = {0}; + + assert(rxr_get_base_hdr(pkt_entry->pkt)->type >= RXR_REQ_PKT_BEGIN); + tag = 0; - rts_pkt = rxr_get_rts_hdr(pkt_entry->pkt); if (!consumer_entry) { - rx_entry = rxr_ep_get_rx_entry(ep, posted_entry->iov, - posted_entry->iov_count, - rts_pkt->tag, 0, NULL, - 
pkt_entry->addr, ofi_op_msg, + msg.msg_iov = posted_entry->iov; + msg.iov_count = posted_entry->iov_count; + msg.addr = pkt_entry->addr; + rx_entry = rxr_ep_get_rx_entry(ep, &msg, tag, 0, ofi_op_msg, posted_entry->fi_flags); if (OFI_UNLIKELY(!rx_entry)) return NULL; @@ -340,11 +264,14 @@ struct rxr_rx_entry *rxr_ep_split_rx_entry(struct rxr_ep *ep, rx_entry->iov_count = posted_entry->iov_count; } + rxr_pkt_rtm_init_rx_entry(pkt_entry, rx_entry); + data_len = rx_entry->total_len; buf_len = ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count); - consumed_len = MIN(buf_len, rts_pkt->data_len); + consumed_len = MIN(buf_len, data_len); rx_entry->rxr_flags |= RXR_MULTI_RECV_CONSUMER; + rx_entry->total_len = data_len; rx_entry->fi_flags |= FI_MULTI_RECV; rx_entry->master_entry = posted_entry; rx_entry->cq_entry.len = consumed_len; @@ -361,1650 +288,347 @@ struct rxr_rx_entry *rxr_ep_split_rx_entry(struct rxr_ep *ep, return rx_entry; } -/* Post buf as undirected recv (FI_ADDR_UNSPEC) */ -int rxr_ep_post_buf(struct rxr_ep *ep, uint64_t flags) +/* Post buffers as undirected recv (FI_ADDR_UNSPEC) */ +int rxr_ep_post_buf(struct rxr_ep *ep, const struct fi_msg *posted_recv, uint64_t flags, enum rxr_lower_ep_type lower_ep_type) { - struct fi_msg msg; + struct fi_msg msg = {0}; struct iovec msg_iov; void *desc; - struct rxr_pkt_entry *rx_pkt_entry; + struct rxr_pkt_entry *rx_pkt_entry = NULL; int ret = 0; - rx_pkt_entry = rxr_get_pkt_entry(ep, ep->rx_pkt_pool); + switch (lower_ep_type) { + case SHM_EP: + rx_pkt_entry = rxr_pkt_entry_alloc(ep, ep->rx_pkt_shm_pool); + break; + case EFA_EP: + if (posted_recv) + rx_pkt_entry = rxr_pkt_entry_init_prefix(ep, posted_recv, ep->rx_pkt_efa_pool); + else + rx_pkt_entry = rxr_pkt_entry_alloc(ep, ep->rx_pkt_efa_pool); + break; + default: + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, + "invalid lower EP type %d\n", lower_ep_type); + assert(0 && "invalid lower EP type\n"); + } if (OFI_UNLIKELY(!rx_pkt_entry)) { FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "Unable to allocate rx_pkt_entry\n"); return -FI_ENOMEM; } -#if ENABLE_DEBUG - dlist_insert_tail(&rx_pkt_entry->dbg_entry, - &ep->rx_posted_buf_list); -#endif rx_pkt_entry->x_entry = NULL; - rx_pkt_entry->type = RXR_PKT_ENTRY_POSTED; msg_iov.iov_base = (void *)rxr_pkt_start(rx_pkt_entry); msg_iov.iov_len = ep->mtu_size; + rxr_setup_msg(&msg, &msg_iov, NULL, 1, FI_ADDR_UNSPEC, rx_pkt_entry, 0); - msg.msg_iov = &msg_iov; - desc = rxr_ep_mr_local(ep) ? 
fi_mr_desc(rx_pkt_entry->mr) : NULL; - msg.desc = &desc; - msg.iov_count = 1; - msg.addr = FI_ADDR_UNSPEC; - msg.context = rx_pkt_entry; - msg.data = 0; - - ret = fi_recvmsg(ep->rdm_ep, &msg, flags); - - if (OFI_UNLIKELY(ret)) { - rxr_release_rx_pkt_entry(ep, rx_pkt_entry); + switch (lower_ep_type) { + case SHM_EP: + /* pre-post buffer with shm */ +#if ENABLE_DEBUG + dlist_insert_tail(&rx_pkt_entry->dbg_entry, + &ep->rx_posted_buf_shm_list); +#endif + desc = NULL; + msg.desc = &desc; + ret = fi_recvmsg(ep->shm_ep, &msg, flags); + if (OFI_UNLIKELY(ret)) { + rxr_pkt_entry_release_rx(ep, rx_pkt_entry); + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, + "failed to post buf for shm %d (%s)\n", -ret, + fi_strerror(-ret)); + return ret; + } + ep->posted_bufs_shm++; + break; + case EFA_EP: +#if ENABLE_DEBUG + if (rx_pkt_entry->type != RXR_PKT_ENTRY_USER) + dlist_insert_tail(&rx_pkt_entry->dbg_entry, + &ep->rx_posted_buf_list); +#endif + desc = fi_mr_desc(rx_pkt_entry->mr); + msg.desc = &desc; + /* + * Use the actual receive sizes from the application + * rather than posting the full MTU size, like we do + * when using the bufpool. + */ + if (posted_recv) { + msg_iov.iov_len = posted_recv->msg_iov->iov_len; + msg.data = posted_recv->data; + assert(msg_iov.iov_len <= ep->mtu_size); + } + ret = fi_recvmsg(ep->rdm_ep, &msg, flags); + if (OFI_UNLIKELY(ret)) { + rxr_pkt_entry_release_rx(ep, rx_pkt_entry); + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, + "failed to post buf %d (%s)\n", -ret, + fi_strerror(-ret)); + return ret; + } + ep->posted_bufs_efa++; + break; + default: FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, - "failed to post buf %d (%s)\n", -ret, - fi_strerror(-ret)); - return ret; + "invalid lower EP type %d\n", lower_ep_type); + assert(0 && "invalid lower EP type\n"); } - ep->posted_bufs++; return 0; } -static int rxr_ep_match_unexp_msg(struct dlist_entry *item, const void *arg) +void rxr_tx_entry_init(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, + const struct fi_msg *msg, uint32_t op, uint64_t flags) { - const struct rxr_match_info *match_info = arg; - struct rxr_rx_entry *rx_entry; + uint64_t tx_op_flags; - rx_entry = container_of(item, struct rxr_rx_entry, entry); + tx_entry->type = RXR_TX_ENTRY; + tx_entry->op = op; + tx_entry->tx_id = ofi_buf_index(tx_entry); + tx_entry->state = RXR_TX_REQ; + tx_entry->addr = msg->addr; - return rxr_match_addr(match_info->addr, rx_entry->addr); -} + tx_entry->send_flags = 0; + tx_entry->rxr_flags = 0; + tx_entry->bytes_acked = 0; + tx_entry->bytes_sent = 0; + tx_entry->window = 0; + tx_entry->total_len = ofi_total_iov_len(msg->msg_iov, msg->iov_count); + tx_entry->iov_count = msg->iov_count; + tx_entry->iov_index = 0; + tx_entry->iov_mr_start = 0; + tx_entry->iov_offset = 0; + tx_entry->msg_id = 0; + dlist_init(&tx_entry->queued_pkts); -static int rxr_ep_match_unexp_tmsg(struct dlist_entry *item, const void *arg) -{ - const struct rxr_match_info *match_info = arg; - struct rxr_rx_entry *rx_entry; + memcpy(&tx_entry->iov[0], msg->msg_iov, sizeof(struct iovec) * msg->iov_count); + memset(tx_entry->mr, 0, sizeof(*tx_entry->mr) * msg->iov_count); + if (msg->desc) + memcpy(tx_entry->desc, msg->desc, sizeof(*msg->desc) * msg->iov_count); + else + memset(tx_entry->desc, 0, sizeof(tx_entry->desc)); + + /* + * The prefix is currently not used by the sender, but needs to be + * accounted for when copying the payload into the bounce-buffer. 
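As the comment above says, the sender does not use the receive-side prefix but must skip it when copying the payload; the code that follows advances the first iov by sizeof(struct rxr_pkt_entry) + sizeof(struct rxr_eager_msgrtm_hdr). A stand-alone sketch of that adjustment, with the prefix size as a parameter and the length shrunk as well (the patch itself only moves iov_base):

#include <assert.h>
#include <stddef.h>
#include <sys/uio.h>

/* Illustrative sketch: skip a fixed-size wire prefix at the front of the
 * application's first buffer so the payload copy starts after it. */
static void skip_recv_prefix(struct iovec *iov, size_t prefix_size)
{
        assert(iov->iov_len >= prefix_size);
        iov->iov_base = (char *)iov->iov_base + prefix_size;
        iov->iov_len -= prefix_size;
}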
+ */ + if (ep->use_zcpy_rx) { + assert(tx_entry->iov[0].iov_len >= sizeof(struct rxr_pkt_entry) + sizeof(struct rxr_eager_msgrtm_hdr)); + tx_entry->iov[0].iov_base = (char *)tx_entry->iov[0].iov_base + + sizeof(struct rxr_pkt_entry) + + sizeof(struct rxr_eager_msgrtm_hdr); + } + + /* set flags */ + assert(ep->util_ep.tx_msg_flags == 0 || + ep->util_ep.tx_msg_flags == FI_COMPLETION); + tx_op_flags = ep->util_ep.tx_op_flags; + if (ep->util_ep.tx_msg_flags == 0) + tx_op_flags &= ~FI_COMPLETION; + tx_entry->fi_flags = flags | tx_op_flags; - rx_entry = container_of(item, struct rxr_rx_entry, entry); + /* cq_entry on completion */ + tx_entry->cq_entry.op_context = msg->context; + tx_entry->cq_entry.len = ofi_total_iov_len(msg->msg_iov, msg->iov_count); + if (OFI_LIKELY(tx_entry->cq_entry.len > 0)) + tx_entry->cq_entry.buf = msg->msg_iov[0].iov_base; + else + tx_entry->cq_entry.buf = NULL; - return rxr_match_addr(match_info->addr, rx_entry->addr) && - rxr_match_tag(rx_entry->tag, match_info->ignore, - match_info->tag); + tx_entry->cq_entry.data = msg->data; + switch (op) { + case ofi_op_tagged: + tx_entry->cq_entry.flags = FI_TRANSMIT | FI_MSG | FI_TAGGED; + break; + case ofi_op_write: + tx_entry->cq_entry.flags = FI_RMA | FI_WRITE; + break; + case ofi_op_read_req: + tx_entry->cq_entry.flags = FI_RMA | FI_READ; + break; + case ofi_op_msg: + tx_entry->cq_entry.flags = FI_TRANSMIT | FI_MSG; + break; + case ofi_op_atomic: + tx_entry->cq_entry.flags = (FI_WRITE | FI_ATOMIC); + break; + case ofi_op_atomic_fetch: + case ofi_op_atomic_compare: + tx_entry->cq_entry.flags = (FI_READ | FI_ATOMIC); + break; + default: + FI_WARN(&rxr_prov, FI_LOG_CQ, "invalid operation type\n"); + assert(0); + } } -static int rxr_ep_handle_unexp_match(struct rxr_ep *ep, - struct rxr_rx_entry *rx_entry, - uint64_t tag, uint64_t ignore, - void *context, fi_addr_t addr, - uint32_t op, uint64_t flags) +/* create a new tx entry */ +struct rxr_tx_entry *rxr_ep_alloc_tx_entry(struct rxr_ep *rxr_ep, + const struct fi_msg *msg, + uint32_t op, + uint64_t tag, + uint64_t flags) { - struct rxr_pkt_entry *pkt_entry; - struct rxr_rts_hdr *rts_hdr; - uint64_t bytes_left, len; - int ret = 0; - - rx_entry->fi_flags = flags; - rx_entry->ignore = ignore; - rx_entry->state = RXR_RX_MATCHED; - - pkt_entry = rx_entry->unexp_rts_pkt; - rts_hdr = rxr_get_rts_hdr(pkt_entry->pkt); + struct rxr_tx_entry *tx_entry; - rx_entry->cq_entry.op_context = context; - /* - * we don't expect recv buf from application for discard, - * hence setting to NULL - */ - if (OFI_UNLIKELY(flags & FI_DISCARD)) { - rx_entry->cq_entry.buf = NULL; - rx_entry->cq_entry.len = rts_hdr->data_len; - } else { - rx_entry->cq_entry.buf = rx_entry->iov[0].iov_base; - len = MIN(rx_entry->total_len, - ofi_total_iov_len(rx_entry->iov, - rx_entry->iov_count)); - rx_entry->cq_entry.len = len; + tx_entry = ofi_buf_alloc(rxr_ep->tx_entry_pool); + if (OFI_UNLIKELY(!tx_entry)) { + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "TX entries exhausted.\n"); + return NULL; } - rx_entry->cq_entry.flags = (FI_RECV | FI_MSG); - + rxr_tx_entry_init(rxr_ep, tx_entry, msg, op, flags); if (op == ofi_op_tagged) { - rx_entry->cq_entry.flags |= FI_TAGGED; - rx_entry->cq_entry.tag = rx_entry->tag; - rx_entry->ignore = ignore; - } else { - rx_entry->cq_entry.tag = 0; - rx_entry->ignore = ~0; - } - - rxr_cq_recv_rts_data(ep, rx_entry, rts_hdr); - - /* - * TODO: Unsure how to handle fi_cq_msg_entry when writing completion - * events in the unexpected path. Right now this field is unused. 
If - * that changes we'll need to parse the flags as we get completion - * events from the provider in the recv path and save the flags in the - * rx_entry for the unexp message path to use when the app calls recv. - */ - if (rx_entry->total_len - rx_entry->bytes_done == 0) { - ret = rxr_cq_handle_rx_completion(ep, NULL, - pkt_entry, rx_entry); - if (!ret) - rxr_release_rx_entry(ep, rx_entry); - return 0; + tx_entry->cq_entry.tag = tag; + tx_entry->tag = tag; } - rx_entry->state = RXR_RX_RECV; #if ENABLE_DEBUG - dlist_insert_tail(&rx_entry->rx_pending_entry, &ep->rx_pending_list); - ep->rx_pending++; + dlist_insert_tail(&tx_entry->tx_entry_entry, &rxr_ep->tx_entry_list); #endif - bytes_left = rx_entry->total_len - rx_entry->bytes_done; - if (!rx_entry->window && bytes_left > 0) - ret = rxr_ep_post_cts_or_queue(ep, rx_entry, bytes_left); - - if (pkt_entry->type == RXR_PKT_ENTRY_POSTED) - ep->rx_bufs_to_post++; - rxr_release_rx_pkt_entry(ep, pkt_entry); - return ret; + return tx_entry; } -/* - * Search unexpected list for matching message and process it if found. - * - * Returns 0 if the message is processed, -FI_ENOMSG if no match is found. - */ -static int rxr_ep_check_unexp_msg_list(struct rxr_ep *ep, - const struct iovec *iov, - size_t iov_count, uint64_t tag, - uint64_t ignore, void *context, - fi_addr_t addr, uint32_t op, - uint64_t flags, - struct rxr_rx_entry *posted_entry) +void rxr_release_tx_entry(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry) { - struct rxr_match_info match_info; - struct dlist_entry *match; - struct rxr_rx_entry *rx_entry; - int ret; - - if (op == ofi_op_tagged) { - match_info.addr = addr; - match_info.tag = tag; - match_info.ignore = ignore; - match = dlist_remove_first_match(&ep->rx_unexp_tagged_list, - &rxr_ep_match_unexp_tmsg, - (void *)&match_info); - } else { - match_info.addr = addr; - match = dlist_remove_first_match(&ep->rx_unexp_list, - &rxr_ep_match_unexp_msg, - (void *)&match_info); - } - - if (!match) - return -FI_ENOMSG; + int i, err = 0; - rx_entry = container_of(match, struct rxr_rx_entry, entry); + for (i = 0; i < tx_entry->iov_count; i++) { + if (tx_entry->mr[i]) { + err = fi_close((struct fid *)tx_entry->mr[i]); + if (OFI_UNLIKELY(err)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "mr dereg failed. err=%d\n", err); + efa_eq_write_error(&ep->util_ep, err, -err); + } - /* - * Initialize the matched entry as a multi-recv consumer if the posted - * buffer is a multi-recv buffer. 
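A simplified model of how a matched message consumes a multi-recv buffer, following the consumed_len = MIN(buffer, message) calculation in rxr_ep_split_rx_entry() earlier in this file; the sizes below are made up:

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
        /* Illustrative values only. */
        uint64_t posted_buf_len = 16384;        /* FI_MULTI_RECV buffer remaining */
        uint64_t msg_len = 3000;                /* incoming message size */

        /* Each consumer takes MIN(remaining buffer, message size)... */
        uint64_t consumed = MIN(posted_buf_len, msg_len);       /* 3000 */
        posted_buf_len -= consumed;                              /* 13384 left */

        /* ...and the posted entry keeps tracking whatever is left. */
        printf("consumed=%llu remaining=%llu\n",
               (unsigned long long)consumed, (unsigned long long)posted_buf_len);
        return 0;
}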
- */ - if (posted_entry) { - /* - * rxr_ep_split_rx_entry will setup rx_entry iov and count - */ - rx_entry = rxr_ep_split_rx_entry(ep, posted_entry, rx_entry, - rx_entry->unexp_rts_pkt); - if (OFI_UNLIKELY(!rx_entry)) { - FI_WARN(&rxr_prov, FI_LOG_CQ, - "RX entries exhausted.\n"); - return -FI_ENOBUFS; + tx_entry->mr[i] = NULL; } - } else { - memcpy(rx_entry->iov, iov, sizeof(*rx_entry->iov) * iov_count); - rx_entry->iov_count = iov_count; } - FI_DBG(&rxr_prov, FI_LOG_EP_CTRL, - "Match found in unexp list for a posted recv msg_id: %" PRIu32 - " total_len: %" PRIu64 " tag: %lx\n", - rx_entry->msg_id, rx_entry->total_len, rx_entry->tag); - - ret = rxr_ep_handle_unexp_match(ep, rx_entry, tag, ignore, - context, addr, op, flags); - return ret; +#if ENABLE_DEBUG + dlist_remove(&tx_entry->tx_entry_entry); +#endif + assert(dlist_empty(&tx_entry->queued_pkts)); +#ifdef ENABLE_EFA_POISONING + rxr_poison_mem_region((uint32_t *)tx_entry, + sizeof(struct rxr_tx_entry)); +#endif + tx_entry->state = RXR_TX_FREE; + ofi_buf_free(tx_entry); } -static ssize_t rxr_ep_discard_trecv(struct rxr_ep *ep, - struct rxr_rx_entry *rx_entry, - const struct fi_msg_tagged *msg, - int64_t flags) +int rxr_ep_tx_init_mr_desc(struct rxr_domain *rxr_domain, + struct rxr_tx_entry *tx_entry, + int mr_iov_start, uint64_t access) { - int ret; - - if ((flags & FI_DISCARD) && !(flags & (FI_PEEK | FI_CLAIM))) - return -FI_EINVAL; - - rx_entry->fi_flags |= FI_DISCARD; - rx_entry->rxr_flags |= RXR_RECV_CANCEL; - ret = ofi_cq_write(ep->util_ep.rx_cq, msg->context, - FI_TAGGED | FI_RECV | FI_MSG, - 0, NULL, rx_entry->cq_entry.data, - rx_entry->cq_entry.tag); - rxr_rm_rx_cq_check(ep, ep->util_ep.rx_cq); - return ret; -} + int i, err, ret; -static ssize_t rxr_ep_claim_trecv(struct fid_ep *ep_fid, - const struct fi_msg_tagged *msg, - int64_t flags) -{ - ssize_t ret = 0; - struct rxr_ep *ep; - struct rxr_rx_entry *rx_entry; - struct fi_context *context; + ret = 0; + for (i = mr_iov_start; i < tx_entry->iov_count; ++i) { + if (tx_entry->desc[i]) { + assert(!tx_entry->mr[i]); + continue; + } - ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid); - fastlock_acquire(&ep->util_ep.lock); + if (tx_entry->iov[i].iov_len <= rxr_env.max_memcpy_size) { + assert(!tx_entry->mr[i]); + continue; + } - context = (struct fi_context *)msg->context; - rx_entry = (struct rxr_rx_entry *)context->internal[0]; + err = fi_mr_reg(rxr_domain->rdm_domain, + tx_entry->iov[i].iov_base, + tx_entry->iov[i].iov_len, + access, 0, 0, 0, + &tx_entry->mr[i], NULL); + if (err) { + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, + "fi_mr_reg failed! 
buf: %p len: %ld access: %lx", + tx_entry->iov[i].iov_base, tx_entry->iov[i].iov_len, + access); - if (flags & FI_DISCARD) { - ret = rxr_ep_discard_trecv(ep, rx_entry, msg, flags); - if (OFI_UNLIKELY(ret)) - goto out; + tx_entry->mr[i] = NULL; + ret = err; + } else { + tx_entry->desc[i] = fi_mr_desc(tx_entry->mr[i]); + } } - /* - * Handle unexp match entry even for discard entry as we are sinking - * messages for that case - */ - memcpy(rx_entry->iov, msg->msg_iov, - sizeof(*msg->msg_iov) * msg->iov_count); - rx_entry->iov_count = msg->iov_count; - - ret = rxr_ep_handle_unexp_match(ep, rx_entry, msg->tag, - msg->ignore, msg->context, - msg->addr, ofi_op_tagged, flags); - -out: - fastlock_release(&ep->util_ep.lock); return ret; } -static ssize_t rxr_ep_peek_trecv(struct fid_ep *ep_fid, - const struct fi_msg_tagged *msg, - uint64_t flags) +void rxr_prepare_desc_send(struct rxr_domain *rxr_domain, + struct rxr_tx_entry *tx_entry) { - ssize_t ret = 0; - struct rxr_ep *ep; - struct dlist_entry *match; - struct rxr_match_info match_info; - struct rxr_rx_entry *rx_entry; - struct fi_context *context; - struct rxr_pkt_entry *pkt_entry; - struct rxr_rts_hdr *rts_hdr; - - ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid); - - fastlock_acquire(&ep->util_ep.lock); - - rxr_ep_progress_internal(ep); - match_info.addr = msg->addr; - match_info.tag = msg->tag; - match_info.ignore = msg->ignore; - - match = dlist_find_first_match(&ep->rx_unexp_tagged_list, - &rxr_ep_match_unexp_tmsg, - (void *)&match_info); - if (!match) { - FI_DBG(&rxr_prov, FI_LOG_EP_CTRL, - "Message not found addr: %" PRIu64 - " tag: %lx ignore %lx\n", msg->addr, msg->tag, - msg->ignore); - ret = ofi_cq_write_error_peek(ep->util_ep.rx_cq, msg->tag, - msg->context); - goto out; - } - - rx_entry = container_of(match, struct rxr_rx_entry, entry); - context = (struct fi_context *)msg->context; - if (flags & FI_CLAIM) { - context->internal[0] = rx_entry; - dlist_remove(match); - } else if (flags & FI_DISCARD) { - dlist_remove(match); - - ret = rxr_ep_discard_trecv(ep, rx_entry, msg, flags); - if (ret) - goto out; - - memcpy(rx_entry->iov, msg->msg_iov, - sizeof(*msg->msg_iov) * msg->iov_count); - rx_entry->iov_count = msg->iov_count; - - ret = rxr_ep_handle_unexp_match(ep, rx_entry, - msg->tag, msg->ignore, - msg->context, msg->addr, - ofi_op_tagged, flags); - - goto out; - } - - pkt_entry = rx_entry->unexp_rts_pkt; - rts_hdr = rxr_get_rts_hdr(pkt_entry->pkt); + size_t offset; + int index; - if (rts_hdr->flags & RXR_REMOTE_CQ_DATA) { - rx_entry->cq_entry.data = - rxr_get_ctrl_cq_pkt(rts_hdr)->hdr.cq_data; - rx_entry->cq_entry.flags |= FI_REMOTE_CQ_DATA; + /* Set the iov index and iov offset from bytes sent */ + offset = tx_entry->bytes_sent; + for (index = 0; index < tx_entry->iov_count; ++index) { + if (offset >= tx_entry->iov[index].iov_len) { + offset -= tx_entry->iov[index].iov_len; + } else { + tx_entry->iov_index = index; + tx_entry->iov_offset = offset; + break; + } } - if (ep->util_ep.caps & FI_SOURCE) - ret = ofi_cq_write_src(ep->util_ep.rx_cq, context, - FI_TAGGED | FI_RECV, - rts_hdr->data_len, NULL, - rx_entry->cq_entry.data, rts_hdr->tag, - rx_entry->addr); - else - ret = ofi_cq_write(ep->util_ep.rx_cq, context, - FI_TAGGED | FI_RECV, - rts_hdr->data_len, NULL, - rx_entry->cq_entry.data, rts_hdr->tag); - rxr_rm_rx_cq_check(ep, ep->util_ep.rx_cq); -out: - fastlock_release(&ep->util_ep.lock); - return ret; -} - -static ssize_t rxr_multi_recv(struct rxr_ep *rxr_ep, const struct iovec *iov, - size_t iov_count, fi_addr_t 
addr, uint64_t tag, - uint64_t ignore, void *context, uint32_t op, - uint64_t flags) -{ - struct rxr_rx_entry *rx_entry; - int ret = 0; - - if ((ofi_total_iov_len(iov, iov_count) - < rxr_ep->min_multi_recv_size) || op != ofi_op_msg) - return -FI_EINVAL; - - /* - * Always get new rx_entry of type RXR_MULTI_RECV_POSTED when in the - * multi recv path. The posted entry will not be used for receiving - * messages but will be used for tracking the application's buffer and - * when to write the completion to release the buffer. - */ - rx_entry = rxr_ep_get_rx_entry(rxr_ep, iov, iov_count, tag, - ignore, context, - (rxr_ep->util_ep.caps & - FI_DIRECTED_RECV) ? addr : - FI_ADDR_UNSPEC, op, flags); - if (OFI_UNLIKELY(!rx_entry)) { - rxr_ep_progress_internal(rxr_ep); - return -FI_EAGAIN; - } - - rx_entry->rxr_flags |= RXR_MULTI_RECV_POSTED; - dlist_init(&rx_entry->multi_recv_consumers); - dlist_init(&rx_entry->multi_recv_entry); - - while (!dlist_empty(&rxr_ep->rx_unexp_list)) { - ret = rxr_ep_check_unexp_msg_list(rxr_ep, NULL, 0, tag, - ignore, context, - (rxr_ep->util_ep.caps - & FI_DIRECTED_RECV) ? - addr : FI_ADDR_UNSPEC, - op, flags, rx_entry); - - if (!rxr_multi_recv_buffer_available(rxr_ep, rx_entry)) { - /* - * Multi recv buffer consumed by short, unexp messages, - * free posted rx_entry. - */ - if (rxr_multi_recv_buffer_complete(rxr_ep, rx_entry)) - rxr_release_rx_entry(rxr_ep, rx_entry); - /* - * Multi recv buffer has been consumed, but waiting on - * long msg completion. Last msg completion will free - * posted rx_entry. - */ - if (ret == -FI_ENOMSG) - return 0; - return ret; - } - - if (ret == -FI_ENOMSG) { - ret = 0; - break; - } - - /* - * Error was encountered when processing unexpected messages, - * but there is buffer space available. Add the posted entry to - * the rx_list. - */ - if (ret) - break; - } - - dlist_insert_tail(&rx_entry->entry, &rxr_ep->rx_list); - return ret; -} -/* - * create a rx entry and verify in unexpected message list - * else add to posted recv list - */ -static ssize_t rxr_recv(struct fid_ep *ep, const struct iovec *iov, - size_t iov_count, fi_addr_t addr, uint64_t tag, - uint64_t ignore, void *context, uint32_t op, - uint64_t flags) -{ - ssize_t ret = 0; - struct rxr_ep *rxr_ep; - struct dlist_entry *unexp_list; - struct rxr_rx_entry *rx_entry; - uint64_t rx_op_flags; - - FI_DBG(&rxr_prov, FI_LOG_EP_DATA, - "%s: iov_len: %lu tag: %lx ignore: %lx op: %x flags: %lx\n", - __func__, ofi_total_iov_len(iov, iov_count), tag, ignore, - op, flags); - - rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); - - assert(iov_count <= rxr_ep->rx_iov_limit); - - rxr_perfset_start(rxr_ep, perf_rxr_recv); - - assert(rxr_ep->util_ep.rx_msg_flags == 0 || rxr_ep->util_ep.rx_msg_flags == FI_COMPLETION); - rx_op_flags = rxr_ep->util_ep.rx_op_flags; - if (rxr_ep->util_ep.rx_msg_flags == 0) - rx_op_flags &= ~FI_COMPLETION; - flags = flags | rx_op_flags; - - fastlock_acquire(&rxr_ep->util_ep.lock); - if (OFI_UNLIKELY(is_rx_res_full(rxr_ep))) { - ret = -FI_EAGAIN; - goto out; - } - - if (flags & FI_MULTI_RECV) { - ret = rxr_multi_recv(rxr_ep, iov, iov_count, addr, tag, ignore, - context, op, flags); - goto out; - } - - unexp_list = (op == ofi_op_tagged) ? &rxr_ep->rx_unexp_tagged_list : - &rxr_ep->rx_unexp_list; - - if (!dlist_empty(unexp_list)) { - ret = rxr_ep_check_unexp_msg_list(rxr_ep, iov, iov_count, tag, - ignore, context, - (rxr_ep->util_ep.caps - & FI_DIRECTED_RECV) ? 
- addr : FI_ADDR_UNSPEC, - op, flags, NULL); - - if (ret != -FI_ENOMSG) - goto out; - ret = 0; - } - - rx_entry = rxr_ep_get_rx_entry(rxr_ep, iov, iov_count, tag, - ignore, context, - (rxr_ep->util_ep.caps & - FI_DIRECTED_RECV) ? addr : - FI_ADDR_UNSPEC, op, flags); - - if (OFI_UNLIKELY(!rx_entry)) { - ret = -FI_EAGAIN; - rxr_ep_progress_internal(rxr_ep); - goto out; - } - - if (op == ofi_op_tagged) - dlist_insert_tail(&rx_entry->entry, &rxr_ep->rx_tagged_list); - else - dlist_insert_tail(&rx_entry->entry, &rxr_ep->rx_list); - -out: - fastlock_release(&rxr_ep->util_ep.lock); - - rxr_perfset_end(rxr_ep, perf_rxr_recv); - return ret; -} - -static ssize_t rxr_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, - uint64_t flags) -{ - return rxr_recv(ep_fid, msg->msg_iov, msg->iov_count, msg->addr, - 0, 0, msg->context, ofi_op_msg, flags); -} - -static ssize_t rxr_ep_recv(struct fid_ep *ep, void *buf, size_t len, - void *desc, fi_addr_t src_addr, void *context) -{ - struct fi_msg msg; - struct iovec msg_iov; - - memset(&msg, 0, sizeof(msg)); - msg_iov.iov_base = buf; - msg_iov.iov_len = len; - - msg.msg_iov = &msg_iov; - msg.desc = &desc; - msg.iov_count = 1; - msg.addr = src_addr; - msg.context = context; - msg.data = 0; - - return rxr_ep_recvmsg(ep, &msg, 0); -} - -static ssize_t rxr_ep_recvv(struct fid_ep *ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - void *context) -{ - struct fi_msg msg; - - memset(&msg, 0, sizeof(msg)); - msg.msg_iov = iov; - msg.desc = desc; - msg.iov_count = count; - msg.addr = src_addr; - msg.context = context; - msg.data = 0; - - return rxr_ep_recvmsg(ep, &msg, 0); -} - - -void rxr_generic_tx_entry_init(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, - const struct iovec *iov, size_t iov_count, - const struct fi_rma_iov *rma_iov, - size_t rma_iov_count, fi_addr_t addr, - uint64_t tag, uint64_t data, void *context, - uint32_t op, uint64_t flags) -{ - tx_entry->type = RXR_TX_ENTRY; - tx_entry->tx_id = ofi_buf_index(tx_entry); - tx_entry->state = RXR_TX_RTS; - tx_entry->addr = addr; - tx_entry->tag = tag; - - tx_entry->send_flags = 0; - tx_entry->bytes_acked = 0; - tx_entry->bytes_sent = 0; - tx_entry->window = 0; - tx_entry->total_len = ofi_total_iov_len(iov, iov_count); - tx_entry->iov_count = iov_count; - tx_entry->iov_index = 0; - tx_entry->iov_mr_start = 0; - tx_entry->iov_offset = 0; - tx_entry->msg_id = ~0; - dlist_init(&tx_entry->queued_pkts); - - memcpy(&tx_entry->iov[0], iov, sizeof(*iov) * iov_count); - - /* cq_entry on completion */ - tx_entry->cq_entry.op_context = context; - tx_entry->cq_entry.len = ofi_total_iov_len(iov, iov_count); - if (OFI_LIKELY(tx_entry->cq_entry.len > 0)) - tx_entry->cq_entry.buf = iov[0].iov_base; - else - tx_entry->cq_entry.buf = NULL; - - tx_entry->cq_entry.data = data; - tx_entry->cq_entry.tag = 0; - switch (op) { - case ofi_op_tagged: - tx_entry->cq_entry.flags = FI_TRANSMIT | FI_MSG | FI_TAGGED; - tx_entry->cq_entry.tag = tag; - break; - case ofi_op_write: - tx_entry->cq_entry.flags = FI_RMA | FI_WRITE; - break; - case ofi_op_read_req: - tx_entry->cq_entry.flags = FI_RMA | FI_READ; - break; - case ofi_op_msg: - tx_entry->cq_entry.flags = FI_TRANSMIT | FI_MSG; - break; - default: - FI_WARN(&rxr_prov, FI_LOG_CQ, "invalid operation type in %s\n", - __func__); - assert(0); - } - - if (tx_entry->cq_entry.flags & FI_RMA) { - assert(rma_iov_count>0); - assert(rma_iov); - tx_entry->rma_iov_count = rma_iov_count; - memcpy(tx_entry->rma_iov, rma_iov, sizeof(struct fi_rma_iov) * rma_iov_count); 
- } -} - -/* create a new tx entry */ -struct rxr_tx_entry *rxr_ep_tx_entry_init(struct rxr_ep *rxr_ep, const struct iovec *iov, size_t iov_count, - const struct fi_rma_iov *rma_iov, size_t rma_iov_count, - fi_addr_t addr, uint64_t tag, uint64_t data, void *context, - uint32_t op, uint64_t flags) -{ - struct rxr_tx_entry *tx_entry; - uint64_t tx_op_flags; - - tx_entry = ofi_buf_alloc(rxr_ep->tx_entry_pool); - if (OFI_UNLIKELY(!tx_entry)) { - FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "TX entries exhausted.\n"); - return NULL; - } - -#if ENABLE_DEBUG - dlist_insert_tail(&tx_entry->tx_entry_entry, &rxr_ep->tx_entry_list); -#endif - - rxr_generic_tx_entry_init(rxr_ep, tx_entry, iov, iov_count, rma_iov, - rma_iov_count, addr, tag, data, context, - op, flags); - - assert(rxr_ep->util_ep.tx_msg_flags == 0 || rxr_ep->util_ep.tx_msg_flags == FI_COMPLETION); - tx_op_flags = rxr_ep->util_ep.tx_op_flags; - if (rxr_ep->util_ep.tx_msg_flags == 0) - tx_op_flags &= ~FI_COMPLETION; - tx_entry->fi_flags = flags | tx_op_flags; - - return tx_entry; -} - -/* - * Copies all consecutive small iov's into one buffer. If the function reaches - * an iov greater than the max memcpy size, it will end, only copying up to - * that iov. - */ -static size_t rxr_copy_from_iov(void *buf, uint64_t remaining_len, - struct rxr_tx_entry *tx_entry) -{ - struct iovec *tx_iov = tx_entry->iov; - uint64_t done = 0, len; - - while (tx_entry->iov_index < tx_entry->iov_count && - done < remaining_len) { - len = tx_iov[tx_entry->iov_index].iov_len; - if (tx_entry->mr[tx_entry->iov_index]) - break; - - len -= tx_entry->iov_offset; - - /* - * If the amount to be written surpasses the remaining length, - * copy up to the remaining length and return, else copy the - * entire iov and continue. - */ - if (done + len > remaining_len) { - len = remaining_len - done; - memcpy((char *)buf + done, - (char *)tx_iov[tx_entry->iov_index].iov_base + - tx_entry->iov_offset, len); - tx_entry->iov_offset += len; - done += len; - break; - } - memcpy((char *)buf + done, - (char *)tx_iov[tx_entry->iov_index].iov_base + - tx_entry->iov_offset, len); - tx_entry->iov_index++; - tx_entry->iov_offset = 0; - done += len; - } - return done; -} - -ssize_t rxr_ep_send_msg(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry, - const struct fi_msg *msg, uint64_t flags) -{ - struct rxr_peer *peer; - size_t ret; - - assert(ep->tx_pending <= ep->max_outstanding_tx); - - if (ep->tx_pending == ep->max_outstanding_tx) - return -FI_EAGAIN; - - peer = rxr_ep_get_peer(ep, pkt_entry->addr); - if (peer->rnr_state & RXR_PEER_IN_BACKOFF) - return -FI_EAGAIN; - -#if ENABLE_DEBUG - dlist_insert_tail(&pkt_entry->dbg_entry, &ep->tx_pkt_list); -#ifdef ENABLE_RXR_PKT_DUMP - rxr_ep_print_pkt("Sent", ep, (struct rxr_base_hdr *)pkt_entry->pkt); -#endif -#endif - ret = fi_sendmsg(ep->rdm_ep, msg, flags); - - if (OFI_LIKELY(!ret)) - rxr_ep_inc_tx_pending(ep, peer); - - return ret; -} - -static ssize_t rxr_ep_send_data_pkt_entry(struct rxr_ep *ep, - struct rxr_tx_entry *tx_entry, - struct rxr_pkt_entry *pkt_entry, - struct rxr_data_pkt *data_pkt) -{ - uint64_t payload_size; - - payload_size = MIN(tx_entry->total_len - tx_entry->bytes_sent, - ep->max_data_payload_size); - payload_size = MIN(payload_size, tx_entry->window); - data_pkt->hdr.seg_size = payload_size; - - pkt_entry->pkt_size = ofi_copy_from_iov(data_pkt->data, - payload_size, - tx_entry->iov, - tx_entry->iov_count, - tx_entry->bytes_sent); - assert(pkt_entry->pkt_size == payload_size); - - pkt_entry->pkt_size += RXR_DATA_HDR_SIZE; - 
pkt_entry->addr = tx_entry->addr; - - return rxr_ep_send_pkt_flags(ep, pkt_entry, tx_entry->addr, - tx_entry->send_flags); -} - -/* If mr local is not set, will skip copying and only send user buffers */ -static ssize_t rxr_ep_mr_send_data_pkt_entry(struct rxr_ep *ep, - struct rxr_tx_entry *tx_entry, - struct rxr_pkt_entry *pkt_entry, - struct rxr_data_pkt *data_pkt) -{ - /* The user's iov */ - struct iovec *tx_iov = tx_entry->iov; - /* The constructed iov to be passed to sendv - * and corresponding fid_mrs - */ - struct iovec iov[ep->core_iov_limit]; - struct fid_mr *mr[ep->core_iov_limit]; - /* Constructed iov's total size */ - uint64_t payload_size = 0; - /* pkt_entry offset to write data into */ - uint64_t pkt_used = 0; - /* Remaining size that can fit in the constructed iov */ - uint64_t remaining_len = MIN(tx_entry->window, - ep->max_data_payload_size); - /* The constructed iov's index */ - size_t i = 0; - size_t len = 0; - - ssize_t ret; - - /* Assign packet header in constructed iov */ - iov[i].iov_base = rxr_pkt_start(pkt_entry); - iov[i].iov_len = RXR_DATA_HDR_SIZE; - mr[i] = rxr_ep_mr_local(ep) ? fi_mr_desc(pkt_entry->mr) : NULL; - i++; - - /* - * Loops until payload size is at max, all user iovs are sent, the - * constructed iov count is greater than the core iov limit, or the tx - * entry window is exhausted. Each iteration fills one entry of the - * iov to be sent. - */ - while (tx_entry->iov_index < tx_entry->iov_count && - remaining_len > 0 && i < ep->core_iov_limit) { - /* If the iov was pre registered after the RTS */ - if (!rxr_ep_mr_local(ep) || - tx_entry->mr[tx_entry->iov_index]) { - iov[i].iov_base = - (char *)tx_iov[tx_entry->iov_index].iov_base + - tx_entry->iov_offset; - mr[i] = rxr_ep_mr_local(ep) ? - fi_mr_desc(tx_entry->mr[tx_entry->iov_index]) : - NULL; - - len = tx_iov[tx_entry->iov_index].iov_len - - tx_entry->iov_offset; - if (len > remaining_len) { - len = remaining_len; - tx_entry->iov_offset += len; - } else { - tx_entry->iov_index++; - tx_entry->iov_offset = 0; - } - iov[i].iov_len = len; - } else { - /* - * Copies any consecutive small iov's, returning size - * written while updating iov index and offset - */ - len = rxr_copy_from_iov((char *)data_pkt->data + - pkt_used, - remaining_len, - tx_entry); - - iov[i].iov_base = (char *)data_pkt->data + pkt_used; - iov[i].iov_len = len; - mr[i] = fi_mr_desc(pkt_entry->mr); - pkt_used += len; - } - payload_size += len; - remaining_len -= len; - i++; - } - data_pkt->hdr.seg_size = (uint16_t)payload_size; - pkt_entry->pkt_size = payload_size + RXR_DATA_HDR_SIZE; - pkt_entry->addr = tx_entry->addr; - - FI_DBG(&rxr_prov, FI_LOG_EP_DATA, - "Sending an iov count, %zu with payload size: %lu.\n", - i, payload_size); - ret = rxr_ep_sendv_pkt(ep, pkt_entry, tx_entry->addr, - (const struct iovec *)iov, - (void **)mr, i, tx_entry->send_flags); - return ret; -} - -ssize_t rxr_ep_post_data(struct rxr_ep *rxr_ep, - struct rxr_tx_entry *tx_entry) -{ - struct rxr_pkt_entry *pkt_entry; - struct rxr_data_pkt *data_pkt; - ssize_t ret; - - pkt_entry = rxr_get_pkt_entry(rxr_ep, rxr_ep->tx_pkt_pool); - - if (OFI_UNLIKELY(!pkt_entry)) - return -FI_ENOMEM; - - pkt_entry->x_entry = (void *)tx_entry; - pkt_entry->addr = tx_entry->addr; - - data_pkt = (struct rxr_data_pkt *)pkt_entry->pkt; - - data_pkt->hdr.type = RXR_DATA_PKT; - data_pkt->hdr.version = RXR_PROTOCOL_VERSION; - data_pkt->hdr.flags = 0; - - data_pkt->hdr.rx_id = tx_entry->rx_id; - - /* - * Data packets are sent in order so using bytes_sent is okay here. 
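The per-packet payload math in the removed rxr_ep_send_data_pkt_entry() above is easy to check by hand; a small sketch with illustrative numbers (seg_offset is just bytes_sent because, as noted, DATA packets are sent in order):

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
        /* Illustrative values only. */
        uint64_t total_len = 70000;             /* message size */
        uint64_t bytes_sent = 16384;            /* already on the wire */
        uint64_t max_data_payload_size = 8192;  /* per-packet payload limit */
        uint64_t window = 4096;                 /* receiver-granted window */

        /* Same clamping as the DATA packet path: never exceed what is left,
         * the per-packet limit, or the remaining window. */
        uint64_t payload = MIN(total_len - bytes_sent, max_data_payload_size); /* 8192 */
        payload = MIN(payload, window);                                         /* 4096 */

        printf("seg_offset=%llu seg_size=%llu\n",
               (unsigned long long)bytes_sent, (unsigned long long)payload);
        return 0;
}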
- */ - data_pkt->hdr.seg_offset = tx_entry->bytes_sent; - - if (efa_mr_cache_enable) { - ret = rxr_ep_mr_send_data_pkt_entry(rxr_ep, tx_entry, pkt_entry, - data_pkt); - } else { - ret = rxr_ep_send_data_pkt_entry(rxr_ep, tx_entry, pkt_entry, - data_pkt); - } - - if (OFI_UNLIKELY(ret)) { - rxr_release_tx_pkt_entry(rxr_ep, pkt_entry); - return ret; - } - data_pkt = rxr_get_data_pkt(pkt_entry->pkt); - tx_entry->bytes_sent += data_pkt->hdr.seg_size; - tx_entry->window -= data_pkt->hdr.seg_size; - - return ret; -} - -ssize_t rxr_ep_post_readrsp(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry) -{ - struct rxr_pkt_entry *pkt_entry; - ssize_t ret; - size_t data_len; - - pkt_entry = rxr_get_pkt_entry(ep, ep->tx_pkt_pool); - if (OFI_UNLIKELY(!pkt_entry)) - return -FI_EAGAIN; - - rxr_ep_init_readrsp_pkt_entry(ep, tx_entry, pkt_entry); - ret = rxr_ep_send_pkt(ep, pkt_entry, tx_entry->addr); - if (OFI_UNLIKELY(ret)) { - rxr_release_tx_pkt_entry(ep, pkt_entry); - FI_WARN(&rxr_prov, FI_LOG_CQ, - "Failed to send a read response packet: ret %zd\n", ret); - return ret; - } - - data_len = rxr_get_readrsp_hdr(pkt_entry->pkt)->seg_size; - tx_entry->bytes_sent += data_len; - tx_entry->window -= data_len; - assert(tx_entry->window >= 0); - assert(tx_entry->bytes_sent <= tx_entry->total_len); - assert(tx_entry->bytes_acked == 0); - return 0; -} - -void rxr_ep_calc_cts_window_credits(struct rxr_ep *ep, struct rxr_peer *peer, - uint64_t size, int request, - int *window, int *credits) -{ - struct rxr_av *av; - int num_peers; - - /* - * Adjust the peer credit pool based on the current AV size, which could - * have grown since the time this peer was initialized. - */ - av = rxr_ep_av(ep); - num_peers = av->rdm_av_used - 1; - if (num_peers && ofi_div_ceil(rxr_env.rx_window_size, num_peers) < peer->rx_credits) - peer->rx_credits = ofi_div_ceil(peer->rx_credits, num_peers); - - /* - * Allocate credits for this transfer based on the request, the number - * of available data buffers, and the number of outstanding peers this - * endpoint is actively tracking in the AV. Also ensure that a minimum - * number of credits are allocated to the transfer so the sender can - * make progress. 
- */ - *credits = MIN(MIN(ep->available_data_bufs, ep->posted_bufs), - peer->rx_credits); - *credits = MIN(request, *credits); - *credits = MAX(*credits, rxr_env.tx_min_credits); - *window = MIN(size, *credits * ep->max_data_payload_size); - if (peer->rx_credits > ofi_div_ceil(*window, ep->max_data_payload_size)) - peer->rx_credits -= ofi_div_ceil(*window, ep->max_data_payload_size); -} - -void rxr_ep_init_cts_pkt_entry(struct rxr_ep *ep, - struct rxr_rx_entry *rx_entry, - struct rxr_pkt_entry *pkt_entry, - uint64_t size, - int *credits) -{ - int window = 0; - struct rxr_cts_hdr *cts_hdr; - struct rxr_peer *peer; - - cts_hdr = (struct rxr_cts_hdr *)pkt_entry->pkt; - - cts_hdr->type = RXR_CTS_PKT; - cts_hdr->version = RXR_PROTOCOL_VERSION; - cts_hdr->flags = 0; - - if (rx_entry->cq_entry.flags & FI_READ) - cts_hdr->flags |= RXR_READ_REQ; - - cts_hdr->tx_id = rx_entry->tx_id; - cts_hdr->rx_id = rx_entry->rx_id; - - peer = rxr_ep_get_peer(ep, rx_entry->addr); - rxr_ep_calc_cts_window_credits(ep, peer, size, rx_entry->credit_request, - &window, credits); - cts_hdr->window = window; - - pkt_entry->pkt_size = RXR_CTS_HDR_SIZE; - pkt_entry->addr = rx_entry->addr; - pkt_entry->x_entry = (void *)rx_entry; -} - -void rxr_ep_init_connack_pkt_entry(struct rxr_ep *ep, - struct rxr_pkt_entry *pkt_entry, - fi_addr_t addr) -{ - struct rxr_connack_hdr *connack_hdr; - - connack_hdr = (struct rxr_connack_hdr *)pkt_entry->pkt; - - connack_hdr->type = RXR_CONNACK_PKT; - connack_hdr->version = RXR_PROTOCOL_VERSION; - connack_hdr->flags = 0; - - pkt_entry->pkt_size = RXR_CONNACK_HDR_SIZE; - pkt_entry->addr = addr; -} - -void rxr_ep_init_readrsp_pkt_entry(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, - struct rxr_pkt_entry *pkt_entry) -{ - struct rxr_readrsp_pkt *readrsp_pkt; - struct rxr_readrsp_hdr *readrsp_hdr; - size_t mtu = ep->mtu_size; - - readrsp_pkt = (struct rxr_readrsp_pkt *)pkt_entry->pkt; - readrsp_hdr = &readrsp_pkt->hdr; - readrsp_hdr->type = RXR_READRSP_PKT; - readrsp_hdr->version = RXR_PROTOCOL_VERSION; - readrsp_hdr->flags = 0; - readrsp_hdr->tx_id = tx_entry->tx_id; - readrsp_hdr->rx_id = tx_entry->rx_id; - readrsp_hdr->seg_size = ofi_copy_from_iov(readrsp_pkt->data, - mtu - RXR_READRSP_HDR_SIZE, - tx_entry->iov, - tx_entry->iov_count, 0); - pkt_entry->pkt_size = RXR_READRSP_HDR_SIZE + readrsp_hdr->seg_size; - pkt_entry->addr = tx_entry->addr; -} - -/* Initialize RTS packet */ -void rxr_init_rts_pkt_entry(struct rxr_ep *ep, - struct rxr_tx_entry *tx_entry, - struct rxr_pkt_entry *pkt_entry) -{ - struct rxr_rts_hdr *rts_hdr; - struct rxr_peer *peer; - char *data, *src; - uint64_t data_len; - size_t mtu = ep->mtu_size; - int rmalen = 0; - - rts_hdr = (struct rxr_rts_hdr *)pkt_entry->pkt; - peer = rxr_ep_get_peer(ep, tx_entry->addr); - - rts_hdr->type = RXR_RTS_PKT; - rts_hdr->version = RXR_PROTOCOL_VERSION; - rts_hdr->tag = tx_entry->tag; - - rts_hdr->data_len = tx_entry->total_len; - rts_hdr->tx_id = tx_entry->tx_id; - rts_hdr->msg_id = tx_entry->msg_id; - - /* - * Even with protocol versions prior to v3 that did not include a - * request in the RTS, the receiver can test for this flag and decide if - * it should be used as a heuristic for credit calculation. If the - * receiver is on <3 protocol version, the flag and the request just get - * ignored. 
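For reference, the clamping in the removed rxr_ep_calc_cts_window_credits() in this hunk works out as below; the numbers are illustrative, and the sketch skips the peer rx_credits bookkeeping that follows the window calculation:

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
        /* Illustrative inputs only. */
        int available_data_bufs = 64, posted_bufs = 32, rx_credits = 16;
        int request = 10, tx_min_credits = 4;
        uint64_t size = 70000, max_data_payload_size = 8192;

        int credits = MIN(MIN(available_data_bufs, posted_bufs), rx_credits);  /* 16 */
        credits = MIN(request, credits);                                        /* 10 */
        credits = MAX(credits, tx_min_credits);                                 /* 10 */

        uint64_t window = MIN(size, credits * max_data_payload_size);          /* 70000 */

        printf("credits=%d window=%llu\n", credits, (unsigned long long)window);
        return 0;
}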
- */ - rts_hdr->flags |= RXR_CREDIT_REQUEST; - rts_hdr->credit_request = tx_entry->credit_request; - - if (tx_entry->fi_flags & FI_REMOTE_CQ_DATA) { - rts_hdr->flags = RXR_REMOTE_CQ_DATA; - pkt_entry->pkt_size = RXR_CTRL_HDR_SIZE; - rxr_get_ctrl_cq_pkt(rts_hdr)->hdr.cq_data = - tx_entry->cq_entry.data; - src = rxr_get_ctrl_cq_pkt(rts_hdr)->data; - } else { - rts_hdr->flags = 0; - pkt_entry->pkt_size = RXR_CTRL_HDR_SIZE_NO_CQ; - src = rxr_get_ctrl_pkt(rts_hdr)->data; - } - - rts_hdr->addrlen = 0; - if (OFI_UNLIKELY(peer->state != RXR_PEER_ACKED)) { - /* - * This is the first communication with this peer on this - * endpoint, so send the core's address for this EP in the RTS - * so the remote side can insert it into its address vector. - */ - rts_hdr->addrlen = ep->core_addrlen; - rts_hdr->flags |= RXR_REMOTE_SRC_ADDR; - memcpy(src, ep->core_addr, rts_hdr->addrlen); - src += rts_hdr->addrlen; - pkt_entry->pkt_size += rts_hdr->addrlen; - } - - rts_hdr->rma_iov_count = 0; - if (tx_entry->cq_entry.flags & FI_RMA) { - if (tx_entry->cq_entry.flags & FI_WRITE) { - rts_hdr->flags |= RXR_WRITE; - } else { - assert(tx_entry->cq_entry.flags & FI_READ); - rts_hdr->flags |= RXR_READ_REQ; - } - - rmalen = tx_entry->rma_iov_count * sizeof(struct fi_rma_iov); - rts_hdr->rma_iov_count = tx_entry->rma_iov_count; - memcpy(src, tx_entry->rma_iov, rmalen); - src += rmalen; - pkt_entry->pkt_size += rmalen; - } - - /* - * currently copying for both INJECT and SEND, - * need to optimize for SEND (small & large) messages - */ - if (rts_hdr->flags & RXR_READ_REQ) { - /* no data to send, but need to send rx_id and window */ - memcpy(src, &tx_entry->rma_loc_rx_id, sizeof(uint64_t)); - src += sizeof(uint64_t); - pkt_entry->pkt_size += sizeof(uint64_t); - memcpy(src, &tx_entry->rma_window, sizeof(uint64_t)); - src += sizeof(uint64_t); - pkt_entry->pkt_size += sizeof(uint64_t); - } else { - data = src; - data_len = ofi_copy_from_iov(data, mtu - pkt_entry->pkt_size, - tx_entry->iov, tx_entry->iov_count, 0); - assert(data_len == rxr_get_rts_data_size(ep, rts_hdr)); - - pkt_entry->pkt_size += data_len; - } - - assert(pkt_entry->pkt_size <= mtu); - pkt_entry->addr = tx_entry->addr; - pkt_entry->x_entry = (void *)tx_entry; - - if (tx_entry->cq_entry.flags & FI_TAGGED) - rts_hdr->flags |= RXR_TAGGED; -} - -static void rxr_inline_mr_reg(struct rxr_domain *rxr_domain, - struct rxr_tx_entry *tx_entry, - size_t index) -{ - ssize_t ret; - tx_entry->iov_mr_start = index; - while (index < tx_entry->iov_count) { - if (tx_entry->iov[index].iov_len > rxr_env.max_memcpy_size) { - ret = fi_mr_reg(rxr_domain->rdm_domain, - tx_entry->iov[index].iov_base, - tx_entry->iov[index].iov_len, - FI_SEND, 0, 0, 0, - &tx_entry->mr[index], NULL); - if (ret) - tx_entry->mr[index] = NULL; - } - index++; - } - - return; -} - -/* Post request to send */ -static size_t rxr_ep_post_rts(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry) -{ - struct rxr_pkt_entry *pkt_entry; - struct rxr_peer *peer; - size_t pending = 0; - ssize_t ret; - uint64_t data_sent, offset; - int i; - - pkt_entry = rxr_get_pkt_entry(rxr_ep, rxr_ep->tx_pkt_pool); - - if (OFI_UNLIKELY(!pkt_entry)) - return -FI_EAGAIN; - - /* - * Init tx state for this peer. The rx state and reorder buffers will be - * initialized on the first recv so as to not allocate resources unless - * necessary. 
- */ - peer = rxr_ep_get_peer(rxr_ep, tx_entry->addr); - if (!peer->tx_init) { - peer->tx_credits = rxr_env.tx_max_credits; - peer->tx_init = 1; - } - - /* - * Divy up available credits to outstanding transfers and request the - * minimum of that and the amount required to finish the current long - * message. - */ - pending = peer->tx_pending + 1; - tx_entry->credit_request = MIN(ofi_div_ceil(peer->tx_credits, pending), - ofi_div_ceil(tx_entry->total_len, - rxr_ep->max_data_payload_size)); - tx_entry->credit_request = MAX(tx_entry->credit_request, - rxr_env.tx_min_credits); - if (peer->tx_credits >= tx_entry->credit_request) - peer->tx_credits -= tx_entry->credit_request; - - /* Queue this RTS for later if there are too many outstanding packets */ - if (!tx_entry->credit_request) - return -FI_EAGAIN; - - rxr_init_rts_pkt_entry(rxr_ep, tx_entry, pkt_entry); - - ret = rxr_ep_send_pkt(rxr_ep, pkt_entry, tx_entry->addr); - if (OFI_UNLIKELY(ret)) { - rxr_release_tx_pkt_entry(rxr_ep, pkt_entry); - return ret; - } - - if (tx_entry->cq_entry.flags & FI_READ) { - tx_entry->bytes_sent = 0; - assert(tx_entry->state == RXR_TX_RTS || - tx_entry->state == RXR_TX_QUEUED_RTS); - tx_entry->state = RXR_TX_WAIT_READ_FINISH; - return 0; - } - - data_sent = rxr_get_rts_data_size(rxr_ep, rxr_get_rts_hdr(pkt_entry->pkt)); - - tx_entry->bytes_sent += data_sent; - - if (!(efa_mr_cache_enable && tx_entry->total_len > data_sent)) - return ret; - - /* Set the iov index and iov offset from bytes sent */ - offset = data_sent; - for (i = 0; i < tx_entry->iov_count; i++) { - if (offset >= tx_entry->iov[i].iov_len) { - offset -= tx_entry->iov[i].iov_len; - } else { - tx_entry->iov_index = i; - tx_entry->iov_offset = offset; - break; - } - } - - if (rxr_ep_mr_local(rxr_ep)) - rxr_inline_mr_reg(rxr_ep_domain(rxr_ep), tx_entry, i); - - return 0; -} - - - -/* Generic send */ -ssize_t rxr_tx(struct fid_ep *ep, const struct iovec *iov, size_t iov_count, - const struct fi_rma_iov *rma_iov, size_t rma_iov_count, - fi_addr_t addr, uint64_t tag, uint64_t data, void *context, - uint32_t op, uint64_t flags) -{ - struct rxr_ep *rxr_ep; - ssize_t ret; - struct rxr_tx_entry *tx_entry; - struct rxr_peer *peer; - - FI_DBG(&rxr_prov, FI_LOG_EP_DATA, - "%s: iov_len: %lu tag: %lx op: %x flags: %lx\n", - __func__, ofi_total_iov_len(iov, iov_count), tag, op, flags); - - rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); - - assert(iov_count <= rxr_ep->tx_iov_limit); - - rxr_perfset_start(rxr_ep, perf_rxr_tx); - - fastlock_acquire(&rxr_ep->util_ep.lock); - - if (OFI_UNLIKELY(is_tx_res_full(rxr_ep))) { - ret = -FI_EAGAIN; - goto out; - } - - tx_entry = rxr_ep_tx_entry_init(rxr_ep, iov, iov_count, - rma_iov, rma_iov_count, - addr, tag, data, context, - op, flags); - - if (OFI_UNLIKELY(!tx_entry)) { - ret = -FI_EAGAIN; - rxr_ep_progress_internal(rxr_ep); - goto out; - } - - peer = rxr_ep_get_peer(rxr_ep, addr); - tx_entry->msg_id = (peer->next_msg_id != ~0) ? - peer->next_msg_id++ : ++peer->next_msg_id; - - if (op == ofi_op_read_req) { - int ignore = ~0; - struct rxr_rx_entry *rx_entry = NULL; - int credits = 0; - int window = 0; - /* this rx_entry works same as a receiving rx_entry thus - * we use ofi_op_msg for its op. - * it does not write a rx completion. 
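The credit splitting described by the "Divy up available credits" comment in the removed rxr_ep_post_rts() above (the same logic reappears in rxr_ep_set_tx_credit_request() later in this diff) reduces to two ceiling divisions; a small sketch with illustrative values (div_ceil mirrors ofi_div_ceil from the OFI utility headers):

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define div_ceil(a, b) (((a) + (b) - 1) / (b))

int main(void)
{
        /* Illustrative values only. */
        uint64_t tx_credits = 16;       /* credits left for this peer */
        uint64_t tx_pending = 3;        /* transfers already outstanding */
        uint64_t total_len = 40000;     /* bytes left in this message */
        uint64_t max_payload = 8192;    /* per-packet payload */
        uint64_t tx_min_credits = 4;

        uint64_t pending = tx_pending + 1;                      /* include this transfer */
        uint64_t fair_share = div_ceil(tx_credits, pending);    /* 4 */
        uint64_t needed = div_ceil(total_len, max_payload);     /* 5 */

        uint64_t credit_request = MIN(fair_share, needed);      /* 4 */
        credit_request = MAX(credit_request, tx_min_credits);   /* 4 */

        printf("credit_request=%llu\n", (unsigned long long)credit_request);
        return 0;
}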
- */ - rx_entry = rxr_ep_get_rx_entry(rxr_ep, iov, iov_count, tag, - ignore, context, - addr, ofi_op_msg, 0); - if (!rx_entry) { - rxr_release_tx_entry(rxr_ep, tx_entry); - FI_WARN(&rxr_prov, FI_LOG_CQ, - "RX entries exhausted.\n"); - rxr_eq_write_error(rxr_ep, FI_ENOBUFS, -FI_ENOBUFS); - ret = -FI_ENOBUFS; - goto out; - } - - /* - * this rx_entry does not know its tx_id, because remote - * tx_entry has not been created yet. - * set tx_id to -1, and the correct one will be filled in - * rxr_cq_handle_readrsp() - */ - assert(rx_entry); - rx_entry->tx_id = -1; - rx_entry->cq_entry.flags |= FI_READ; - rx_entry->total_len = rx_entry->cq_entry.len; - - /* - * there will not be a CTS for fi_read, we calculate CTS - * window here, and send it via RTS. - * meanwhile set rx_entry->state to RXR_RX_RECV so that - * this rx_entry is ready to receive - */ - - /* If there is no available buffer, we do not proceed. - * It is important to decrease peer->next_msg_id by 1 - * in this case because this message was not sent. - */ - if (rxr_ep->available_data_bufs==0) { - rxr_release_tx_entry(rxr_ep, tx_entry); - rxr_release_rx_entry(rxr_ep, rx_entry); - peer->next_msg_id--; - ret = -FI_EAGAIN; - rxr_ep_progress_internal(rxr_ep); - goto out; - } - - rxr_ep_calc_cts_window_credits(rxr_ep, peer, - tx_entry->total_len, - tx_entry->credit_request, - &window, - &credits); - - rx_entry->window = window; - rxr_ep->available_data_bufs -= credits; - - rx_entry->state = RXR_RX_RECV; - /* rma_loc_tx_id is used in rxr_cq_handle_rx_completion() - * to locate the tx_entry for tx completion. - */ - rx_entry->rma_loc_tx_id = tx_entry->tx_id; -#if ENABLE_DEBUG - dlist_insert_tail(&rx_entry->rx_pending_entry, - &rxr_ep->rx_pending_list); - rxr_ep->rx_pending++; -#endif - /* - * this tx_entry does not need a rx_id, because it does not - * send any data. 
- * the rma_loc_rx_id and rma_window will be sent to remote EP - * via RTS - */ - tx_entry->rma_loc_rx_id = rx_entry->rx_id; - tx_entry->rma_window = rx_entry->window; - } - - ret = rxr_ep_post_rts(rxr_ep, tx_entry); - - if (OFI_UNLIKELY(ret)) { - if (ret == -FI_EAGAIN) { - tx_entry->state = RXR_TX_QUEUED_RTS; - dlist_insert_tail(&tx_entry->queued_entry, - &rxr_ep->tx_entry_queued_list); - ret = 0; - } else { - peer = rxr_ep_get_peer(rxr_ep, addr); - peer->next_msg_id--; - } - } - -out: - fastlock_release(&rxr_ep->util_ep.lock); - rxr_perfset_end(rxr_ep, perf_rxr_tx); - return ret; -} - -static ssize_t rxr_ep_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, - uint64_t flags) -{ - return rxr_tx(ep, msg->msg_iov, msg->iov_count, NULL, 0, - msg->addr, 0, msg->data, msg->context, - ofi_op_msg, flags); -} - -static ssize_t rxr_ep_sendv(struct fid_ep *ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t dest_addr, - void *context) -{ - struct fi_msg msg; - - memset(&msg, 0, sizeof(msg)); - msg.msg_iov = iov; - msg.desc = desc; - msg.iov_count = count; - msg.addr = dest_addr; - msg.context = context; - - return rxr_ep_sendmsg(ep, &msg, 0); -} - -static ssize_t rxr_ep_send(struct fid_ep *ep, const void *buf, size_t len, - void *desc, fi_addr_t dest_addr, void *context) -{ - struct iovec iov; - - iov.iov_base = (void *)buf; - iov.iov_len = len; - return rxr_ep_sendv(ep, &iov, desc, 1, dest_addr, context); -} - -static ssize_t rxr_ep_senddata(struct fid_ep *ep, const void *buf, size_t len, - void *desc, uint64_t data, fi_addr_t dest_addr, - void *context) -{ - struct iovec iov; - - iov.iov_base = (void *)buf; - iov.iov_len = len; - - return rxr_tx(ep, &iov, 1, NULL, 0, dest_addr, 0, data, context, - ofi_op_msg, FI_REMOTE_CQ_DATA); -} - -static ssize_t rxr_ep_inject(struct fid_ep *ep, const void *buf, size_t len, - fi_addr_t dest_addr) -{ -#if ENABLE_DEBUG - struct rxr_ep *rxr_ep; -#endif - struct iovec iov; - - iov.iov_base = (void *)buf; - iov.iov_len = len; - -#if ENABLE_DEBUG - rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); - assert(len <= rxr_ep->core_inject_size - RXR_CTRL_HDR_SIZE_NO_CQ); -#endif - - return rxr_tx(ep, &iov, 1, NULL, 0, dest_addr, 0, 0, NULL, ofi_op_msg, - RXR_NO_COMPLETION | FI_INJECT); -} - -static ssize_t rxr_ep_injectdata(struct fid_ep *ep, const void *buf, - size_t len, uint64_t data, - fi_addr_t dest_addr) -{ -#if ENABLE_DEBUG - struct rxr_ep *rxr_ep; -#endif - struct iovec iov; - - iov.iov_base = (void *)buf; - iov.iov_len = len; - -#if ENABLE_DEBUG - rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); - /* - * We advertise the largest possible inject size with no cq data or - * source address. This means that we may end up not using the core - * providers inject for this send. 
- */ - assert(len <= rxr_ep->core_inject_size - RXR_CTRL_HDR_SIZE_NO_CQ); -#endif - - return rxr_tx(ep, &iov, 1, NULL, 0, dest_addr, 0, data, NULL, - ofi_op_msg, RXR_NO_COMPLETION | FI_REMOTE_CQ_DATA | FI_INJECT); -} - -static struct fi_ops_msg rxr_ops_msg = { - .size = sizeof(struct fi_ops_msg), - .recv = rxr_ep_recv, - .recvv = rxr_ep_recvv, - .recvmsg = rxr_ep_recvmsg, - .send = rxr_ep_send, - .sendv = rxr_ep_sendv, - .sendmsg = rxr_ep_sendmsg, - .inject = rxr_ep_inject, - .senddata = rxr_ep_senddata, - .injectdata = rxr_ep_injectdata, -}; - -ssize_t rxr_ep_trecv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, - fi_addr_t src_addr, uint64_t tag, uint64_t ignore, - void *context) -{ - struct iovec msg_iov; - - msg_iov.iov_base = (void *)buf; - msg_iov.iov_len = len; - - return rxr_recv(ep_fid, &msg_iov, 1, src_addr, tag, ignore, - context, ofi_op_tagged, 0); -} - -ssize_t rxr_ep_trecvv(struct fid_ep *ep_fid, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context) -{ - return rxr_recv(ep_fid, iov, count, src_addr, tag, ignore, - context, ofi_op_tagged, 0); -} - -ssize_t rxr_ep_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg, - uint64_t flags) -{ - ssize_t ret; - - if (flags & FI_PEEK) { - ret = rxr_ep_peek_trecv(ep_fid, msg, flags); - goto out; - } else if (flags & FI_CLAIM) { - ret = rxr_ep_claim_trecv(ep_fid, msg, flags); - goto out; - } - - ret = rxr_recv(ep_fid, msg->msg_iov, msg->iov_count, msg->addr, - msg->tag, msg->ignore, msg->context, - ofi_op_tagged, flags); - -out: - return ret; -} - -ssize_t rxr_ep_tsendmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg, - uint64_t flags) -{ - return rxr_tx(ep_fid, msg->msg_iov, msg->iov_count, NULL, 0, - msg->addr, msg->tag, msg->data, msg->context, - ofi_op_tagged, flags); -} - -ssize_t rxr_ep_tsendv(struct fid_ep *ep_fid, const struct iovec *iov, - void **desc, size_t count, fi_addr_t dest_addr, - uint64_t tag, void *context) -{ - struct fi_msg_tagged msg; - - memset(&msg, 0, sizeof(msg)); - msg.msg_iov = iov; - msg.desc = desc; - msg.iov_count = count; - msg.addr = dest_addr; - msg.context = context; - msg.tag = tag; - - return rxr_ep_tsendmsg(ep_fid, &msg, 0); -} - -ssize_t rxr_ep_tsend(struct fid_ep *ep_fid, const void *buf, size_t len, - void *desc, fi_addr_t dest_addr, uint64_t tag, - void *context) -{ - struct iovec msg_iov; - - msg_iov.iov_base = (void *)buf; - msg_iov.iov_len = len; - - return rxr_ep_tsendv(ep_fid, &msg_iov, desc, 1, dest_addr, tag, - context); -} - -ssize_t rxr_ep_tinject(struct fid_ep *ep_fid, const void *buf, size_t len, - fi_addr_t dest_addr, uint64_t tag) -{ -#if ENABLE_DEBUG - struct rxr_ep *rxr_ep; -#endif - struct iovec iov; - - iov.iov_base = (void *)buf; - iov.iov_len = len; - -#if ENABLE_DEBUG - rxr_ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid); - assert(len <= rxr_ep->core_inject_size - RXR_CTRL_HDR_SIZE_NO_CQ); -#endif - - return rxr_tx(ep_fid, &iov, 1, NULL, 0, dest_addr, tag, 0, NULL, - ofi_op_tagged, RXR_NO_COMPLETION | FI_INJECT); -} - -ssize_t rxr_ep_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t len, - void *desc, uint64_t data, fi_addr_t dest_addr, - uint64_t tag, void *context) -{ - struct iovec iov; - - iov.iov_base = (void *)buf; - iov.iov_len = len; - - return rxr_tx(ep_fid, &iov, 1, NULL, 0, dest_addr, tag, data, context, - ofi_op_tagged, FI_REMOTE_CQ_DATA); + /* the return value of rxr_ep_tx_init_mr_desc() is not checked + * because the long message protocol would 
work with or without + * memory registration and descriptor. + */ + rxr_ep_tx_init_mr_desc(rxr_domain, tx_entry, index, FI_SEND); } -ssize_t rxr_ep_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t len, - uint64_t data, fi_addr_t dest_addr, uint64_t tag) +/* Generic send */ +int rxr_ep_set_tx_credit_request(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry) { -#if ENABLE_DEBUG - struct rxr_ep *rxr_ep; -#endif - struct iovec iov; + struct rxr_peer *peer; + int pending; - iov.iov_base = (void *)buf; - iov.iov_len = len; + peer = rxr_ep_get_peer(rxr_ep, tx_entry->addr); -#if ENABLE_DEBUG - rxr_ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid); /* - * We advertise the largest possible inject size with no cq data or - * source address. This means that we may end up not using the core - * providers inject for this send. + * Init tx state for this peer. The rx state and reorder buffers will be + * initialized on the first recv so as to not allocate resources unless + * necessary. */ - assert(len <= rxr_ep->core_inject_size - RXR_CTRL_HDR_SIZE_NO_CQ); -#endif + if (!peer->tx_init) + rxr_ep_peer_init_tx(peer); - return rxr_tx(ep_fid, &iov, 1, NULL, 0, dest_addr, tag, data, NULL, - ofi_op_tagged, RXR_NO_COMPLETION | FI_REMOTE_CQ_DATA | FI_INJECT); -} + /* + * Divy up available credits to outstanding transfers and request the + * minimum of that and the amount required to finish the current long + * message. + */ + pending = peer->tx_pending + 1; + tx_entry->credit_request = MIN(ofi_div_ceil(peer->tx_credits, pending), + ofi_div_ceil(tx_entry->total_len, + rxr_ep->max_data_payload_size)); + tx_entry->credit_request = MAX(tx_entry->credit_request, + rxr_env.tx_min_credits); + if (peer->tx_credits >= tx_entry->credit_request) + peer->tx_credits -= tx_entry->credit_request; -static struct fi_ops_tagged rxr_ops_tagged = { - .size = sizeof(struct fi_ops_tagged), - .recv = rxr_ep_trecv, - .recvv = rxr_ep_trecvv, - .recvmsg = rxr_ep_trecvmsg, - .send = rxr_ep_tsend, - .sendv = rxr_ep_tsendv, - .sendmsg = rxr_ep_tsendmsg, - .inject = rxr_ep_tinject, - .senddata = rxr_ep_tsenddata, - .injectdata = rxr_ep_tinjectdata, -}; + /* Queue this REQ for later if there are too many outstanding packets */ + if (!tx_entry->credit_request) + return -FI_EAGAIN; + + return 0; +} static void rxr_ep_free_res(struct rxr_ep *rxr_ep) { + size_t i = 0; struct rxr_peer *peer; - struct dlist_entry *tmp; + struct efa_av *av; #if ENABLE_DEBUG + struct dlist_entry *tmp; struct dlist_entry *entry; struct rxr_rx_entry *rx_entry; struct rxr_tx_entry *tx_entry; @@ -2012,39 +636,38 @@ static void rxr_ep_free_res(struct rxr_ep *rxr_ep) #endif if (rxr_need_sas_ordering(rxr_ep)) { - dlist_foreach_container_safe(&rxr_ep->peer_list, - struct rxr_peer, - peer, entry, tmp) { - ofi_recvwin_free(peer->robuf); + av = container_of(rxr_ep->util_ep.av, struct efa_av, util_av); + for (i = 0; i < av->count; ++i) { + peer = rxr_ep_get_peer(rxr_ep, i); + if (peer->rx_init) + efa_free_robuf(peer); } - - if (rxr_ep->robuf_fs) - rxr_robuf_fs_free(rxr_ep->robuf_fs); + if (rxr_ep->robuf_pool) + ofi_bufpool_destroy(rxr_ep->robuf_pool); } #if ENABLE_DEBUG - dlist_foreach_container_safe(&rxr_ep->peer_list, - struct rxr_peer, - peer, entry, tmp) { + av = container_of(rxr_ep->util_ep.av, struct efa_av, util_av); + for (i = 0; i < av->count; ++i) { + peer = rxr_ep_get_peer(rxr_ep, i); /* * TODO: Add support for wait/signal until all pending messages * have been sent/received so the core does not attempt to * complete a data operation or an 
internal RxR transfer after * the EP is shutdown. */ - if (peer->state == RXR_PEER_CONNREQ) - FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, - "Closing EP with unacked CONNREQs in flight\n"); + if ((peer->flags & RXR_PEER_REQ_SENT) && !(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)) + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "Closing EP with unacked CONNREQs in flight\n"); } dlist_foreach(&rxr_ep->rx_unexp_list, entry) { rx_entry = container_of(entry, struct rxr_rx_entry, entry); - rxr_release_rx_pkt_entry(rxr_ep, rx_entry->unexp_rts_pkt); + rxr_pkt_entry_release_rx(rxr_ep, rx_entry->unexp_pkt); } dlist_foreach(&rxr_ep->rx_unexp_tagged_list, entry) { rx_entry = container_of(entry, struct rxr_rx_entry, entry); - rxr_release_rx_pkt_entry(rxr_ep, rx_entry->unexp_rts_pkt); + rxr_pkt_entry_release_rx(rxr_ep, rx_entry->unexp_pkt); } dlist_foreach(&rxr_ep->rx_entry_queued_list, entry) { @@ -2053,7 +676,7 @@ static void rxr_ep_free_res(struct rxr_ep *rxr_ep) dlist_foreach_container_safe(&rx_entry->queued_pkts, struct rxr_pkt_entry, pkt, entry, tmp) - rxr_release_tx_pkt_entry(rxr_ep, pkt); + rxr_pkt_entry_release_tx(rxr_ep, pkt); } dlist_foreach(&rxr_ep->tx_entry_queued_list, entry) { @@ -2062,23 +685,29 @@ static void rxr_ep_free_res(struct rxr_ep *rxr_ep) dlist_foreach_container_safe(&tx_entry->queued_pkts, struct rxr_pkt_entry, pkt, entry, tmp) - rxr_release_tx_pkt_entry(rxr_ep, pkt); + rxr_pkt_entry_release_tx(rxr_ep, pkt); } - dlist_foreach_safe(&rxr_ep->rx_pkt_list, entry, tmp) { - pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry); - rxr_release_rx_pkt_entry(rxr_ep, pkt); + if (!rxr_ep->use_zcpy_rx) { + /* + * The provider does not own these entries, and there's no need + * to deep-free them even in a debug build. + */ + dlist_foreach_safe(&rxr_ep->rx_pkt_list, entry, tmp) { + pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry); + rxr_pkt_entry_release_rx(rxr_ep, pkt); + } + dlist_foreach_safe(&rxr_ep->rx_posted_buf_list, entry, tmp) { + pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry); + ofi_buf_free(pkt); + } } dlist_foreach_safe(&rxr_ep->tx_pkt_list, entry, tmp) { pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry); - rxr_release_tx_pkt_entry(rxr_ep, pkt); + rxr_pkt_entry_release_tx(rxr_ep, pkt); } - dlist_foreach_safe(&rxr_ep->rx_posted_buf_list, entry, tmp) { - pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry); - ofi_buf_free(pkt); - } dlist_foreach_safe(&rxr_ep->rx_entry_list, entry, tmp) { rx_entry = container_of(entry, struct rxr_rx_entry, rx_entry_entry); @@ -2089,6 +718,12 @@ static void rxr_ep_free_res(struct rxr_ep *rxr_ep) tx_entry_entry); rxr_release_tx_entry(rxr_ep, tx_entry); } + if (rxr_ep->use_shm) { + dlist_foreach_safe(&rxr_ep->rx_posted_buf_shm_list, entry, tmp) { + pkt = container_of(entry, struct rxr_pkt_entry, dbg_entry); + ofi_buf_free(pkt); + } + } #endif if (rxr_ep->rx_entry_pool) @@ -2097,20 +732,43 @@ static void rxr_ep_free_res(struct rxr_ep *rxr_ep) if (rxr_ep->tx_entry_pool) ofi_bufpool_destroy(rxr_ep->tx_entry_pool); + if (rxr_ep->map_entry_pool) + ofi_bufpool_destroy(rxr_ep->map_entry_pool); + + if (rxr_ep->read_entry_pool) + ofi_bufpool_destroy(rxr_ep->read_entry_pool); + if (rxr_ep->readrsp_tx_entry_pool) ofi_bufpool_destroy(rxr_ep->readrsp_tx_entry_pool); + if (rxr_ep->rx_readcopy_pkt_pool) { + FI_INFO(&rxr_prov, FI_LOG_EP_CTRL, "current usage of read copy packet pool is %d\n", + rxr_ep->rx_readcopy_pkt_pool_used); + FI_INFO(&rxr_prov, FI_LOG_EP_CTRL, "maximum usage of read copy packet pool is %d\n", + 
rxr_ep->rx_readcopy_pkt_pool_max_used); + assert(!rxr_ep->rx_readcopy_pkt_pool_used); + ofi_bufpool_destroy(rxr_ep->rx_readcopy_pkt_pool); + } + if (rxr_ep->rx_ooo_pkt_pool) ofi_bufpool_destroy(rxr_ep->rx_ooo_pkt_pool); if (rxr_ep->rx_unexp_pkt_pool) ofi_bufpool_destroy(rxr_ep->rx_unexp_pkt_pool); - if (rxr_ep->rx_pkt_pool) - ofi_bufpool_destroy(rxr_ep->rx_pkt_pool); + if (rxr_ep->rx_pkt_efa_pool) + ofi_bufpool_destroy(rxr_ep->rx_pkt_efa_pool); + + if (rxr_ep->tx_pkt_efa_pool) + ofi_bufpool_destroy(rxr_ep->tx_pkt_efa_pool); + + if (rxr_ep->use_shm) { + if (rxr_ep->rx_pkt_shm_pool) + ofi_bufpool_destroy(rxr_ep->rx_pkt_shm_pool); - if (rxr_ep->tx_pkt_pool) - ofi_bufpool_destroy(rxr_ep->tx_pkt_pool); + if (rxr_ep->tx_pkt_shm_pool) + ofi_bufpool_destroy(rxr_ep->tx_pkt_shm_pool); + } } static int rxr_ep_close(struct fid *fid) @@ -2132,6 +790,22 @@ static int rxr_ep_close(struct fid *fid) retv = ret; } + /* Close shm provider's endpoint and cq */ + if (rxr_ep->use_shm) { + ret = fi_close(&rxr_ep->shm_ep->fid); + if (ret) { + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "Unable to close shm EP\n"); + retv = ret; + } + + ret = fi_close(&rxr_ep->shm_cq->fid); + if (ret) { + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "Unable to close shm CQ\n"); + retv = ret; + } + } + + ret = ofi_endpoint_close(&rxr_ep->util_ep); if (ret) { FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "Unable to close util EP\n"); @@ -2148,34 +822,71 @@ static int rxr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) struct rxr_ep *rxr_ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid); struct util_cq *cq; - struct rxr_av *av; + struct efa_av *av; struct util_cntr *cntr; struct util_eq *eq; + struct dlist_entry *ep_list_first_entry; + struct util_ep *util_ep; + struct rxr_ep *rxr_first_ep; + struct rxr_peer *first_ep_peer, *peer; int ret = 0; + size_t i; switch (bfid->fclass) { case FI_CLASS_AV: - av = container_of(bfid, struct rxr_av, util_av.av_fid.fid); + av = container_of(bfid, struct efa_av, util_av.av_fid.fid); /* Bind util provider endpoint and av */ ret = ofi_ep_bind_av(&rxr_ep->util_ep, &av->util_av); if (ret) return ret; - /* Bind core provider endpoint & av */ - ret = fi_ep_bind(rxr_ep->rdm_ep, &av->rdm_av->fid, flags); + ret = fi_ep_bind(rxr_ep->rdm_ep, &av->util_av.av_fid.fid, flags); if (ret) return ret; - rxr_ep->peer = calloc(av->util_av.count, + rxr_ep->peer = calloc(av->count, sizeof(struct rxr_peer)); if (!rxr_ep->peer) return -FI_ENOMEM; - rxr_ep->robuf_fs = rxr_robuf_fs_create(rxr_ep->rx_size, - NULL, NULL); - if (!rxr_ep->robuf_fs) - return -FI_ENOMEM; + if (rxr_need_sas_ordering(rxr_ep)) { + ret = ofi_bufpool_create(&rxr_ep->robuf_pool, + sizeof(struct rxr_robuf), 16, + 0, 0, 0); + if (ret) + return ret; + } + + /* Bind shm provider endpoint & shm av */ + if (rxr_ep->use_shm) { + ret = fi_ep_bind(rxr_ep->shm_ep, &av->shm_rdm_av->fid, flags); + if (ret) + return ret; + /* + * We always update the new added EP's local information with the first + * bound EP. The if (ep_list_first_entry->next) check here is to skip the + * update for the first bound EP. + */ + ep_list_first_entry = av->util_av.ep_list.next; + if (ep_list_first_entry->next) { + util_ep = container_of(ep_list_first_entry, struct util_ep, av_entry); + rxr_first_ep = container_of(util_ep, struct rxr_ep, util_ep); + + /* + * Copy the entire peer array, because we may not be able to make the + * assumption that insertions are always indexed in order in the future. 
+ */ + for (i = 0; i < av->count; i++) { + first_ep_peer = rxr_ep_get_peer(rxr_first_ep, i); + if (first_ep_peer->is_local) { + peer = rxr_ep_get_peer(rxr_ep, i); + peer->shm_fiaddr = first_ep_peer->shm_fiaddr; + peer->is_local = 1; + } + } + } + } break; case FI_CLASS_CQ: cq = container_of(bfid, struct util_cq, cq_fid.fid); @@ -2206,31 +917,53 @@ static int rxr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) return ret; } +static +void rxr_ep_set_features(struct rxr_ep *ep) +{ + memset(ep->features, 0, sizeof(ep->features)); + + /* RDMA read is an extra feature defined in protocol version 4 (the base version) */ + if (efa_ep_support_rdma_read(ep->rdm_ep)) + ep->features[0] |= RXR_REQ_FEATURE_RDMA_READ; + + ep->features[0] |= RXR_REQ_FEATURE_DELIVERY_COMPLETE; +} + static int rxr_ep_ctrl(struct fid *fid, int command, void *arg) { ssize_t ret; size_t i; struct rxr_ep *ep; uint64_t flags = FI_MORE; - size_t rx_size; + size_t rx_size, shm_rx_size; + char shm_ep_name[NAME_MAX]; switch (command) { case FI_ENABLE: - /* Enable core provider endpoint & post recv buff */ + /* Enable core endpoints & post recv buff */ ep = container_of(fid, struct rxr_ep, util_ep.ep_fid.fid); - rx_size = rxr_get_rx_pool_chunk_cnt(ep); - + /* + * If the endpoint is configured for zero-copy receives, the + * provider will use the application's undirected receives for + * its internal control packets as well. The onus will be on the + * application to ensure the receive queue is hydrated to avoid + * RNRs. + */ + rx_size = ep->use_zcpy_rx ? rxr_env.zcpy_rx_seed : rxr_get_rx_pool_chunk_cnt(ep); ret = fi_enable(ep->rdm_ep); if (ret) return ret; fastlock_acquire(&ep->util_ep.lock); + + rxr_ep_set_features(ep); + for (i = 0; i < rx_size; i++) { if (i == rx_size - 1) flags = 0; - ret = rxr_ep_post_buf(ep, flags); + ret = rxr_ep_post_buf(ep, NULL, flags, EFA_EP); if (ret) goto out; @@ -2245,6 +978,36 @@ static int rxr_ep_ctrl(struct fid *fid, int command, void *arg) assert(ret != -FI_ETOOSMALL); FI_DBG(&rxr_prov, FI_LOG_EP_CTRL, "core_addrlen = %ld\n", ep->core_addrlen); + + /* Enable shm provider endpoint & post recv buff. + * Once core ep enabled, 18 bytes efa_addr (16 bytes raw + 2 bytes qpn) is set. + * We convert the address to 'gid_qpn' format, and set it as shm ep name, so + * that shm ep can create shared memory region with it when enabling. + * In this way, each peer is able to open and map to other local peers' + * shared memory region. 
+ */ + if (ep->use_shm) { + ret = rxr_ep_efa_addr_to_str(ep->core_addr, shm_ep_name); + if (ret < 0) + goto out; + + fi_setname(&ep->shm_ep->fid, shm_ep_name, sizeof(shm_ep_name)); + shm_rx_size = shm_info->rx_attr->size; + ret = fi_enable(ep->shm_ep); + if (ret) + return ret; + /* Pre-post buffer to receive from shm provider */ + for (i = 0; i < shm_rx_size; i++) { + if (i == shm_rx_size - 1) + flags = 0; + + ret = rxr_ep_post_buf(ep, NULL, flags, SHM_EP); + + if (ret) + goto out; + } + } + out: fastlock_release(&ep->util_ep.lock); break; @@ -2287,29 +1050,51 @@ static ssize_t rxr_ep_cancel_recv(struct rxr_ep *ep, entry = dlist_remove_first_match(recv_list, &rxr_ep_cancel_match_recv, context); - if (entry) { - rx_entry = container_of(entry, struct rxr_rx_entry, entry); - rx_entry->rxr_flags |= RXR_RECV_CANCEL; - if (rx_entry->fi_flags & FI_MULTI_RECV) - rxr_cq_handle_multi_recv_completion(ep, rx_entry); + if (!entry) { fastlock_release(&ep->util_ep.lock); - memset(&err_entry, 0, sizeof(err_entry)); - err_entry.op_context = rx_entry->cq_entry.op_context; - err_entry.flags |= rx_entry->cq_entry.flags; - err_entry.tag = rx_entry->tag; - err_entry.err = FI_ECANCELED; - err_entry.prov_errno = -FI_ECANCELED; - - domain = rxr_ep_domain(ep); - api_version = - domain->util_domain.fabric->fabric_fid.api_version; - if (FI_VERSION_GE(api_version, FI_VERSION(1, 5))) - err_entry.err_data_size = 0; - return ofi_cq_write_error(ep->util_ep.rx_cq, &err_entry); + return 0; } + rx_entry = container_of(entry, struct rxr_rx_entry, entry); + rx_entry->rxr_flags |= RXR_RECV_CANCEL; + if (rx_entry->fi_flags & FI_MULTI_RECV && + rx_entry->rxr_flags & RXR_MULTI_RECV_POSTED) { + if (dlist_empty(&rx_entry->multi_recv_consumers)) { + /* + * No pending messages for the buffer, + * release it back to the app. + */ + rx_entry->cq_entry.flags |= FI_MULTI_RECV; + } else { + rx_entry = container_of(rx_entry->multi_recv_consumers.next, + struct rxr_rx_entry, + multi_recv_entry); + rxr_msg_multi_recv_handle_completion(ep, rx_entry); + } + } else if (rx_entry->fi_flags & FI_MULTI_RECV && + rx_entry->rxr_flags & RXR_MULTI_RECV_CONSUMER) { + rxr_msg_multi_recv_handle_completion(ep, rx_entry); + } fastlock_release(&ep->util_ep.lock); - return 0; + memset(&err_entry, 0, sizeof(err_entry)); + err_entry.op_context = rx_entry->cq_entry.op_context; + err_entry.flags |= rx_entry->cq_entry.flags; + err_entry.tag = rx_entry->tag; + err_entry.err = FI_ECANCELED; + err_entry.prov_errno = -FI_ECANCELED; + + domain = rxr_ep_domain(ep); + api_version = + domain->util_domain.fabric->fabric_fid.api_version; + if (FI_VERSION_GE(api_version, FI_VERSION(1, 5))) + err_entry.err_data_size = 0; + /* + * Other states are currently receiving data. Subsequent messages will + * be sunk (via RXR_RECV_CANCEL flag) and the completion suppressed. 
+ */ + if (rx_entry->state & (RXR_RX_INIT | RXR_RX_UNEXP | RXR_RX_MATCHED)) + rxr_release_rx_entry(ep, rx_entry); + return ofi_cq_write_error(ep->util_ep.rx_cq, &err_entry); } static ssize_t rxr_ep_cancel(fid_t fid_ep, void *context) @@ -2376,8 +1161,8 @@ static int rxr_buf_region_alloc_hndlr(struct ofi_bufpool_region *region) struct fid_mr *mr; struct rxr_domain *domain = region->pool->attr.context; - ret = fi_mr_reg(domain->rdm_domain, region->mem_region, - region->pool->region_size, + ret = fi_mr_reg(domain->rdm_domain, region->alloc_region, + region->pool->alloc_size, FI_SEND | FI_RECV, 0, 0, 0, &mr, NULL); region->context = mr; @@ -2395,8 +1180,18 @@ static void rxr_buf_region_free_hndlr(struct ofi_bufpool_region *region) fi_strerror(-ret)); } +/* + * rxr_create_pkt_pool create a packet pool. The size of pool is fixed + * and the memory is registered with device. + * + * Important arguments: + * size: packet entry size + * flags: caller can specify OFI_BUFPOOL_HUGEPAGES so the pool + * will be backed by huge pages. + */ static int rxr_create_pkt_pool(struct rxr_ep *ep, size_t size, size_t chunk_count, + size_t flags, struct ofi_bufpool **buf_pool) { struct ofi_bufpool_attr attr = { @@ -2404,13 +1199,11 @@ static int rxr_create_pkt_pool(struct rxr_ep *ep, size_t size, .alignment = RXR_BUF_POOL_ALIGNMENT, .max_cnt = chunk_count, .chunk_cnt = chunk_count, - .alloc_fn = rxr_ep_mr_local(ep) ? - rxr_buf_region_alloc_hndlr : NULL, - .free_fn = rxr_ep_mr_local(ep) ? - rxr_buf_region_free_hndlr : NULL, + .alloc_fn = rxr_buf_region_alloc_hndlr, + .free_fn = rxr_buf_region_free_hndlr, .init_fn = NULL, .context = rxr_ep_domain(ep), - .flags = OFI_BUFPOOL_HUGEPAGES, + .flags = flags, }; return ofi_bufpool_create_attr(&attr, buf_pool); @@ -2428,14 +1221,16 @@ int rxr_ep_init(struct rxr_ep *ep) #endif ret = rxr_create_pkt_pool(ep, entry_sz, rxr_get_tx_pool_chunk_cnt(ep), - &ep->tx_pkt_pool); + OFI_BUFPOOL_HUGEPAGES, + &ep->tx_pkt_efa_pool); if (ret) - goto err_out; + goto err_free; ret = rxr_create_pkt_pool(ep, entry_sz, rxr_get_rx_pool_chunk_cnt(ep), - &ep->rx_pkt_pool); + OFI_BUFPOOL_HUGEPAGES, + &ep->rx_pkt_efa_pool); if (ret) - goto err_free_tx_pool; + goto err_free; if (rxr_env.rx_copy_unexp) { ret = ofi_bufpool_create(&ep->rx_unexp_pkt_pool, entry_sz, @@ -2443,7 +1238,7 @@ int rxr_ep_init(struct rxr_ep *ep) rxr_get_rx_pool_chunk_cnt(ep), 0); if (ret) - goto err_free_rx_pool; + goto err_free; } if (rxr_env.rx_copy_ooo) { @@ -2452,7 +1247,31 @@ int rxr_ep_init(struct rxr_ep *ep) rxr_env.recvwin_size, 0); if (ret) - goto err_free_rx_unexp_pool; + goto err_free; + } + + if ((rxr_env.rx_copy_unexp || rxr_env.rx_copy_ooo) && + (rxr_ep_domain(ep)->util_domain.mr_mode & FI_MR_HMEM)) { + /* this pool is only needed when application requested FI_HMEM + * capability + */ + ret = rxr_create_pkt_pool(ep, entry_sz, + rxr_env.readcopy_pool_size, + 0, &ep->rx_readcopy_pkt_pool); + + if (ret) + goto err_free; + + ret = ofi_bufpool_grow(ep->rx_readcopy_pkt_pool); + if (ret) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "cannot allocate and register memory for readcopy packet pool. 
error: %s\n", + strerror(-ret)); + goto err_free; + } + + ep->rx_readcopy_pkt_pool_used = 0; + ep->rx_readcopy_pkt_pool_max_used = 0; } ret = ofi_bufpool_create(&ep->tx_entry_pool, @@ -2460,7 +1279,15 @@ int rxr_ep_init(struct rxr_ep *ep) RXR_BUF_POOL_ALIGNMENT, ep->tx_size, ep->tx_size, 0); if (ret) - goto err_free_rx_ooo_pool; + goto err_free; + + ret = ofi_bufpool_create(&ep->read_entry_pool, + sizeof(struct rxr_read_entry), + RXR_BUF_POOL_ALIGNMENT, + ep->tx_size + RXR_MAX_RX_QUEUE_SIZE, + ep->tx_size + ep->rx_size, 0); + if (ret) + goto err_free; ret = ofi_bufpool_create(&ep->readrsp_tx_entry_pool, sizeof(struct rxr_tx_entry), @@ -2468,7 +1295,7 @@ int rxr_ep_init(struct rxr_ep *ep) RXR_MAX_RX_QUEUE_SIZE, ep->rx_size, 0); if (ret) - goto err_free_tx_entry_pool; + goto err_free; ret = ofi_bufpool_create(&ep->rx_entry_pool, sizeof(struct rxr_rx_entry), @@ -2476,7 +1303,45 @@ int rxr_ep_init(struct rxr_ep *ep) RXR_MAX_RX_QUEUE_SIZE, ep->rx_size, 0); if (ret) - goto err_free_readrsp_tx_entry_pool; + goto err_free; + + ret = ofi_bufpool_create(&ep->map_entry_pool, + sizeof(struct rxr_pkt_rx_map), + RXR_BUF_POOL_ALIGNMENT, + RXR_MAX_RX_QUEUE_SIZE, + ep->rx_size, 0); + + if (ret) + goto err_free; + + ret = ofi_bufpool_create(&ep->rx_atomrsp_pool, + ep->mtu_size, + RXR_BUF_POOL_ALIGNMENT, + RXR_MAX_RX_QUEUE_SIZE, + rxr_env.atomrsp_pool_size, 0); + if (ret) + goto err_free; + + /* create pkt pool for shm */ + if (ep->use_shm) { + ret = ofi_bufpool_create(&ep->tx_pkt_shm_pool, + entry_sz, + RXR_BUF_POOL_ALIGNMENT, + shm_info->tx_attr->size, + shm_info->tx_attr->size, 0); + if (ret) + goto err_free; + + ret = ofi_bufpool_create(&ep->rx_pkt_shm_pool, + entry_sz, + RXR_BUF_POOL_ALIGNMENT, + shm_info->rx_attr->size, + shm_info->rx_attr->size, 0); + if (ret) + goto err_free; + + dlist_init(&ep->rx_posted_buf_shm_list); + } /* Initialize entry list */ dlist_init(&ep->rx_list); @@ -2487,8 +1352,8 @@ int rxr_ep_init(struct rxr_ep *ep) dlist_init(&ep->rx_entry_queued_list); dlist_init(&ep->tx_entry_queued_list); dlist_init(&ep->tx_pending_list); + dlist_init(&ep->read_pending_list); dlist_init(&ep->peer_backoff_list); - dlist_init(&ep->peer_list); #if ENABLE_DEBUG dlist_init(&ep->rx_pending_list); dlist_init(&ep->rx_pkt_list); @@ -2496,27 +1361,47 @@ int rxr_ep_init(struct rxr_ep *ep) dlist_init(&ep->rx_entry_list); dlist_init(&ep->tx_entry_list); #endif - + /* Initialize pkt to rx map */ + ep->pkt_rx_map = NULL; return 0; -err_free_readrsp_tx_entry_pool: + +err_free: + if (ep->tx_pkt_shm_pool) + ofi_bufpool_destroy(ep->tx_pkt_shm_pool); + + if (ep->rx_atomrsp_pool) + ofi_bufpool_destroy(ep->rx_atomrsp_pool); + + if (ep->map_entry_pool) + ofi_bufpool_destroy(ep->map_entry_pool); + + if (ep->rx_entry_pool) + ofi_bufpool_destroy(ep->rx_entry_pool); + if (ep->readrsp_tx_entry_pool) ofi_bufpool_destroy(ep->readrsp_tx_entry_pool); -err_free_tx_entry_pool: + + if (ep->read_entry_pool) + ofi_bufpool_destroy(ep->read_entry_pool); + if (ep->tx_entry_pool) ofi_bufpool_destroy(ep->tx_entry_pool); -err_free_rx_ooo_pool: + + if (ep->rx_readcopy_pkt_pool) + ofi_bufpool_destroy(ep->rx_readcopy_pkt_pool); + if (rxr_env.rx_copy_ooo && ep->rx_ooo_pkt_pool) ofi_bufpool_destroy(ep->rx_ooo_pkt_pool); -err_free_rx_unexp_pool: + if (rxr_env.rx_copy_unexp && ep->rx_unexp_pkt_pool) ofi_bufpool_destroy(ep->rx_unexp_pkt_pool); -err_free_rx_pool: - if (ep->rx_pkt_pool) - ofi_bufpool_destroy(ep->rx_pkt_pool); -err_free_tx_pool: - if (ep->tx_pkt_pool) - ofi_bufpool_destroy(ep->tx_pkt_pool); -err_out: + + if (ep->rx_pkt_efa_pool) + 
ofi_bufpool_destroy(ep->rx_pkt_efa_pool); + + if (ep->tx_pkt_efa_pool) + ofi_bufpool_destroy(ep->tx_pkt_efa_pool); + return ret; } @@ -2554,12 +1439,23 @@ static inline int rxr_ep_bulk_post_recv(struct rxr_ep *ep) uint64_t flags = FI_MORE; int ret; - while (ep->rx_bufs_to_post) { - if (ep->rx_bufs_to_post == 1) + while (ep->rx_bufs_efa_to_post) { + if (ep->rx_bufs_efa_to_post == 1) flags = 0; - ret = rxr_ep_post_buf(ep, flags); + ret = rxr_ep_post_buf(ep, NULL, flags, EFA_EP); if (OFI_LIKELY(!ret)) - ep->rx_bufs_to_post--; + ep->rx_bufs_efa_to_post--; + else + return ret; + } + /* bulk post recv buf for shm provider */ + flags = FI_MORE; + while (ep->use_shm && ep->rx_bufs_shm_to_post) { + if (ep->rx_bufs_shm_to_post == 1) + flags = 0; + ret = rxr_ep_post_buf(ep, NULL, flags, SHM_EP); + if (OFI_LIKELY(!ret)) + ep->rx_bufs_shm_to_post--; else return ret; } @@ -2576,7 +1472,11 @@ static inline int rxr_ep_send_queued_pkts(struct rxr_ep *ep, dlist_foreach_container_safe(pkts, struct rxr_pkt_entry, pkt_entry, entry, tmp) { - ret = rxr_ep_send_pkt(ep, pkt_entry, pkt_entry->addr); + if (ep->use_shm && rxr_ep_get_peer(ep, pkt_entry->addr)->is_local) { + dlist_remove(&pkt_entry->entry); + continue; + } + ret = rxr_pkt_entry_send(ep, pkt_entry, pkt_entry->addr); if (ret) return ret; dlist_remove(&pkt_entry->entry); @@ -2589,7 +1489,7 @@ static inline void rxr_ep_check_available_data_bufs_timer(struct rxr_ep *ep) if (OFI_LIKELY(ep->available_data_bufs != 0)) return; - if (fi_gettime_us() - ep->available_data_bufs_ts >= + if (ofi_gettime_us() - ep->available_data_bufs_ts >= RXR_AVAILABLE_DATA_BUFS_TIMEOUT) { ep->available_data_bufs = rxr_get_rx_pool_chunk_cnt(ep); ep->available_data_bufs_ts = 0; @@ -2608,55 +1508,67 @@ static inline void rxr_ep_check_peer_backoff_timer(struct rxr_ep *ep) dlist_foreach_container_safe(&ep->peer_backoff_list, struct rxr_peer, peer, rnr_entry, tmp) { - peer->rnr_state &= ~RXR_PEER_BACKED_OFF; - if (!rxr_peer_timeout_expired(ep, peer, fi_gettime_us())) + peer->flags &= ~RXR_PEER_BACKED_OFF; + if (!rxr_peer_timeout_expired(ep, peer, ofi_gettime_us())) continue; - peer->rnr_state = 0; + peer->flags &= ~RXR_PEER_IN_BACKOFF; dlist_remove(&peer->rnr_entry); } } -static void rxr_ep_progress_internal(struct rxr_ep *ep) +static inline void rxr_ep_poll_cq(struct rxr_ep *ep, + struct fid_cq *cq, + size_t cqe_to_process, + bool is_shm_cq) { - struct fi_cq_msg_entry cq_entry; - struct rxr_rx_entry *rx_entry; - struct rxr_tx_entry *tx_entry; - struct dlist_entry *tmp; + struct fi_cq_data_entry cq_entry; fi_addr_t src_addr; ssize_t ret; + struct efa_ep *efa_ep; + struct efa_av *efa_av; int i; - rxr_ep_check_available_data_bufs_timer(ep); + VALGRIND_MAKE_MEM_DEFINED(&cq_entry, sizeof(struct fi_cq_data_entry)); - VALGRIND_MAKE_MEM_DEFINED(&cq_entry, sizeof(struct fi_cq_msg_entry)); - - for (ret = 1, i = 0; ret > 0 && i < 100; i++) { - if (ep->core_caps & FI_SOURCE) { - ret = fi_cq_readfrom(ep->rdm_cq, &cq_entry, 1, &src_addr); - } else { - ret = fi_cq_read(ep->rdm_cq, &cq_entry, 1); - src_addr = FI_ADDR_NOTAVAIL; - } + efa_ep = container_of(ep->rdm_ep, struct efa_ep, util_ep.ep_fid); + efa_av = efa_ep->av; + for (i = 0; i < cqe_to_process; i++) { + ret = fi_cq_readfrom(cq, &cq_entry, 1, &src_addr); if (ret == -FI_EAGAIN) - break; + return; + if (OFI_UNLIKELY(ret < 0)) { if (rxr_cq_handle_cq_error(ep, ret)) assert(0 && "error writing error cq entry after reading from cq"); - rxr_ep_bulk_post_recv(ep); + if (!ep->use_zcpy_rx) + rxr_ep_bulk_post_recv(ep); return; } - if (cq_entry.flags & 
FI_SEND) { + if (OFI_UNLIKELY(ret == 0)) + return; + + if (is_shm_cq && src_addr != FI_ADDR_UNSPEC) { + /* convert SHM address to EFA address */ + assert(src_addr < EFA_SHM_MAX_AV_COUNT); + src_addr = efa_av->shm_rdm_addr_map[src_addr]; + } + + if (is_shm_cq && (cq_entry.flags & (FI_ATOMIC | FI_REMOTE_CQ_DATA))) { + rxr_cq_handle_shm_completion(ep, &cq_entry, src_addr); + } else if (cq_entry.flags & (FI_SEND | FI_READ | FI_WRITE)) { #if ENABLE_DEBUG - ep->send_comps++; + if (!is_shm_cq) + ep->send_comps++; #endif - rxr_cq_handle_pkt_send_completion(ep, &cq_entry); - } else if (cq_entry.flags & FI_RECV) { - rxr_cq_handle_pkt_recv_completion(ep, &cq_entry, src_addr); + rxr_pkt_handle_send_completion(ep, &cq_entry); + } else if (cq_entry.flags & (FI_RECV | FI_REMOTE_CQ_DATA)) { + rxr_pkt_handle_recv_completion(ep, &cq_entry, src_addr); #if ENABLE_DEBUG - ep->recv_comps++; + if (!is_shm_cq) + ep->recv_comps++; #endif } else { FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, @@ -2664,31 +1576,67 @@ static void rxr_ep_progress_internal(struct rxr_ep *ep) assert(0 && "Unhandled cq type"); } } +} + +void rxr_ep_progress_internal(struct rxr_ep *ep) +{ + struct ibv_send_wr *bad_wr; + struct efa_ep *efa_ep; + struct rxr_rx_entry *rx_entry; + struct rxr_tx_entry *tx_entry; + struct rxr_read_entry *read_entry; + struct rxr_peer *peer; + struct dlist_entry *tmp; + ssize_t ret; - ret = rxr_ep_bulk_post_recv(ep); + if (!ep->use_zcpy_rx) + rxr_ep_check_available_data_bufs_timer(ep); - if (OFI_UNLIKELY(ret)) { - if (rxr_cq_handle_cq_error(ep, ret)) - assert(0 && - "error writing error cq entry after failed post recv"); - return; + // Poll the EFA completion queue + rxr_ep_poll_cq(ep, ep->rdm_cq, rxr_env.efa_cq_read_size, 0); + + // Poll the SHM completion queue if enabled + if (ep->use_shm) + rxr_ep_poll_cq(ep, ep->shm_cq, rxr_env.shm_cq_read_size, 1); + + if (!ep->use_zcpy_rx) { + ret = rxr_ep_bulk_post_recv(ep); + + if (OFI_UNLIKELY(ret)) { + if (rxr_cq_handle_cq_error(ep, ret)) + assert(0 && + "error writing error cq entry after failed post recv"); + return; + } } rxr_ep_check_peer_backoff_timer(ep); /* - * Send any queued RTS/CTS packets. + * Send any queued ctrl packets. */ dlist_foreach_container_safe(&ep->rx_entry_queued_list, struct rxr_rx_entry, rx_entry, queued_entry, tmp) { - if (rx_entry->state == RXR_RX_QUEUED_CTS) - ret = rxr_cq_post_cts(ep, rx_entry, - rx_entry->total_len - - rx_entry->bytes_done); - else - ret = rxr_ep_send_queued_pkts(ep, - &rx_entry->queued_pkts); + peer = rxr_ep_get_peer(ep, rx_entry->addr); + + if (peer->flags & RXR_PEER_IN_BACKOFF) + continue; + + if (rx_entry->state == RXR_RX_QUEUED_CTRL) { + /* + * We should only have one packet pending at a time for + * rx_entry. Either the send failed due to RNR or the + * rx_entry is queued but not both. 
+ */ + assert(dlist_empty(&rx_entry->queued_pkts)); + ret = rxr_pkt_post_ctrl(ep, RXR_RX_ENTRY, rx_entry, + rx_entry->queued_ctrl.type, + rx_entry->queued_ctrl.inject); + } else { + ret = rxr_ep_send_queued_pkts(ep, &rx_entry->queued_pkts); + } + if (ret == -FI_EAGAIN) break; if (OFI_UNLIKELY(ret)) @@ -2701,31 +1649,38 @@ static void rxr_ep_progress_internal(struct rxr_ep *ep) dlist_foreach_container_safe(&ep->tx_entry_queued_list, struct rxr_tx_entry, tx_entry, queued_entry, tmp) { - if (tx_entry->state == RXR_TX_QUEUED_RTS) - ret = rxr_ep_post_rts(ep, tx_entry); - else if (tx_entry->state == RXR_TX_QUEUED_READRSP) - ret = rxr_ep_post_readrsp(ep, tx_entry); - else - ret = rxr_ep_send_queued_pkts(ep, - &tx_entry->queued_pkts); + peer = rxr_ep_get_peer(ep, tx_entry->addr); + + if (peer->flags & RXR_PEER_IN_BACKOFF) + continue; + /* + * It is possible to receive an RNR after we queue this + * tx_entry if we run out of resources in the medium message + * protocol. Ensure all queued packets are posted before + * continuing to post additional control messages. + */ + ret = rxr_ep_send_queued_pkts(ep, &tx_entry->queued_pkts); if (ret == -FI_EAGAIN) break; if (OFI_UNLIKELY(ret)) goto tx_err; + if (tx_entry->state == RXR_TX_QUEUED_CTRL) { + ret = rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry, + tx_entry->queued_ctrl.type, + tx_entry->queued_ctrl.inject); + if (ret == -FI_EAGAIN) + break; + if (OFI_UNLIKELY(ret)) + goto tx_err; + } + dlist_remove(&tx_entry->queued_entry); - if (tx_entry->state == RXR_TX_QUEUED_RTS || - tx_entry->state == RXR_TX_QUEUED_RTS_RNR) { - tx_entry->state = RXR_TX_RTS; - } else if (tx_entry->state == RXR_TX_QUEUED_READRSP) { - tx_entry->state = RXR_TX_SENT_READRSP; - if (tx_entry->bytes_sent < tx_entry->total_len) { - tx_entry->state = RXR_TX_SEND; - dlist_insert_tail(&tx_entry->entry, - &ep->tx_pending_list); - } + if (tx_entry->state == RXR_TX_QUEUED_REQ_RNR || + tx_entry->state == RXR_TX_QUEUED_CTRL) { + tx_entry->state = RXR_TX_REQ; } else if (tx_entry->state == RXR_TX_QUEUED_DATA_RNR) { tx_entry->state = RXR_TX_SEND; dlist_insert_tail(&tx_entry->entry, @@ -2738,6 +1693,11 @@ static void rxr_ep_progress_internal(struct rxr_ep *ep) */ dlist_foreach_container(&ep->tx_pending_list, struct rxr_tx_entry, tx_entry, entry) { + peer = rxr_ep_get_peer(ep, tx_entry->addr); + + if (peer->flags & RXR_PEER_IN_BACKOFF) + continue; + if (tx_entry->window > 0) tx_entry->send_flags |= FI_MORE; else @@ -2753,15 +1713,56 @@ static void rxr_ep_progress_internal(struct rxr_ep *ep) */ if (ep->tx_pending == ep->max_outstanding_tx) goto out; - ret = rxr_ep_post_data(ep, tx_entry); + + if (peer->flags & RXR_PEER_IN_BACKOFF) + break; + + ret = rxr_pkt_post_data(ep, tx_entry); if (OFI_UNLIKELY(ret)) { tx_entry->send_flags &= ~FI_MORE; + if (ret == -FI_EAGAIN) + goto out; goto tx_err; } } } + /* + * Send read requests until finish or error encoutered + */ + dlist_foreach_container_safe(&ep->read_pending_list, struct rxr_read_entry, + read_entry, pending_entry, tmp) { + peer = rxr_ep_get_peer(ep, read_entry->addr); + + if (peer->flags & RXR_PEER_IN_BACKOFF) + continue; + + /* + * The core's TX queue is full so we can't do any + * additional work. 
+ */ + if (ep->tx_pending == ep->max_outstanding_tx) + goto out; + + ret = rxr_read_post(ep, read_entry); + if (ret == -FI_EAGAIN) + break; + + if (OFI_UNLIKELY(ret)) + goto read_err; + + read_entry->state = RXR_RDMA_ENTRY_SUBMITTED; + dlist_remove(&read_entry->pending_entry); + } + out: + efa_ep = container_of(ep->rdm_ep, struct efa_ep, util_ep.ep_fid); + if (efa_ep->xmit_more_wr_tail != &efa_ep->xmit_more_wr_head) { + ret = efa_post_flush(efa_ep, &bad_wr); + if (OFI_UNLIKELY(ret)) + goto tx_err; + } + return; rx_err: if (rxr_cq_handle_rx_error(ep, rx_entry, ret)) @@ -2773,6 +1774,12 @@ static void rxr_ep_progress_internal(struct rxr_ep *ep) assert(0 && "error writing error cq entry when handling TX error"); return; + +read_err: + if (rxr_read_handle_error(ep, read_entry, ret)) + assert(0 && + "error writing err cq entry while handling RDMA error"); + return; } void rxr_ep_progress(struct util_ep *util_ep) @@ -2786,11 +1793,34 @@ void rxr_ep_progress(struct util_ep *util_ep) fastlock_release(&ep->util_ep.lock); } +static +bool rxr_ep_use_shm(struct fi_info *info) +{ + /* App provided hints supercede environmental variables. + * + * Using the shm provider comes with some overheads, particularly in the + * progress engine when polling an empty completion queue, so avoid + * initializing the provider if the app provides a hint that it does not + * require node-local communication. We can still loopback over the EFA + * device in cases where the app violates the hint and continues + * communicating with node-local peers. + */ + if (info + /* If the app requires explicitly remote communication */ + && (info->caps & FI_REMOTE_COMM) + /* but not local communication */ + && !(info->caps & FI_LOCAL_COMM)) + return 0; + + return rxr_env.enable_shm_transfer; +} + int rxr_endpoint(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context) { struct fi_info *rdm_info; struct rxr_domain *rxr_domain; + struct efa_domain *efa_domain; struct rxr_ep *rxr_ep; struct fi_cq_attr cq_attr; int ret, retv; @@ -2802,7 +1832,7 @@ int rxr_endpoint(struct fid_domain *domain, struct fi_info *info, rxr_domain = container_of(domain, struct rxr_domain, util_domain.domain_fid); memset(&cq_attr, 0, sizeof(cq_attr)); - cq_attr.format = FI_CQ_FORMAT_MSG; + cq_attr.format = FI_CQ_FORMAT_DATA; cq_attr.wait_obj = FI_WAIT_NONE; ret = ofi_endpoint_init(domain, &rxr_util_prov, info, &rxr_ep->util_ep, @@ -2821,7 +1851,20 @@ int rxr_endpoint(struct fid_domain *domain, struct fi_info *info, ret = fi_endpoint(rxr_domain->rdm_domain, rdm_info, &rxr_ep->rdm_ep, rxr_ep); if (ret) - goto err_close_ofi_ep; + goto err_free_rdm_info; + + efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain, + util_domain.domain_fid); + + rxr_ep->use_shm = rxr_ep_use_shm(info); + if (rxr_ep->use_shm) { + /* Open shm provider's endpoint */ + assert(!strcmp(shm_info->fabric_attr->name, "shm")); + ret = fi_endpoint(efa_domain->shm_domain, shm_info, + &rxr_ep->shm_ep, rxr_ep); + if (ret) + goto err_close_core_ep; + } rxr_ep->rx_size = info->rx_attr->size; rxr_ep->tx_size = info->tx_attr->size; @@ -2842,25 +1885,33 @@ int rxr_endpoint(struct fid_domain *domain, struct fi_info *info, rxr_ep->msg_order = info->rx_attr->msg_order; rxr_ep->core_msg_order = rdm_info->rx_attr->msg_order; rxr_ep->core_inject_size = rdm_info->tx_attr->inject_size; + rxr_ep->max_msg_size = info->ep_attr->max_msg_size; + rxr_ep->max_proto_hdr_size = rxr_pkt_max_header_size(); rxr_ep->mtu_size = rdm_info->ep_attr->max_msg_size; + fi_freeinfo(rdm_info); 
+ if (rxr_env.mtu_size > 0 && rxr_env.mtu_size < rxr_ep->mtu_size) rxr_ep->mtu_size = rxr_env.mtu_size; if (rxr_ep->mtu_size > RXR_MTU_MAX_LIMIT) rxr_ep->mtu_size = RXR_MTU_MAX_LIMIT; - rxr_ep->max_data_payload_size = rxr_ep->mtu_size - RXR_DATA_HDR_SIZE; - /* - * Assume our eager message size is the largest control header size - * without the source address. Use that value to set the default - * receive release threshold. - */ - rxr_ep->min_multi_recv_size = rxr_ep->mtu_size - RXR_CTRL_HDR_SIZE; + rxr_ep->max_data_payload_size = rxr_ep->mtu_size - sizeof(struct rxr_data_hdr); + rxr_ep->min_multi_recv_size = rxr_ep->mtu_size - rxr_ep->max_proto_hdr_size; if (rxr_env.tx_queue_size > 0 && rxr_env.tx_queue_size < rxr_ep->max_outstanding_tx) rxr_ep->max_outstanding_tx = rxr_env.tx_queue_size; + + rxr_ep->use_zcpy_rx = rxr_ep_use_zcpy_rx(rxr_ep, info); + FI_INFO(&rxr_prov, FI_LOG_EP_CTRL, "rxr_ep->use_zcpy_rx = %d\n", rxr_ep->use_zcpy_rx); + + rxr_ep->handle_resource_management = info->domain_attr->resource_mgmt; + FI_INFO(&rxr_prov, FI_LOG_EP_CTRL, + "rxr_ep->handle_resource_management = %d\n", + rxr_ep->handle_resource_management); + #if ENABLE_DEBUG rxr_ep->sends = 0; rxr_ep->send_comps = 0; @@ -2868,46 +1919,76 @@ int rxr_endpoint(struct fid_domain *domain, struct fi_info *info, rxr_ep->recv_comps = 0; #endif - rxr_ep->posted_bufs = 0; - rxr_ep->rx_bufs_to_post = 0; + rxr_ep->posted_bufs_shm = 0; + rxr_ep->rx_bufs_shm_to_post = 0; + rxr_ep->posted_bufs_efa = 0; + rxr_ep->rx_bufs_efa_to_post = 0; rxr_ep->tx_pending = 0; rxr_ep->available_data_bufs_ts = 0; - fi_freeinfo(rdm_info); - ret = fi_cq_open(rxr_domain->rdm_domain, &cq_attr, &rxr_ep->rdm_cq, rxr_ep); if (ret) - goto err_close_core_ep; + goto err_close_shm_ep; ret = fi_ep_bind(rxr_ep->rdm_ep, &rxr_ep->rdm_cq->fid, FI_TRANSMIT | FI_RECV); if (ret) goto err_close_core_cq; + /* Bind ep with shm provider's cq */ + if (rxr_ep->use_shm) { + ret = fi_cq_open(efa_domain->shm_domain, &cq_attr, + &rxr_ep->shm_cq, rxr_ep); + if (ret) + goto err_close_core_cq; + + ret = fi_ep_bind(rxr_ep->shm_ep, &rxr_ep->shm_cq->fid, + FI_TRANSMIT | FI_RECV); + if (ret) + goto err_close_shm_cq; + } + ret = rxr_ep_init(rxr_ep); if (ret) - goto err_close_core_cq; + goto err_close_shm_cq; *ep = &rxr_ep->util_ep.ep_fid; (*ep)->msg = &rxr_ops_msg; (*ep)->rma = &rxr_ops_rma; + (*ep)->atomic = &rxr_ops_atomic; (*ep)->tagged = &rxr_ops_tagged; (*ep)->fid.ops = &rxr_ep_fi_ops; (*ep)->ops = &rxr_ops_ep; (*ep)->cm = &rxr_ep_cm; return 0; +err_close_shm_cq: + if (rxr_ep->use_shm && rxr_ep->shm_cq) { + retv = fi_close(&rxr_ep->shm_cq->fid); + if (retv) + FI_WARN(&rxr_prov, FI_LOG_CQ, "Unable to close shm cq: %s\n", + fi_strerror(-retv)); + } err_close_core_cq: retv = fi_close(&rxr_ep->rdm_cq->fid); if (retv) FI_WARN(&rxr_prov, FI_LOG_CQ, "Unable to close cq: %s\n", fi_strerror(-retv)); +err_close_shm_ep: + if (rxr_ep->use_shm && rxr_ep->shm_ep) { + retv = fi_close(&rxr_ep->shm_ep->fid); + if (retv) + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "Unable to close shm EP: %s\n", + fi_strerror(-retv)); + } err_close_core_ep: retv = fi_close(&rxr_ep->rdm_ep->fid); if (retv) FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "Unable to close EP: %s\n", fi_strerror(-retv)); +err_free_rdm_info: + fi_freeinfo(rdm_info); err_close_ofi_ep: retv = ofi_endpoint_close(&rxr_ep->util_ep); if (retv) diff --git a/prov/efa/src/rxr/rxr_fabric.c b/prov/efa/src/rxr/rxr_fabric.c index 0a332279947..163c5258c25 100644 --- a/prov/efa/src/rxr/rxr_fabric.c +++ b/prov/efa/src/rxr/rxr_fabric.c @@ -64,6 +64,12 @@ static int 
rxr_fabric_close(fid_t fid) if (ret) return ret; + if (rxr_env.enable_shm_transfer) { + ret = fi_close(&rxr_fabric->shm_fabric->fid); + if (ret) + return ret; + } + ret = ofi_fabric_close(&rxr_fabric->util_fabric); if (ret) return ret; @@ -122,7 +128,17 @@ int rxr_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, ret = lower_efa_prov->fabric(rdm_info->fabric_attr, &rxr_fabric->lower_fabric, context); if (ret) - goto err_free_info; + goto err_free_rdm_info; + + /* Open shm provider's fabric domain */ + if (rxr_env.enable_shm_transfer) { + assert(!strcmp(shm_info->fabric_attr->name, "shm")); + ret = fi_fabric(shm_info->fabric_attr, + &rxr_fabric->shm_fabric, context); + if (ret) + goto err_close_rdm_fabric; + } + #ifdef RXR_PERF_ENABLED ret = ofi_perfset_create(&rxr_prov, &rxr_fabric->perf_set, @@ -142,7 +158,14 @@ int rxr_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, free(hints.fabric_attr); fi_freeinfo(rdm_info); return 0; -err_free_info: + +err_close_rdm_fabric: + retv = fi_close(&rxr_fabric->lower_fabric->fid); + if (retv) + FI_WARN(&rxr_prov, FI_LOG_FABRIC, + "Unable to close lower rdm fabric: %s\n", + fi_strerror(-retv)); +err_free_rdm_info: fi_freeinfo(rdm_info); err_free_hints: free(hints.fabric_attr); diff --git a/prov/efa/src/rxr/rxr_init.c b/prov/efa/src/rxr/rxr_init.c index c06e952a3aa..dfc5a5e1249 100644 --- a/prov/efa/src/rxr/rxr_init.c +++ b/prov/efa/src/rxr/rxr_init.c @@ -36,16 +36,28 @@ #include #include "rxr.h" #include "efa.h" +#include "ofi_hmem.h" + +struct fi_info *shm_info; struct fi_provider *lower_efa_prov; +struct efa_ep_addr *local_efa_addr; + struct rxr_env rxr_env = { .rx_window_size = RXR_DEF_MAX_RX_WINDOW, .tx_max_credits = RXR_DEF_MAX_TX_CREDITS, .tx_min_credits = RXR_DEF_MIN_TX_CREDITS, .tx_queue_size = 0, - .enable_sas_ordering = 1, + .enable_shm_transfer = 1, + .use_device_rdma = 0, + .use_zcpy_rx = 1, + .zcpy_rx_seed = 0, + .shm_av_size = 128, + .shm_max_medium_size = 4096, .recvwin_size = RXR_RECVWIN_SIZE, + .readcopy_pool_size = 256, + .atomrsp_pool_size = 1024, .cq_size = RXR_DEF_CQ_SIZE, .max_memcpy_size = 4096, .mtu_size = 0, @@ -57,6 +69,12 @@ struct rxr_env rxr_env = { .rx_copy_ooo = 1, .max_timeout = RXR_DEF_RNR_MAX_TIMEOUT, .timeout_interval = 0, /* 0 is random timeout */ + .efa_cq_read_size = 50, + .shm_cq_read_size = 50, + .efa_max_medium_msg_size = 65536, + .efa_min_read_msg_size = 1048576, + .efa_min_read_write_size = 65536, + .efa_read_segment_size = 1073741824, }; static void rxr_init_env(void) @@ -65,15 +83,19 @@ static void rxr_init_env(void) fi_param_get_int(&rxr_prov, "tx_max_credits", &rxr_env.tx_max_credits); fi_param_get_int(&rxr_prov, "tx_min_credits", &rxr_env.tx_min_credits); fi_param_get_int(&rxr_prov, "tx_queue_size", &rxr_env.tx_queue_size); - fi_param_get_int(&rxr_prov, "enable_sas_ordering", &rxr_env.enable_sas_ordering); + fi_param_get_int(&rxr_prov, "enable_shm_transfer", &rxr_env.enable_shm_transfer); + fi_param_get_int(&rxr_prov, "use_device_rdma", &rxr_env.use_device_rdma); + fi_param_get_int(&rxr_prov, "use_zcpy_rx", &rxr_env.use_zcpy_rx); + fi_param_get_int(&rxr_prov, "zcpy_rx_seed", &rxr_env.zcpy_rx_seed); + fi_param_get_int(&rxr_prov, "shm_av_size", &rxr_env.shm_av_size); + fi_param_get_int(&rxr_prov, "shm_max_medium_size", &rxr_env.shm_max_medium_size); fi_param_get_int(&rxr_prov, "recvwin_size", &rxr_env.recvwin_size); + fi_param_get_int(&rxr_prov, "readcopy_pool_size", &rxr_env.readcopy_pool_size); fi_param_get_int(&rxr_prov, "cq_size", &rxr_env.cq_size); 
fi_param_get_size_t(&rxr_prov, "max_memcpy_size", &rxr_env.max_memcpy_size); fi_param_get_bool(&rxr_prov, "mr_cache_enable", &efa_mr_cache_enable); - fi_param_get_bool(&rxr_prov, "mr_cache_merge_regions", - &efa_mr_cache_merge_regions); fi_param_get_size_t(&rxr_prov, "mr_max_cached_count", &efa_mr_max_cached_count); fi_param_get_size_t(&rxr_prov, "mr_max_cached_size", @@ -91,6 +113,49 @@ static void rxr_init_env(void) fi_param_get_int(&rxr_prov, "max_timeout", &rxr_env.max_timeout); fi_param_get_int(&rxr_prov, "timeout_interval", &rxr_env.timeout_interval); + fi_param_get_size_t(&rxr_prov, "efa_cq_read_size", + &rxr_env.efa_cq_read_size); + fi_param_get_size_t(&rxr_prov, "shm_cq_read_size", + &rxr_env.shm_cq_read_size); + fi_param_get_size_t(&rxr_prov, "inter_max_medium_message_size", + &rxr_env.efa_max_medium_msg_size); + fi_param_get_size_t(&rxr_prov, "inter_min_read_message_size", + &rxr_env.efa_min_read_msg_size); + fi_param_get_size_t(&rxr_prov, "inter_min_read_write_size", + &rxr_env.efa_min_read_write_size); + fi_param_get_size_t(&rxr_prov, "inter_read_segment_size", + &rxr_env.efa_read_segment_size); +} + +/* + * Stringify the void *addr to a string smr_name formatted as `gid_qpn`, which + * will be used to insert into shm provider's AV. Then shm uses smr_name as + * ep_name to create the shared memory region. + * + * The IPv6 address length is 46, but the max supported name length for shm is 32. + * The string `gid_qpn` could be truncated during snprintf. + * The current way works because the IPv6 addresses starting with FE in hexadecimals represent + * link local IPv6 addresses, which has reserved first 64 bits (FE80::/64). + * e.g., fe80:0000:0000:0000:0436:29ff:fe8e:ceaa -> fe80::436:29ff:fe8e:ceaa + * And the length of string `gid_qpn` (fe80::436:29ff:fe8e:ceaa_***) will not exceed 32. + * If the address is NOT link local, we need to think another reasonable way to + * generate the string. + */ +int rxr_ep_efa_addr_to_str(const void *addr, char *smr_name) +{ + char gid[INET6_ADDRSTRLEN] = { 0 }; + uint16_t qpn; + int ret; + + if (!inet_ntop(AF_INET6, ((struct efa_ep_addr *)addr)->raw, gid, INET6_ADDRSTRLEN)) { + printf("Failed to get current EFA's GID, errno: %d\n", errno); + return 0; + } + qpn = ((struct efa_ep_addr *)addr)->qpn; + + ret = snprintf(smr_name, NAME_MAX, "%ld_%s_%d", (size_t) getuid(), gid, qpn); + + return (ret <= 0) ? ret : FI_SUCCESS; } void rxr_info_to_core_mr_modes(uint32_t version, @@ -109,9 +174,15 @@ void rxr_info_to_core_mr_modes(uint32_t version, FI_MR_LOCAL | FI_MR_ALLOCATED; if (!hints) core_info->domain_attr->mr_mode |= OFI_MR_BASIC_MAP; - else if (hints->domain_attr) - core_info->domain_attr->mr_mode |= - hints->domain_attr->mr_mode & OFI_MR_BASIC_MAP; + else { + if (hints->domain_attr) + core_info->domain_attr->mr_mode |= + hints->domain_attr->mr_mode & OFI_MR_BASIC_MAP; + core_info->addr_format = hints->addr_format; + } +#if HAVE_LIBCUDA + core_info->domain_attr->mr_mode |= FI_MR_HMEM; +#endif } } @@ -152,6 +223,9 @@ static int rxr_copy_attr(const struct fi_info *info, struct fi_info *dup) if (!dup->nic) return -FI_ENOMEM; } + if (info->caps & FI_HMEM) + dup->caps |= FI_HMEM; + return 0; } @@ -168,8 +242,6 @@ static int rxr_info_to_core(uint32_t version, const struct fi_info *rxr_info, (*core_info)->ep_attr->type = FI_EP_RDM; (*core_info)->tx_attr->op_flags = FI_TRANSMIT_COMPLETE; - (*core_info)->addr_format = FI_ADDR_EFA; - /* * Skip copying address, domain, fabric info. 
*/ @@ -182,6 +254,22 @@ static int rxr_info_to_core(uint32_t version, const struct fi_info *rxr_info, return ret; } +/* Explicitly set all necessary bits before calling shm provider's getinfo function */ +void rxr_set_shm_hints(struct fi_info *shm_hints) +{ + shm_hints->caps = FI_MSG | FI_TAGGED | FI_RECV | FI_SEND | FI_READ + | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE + | FI_MULTI_RECV | FI_RMA; + shm_hints->domain_attr->av_type = FI_AV_TABLE; + shm_hints->domain_attr->mr_mode = FI_MR_VIRT_ADDR; + shm_hints->domain_attr->caps |= FI_LOCAL_COMM; + shm_hints->tx_attr->msg_order = FI_ORDER_SAS; + shm_hints->rx_attr->msg_order = FI_ORDER_SAS; + shm_hints->fabric_attr->name = strdup("shm"); + shm_hints->fabric_attr->prov_name = strdup("shm"); + shm_hints->ep_attr->type = FI_EP_RDM; +} + /* Pass tx/rx attr that user specifies down to core provider */ void rxr_reset_rx_tx_to_core(const struct fi_info *user_info, struct fi_info *core_info) @@ -193,9 +281,6 @@ void rxr_reset_rx_tx_to_core(const struct fi_info *user_info, core_info->rx_attr->size = user_info->rx_attr->size < core_info->rx_attr->size ? user_info->rx_attr->size : core_info->rx_attr->size; - core_info->rx_attr->iov_limit = - user_info->rx_attr->iov_limit < core_info->rx_attr->iov_limit ? - user_info->rx_attr->iov_limit : core_info->rx_attr->iov_limit; /* tx attr */ core_info->tx_attr->inject_size = user_info->tx_attr->inject_size < core_info->tx_attr->inject_size ? @@ -203,11 +288,12 @@ void rxr_reset_rx_tx_to_core(const struct fi_info *user_info, core_info->tx_attr->size = user_info->tx_attr->size < core_info->tx_attr->size ? user_info->tx_attr->size : core_info->tx_attr->size; - core_info->tx_attr->iov_limit = - user_info->tx_attr->iov_limit < core_info->tx_attr->iov_limit ? - user_info->tx_attr->iov_limit : core_info->tx_attr->iov_limit; } +/* + * Used to set tx/rx attributes that are characteristic of the device for the + * two endpoint types and not emulated in software. + */ void rxr_set_rx_tx_size(struct fi_info *info, const struct fi_info *core_info) { @@ -238,6 +324,9 @@ static int rxr_dgram_info_to_rxr(uint32_t version, static int rxr_info_to_rxr(uint32_t version, const struct fi_info *core_info, struct fi_info *info, const struct fi_info *hints) { + uint64_t atomic_ordering; + uint64_t max_atomic_size; + info->caps = rxr_info.caps; info->mode = rxr_info.mode; @@ -246,10 +335,8 @@ static int rxr_info_to_rxr(uint32_t version, const struct fi_info *core_info, *info->ep_attr = *rxr_info.ep_attr; *info->domain_attr = *rxr_info.domain_attr; - info->tx_attr->inject_size = - core_info->tx_attr->inject_size > RXR_CTRL_HDR_SIZE_NO_CQ ? - core_info->tx_attr->inject_size - RXR_CTRL_HDR_SIZE_NO_CQ - : 0; + /* TODO: update inject_size when we implement inject */ + info->tx_attr->inject_size = 0; rxr_info.tx_attr->inject_size = info->tx_attr->inject_size; info->addr_format = core_info->addr_format; @@ -257,15 +344,47 @@ static int rxr_info_to_rxr(uint32_t version, const struct fi_info *core_info, info->domain_attr->cq_cnt = core_info->domain_attr->cq_cnt; info->domain_attr->mr_key_size = core_info->domain_attr->mr_key_size; + /* + * Do not advertise FI_HMEM capabilities when the core can not support + * it or when the application passes NULL hints (given this is a primary + * cap). The logic for device-specific checks pertaining to HMEM comes + * further along this path. 
+ */ + if ((core_info && !(core_info->caps & FI_HMEM)) || !hints) { + info->caps &= ~FI_HMEM; + } + /* * Handle user-provided hints and adapt the info object passed back up * based on EFA-specific constraints. */ if (hints) { - /* Disable packet reordering if the app doesn't need it */ - if (hints->tx_attr) - if (!(hints->tx_attr->msg_order & FI_ORDER_SAS)) - rxr_env.enable_sas_ordering = 0; + if (hints->tx_attr) { + + atomic_ordering = FI_ORDER_ATOMIC_RAR | FI_ORDER_ATOMIC_RAW | + FI_ORDER_ATOMIC_WAR | FI_ORDER_ATOMIC_WAW; + if (hints->tx_attr->msg_order & atomic_ordering) { + max_atomic_size = core_info->ep_attr->max_msg_size + - sizeof(struct rxr_rta_hdr) + - core_info->src_addrlen + - RXR_IOV_LIMIT * sizeof(struct fi_rma_iov); + + if (hints->tx_attr->msg_order & FI_ORDER_ATOMIC_RAW) { + info->ep_attr->max_order_raw_size = max_atomic_size; + rxr_info.ep_attr->max_order_raw_size = max_atomic_size; + } + + if (hints->tx_attr->msg_order & FI_ORDER_ATOMIC_WAR) { + info->ep_attr->max_order_war_size = max_atomic_size; + rxr_info.ep_attr->max_order_war_size = max_atomic_size; + } + + if (hints->tx_attr->msg_order & FI_ORDER_ATOMIC_WAW) { + info->ep_attr->max_order_waw_size = max_atomic_size; + rxr_info.ep_attr->max_order_waw_size = max_atomic_size; + } + } + } /* We only support manual progress for RMA operations */ if (hints->caps & FI_RMA) { @@ -273,11 +392,109 @@ static int rxr_info_to_rxr(uint32_t version, const struct fi_info *core_info, info->domain_attr->data_progress = FI_PROGRESS_MANUAL; } - /* Use a table for AV if the app has no strong requirement */ - if (!hints->domain_attr || hints->domain_attr->av_type == FI_AV_UNSPEC) - info->domain_attr->av_type = FI_AV_TABLE; +#if HAVE_LIBCUDA + /* If the application requires HMEM support, we will add FI_MR_HMEM + * to mr_mode, because we need application to provide descriptor + * for cuda buffer. + * Note we did not add FI_MR_LOCAL here because according + * to FI_MR man page: + * + * "If FI_MR_HMEM is set, but FI_MR_LOCAL is unset, + * only device buffers must be registered when used locally. + * " + * which means FI_MR_HMEM implies FI_MR_LOCAL for cuda buffer + */ + if (hints->caps & FI_HMEM) { + /* + * XXX: remove this once CUDA IPC is supported by SHM + * and we have a fallback path to use the device when + * SHM doesn't support CUDA IPC. + */ + if (hints->caps & FI_LOCAL_COMM) { + FI_WARN(&rxr_prov, FI_LOG_CORE, + "FI_HMEM is currently not supported by the EFA provider when FI_LOCAL_COMM is requested.\n"); + return -FI_ENODATA; + } + info->caps &= ~FI_LOCAL_COMM; + + if (!efa_device_support_rdma_read()) { + FI_WARN(&rxr_prov, FI_LOG_CORE, + "FI_HMEM capability requires RDMA, which this device does not support.\n"); + return -FI_ENODATA; + + } + + if (!rxr_env.use_device_rdma) { + FI_WARN(&rxr_prov, FI_LOG_CORE, + "FI_HMEM capability requires RDMA, which is turned off. You can turn it on by set environment variable FI_EFA_USE_DEVICE_RDMA to 1.\n"); + return -FI_ENODATA; + } + + if (hints->domain_attr && + !(hints->domain_attr->mr_mode & FI_MR_HMEM)) { + FI_WARN(&rxr_prov, FI_LOG_CORE, + "FI_HMEM capability requires device registrations (FI_MR_HMEM)\n"); + return -FI_ENODATA; + } + + info->domain_attr->mr_mode |= FI_MR_HMEM; + + } else { + /* + * FI_HMEM is a primary capability. Providers should + * only enable it if requested by applications. 
+ */ + info->caps &= ~FI_HMEM; + } +#endif + /* + * The provider does not force applications to register buffers + * with the device, but if an application is able to, reuse + * their registrations and avoid the bounce buffers. + */ + if (hints->domain_attr && hints->domain_attr->mr_mode & FI_MR_LOCAL) + info->domain_attr->mr_mode |= FI_MR_LOCAL; + + /* + * Same goes for prefix mode, where the protocol does not + * absolutely need a prefix before receive buffers, but it can + * use it when available to optimize transfers with endpoints + * having the following profile: + * - Requires FI_MSG and not FI_TAGGED/FI_ATOMIC/FI_RMA + * - Can handle registrations (FI_MR_LOCAL) + * - No need for FI_DIRECTED_RECV + * - Guaranteed to send msgs smaller than info->nic->link_attr->mtu + */ + if (hints->mode & FI_MSG_PREFIX) { + FI_INFO(&rxr_prov, FI_LOG_CORE, + "FI_MSG_PREFIX supported by application.\n"); + info->mode |= FI_MSG_PREFIX; + info->tx_attr->mode |= FI_MSG_PREFIX; + info->rx_attr->mode |= FI_MSG_PREFIX; + + /* + * The prefix needs to be a multiple of 8. The pkt_entry + * is already at 64 bytes (128 with debug). + */ + info->ep_attr->msg_prefix_size = sizeof(struct rxr_pkt_entry) + + sizeof(struct rxr_eager_msgrtm_hdr); + assert(!(info->ep_attr->msg_prefix_size % 8)); + FI_INFO(&rxr_prov, FI_LOG_CORE, + "FI_MSG_PREFIX size = %ld\n", info->ep_attr->msg_prefix_size); + } } + /* Use a table for AV if the app has no strong requirement */ + if (!hints || !hints->domain_attr || + hints->domain_attr->av_type == FI_AV_UNSPEC) + info->domain_attr->av_type = FI_AV_TABLE; + + if (!hints || !hints->domain_attr || + hints->domain_attr->resource_mgmt == FI_RM_UNSPEC) + info->domain_attr->resource_mgmt = FI_RM_ENABLED; + else + info->domain_attr->resource_mgmt = hints->domain_attr->resource_mgmt; + rxr_set_rx_tx_size(info, core_info); return 0; } @@ -309,6 +526,49 @@ int rxr_get_lower_rdm_info(uint32_t version, const char *node, return ret; } +/* + * Call getinfo on lower efa provider to get all locally qualified fi_info + * structure, then store the corresponding efa nic GIDs + */ +int rxr_get_local_gids(struct fi_provider *lower_efa_prov) +{ + struct fi_info *core_info, *cur; + struct efa_ep_addr *cur_efa_addr; + int ret; + + cur_efa_addr = local_efa_addr = NULL; + core_info = cur = NULL; + + ret = lower_efa_prov->getinfo(rxr_prov.fi_version, NULL, NULL, 0, NULL, &core_info); + if (ret) + return ret; + + local_efa_addr = (struct efa_ep_addr *)malloc(sizeof(struct efa_ep_addr)); + if (!local_efa_addr) { + ret = -FI_ENOMEM; + goto out; + } + local_efa_addr->next = NULL; + + cur_efa_addr = local_efa_addr; + for (cur = core_info; cur; cur = cur->next) { + memcpy(cur_efa_addr->raw, ((struct efa_ep_addr *)cur->src_addr)->raw, 16); + if (cur->next) { + cur_efa_addr->next = (struct efa_ep_addr *)malloc(sizeof(struct efa_ep_addr)); + if (!cur_efa_addr->next) { + ret = -FI_ENOMEM; + goto out; + } + cur_efa_addr = cur_efa_addr->next; + cur_efa_addr->next = NULL; + } + } + +out: + fi_freeinfo(core_info); + return ret; +} + static int rxr_dgram_getinfo(uint32_t version, const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct fi_info **info, @@ -360,6 +620,7 @@ static int rxr_getinfo(uint32_t version, const char *node, const struct fi_info *hints, struct fi_info **info) { struct fi_info *core_info, *util_info, *cur, *tail; + struct fi_info *shm_hints; int ret; *info = tail = core_info = NULL; @@ -367,6 +628,7 @@ static int rxr_getinfo(uint32_t version, const char *node, if (hints && 
hints->ep_attr && hints->ep_attr->type == FI_EP_DGRAM) goto dgram_info; + ret = rxr_get_lower_rdm_info(version, node, service, flags, &rxr_util_prov, hints, &core_info); @@ -380,21 +642,27 @@ static int rxr_getinfo(uint32_t version, const char *node, util_info = fi_allocinfo(); if (!util_info) { ret = -FI_ENOMEM; - fi_freeinfo(*info); - goto out; + goto free_info; } - rxr_info_to_rxr(version, cur, util_info, hints); + ret = rxr_info_to_rxr(version, cur, util_info, hints); + if (ret) + goto free_info; ret = rxr_copy_attr(cur, util_info); - if (ret) { - fi_freeinfo(util_info); - fi_freeinfo(*info); - goto out; - } - + if (ret) + goto free_info; ofi_alter_info(util_info, hints, version); + + /* If application asked for FI_REMOTE_COMM but not FI_LOCAL_COMM, it + * does not want to use shm. In this case, we honor the request by + * unsetting the FI_LOCAL_COMM flag in info. This way rxr_endpoint() + * should disable shm transfer for the endpoint + */ + if (hints && hints->caps & FI_REMOTE_COMM && !(hints->caps & FI_LOCAL_COMM)) + util_info->caps &= ~FI_LOCAL_COMM; + if (!*info) *info = util_info; else @@ -410,21 +678,63 @@ static int rxr_getinfo(uint32_t version, const char *node, */ if (ret == -FI_ENODATA && *info) ret = 0; -out: + + if (!ret && rxr_env.enable_shm_transfer && !shm_info) { + shm_info = NULL; + shm_hints = fi_allocinfo(); + rxr_set_shm_hints(shm_hints); + ret = fi_getinfo(FI_VERSION(1, 8), NULL, NULL, + OFI_GETINFO_HIDDEN, shm_hints, &shm_info); + fi_freeinfo(shm_hints); + if (ret) { + FI_WARN(&rxr_prov, FI_LOG_CORE, "Disabling EFA shared memory support; failed to get shm provider's info: %s\n", + fi_strerror(-ret)); + rxr_env.enable_shm_transfer = 0; + ret = 0; + } else { + assert(!strcmp(shm_info->fabric_attr->name, "shm")); + } + } + fi_freeinfo(core_info); return ret; +free_info: + fi_freeinfo(core_info); + fi_freeinfo(util_info); + fi_freeinfo(*info); + *info = NULL; + return ret; } static void rxr_fini(void) { + struct efa_ep_addr *cur; + if (lower_efa_prov) lower_efa_prov->cleanup(); + + if (rxr_env.enable_shm_transfer) { + /* Cleanup all local efa nic GIDs */ + while (local_efa_addr) { + cur = local_efa_addr; + local_efa_addr = local_efa_addr->next; + free(cur); + } + if (shm_info) + fi_freeinfo(shm_info); + } + +#if HAVE_EFA_DL + ofi_monitors_cleanup(); + ofi_hmem_cleanup(); + ofi_mem_fini(); +#endif } struct fi_provider rxr_prov = { .name = "efa", - .version = FI_VERSION(RXR_MAJOR_VERSION, RXR_MINOR_VERSION), - .fi_version = RXR_FI_VERSION, + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, .getinfo = rxr_getinfo, .fabric = rxr_fabric, .cleanup = rxr_fini @@ -440,45 +750,77 @@ EFA_INI "Defines the minimum number of credits a sender requests from a receiver (Default: 32)."); fi_param_define(&rxr_prov, "tx_queue_size", FI_PARAM_INT, "Defines the maximum number of unacknowledged sends with the NIC."); - fi_param_define(&rxr_prov, "enable_sas_ordering", FI_PARAM_INT, - "Enable packet reordering for the RDM endpoint. This is always enabled when FI_ORDER_SAS is requested by the application. (Default: 1)"); + fi_param_define(&rxr_prov, "enable_shm_transfer", FI_PARAM_INT, + "Enable using SHM provider to provide the communication between processes on the same system. 
(Default: 1)"); + fi_param_define(&rxr_prov, "use_device_rdma", FI_PARAM_INT, + "whether to use device's RDMA functionality for one-sided and two-sided transfer."); + fi_param_define(&rxr_prov, "use_zcpy_rx", FI_PARAM_INT, + "Enables the use of application's receive buffers in place of bounce-buffers when feasible. (Default: 1)"); + fi_param_define(&rxr_prov, "zcpy_rx_seed", FI_PARAM_INT, + "Defines the number of bounce-buffers the provider will prepost during EP initialization. (Default: 0)"); + fi_param_define(&rxr_prov, "shm_av_size", FI_PARAM_INT, + "Defines the maximum number of entries in SHM provider's address vector (Default 128)."); + fi_param_define(&rxr_prov, "shm_max_medium_size", FI_PARAM_INT, + "Defines the switch point between small/medium message and large message. The message larger than this switch point will be transferred with large message protocol (Default 4096)."); fi_param_define(&rxr_prov, "recvwin_size", FI_PARAM_INT, "Defines the size of sliding receive window. (Default: 16384)"); + fi_param_define(&rxr_prov, "readcopy_pool_size", FI_PARAM_INT, + "Defines the size of readcopy packet pool size. (Default: 256)"); fi_param_define(&rxr_prov, "cq_size", FI_PARAM_INT, "Define the size of completion queue. (Default: 8192)"); fi_param_define(&rxr_prov, "mr_cache_enable", FI_PARAM_BOOL, "Enables using the mr cache and in-line registration instead of a bounce buffer for iov's larger than max_memcpy_size. Defaults to true. When disabled, only uses a bounce buffer."); - fi_param_define(&rxr_prov, "mr_cache_merge_regions", FI_PARAM_BOOL, - "Enables merging overlapping and adjacent memory registration regions. Defaults to true."); - fi_param_define(&rxr_prov, "mr_max_cached_count", FI_PARAM_INT, + fi_param_define(&rxr_prov, "mr_max_cached_count", FI_PARAM_SIZE_T, "Sets the maximum number of memory registrations that can be cached at any time."); - fi_param_define(&rxr_prov, "mr_max_cached_size", FI_PARAM_INT, + fi_param_define(&rxr_prov, "mr_max_cached_size", FI_PARAM_SIZE_T, "Sets the maximum amount of memory that cached memory registrations can hold onto at any time."); - fi_param_define(&rxr_prov, "max_memcpy_size", FI_PARAM_INT, + fi_param_define(&rxr_prov, "max_memcpy_size", FI_PARAM_SIZE_T, "Threshold size switch between using memory copy into a pre-registered bounce buffer and memory registration on the user buffer. (Default: 4096)"); - fi_param_define(&rxr_prov, "mtu_size", FI_PARAM_INT, + fi_param_define(&rxr_prov, "mtu_size", FI_PARAM_SIZE_T, "Override the MTU size of the device."); - fi_param_define(&rxr_prov, "tx_size", FI_PARAM_INT, + fi_param_define(&rxr_prov, "tx_size", FI_PARAM_SIZE_T, "Set the maximum number of transmit operations before the provider returns -FI_EAGAIN. 
For only the RDM endpoint, this parameter will cause transmit operations to be queued when this value is set higher than the default and the transmit queue is full."); - fi_param_define(&rxr_prov, "rx_size", FI_PARAM_INT, + fi_param_define(&rxr_prov, "rx_size", FI_PARAM_SIZE_T, "Set the maximum number of receive operations before the provider returns -FI_EAGAIN."); - fi_param_define(&rxr_prov, "tx_iov_limit", FI_PARAM_INT, + fi_param_define(&rxr_prov, "tx_iov_limit", FI_PARAM_SIZE_T, "Maximum transmit iov_limit."); - fi_param_define(&rxr_prov, "rx_iov_limit", FI_PARAM_INT, + fi_param_define(&rxr_prov, "rx_iov_limit", FI_PARAM_SIZE_T, "Maximum receive iov_limit."); fi_param_define(&rxr_prov, "rx_copy_unexp", FI_PARAM_BOOL, "Enables the use of a separate pool of bounce-buffers to copy unexpected messages out of the pre-posted receive buffers. (Default: 1)"); fi_param_define(&rxr_prov, "rx_copy_ooo", FI_PARAM_BOOL, - "Enables the use of a separate pool of bounce-buffers to copy out-of-order RTS packets out of the pre-posted receive buffers. (Default: 1)"); + "Enables the use of a separate pool of bounce-buffers to copy out-of-order RTM packets out of the pre-posted receive buffers. (Default: 1)"); fi_param_define(&rxr_prov, "max_timeout", FI_PARAM_INT, "Set the maximum timeout (us) for backoff to a peer after a receiver not ready error. (Default: 1000000)"); fi_param_define(&rxr_prov, "timeout_interval", FI_PARAM_INT, "Set the time interval (us) for the base timeout to use for exponential backoff to a peer after a receiver not ready error. (Default: 0 [random])"); + fi_param_define(&rxr_prov, "efa_cq_read_size", FI_PARAM_SIZE_T, + "Set the number of EFA completion entries to read for one loop for one iteration of the progress engine. (Default: 50)"); + fi_param_define(&rxr_prov, "shm_cq_read_size", FI_PARAM_SIZE_T, + "Set the number of SHM completion entries to read for one loop for one iteration of the progress engine. (Default: 50)"); + fi_param_define(&rxr_prov, "inter_max_medium_message_size", FI_PARAM_INT, + "The maximum message size for inter EFA medium message protocol (Default 65536)."); + fi_param_define(&rxr_prov, "inter_min_read_message_size", FI_PARAM_INT, + "The minimum message size for inter EFA read message protocol. If instance support RDMA read, messages whose size is larger than this value will be sent by read message protocol (Default 1048576)."); + + fi_param_define(&rxr_prov, "inter_min_read_write_size", FI_PARAM_INT, + "The mimimum message size for inter EFA write to use read write protocol. If firmware support RDMA read, and FI_EFA_USE_DEVICE_RDMA is 1, write requests whose size is larger than this value will use the read write protocol (Default 65536)."); + fi_param_define(&rxr_prov, "inter_read_segment_size", FI_PARAM_INT, + "Calls to RDMA read is segmented using this value."); rxr_init_env(); +#if HAVE_EFA_DL + ofi_mem_init(); + ofi_hmem_init(); + ofi_monitors_init(); +#endif + lower_efa_prov = init_lower_efa_prov(); if (!lower_efa_prov) return NULL; + if (rxr_env.enable_shm_transfer && rxr_get_local_gids(lower_efa_prov)) + return NULL; + return &rxr_prov; } diff --git a/prov/efa/src/rxr/rxr_msg.c b/prov/efa/src/rxr/rxr_msg.c new file mode 100644 index 00000000000..4777f755bbd --- /dev/null +++ b/prov/efa/src/rxr/rxr_msg.c @@ -0,0 +1,1161 @@ +/* + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "ofi.h" +#include +#include + +#include "efa.h" +#include "rxr.h" +#include "rxr_msg.h" +#include "rxr_pkt_cmd.h" + +/** + * This file define the msg ops functions. + * It is consisted of the following sections: + * send functions, + * receive functions and + * ops structure + */ + +/** + * Send function + */ + +/** + * Utility functions used by both non-tagged and tagged send. + */ +static inline +ssize_t rxr_msg_post_cuda_rtm(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry) +{ + int err, tagged; + struct rxr_peer *peer; + int pkt_type; + bool delivery_complete_requested; + + assert(RXR_EAGER_MSGRTM_PKT + 1 == RXR_EAGER_TAGRTM_PKT); + assert(RXR_READ_MSGRTM_PKT + 1 == RXR_READ_TAGRTM_PKT); + assert(RXR_DC_EAGER_MSGRTM_PKT + 1 == RXR_DC_EAGER_TAGRTM_PKT); + + tagged = (tx_entry->op == ofi_op_tagged); + assert(tagged == 0 || tagged == 1); + + delivery_complete_requested = tx_entry->fi_flags & FI_DELIVERY_COMPLETE; + if (tx_entry->total_len == 0) { + pkt_type = delivery_complete_requested ? RXR_DC_EAGER_MSGRTM_PKT : RXR_EAGER_MSGRTM_PKT; + return rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY, tx_entry, + pkt_type + tagged, 0); + } + + /* Currently cuda data must be sent using read message protocol. + * However, because read message protocol is an extra feature, we cannot + * sure if the receiver supports it. + * The only way we can be sure of that is through the handshake packet + * from the receiver, so here we call rxr_pkt_wait_handshake(). 
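+	 * If the handshake does arrive but the peer does not advertise
+	 * RDMA read support, there is no alternative protocol for device
+	 * memory (CUDA data must use the read message protocol), so the
+	 * send fails with -FI_EOPNOTSUPP instead of falling back to a
+	 * copy-based protocol.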
+ */ + peer = rxr_ep_get_peer(rxr_ep, tx_entry->addr); + + err = rxr_pkt_wait_handshake(rxr_ep, tx_entry->addr, peer); + if (OFI_UNLIKELY(err)) { + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "waiting for handshake packet failed!\n"); + return err; + } + + assert(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED); + if (!efa_peer_support_rdma_read(peer)) { + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "Cannot send gpu data because receiver does not support RDMA\n"); + return -FI_EOPNOTSUPP; + } + + return rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY, tx_entry, + RXR_READ_MSGRTM_PKT + tagged, 0); +} + +ssize_t rxr_msg_post_rtm(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry) +{ + /* + * For performance consideration, this function assume the tagged rtm packet type id is + * always the correspondent message rtm packet type id + 1, thus the assertion here. + */ + assert(RXR_EAGER_MSGRTM_PKT + 1 == RXR_EAGER_TAGRTM_PKT); + assert(RXR_READ_MSGRTM_PKT + 1 == RXR_READ_TAGRTM_PKT); + assert(RXR_LONG_MSGRTM_PKT + 1 == RXR_LONG_TAGRTM_PKT); + assert(RXR_MEDIUM_MSGRTM_PKT + 1 == RXR_MEDIUM_TAGRTM_PKT); + + assert(RXR_DC_EAGER_MSGRTM_PKT + 1 == RXR_DC_EAGER_TAGRTM_PKT); + assert(RXR_DC_MEDIUM_MSGRTM_PKT + 1 == RXR_DC_MEDIUM_TAGRTM_PKT); + assert(RXR_DC_LONG_MSGRTM_PKT + 1 == RXR_DC_LONG_TAGRTM_PKT); + + int tagged; + size_t max_rtm_data_size; + ssize_t err; + struct rxr_peer *peer; + bool delivery_complete_requested; + int ctrl_type; + struct efa_domain *efa_domain; + struct rxr_domain *rxr_domain = rxr_ep_domain(rxr_ep); + + efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain, + util_domain.domain_fid); + + assert(tx_entry->op == ofi_op_msg || tx_entry->op == ofi_op_tagged); + tagged = (tx_entry->op == ofi_op_tagged); + assert(tagged == 0 || tagged == 1); + + delivery_complete_requested = tx_entry->fi_flags & FI_DELIVERY_COMPLETE; + peer = rxr_ep_get_peer(rxr_ep, tx_entry->addr); + + if (delivery_complete_requested && !(peer->is_local)) { + tx_entry->rxr_flags |= RXR_DELIVERY_COMPLETE_REQUESTED; + /* + * Because delivery complete is defined as an extra + * feature, the receiver might not support it. + * + * The sender cannot send with FI_DELIVERY_COMPLETE + * if the peer is not able to handle it. + * + * If the sender does not know whether the peer + * can handle it, it needs to trigger + * a handshake packet from the peer. + * + * The handshake packet contains + * the information whether the peer + * support it or not. + */ + err = rxr_pkt_trigger_handshake(rxr_ep, tx_entry->addr, peer); + if (OFI_UNLIKELY(err)) + return err; + + if (!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)) + return -FI_EAGAIN; + + else if (!rxr_peer_support_delivery_complete(peer)) + return -FI_EOPNOTSUPP; + + max_rtm_data_size = rxr_pkt_req_max_data_size(rxr_ep, + tx_entry->addr, + RXR_DC_EAGER_MSGRTM_PKT + tagged); + } else { + max_rtm_data_size = rxr_pkt_req_max_data_size(rxr_ep, + tx_entry->addr, + RXR_EAGER_MSGRTM_PKT + tagged); + } + + if (peer->is_local) { + assert(rxr_ep->use_shm); + /* intra instance message */ + if (tx_entry->total_len > max_rtm_data_size) + /* + * Read message support + * FI_DELIVERY_COMPLETE implicitly. + */ + ctrl_type = RXR_READ_MSGRTM_PKT; + else + ctrl_type = delivery_complete_requested ? RXR_DC_EAGER_MSGRTM_PKT : RXR_EAGER_MSGRTM_PKT; + + return rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY, tx_entry, ctrl_type + tagged, 0); + } + + if (rxr_ep->use_zcpy_rx) { + /* + * The application can not deal with varying packet header sizes + * before and after receiving a handshake. 
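+		 * The wait below (rxr_pkt_wait_handshake()) polls the
+		 * progress engine for up to RXR_HANDSHAKE_WAIT_TIMEOUT
+		 * microseconds and returns -FI_EAGAIN if no handshake
+		 * arrives in time.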
Forcing a handshake + * here so we can always use the smallest eager msg packet + * header size to determine the msg_prefix_size. + */ + err = rxr_pkt_wait_handshake(rxr_ep, tx_entry->addr, peer); + if (OFI_UNLIKELY(err)) + return err; + + assert(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED); + } + + if (efa_ep_is_cuda_mr(tx_entry->desc[0])) { + return rxr_msg_post_cuda_rtm(rxr_ep, tx_entry); + } + + /* inter instance message */ + if (tx_entry->total_len <= max_rtm_data_size) { + ctrl_type = (delivery_complete_requested) ? + RXR_DC_EAGER_MSGRTM_PKT : RXR_EAGER_MSGRTM_PKT; + return rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY, tx_entry, + ctrl_type + tagged, 0); + } + + if (tx_entry->total_len <= rxr_env.efa_max_medium_msg_size) { + /* we do not check the return value of rxr_ep_init_mr_desc() + * because medium message works even if MR registration failed + */ + if (tx_entry->desc[0] || efa_is_cache_available(efa_domain)) + rxr_ep_tx_init_mr_desc(rxr_domain, tx_entry, 0, FI_SEND); + + /* + * we have to queue message RTM because data is sent as multiple + * medium RTM packets. It could happend that the first several packets + * were sent successfully, but the following packet encountered -FI_EAGAIN + */ + ctrl_type = delivery_complete_requested ? + RXR_DC_MEDIUM_MSGRTM_PKT : RXR_MEDIUM_MSGRTM_PKT; + return rxr_pkt_post_ctrl_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry, + ctrl_type + tagged, 0); + } + + if (tx_entry->total_len >= rxr_env.efa_min_read_msg_size && + efa_both_support_rdma_read(rxr_ep, peer) && + (tx_entry->desc[0] || efa_is_cache_available(efa_domain))) { + /* Read message support FI_DELIVERY_COMPLETE implicitly. */ + err = rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY, tx_entry, + RXR_READ_MSGRTM_PKT + tagged, 0); + + if (err != -FI_ENOMEM) + return err; + + /* + * If memory registration failed, we continue here + * and fall back to use long message protocol + */ + } + + err = rxr_ep_set_tx_credit_request(rxr_ep, tx_entry); + if (OFI_UNLIKELY(err)) + return err; + + ctrl_type = delivery_complete_requested ? 
RXR_DC_LONG_MSGRTM_PKT : RXR_LONG_MSGRTM_PKT; + tx_entry->rxr_flags |= RXR_LONGCTS_PROTOCOL; + return rxr_pkt_post_ctrl(rxr_ep, RXR_TX_ENTRY, tx_entry, + ctrl_type + tagged, 0); +} + +ssize_t rxr_msg_generic_send(struct fid_ep *ep, const struct fi_msg *msg, + uint64_t tag, uint32_t op, uint64_t flags) +{ + struct rxr_ep *rxr_ep; + ssize_t err; + struct rxr_tx_entry *tx_entry; + struct rxr_peer *peer; + + FI_DBG(&rxr_prov, FI_LOG_EP_DATA, + "iov_len: %lu tag: %lx op: %x flags: %lx\n", + ofi_total_iov_len(msg->msg_iov, msg->iov_count), + tag, op, flags); + + rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); + assert(msg->iov_count <= rxr_ep->tx_iov_limit); + + rxr_perfset_start(rxr_ep, perf_rxr_tx); + fastlock_acquire(&rxr_ep->util_ep.lock); + + if (OFI_UNLIKELY(is_tx_res_full(rxr_ep))) { + err = -FI_EAGAIN; + goto out; + } + + peer = rxr_ep_get_peer(rxr_ep, msg->addr); + + if (peer->flags & RXR_PEER_IN_BACKOFF) { + err = -FI_EAGAIN; + goto out; + } + + tx_entry = rxr_ep_alloc_tx_entry(rxr_ep, msg, op, tag, flags); + + if (OFI_UNLIKELY(!tx_entry)) { + err = -FI_EAGAIN; + rxr_ep_progress_internal(rxr_ep); + goto out; + } + + assert(tx_entry->op == ofi_op_msg || tx_entry->op == ofi_op_tagged); + + tx_entry->msg_id = peer->next_msg_id++; + err = rxr_msg_post_rtm(rxr_ep, tx_entry); + if (OFI_UNLIKELY(err)) { + rxr_release_tx_entry(rxr_ep, tx_entry); + peer->next_msg_id--; + } + +out: + fastlock_release(&rxr_ep->util_ep.lock); + rxr_perfset_end(rxr_ep, perf_rxr_tx); + return err; +} + +/** + * Non-tagged send ops function + */ +static +ssize_t rxr_msg_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, + uint64_t flags) +{ + return rxr_msg_generic_send(ep, msg, 0, ofi_op_msg, flags); +} + +static +ssize_t rxr_msg_sendv(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, + void *context) +{ + struct rxr_ep *rxr_ep; + struct fi_msg msg = {0}; + + rxr_setup_msg(&msg, iov, desc, count, dest_addr, context, 0); + rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); + return rxr_msg_sendmsg(ep, &msg, rxr_tx_flags(rxr_ep)); +} + +static +ssize_t rxr_msg_send(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, void *context) +{ + struct iovec iov; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + return rxr_msg_sendv(ep, &iov, &desc, 1, dest_addr, context); +} + +static +ssize_t rxr_msg_senddata(struct fid_ep *ep, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + void *context) +{ + struct fi_msg msg = {0}; + struct iovec iov; + struct rxr_ep *rxr_ep; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + + rxr_setup_msg(&msg, &iov, &desc, 1, dest_addr, context, data); + rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); + return rxr_msg_generic_send(ep, &msg, 0, ofi_op_msg, + rxr_tx_flags(rxr_ep) | FI_REMOTE_CQ_DATA); +} + +static +ssize_t rxr_msg_inject(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t dest_addr) +{ + struct rxr_ep *rxr_ep; + struct fi_msg msg = {0}; + struct iovec iov; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + + rxr_setup_msg(&msg, &iov, NULL, 1, dest_addr, NULL, 0); + rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); + assert(len <= rxr_ep->core_inject_size - sizeof(struct rxr_eager_msgrtm_hdr)); + + return rxr_msg_generic_send(ep, &msg, 0, ofi_op_msg, + rxr_tx_flags(rxr_ep) | RXR_NO_COMPLETION | FI_INJECT); +} + +static +ssize_t rxr_msg_injectdata(struct fid_ep *ep, const void *buf, + size_t len, uint64_t 
data, + fi_addr_t dest_addr) +{ + struct rxr_ep *rxr_ep; + struct fi_msg msg; + struct iovec iov; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + + rxr_setup_msg(&msg, &iov, NULL, 1, dest_addr, NULL, data); + rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); + /* + * We advertise the largest possible inject size with no cq data or + * source address. This means that we may end up not using the core + * providers inject for this send. + */ + assert(len <= rxr_ep->core_inject_size - sizeof(struct rxr_eager_msgrtm_hdr)); + return rxr_msg_generic_send(ep, &msg, 0, ofi_op_msg, + rxr_tx_flags(rxr_ep) | RXR_NO_COMPLETION | + FI_REMOTE_CQ_DATA | FI_INJECT); +} + +/** + * Tagged send op functions + */ +static +ssize_t rxr_msg_tsendmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *tmsg, + uint64_t flags) +{ + struct fi_msg msg = {0}; + + rxr_setup_msg(&msg, tmsg->msg_iov, tmsg->desc, tmsg->iov_count, tmsg->addr, tmsg->context, tmsg->data); + return rxr_msg_generic_send(ep_fid, &msg, tmsg->tag, ofi_op_tagged, flags); +} + +static +ssize_t rxr_msg_tsendv(struct fid_ep *ep_fid, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, + uint64_t tag, void *context) +{ + struct rxr_ep *rxr_ep; + struct fi_msg_tagged msg = {0}; + + msg.msg_iov = iov; + msg.desc = desc; + msg.iov_count = count; + msg.addr = dest_addr; + msg.context = context; + msg.tag = tag; + + rxr_ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid); + return rxr_msg_tsendmsg(ep_fid, &msg, rxr_tx_flags(rxr_ep)); +} + +static +ssize_t rxr_msg_tsend(struct fid_ep *ep_fid, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, uint64_t tag, + void *context) +{ + struct iovec msg_iov; + + msg_iov.iov_base = (void *)buf; + msg_iov.iov_len = len; + return rxr_msg_tsendv(ep_fid, &msg_iov, &desc, 1, dest_addr, tag, + context); +} + +static +ssize_t rxr_msg_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + uint64_t tag, void *context) +{ + struct fi_msg msg = {0}; + struct iovec iov; + struct rxr_ep *rxr_ep; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + + rxr_setup_msg(&msg, &iov, &desc, 1, dest_addr, context, data); + rxr_ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid); + return rxr_msg_generic_send(ep_fid, &msg, tag, ofi_op_tagged, + rxr_tx_flags(rxr_ep) | FI_REMOTE_CQ_DATA); +} + +static +ssize_t rxr_msg_tinject(struct fid_ep *ep_fid, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t tag) +{ + struct rxr_ep *rxr_ep; + struct fi_msg msg = {0}; + struct iovec iov; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + + rxr_setup_msg(&msg, &iov, NULL, 1, dest_addr, NULL, 0); + rxr_ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid); + assert(len <= rxr_ep->core_inject_size - sizeof(struct rxr_eager_tagrtm_hdr)); + + return rxr_msg_generic_send(ep_fid, &msg, tag, ofi_op_tagged, + rxr_tx_flags(rxr_ep) | RXR_NO_COMPLETION | FI_INJECT); +} + +static +ssize_t rxr_msg_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t len, + uint64_t data, fi_addr_t dest_addr, uint64_t tag) +{ + struct rxr_ep *rxr_ep; + struct fi_msg msg = {0}; + struct iovec iov; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + + rxr_setup_msg(&msg, &iov, NULL, 1, dest_addr, NULL, data); + rxr_ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid); + /* + * We advertise the largest possible inject size with no cq data or + * source address. 
This means that we may end up not using the core + * providers inject for this send. + */ + assert(len <= rxr_ep->core_inject_size - sizeof(struct rxr_eager_tagrtm_hdr)); + + return rxr_msg_generic_send(ep_fid, &msg, tag, ofi_op_tagged, + rxr_tx_flags(rxr_ep) | RXR_NO_COMPLETION | + FI_REMOTE_CQ_DATA | FI_INJECT); +} + +/** + * Receive functions + */ + +/** + * Utility functions and data structures + */ +struct rxr_match_info { + fi_addr_t addr; + uint64_t tag; + uint64_t ignore; +}; + +static +int rxr_msg_match_unexp_anyaddr(struct dlist_entry *item, const void *arg) +{ + return 1; +} + +static +int rxr_msg_match_unexp(struct dlist_entry *item, const void *arg) +{ + const struct rxr_match_info *match_info = arg; + struct rxr_rx_entry *rx_entry; + + rx_entry = container_of(item, struct rxr_rx_entry, entry); + + return rxr_match_addr(match_info->addr, rx_entry->addr); +} + +static +int rxr_msg_match_unexp_tagged_anyaddr(struct dlist_entry *item, const void *arg) +{ + const struct rxr_match_info *match_info = arg; + struct rxr_rx_entry *rx_entry; + + rx_entry = container_of(item, struct rxr_rx_entry, entry); + + return rxr_match_tag(rx_entry->tag, match_info->ignore, + match_info->tag); +} + +static +int rxr_msg_match_unexp_tagged(struct dlist_entry *item, const void *arg) +{ + const struct rxr_match_info *match_info = arg; + struct rxr_rx_entry *rx_entry; + + rx_entry = container_of(item, struct rxr_rx_entry, entry); + + return rxr_match_addr(match_info->addr, rx_entry->addr) && + rxr_match_tag(rx_entry->tag, match_info->ignore, + match_info->tag); +} + +static +int rxr_msg_handle_unexp_match(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + uint64_t tag, uint64_t ignore, + void *context, fi_addr_t addr, + uint32_t op, uint64_t flags) +{ + struct rxr_pkt_entry *pkt_entry; + uint64_t data_len; + + rx_entry->fi_flags = flags; + rx_entry->ignore = ignore; + rx_entry->state = RXR_RX_MATCHED; + + pkt_entry = rx_entry->unexp_pkt; + rx_entry->unexp_pkt = NULL; + data_len = rxr_pkt_rtm_total_len(pkt_entry); + + rx_entry->cq_entry.op_context = context; + /* + * we don't expect recv buf from application for discard, + * hence setting to NULL + */ + if (OFI_UNLIKELY(flags & FI_DISCARD)) { + rx_entry->cq_entry.buf = NULL; + rx_entry->cq_entry.len = data_len; + } else { + rx_entry->cq_entry.buf = rx_entry->iov[0].iov_base; + data_len = MIN(rx_entry->total_len, + ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count)); + rx_entry->cq_entry.len = data_len; + } + + rx_entry->cq_entry.flags = (FI_RECV | FI_MSG); + + if (op == ofi_op_tagged) { + rx_entry->cq_entry.flags |= FI_TAGGED; + rx_entry->cq_entry.tag = rx_entry->tag; + rx_entry->ignore = ignore; + } else { + rx_entry->cq_entry.tag = 0; + rx_entry->ignore = ~0; + } + + return rxr_pkt_proc_matched_rtm(ep, rx_entry, pkt_entry); +} + +/* + * Search unexpected list for matching message and process it if found. + * Returns 0 if the message is processed, -FI_ENOMSG if no match is found. 
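+ * The match function is selected from the endpoint's capability bits:
+ * with FI_DIRECTED_RECV the source address is matched in addition to
+ * the tag/ignore pair for tagged receives; otherwise the *_anyaddr
+ * variants accept a message from any source.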
+ */ +static +int rxr_msg_proc_unexp_msg_list(struct rxr_ep *ep, const struct fi_msg *msg, + uint64_t tag, uint64_t ignore, uint32_t op, uint64_t flags, + struct rxr_rx_entry *posted_entry) +{ + struct rxr_match_info match_info; + struct dlist_entry *match; + struct rxr_rx_entry *rx_entry; + dlist_func_t *match_func; + int ret; + + if (op == ofi_op_tagged) { + if (ep->util_ep.caps & FI_DIRECTED_RECV) + match_func = &rxr_msg_match_unexp_tagged; + else + match_func = &rxr_msg_match_unexp_tagged_anyaddr; + + match_info.addr = msg->addr; + match_info.tag = tag; + match_info.ignore = ignore; + match = dlist_remove_first_match(&ep->rx_unexp_tagged_list, + match_func, + (void *)&match_info); + } else { + if (ep->util_ep.caps & FI_DIRECTED_RECV) + match_func = &rxr_msg_match_unexp; + else + match_func = &rxr_msg_match_unexp_anyaddr; + + match_info.addr = msg->addr; + match = dlist_remove_first_match(&ep->rx_unexp_list, + match_func, + (void *)&match_info); + } + + if (!match) + return -FI_ENOMSG; + + rx_entry = container_of(match, struct rxr_rx_entry, entry); + + /* + * Initialize the matched entry as a multi-recv consumer if the posted + * buffer is a multi-recv buffer. + */ + if (posted_entry) { + /* + * rxr_ep_split_rx_entry will setup rx_entry iov and count + */ + rx_entry = rxr_ep_split_rx_entry(ep, posted_entry, rx_entry, + rx_entry->unexp_pkt); + if (OFI_UNLIKELY(!rx_entry)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "RX entries exhausted.\n"); + return -FI_ENOBUFS; + } + } else { + memcpy(rx_entry->iov, msg->msg_iov, sizeof(*rx_entry->iov) * msg->iov_count); + rx_entry->iov_count = msg->iov_count; + } + + if (msg->desc) + memcpy(rx_entry->desc, msg->desc, sizeof(void*) * msg->iov_count); + + FI_DBG(&rxr_prov, FI_LOG_EP_CTRL, + "Match found in unexp list for a posted recv msg_id: %" PRIu32 + " total_len: %" PRIu64 " tag: %lx\n", + rx_entry->msg_id, rx_entry->total_len, rx_entry->tag); + + ret = rxr_msg_handle_unexp_match(ep, rx_entry, tag, ignore, + msg->context, msg->addr, op, flags); + return ret; +} + +bool rxr_msg_multi_recv_buffer_available(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry) +{ + assert(rx_entry->fi_flags & FI_MULTI_RECV); + assert(rx_entry->rxr_flags & RXR_MULTI_RECV_POSTED); + + return (ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count) + >= ep->min_multi_recv_size); +} + +static inline +bool rxr_msg_multi_recv_buffer_complete(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry) +{ + assert(rx_entry->fi_flags & FI_MULTI_RECV); + assert(rx_entry->rxr_flags & RXR_MULTI_RECV_POSTED); + + return (!rxr_msg_multi_recv_buffer_available(ep, rx_entry) && + dlist_empty(&rx_entry->multi_recv_consumers)); +} + +void rxr_msg_multi_recv_free_posted_entry(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry) +{ + assert(!(rx_entry->rxr_flags & RXR_MULTI_RECV_POSTED)); + + if ((rx_entry->rxr_flags & RXR_MULTI_RECV_CONSUMER) && + rxr_msg_multi_recv_buffer_complete(ep, rx_entry->master_entry)) + rxr_release_rx_entry(ep, rx_entry->master_entry); +} + +static +ssize_t rxr_msg_multi_recv(struct rxr_ep *rxr_ep, const struct fi_msg *msg, + uint64_t tag, uint64_t ignore, uint32_t op, uint64_t flags) +{ + struct rxr_rx_entry *rx_entry; + int ret = 0; + + if ((ofi_total_iov_len(msg->msg_iov, msg->iov_count) + < rxr_ep->min_multi_recv_size) || op != ofi_op_msg) + return -FI_EINVAL; + + /* + * Always get new rx_entry of type RXR_MULTI_RECV_POSTED when in the + * multi recv path. 
The posted entry will not be used for receiving + * messages but will be used for tracking the application's buffer and + * when to write the completion to release the buffer. + */ + rx_entry = rxr_ep_get_rx_entry(rxr_ep, msg, tag, ignore, op, flags); + if (OFI_UNLIKELY(!rx_entry)) { + rxr_ep_progress_internal(rxr_ep); + return -FI_EAGAIN; + } + + rx_entry->rxr_flags |= RXR_MULTI_RECV_POSTED; + dlist_init(&rx_entry->multi_recv_consumers); + dlist_init(&rx_entry->multi_recv_entry); + + while (!dlist_empty(&rxr_ep->rx_unexp_list)) { + ret = rxr_msg_proc_unexp_msg_list(rxr_ep, msg, tag, + ignore, op, flags, rx_entry); + + if (!rxr_msg_multi_recv_buffer_available(rxr_ep, rx_entry)) { + /* + * Multi recv buffer consumed by short, unexp messages, + * free posted rx_entry. + */ + if (rxr_msg_multi_recv_buffer_complete(rxr_ep, rx_entry)) + rxr_release_rx_entry(rxr_ep, rx_entry); + /* + * Multi recv buffer has been consumed, but waiting on + * long msg completion. Last msg completion will free + * posted rx_entry. + */ + if (ret == -FI_ENOMSG) + return 0; + return ret; + } + + if (ret == -FI_ENOMSG) { + ret = 0; + break; + } + + /* + * Error was encountered when processing unexpected messages, + * but there is buffer space available. Add the posted entry to + * the rx_list. + */ + if (ret) + break; + } + + dlist_insert_tail(&rx_entry->entry, &rxr_ep->rx_list); + return ret; +} + +void rxr_msg_multi_recv_handle_completion(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry) +{ + assert(!(rx_entry->rxr_flags & RXR_MULTI_RECV_POSTED) && + (rx_entry->rxr_flags & RXR_MULTI_RECV_CONSUMER)); + + dlist_remove(&rx_entry->multi_recv_entry); + rx_entry->rxr_flags &= ~RXR_MULTI_RECV_CONSUMER; + + if (!rxr_msg_multi_recv_buffer_complete(ep, rx_entry->master_entry)) + return; + + /* + * Buffer is consumed and all messages have been received. Update the + * last message to release the application buffer. + */ + rx_entry->cq_entry.flags |= FI_MULTI_RECV; +} + +/* + * create a rx entry and verify in unexpected message list + * else add to posted recv list + */ +static +ssize_t rxr_msg_generic_recv(struct fid_ep *ep, const struct fi_msg *msg, + uint64_t tag, uint64_t ignore, uint32_t op, + uint64_t flags) +{ + ssize_t ret = 0; + struct rxr_ep *rxr_ep; + struct dlist_entry *unexp_list; + struct rxr_rx_entry *rx_entry; + uint64_t rx_op_flags; + + FI_DBG(&rxr_prov, FI_LOG_EP_DATA, + "%s: iov_len: %lu tag: %lx ignore: %lx op: %x flags: %lx\n", + __func__, ofi_total_iov_len(msg->msg_iov, msg->iov_count), tag, ignore, + op, flags); + + rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); + + assert(msg->iov_count <= rxr_ep->rx_iov_limit); + + rxr_perfset_start(rxr_ep, perf_rxr_recv); + + assert(rxr_ep->util_ep.rx_msg_flags == 0 || rxr_ep->util_ep.rx_msg_flags == FI_COMPLETION); + rx_op_flags = rxr_ep->util_ep.rx_op_flags; + if (rxr_ep->util_ep.rx_msg_flags == 0) + rx_op_flags &= ~FI_COMPLETION; + flags = flags | rx_op_flags; + + fastlock_acquire(&rxr_ep->util_ep.lock); + if (OFI_UNLIKELY(is_rx_res_full(rxr_ep))) { + ret = -FI_EAGAIN; + goto out; + } + + if (flags & FI_MULTI_RECV) { + ret = rxr_msg_multi_recv(rxr_ep, msg, tag, ignore, op, flags); + goto out; + } + + unexp_list = (op == ofi_op_tagged) ? &rxr_ep->rx_unexp_tagged_list : + &rxr_ep->rx_unexp_list; + + /* + * Attempt to match against stashed unexpected messages. This is not + * applicable to the zero-copy path where unexpected messages are not + * applicable, since there's no tag or address to match against. 
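+	 * In that path the posted buffer is additionally handed to the
+	 * device endpoint via rxr_ep_post_buf() below, so incoming data
+	 * can land in the application's buffer without a bounce-buffer
+	 * copy.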
+ */ + if (!dlist_empty(unexp_list) && !rxr_ep->use_zcpy_rx) { + ret = rxr_msg_proc_unexp_msg_list(rxr_ep, msg, tag, + ignore, op, flags, NULL); + + if (ret != -FI_ENOMSG) + goto out; + ret = 0; + } + + rx_entry = rxr_ep_get_rx_entry(rxr_ep, msg, tag, + ignore, op, flags); + + if (OFI_UNLIKELY(!rx_entry)) { + ret = -FI_EAGAIN; + rxr_ep_progress_internal(rxr_ep); + goto out; + } + + if (op == ofi_op_tagged) + dlist_insert_tail(&rx_entry->entry, &rxr_ep->rx_tagged_list); + else + dlist_insert_tail(&rx_entry->entry, &rxr_ep->rx_list); + + if (rxr_ep->use_zcpy_rx) + rxr_ep_post_buf(rxr_ep, msg, flags, EFA_EP); + +out: + fastlock_release(&rxr_ep->util_ep.lock); + + rxr_perfset_end(rxr_ep, perf_rxr_recv); + return ret; +} + +static +ssize_t rxr_msg_discard_trecv(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + const struct fi_msg_tagged *msg, + int64_t flags) +{ + int ret; + + if ((flags & FI_DISCARD) && !(flags & (FI_PEEK | FI_CLAIM))) + return -FI_EINVAL; + + rx_entry->fi_flags |= FI_DISCARD; + rx_entry->rxr_flags |= RXR_RECV_CANCEL; + ret = ofi_cq_write(ep->util_ep.rx_cq, msg->context, + FI_TAGGED | FI_RECV | FI_MSG, + 0, NULL, rx_entry->cq_entry.data, + rx_entry->cq_entry.tag); + rxr_rm_rx_cq_check(ep, ep->util_ep.rx_cq); + return ret; +} + +static +ssize_t rxr_msg_claim_trecv(struct fid_ep *ep_fid, + const struct fi_msg_tagged *msg, + int64_t flags) +{ + ssize_t ret = 0; + struct rxr_ep *ep; + struct rxr_rx_entry *rx_entry; + struct fi_context *context; + + ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid); + fastlock_acquire(&ep->util_ep.lock); + + context = (struct fi_context *)msg->context; + rx_entry = (struct rxr_rx_entry *)context->internal[0]; + + if (flags & FI_DISCARD) { + ret = rxr_msg_discard_trecv(ep, rx_entry, msg, flags); + if (OFI_UNLIKELY(ret)) + goto out; + } + + /* + * Handle unexp match entry even for discard entry as we are sinking + * messages for that case + */ + memcpy(rx_entry->iov, msg->msg_iov, + sizeof(*msg->msg_iov) * msg->iov_count); + rx_entry->iov_count = msg->iov_count; + + ret = rxr_msg_handle_unexp_match(ep, rx_entry, msg->tag, + msg->ignore, msg->context, + msg->addr, ofi_op_tagged, flags); + +out: + fastlock_release(&ep->util_ep.lock); + return ret; +} + +static +ssize_t rxr_msg_peek_trecv(struct fid_ep *ep_fid, + const struct fi_msg_tagged *msg, + uint64_t flags) +{ + ssize_t ret = 0; + struct rxr_ep *ep; + struct dlist_entry *match; + dlist_func_t *match_func; + struct rxr_match_info match_info; + struct rxr_rx_entry *rx_entry; + struct fi_context *context; + struct rxr_pkt_entry *pkt_entry; + size_t data_len; + int64_t tag; + + ep = container_of(ep_fid, struct rxr_ep, util_ep.ep_fid.fid); + + fastlock_acquire(&ep->util_ep.lock); + + rxr_ep_progress_internal(ep); + match_info.addr = msg->addr; + match_info.tag = msg->tag; + match_info.ignore = msg->ignore; + + if (ep->util_ep.caps & FI_DIRECTED_RECV) + match_func = &rxr_msg_match_unexp_tagged; + else + match_func = &rxr_msg_match_unexp_tagged_anyaddr; + + match = dlist_find_first_match(&ep->rx_unexp_tagged_list, + match_func, + (void *)&match_info); + if (!match) { + FI_DBG(&rxr_prov, FI_LOG_EP_CTRL, + "Message not found addr: %" PRIu64 + " tag: %lx ignore %lx\n", msg->addr, msg->tag, + msg->ignore); + ret = ofi_cq_write_error_peek(ep->util_ep.rx_cq, msg->tag, + msg->context); + goto out; + } + + rx_entry = container_of(match, struct rxr_rx_entry, entry); + context = (struct fi_context *)msg->context; + if (flags & FI_CLAIM) { + context->internal[0] = rx_entry; + dlist_remove(match); + } 
else if (flags & FI_DISCARD) { + dlist_remove(match); + + ret = rxr_msg_discard_trecv(ep, rx_entry, msg, flags); + if (ret) + goto out; + + memcpy(rx_entry->iov, msg->msg_iov, + sizeof(*msg->msg_iov) * msg->iov_count); + rx_entry->iov_count = msg->iov_count; + + ret = rxr_msg_handle_unexp_match(ep, rx_entry, + msg->tag, msg->ignore, + msg->context, msg->addr, + ofi_op_tagged, flags); + + goto out; + } + + pkt_entry = rx_entry->unexp_pkt; + data_len = rxr_pkt_rtm_total_len(pkt_entry); + tag = rxr_pkt_rtm_tag(pkt_entry); + + if (ep->util_ep.caps & FI_SOURCE) + ret = ofi_cq_write_src(ep->util_ep.rx_cq, context, + FI_TAGGED | FI_RECV, + data_len, NULL, + rx_entry->cq_entry.data, tag, + rx_entry->addr); + else + ret = ofi_cq_write(ep->util_ep.rx_cq, context, + FI_TAGGED | FI_RECV, + data_len, NULL, + rx_entry->cq_entry.data, tag); + rxr_rm_rx_cq_check(ep, ep->util_ep.rx_cq); +out: + fastlock_release(&ep->util_ep.lock); + return ret; +} + +/** + * Non-tagged receive ops + */ +static +ssize_t rxr_msg_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, + uint64_t flags) +{ + return rxr_msg_generic_recv(ep_fid, msg, 0, 0, ofi_op_msg, flags); +} + +static +ssize_t rxr_msg_recv(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, void *context) +{ + struct fi_msg msg = {0}; + struct iovec iov; + + iov.iov_base = buf; + iov.iov_len = len; + + rxr_setup_msg(&msg, &iov, &desc, 1, src_addr, context, 0); + return rxr_msg_recvmsg(ep, &msg, 0); +} + +static +ssize_t rxr_msg_recvv(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + void *context) +{ + struct fi_msg msg = {0}; + + rxr_setup_msg(&msg, iov, desc, count, src_addr, context, 0); + return rxr_msg_recvmsg(ep, &msg, 0); +} + +/** + * Tagged receive ops functions + */ +static +ssize_t rxr_msg_trecv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, + fi_addr_t src_addr, uint64_t tag, uint64_t ignore, + void *context) +{ + struct fi_msg msg = {0}; + struct iovec iov; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + + rxr_setup_msg(&msg, &iov, &desc, 1, src_addr, context, 0); + return rxr_msg_generic_recv(ep_fid, &msg, tag, ignore, ofi_op_tagged, 0); +} + +static +ssize_t rxr_msg_trecvv(struct fid_ep *ep_fid, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, void *context) +{ + struct fi_msg msg = {0}; + + rxr_setup_msg(&msg, iov, desc, count, src_addr, context, 0); + return rxr_msg_generic_recv(ep_fid, &msg, tag, ignore, ofi_op_tagged, 0); +} + +static +ssize_t rxr_msg_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *tmsg, + uint64_t flags) +{ + ssize_t ret; + struct fi_msg msg = {0}; + + if (flags & FI_PEEK) { + ret = rxr_msg_peek_trecv(ep_fid, tmsg, flags); + goto out; + } else if (flags & FI_CLAIM) { + ret = rxr_msg_claim_trecv(ep_fid, tmsg, flags); + goto out; + } + + rxr_setup_msg(&msg, tmsg->msg_iov, tmsg->desc, tmsg->iov_count, tmsg->addr, tmsg->context, tmsg->data); + ret = rxr_msg_generic_recv(ep_fid, &msg, tmsg->tag, tmsg->ignore, + ofi_op_tagged, flags); + +out: + return ret; +} + +/** + * Ops structures used by rxr_endpoint() + */ +struct fi_ops_msg rxr_ops_msg = { + .size = sizeof(struct fi_ops_msg), + .send = rxr_msg_send, + .sendv = rxr_msg_sendv, + .sendmsg = rxr_msg_sendmsg, + .senddata = rxr_msg_senddata, + .inject = rxr_msg_inject, + .injectdata = rxr_msg_injectdata, + .recv = rxr_msg_recv, + .recvv = rxr_msg_recvv, + .recvmsg = rxr_msg_recvmsg, +}; + +struct fi_ops_tagged 
rxr_ops_tagged = { + .size = sizeof(struct fi_ops_tagged), + .send = rxr_msg_tsend, + .sendv = rxr_msg_tsendv, + .sendmsg = rxr_msg_tsendmsg, + .senddata = rxr_msg_tsenddata, + .inject = rxr_msg_tinject, + .injectdata = rxr_msg_tinjectdata, + .recv = rxr_msg_trecv, + .recvv = rxr_msg_trecvv, + .recvmsg = rxr_msg_trecvmsg, +}; + diff --git a/prov/efa/src/rxr/rxr_msg.h b/prov/efa/src/rxr/rxr_msg.h new file mode 100644 index 00000000000..58349d147f5 --- /dev/null +++ b/prov/efa/src/rxr/rxr_msg.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * multi recv related functions + */ +bool rxr_msg_multi_recv_buffer_available(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry); + +void rxr_msg_multi_recv_handle_completion(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry); + +void rxr_msg_multi_recv_free_posted_entry(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry); + +/* + * The following 2 OP structures are defined in rxr_msg.c and is + * used by rxr_endpoint() + */ +extern struct fi_ops_msg rxr_ops_msg; + +extern struct fi_ops_tagged rxr_ops_tagged; + +ssize_t rxr_msg_post_medium_rtm(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry); + +ssize_t rxr_msg_post_medium_rtm_or_queue(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry); diff --git a/prov/efa/src/rxr/rxr_pkt_cmd.c b/prov/efa/src/rxr/rxr_pkt_cmd.c new file mode 100644 index 00000000000..2382069adaa --- /dev/null +++ b/prov/efa/src/rxr/rxr_pkt_cmd.c @@ -0,0 +1,1046 @@ +/* + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "efa.h" +#include "rxr.h" +#include "rxr_msg.h" +#include "rxr_cntr.h" +#include "rxr_read.h" +#include "rxr_pkt_cmd.h" + +/* Handshake wait timeout in microseconds */ +#define RXR_HANDSHAKE_WAIT_TIMEOUT 1000000 + +/* This file implements 4 actions that can be applied to a packet: + * posting, + * handling send completion and, + * handing recv completion. + * dump (for debug only) + */ + +/* + * Functions used to post a packet + */ +ssize_t rxr_pkt_post_data(struct rxr_ep *rxr_ep, + struct rxr_tx_entry *tx_entry) +{ + struct rxr_pkt_entry *pkt_entry; + struct rxr_data_pkt *data_pkt; + ssize_t ret; + + pkt_entry = rxr_pkt_entry_alloc(rxr_ep, rxr_ep->tx_pkt_efa_pool); + if (OFI_UNLIKELY(!pkt_entry)) { + FI_DBG(&rxr_prov, FI_LOG_EP_DATA, + "TX packets exhausted, current packets in flight %lu", + rxr_ep->tx_pending); + return -FI_EAGAIN; + } + + pkt_entry->x_entry = (void *)tx_entry; + pkt_entry->addr = tx_entry->addr; + + data_pkt = (struct rxr_data_pkt *)pkt_entry->pkt; + + data_pkt->hdr.type = RXR_DATA_PKT; + data_pkt->hdr.version = RXR_BASE_PROTOCOL_VERSION; + data_pkt->hdr.flags = 0; + + data_pkt->hdr.rx_id = tx_entry->rx_id; + + /* + * Data packets are sent in order so using bytes_sent is okay here. 
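+	 * For example, if every packet carries seg_size bytes of payload,
+	 * the segments go out with seg_offset values 0, seg_size,
+	 * 2 * seg_size, ... and the receiver can use seg_offset as the
+	 * destination offset when copying into the matched receive buffer
+	 * (rxr_pkt_copy_to_rx() below takes exactly such an offset).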
+ */ + data_pkt->hdr.seg_offset = tx_entry->bytes_sent; + + if (tx_entry->desc[0]) + ret = rxr_pkt_send_data_desc(rxr_ep, tx_entry, pkt_entry); + else + ret = rxr_pkt_send_data(rxr_ep, tx_entry, pkt_entry); + + if (OFI_UNLIKELY(ret)) { + rxr_pkt_entry_release_tx(rxr_ep, pkt_entry); + return ret; + } + + data_pkt = rxr_get_data_pkt(pkt_entry->pkt); + tx_entry->bytes_sent += data_pkt->hdr.seg_size; + tx_entry->window -= data_pkt->hdr.seg_size; + assert(data_pkt->hdr.seg_size > 0); + assert(tx_entry->window >= 0); + return ret; +} + +/* + * rxr_pkt_init_ctrl() uses init functions declared in rxr_pkt_type.h + */ +static +int rxr_pkt_init_ctrl(struct rxr_ep *rxr_ep, int entry_type, void *x_entry, + int ctrl_type, struct rxr_pkt_entry *pkt_entry) +{ + int ret = 0; + + switch (ctrl_type) { + case RXR_READRSP_PKT: + ret = rxr_pkt_init_readrsp(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_CTS_PKT: + ret = rxr_pkt_init_cts(rxr_ep, (struct rxr_rx_entry *)x_entry, pkt_entry); + break; + case RXR_EOR_PKT: + ret = rxr_pkt_init_eor(rxr_ep, (struct rxr_rx_entry *)x_entry, pkt_entry); + break; + case RXR_ATOMRSP_PKT: + ret = rxr_pkt_init_atomrsp(rxr_ep, (struct rxr_rx_entry *)x_entry, pkt_entry); + break; + case RXR_RECEIPT_PKT: + ret = rxr_pkt_init_receipt(rxr_ep, (struct rxr_rx_entry *)x_entry, pkt_entry); + break; + case RXR_EAGER_MSGRTM_PKT: + ret = rxr_pkt_init_eager_msgrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_EAGER_TAGRTM_PKT: + ret = rxr_pkt_init_eager_tagrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_MEDIUM_MSGRTM_PKT: + ret = rxr_pkt_init_medium_msgrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_MEDIUM_TAGRTM_PKT: + ret = rxr_pkt_init_medium_tagrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_LONG_MSGRTM_PKT: + ret = rxr_pkt_init_long_msgrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_LONG_TAGRTM_PKT: + ret = rxr_pkt_init_long_tagrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_READ_MSGRTM_PKT: + ret = rxr_pkt_init_read_msgrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_READ_TAGRTM_PKT: + ret = rxr_pkt_init_read_tagrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_EAGER_RTW_PKT: + ret = rxr_pkt_init_eager_rtw(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_LONG_RTW_PKT: + ret = rxr_pkt_init_long_rtw(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_READ_RTW_PKT: + ret = rxr_pkt_init_read_rtw(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_SHORT_RTR_PKT: + ret = rxr_pkt_init_short_rtr(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_LONG_RTR_PKT: + ret = rxr_pkt_init_long_rtr(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_WRITE_RTA_PKT: + ret = rxr_pkt_init_write_rta(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_FETCH_RTA_PKT: + ret = rxr_pkt_init_fetch_rta(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_COMPARE_RTA_PKT: + ret = rxr_pkt_init_compare_rta(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_DC_EAGER_MSGRTM_PKT: + ret = rxr_pkt_init_dc_eager_msgrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_DC_EAGER_TAGRTM_PKT: + ret = rxr_pkt_init_dc_eager_tagrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case 
RXR_DC_MEDIUM_MSGRTM_PKT: + ret = rxr_pkt_init_dc_medium_msgrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_DC_MEDIUM_TAGRTM_PKT: + ret = rxr_pkt_init_dc_medium_tagrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_DC_LONG_MSGRTM_PKT: + ret = rxr_pkt_init_dc_long_msgrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_DC_LONG_TAGRTM_PKT: + ret = rxr_pkt_init_dc_long_tagrtm(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_DC_EAGER_RTW_PKT: + ret = rxr_pkt_init_dc_eager_rtw(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_DC_LONG_RTW_PKT: + ret = rxr_pkt_init_dc_long_rtw(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + case RXR_DC_WRITE_RTA_PKT: + ret = rxr_pkt_init_dc_write_rta(rxr_ep, (struct rxr_tx_entry *)x_entry, pkt_entry); + break; + default: + ret = -FI_EINVAL; + assert(0 && "unknown pkt type to init"); + break; + } + + return ret; +} + +/* + * rxr_pkt_handle_ctrl_sent() uses handle_sent() functions declared in rxr_pkt_type.h + */ +static +void rxr_pkt_handle_ctrl_sent(struct rxr_ep *rxr_ep, struct rxr_pkt_entry *pkt_entry) +{ + int ctrl_type = rxr_get_base_hdr(pkt_entry->pkt)->type; + + switch (ctrl_type) { + case RXR_READRSP_PKT: + rxr_pkt_handle_readrsp_sent(rxr_ep, pkt_entry); + break; + case RXR_CTS_PKT: + rxr_pkt_handle_cts_sent(rxr_ep, pkt_entry); + break; + case RXR_EOR_PKT: + rxr_pkt_handle_eor_sent(rxr_ep, pkt_entry); + break; + case RXR_ATOMRSP_PKT: + rxr_pkt_handle_atomrsp_sent(rxr_ep, pkt_entry); + break; + case RXR_RECEIPT_PKT: + rxr_pkt_handle_receipt_sent(rxr_ep, pkt_entry); + break; + case RXR_EAGER_MSGRTM_PKT: + case RXR_EAGER_TAGRTM_PKT: + rxr_pkt_handle_eager_rtm_sent(rxr_ep, pkt_entry); + break; + case RXR_MEDIUM_MSGRTM_PKT: + case RXR_MEDIUM_TAGRTM_PKT: + case RXR_DC_MEDIUM_MSGRTM_PKT: + case RXR_DC_MEDIUM_TAGRTM_PKT: + rxr_pkt_handle_medium_rtm_sent(rxr_ep, pkt_entry); + break; + case RXR_LONG_MSGRTM_PKT: + case RXR_DC_LONG_MSGRTM_PKT: + case RXR_LONG_TAGRTM_PKT: + case RXR_DC_LONG_TAGRTM_PKT: + rxr_pkt_handle_long_rtm_sent(rxr_ep, pkt_entry); + break; + case RXR_READ_MSGRTM_PKT: + case RXR_READ_TAGRTM_PKT: + rxr_pkt_handle_read_rtm_sent(rxr_ep, pkt_entry); + break; + case RXR_EAGER_RTW_PKT: + rxr_pkt_handle_eager_rtw_sent(rxr_ep, pkt_entry); + break; + case RXR_LONG_RTW_PKT: + case RXR_DC_LONG_RTW_PKT: + rxr_pkt_handle_long_rtw_sent(rxr_ep, pkt_entry); + break; + case RXR_READ_RTW_PKT: + rxr_pkt_handle_read_rtw_sent(rxr_ep, pkt_entry); + break; + case RXR_SHORT_RTR_PKT: + case RXR_LONG_RTR_PKT: + rxr_pkt_handle_rtr_sent(rxr_ep, pkt_entry); + break; + case RXR_WRITE_RTA_PKT: + case RXR_DC_WRITE_RTA_PKT: + case RXR_FETCH_RTA_PKT: + case RXR_COMPARE_RTA_PKT: + rxr_pkt_handle_rta_sent(rxr_ep, pkt_entry); + break; + case RXR_DC_EAGER_MSGRTM_PKT: + case RXR_DC_EAGER_TAGRTM_PKT: + case RXR_DC_EAGER_RTW_PKT: + break; + default: + assert(0 && "Unknown packet type to handle sent"); + break; + } +} + +ssize_t rxr_pkt_post_ctrl_once(struct rxr_ep *rxr_ep, int entry_type, void *x_entry, + int ctrl_type, bool inject) +{ + struct rxr_pkt_sendv send; + struct rxr_pkt_entry *pkt_entry; + struct rxr_tx_entry *tx_entry; + struct rxr_rx_entry *rx_entry; + struct rxr_peer *peer; + ssize_t err; + fi_addr_t addr; + + if (entry_type == RXR_TX_ENTRY) { + tx_entry = (struct rxr_tx_entry *)x_entry; + addr = tx_entry->addr; + } else { + rx_entry = (struct rxr_rx_entry *)x_entry; + addr = rx_entry->addr; + } + + peer = rxr_ep_get_peer(rxr_ep, addr); + if 
(peer->is_local) { + assert(rxr_ep->use_shm); + pkt_entry = rxr_pkt_entry_alloc(rxr_ep, rxr_ep->tx_pkt_shm_pool); + } else { + pkt_entry = rxr_pkt_entry_alloc(rxr_ep, rxr_ep->tx_pkt_efa_pool); + } + + if (!pkt_entry) + return -FI_EAGAIN; + + send.iov_count = 0; + pkt_entry->send = &send; + + /* + * rxr_pkt_init_ctrl will set pkt_entry->send if it want to use multi iov + */ + err = rxr_pkt_init_ctrl(rxr_ep, entry_type, x_entry, ctrl_type, pkt_entry); + if (OFI_UNLIKELY(err)) { + rxr_pkt_entry_release_tx(rxr_ep, pkt_entry); + return err; + } + + /* if send, tx_pkt_entry will be released while handle completion + * if inject, there will not be completion, therefore tx_pkt_entry has to be + * released here + */ + if (inject) + err = rxr_pkt_entry_inject(rxr_ep, pkt_entry, addr); + else if (pkt_entry->send->iov_count > 0) + err = rxr_pkt_entry_sendv(rxr_ep, pkt_entry, addr, + pkt_entry->send->iov, pkt_entry->send->desc, + pkt_entry->send->iov_count, 0); + else + err = rxr_pkt_entry_send(rxr_ep, pkt_entry, addr); + + pkt_entry->send = NULL; + if (OFI_UNLIKELY(err)) { + rxr_pkt_entry_release_tx(rxr_ep, pkt_entry); + return err; + } + + peer->flags |= RXR_PEER_REQ_SENT; + rxr_pkt_handle_ctrl_sent(rxr_ep, pkt_entry); + if (inject) + rxr_pkt_entry_release_tx(rxr_ep, pkt_entry); + + return 0; +} + +ssize_t rxr_pkt_post_ctrl(struct rxr_ep *ep, int entry_type, void *x_entry, + int ctrl_type, bool inject) +{ + ssize_t err; + struct rxr_tx_entry *tx_entry; + + if (ctrl_type == RXR_MEDIUM_TAGRTM_PKT || + ctrl_type == RXR_MEDIUM_MSGRTM_PKT || + ctrl_type == RXR_DC_MEDIUM_MSGRTM_PKT || + ctrl_type == RXR_DC_MEDIUM_TAGRTM_PKT) { + assert(entry_type == RXR_TX_ENTRY); + assert(!inject); + + tx_entry = (struct rxr_tx_entry *)x_entry; + while (tx_entry->bytes_sent < tx_entry->total_len) { + err = rxr_pkt_post_ctrl_once(ep, RXR_TX_ENTRY, x_entry, ctrl_type, 0); + if (OFI_UNLIKELY(err)) + return err; + } + + return 0; + } + + return rxr_pkt_post_ctrl_once(ep, entry_type, x_entry, ctrl_type, inject); +} + +ssize_t rxr_pkt_post_ctrl_or_queue(struct rxr_ep *ep, int entry_type, void *x_entry, int ctrl_type, bool inject) +{ + ssize_t err; + struct rxr_tx_entry *tx_entry; + struct rxr_rx_entry *rx_entry; + + err = rxr_pkt_post_ctrl(ep, entry_type, x_entry, ctrl_type, inject); + if (err == -FI_EAGAIN) { + if (entry_type == RXR_TX_ENTRY) { + tx_entry = (struct rxr_tx_entry *)x_entry; + assert(tx_entry->state != RXR_TX_QUEUED_CTRL || + tx_entry->state != RXR_TX_QUEUED_REQ_RNR); + tx_entry->state = RXR_TX_QUEUED_CTRL; + tx_entry->queued_ctrl.type = ctrl_type; + tx_entry->queued_ctrl.inject = inject; + dlist_insert_tail(&tx_entry->queued_entry, + &ep->tx_entry_queued_list); + } else { + assert(entry_type == RXR_RX_ENTRY); + rx_entry = (struct rxr_rx_entry *)x_entry; + assert(rx_entry->state != RXR_RX_QUEUED_CTRL || + rx_entry->state != RXR_RX_QUEUED_CTS_RNR); + rx_entry->state = RXR_RX_QUEUED_CTRL; + rx_entry->queued_ctrl.type = ctrl_type; + rx_entry->queued_ctrl.inject = inject; + dlist_insert_tail(&rx_entry->queued_entry, + &ep->rx_entry_queued_list); + } + + err = 0; + } + + return err; +} + +/* + * This function is used for any extra feature that does not have an alternative. + * + * This function will send a eager rtw packet to trigger handshake. + * + * We do not send eager rtm packets here because the receiver might require + * ordering and an extra eager rtm will interrupt the reorder + * process. + * + * ep: The endpoint on which the packet for triggering handshake will be sent. 
+ * peer: The peer from which the sender receives handshake. + * addr: The address of the peer. + * + * This function will return 0 if sender successfully receives / have already + * received the handshake from the peer + * + * This function will return FI_EAGAIN if it fails to allocate or send the trigger packet. + * It will return FI_ETIMEDOUT if it fails to receive + * handshake packet within a certain period of time. + */ + +ssize_t rxr_pkt_wait_handshake(struct rxr_ep *ep, fi_addr_t addr, struct rxr_peer *peer) +{ + ssize_t ret; + + uint64_t current, endwait; + + ret = rxr_pkt_trigger_handshake(ep, addr, peer); + if (OFI_UNLIKELY(ret)) + return ret; + + current = ofi_gettime_us(); + endwait = current + RXR_HANDSHAKE_WAIT_TIMEOUT; + + while (current < endwait && + !(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)) { + rxr_ep_progress_internal(ep); + current = ofi_gettime_us(); + } + + if (!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)) { + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, + "did not get handshake back in %f second(s). returning -FI_EAGAIN!\n", + RXR_HANDSHAKE_WAIT_TIMEOUT * 1e-6); + return -FI_EAGAIN; + } + + return 0; +} + +/* + * This function is used for any extra feature that does not have an + * alternative. + * + * This function will send a eager rtw packet to trigger handshake. + * + * We do not send eager rtm packets here because the receiver might require + * ordering and an extra eager rtm will interrupt the reorder + * process. + * + * ep: The endpoint on which the packet for triggering handshake will be sent. + * peer: The peer from which the sender receives handshake. + * addr: The address of the peer. + * + * This function will return 0 if the eager rtw packet is successfully sent. + */ +ssize_t rxr_pkt_trigger_handshake(struct rxr_ep *ep, + fi_addr_t addr, struct rxr_peer *peer) +{ + struct rxr_tx_entry *tx_entry; + ssize_t err; + + if ((peer->flags & RXR_PEER_HANDSHAKE_RECEIVED) || + (peer->flags & RXR_PEER_REQ_SENT)) + return 0; + + tx_entry = ofi_buf_alloc(ep->tx_entry_pool); + if (OFI_UNLIKELY(!tx_entry)) { + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "TX entries exhausted.\n"); + return -FI_EAGAIN; + } + + tx_entry->total_len = 0; + tx_entry->addr = addr; + tx_entry->msg_id = -1; + tx_entry->cq_entry.flags = FI_RMA | FI_WRITE; + tx_entry->cq_entry.buf = NULL; + dlist_init(&tx_entry->queued_pkts); + + tx_entry->type = RXR_TX_ENTRY; + tx_entry->op = ofi_op_write; + tx_entry->state = RXR_TX_REQ; + + tx_entry->send_flags = 0; + tx_entry->bytes_acked = 0; + tx_entry->bytes_sent = 0; + tx_entry->window = 0; + tx_entry->rma_iov_count = 0; + tx_entry->iov_count = 0; + tx_entry->iov_index = 0; + tx_entry->iov_mr_start = 0; + tx_entry->iov_offset = 0; + tx_entry->fi_flags = RXR_NO_COMPLETION | RXR_NO_COUNTER; + +#if ENABLE_DEBUG + dlist_insert_tail(&tx_entry->tx_entry_entry, &ep->tx_entry_list); +#endif + + err = rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry, RXR_EAGER_RTW_PKT, 0); + + if (OFI_UNLIKELY(err)) + return err; + + return 0; +} + +/* return the data size in a packet entry */ +size_t rxr_pkt_data_size(struct rxr_pkt_entry *pkt_entry) +{ + int pkt_type; + + assert(pkt_entry); + pkt_type = rxr_get_base_hdr(pkt_entry->pkt)->type; + + if (pkt_type == RXR_DATA_PKT) + return pkt_entry->pkt_size - sizeof(struct rxr_data_hdr); + + if (pkt_type == RXR_READRSP_PKT) + return pkt_entry->pkt_size - sizeof(struct rxr_readrsp_hdr); + + if (pkt_type >= RXR_REQ_PKT_BEGIN) { + assert(pkt_type == RXR_EAGER_MSGRTM_PKT || pkt_type == RXR_EAGER_TAGRTM_PKT || + pkt_type == RXR_MEDIUM_MSGRTM_PKT || 
pkt_type == RXR_MEDIUM_TAGRTM_PKT || + pkt_type == RXR_LONG_MSGRTM_PKT || pkt_type == RXR_LONG_TAGRTM_PKT || + pkt_type == RXR_EAGER_RTW_PKT || + pkt_type == RXR_LONG_RTW_PKT || + pkt_type == RXR_DC_EAGER_MSGRTM_PKT || + pkt_type == RXR_DC_EAGER_TAGRTM_PKT || + pkt_type == RXR_DC_MEDIUM_MSGRTM_PKT || + pkt_type == RXR_DC_MEDIUM_TAGRTM_PKT || + pkt_type == RXR_DC_LONG_MSGRTM_PKT || + pkt_type == RXR_DC_LONG_TAGRTM_PKT || + pkt_type == RXR_DC_EAGER_RTW_PKT || + pkt_type == RXR_DC_LONG_RTW_PKT); + + return pkt_entry->pkt_size - rxr_pkt_req_hdr_size(pkt_entry); + } + + /* other packet type does not contain data, thus return 0 + */ + return 0; +} + +/* + * rxr_pkt_copy_to_rx() copy data to receiving buffer then + * update counter in rx_entry. + * + * If receiving buffer is on GPU memory, it will post a + * read request, otherwise it will copy data. + * + * If all data has been copied to receiving buffer, + * it will write rx completion and release rx_entry. + * + * Return value and states: + * + * On success, return 0 and release pkt_entry + * On failure, return error code + */ +ssize_t rxr_pkt_copy_to_rx(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + size_t data_offset, + struct rxr_pkt_entry *pkt_entry, + char *data, size_t data_size) +{ + ssize_t err, bytes_copied; + + pkt_entry->x_entry = rx_entry; + + if (data_size > 0 && efa_ep_is_cuda_mr(rx_entry->desc[0])) { + err = rxr_read_post_local_read_or_queue(ep, rx_entry, data_offset, + pkt_entry, data, data_size); + if (err) + FI_WARN(&rxr_prov, FI_LOG_CQ, "cannot post read to copy data\n"); + + return err; + } + + if (OFI_LIKELY(!(rx_entry->rxr_flags & RXR_RECV_CANCEL)) && + rx_entry->cq_entry.len > data_offset && data_size > 0) { + bytes_copied = ofi_copy_to_iov(rx_entry->iov, + rx_entry->iov_count, + data_offset, + data, + data_size); + if (bytes_copied != MIN(data_size, rx_entry->cq_entry.len - data_offset)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "wrong size! bytes_copied: %ld\n", + bytes_copied); + return -FI_EINVAL; + } + } + + rxr_pkt_handle_data_copied(ep, pkt_entry, data_size); + return 0; +} + +void rxr_pkt_handle_data_copied(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + size_t data_size) +{ + struct rxr_rx_entry *rx_entry; + ssize_t ret; + + rx_entry = pkt_entry->x_entry; + assert(rx_entry); + rx_entry->bytes_copied += data_size; + + if (rx_entry->total_len == rx_entry->bytes_copied) { + if (rx_entry->rxr_flags & RXR_DELIVERY_COMPLETE_REQUESTED) { + ret = rxr_pkt_post_ctrl_or_queue(ep, + RXR_RX_ENTRY, + rx_entry, + RXR_RECEIPT_PKT, 0); + if (OFI_UNLIKELY(ret)) { + FI_WARN(&rxr_prov, + FI_LOG_CQ, + "Posting of receipt packet failed! err=%s\n", + fi_strerror(ret)); + efa_eq_write_error(&ep->util_ep, + FI_EIO, + ret); + rxr_release_rx_entry(ep, + rx_entry); + return; + } + rxr_cq_handle_rx_completion(ep, pkt_entry, rx_entry); + rxr_msg_multi_recv_free_posted_entry(ep, rx_entry); + /* rx_entry will be released + * when sender receives the + * receipt packet. 
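+				 * That is why rxr_release_rx_entry() is
+				 * not called in this branch, unlike the
+				 * non-delivery-complete path below.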
+ */ + return; + } + rxr_cq_handle_rx_completion(ep, pkt_entry, rx_entry); + rxr_msg_multi_recv_free_posted_entry(ep, rx_entry); + rxr_release_rx_entry(ep, rx_entry); + } else { + rxr_pkt_entry_release_rx(ep, pkt_entry); + } +} + +/* + * Functions used to handle packet send completion + */ +void rxr_pkt_handle_send_completion(struct rxr_ep *ep, struct fi_cq_data_entry *comp) +{ + struct rxr_pkt_entry *pkt_entry; + struct rxr_peer *peer; + + pkt_entry = (struct rxr_pkt_entry *)comp->op_context; + + switch (rxr_get_base_hdr(pkt_entry->pkt)->type) { + case RXR_HANDSHAKE_PKT: + break; + case RXR_CTS_PKT: + break; + case RXR_DATA_PKT: + rxr_pkt_handle_data_send_completion(ep, pkt_entry); + break; + case RXR_READRSP_PKT: + rxr_pkt_handle_readrsp_send_completion(ep, pkt_entry); + break; + case RXR_EOR_PKT: + rxr_pkt_handle_eor_send_completion(ep, pkt_entry); + break; + case RXR_RMA_CONTEXT_PKT: + rxr_pkt_handle_rma_completion(ep, pkt_entry); + return; + case RXR_ATOMRSP_PKT: + rxr_pkt_handle_atomrsp_send_completion(ep, pkt_entry); + break; + case RXR_RECEIPT_PKT: + rxr_pkt_handle_receipt_send_completion(ep, pkt_entry); + break; + case RXR_EAGER_MSGRTM_PKT: + case RXR_EAGER_TAGRTM_PKT: + rxr_pkt_handle_eager_rtm_send_completion(ep, pkt_entry); + break; + case RXR_MEDIUM_MSGRTM_PKT: + case RXR_MEDIUM_TAGRTM_PKT: + rxr_pkt_handle_medium_rtm_send_completion(ep, pkt_entry); + break; + case RXR_LONG_MSGRTM_PKT: + case RXR_LONG_TAGRTM_PKT: + rxr_pkt_handle_long_rtm_send_completion(ep, pkt_entry); + break; + case RXR_READ_MSGRTM_PKT: + case RXR_READ_TAGRTM_PKT: + rxr_pkt_handle_read_rtm_send_completion(ep, pkt_entry); + break; + case RXR_EAGER_RTW_PKT: + rxr_pkt_handle_eager_rtw_send_completion(ep, pkt_entry); + break; + case RXR_LONG_RTW_PKT: + rxr_pkt_handle_long_rtw_send_completion(ep, pkt_entry); + break; + case RXR_READ_RTW_PKT: + rxr_pkt_handle_read_rtw_send_completion(ep, pkt_entry); + break; + case RXR_SHORT_RTR_PKT: + case RXR_LONG_RTR_PKT: + rxr_pkt_handle_rtr_send_completion(ep, pkt_entry); + break; + case RXR_WRITE_RTA_PKT: + rxr_pkt_handle_write_rta_send_completion(ep, pkt_entry); + break; + case RXR_FETCH_RTA_PKT: + /* no action to be taken here */ + break; + case RXR_COMPARE_RTA_PKT: + /* no action to be taken here */ + break; + case RXR_DC_EAGER_MSGRTM_PKT: + case RXR_DC_EAGER_TAGRTM_PKT: + case RXR_DC_MEDIUM_MSGRTM_PKT: + case RXR_DC_MEDIUM_TAGRTM_PKT: + case RXR_DC_EAGER_RTW_PKT: + case RXR_DC_WRITE_RTA_PKT: + /* no action to be taken here */ + /* For non-dc version of the packet types, + * this is the place to write tx completion. + * However, for dc tx completion will always be + * written upon receving the receipt packet + * if not using long message protocols. + * Moreoever, because receipt can arrive + * before send completion, we cannot take + * any action on tx_entry here. 
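+		 * In particular, the tx_entry may already have been
+		 * released after the receipt packet was processed, so it
+		 * must not be touched here.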
+ */ + break; + case RXR_DC_LONG_MSGRTM_PKT: + case RXR_DC_LONG_TAGRTM_PKT: + rxr_pkt_handle_dc_long_rtm_send_completion(ep, pkt_entry); + break; + case RXR_DC_LONG_RTW_PKT: + rxr_pkt_handle_dc_long_rtw_send_completion(ep, pkt_entry); + break; + default: + FI_WARN(&rxr_prov, FI_LOG_CQ, + "invalid control pkt type %d\n", + rxr_get_base_hdr(pkt_entry->pkt)->type); + assert(0 && "invalid control pkt type"); + rxr_cq_handle_cq_error(ep, -FI_EIO); + return; + } + + peer = rxr_ep_get_peer(ep, pkt_entry->addr); + if (!peer->is_local) + rxr_ep_dec_tx_pending(ep, peer, 0); + rxr_pkt_entry_release_tx(ep, pkt_entry); +} + +/* + * Functions used to handle packet receive completion + */ +static +fi_addr_t rxr_pkt_insert_addr(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry, void *raw_addr) +{ + int i, ret; + fi_addr_t rdm_addr; + struct efa_ep *efa_ep; + struct rxr_base_hdr *base_hdr; + + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + if (base_hdr->version < RXR_BASE_PROTOCOL_VERSION) { + char host_gid[ep->core_addrlen * 3]; + int length = 0; + + for (i = 0; i < ep->core_addrlen; i++) + length += sprintf(&host_gid[length], "%02x ", + ep->core_addr[i]); + FI_WARN(&rxr_prov, FI_LOG_CQ, + "Host %s received a packet with invalid protocol version %d.\n" + "This host can only support protocol version %d and above.\n", + host_gid, base_hdr->version, RXR_BASE_PROTOCOL_VERSION); + efa_eq_write_error(&ep->util_ep, FI_EIO, -FI_EINVAL); + fprintf(stderr, "Host %s received a packet with invalid protocol version %d.\n" + "This host can only support protocol version %d and above. %s:%d\n", + host_gid, base_hdr->version, RXR_BASE_PROTOCOL_VERSION, __FILE__, __LINE__); + abort(); + } + + assert(base_hdr->type >= RXR_REQ_PKT_BEGIN); + + efa_ep = container_of(ep->rdm_ep, struct efa_ep, util_ep.ep_fid); + ret = efa_av_insert_addr(efa_ep->av, (struct efa_ep_addr *)raw_addr, + &rdm_addr, 0, NULL); + if (OFI_UNLIKELY(ret != 0)) { + efa_eq_write_error(&ep->util_ep, FI_EINVAL, ret); + return -1; + } + + return rdm_addr; +} + +void rxr_pkt_handle_recv_completion(struct rxr_ep *ep, + struct fi_cq_data_entry *cq_entry, + fi_addr_t src_addr) +{ + struct rxr_peer *peer; + struct rxr_base_hdr *base_hdr; + struct rxr_pkt_entry *pkt_entry; + + pkt_entry = (struct rxr_pkt_entry *)cq_entry->op_context; + pkt_entry->pkt_size = cq_entry->len; + assert(pkt_entry->pkt_size > 0); + + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + if (base_hdr->type >= RXR_EXTRA_REQ_PKT_END) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "Peer %d is requesting feature %d, which this EP does not support.\n", + (int)src_addr, base_hdr->type); + + assert(0 && "invalid REQ packe type"); + rxr_cq_handle_cq_error(ep, -FI_EIO); + return; + } + + if (base_hdr->type >= RXR_REQ_PKT_BEGIN) { + /* + * as long as the REQ packet contain raw address + * we will need to call insert because it might be a new + * EP with new Q-Key. 
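+		 * If the REQ packet does not carry a raw address, the peer
+		 * is already known and the src_addr reported by the
+		 * completion is used directly.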
+ */ + void *raw_addr; + + raw_addr = rxr_pkt_req_raw_addr(pkt_entry); + if (OFI_UNLIKELY(raw_addr != NULL)) + pkt_entry->addr = rxr_pkt_insert_addr(ep, pkt_entry, raw_addr); + else + pkt_entry->addr = src_addr; + } else { + assert(src_addr != FI_ADDR_NOTAVAIL); + pkt_entry->addr = src_addr; + } + +#if ENABLE_DEBUG + if (!ep->use_zcpy_rx) { + dlist_remove(&pkt_entry->dbg_entry); + dlist_insert_tail(&pkt_entry->dbg_entry, &ep->rx_pkt_list); + } +#ifdef ENABLE_RXR_PKT_DUMP + rxr_pkt_print("Received", ep, (struct rxr_base_hdr *)pkt_entry->pkt); +#endif +#endif + peer = rxr_ep_get_peer(ep, pkt_entry->addr); + if (!(peer->flags & RXR_PEER_HANDSHAKE_SENT)) + rxr_pkt_post_handshake(ep, peer, pkt_entry->addr); + + if (peer->is_local) { + assert(ep->use_shm); + ep->posted_bufs_shm--; + } else { + ep->posted_bufs_efa--; + } + + switch (base_hdr->type) { + case RXR_RETIRED_RTS_PKT: + FI_WARN(&rxr_prov, FI_LOG_CQ, + "Received a RTS packet, which has been retired since protocol version 4\n"); + assert(0 && "deprecated RTS pakcet received"); + rxr_cq_handle_cq_error(ep, -FI_EIO); + return; + case RXR_RETIRED_CONNACK_PKT: + FI_WARN(&rxr_prov, FI_LOG_CQ, + "Received a CONNACK packet, which has been retired since protocol version 4\n"); + assert(0 && "deprecated CONNACK pakcet received"); + rxr_cq_handle_cq_error(ep, -FI_EIO); + return; + case RXR_EOR_PKT: + rxr_pkt_handle_eor_recv(ep, pkt_entry); + return; + case RXR_HANDSHAKE_PKT: + rxr_pkt_handle_handshake_recv(ep, pkt_entry); + return; + case RXR_CTS_PKT: + rxr_pkt_handle_cts_recv(ep, pkt_entry); + return; + case RXR_DATA_PKT: + rxr_pkt_handle_data_recv(ep, pkt_entry); + return; + case RXR_READRSP_PKT: + rxr_pkt_handle_readrsp_recv(ep, pkt_entry); + return; + case RXR_ATOMRSP_PKT: + rxr_pkt_handle_atomrsp_recv(ep, pkt_entry); + return; + case RXR_RECEIPT_PKT: + rxr_pkt_handle_receipt_recv(ep, pkt_entry); + return; + case RXR_EAGER_MSGRTM_PKT: + if (ep->use_zcpy_rx && pkt_entry->type == RXR_PKT_ENTRY_USER) + rxr_pkt_handle_zcpy_recv(ep, pkt_entry); + else + rxr_pkt_handle_rtm_rta_recv(ep, pkt_entry); + return; + case RXR_EAGER_TAGRTM_PKT: + case RXR_DC_EAGER_MSGRTM_PKT: + case RXR_DC_EAGER_TAGRTM_PKT: + case RXR_MEDIUM_MSGRTM_PKT: + case RXR_MEDIUM_TAGRTM_PKT: + case RXR_DC_MEDIUM_MSGRTM_PKT: + case RXR_DC_MEDIUM_TAGRTM_PKT: + case RXR_LONG_MSGRTM_PKT: + case RXR_LONG_TAGRTM_PKT: + case RXR_DC_LONG_MSGRTM_PKT: + case RXR_DC_LONG_TAGRTM_PKT: + case RXR_READ_MSGRTM_PKT: + case RXR_READ_TAGRTM_PKT: + case RXR_WRITE_RTA_PKT: + case RXR_DC_WRITE_RTA_PKT: + case RXR_FETCH_RTA_PKT: + case RXR_COMPARE_RTA_PKT: + rxr_pkt_handle_rtm_rta_recv(ep, pkt_entry); + return; + case RXR_EAGER_RTW_PKT: + rxr_pkt_handle_eager_rtw_recv(ep, pkt_entry); + return; + case RXR_LONG_RTW_PKT: + case RXR_DC_LONG_RTW_PKT: + rxr_pkt_handle_long_rtw_recv(ep, pkt_entry); + return; + case RXR_READ_RTW_PKT: + rxr_pkt_handle_read_rtw_recv(ep, pkt_entry); + return; + case RXR_SHORT_RTR_PKT: + case RXR_LONG_RTR_PKT: + rxr_pkt_handle_rtr_recv(ep, pkt_entry); + return; + case RXR_DC_EAGER_RTW_PKT: + rxr_pkt_handle_dc_eager_rtw_recv(ep, pkt_entry); + return; + default: + FI_WARN(&rxr_prov, FI_LOG_CQ, + "invalid control pkt type %d\n", + rxr_get_base_hdr(pkt_entry->pkt)->type); + assert(0 && "invalid control pkt type"); + rxr_cq_handle_cq_error(ep, -FI_EIO); + return; + } +} + +#if ENABLE_DEBUG + +/* + * Functions used to dump packets + */ + +#define RXR_PKT_DUMP_DATA_LEN 64 + +static +void rxr_pkt_print_handshake(char *prefix, + struct rxr_handshake_hdr *handshake_hdr) +{ + FI_DBG(&rxr_prov, 
FI_LOG_EP_DATA, + "%s RxR HANDSHAKE packet - version: %" PRIu8 + " flags: %x\n", prefix, handshake_hdr->version, + handshake_hdr->flags); + + FI_DBG(&rxr_prov, FI_LOG_EP_DATA, + "%s RxR HANDSHAKE packet, maxproto: %d\n", + prefix, handshake_hdr->maxproto); +} + +static +void rxr_pkt_print_cts(char *prefix, struct rxr_cts_hdr *cts_hdr) +{ + FI_DBG(&rxr_prov, FI_LOG_EP_DATA, + "%s RxR CTS packet - version: %" PRIu8 + " flags: %x tx_id: %" PRIu32 + " rx_id: %" PRIu32 + " window: %" PRIu64 + "\n", prefix, cts_hdr->version, cts_hdr->flags, + cts_hdr->tx_id, cts_hdr->rx_id, cts_hdr->window); +} + +static +void rxr_pkt_print_data(char *prefix, struct rxr_data_pkt *data_pkt) +{ + char str[RXR_PKT_DUMP_DATA_LEN * 4]; + size_t str_len = RXR_PKT_DUMP_DATA_LEN * 4, l; + int i; + + str[str_len - 1] = '\0'; + + FI_DBG(&rxr_prov, FI_LOG_EP_DATA, + "%s RxR DATA packet - version: %" PRIu8 + " flags: %x rx_id: %" PRIu32 + " seg_size: %" PRIu64 + " seg_offset: %" PRIu64 + "\n", prefix, data_pkt->hdr.version, data_pkt->hdr.flags, + data_pkt->hdr.rx_id, data_pkt->hdr.seg_size, + data_pkt->hdr.seg_offset); + + l = snprintf(str, str_len, ("\tdata: ")); + for (i = 0; i < MIN(data_pkt->hdr.seg_size, RXR_PKT_DUMP_DATA_LEN); + i++) + l += snprintf(str + l, str_len - l, "%02x ", + ((uint8_t *)data_pkt->data)[i]); + FI_DBG(&rxr_prov, FI_LOG_EP_DATA, "%s\n", str); +} + +void rxr_pkt_print(char *prefix, struct rxr_ep *ep, struct rxr_base_hdr *hdr) +{ + switch (hdr->type) { + case RXR_HANDSHAKE_PKT: + rxr_pkt_print_handshake(prefix, (struct rxr_handshake_hdr *)hdr); + break; + case RXR_CTS_PKT: + rxr_pkt_print_cts(prefix, (struct rxr_cts_hdr *)hdr); + break; + case RXR_DATA_PKT: + rxr_pkt_print_data(prefix, (struct rxr_data_pkt *)hdr); + break; + default: + FI_WARN(&rxr_prov, FI_LOG_CQ, "invalid ctl pkt type %d\n", + rxr_get_base_hdr(hdr)->type); + assert(0); + return; + } +} +#endif + diff --git a/prov/efa/src/rxr/rxr_pkt_cmd.h b/prov/efa/src/rxr/rxr_pkt_cmd.h new file mode 100644 index 00000000000..34a04dd0eb0 --- /dev/null +++ b/prov/efa/src/rxr/rxr_pkt_cmd.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _RXR_PKT_CMD_H +#define _RXR_PKT_CMD_H + +#include "rxr.h" + +ssize_t rxr_pkt_post_data(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry); + +ssize_t rxr_pkt_post_ctrl(struct rxr_ep *ep, int entry_type, void *x_entry, + int ctrl_type, bool inject); + +ssize_t rxr_pkt_post_ctrl_or_queue(struct rxr_ep *ep, int entry_type, void *x_entry, + int ctrl_type, bool inject); + +size_t rxr_pkt_data_size(struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_copy_to_rx(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + size_t data_offset, + struct rxr_pkt_entry *pkt_entry, + char *data, size_t data_size); + +void rxr_pkt_handle_data_copied(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + size_t data_size); + +void rxr_pkt_handle_send_completion(struct rxr_ep *ep, + struct fi_cq_data_entry *cq_entry); + +void rxr_pkt_handle_recv_completion(struct rxr_ep *ep, + struct fi_cq_data_entry *cq_entry, + fi_addr_t src_addr); + +ssize_t rxr_pkt_wait_handshake(struct rxr_ep *ep, fi_addr_t addr, struct rxr_peer *peer); + +ssize_t rxr_pkt_trigger_handshake(struct rxr_ep *ep, + fi_addr_t addr, struct rxr_peer *peer); + +#if ENABLE_DEBUG +void rxr_pkt_print(char *prefix, + struct rxr_ep *ep, + struct rxr_base_hdr *hdr); +#endif + +#endif + diff --git a/prov/efa/src/rxr/rxr_pkt_entry.c b/prov/efa/src/rxr/rxr_pkt_entry.c new file mode 100644 index 00000000000..0780620bc30 --- /dev/null +++ b/prov/efa/src/rxr/rxr_pkt_entry.c @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "ofi.h" +#include +#include + +#include "rxr.h" +#include "efa.h" +#include "rxr_msg.h" +#include "rxr_rma.h" +#include "rxr_pkt_cmd.h" + +/* + * General purpose utility functions + */ + +struct rxr_pkt_entry *rxr_pkt_entry_init_prefix(struct rxr_ep *ep, + const struct fi_msg *posted_buf, + struct ofi_bufpool *pkt_pool) +{ + struct rxr_pkt_entry *pkt_entry; + struct efa_mr *mr; + + /* + * Given the pkt_entry->pkt immediately follows the pkt_entry + * fields, we can directly map the user-provided fi_msg address + * as the pkt_entry, which will hold the metadata in the prefix. 
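+	 * The posted buffer must therefore be large enough to hold the
+	 * pkt_entry metadata plus an eager RTM header, which the assert
+	 * below checks.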
+ */ + assert(posted_buf->msg_iov->iov_len >= sizeof(struct rxr_pkt_entry) + sizeof(struct rxr_eager_msgrtm_hdr)); + pkt_entry = (struct rxr_pkt_entry *) posted_buf->msg_iov->iov_base; + if (!pkt_entry) + return NULL; + + /* + * The ownership of the prefix buffer lies with the application, do not + * put it on the dbg list for cleanup during shutdown or poison it. The + * provider loses jurisdiction over it soon after writing the rx + * completion. + */ + dlist_init(&pkt_entry->entry); + mr = (struct efa_mr *) posted_buf->desc[0]; + pkt_entry->mr = &mr->mr_fid; + + pkt_entry->type = RXR_PKT_ENTRY_USER; + pkt_entry->state = RXR_PKT_ENTRY_IN_USE; + pkt_entry->next = NULL; + + return pkt_entry; +} + +struct rxr_pkt_entry *rxr_pkt_entry_alloc(struct rxr_ep *ep, + struct ofi_bufpool *pkt_pool) +{ + struct rxr_pkt_entry *pkt_entry; + void *mr = NULL; + + pkt_entry = ofi_buf_alloc_ex(pkt_pool, &mr); + if (!pkt_entry) + return NULL; + +#ifdef ENABLE_EFA_POISONING + memset(pkt_entry, 0, sizeof(*pkt_entry)); +#endif + dlist_init(&pkt_entry->entry); +#if ENABLE_DEBUG + dlist_init(&pkt_entry->dbg_entry); +#endif + pkt_entry->mr = (struct fid_mr *) mr; +#ifdef ENABLE_EFA_POISONING + memset(pkt_entry->pkt, 0, ep->mtu_size); +#endif + pkt_entry->type = RXR_PKT_ENTRY_POSTED; + pkt_entry->state = RXR_PKT_ENTRY_IN_USE; + pkt_entry->next = NULL; + + return pkt_entry; +} + +static +void rxr_pkt_entry_release_single_tx(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt) +{ + struct rxr_peer *peer; + +#if ENABLE_DEBUG + dlist_remove(&pkt->dbg_entry); +#endif + /* + * Decrement rnr_queued_pkts counter and reset backoff for this peer if + * we get a send completion for a retransmitted packet. + */ + if (OFI_UNLIKELY(pkt->state == RXR_PKT_ENTRY_RNR_RETRANSMIT)) { + peer = rxr_ep_get_peer(ep, pkt->addr); + peer->rnr_queued_pkt_cnt--; + peer->timeout_interval = 0; + peer->rnr_timeout_exp = 0; + if (peer->flags & RXR_PEER_IN_BACKOFF) + dlist_remove(&peer->rnr_entry); + peer->flags &= ~RXR_PEER_IN_BACKOFF; + FI_DBG(&rxr_prov, FI_LOG_EP_DATA, + "reset backoff timer for peer: %" PRIu64 "\n", + pkt->addr); + } +#ifdef ENABLE_EFA_POISONING + rxr_poison_mem_region((uint32_t *)pkt, ep->tx_pkt_pool_entry_sz); +#endif + pkt->state = RXR_PKT_ENTRY_FREE; + ofi_buf_free(pkt); +} + +void rxr_pkt_entry_release_tx(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_pkt_entry *next; + + while (pkt_entry) { + next = pkt_entry->next; + rxr_pkt_entry_release_single_tx(ep, pkt_entry); + pkt_entry = next; + } +} + +/* + * rxr_pkt_entry_release_rx() release a rx packet entry. + * It requires input pkt_entry to be unlinked. + * + * RX packet entry can be linked when medium message protocol + * is used. + * + * In that case, caller is responsible to unlink the pkt_entry + * can call this function on next packet entry. 
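+ * In other words, pkt_entry->next must be NULL when this function
+ * is called (see the assert below).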
+ */ +void rxr_pkt_entry_release_rx(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + assert(pkt_entry->next == NULL); + + if (ep->use_zcpy_rx && pkt_entry->type == RXR_PKT_ENTRY_USER) + return; + + if (pkt_entry->type == RXR_PKT_ENTRY_POSTED) { + struct rxr_peer *peer; + + peer = rxr_ep_get_peer(ep, pkt_entry->addr); + + if (peer->is_local) + ep->rx_bufs_shm_to_post++; + else + ep->rx_bufs_efa_to_post++; + } + + if (pkt_entry->type == RXR_PKT_ENTRY_READ_COPY) { + assert(ep->rx_readcopy_pkt_pool_used > 0); + ep->rx_readcopy_pkt_pool_used--; + } + +#if ENABLE_DEBUG + dlist_remove(&pkt_entry->dbg_entry); +#endif +#ifdef ENABLE_EFA_POISONING + /* the same pool size is used for all types of rx pkt_entries */ + rxr_poison_mem_region((uint32_t *)pkt_entry, ep->rx_pkt_pool_entry_sz); +#endif + pkt_entry->state = RXR_PKT_ENTRY_FREE; + ofi_buf_free(pkt_entry); +} + +void rxr_pkt_entry_copy(struct rxr_ep *ep, + struct rxr_pkt_entry *dest, + struct rxr_pkt_entry *src, + int new_entry_type) +{ + FI_DBG(&rxr_prov, FI_LOG_EP_CTRL, + "Copying packet out of posted buffer! src_entry_type: %d new_entry_type: %d\n", + src->type, new_entry_type); + dlist_init(&dest->entry); +#if ENABLE_DEBUG + dlist_init(&dest->dbg_entry); +#endif + /* dest->mr was set in rxr_pkt_entry_alloc(), and + * is tied to the memory region, therefore should + * not be changed. + */ + dest->x_entry = src->x_entry; + dest->pkt_size = src->pkt_size; + dest->addr = src->addr; + dest->type = new_entry_type; + dest->state = RXR_PKT_ENTRY_IN_USE; + dest->next = NULL; + memcpy(dest->pkt, src->pkt, ep->mtu_size); +} + +/* + * Create a new rx_entry for an unexpected message. Store the packet for later + * processing and put the rx_entry on the appropriate unexpected list. + */ +struct rxr_pkt_entry *rxr_pkt_get_unexp(struct rxr_ep *ep, + struct rxr_pkt_entry **pkt_entry_ptr) +{ + struct rxr_pkt_entry *unexp_pkt_entry; + + if (rxr_env.rx_copy_unexp && (*pkt_entry_ptr)->type == RXR_PKT_ENTRY_POSTED) { + unexp_pkt_entry = rxr_pkt_entry_clone(ep, ep->rx_unexp_pkt_pool, *pkt_entry_ptr, RXR_PKT_ENTRY_UNEXP); + if (OFI_UNLIKELY(!unexp_pkt_entry)) { + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, + "Unable to allocate rx_pkt_entry for unexp msg\n"); + return NULL; + } + rxr_pkt_entry_release_rx(ep, *pkt_entry_ptr); + *pkt_entry_ptr = unexp_pkt_entry; + } else { + unexp_pkt_entry = *pkt_entry_ptr; + } + + return unexp_pkt_entry; +} + +void rxr_pkt_entry_release_cloned(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_pkt_entry *next; + + while (pkt_entry) { + assert(pkt_entry->type == RXR_PKT_ENTRY_OOO || + pkt_entry->type == RXR_PKT_ENTRY_UNEXP); +#ifdef ENABLE_EFA_POISONING + rxr_poison_mem_region((uint32_t *)pkt_entry, ep->tx_pkt_pool_entry_sz); +#endif + pkt_entry->state = RXR_PKT_ENTRY_FREE; + ofi_buf_free(pkt_entry); + next = pkt_entry->next; + pkt_entry = next; + } +} + +struct rxr_pkt_entry *rxr_pkt_entry_clone(struct rxr_ep *ep, + struct ofi_bufpool *pkt_pool, + struct rxr_pkt_entry *src, + int new_entry_type) +{ + struct rxr_pkt_entry *root = NULL; + struct rxr_pkt_entry *dst; + + assert(src); + assert(new_entry_type == RXR_PKT_ENTRY_OOO || + new_entry_type == RXR_PKT_ENTRY_UNEXP || + new_entry_type == RXR_PKT_ENTRY_READ_COPY); + + dst = rxr_pkt_entry_alloc(ep, pkt_pool); + if (!dst) + return NULL; + + if (new_entry_type == RXR_PKT_ENTRY_READ_COPY) { + assert(pkt_pool == ep->rx_readcopy_pkt_pool); + ep->rx_readcopy_pkt_pool_used++; + ep->rx_readcopy_pkt_pool_max_used = MAX(ep->rx_readcopy_pkt_pool_used, + 
ep->rx_readcopy_pkt_pool_max_used); + } + + rxr_pkt_entry_copy(ep, dst, src, new_entry_type); + root = dst; + while (src->next) { + dst->next = rxr_pkt_entry_alloc(ep, pkt_pool); + if (!dst->next) { + rxr_pkt_entry_release_cloned(ep, root); + return NULL; + } + + rxr_pkt_entry_copy(ep, dst->next, src->next, new_entry_type); + src = src->next; + dst = dst->next; + } + + assert(dst && !dst->next); + return root; +} + +void rxr_pkt_entry_append(struct rxr_pkt_entry *dst, + struct rxr_pkt_entry *src) +{ + assert(dst); + + while (dst->next) + dst = dst->next; + assert(dst && !dst->next); + dst->next = src; +} + +static inline +ssize_t rxr_pkt_entry_sendmsg(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry, + const struct fi_msg *msg, uint64_t flags) +{ + struct rxr_peer *peer; + size_t ret; + + peer = rxr_ep_get_peer(ep, pkt_entry->addr); + assert(ep->tx_pending <= ep->max_outstanding_tx); + + if (ep->tx_pending == ep->max_outstanding_tx) + return -FI_EAGAIN; + + if (peer->flags & RXR_PEER_IN_BACKOFF) + return -FI_EAGAIN; + +#if ENABLE_DEBUG + dlist_insert_tail(&pkt_entry->dbg_entry, &ep->tx_pkt_list); +#ifdef ENABLE_RXR_PKT_DUMP + rxr_pkt_print("Sent", ep, (struct rxr_base_hdr *)pkt_entry->pkt); +#endif +#endif + if (peer->is_local) { + assert(ep->use_shm); + ret = fi_sendmsg(ep->shm_ep, msg, flags); + } else { + ret = fi_sendmsg(ep->rdm_ep, msg, flags); + if (OFI_LIKELY(!ret)) + rxr_ep_inc_tx_pending(ep, peer); + } + + return ret; +} + +ssize_t rxr_pkt_entry_sendv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + fi_addr_t addr, const struct iovec *iov, + void **desc, size_t count, uint64_t flags) +{ + struct fi_msg msg; + struct rxr_peer *peer; + + msg.msg_iov = iov; + msg.desc = desc; + msg.iov_count = count; + peer = rxr_ep_get_peer(ep, addr); + msg.addr = (peer->is_local) ? peer->shm_fiaddr : addr; + msg.context = pkt_entry; + msg.data = 0; + + return rxr_pkt_entry_sendmsg(ep, pkt_entry, &msg, flags); +} + +/* rxr_pkt_start currently expects data pkt right after pkt hdr */ +ssize_t rxr_pkt_entry_send_with_flags(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + fi_addr_t addr, uint64_t flags) +{ + struct iovec iov; + void *desc; + + iov.iov_base = rxr_pkt_start(pkt_entry); + iov.iov_len = pkt_entry->pkt_size; + + if (rxr_ep_get_peer(ep, addr)->is_local) { + assert(ep->use_shm); + desc = NULL; + } else { + desc = fi_mr_desc(pkt_entry->mr); + } + + return rxr_pkt_entry_sendv(ep, pkt_entry, addr, &iov, &desc, 1, flags); +} + +ssize_t rxr_pkt_entry_send(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + fi_addr_t addr) +{ + return rxr_pkt_entry_send_with_flags(ep, pkt_entry, addr, 0); +} + +ssize_t rxr_pkt_entry_inject(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + fi_addr_t addr) +{ + struct rxr_peer *peer; + + /* currently only EOR packet is injected using shm ep */ + peer = rxr_ep_get_peer(ep, addr); + + assert(ep->use_shm && peer->is_local); + return fi_inject(ep->shm_ep, rxr_pkt_start(pkt_entry), pkt_entry->pkt_size, + peer->shm_fiaddr); +} + +/* + * Functions for pkt_rx_map + */ +struct rxr_rx_entry *rxr_pkt_rx_map_lookup(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_pkt_rx_map *entry = NULL; + struct rxr_pkt_rx_key key; + + key.msg_id = rxr_pkt_msg_id(pkt_entry); + key.addr = pkt_entry->addr; + HASH_FIND(hh, ep->pkt_rx_map, &key, sizeof(struct rxr_pkt_rx_key), entry); + return entry ? 
entry->rx_entry : NULL; +} + +void rxr_pkt_rx_map_insert(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + struct rxr_rx_entry *rx_entry) +{ + struct rxr_pkt_rx_map *entry; + + entry = ofi_buf_alloc(ep->map_entry_pool); + if (OFI_UNLIKELY(!entry)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "Map entries for medium size message exhausted.\n"); + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + return; + } + + entry->key.msg_id = rxr_pkt_msg_id(pkt_entry); + entry->key.addr = pkt_entry->addr; + +#if ENABLE_DEBUG + { + struct rxr_pkt_rx_map *existing_entry = NULL; + + HASH_FIND(hh, ep->pkt_rx_map, &entry->key, sizeof(struct rxr_pkt_rx_key), existing_entry); + assert(!existing_entry); + } +#endif + + entry->rx_entry = rx_entry; + HASH_ADD(hh, ep->pkt_rx_map, key, sizeof(struct rxr_pkt_rx_key), entry); +} + +void rxr_pkt_rx_map_remove(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + struct rxr_rx_entry *rx_entry) +{ + struct rxr_pkt_rx_map *entry; + struct rxr_pkt_rx_key key; + + key.msg_id = rxr_pkt_msg_id(pkt_entry); + key.addr = pkt_entry->addr; + + HASH_FIND(hh, ep->pkt_rx_map, &key, sizeof(key), entry); + assert(entry && entry->rx_entry == rx_entry); + HASH_DEL(ep->pkt_rx_map, entry); + ofi_buf_free(entry); +} + diff --git a/prov/efa/src/rxr/rxr_pkt_entry.h b/prov/efa/src/rxr/rxr_pkt_entry.h new file mode 100644 index 00000000000..ee58e071839 --- /dev/null +++ b/prov/efa/src/rxr/rxr_pkt_entry.h @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _RXR_PKT_ENTRY_H +#define _RXR_PKT_ENTRY_H + +#include + +/* pkt_entry state for retransmit tracking */ +enum rxr_pkt_entry_state { + RXR_PKT_ENTRY_FREE = 0, + RXR_PKT_ENTRY_IN_USE, + RXR_PKT_ENTRY_RNR_RETRANSMIT, +}; + +/* pkt_entry types for rx pkts */ +enum rxr_pkt_entry_type { + RXR_PKT_ENTRY_POSTED = 1, /* entries that are posted to the device from the RX bufpool */ + RXR_PKT_ENTRY_UNEXP, /* entries used to stage unexpected msgs */ + RXR_PKT_ENTRY_OOO, /* entries used to stage out-of-order RTM or RTA */ + RXR_PKT_ENTRY_USER, /* entries backed by user-provided msg prefix (FI_MSG_PREFIX)*/ + RXR_PKT_ENTRY_READ_COPY, /* entries used to stage copy by read */ +}; + +struct rxr_pkt_sendv { + /* Because core EP current only support 2 iov, + * and for the sake of code simplicity, we use 2 iov. + * One for header, and the other for data. + * iov_count here is used as an indication + * of whether iov is used, it is either 0 or 2. + */ + int iov_count; + struct iovec iov[2]; + void *desc[2]; +}; + +struct rxr_pkt_entry { + /* for rx/tx_entry queued_pkts list */ + struct dlist_entry entry; +#if ENABLE_DEBUG + /* for tx/rx debug list or posted buf list */ + struct dlist_entry dbg_entry; +#endif + void *x_entry; /* pointer to rxr rx/tx entry */ + size_t pkt_size; + + struct fid_mr *mr; + fi_addr_t addr; + enum rxr_pkt_entry_type type; + enum rxr_pkt_entry_state state; + + /* + * next is used on receiving end. + * send is used on sending end. + */ + union { + struct rxr_pkt_entry *next; + struct rxr_pkt_sendv *send; + }; + +#if ENABLE_DEBUG + /* pad to cache line size of 64 bytes */ + uint8_t pad[48]; +#endif + char pkt[0]; /* rxr_ctrl_*_pkt, or rxr_data_pkt */ +}; + +static inline void *rxr_pkt_start(struct rxr_pkt_entry *pkt_entry) +{ + return pkt_entry->pkt; +} + +#if defined(static_assert) && defined(__x86_64__) +#if ENABLE_DEBUG +static_assert(sizeof(struct rxr_pkt_entry) == 128, "rxr_pkt_entry check"); +#else +static_assert(sizeof(struct rxr_pkt_entry) == 64, "rxr_pkt_entry check"); +#endif +#endif + +OFI_DECL_RECVWIN_BUF(struct rxr_pkt_entry*, rxr_robuf, uint32_t); +OFI_DECLARE_FREESTACK(struct rxr_robuf, rxr_robuf_fs); + +struct rxr_ep; + +struct rxr_tx_entry; + +struct rxr_pkt_entry *rxr_pkt_entry_init_prefix(struct rxr_ep *ep, + const struct fi_msg *posted_buf, + struct ofi_bufpool *pkt_pool); + +struct rxr_pkt_entry *rxr_pkt_entry_alloc(struct rxr_ep *ep, + struct ofi_bufpool *pkt_pool); + +void rxr_pkt_entry_release_tx(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_entry_release_rx(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_entry_append(struct rxr_pkt_entry *dst, + struct rxr_pkt_entry *src); + +struct rxr_pkt_entry *rxr_pkt_entry_clone(struct rxr_ep *ep, + struct ofi_bufpool *pkt_pool, + struct rxr_pkt_entry *src, + int new_entry_type); + +struct rxr_pkt_entry *rxr_pkt_get_unexp(struct rxr_ep *ep, + struct rxr_pkt_entry **pkt_entry_ptr); + +ssize_t rxr_pkt_entry_send_with_flags(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + fi_addr_t addr, uint64_t flags); + +ssize_t rxr_pkt_entry_sendv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + fi_addr_t addr, const struct iovec *iov, + void **desc, size_t count, uint64_t flags); + +ssize_t rxr_pkt_entry_send(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + fi_addr_t addr); + +ssize_t rxr_pkt_entry_inject(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + fi_addr_t addr); + +struct rxr_pkt_rx_key { + uint64_t msg_id; + fi_addr_t addr; +}; + +struct 
rxr_pkt_rx_map { + struct rxr_pkt_rx_key key; + struct rxr_rx_entry *rx_entry; + UT_hash_handle hh; +}; + +struct rxr_rx_entry *rxr_pkt_rx_map_lookup(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_rx_map_insert(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + struct rxr_rx_entry *rx_entry); + +void rxr_pkt_rx_map_remove(struct rxr_ep *pkt_rx_map, + struct rxr_pkt_entry *pkt_entry, + struct rxr_rx_entry *rx_entry); + +#endif diff --git a/prov/efa/src/rxr/rxr_pkt_type.h b/prov/efa/src/rxr/rxr_pkt_type.h new file mode 100644 index 00000000000..2555f10809c --- /dev/null +++ b/prov/efa/src/rxr/rxr_pkt_type.h @@ -0,0 +1,446 @@ +/* + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _RXR_PKT_TYPE_H +#define _RXR_PKT_TYPE_H + +/* This header file contain the ID of all RxR packet types, and + * the necessary data structures and functions for each packet type + * + * RxR packet types can be classified into 3 categories: + * data packet, control packet and context packet + * + * For each packet type, the following items are needed: + * + * First, each packet type need to define a struct for its header, + * and the header must be start with ```struct rxr_base_hdr```. + * + * Second, each control packet type need to define an init() + * function and a handle_sent() function. These functions + * are called by rxr_pkt_post_ctrl_or_queue(). + * + * Finally, each packet type (except context packet) need to + * define a handle_recv() functions which is called by + * rxr_pkt_handle_recv_completion(). + */ + +/* ID of each packet type. Changing ID would break inter + * operability thus is strictly prohibited. 
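+ * The type ID is carried on the wire in rxr_base_hdr->type and is
+ * therefore part of the protocol shared with peers.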
+ */ + +#define RXR_RETIRED_RTS_PKT 1 +#define RXR_RETIRED_CONNACK_PKT 2 +#define RXR_CTS_PKT 3 +#define RXR_DATA_PKT 4 +#define RXR_READRSP_PKT 5 +#define RXR_RMA_CONTEXT_PKT 6 +#define RXR_EOR_PKT 7 +#define RXR_ATOMRSP_PKT 8 +#define RXR_HANDSHAKE_PKT 9 +#define RXR_RECEIPT_PKT 10 + +#define RXR_REQ_PKT_BEGIN 64 +#define RXR_BASELINE_REQ_PKT_BEGIN 64 +#define RXR_EAGER_MSGRTM_PKT 64 +#define RXR_EAGER_TAGRTM_PKT 65 +#define RXR_MEDIUM_MSGRTM_PKT 66 +#define RXR_MEDIUM_TAGRTM_PKT 67 +#define RXR_LONG_MSGRTM_PKT 68 +#define RXR_LONG_TAGRTM_PKT 69 +#define RXR_EAGER_RTW_PKT 70 +#define RXR_LONG_RTW_PKT 71 +#define RXR_SHORT_RTR_PKT 72 +#define RXR_LONG_RTR_PKT 73 +#define RXR_WRITE_RTA_PKT 74 +#define RXR_FETCH_RTA_PKT 75 +#define RXR_COMPARE_RTA_PKT 76 +#define RXR_BASELINE_REQ_PKT_END 77 + +#define RXR_EXTRA_REQ_PKT_BEGIN 128 +#define RXR_READ_MSGRTM_PKT 128 +#define RXR_READ_TAGRTM_PKT 129 +#define RXR_READ_RTW_PKT 130 +#define RXR_READ_RTR_PKT 131 + +#define RXR_DC_REQ_PKT_BEGIN 132 +#define RXR_DC_EAGER_MSGRTM_PKT 133 +#define RXR_DC_EAGER_TAGRTM_PKT 134 +#define RXR_DC_MEDIUM_MSGRTM_PKT 135 +#define RXR_DC_MEDIUM_TAGRTM_PKT 136 +#define RXR_DC_LONG_MSGRTM_PKT 137 +#define RXR_DC_LONG_TAGRTM_PKT 138 +#define RXR_DC_EAGER_RTW_PKT 139 +#define RXR_DC_LONG_RTW_PKT 140 +#define RXR_DC_WRITE_RTA_PKT 141 +#define RXR_DC_REQ_PKT_END 142 +#define RXR_EXTRA_REQ_PKT_END 142 + +/* + * Packet fields common to all rxr packets. The other packet headers below must + * be changed if this is updated. + */ +struct rxr_base_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; +}; + +#if defined(static_assert) && defined(__x86_64__) +static_assert(sizeof(struct rxr_base_hdr) == 4, "rxr_base_hdr check"); +#endif + +static inline struct rxr_base_hdr *rxr_get_base_hdr(void *pkt) +{ + return (struct rxr_base_hdr *)pkt; +} + +struct rxr_ep; +struct rxr_peer; +struct rxr_tx_entry; +struct rxr_rx_entry; +struct rxr_read_entry; + +/* + * HANDSHAKE packet header and functions + * implementation of the functions are in rxr_pkt_type_misc.c + */ +struct rxr_handshake_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + /* end of rxr_base_hdr */ + uint32_t maxproto; + uint64_t features[0]; +}; + +#if defined(static_assert) && defined(__x86_64__) +static_assert(sizeof(struct rxr_handshake_hdr) == 8, "rxr_handshake_hdr check"); +#endif + +static inline +struct rxr_handshake_hdr *rxr_get_handshake_hdr(void *pkt) +{ + return (struct rxr_handshake_hdr *)pkt; +} + +ssize_t rxr_pkt_init_handshake(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + fi_addr_t addr); + +void rxr_pkt_post_handshake(struct rxr_ep *ep, + struct rxr_peer *peer, + fi_addr_t addr); + +void rxr_pkt_handle_handshake_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); +/* + * CTS packet data structures and functions. 
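+ * A CTS (clear to send) packet grants the sender a window of bytes
+ * that may be transmitted as DATA packets.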
+ * Definition of the functions is in rxr_pkt_type_misc.c + */ +struct rxr_cts_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + /* end of rxr_base_hdr */ + uint8_t pad[4]; + /* TODO: need to add msg_id -> tx_id/rx_id mapping */ + uint32_t tx_id; + uint32_t rx_id; + uint64_t window; +}; + +#if defined(static_assert) && defined(__x86_64__) +static_assert(sizeof(struct rxr_cts_hdr) == 24, "rxr_cts_hdr check"); +#endif + +/* this flag is to indicated the CTS is the response of a RTR packet */ +#define RXR_CTS_READ_REQ BIT_ULL(7) +#define RXR_CTS_HDR_SIZE (sizeof(struct rxr_cts_hdr)) + +static inline +struct rxr_cts_hdr *rxr_get_cts_hdr(void *pkt) +{ + return (struct rxr_cts_hdr *)pkt; +} + +void rxr_pkt_calc_cts_window_credits(struct rxr_ep *ep, struct rxr_peer *peer, + uint64_t size, int request, + int *window, int *credits); + +ssize_t rxr_pkt_init_cts(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_cts_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_cts_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +/* + * DATA packet data structures and functions + * Definition of the functions is in rxr_pkt_data.c + */ +struct rxr_data_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + /* end of rxr_base_hdr */ + /* TODO: need to add msg_id -> tx_id/rx_id mapping */ + uint32_t rx_id; + uint64_t seg_size; + uint64_t seg_offset; +}; + +#if defined(static_assert) && defined(__x86_64__) +static_assert(sizeof(struct rxr_data_hdr) == 24, "rxr_data_hdr check"); +#endif + +#define RXR_DATA_HDR_SIZE (sizeof(struct rxr_data_hdr)) + +struct rxr_data_pkt { + struct rxr_data_hdr hdr; + char data[]; +}; + +static inline +struct rxr_data_pkt *rxr_get_data_pkt(void *pkt) +{ + return (struct rxr_data_pkt *)pkt; +} + +ssize_t rxr_pkt_send_data(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_send_data_desc(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_proc_data(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + struct rxr_pkt_entry *pkt_entry, + char *data, size_t seg_offset, + size_t seg_size); + +void rxr_pkt_handle_data_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + + +void rxr_pkt_handle_data_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +/* + * READRSP packet data structures and functions + * The definition of functions are in rxr_pkt_type_misc.c + */ +struct rxr_readrsp_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + /* end of rxr_base_hdr */ + uint8_t pad[4]; + uint32_t rx_id; + uint32_t tx_id; + uint64_t seg_size; +}; + +static inline struct rxr_readrsp_hdr *rxr_get_readrsp_hdr(void *pkt) +{ + return (struct rxr_readrsp_hdr *)pkt; +} + +#define RXR_READRSP_HDR_SIZE (sizeof(struct rxr_readrsp_hdr)) + +#if defined(static_assert) && defined(__x86_64__) +static_assert(sizeof(struct rxr_readrsp_hdr) == sizeof(struct rxr_data_hdr), "rxr_readrsp_hdr check"); +#endif + +struct rxr_readrsp_pkt { + struct rxr_readrsp_hdr hdr; + char data[]; +}; + +int rxr_pkt_init_readrsp(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_readrsp_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_readrsp_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_readrsp_recv(struct rxr_ep *ep, + struct 
rxr_pkt_entry *pkt_entry); + +/* + * RMA context packet, used to differentiate the normal RMA read, normal RMA + * write, and the RMA read in two-sided large message transfer + * Implementation of the function is in rxr_pkt_type_misc.c + */ +struct rxr_rma_context_pkt { + uint8_t type; + uint8_t version; + uint16_t flags; + /* end of rxr_base_hdr */ + uint32_t context_type; + uint32_t tx_id; /* used by write context */ + uint32_t read_id; /* used by read context */ + size_t seg_size; /* used by read context */ +}; + +enum rxr_rma_context_pkt_type { + RXR_READ_CONTEXT = 1, + RXR_WRITE_CONTEXT, +}; + +void rxr_pkt_init_write_context(struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_init_read_context(struct rxr_ep *rxr_ep, + struct rxr_read_entry *read_entry, + size_t seg_size, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_rma_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +/* + * EOR packet, used to acknowledge the sender that large message + * copy has been finished. + * Implementaion of the functions are in rxr_pkt_misc.c + */ +struct rxr_eor_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + /* end of rxr_base_hdr */ + uint32_t tx_id; + uint32_t rx_id; +}; + +#if defined(static_assert) && defined(__x86_64__) +static_assert(sizeof(struct rxr_eor_hdr) == 12, "rxr_eor_hdr check"); +#endif + +static inline +struct rxr_eor_hdr *rxr_get_eor_hdr(void *pkt) +{ + return (struct rxr_eor_hdr *)pkt; +} + +int rxr_pkt_init_eor(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + struct rxr_pkt_entry *pkt_entry); + + +void rxr_pkt_handle_eor_sent(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_eor_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_eor_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +/* atomrsp types */ +struct rxr_atomrsp_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + /* end of rxr_base_hdr */ + uint8_t pad[4]; + uint32_t rx_id; + uint32_t tx_id; + uint64_t seg_size; +}; + +#if defined(static_assert) && defined(__x86_64__) +static_assert(sizeof(struct rxr_atomrsp_hdr) == 24, "rxr_atomrsp_hdr check"); +#endif + +#define RXR_ATOMRSP_HDR_SIZE (sizeof(struct rxr_atomrsp_hdr)) + +struct rxr_atomrsp_pkt { + struct rxr_atomrsp_hdr hdr; + char data[]; +}; + +static inline struct rxr_atomrsp_hdr *rxr_get_atomrsp_hdr(void *pkt) +{ + return (struct rxr_atomrsp_hdr *)pkt; +} + +/* receipt packet headers */ +struct rxr_receipt_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + /* end of rxr_base_hdr */ + uint32_t tx_id; + uint32_t msg_id; + int32_t padding; +}; + +static inline +struct rxr_receipt_hdr *rxr_get_receipt_hdr(void *pkt) +{ + return (struct rxr_receipt_hdr *)pkt; +} + +/* receipt packet functions: init, handle_sent, handle_send_completion, recv*/ +int rxr_pkt_init_receipt(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_receipt_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_receipt_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_receipt_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +/* atomrsp functions: init, handle_sent, handle_send_completion, recv */ +int rxr_pkt_init_atomrsp(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_atomrsp_sent(struct rxr_ep *ep, struct rxr_pkt_entry 
*pkt_entry); + +void rxr_pkt_handle_atomrsp_send_completion(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_atomrsp_recv(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry); + +#endif + +#include "rxr_pkt_type_req.h" diff --git a/prov/efa/src/rxr/rxr_pkt_type_data.c b/prov/efa/src/rxr/rxr_pkt_type_data.c new file mode 100644 index 00000000000..732836c517b --- /dev/null +++ b/prov/efa/src/rxr/rxr_pkt_type_data.c @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "efa.h" +#include "rxr.h" +#include "rxr_msg.h" +#include "rxr_pkt_cmd.h" + +/* + * This function contains data packet related functions + * Data packet is used by long message protocol. + */ + +/* + * Functions to send data packet, including + */ + +ssize_t rxr_pkt_send_data(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + uint64_t payload_size, copied_size; + struct rxr_data_pkt *data_pkt; + struct efa_mr *desc; + + pkt_entry->x_entry = (void *)tx_entry; + pkt_entry->addr = tx_entry->addr; + desc = tx_entry->desc[0]; + + payload_size = MIN(tx_entry->total_len - tx_entry->bytes_sent, + ep->max_data_payload_size); + payload_size = MIN(payload_size, tx_entry->window); + + data_pkt = (struct rxr_data_pkt *)pkt_entry->pkt; + data_pkt->hdr.seg_size = payload_size; + + copied_size = ofi_copy_from_hmem_iov(data_pkt->data, + payload_size, + desc ? desc->peer.iface : FI_HMEM_SYSTEM, + desc ? desc->peer.device.reserved : 0, + tx_entry->iov, + tx_entry->iov_count, + tx_entry->bytes_sent); + assert(copied_size == payload_size); + + pkt_entry->pkt_size = copied_size + sizeof(struct rxr_data_hdr); + pkt_entry->addr = tx_entry->addr; + + return rxr_pkt_entry_send_with_flags(ep, pkt_entry, pkt_entry->addr, + tx_entry->send_flags); +} + +/* + * Copies all consecutive small iov's into one buffer. If the function reaches + * an iov greater than the max memcpy size, it will end, only copying up to + * that iov. 
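+ * An iov that has a memory registration is not copied here;
+ * rxr_pkt_send_data_desc() sends such iovs by reference instead.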
+ */ +static size_t rxr_copy_from_iov(void *buf, uint64_t remaining_len, + struct rxr_tx_entry *tx_entry) +{ + struct iovec *tx_iov = tx_entry->iov; + uint64_t done = 0, len; + + while (tx_entry->iov_index < tx_entry->iov_count && + done < remaining_len) { + len = tx_iov[tx_entry->iov_index].iov_len; + if (tx_entry->mr[tx_entry->iov_index]) + break; + + len -= tx_entry->iov_offset; + + /* + * If the amount to be written surpasses the remaining length, + * copy up to the remaining length and return, else copy the + * entire iov and continue. + */ + if (done + len > remaining_len) { + len = remaining_len - done; + memcpy((char *)buf + done, + (char *)tx_iov[tx_entry->iov_index].iov_base + + tx_entry->iov_offset, len); + tx_entry->iov_offset += len; + done += len; + break; + } + memcpy((char *)buf + done, + (char *)tx_iov[tx_entry->iov_index].iov_base + + tx_entry->iov_offset, len); + tx_entry->iov_index++; + tx_entry->iov_offset = 0; + done += len; + } + return done; +} + +ssize_t rxr_pkt_send_data_desc(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_data_pkt *data_pkt; + /* The user's iov */ + struct iovec *tx_iov = tx_entry->iov; + /* The constructed iov to be passed to sendv + * and corresponding fid_mrs + */ + struct iovec iov[ep->core_iov_limit]; + void *desc[ep->core_iov_limit]; + /* Constructed iov's total size */ + uint64_t payload_size = 0; + /* pkt_entry offset to write data into */ + uint64_t pkt_used = 0; + uint64_t orig_iov_index; + uint64_t orig_iov_offset; + /* Remaining size that can fit in the constructed iov */ + uint64_t remaining_len = MIN(tx_entry->window, + ep->max_data_payload_size); + /* The constructed iov's index */ + size_t i = 0; + size_t len = 0; + + ssize_t ret; + + orig_iov_index = tx_entry->iov_index; + orig_iov_offset = tx_entry->iov_offset; + + data_pkt = (struct rxr_data_pkt *)pkt_entry->pkt; + /* Assign packet header in constructed iov */ + iov[i].iov_base = rxr_pkt_start(pkt_entry); + iov[i].iov_len = sizeof(struct rxr_data_hdr); + desc[i] = fi_mr_desc(pkt_entry->mr); + i++; + + /* + * Loops until payload size is at max, all user iovs are sent, the + * constructed iov count is greater than the core iov limit, or the tx + * entry window is exhausted. Each iteration fills one entry of the + * iov to be sent. + */ + while (tx_entry->iov_index < tx_entry->iov_count && + remaining_len > 0 && i < ep->core_iov_limit) { + if (tx_entry->desc[tx_entry->iov_index]) { + iov[i].iov_base = + (char *)tx_iov[tx_entry->iov_index].iov_base + + tx_entry->iov_offset; + desc[i] = tx_entry->desc[tx_entry->iov_index]; + + len = tx_iov[tx_entry->iov_index].iov_len + - tx_entry->iov_offset; + if (len > remaining_len) { + len = remaining_len; + tx_entry->iov_offset += len; + } else { + tx_entry->iov_index++; + tx_entry->iov_offset = 0; + } + iov[i].iov_len = len; + } else { + /* It should be noted for cuda buffer, caller will always + * provide desc, and will not enter this branch. 
+ * + * Copies any consecutive small iov's, returning size + * written while updating iov index and offset + */ + + len = rxr_copy_from_iov((char *)data_pkt->data + + pkt_used, + remaining_len, + tx_entry); + + iov[i].iov_base = (char *)data_pkt->data + pkt_used; + iov[i].iov_len = len; + desc[i] = fi_mr_desc(pkt_entry->mr); + pkt_used += len; + } + payload_size += len; + remaining_len -= len; + i++; + } + data_pkt->hdr.seg_size = (uint16_t)payload_size; + pkt_entry->pkt_size = payload_size + RXR_DATA_HDR_SIZE; + pkt_entry->x_entry = tx_entry; + pkt_entry->addr = tx_entry->addr; + + FI_DBG(&rxr_prov, FI_LOG_EP_DATA, + "Sending an iov count, %zu with payload size: %lu.\n", + i, payload_size); + ret = rxr_pkt_entry_sendv(ep, pkt_entry, tx_entry->addr, + (const struct iovec *)iov, + desc, i, tx_entry->send_flags); + if (OFI_UNLIKELY(ret)) { + /* Reset tx_entry iov pointer on send failure. */ + tx_entry->iov_index = orig_iov_index; + tx_entry->iov_offset = orig_iov_offset; + } + return ret; +} + +void rxr_pkt_handle_data_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + + tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; + tx_entry->bytes_acked += + rxr_get_data_pkt(pkt_entry->pkt)->hdr.seg_size; + + if (tx_entry->total_len == tx_entry->bytes_acked) { + if (!(tx_entry->rxr_flags & RXR_DELIVERY_COMPLETE_REQUESTED)) + rxr_cq_handle_tx_completion(ep, tx_entry); + else + if (tx_entry->rxr_flags & RXR_RECEIPT_RECEIVED) + /* + * For long message protocol, + * when FI_DELIVERY_COMPLETE + * is requested, + * we have to write tx completions + * in either + * rxr_pkt_handle_data_send_completion() + * or rxr_pkt_handle_receipt_recv() + * depending on which of them + * is called later due + * to avoid accessing released + * tx_entry. + */ + rxr_cq_handle_tx_completion(ep, tx_entry); + } +} + +/* + * rxr_pkt_handle_data_recv() and related functions + */ + +/* + * rxr_pkt_proc_data() processes data in a DATA/READRSP + * pakcet entry. 
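+ * It updates the receive window and peer credits, copies the payload
+ * into the receive buffer via rxr_pkt_copy_to_rx(), and posts a CTS
+ * packet once the current window is exhausted.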
+ */ +void rxr_pkt_proc_data(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + struct rxr_pkt_entry *pkt_entry, + char *data, size_t seg_offset, + size_t seg_size) +{ + struct rxr_peer *peer; + bool all_received = 0; + ssize_t err; + +#if ENABLE_DEBUG + int pkt_type = rxr_get_base_hdr(pkt_entry->pkt)->type; + + assert(pkt_type == RXR_DATA_PKT || pkt_type == RXR_READRSP_PKT); +#endif + rx_entry->bytes_received += seg_size; + assert(rx_entry->bytes_received <= rx_entry->total_len); + all_received = (rx_entry->bytes_received == rx_entry->total_len); + + peer = rxr_ep_get_peer(ep, rx_entry->addr); + peer->rx_credits += ofi_div_ceil(seg_size, ep->max_data_payload_size); + + rx_entry->window -= seg_size; + if (ep->available_data_bufs < rxr_get_rx_pool_chunk_cnt(ep)) + ep->available_data_bufs++; + +#if ENABLE_DEBUG + /* rx_entry can be released by rxr_pkt_copy_to_rx + * so the call to dlist_remove must happen before + * call to rxr_copy_to_rx + */ + if (all_received) { + dlist_remove(&rx_entry->rx_pending_entry); + ep->rx_pending--; + } +#endif + err = rxr_pkt_copy_to_rx(ep, rx_entry, seg_offset, + pkt_entry, data, seg_size); + if (err) { + rxr_pkt_entry_release_rx(ep, pkt_entry); + rxr_cq_handle_rx_error(ep, rx_entry, err); + } + + if (all_received) + return; + + if (!rx_entry->window) { + assert(rx_entry->state == RXR_RX_RECV); + err = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_CTS_PKT, 0); + if (err) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "post CTS packet failed!\n"); + rxr_cq_handle_rx_error(ep, rx_entry, err); + } + } +} + +void rxr_pkt_handle_data_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_data_pkt *data_pkt; + struct rxr_rx_entry *rx_entry; + + data_pkt = (struct rxr_data_pkt *)pkt_entry->pkt; + + rx_entry = ofi_bufpool_get_ibuf(ep->rx_entry_pool, + data_pkt->hdr.rx_id); + + rxr_pkt_proc_data(ep, rx_entry, + pkt_entry, + data_pkt->data, + data_pkt->hdr.seg_offset, + data_pkt->hdr.seg_size); +} + diff --git a/prov/efa/src/rxr/rxr_pkt_type_misc.c b/prov/efa/src/rxr/rxr_pkt_type_misc.c new file mode 100644 index 00000000000..65a8dc12269 --- /dev/null +++ b/prov/efa/src/rxr/rxr_pkt_type_misc.c @@ -0,0 +1,653 @@ +/* + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "efa.h" +#include "rxr.h" +#include "rxr_msg.h" +#include "rxr_cntr.h" +#include "rxr_pkt_cmd.h" +#include "rxr_read.h" + +/* This file define functons for the following packet type: + * HANDSHAKE + * CTS + * READRSP + * RMA_CONTEXT + * EOR + */ + +/* HANDSHAKE packet related functions */ +ssize_t rxr_pkt_init_handshake(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry, + fi_addr_t addr) +{ + struct rxr_handshake_hdr *handshake_hdr; + + handshake_hdr = (struct rxr_handshake_hdr *)pkt_entry->pkt; + handshake_hdr->type = RXR_HANDSHAKE_PKT; + handshake_hdr->version = RXR_BASE_PROTOCOL_VERSION; + handshake_hdr->flags = 0; + handshake_hdr->maxproto = RXR_CUR_PROTOCOL_VERSION; + memcpy(handshake_hdr->features, ep->features, + RXR_NUM_PROTOCOL_VERSION * sizeof(uint64_t)); + + pkt_entry->pkt_size = sizeof(struct rxr_handshake_hdr) + + RXR_NUM_PROTOCOL_VERSION * sizeof(uint64_t); + pkt_entry->addr = addr; + return 0; +} + +void rxr_pkt_post_handshake(struct rxr_ep *ep, + struct rxr_peer *peer, + fi_addr_t addr) +{ + struct rxr_pkt_entry *pkt_entry; + ssize_t ret; + + assert(!(peer->flags & RXR_PEER_HANDSHAKE_SENT)); + + pkt_entry = rxr_pkt_entry_alloc(ep, ep->tx_pkt_efa_pool); + if (OFI_UNLIKELY(!pkt_entry)) + return; + + rxr_pkt_init_handshake(ep, pkt_entry, addr); + + /* + * TODO: Once we start using a core's selective completion capability, + * post the HANDSHAKE packets without FI_COMPLETION. + */ + ret = rxr_pkt_entry_send(ep, pkt_entry, addr); + + /* + * Skip sending this handshake on error and try again when processing the + * next REQ from this peer containing the source information + */ + if (OFI_UNLIKELY(ret)) { + rxr_pkt_entry_release_tx(ep, pkt_entry); + if (ret == -FI_EAGAIN) + return; + FI_WARN(&rxr_prov, FI_LOG_CQ, + "Failed to send a HANDSHAKE packet: ret %zd\n", ret); + return; + } + + peer->flags |= RXR_PEER_HANDSHAKE_SENT; +} + +void rxr_pkt_handle_handshake_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_peer *peer; + struct rxr_handshake_hdr *handshake_pkt; + + assert(pkt_entry->addr != FI_ADDR_NOTAVAIL); + + peer = rxr_ep_get_peer(ep, pkt_entry->addr); + assert(!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)); + + handshake_pkt = (struct rxr_handshake_hdr *)pkt_entry->pkt; + + peer->maxproto = handshake_pkt->maxproto; + memcpy(peer->features, handshake_pkt->features, + (handshake_pkt->maxproto - RXR_BASE_PROTOCOL_VERSION + 1) * sizeof(uint64_t)); + peer->flags |= RXR_PEER_HANDSHAKE_RECEIVED; + FI_DBG(&rxr_prov, FI_LOG_CQ, + "HANDSHAKE received from %" PRIu64 "\n", pkt_entry->addr); + rxr_pkt_entry_release_rx(ep, pkt_entry); + +} + +/* CTS packet related functions */ +void rxr_pkt_calc_cts_window_credits(struct rxr_ep *ep, struct rxr_peer *peer, + uint64_t size, int request, + int *window, int *credits) +{ + struct efa_av *av; + int num_peers; + + /* + * Adjust the peer credit pool based on the current AV size, which could + * have grown since the time this peer was initialized. 
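+	 *
+	 * Illustrative example (values made up): with rxr_env.rx_window_size =
+	 * 128, num_peers = 8 and peer->rx_credits currently 64, the per-peer
+	 * share ofi_div_ceil(128, 8) = 16 is smaller than 64, so rx_credits is
+	 * scaled down to ofi_div_ceil(64, 8) = 8 before the window and credits
+	 * for this transfer are computed below.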
+ */ + av = rxr_ep_av(ep); + num_peers = av->used - 1; + if (num_peers && ofi_div_ceil(rxr_env.rx_window_size, num_peers) < peer->rx_credits) + peer->rx_credits = ofi_div_ceil(peer->rx_credits, num_peers); + + /* + * Allocate credits for this transfer based on the request, the number + * of available data buffers, and the number of outstanding peers this + * endpoint is actively tracking in the AV. Also ensure that a minimum + * number of credits are allocated to the transfer so the sender can + * make progress. + */ + *credits = MIN(MIN(ep->available_data_bufs, ep->posted_bufs_efa), + peer->rx_credits); + *credits = MIN(request, *credits); + *credits = MAX(*credits, rxr_env.tx_min_credits); + *window = MIN(size, *credits * ep->max_data_payload_size); + if (peer->rx_credits > ofi_div_ceil(*window, ep->max_data_payload_size)) + peer->rx_credits -= ofi_div_ceil(*window, ep->max_data_payload_size); +} + +ssize_t rxr_pkt_init_cts(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + int window = 0; + struct rxr_cts_hdr *cts_hdr; + struct rxr_peer *peer; + size_t bytes_left; + + cts_hdr = (struct rxr_cts_hdr *)pkt_entry->pkt; + cts_hdr->type = RXR_CTS_PKT; + cts_hdr->version = RXR_BASE_PROTOCOL_VERSION; + cts_hdr->flags = 0; + + if (rx_entry->cq_entry.flags & FI_READ) + cts_hdr->flags |= RXR_CTS_READ_REQ; + + cts_hdr->tx_id = rx_entry->tx_id; + cts_hdr->rx_id = rx_entry->rx_id; + + bytes_left = rx_entry->total_len - rx_entry->bytes_received; + peer = rxr_ep_get_peer(ep, rx_entry->addr); + rxr_pkt_calc_cts_window_credits(ep, peer, bytes_left, + rx_entry->credit_request, + &window, &rx_entry->credit_cts); + cts_hdr->window = window; + pkt_entry->pkt_size = sizeof(struct rxr_cts_hdr); + pkt_entry->addr = rx_entry->addr; + pkt_entry->x_entry = (void *)rx_entry; + return 0; +} + +void rxr_pkt_handle_cts_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rx_entry *rx_entry; + + rx_entry = (struct rxr_rx_entry *)pkt_entry->x_entry; + rx_entry->window = rxr_get_cts_hdr(pkt_entry->pkt)->window; + ep->available_data_bufs -= rx_entry->credit_cts; + + /* + * Set a timer if available_bufs is exhausted. We may encounter a + * scenario where a peer has stopped responding so we need a fallback + * to replenish the credits. 
+ */ + if (OFI_UNLIKELY(ep->available_data_bufs == 0)) + ep->available_data_bufs_ts = ofi_gettime_us(); +} + +void rxr_pkt_handle_cts_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_peer *peer; + struct rxr_cts_hdr *cts_pkt; + struct rxr_tx_entry *tx_entry; + + cts_pkt = (struct rxr_cts_hdr *)pkt_entry->pkt; + if (cts_pkt->flags & RXR_CTS_READ_REQ) + tx_entry = ofi_bufpool_get_ibuf(ep->readrsp_tx_entry_pool, cts_pkt->tx_id); + else + tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool, cts_pkt->tx_id); + + tx_entry->rx_id = cts_pkt->rx_id; + tx_entry->window = cts_pkt->window; + + /* Return any excess tx_credits that were borrowed for the request */ + peer = rxr_ep_get_peer(ep, tx_entry->addr); + tx_entry->credit_allocated = ofi_div_ceil(cts_pkt->window, ep->max_data_payload_size); + if (tx_entry->credit_allocated < tx_entry->credit_request) + peer->tx_credits += tx_entry->credit_request - tx_entry->credit_allocated; + + rxr_pkt_entry_release_rx(ep, pkt_entry); + + if (tx_entry->state != RXR_TX_SEND) { + tx_entry->state = RXR_TX_SEND; + dlist_insert_tail(&tx_entry->entry, &ep->tx_pending_list); + } +} + +/* READRSP packet functions */ +int rxr_pkt_init_readrsp(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_readrsp_pkt *readrsp_pkt; + struct rxr_readrsp_hdr *readrsp_hdr; + size_t mtu = ep->mtu_size; + + readrsp_pkt = (struct rxr_readrsp_pkt *)pkt_entry->pkt; + readrsp_hdr = &readrsp_pkt->hdr; + readrsp_hdr->type = RXR_READRSP_PKT; + readrsp_hdr->version = RXR_BASE_PROTOCOL_VERSION; + readrsp_hdr->flags = 0; + readrsp_hdr->tx_id = tx_entry->tx_id; + readrsp_hdr->rx_id = tx_entry->rx_id; + readrsp_hdr->seg_size = ofi_copy_from_iov(readrsp_pkt->data, + mtu - RXR_READRSP_HDR_SIZE, + tx_entry->iov, + tx_entry->iov_count, 0); + pkt_entry->pkt_size = RXR_READRSP_HDR_SIZE + readrsp_hdr->seg_size; + pkt_entry->addr = tx_entry->addr; + pkt_entry->x_entry = tx_entry; + return 0; +} + +void rxr_pkt_handle_readrsp_sent(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + size_t data_len; + struct efa_domain *efa_domain; + struct rxr_domain *rxr_domain = rxr_ep_domain(ep); + + efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain, + util_domain.domain_fid); + + tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; + data_len = rxr_get_readrsp_hdr(pkt_entry->pkt)->seg_size; + tx_entry->bytes_sent += data_len; + tx_entry->window -= data_len; + assert(tx_entry->window >= 0); + if (tx_entry->bytes_sent < tx_entry->total_len) { + assert(!efa_ep_is_cuda_mr(tx_entry->desc[0])); + if (tx_entry->desc[0] || efa_is_cache_available(efa_domain)) + rxr_prepare_desc_send(rxr_ep_domain(ep), tx_entry); + + tx_entry->state = RXR_TX_SEND; + dlist_insert_tail(&tx_entry->entry, + &ep->tx_pending_list); + } +} + +void rxr_pkt_handle_readrsp_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + struct rxr_readrsp_hdr *readrsp_hdr; + + readrsp_hdr = (struct rxr_readrsp_hdr *)pkt_entry->pkt; + + tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; + assert(tx_entry->cq_entry.flags & FI_READ); + + tx_entry->bytes_acked += readrsp_hdr->seg_size; + if (tx_entry->total_len == tx_entry->bytes_acked) + rxr_cq_handle_tx_completion(ep, tx_entry); +} + +void rxr_pkt_handle_readrsp_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_readrsp_pkt *readrsp_pkt = NULL; + struct rxr_readrsp_hdr *readrsp_hdr = NULL; + struct rxr_rx_entry 
*rx_entry = NULL; + + readrsp_pkt = (struct rxr_readrsp_pkt *)pkt_entry->pkt; + readrsp_hdr = &readrsp_pkt->hdr; + rx_entry = ofi_bufpool_get_ibuf(ep->rx_entry_pool, readrsp_hdr->rx_id); + assert(rx_entry->cq_entry.flags & FI_READ); + rx_entry->tx_id = readrsp_hdr->tx_id; + rxr_pkt_proc_data(ep, rx_entry, pkt_entry, + readrsp_pkt->data, + 0, readrsp_hdr->seg_size); +} + +/* RMA_CONTEXT packet functions + * + * RMA context packet is used a context of RMA operations and is not + * sent over wire. It is named packet because currently all EFA operation + * use a packet as context. + */ +void rxr_pkt_init_write_context(struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rma_context_pkt *rma_context_pkt; + + pkt_entry->x_entry = (void *)tx_entry; + rma_context_pkt = (struct rxr_rma_context_pkt *)pkt_entry->pkt; + rma_context_pkt->type = RXR_RMA_CONTEXT_PKT; + rma_context_pkt->version = RXR_BASE_PROTOCOL_VERSION; + rma_context_pkt->context_type = RXR_WRITE_CONTEXT; + rma_context_pkt->tx_id = tx_entry->tx_id; +} + +void rxr_pkt_init_read_context(struct rxr_ep *rxr_ep, + struct rxr_read_entry *read_entry, + size_t seg_size, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rma_context_pkt *ctx_pkt; + + pkt_entry->x_entry = read_entry; + pkt_entry->addr = read_entry->addr; + pkt_entry->pkt_size = sizeof(struct rxr_rma_context_pkt); + + ctx_pkt = (struct rxr_rma_context_pkt *)pkt_entry->pkt; + ctx_pkt->type = RXR_RMA_CONTEXT_PKT; + ctx_pkt->flags = 0; + ctx_pkt->version = RXR_BASE_PROTOCOL_VERSION; + ctx_pkt->context_type = RXR_READ_CONTEXT; + ctx_pkt->read_id = read_entry->read_id; + ctx_pkt->seg_size = seg_size; +} + +static +void rxr_pkt_handle_rma_read_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *context_pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + struct rxr_rx_entry *rx_entry; + struct rxr_pkt_entry *pkt_entry; + struct rxr_read_entry *read_entry; + struct rxr_rma_context_pkt *rma_context_pkt; + struct rxr_peer *peer; + int inject; + size_t data_size; + ssize_t ret; + + rma_context_pkt = (struct rxr_rma_context_pkt *)context_pkt_entry->pkt; + assert(rma_context_pkt->type == RXR_RMA_CONTEXT_PKT); + assert(rma_context_pkt->context_type == RXR_READ_CONTEXT); + + read_entry = (struct rxr_read_entry *)context_pkt_entry->x_entry; + read_entry->bytes_finished += rma_context_pkt->seg_size; + assert(read_entry->bytes_finished <= read_entry->total_len); + + if (read_entry->bytes_finished == read_entry->total_len) { + if (read_entry->context_type == RXR_READ_CONTEXT_TX_ENTRY) { + tx_entry = read_entry->context; + assert(tx_entry && tx_entry->cq_entry.flags & FI_READ); + rxr_cq_write_tx_completion(ep, tx_entry); + } else if (read_entry->context_type == RXR_READ_CONTEXT_RX_ENTRY) { + rx_entry = read_entry->context; + if (rx_entry->op == ofi_op_msg || rx_entry->op == ofi_op_tagged) { + rxr_cq_write_rx_completion(ep, rx_entry); + } else { + assert(rx_entry->op == ofi_op_write); + if (rx_entry->cq_entry.flags & FI_REMOTE_CQ_DATA) + rxr_cq_write_rx_completion(ep, rx_entry); + } + + inject = (read_entry->lower_ep_type == SHM_EP); + ret = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_EOR_PKT, inject); + if (OFI_UNLIKELY(ret)) { + if (rxr_cq_handle_rx_error(ep, rx_entry, ret)) + assert(0 && "failed to write err cq entry"); + rxr_release_rx_entry(ep, rx_entry); + } + } else { + assert(read_entry->context_type == RXR_READ_CONTEXT_PKT_ENTRY); + pkt_entry = read_entry->context; + data_size = rxr_pkt_data_size(pkt_entry); + assert(data_size > 0); + 
rxr_pkt_handle_data_copied(ep, pkt_entry, data_size); + } + + rxr_read_release_entry(ep, read_entry); + } + + if (read_entry->context_type == RXR_READ_CONTEXT_PKT_ENTRY) { + assert(context_pkt_entry->addr == FI_ADDR_NOTAVAIL); + ep->tx_pending--; + } else { + peer = rxr_ep_get_peer(ep, context_pkt_entry->addr); + if (!peer->is_local) + rxr_ep_dec_tx_pending(ep, peer, 0); + } +} + +void rxr_pkt_handle_rma_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *context_pkt_entry) +{ + struct rxr_tx_entry *tx_entry = NULL; + struct rxr_rma_context_pkt *rma_context_pkt; + + assert(rxr_get_base_hdr(context_pkt_entry->pkt)->version == RXR_BASE_PROTOCOL_VERSION); + + rma_context_pkt = (struct rxr_rma_context_pkt *)context_pkt_entry->pkt; + + switch (rma_context_pkt->context_type) { + case RXR_WRITE_CONTEXT: + tx_entry = (struct rxr_tx_entry *)context_pkt_entry->x_entry; + if (tx_entry->fi_flags & FI_COMPLETION) { + rxr_cq_write_tx_completion(ep, tx_entry); + } else { + efa_cntr_report_tx_completion(&ep->util_ep, tx_entry->cq_entry.flags); + rxr_release_tx_entry(ep, tx_entry); + } + break; + case RXR_READ_CONTEXT: + rxr_pkt_handle_rma_read_completion(ep, context_pkt_entry); + break; + default: + FI_WARN(&rxr_prov, FI_LOG_CQ, "invalid rma_context_type in RXR_RMA_CONTEXT_PKT %d\n", + rma_context_pkt->context_type); + assert(0 && "invalid RXR_RMA_CONTEXT_PKT rma_context_type\n"); + } + + rxr_pkt_entry_release_tx(ep, context_pkt_entry); +} + +/* EOR packet related functions */ +int rxr_pkt_init_eor(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_eor_hdr *eor_hdr; + + eor_hdr = (struct rxr_eor_hdr *)pkt_entry->pkt; + eor_hdr->type = RXR_EOR_PKT; + eor_hdr->version = RXR_BASE_PROTOCOL_VERSION; + eor_hdr->flags = 0; + eor_hdr->tx_id = rx_entry->tx_id; + eor_hdr->rx_id = rx_entry->rx_id; + pkt_entry->pkt_size = sizeof(struct rxr_eor_hdr); + pkt_entry->addr = rx_entry->addr; + pkt_entry->x_entry = rx_entry; + return 0; +} + +void rxr_pkt_handle_eor_sent(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry) +{ +} + +void rxr_pkt_handle_eor_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rx_entry *rx_entry; + + rx_entry = pkt_entry->x_entry; + assert(rx_entry && rx_entry->rx_id == rxr_get_eor_hdr(pkt_entry->pkt)->rx_id); + rxr_release_rx_entry(ep, rx_entry); +} + +/* + * Sender handles the acknowledgment (RXR_EOR_PKT) from receiver on the completion + * of the large message copy via fi_readmsg operation + */ +void rxr_pkt_handle_eor_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_eor_hdr *eor_hdr; + struct rxr_tx_entry *tx_entry; + + eor_hdr = (struct rxr_eor_hdr *)pkt_entry->pkt; + + /* pre-post buf used here, so can NOT track back to tx_entry with x_entry */ + tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool, eor_hdr->tx_id); + rxr_cq_write_tx_completion(ep, tx_entry); + rxr_pkt_entry_release_rx(ep, pkt_entry); +} + +/* receipt packet related functions */ +int rxr_pkt_init_receipt(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_receipt_hdr *receipt_hdr; + + receipt_hdr = rxr_get_receipt_hdr(pkt_entry->pkt); + receipt_hdr->type = RXR_RECEIPT_PKT; + receipt_hdr->version = RXR_BASE_PROTOCOL_VERSION; + receipt_hdr->flags = 0; + receipt_hdr->tx_id = rx_entry->tx_id; + receipt_hdr->msg_id = rx_entry->msg_id; + + pkt_entry->pkt_size = sizeof(struct rxr_receipt_hdr); + pkt_entry->addr = rx_entry->addr; + pkt_entry->x_entry = rx_entry; + + return 0; +} 
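+
+/*
+ * A RECEIPT packet is the receiver's acknowledgment that the message
+ * data has been delivered to the target buffer. It is used when the
+ * sender asked for delivery-complete semantics, e.g. (illustrative
+ * application-level usage, not part of this patch):
+ *
+ *     fi_sendmsg(ep, &msg, FI_DELIVERY_COMPLETE | FI_COMPLETION);
+ *
+ * On the sender side, rxr_pkt_handle_receipt_recv() further below uses
+ * the RECEIPT to decide when the tx completion may be written.
+ */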
+ +void rxr_pkt_handle_receipt_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ +} + +void rxr_pkt_handle_receipt_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rx_entry *rx_entry; + + rx_entry = (struct rxr_rx_entry *)pkt_entry->x_entry; + rxr_release_rx_entry(ep, rx_entry); +} + +/* atomrsp packet related functions: init, handle_sent, handle_send_completion and recv + * + * initialize atomic response packet by creating a packet that hold original data + * in rx_entry->iov. rx_entry->iov will then be changed by atomic operation. + * release that packet entry until it is sent. + */ +int rxr_pkt_init_atomrsp(struct rxr_ep *ep, struct rxr_rx_entry *rx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_atomrsp_hdr *atomrsp_hdr; + + assert(rx_entry->atomrsp_data); + pkt_entry->addr = rx_entry->addr; + pkt_entry->x_entry = rx_entry; + + atomrsp_hdr = (struct rxr_atomrsp_hdr *)pkt_entry->pkt; + atomrsp_hdr->type = RXR_ATOMRSP_PKT; + atomrsp_hdr->version = RXR_BASE_PROTOCOL_VERSION; + atomrsp_hdr->flags = 0; + atomrsp_hdr->tx_id = rx_entry->tx_id; + atomrsp_hdr->rx_id = rx_entry->rx_id; + atomrsp_hdr->seg_size = ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count); + + assert(RXR_ATOMRSP_HDR_SIZE + atomrsp_hdr->seg_size < ep->mtu_size); + + /* rx_entry->atomrsp_data was filled in rxr_pkt_handle_req_recv() */ + memcpy((char*)pkt_entry->pkt + RXR_ATOMRSP_HDR_SIZE, rx_entry->atomrsp_data, atomrsp_hdr->seg_size); + pkt_entry->pkt_size = RXR_ATOMRSP_HDR_SIZE + atomrsp_hdr->seg_size; + return 0; +} + +void rxr_pkt_handle_atomrsp_sent(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry) +{ +} + +void rxr_pkt_handle_atomrsp_send_completion(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rx_entry *rx_entry; + + rx_entry = (struct rxr_rx_entry *)pkt_entry->x_entry; + ofi_buf_free(rx_entry->atomrsp_data); + rx_entry->atomrsp_data = NULL; + rxr_release_rx_entry(ep, rx_entry); +} + +void rxr_pkt_handle_atomrsp_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_atomrsp_pkt *atomrsp_pkt = NULL; + struct rxr_atomrsp_hdr *atomrsp_hdr = NULL; + struct rxr_tx_entry *tx_entry = NULL; + + atomrsp_pkt = (struct rxr_atomrsp_pkt *)pkt_entry->pkt; + atomrsp_hdr = &atomrsp_pkt->hdr; + tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool, atomrsp_hdr->tx_id); + + ofi_copy_to_iov(tx_entry->atomic_ex.resp_iov, + tx_entry->atomic_ex.resp_iov_count, + 0, atomrsp_pkt->data, + atomrsp_hdr->seg_size); + + if (tx_entry->fi_flags & FI_COMPLETION) { + /* Note write_tx_completion() will release tx_entry */ + rxr_cq_write_tx_completion(ep, tx_entry); + } else { + efa_cntr_report_tx_completion(&ep->util_ep, tx_entry->cq_entry.flags); + rxr_release_tx_entry(ep, tx_entry); + } + + rxr_pkt_entry_release_rx(ep, pkt_entry); +} + +void rxr_pkt_handle_receipt_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry = NULL; + struct rxr_receipt_hdr *receipt_hdr; + + receipt_hdr = rxr_get_receipt_hdr(pkt_entry->pkt); + /* Retrieve the tx_entry that will be written into TX CQ*/ + tx_entry = ofi_bufpool_get_ibuf(ep->tx_entry_pool, + receipt_hdr->tx_id); + if (!tx_entry) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "Failed to retrive the tx_entry when hadling receipt packet.\n"); + return; + } + + tx_entry->rxr_flags |= RXR_RECEIPT_RECEIVED; + if (tx_entry->rxr_flags & RXR_LONGCTS_PROTOCOL) { + /* + * For long message protocol, when FI_DELIVERY_COMPLETE + * is requested, we have to write tx completions + * in either 
rxr_pkt_handle_data_send_completion()
+	 * or rxr_pkt_handle_receipt_recv(), depending on which of them
+	 * is called later, in order to avoid accessing a released
+	 * tx_entry.
+	 */
+		if (tx_entry->total_len == tx_entry->bytes_acked)
+			rxr_cq_handle_tx_completion(ep, tx_entry);
+	} else {
+		rxr_cq_handle_tx_completion(ep, tx_entry);
+	}
+
+	rxr_pkt_entry_release_rx(ep, pkt_entry);
+}
+
diff --git a/prov/efa/src/rxr/rxr_pkt_type_req.c b/prov/efa/src/rxr/rxr_pkt_type_req.c
new file mode 100644
index 00000000000..1325b18c59a
--- /dev/null
+++ b/prov/efa/src/rxr/rxr_pkt_type_req.c
@@ -0,0 +1,2181 @@
+/*
+ * Copyright (c) 2019 Amazon.com, Inc. or its affiliates.
+ * All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include
+#include "efa.h"
+#include "rxr.h"
+#include "rxr_rma.h"
+#include "rxr_msg.h"
+#include "rxr_pkt_cmd.h"
+#include "rxr_read.h"
+
+/*
+ * Utility constants and functions shared by all REQ packet
+ * types.
+ */
+struct rxr_req_inf {
+	uint64_t protover;
+	uint64_t base_hdr_size;
+	uint64_t ex_feature_flag;
+};
+
+/*
+ * Starting from protocol version 4, each REQ packet type will be assigned a
+ * version number, and once assigned, the version number will not change.
+ *
+ * Baseline features are always version 4 features and do not have an
+ * ex_feature_flag.
+ *
+ * Each extra feature is assigned a version and an ex_feature_flag.
+ * Each extra feature will correspond to 1 or more REQ packet types.
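+ *
+ * For example, RXR_EAGER_MSGRTM_PKT below is a baseline type and
+ * carries no feature flag, while RXR_READ_MSGRTM_PKT is tied to
+ * RXR_REQ_FEATURE_RDMA_READ and the DC (delivery complete) types are
+ * tied to RXR_REQ_FEATURE_DELIVERY_COMPLETE.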
+ */ +static const +struct rxr_req_inf REQ_INF_LIST[] = { + /* rtm header */ + [RXR_EAGER_MSGRTM_PKT] = {4, sizeof(struct rxr_eager_msgrtm_hdr), 0}, + [RXR_EAGER_TAGRTM_PKT] = {4, sizeof(struct rxr_eager_tagrtm_hdr), 0}, + [RXR_MEDIUM_MSGRTM_PKT] = {4, sizeof(struct rxr_medium_msgrtm_hdr), 0}, + [RXR_MEDIUM_TAGRTM_PKT] = {4, sizeof(struct rxr_medium_tagrtm_hdr), 0}, + [RXR_LONG_MSGRTM_PKT] = {4, sizeof(struct rxr_long_msgrtm_hdr), 0}, + [RXR_LONG_TAGRTM_PKT] = {4, sizeof(struct rxr_long_tagrtm_hdr), 0}, + [RXR_READ_MSGRTM_PKT] = {4, sizeof(struct rxr_read_msgrtm_hdr), RXR_REQ_FEATURE_RDMA_READ}, + [RXR_READ_TAGRTM_PKT] = {4, sizeof(struct rxr_read_tagrtm_hdr), RXR_REQ_FEATURE_RDMA_READ}, + [RXR_DC_EAGER_MSGRTM_PKT] = {4, sizeof(struct rxr_dc_eager_msgrtm_hdr), RXR_REQ_FEATURE_DELIVERY_COMPLETE}, + [RXR_DC_EAGER_TAGRTM_PKT] = {4, sizeof(struct rxr_dc_eager_tagrtm_hdr), RXR_REQ_FEATURE_DELIVERY_COMPLETE}, + [RXR_DC_MEDIUM_MSGRTM_PKT] = {4, sizeof(struct rxr_dc_medium_msgrtm_hdr), RXR_REQ_FEATURE_DELIVERY_COMPLETE}, + [RXR_DC_MEDIUM_TAGRTM_PKT] = {4, sizeof(struct rxr_dc_medium_tagrtm_hdr), RXR_REQ_FEATURE_DELIVERY_COMPLETE}, + [RXR_DC_LONG_MSGRTM_PKT] = {4, sizeof(struct rxr_long_msgrtm_hdr), RXR_REQ_FEATURE_DELIVERY_COMPLETE}, + [RXR_DC_LONG_TAGRTM_PKT] = {4, sizeof(struct rxr_long_tagrtm_hdr), RXR_REQ_FEATURE_DELIVERY_COMPLETE}, + /* rtw header */ + [RXR_EAGER_RTW_PKT] = {4, sizeof(struct rxr_eager_rtw_hdr), 0}, + [RXR_DC_EAGER_RTW_PKT] = {4, sizeof(struct rxr_dc_eager_rtw_hdr), RXR_REQ_FEATURE_DELIVERY_COMPLETE}, + [RXR_LONG_RTW_PKT] = {4, sizeof(struct rxr_long_rtw_hdr), 0}, + [RXR_DC_LONG_RTW_PKT] = {4, sizeof(struct rxr_long_rtw_hdr), RXR_REQ_FEATURE_DELIVERY_COMPLETE}, + [RXR_READ_RTW_PKT] = {4, sizeof(struct rxr_read_rtw_hdr), RXR_REQ_FEATURE_RDMA_READ}, + /* rtr header */ + [RXR_SHORT_RTR_PKT] = {4, sizeof(struct rxr_rtr_hdr), 0}, + [RXR_LONG_RTR_PKT] = {4, sizeof(struct rxr_rtr_hdr), 0}, + [RXR_READ_RTR_PKT] = {4, sizeof(struct rxr_base_hdr), RXR_REQ_FEATURE_RDMA_READ}, + /* rta header */ + [RXR_WRITE_RTA_PKT] = {4, sizeof(struct rxr_rta_hdr), 0}, + [RXR_DC_WRITE_RTA_PKT] = {4, sizeof(struct rxr_rta_hdr), RXR_REQ_FEATURE_DELIVERY_COMPLETE}, + [RXR_FETCH_RTA_PKT] = {4, sizeof(struct rxr_rta_hdr), 0}, + [RXR_COMPARE_RTA_PKT] = {4, sizeof(struct rxr_rta_hdr), 0}, +}; + +size_t rxr_pkt_req_data_size(struct rxr_pkt_entry *pkt_entry) +{ + size_t hdr_size; + + hdr_size = rxr_pkt_req_hdr_size(pkt_entry); + assert(hdr_size > 0); + return pkt_entry->pkt_size - hdr_size; +} + +void rxr_pkt_init_req_hdr(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + int pkt_type, + struct rxr_pkt_entry *pkt_entry) +{ + char *opt_hdr; + struct rxr_peer *peer; + struct rxr_base_hdr *base_hdr; + + /* init the base header */ + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + base_hdr->type = pkt_type; + base_hdr->version = REQ_INF_LIST[pkt_type].protover; + base_hdr->flags = 0; + + peer = rxr_ep_get_peer(ep, tx_entry->addr); + + if (OFI_UNLIKELY(!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED))) { + /* + * This is the first communication with this peer on this + * endpoint, so send the core's address for this EP in the REQ + * so the remote side can insert it into its address vector. 
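+		 *
+		 * The REQ header built below is laid out as: base header, then
+		 * the optional raw address header (addr_len plus the raw
+		 * address bytes) when RXR_REQ_OPT_RAW_ADDR_HDR is set, then the
+		 * optional CQ data header when RXR_REQ_OPT_CQ_DATA_HDR is set,
+		 * followed by any packet-type specific data.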
+ */ + base_hdr->flags |= RXR_REQ_OPT_RAW_ADDR_HDR; + } + + if (tx_entry->fi_flags & FI_REMOTE_CQ_DATA) { + base_hdr->flags |= RXR_REQ_OPT_CQ_DATA_HDR; + } + + /* init the opt header */ + opt_hdr = (char *)base_hdr + rxr_pkt_req_base_hdr_size(pkt_entry); + if (base_hdr->flags & RXR_REQ_OPT_RAW_ADDR_HDR) { + struct rxr_req_opt_raw_addr_hdr *raw_addr_hdr; + + raw_addr_hdr = (struct rxr_req_opt_raw_addr_hdr *)opt_hdr; + raw_addr_hdr->addr_len = ep->core_addrlen; + memcpy(raw_addr_hdr->raw_addr, ep->core_addr, raw_addr_hdr->addr_len); + opt_hdr += sizeof(*raw_addr_hdr) + raw_addr_hdr->addr_len; + } + + if (base_hdr->flags & RXR_REQ_OPT_CQ_DATA_HDR) { + struct rxr_req_opt_cq_data_hdr *cq_data_hdr; + + cq_data_hdr = (struct rxr_req_opt_cq_data_hdr *)opt_hdr; + cq_data_hdr->cq_data = tx_entry->cq_entry.data; + opt_hdr += sizeof(*cq_data_hdr); + } + + pkt_entry->addr = tx_entry->addr; +} + +size_t rxr_pkt_req_base_hdr_size(struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_base_hdr *base_hdr; + size_t hdr_size; + + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + assert(base_hdr->type >= RXR_REQ_PKT_BEGIN); + + hdr_size = REQ_INF_LIST[base_hdr->type].base_hdr_size; + if (base_hdr->type == RXR_EAGER_RTW_PKT || + base_hdr->type == RXR_DC_EAGER_RTW_PKT || + base_hdr->type == RXR_LONG_RTW_PKT || + base_hdr->type == RXR_DC_LONG_RTW_PKT || + base_hdr->type == RXR_READ_RTW_PKT) + hdr_size += rxr_get_rtw_base_hdr(pkt_entry->pkt)->rma_iov_count * sizeof(struct fi_rma_iov); + else if (base_hdr->type == RXR_SHORT_RTR_PKT || + base_hdr->type == RXR_LONG_RTR_PKT) + hdr_size += rxr_get_rtr_hdr(pkt_entry->pkt)->rma_iov_count * sizeof(struct fi_rma_iov); + else if (base_hdr->type == RXR_WRITE_RTA_PKT || + base_hdr->type == RXR_DC_WRITE_RTA_PKT || + base_hdr->type == RXR_FETCH_RTA_PKT || + base_hdr->type == RXR_COMPARE_RTA_PKT) + hdr_size += rxr_get_rta_hdr(pkt_entry->pkt)->rma_iov_count * sizeof(struct fi_rma_iov); + + return hdr_size; +} + +void *rxr_pkt_req_raw_addr(struct rxr_pkt_entry *pkt_entry) +{ + char *opt_hdr; + struct rxr_base_hdr *base_hdr; + struct rxr_req_opt_raw_addr_hdr *raw_addr_hdr; + + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + opt_hdr = (char *)pkt_entry->pkt + rxr_pkt_req_base_hdr_size(pkt_entry); + if (base_hdr->flags & RXR_REQ_OPT_RAW_ADDR_HDR) { + raw_addr_hdr = (struct rxr_req_opt_raw_addr_hdr *)opt_hdr; + return raw_addr_hdr->raw_addr; + } + + return NULL; +} + +size_t rxr_pkt_req_hdr_size(struct rxr_pkt_entry *pkt_entry) +{ + char *opt_hdr; + struct rxr_base_hdr *base_hdr; + struct rxr_req_opt_raw_addr_hdr *raw_addr_hdr; + + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + opt_hdr = (char *)pkt_entry->pkt + rxr_pkt_req_base_hdr_size(pkt_entry); + if (base_hdr->flags & RXR_REQ_OPT_RAW_ADDR_HDR) { + raw_addr_hdr = (struct rxr_req_opt_raw_addr_hdr *)opt_hdr; + opt_hdr += sizeof(struct rxr_req_opt_raw_addr_hdr) + raw_addr_hdr->addr_len; + } + + if (base_hdr->flags & RXR_REQ_OPT_CQ_DATA_HDR) + opt_hdr += sizeof(struct rxr_req_opt_cq_data_hdr); + + return opt_hdr - (char *)pkt_entry->pkt; +} + +int64_t rxr_pkt_req_cq_data(struct rxr_pkt_entry *pkt_entry) +{ + char *opt_hdr; + struct rxr_base_hdr *base_hdr; + struct rxr_req_opt_cq_data_hdr *cq_data_hdr; + struct rxr_req_opt_raw_addr_hdr *raw_addr_hdr; + + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + opt_hdr = (char *)pkt_entry->pkt + rxr_pkt_req_base_hdr_size(pkt_entry); + if (base_hdr->flags & RXR_REQ_OPT_RAW_ADDR_HDR) { + raw_addr_hdr = (struct rxr_req_opt_raw_addr_hdr *)opt_hdr; + opt_hdr += sizeof(struct rxr_req_opt_raw_addr_hdr) + 
raw_addr_hdr->addr_len; + } + + assert(base_hdr->flags & RXR_REQ_OPT_CQ_DATA_HDR); + cq_data_hdr = (struct rxr_req_opt_cq_data_hdr *)opt_hdr; + return cq_data_hdr->cq_data; +} + +size_t rxr_pkt_req_max_header_size(int pkt_type) +{ + int max_hdr_size = REQ_INF_LIST[pkt_type].base_hdr_size + + sizeof(struct rxr_req_opt_raw_addr_hdr) + RXR_MAX_NAME_LENGTH + + sizeof(struct rxr_req_opt_cq_data_hdr); + + if (pkt_type == RXR_EAGER_RTW_PKT || + pkt_type == RXR_DC_EAGER_RTW_PKT || + pkt_type == RXR_LONG_RTW_PKT) + max_hdr_size += RXR_IOV_LIMIT * sizeof(struct fi_rma_iov); + + return max_hdr_size; +} + +size_t rxr_pkt_max_header_size(void) +{ + size_t max_hdr_size = 0; + size_t pkt_type = RXR_REQ_PKT_BEGIN; + + while (pkt_type < RXR_EXTRA_REQ_PKT_END) { + max_hdr_size = MAX(max_hdr_size, + rxr_pkt_req_max_header_size(pkt_type)); + if (pkt_type == RXR_BASELINE_REQ_PKT_END) + pkt_type = RXR_EXTRA_REQ_PKT_BEGIN; + else + pkt_type += 1; + } + + return max_hdr_size; + +} + +size_t rxr_pkt_req_max_data_size(struct rxr_ep *ep, fi_addr_t addr, int pkt_type) +{ + struct rxr_peer *peer; + + peer = rxr_ep_get_peer(ep, addr); + + if (peer->is_local) { + assert(ep->use_shm); + return rxr_env.shm_max_medium_size; + } + + return ep->mtu_size - rxr_pkt_req_max_header_size(pkt_type); +} + +/* + * REQ packet type functions + * + * init() functions + */ + +/* + * this function is called after you have set header in pkt_entry->pkt + */ +void rxr_pkt_data_from_tx(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry, + struct rxr_tx_entry *tx_entry, size_t data_offset, + size_t data_size) +{ + int tx_iov_index; + size_t tx_iov_offset; + char *data; + size_t hdr_size; + struct efa_mr *desc; + + assert(pkt_entry->send); + hdr_size = rxr_pkt_req_hdr_size(pkt_entry); + assert(hdr_size > 0); + if (data_size == 0) { + pkt_entry->send->iov_count = 0; + pkt_entry->pkt_size = hdr_size; + return; + } + + rxr_locate_iov_pos(tx_entry->iov, tx_entry->iov_count, data_offset, + &tx_iov_index, &tx_iov_offset); + desc = tx_entry->desc[0]; + assert(tx_iov_index < tx_entry->iov_count); + assert(tx_iov_offset < tx_entry->iov[tx_iov_index].iov_len); + + /* + * We want to go through the bounce-buffers here only when + * one of the following conditions are true: + * 1. The application can not register buffers (no FI_MR_LOCAL) + * 2. desc.peer.iface is anything but FI_HMEM_SYSTEM + * 3. prov/shm is not used for this transfer, and #1 or #2 hold true. + * + * In the first case, we use the pre-registered pkt_entry's MR. In the + * second case, this is for the eager and medium-message protocols which + * can not rendezvous and pull the data from a peer. In the third case, + * the bufpool would not have been created with a registration handler, + * so pkt_entry->mr will be NULL. + * + */ + if (!tx_entry->desc[tx_iov_index] && pkt_entry->mr) { + data = (char *)pkt_entry->pkt + hdr_size; + data_size = ofi_copy_from_hmem_iov(data, + data_size, + desc ? desc->peer.iface : FI_HMEM_SYSTEM, + desc ? desc->peer.device.reserved : 0, + tx_entry->iov, + tx_entry->iov_count, + data_offset); + pkt_entry->send->iov_count = 0; + pkt_entry->pkt_size = hdr_size + data_size; + return; + } + + assert(ep->core_iov_limit >= 2); + pkt_entry->send->iov[0].iov_base = pkt_entry->pkt; + pkt_entry->send->iov[0].iov_len = hdr_size; + pkt_entry->send->desc[0] = pkt_entry->mr ? 
fi_mr_desc(pkt_entry->mr) : NULL; + + pkt_entry->send->iov[1].iov_base = (char *)tx_entry->iov[tx_iov_index].iov_base + tx_iov_offset; + pkt_entry->send->iov[1].iov_len = MIN(data_size, tx_entry->iov[tx_iov_index].iov_len - tx_iov_offset); + pkt_entry->send->desc[1] = tx_entry->desc[tx_iov_index]; + pkt_entry->send->iov_count = 2; + pkt_entry->pkt_size = hdr_size + pkt_entry->send->iov[1].iov_len; +} + +void rxr_pkt_init_rtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + int pkt_type, uint64_t data_offset, + struct rxr_pkt_entry *pkt_entry) +{ + size_t data_size; + struct rxr_rtm_base_hdr *rtm_hdr; + rxr_pkt_init_req_hdr(ep, tx_entry, pkt_type, pkt_entry); + + rtm_hdr = (struct rxr_rtm_base_hdr *)pkt_entry->pkt; + rtm_hdr->flags |= RXR_REQ_MSG; + rtm_hdr->msg_id = tx_entry->msg_id; + + data_size = MIN(tx_entry->total_len - data_offset, + ep->mtu_size - rxr_pkt_req_hdr_size(pkt_entry)); + rxr_pkt_data_from_tx(ep, pkt_entry, tx_entry, data_offset, data_size); + pkt_entry->x_entry = tx_entry; +} + +ssize_t rxr_pkt_init_eager_msgrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + rxr_pkt_init_rtm(ep, tx_entry, RXR_EAGER_MSGRTM_PKT, 0, pkt_entry); + assert(tx_entry->total_len == rxr_pkt_req_data_size(pkt_entry)); + return 0; +} + +ssize_t rxr_pkt_init_dc_eager_msgrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_dc_eager_msgrtm_hdr *dc_eager_msgrtm_hdr; + + rxr_pkt_init_rtm(ep, tx_entry, RXR_DC_EAGER_MSGRTM_PKT, 0, pkt_entry); + dc_eager_msgrtm_hdr = rxr_get_dc_eager_msgrtm_hdr(pkt_entry->pkt); + dc_eager_msgrtm_hdr->hdr.tx_id = tx_entry->tx_id; + return 0; +} + +ssize_t rxr_pkt_init_eager_tagrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_base_hdr *base_hdr; + + rxr_pkt_init_rtm(ep, tx_entry, RXR_EAGER_TAGRTM_PKT, 0, pkt_entry); + assert(tx_entry->total_len == rxr_pkt_req_data_size(pkt_entry)); + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + base_hdr->flags |= RXR_REQ_TAGGED; + rxr_pkt_rtm_settag(pkt_entry, tx_entry->tag); + return 0; +} + +ssize_t rxr_pkt_init_dc_eager_tagrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_base_hdr *base_hdr; + struct rxr_dc_eager_tagrtm_hdr *dc_eager_tagrtm_hdr; + + rxr_pkt_init_rtm(ep, tx_entry, RXR_DC_EAGER_TAGRTM_PKT, 0, pkt_entry); + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + base_hdr->flags |= RXR_REQ_TAGGED; + rxr_pkt_rtm_settag(pkt_entry, tx_entry->tag); + + dc_eager_tagrtm_hdr = rxr_get_dc_eager_tagrtm_hdr(pkt_entry->pkt); + dc_eager_tagrtm_hdr->hdr.tx_id = tx_entry->tx_id; + return 0; +} + +ssize_t rxr_pkt_init_medium_msgrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_medium_rtm_base_hdr *rtm_hdr; + + rxr_pkt_init_rtm(ep, tx_entry, RXR_MEDIUM_MSGRTM_PKT, + tx_entry->bytes_sent, pkt_entry); + rtm_hdr = rxr_get_medium_rtm_base_hdr(pkt_entry->pkt); + rtm_hdr->data_len = tx_entry->total_len; + rtm_hdr->offset = tx_entry->bytes_sent; + return 0; +} + +ssize_t rxr_pkt_init_dc_medium_msgrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_dc_medium_msgrtm_hdr *dc_medium_msgrtm_hdr; + + rxr_pkt_init_rtm(ep, tx_entry, RXR_DC_MEDIUM_MSGRTM_PKT, + tx_entry->bytes_sent, pkt_entry); + + dc_medium_msgrtm_hdr = rxr_get_dc_medium_msgrtm_hdr(pkt_entry->pkt); + dc_medium_msgrtm_hdr->hdr.data_len = tx_entry->total_len; + 
dc_medium_msgrtm_hdr->hdr.offset = tx_entry->bytes_sent; + dc_medium_msgrtm_hdr->hdr.tx_id = tx_entry->tx_id; + return 0; +} + +ssize_t rxr_pkt_init_medium_tagrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_medium_rtm_base_hdr *rtm_hdr; + + rxr_pkt_init_rtm(ep, tx_entry, RXR_MEDIUM_TAGRTM_PKT, + tx_entry->bytes_sent, pkt_entry); + rtm_hdr = rxr_get_medium_rtm_base_hdr(pkt_entry->pkt); + rtm_hdr->data_len = tx_entry->total_len; + rtm_hdr->offset = tx_entry->bytes_sent; + rtm_hdr->hdr.flags |= RXR_REQ_TAGGED; + rxr_pkt_rtm_settag(pkt_entry, tx_entry->tag); + return 0; +} + +ssize_t rxr_pkt_init_dc_medium_tagrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_dc_medium_tagrtm_hdr *dc_medium_tagrtm_hdr; + + rxr_pkt_init_rtm(ep, tx_entry, RXR_DC_MEDIUM_TAGRTM_PKT, + tx_entry->bytes_sent, pkt_entry); + + dc_medium_tagrtm_hdr = rxr_get_dc_medium_tagrtm_hdr(pkt_entry->pkt); + dc_medium_tagrtm_hdr->hdr.data_len = tx_entry->total_len; + dc_medium_tagrtm_hdr->hdr.offset = tx_entry->bytes_sent; + dc_medium_tagrtm_hdr->hdr.hdr.flags |= RXR_REQ_TAGGED; + dc_medium_tagrtm_hdr->hdr.tx_id = tx_entry->tx_id; + rxr_pkt_rtm_settag(pkt_entry, tx_entry->tag); + return 0; +} + +void rxr_pkt_init_long_rtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + int pkt_type, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_long_rtm_base_hdr *rtm_hdr; + + rxr_pkt_init_rtm(ep, tx_entry, pkt_type, 0, pkt_entry); + rtm_hdr = rxr_get_long_rtm_base_hdr(pkt_entry->pkt); + rtm_hdr->data_len = tx_entry->total_len; + rtm_hdr->tx_id = tx_entry->tx_id; + rtm_hdr->credit_request = tx_entry->credit_request; +} + +ssize_t rxr_pkt_init_long_msgrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + rxr_pkt_init_long_rtm(ep, tx_entry, RXR_LONG_MSGRTM_PKT, pkt_entry); + return 0; +} + +ssize_t rxr_pkt_init_dc_long_msgrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + rxr_pkt_init_long_rtm(ep, tx_entry, RXR_DC_LONG_MSGRTM_PKT, pkt_entry); + return 0; +} + +ssize_t rxr_pkt_init_long_tagrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_base_hdr *base_hdr; + + rxr_pkt_init_long_rtm(ep, tx_entry, RXR_LONG_TAGRTM_PKT, pkt_entry); + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + base_hdr->flags |= RXR_REQ_TAGGED; + rxr_pkt_rtm_settag(pkt_entry, tx_entry->tag); + return 0; +} + +ssize_t rxr_pkt_init_dc_long_tagrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_base_hdr *base_hdr; + + rxr_pkt_init_long_rtm(ep, tx_entry, RXR_DC_LONG_TAGRTM_PKT, pkt_entry); + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + base_hdr->flags |= RXR_REQ_TAGGED; + rxr_pkt_rtm_settag(pkt_entry, tx_entry->tag); + return 0; +} + +ssize_t rxr_pkt_init_read_rtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + int pkt_type, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_read_rtm_base_hdr *rtm_hdr; + struct fi_rma_iov *read_iov; + size_t hdr_size; + int err; + + rxr_pkt_init_req_hdr(ep, tx_entry, pkt_type, pkt_entry); + + rtm_hdr = rxr_get_read_rtm_base_hdr(pkt_entry->pkt); + rtm_hdr->hdr.flags |= RXR_REQ_MSG; + rtm_hdr->hdr.msg_id = tx_entry->msg_id; + rtm_hdr->data_len = tx_entry->total_len; + rtm_hdr->tx_id = tx_entry->tx_id; + rtm_hdr->read_iov_count = tx_entry->iov_count; + + hdr_size = rxr_pkt_req_hdr_size(pkt_entry); + read_iov = (struct fi_rma_iov 
*)((char *)pkt_entry->pkt + hdr_size); + err = rxr_read_init_iov(ep, tx_entry, read_iov); + if (OFI_UNLIKELY(err)) + return err; + + pkt_entry->pkt_size = hdr_size + tx_entry->iov_count * sizeof(struct fi_rma_iov); + return 0; +} + +ssize_t rxr_pkt_init_read_msgrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + return rxr_pkt_init_read_rtm(ep, tx_entry, RXR_READ_MSGRTM_PKT, pkt_entry); +} + +ssize_t rxr_pkt_init_read_tagrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + ssize_t err; + struct rxr_base_hdr *base_hdr; + + err = rxr_pkt_init_read_rtm(ep, tx_entry, RXR_READ_TAGRTM_PKT, pkt_entry); + if (err) + return err; + + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + base_hdr->flags |= RXR_REQ_TAGGED; + rxr_pkt_rtm_settag(pkt_entry, tx_entry->tag); + return 0; +} + +/* + * handle_sent() functions + */ + +/* + * rxr_pkt_handle_eager_rtm_sent() is empty and is defined in rxr_pkt_type_req.h + */ +void rxr_pkt_handle_medium_rtm_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + + tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; + tx_entry->bytes_sent += rxr_pkt_req_data_size(pkt_entry); +} + +void rxr_pkt_handle_long_rtm_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + struct efa_domain *efa_domain; + struct rxr_domain *rxr_domain = rxr_ep_domain(ep); + + efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain, + util_domain.domain_fid); + + tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; + tx_entry->bytes_sent += rxr_pkt_req_data_size(pkt_entry); + assert(tx_entry->bytes_sent < tx_entry->total_len); + + if (tx_entry->desc[0] || efa_is_cache_available(efa_domain)) + rxr_prepare_desc_send(rxr_ep_domain(ep), tx_entry); +} + +/* + * handle_send_completion() functions + */ +void rxr_pkt_handle_eager_rtm_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + + tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; + assert(tx_entry->total_len == rxr_pkt_req_data_size(pkt_entry)); + rxr_cq_handle_tx_completion(ep, tx_entry); +} + +void rxr_pkt_handle_medium_rtm_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + + tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; + tx_entry->bytes_acked += rxr_pkt_req_data_size(pkt_entry); + if (tx_entry->total_len == tx_entry->bytes_acked) + rxr_cq_handle_tx_completion(ep, tx_entry); +} + +void rxr_pkt_handle_long_rtm_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + + tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; + tx_entry->bytes_acked += rxr_pkt_req_data_size(pkt_entry); + if (tx_entry->total_len == tx_entry->bytes_acked) + rxr_cq_handle_tx_completion(ep, tx_entry); +} + +void rxr_pkt_handle_dc_long_rtm_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + + tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; + tx_entry->bytes_acked += rxr_pkt_req_data_size(pkt_entry); + if (tx_entry->total_len == tx_entry->bytes_acked && + tx_entry->rxr_flags & RXR_RECEIPT_RECEIVED) + rxr_cq_handle_tx_completion(ep, tx_entry); +} + +/* + * proc() functions + */ +size_t rxr_pkt_rtm_total_len(struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_base_hdr *base_hdr; + + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + switch (base_hdr->type) { + case 
RXR_EAGER_MSGRTM_PKT: + case RXR_EAGER_TAGRTM_PKT: + case RXR_DC_EAGER_MSGRTM_PKT: + case RXR_DC_EAGER_TAGRTM_PKT: + return rxr_pkt_req_data_size(pkt_entry); + case RXR_MEDIUM_MSGRTM_PKT: + case RXR_MEDIUM_TAGRTM_PKT: + return rxr_get_medium_rtm_base_hdr(pkt_entry->pkt)->data_len; + case RXR_DC_MEDIUM_MSGRTM_PKT: + case RXR_DC_MEDIUM_TAGRTM_PKT: + return rxr_get_dc_medium_rtm_base_hdr(pkt_entry->pkt)->data_len; + case RXR_LONG_MSGRTM_PKT: + case RXR_LONG_TAGRTM_PKT: + case RXR_DC_LONG_MSGRTM_PKT: + case RXR_DC_LONG_TAGRTM_PKT: + return rxr_get_long_rtm_base_hdr(pkt_entry->pkt)->data_len; + case RXR_READ_MSGRTM_PKT: + case RXR_READ_TAGRTM_PKT: + return rxr_get_read_rtm_base_hdr(pkt_entry->pkt)->data_len; + default: + assert(0 && "Unknown REQ packet type\n"); + } + + return 0; +} + +void rxr_pkt_rtm_init_rx_entry(struct rxr_pkt_entry *pkt_entry, + struct rxr_rx_entry *rx_entry) +{ + struct rxr_base_hdr *base_hdr; + + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + if (base_hdr->flags & RXR_REQ_OPT_CQ_DATA_HDR) { + rx_entry->rxr_flags |= RXR_REMOTE_CQ_DATA; + rx_entry->cq_entry.flags |= FI_REMOTE_CQ_DATA; + rx_entry->cq_entry.data = rxr_pkt_req_cq_data(pkt_entry); + } + + rx_entry->addr = pkt_entry->addr; + rx_entry->msg_id = rxr_pkt_msg_id(pkt_entry); + rx_entry->total_len = rxr_pkt_rtm_total_len(pkt_entry); + rx_entry->tag = rxr_pkt_rtm_tag(pkt_entry); + rx_entry->cq_entry.tag = rx_entry->tag; +} + +struct rxr_rx_entry *rxr_pkt_get_rtm_matched_rx_entry(struct rxr_ep *ep, + struct dlist_entry *match, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rx_entry *rx_entry; + + assert(match); + rx_entry = container_of(match, struct rxr_rx_entry, entry); + if (rx_entry->rxr_flags & RXR_MULTI_RECV_POSTED) { + rx_entry = rxr_ep_split_rx_entry(ep, rx_entry, + NULL, pkt_entry); + if (OFI_UNLIKELY(!rx_entry)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "RX entries exhausted.\n"); + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + return NULL; + } + } else { + rxr_pkt_rtm_init_rx_entry(pkt_entry, rx_entry); + } + + rx_entry->state = RXR_RX_MATCHED; + + if (!(rx_entry->fi_flags & FI_MULTI_RECV) || + !rxr_msg_multi_recv_buffer_available(ep, rx_entry->master_entry)) + dlist_remove(match); + + return rx_entry; +} + +static +int rxr_pkt_rtm_match_recv_anyaddr(struct dlist_entry *item, const void *arg) +{ + return 1; +} + +static +int rxr_pkt_rtm_match_recv(struct dlist_entry *item, const void *arg) +{ + const struct rxr_pkt_entry *pkt_entry = arg; + struct rxr_rx_entry *rx_entry; + + rx_entry = container_of(item, struct rxr_rx_entry, entry); + return rxr_match_addr(rx_entry->addr, pkt_entry->addr); +} + +static +int rxr_pkt_rtm_match_trecv_anyaddr(struct dlist_entry *item, const void *arg) +{ + struct rxr_pkt_entry *pkt_entry = (struct rxr_pkt_entry *)arg; + struct rxr_rx_entry *rx_entry; + uint64_t match_tag; + + rx_entry = container_of(item, struct rxr_rx_entry, entry); + match_tag = rxr_pkt_rtm_tag(pkt_entry); + + return rxr_match_tag(rx_entry->cq_entry.tag, rx_entry->ignore, + match_tag); +} + +static +int rxr_pkt_rtm_match_trecv(struct dlist_entry *item, const void *arg) +{ + struct rxr_pkt_entry *pkt_entry = (struct rxr_pkt_entry *)arg; + struct rxr_rx_entry *rx_entry; + uint64_t match_tag; + + rx_entry = container_of(item, struct rxr_rx_entry, entry); + match_tag = rxr_pkt_rtm_tag(pkt_entry); + + return rxr_match_addr(rx_entry->addr, pkt_entry->addr) && + rxr_match_tag(rx_entry->cq_entry.tag, rx_entry->ignore, + match_tag); +} + +static +struct rxr_rx_entry *rxr_pkt_get_msgrtm_rx_entry(struct 
rxr_ep *ep, + struct rxr_pkt_entry **pkt_entry_ptr) +{ + struct rxr_rx_entry *rx_entry; + struct dlist_entry *match; + dlist_func_t *match_func; + int pkt_type; + + if (ep->util_ep.caps & FI_DIRECTED_RECV) + match_func = &rxr_pkt_rtm_match_recv; + else + match_func = &rxr_pkt_rtm_match_recv_anyaddr; + + match = dlist_find_first_match(&ep->rx_list, match_func, + *pkt_entry_ptr); + if (OFI_UNLIKELY(!match)) { + /* + * rxr_ep_alloc_unexp_rx_entry_for_msgrtm() might release pkt_entry, + * thus we have to use pkt_entry_ptr here + */ + rx_entry = rxr_ep_alloc_unexp_rx_entry_for_msgrtm(ep, pkt_entry_ptr); + if (OFI_UNLIKELY(!rx_entry)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "RX entries exhausted.\n"); + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + return NULL; + } + + } else { + rx_entry = rxr_pkt_get_rtm_matched_rx_entry(ep, match, *pkt_entry_ptr); + } + + pkt_type = rxr_get_base_hdr((*pkt_entry_ptr)->pkt)->type; + if (pkt_type == RXR_MEDIUM_MSGRTM_PKT || + pkt_type == RXR_DC_MEDIUM_MSGRTM_PKT) + rxr_pkt_rx_map_insert(ep, *pkt_entry_ptr, rx_entry); + + return rx_entry; +} + +static +struct rxr_rx_entry *rxr_pkt_get_tagrtm_rx_entry(struct rxr_ep *ep, + struct rxr_pkt_entry **pkt_entry_ptr) +{ + struct rxr_rx_entry *rx_entry; + struct dlist_entry *match; + dlist_func_t *match_func; + int pkt_type; + + if (ep->util_ep.caps & FI_DIRECTED_RECV) + match_func = &rxr_pkt_rtm_match_trecv; + else + match_func = &rxr_pkt_rtm_match_trecv_anyaddr; + + match = dlist_find_first_match(&ep->rx_tagged_list, match_func, + *pkt_entry_ptr); + if (OFI_UNLIKELY(!match)) { + /* + * rxr_ep_alloc_unexp_rx_entry_for_tagrtm() might release pkt_entry, + * thus we have to use pkt_entry_ptr here + */ + rx_entry = rxr_ep_alloc_unexp_rx_entry_for_tagrtm(ep, pkt_entry_ptr); + if (OFI_UNLIKELY(!rx_entry)) { + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + return NULL; + } + } else { + rx_entry = rxr_pkt_get_rtm_matched_rx_entry(ep, match, *pkt_entry_ptr); + } + + pkt_type = rxr_get_base_hdr((*pkt_entry_ptr)->pkt)->type; + if (pkt_type == RXR_MEDIUM_TAGRTM_PKT || + pkt_type == RXR_DC_MEDIUM_TAGRTM_PKT) + rxr_pkt_rx_map_insert(ep, *pkt_entry_ptr, rx_entry); + + return rx_entry; +} + +ssize_t rxr_pkt_proc_matched_read_rtm(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_read_rtm_base_hdr *rtm_hdr; + struct fi_rma_iov *read_iov; + + rtm_hdr = rxr_get_read_rtm_base_hdr(pkt_entry->pkt); + read_iov = (struct fi_rma_iov *)((char *)pkt_entry->pkt + rxr_pkt_req_hdr_size(pkt_entry)); + + rx_entry->tx_id = rtm_hdr->tx_id; + rx_entry->rma_iov_count = rtm_hdr->read_iov_count; + memcpy(rx_entry->rma_iov, read_iov, + rx_entry->rma_iov_count * sizeof(struct fi_rma_iov)); + + rxr_pkt_entry_release_rx(ep, pkt_entry); + + /* truncate rx_entry->iov to save memory registration pages because we + * need to do memory registration for the receiving buffer. 
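+	 * Truncating the iov to rx_entry->total_len means only the part of
+	 * the receive buffer that the RDMA read will actually fill gets
+	 * registered.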
+ */ + ofi_truncate_iov(rx_entry->iov, &rx_entry->iov_count, rx_entry->total_len); + return rxr_read_post_remote_read_or_queue(ep, RXR_RX_ENTRY, rx_entry); +} + +ssize_t rxr_pkt_proc_matched_medium_rtm(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_pkt_entry *cur, *nxt; + char *data; + ssize_t ret, err; + size_t offset, hdr_size, data_size; + + ret = 0; + cur = pkt_entry; + while (cur) { + hdr_size = rxr_pkt_req_hdr_size(cur); + data = (char *)cur->pkt + hdr_size; + if (rx_entry->rxr_flags & RXR_DELIVERY_COMPLETE_REQUESTED) + offset = rxr_get_dc_medium_rtm_base_hdr(cur->pkt)->offset; + else + offset = rxr_get_medium_rtm_base_hdr(cur->pkt)->offset; + data_size = cur->pkt_size - hdr_size; + + /* rxr_pkt_copy_to_rx() can release rx_entry, so + * bytes_received must be calculated before it. + */ + rx_entry->bytes_received += data_size; + if (rx_entry->total_len == rx_entry->bytes_received) + rxr_pkt_rx_map_remove(ep, cur, rx_entry); + + /* rxr_pkt_copy_to_rx() will release cur, so + * cur->next must be copied out before it. + */ + nxt = cur->next; + cur->next = NULL; + + err = rxr_pkt_copy_to_rx(ep, rx_entry, offset, cur, data, data_size); + if (err) { + rxr_pkt_entry_release_rx(ep, cur); + ret = err; + } + + cur = nxt; + } + + return ret; +} + +ssize_t rxr_pkt_proc_matched_rtm(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + int pkt_type; + char *data; + size_t hdr_size, data_size; + ssize_t ret; + + assert(rx_entry->state == RXR_RX_MATCHED); + + /* Adjust rx_entry->cq_entry.len as needed. + * Initialy rx_entry->cq_entry.len is total recv buffer size. + * rx_entry->total_len is from REQ packet and is total send buffer size. + * if send buffer size < recv buffer size, we adjust value of rx_entry->cq_entry.len + * if send buffer size > recv buffer size, we have a truncated message and will + * write error CQ entry. 
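+	 *
+	 * For example (sizes illustrative): a 4096-byte receive buffer
+	 * matched with a 1024-byte send completes with cq_entry.len == 1024,
+	 * while a 512-byte receive buffer matched with that send is reported
+	 * as a truncated message.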
+ */ + if (rx_entry->cq_entry.len > rx_entry->total_len) + rx_entry->cq_entry.len = rx_entry->total_len; + + pkt_type = rxr_get_base_hdr(pkt_entry->pkt)->type; + + if (pkt_type > RXR_DC_REQ_PKT_BEGIN && + pkt_type < RXR_DC_REQ_PKT_END) + rx_entry->rxr_flags |= RXR_DELIVERY_COMPLETE_REQUESTED; + + if (pkt_type == RXR_LONG_MSGRTM_PKT || + pkt_type == RXR_LONG_TAGRTM_PKT) + rx_entry->tx_id = rxr_get_long_rtm_base_hdr(pkt_entry->pkt)->tx_id; + else if (pkt_type == RXR_DC_EAGER_MSGRTM_PKT || + pkt_type == RXR_DC_EAGER_TAGRTM_PKT) + rx_entry->tx_id = rxr_get_dc_eager_rtm_base_hdr(pkt_entry->pkt)->tx_id; + else if (pkt_type == RXR_DC_MEDIUM_MSGRTM_PKT || + pkt_type == RXR_DC_MEDIUM_TAGRTM_PKT) + rx_entry->tx_id = rxr_get_dc_medium_rtm_base_hdr(pkt_entry->pkt)->tx_id; + else if (pkt_type == RXR_DC_LONG_MSGRTM_PKT || + pkt_type == RXR_DC_LONG_TAGRTM_PKT) + rx_entry->tx_id = rxr_get_long_rtm_base_hdr(pkt_entry->pkt)->tx_id; + + rx_entry->msg_id = rxr_get_rtm_base_hdr(pkt_entry->pkt)->msg_id; + + if (pkt_type == RXR_READ_MSGRTM_PKT || pkt_type == RXR_READ_TAGRTM_PKT) + return rxr_pkt_proc_matched_read_rtm(ep, rx_entry, pkt_entry); + + if (pkt_type == RXR_MEDIUM_MSGRTM_PKT || + pkt_type == RXR_MEDIUM_TAGRTM_PKT || + pkt_type == RXR_DC_MEDIUM_MSGRTM_PKT || + pkt_type == RXR_DC_MEDIUM_TAGRTM_PKT) + return rxr_pkt_proc_matched_medium_rtm(ep, rx_entry, pkt_entry); + + hdr_size = rxr_pkt_req_hdr_size(pkt_entry); + data = (char *)pkt_entry->pkt + hdr_size; + data_size = pkt_entry->pkt_size - hdr_size; + + rx_entry->bytes_received += data_size; + ret = rxr_pkt_copy_to_rx(ep, rx_entry, 0, pkt_entry, data, data_size); + if (ret) { + rxr_pkt_entry_release_rx(ep, pkt_entry); + return ret; + } + + if (pkt_type == RXR_EAGER_MSGRTM_PKT || + pkt_type == RXR_EAGER_TAGRTM_PKT || + pkt_type == RXR_DC_EAGER_MSGRTM_PKT || + pkt_type == RXR_DC_EAGER_TAGRTM_PKT) { + ret = 0; + } else { + /* + * long message protocol + */ +#if ENABLE_DEBUG + dlist_insert_tail(&rx_entry->rx_pending_entry, &ep->rx_pending_list); + ep->rx_pending++; +#endif + rx_entry->state = RXR_RX_RECV; + /* we have noticed using the default value achieve better bandwidth */ + rx_entry->credit_request = rxr_env.tx_min_credits; + ret = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_CTS_PKT, 0); + } + + return ret; +} + +ssize_t rxr_pkt_proc_msgrtm(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + ssize_t err; + struct rxr_rx_entry *rx_entry; + + rx_entry = rxr_pkt_get_msgrtm_rx_entry(ep, &pkt_entry); + if (OFI_UNLIKELY(!rx_entry)) { + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return -FI_ENOBUFS; + } + + if (rx_entry->state == RXR_RX_MATCHED) { + err = rxr_pkt_proc_matched_rtm(ep, rx_entry, pkt_entry); + if (OFI_UNLIKELY(err)) { + if (rxr_cq_handle_rx_error(ep, rx_entry, err)) { + assert(0 && "cannot write cq error entry"); + efa_eq_write_error(&ep->util_ep, -err, err); + } + rxr_pkt_entry_release_rx(ep, pkt_entry); + rxr_release_rx_entry(ep, rx_entry); + return err; + } + } + + return 0; +} + +ssize_t rxr_pkt_proc_tagrtm(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + ssize_t err; + struct rxr_rx_entry *rx_entry; + + rx_entry = rxr_pkt_get_tagrtm_rx_entry(ep, &pkt_entry); + if (OFI_UNLIKELY(!rx_entry)) { + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return -FI_ENOBUFS; + } + + if (rx_entry->state == RXR_RX_MATCHED) { + err = rxr_pkt_proc_matched_rtm(ep, rx_entry, pkt_entry); + if (OFI_UNLIKELY(err)) { + if 
(rxr_cq_handle_rx_error(ep, rx_entry, err)) { + assert(0 && "cannot write error cq entry"); + efa_eq_write_error(&ep->util_ep, -err, err); + } + rxr_pkt_entry_release_rx(ep, pkt_entry); + rxr_release_rx_entry(ep, rx_entry); + return err; + } + } + + return 0; +} + +/* + * proc() functions called by rxr_pkt_handle_recv_completion() + */ +ssize_t rxr_pkt_proc_rtm_rta(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_base_hdr *base_hdr; + + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + assert(base_hdr->type >= RXR_BASELINE_REQ_PKT_BEGIN); + + switch (base_hdr->type) { + case RXR_EAGER_MSGRTM_PKT: + case RXR_MEDIUM_MSGRTM_PKT: + case RXR_LONG_MSGRTM_PKT: + case RXR_READ_MSGRTM_PKT: + case RXR_DC_EAGER_MSGRTM_PKT: + case RXR_DC_MEDIUM_MSGRTM_PKT: + case RXR_DC_LONG_MSGRTM_PKT: + return rxr_pkt_proc_msgrtm(ep, pkt_entry); + case RXR_EAGER_TAGRTM_PKT: + case RXR_MEDIUM_TAGRTM_PKT: + case RXR_LONG_TAGRTM_PKT: + case RXR_READ_TAGRTM_PKT: + case RXR_DC_EAGER_TAGRTM_PKT: + case RXR_DC_MEDIUM_TAGRTM_PKT: + case RXR_DC_LONG_TAGRTM_PKT: + return rxr_pkt_proc_tagrtm(ep, pkt_entry); + case RXR_WRITE_RTA_PKT: + return rxr_pkt_proc_write_rta(ep, pkt_entry); + case RXR_DC_WRITE_RTA_PKT: + return rxr_pkt_proc_dc_write_rta(ep, pkt_entry); + case RXR_FETCH_RTA_PKT: + return rxr_pkt_proc_fetch_rta(ep, pkt_entry); + case RXR_COMPARE_RTA_PKT: + return rxr_pkt_proc_compare_rta(ep, pkt_entry); + default: + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, + "Unknown packet type ID: %d\n", + base_hdr->type); + if (rxr_cq_handle_cq_error(ep, -FI_EINVAL)) + assert(0 && "failed to write err cq entry"); + } + + return -FI_EINVAL; +} + +void rxr_pkt_handle_zcpy_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rx_entry *rx_entry; + + struct rxr_base_hdr *base_hdr __attribute__((unused)); + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + assert(base_hdr->type >= RXR_BASELINE_REQ_PKT_BEGIN); + assert(base_hdr->type != RXR_MEDIUM_MSGRTM_PKT); + assert(base_hdr->type != RXR_MEDIUM_TAGRTM_PKT); + assert(base_hdr->type != RXR_DC_MEDIUM_MSGRTM_PKT); + assert(base_hdr->type != RXR_DC_MEDIUM_MSGRTM_PKT); + assert(pkt_entry->type == RXR_PKT_ENTRY_USER); + + rx_entry = rxr_pkt_get_msgrtm_rx_entry(ep, &pkt_entry); + if (OFI_UNLIKELY(!rx_entry)) { + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; + } + pkt_entry->x_entry = rx_entry; + if (rx_entry->state != RXR_RX_MATCHED) + return; + + /* + * The incoming receive will always get matched to the first posted + * rx_entry available, so this is a constant cost. No real tag or + * address matching happens. + */ + assert(rx_entry->state == RXR_RX_MATCHED); + + /* + * Adjust rx_entry->cq_entry.len as needed. + * Initialy rx_entry->cq_entry.len is total recv buffer size. + * rx_entry->total_len is from REQ packet and is total send buffer size. + * if send buffer size < recv buffer size, we adjust value of rx_entry->cq_entry.len + * if send buffer size > recv buffer size, we have a truncated message and will + * write error CQ entry. 
+ */ + if (rx_entry->cq_entry.len > rx_entry->total_len) + rx_entry->cq_entry.len = rx_entry->total_len; + + rxr_cq_write_rx_completion(ep, rx_entry); + rxr_pkt_entry_release_rx(ep, pkt_entry); + rxr_release_rx_entry(ep, rx_entry); +} + +void rxr_pkt_handle_rtm_rta_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_base_hdr *base_hdr; + struct rxr_peer *peer; + bool need_ordering; + int ret, msg_id; + + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + assert(base_hdr->type >= RXR_BASELINE_REQ_PKT_BEGIN); + + if (base_hdr->type == RXR_MEDIUM_MSGRTM_PKT || + base_hdr->type == RXR_MEDIUM_TAGRTM_PKT || + base_hdr->type == RXR_DC_MEDIUM_MSGRTM_PKT || + base_hdr->type == RXR_DC_MEDIUM_TAGRTM_PKT) { + struct rxr_rx_entry *rx_entry; + struct rxr_pkt_entry *unexp_pkt_entry; + + rx_entry = rxr_pkt_rx_map_lookup(ep, pkt_entry); + if (rx_entry) { + if (rx_entry->state == RXR_RX_MATCHED) { + rxr_pkt_proc_matched_medium_rtm(ep, rx_entry, pkt_entry); + } else { + assert(rx_entry->unexp_pkt); + unexp_pkt_entry = rxr_pkt_get_unexp(ep, &pkt_entry); + rxr_pkt_entry_append(rx_entry->unexp_pkt, unexp_pkt_entry); + } + + return; + } + } + + need_ordering = false; + peer = rxr_ep_get_peer(ep, pkt_entry->addr); + + if (!peer->is_local) { + /* + * only need to reorder msg for efa_ep + */ + base_hdr = (struct rxr_base_hdr *)pkt_entry->pkt; + if ((base_hdr->flags & RXR_REQ_MSG) && rxr_need_sas_ordering(ep)) + need_ordering = true; + else if (base_hdr->flags & RXR_REQ_ATOMIC) + need_ordering = true; + } + + if (!need_ordering) { + /* rxr_pkt_proc_rtm will write error cq entry if needed */ + rxr_pkt_proc_rtm_rta(ep, pkt_entry); + return; + } + + msg_id = rxr_pkt_msg_id(pkt_entry); + ret = rxr_cq_reorder_msg(ep, peer, pkt_entry); + if (ret == 1) { + /* Packet was queued */ + return; + } + + if (OFI_UNLIKELY(ret == -FI_EALREADY)) { + /* Packet with same msg_id has been processed before */ + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, + "Invalid msg_id: %" PRIu32 + " robuf->exp_msg_id: %" PRIu32 "\n", + msg_id, peer->robuf->exp_msg_id); + efa_eq_write_error(&ep->util_ep, FI_EIO, ret); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; + } + + if (OFI_UNLIKELY(ret == -FI_ENOMEM)) { + /* running out of memory while copy packet */ + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + return; + } + + if (OFI_UNLIKELY(ret < 0)) { + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, + "Unknown error %d processing REQ packet msg_id: %" + PRIu32 "\n", ret, msg_id); + efa_eq_write_error(&ep->util_ep, FI_EIO, ret); + return; + } + + + /* + * rxr_pkt_proc_rtm_rta() will write error cq entry if needed, + * thus we do not write error cq entry + */ + ret = rxr_pkt_proc_rtm_rta(ep, pkt_entry); + if (OFI_UNLIKELY(ret)) + return; + + ofi_recvwin_slide(peer->robuf); + /* process pending items in reorder buff */ + rxr_cq_proc_pending_items_in_recvwin(ep, peer); +} + +/* + * RTW pakcet type functions + */ +void rxr_pkt_init_rtw_data(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry, + struct fi_rma_iov *rma_iov) +{ + char *data; + size_t hdr_size; + size_t data_size; + int i; + + for (i = 0; i < tx_entry->rma_iov_count; ++i) { + rma_iov[i].addr = tx_entry->rma_iov[i].addr; + rma_iov[i].len = tx_entry->rma_iov[i].len; + rma_iov[i].key = tx_entry->rma_iov[i].key; + } + + hdr_size = rxr_pkt_req_hdr_size(pkt_entry); + data = (char *)pkt_entry->pkt + hdr_size; + data_size = ofi_copy_from_iov(data, ep->mtu_size - hdr_size, + tx_entry->iov, tx_entry->iov_count, 0); + + pkt_entry->pkt_size = hdr_size + 
data_size; + pkt_entry->x_entry = tx_entry; +} + +ssize_t rxr_pkt_init_eager_rtw(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_eager_rtw_hdr *rtw_hdr; + + assert(tx_entry->op == ofi_op_write); + + rtw_hdr = (struct rxr_eager_rtw_hdr *)pkt_entry->pkt; + rtw_hdr->rma_iov_count = tx_entry->rma_iov_count; + rxr_pkt_init_req_hdr(ep, tx_entry, RXR_EAGER_RTW_PKT, pkt_entry); + rxr_pkt_init_rtw_data(ep, tx_entry, pkt_entry, rtw_hdr->rma_iov); + return 0; +} + +ssize_t rxr_pkt_init_dc_eager_rtw(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_dc_eager_rtw_hdr *dc_eager_rtw_hdr; + + assert(tx_entry->op == ofi_op_write); + + dc_eager_rtw_hdr = (struct rxr_dc_eager_rtw_hdr *)pkt_entry->pkt; + dc_eager_rtw_hdr->rma_iov_count = tx_entry->rma_iov_count; + rxr_pkt_init_req_hdr(ep, tx_entry, RXR_DC_EAGER_RTW_PKT, pkt_entry); + rxr_pkt_init_rtw_data(ep, tx_entry, pkt_entry, + (struct fi_rma_iov *)dc_eager_rtw_hdr->rma_iov); + dc_eager_rtw_hdr->tx_id = tx_entry->tx_id; + return 0; +} + +static inline void rxr_pkt_init_long_rtw_hdr(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry, + int pkt_type) +{ + struct rxr_long_rtw_hdr *rtw_hdr; + + rtw_hdr = (struct rxr_long_rtw_hdr *)pkt_entry->pkt; + rtw_hdr->rma_iov_count = tx_entry->rma_iov_count; + rtw_hdr->data_len = tx_entry->total_len; + rtw_hdr->tx_id = tx_entry->tx_id; + rtw_hdr->credit_request = tx_entry->credit_request; + rxr_pkt_init_req_hdr(ep, tx_entry, pkt_type, pkt_entry); +} + +ssize_t rxr_pkt_init_long_rtw(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_long_rtw_hdr *rtw_hdr; + + assert(tx_entry->op == ofi_op_write); + + rtw_hdr = (struct rxr_long_rtw_hdr *)pkt_entry->pkt; + rxr_pkt_init_long_rtw_hdr(ep, tx_entry, pkt_entry, RXR_LONG_RTW_PKT); + rxr_pkt_init_rtw_data(ep, tx_entry, pkt_entry, rtw_hdr->rma_iov); + return 0; +} + +ssize_t rxr_pkt_init_dc_long_rtw(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_long_rtw_hdr *rtw_hdr; + + assert(tx_entry->op == ofi_op_write); + + rtw_hdr = (struct rxr_long_rtw_hdr *)pkt_entry->pkt; + rxr_pkt_init_long_rtw_hdr(ep, tx_entry, pkt_entry, RXR_DC_LONG_RTW_PKT); + rxr_pkt_init_rtw_data(ep, tx_entry, pkt_entry, rtw_hdr->rma_iov); + + return 0; +} + +ssize_t rxr_pkt_init_read_rtw(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_read_rtw_hdr *rtw_hdr; + struct fi_rma_iov *rma_iov, *read_iov; + size_t hdr_size; + int i, err; + + assert(tx_entry->op == ofi_op_write); + + rtw_hdr = (struct rxr_read_rtw_hdr *)pkt_entry->pkt; + rtw_hdr->rma_iov_count = tx_entry->rma_iov_count; + rtw_hdr->data_len = tx_entry->total_len; + rtw_hdr->tx_id = tx_entry->tx_id; + rtw_hdr->read_iov_count = tx_entry->iov_count; + rxr_pkt_init_req_hdr(ep, tx_entry, RXR_READ_RTW_PKT, pkt_entry); + + rma_iov = rtw_hdr->rma_iov; + for (i = 0; i < tx_entry->rma_iov_count; ++i) { + rma_iov[i].addr = tx_entry->rma_iov[i].addr; + rma_iov[i].len = tx_entry->rma_iov[i].len; + rma_iov[i].key = tx_entry->rma_iov[i].key; + } + + hdr_size = rxr_pkt_req_hdr_size(pkt_entry); + read_iov = (struct fi_rma_iov *)((char *)pkt_entry->pkt + hdr_size); + err = rxr_read_init_iov(ep, tx_entry, read_iov); + if (OFI_UNLIKELY(err)) + return err; + + pkt_entry->pkt_size = hdr_size + tx_entry->iov_count * sizeof(struct fi_rma_iov); + return 0; +} + +/* + * handle_sent() 
functions for RTW packet types + * + * rxr_pkt_handle_long_rtw_sent() is empty and is defined in rxr_pkt_type_req.h + */ +void rxr_pkt_handle_long_rtw_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + struct efa_domain *efa_domain; + struct rxr_domain *rxr_domain = rxr_ep_domain(ep); + + efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain, + util_domain.domain_fid); + + tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; + tx_entry->bytes_sent += rxr_pkt_req_data_size(pkt_entry); + assert(tx_entry->bytes_sent < tx_entry->total_len); + if (tx_entry->desc[0] || efa_is_cache_available(efa_domain)) + rxr_prepare_desc_send(rxr_ep_domain(ep), tx_entry); +} + +/* + * handle_send_completion() functions + */ +void rxr_pkt_handle_eager_rtw_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + + tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; + assert(tx_entry->total_len == rxr_pkt_req_data_size(pkt_entry)); + rxr_cq_handle_tx_completion(ep, tx_entry); +} + +void rxr_pkt_handle_long_rtw_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + + tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; + tx_entry->bytes_acked += rxr_pkt_req_data_size(pkt_entry); + if (tx_entry->total_len == tx_entry->bytes_acked) + rxr_cq_handle_tx_completion(ep, tx_entry); +} + +void rxr_pkt_handle_dc_long_rtw_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + + tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; + tx_entry->bytes_acked += rxr_pkt_req_data_size(pkt_entry); + if (tx_entry->total_len == tx_entry->bytes_acked && + tx_entry->rxr_flags & RXR_RECEIPT_RECEIVED) + rxr_cq_handle_tx_completion(ep, tx_entry); +} + +/* + * handle_recv() functions + */ + +static +struct rxr_rx_entry *rxr_pkt_alloc_rtw_rx_entry(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rx_entry *rx_entry; + struct rxr_base_hdr *base_hdr; + struct fi_msg msg = {0}; + + msg.addr = pkt_entry->addr; + rx_entry = rxr_ep_get_rx_entry(ep, &msg, 0, ~0, ofi_op_write, 0); + if (OFI_UNLIKELY(!rx_entry)) + return NULL; + + base_hdr = rxr_get_base_hdr(pkt_entry->pkt); + if (base_hdr->flags & RXR_REQ_OPT_CQ_DATA_HDR) { + rx_entry->rxr_flags |= RXR_REMOTE_CQ_DATA; + rx_entry->cq_entry.flags |= FI_REMOTE_CQ_DATA; + rx_entry->cq_entry.data = rxr_pkt_req_cq_data(pkt_entry); + } + + rx_entry->addr = pkt_entry->addr; + rx_entry->bytes_received = 0; + rx_entry->bytes_copied = 0; + return rx_entry; +} + +void rxr_pkt_proc_eager_rtw(struct rxr_ep *ep, + struct fi_rma_iov *rma_iov, + size_t rma_iov_count, + struct rxr_rx_entry *rx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + ssize_t err; + char *data; + size_t data_size, hdr_size; + + err = rxr_rma_verified_copy_iov(ep, rma_iov, rma_iov_count, + FI_REMOTE_WRITE, rx_entry->iov, rx_entry->desc); + + if (OFI_UNLIKELY(err)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "RMA address verify failed!\n"); + efa_eq_write_error(&ep->util_ep, FI_EIO, err); + rxr_release_rx_entry(ep, rx_entry); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; + } + + rx_entry->cq_entry.flags |= (FI_RMA | FI_WRITE); + rx_entry->cq_entry.len = ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count); + rx_entry->cq_entry.buf = rx_entry->iov[0].iov_base; + rx_entry->total_len = rx_entry->cq_entry.len; + + hdr_size = rxr_pkt_req_hdr_size(pkt_entry); + data = (char *)pkt_entry->pkt + hdr_size; + data_size = 
pkt_entry->pkt_size - hdr_size; + + rx_entry->bytes_received += data_size; + if (data_size != rx_entry->total_len) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "Eager RTM size mismatch! data_size: %ld total_len: %ld.", + data_size, rx_entry->total_len); + FI_WARN(&rxr_prov, FI_LOG_CQ, "target buffer: %p length: %ld", rx_entry->iov[0].iov_base, + rx_entry->iov[0].iov_len); + err = FI_EINVAL; + } else { + err = rxr_pkt_copy_to_rx(ep, rx_entry, 0, pkt_entry, data, data_size); + } + + if (err) { + efa_eq_write_error(&ep->util_ep, err, -err); + rxr_pkt_entry_release_rx(ep, pkt_entry); + rxr_release_rx_entry(ep, rx_entry); + } +} + +void rxr_pkt_handle_eager_rtw_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rx_entry *rx_entry; + struct rxr_eager_rtw_hdr *rtw_hdr; + + rx_entry = rxr_pkt_alloc_rtw_rx_entry(ep, pkt_entry); + + if (!rx_entry) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "RX entries exhausted.\n"); + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; + } + + rtw_hdr = (struct rxr_eager_rtw_hdr *)pkt_entry->pkt; + rx_entry->iov_count = rtw_hdr->rma_iov_count; + rxr_pkt_proc_eager_rtw(ep, + rtw_hdr->rma_iov, + rtw_hdr->rma_iov_count, + rx_entry, pkt_entry); +} + +void rxr_pkt_handle_dc_eager_rtw_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rx_entry *rx_entry; + struct rxr_dc_eager_rtw_hdr *rtw_hdr; + + rx_entry = rxr_pkt_alloc_rtw_rx_entry(ep, pkt_entry); + if (!rx_entry) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "RX entries exhausted.\n"); + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; + } + + rx_entry->rxr_flags |= RXR_DELIVERY_COMPLETE_REQUESTED; + rtw_hdr = (struct rxr_dc_eager_rtw_hdr *)pkt_entry->pkt; + rx_entry->tx_id = rtw_hdr->tx_id; + rx_entry->iov_count = rtw_hdr->rma_iov_count; + rxr_pkt_proc_eager_rtw(ep, + (struct fi_rma_iov *)rtw_hdr->rma_iov, + rtw_hdr->rma_iov_count, + rx_entry, pkt_entry); +} + +void rxr_pkt_handle_long_rtw_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rx_entry *rx_entry; + struct rxr_long_rtw_hdr *rtw_hdr; + char *data; + size_t hdr_size, data_size; + ssize_t err; + uint32_t tx_id; + + rx_entry = rxr_pkt_alloc_rtw_rx_entry(ep, pkt_entry); + if (!rx_entry) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "RX entries exhausted.\n"); + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; + } + + rtw_hdr = (struct rxr_long_rtw_hdr *)pkt_entry->pkt; + tx_id = rtw_hdr->tx_id; + if (rtw_hdr->type == RXR_DC_LONG_RTW_PKT) + rx_entry->rxr_flags |= RXR_DELIVERY_COMPLETE_REQUESTED; + + rx_entry->iov_count = rtw_hdr->rma_iov_count; + err = rxr_rma_verified_copy_iov(ep, rtw_hdr->rma_iov, rtw_hdr->rma_iov_count, + FI_REMOTE_WRITE, rx_entry->iov, rx_entry->desc); + if (OFI_UNLIKELY(err)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "RMA address verify failed!\n"); + efa_eq_write_error(&ep->util_ep, FI_EIO, err); + rxr_release_rx_entry(ep, rx_entry); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; + } + + rx_entry->cq_entry.flags |= (FI_RMA | FI_WRITE); + rx_entry->cq_entry.len = ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count); + rx_entry->cq_entry.buf = rx_entry->iov[0].iov_base; + rx_entry->total_len = rx_entry->cq_entry.len; + + hdr_size = rxr_pkt_req_hdr_size(pkt_entry); + data = (char *)pkt_entry->pkt + hdr_size; + data_size = pkt_entry->pkt_size - hdr_size; + + rx_entry->bytes_received += data_size; + if (data_size >= 
rx_entry->total_len) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "Long RTW size mismatch! pkt_data_size: %ld total_len: %ld\n", + data_size, rx_entry->total_len); + FI_WARN(&rxr_prov, FI_LOG_CQ, "target buffer: %p length: %ld\n", rx_entry->iov[0].iov_base, + rx_entry->iov[0].iov_len); + err = FI_EINVAL; + } else { + err = rxr_pkt_copy_to_rx(ep, rx_entry, 0, pkt_entry, data, data_size); + } + + if (err) { + efa_eq_write_error(&ep->util_ep, FI_EINVAL, -FI_EINVAL); + rxr_release_rx_entry(ep, rx_entry); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; + } + +#if ENABLE_DEBUG + dlist_insert_tail(&rx_entry->rx_pending_entry, &ep->rx_pending_list); + ep->rx_pending++; +#endif + rx_entry->state = RXR_RX_RECV; + rx_entry->tx_id = tx_id; + rx_entry->credit_request = rxr_env.tx_min_credits; + err = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_CTS_PKT, 0); + if (OFI_UNLIKELY(err)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "Cannot post CTS packet\n"); + rxr_cq_handle_rx_error(ep, rx_entry, err); + rxr_release_rx_entry(ep, rx_entry); + } +} + +void rxr_pkt_handle_read_rtw_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rx_entry *rx_entry; + struct rxr_read_rtw_hdr *rtw_hdr; + struct fi_rma_iov *read_iov; + size_t hdr_size; + ssize_t err; + + rx_entry = rxr_pkt_alloc_rtw_rx_entry(ep, pkt_entry); + if (!rx_entry) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "RX entries exhausted.\n"); + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; + } + + rtw_hdr = (struct rxr_read_rtw_hdr *)pkt_entry->pkt; + rx_entry->iov_count = rtw_hdr->rma_iov_count; + err = rxr_rma_verified_copy_iov(ep, rtw_hdr->rma_iov, rtw_hdr->rma_iov_count, + FI_REMOTE_WRITE, rx_entry->iov, rx_entry->desc); + if (OFI_UNLIKELY(err)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "RMA address verify failed!\n"); + efa_eq_write_error(&ep->util_ep, FI_EINVAL, -FI_EINVAL); + rxr_release_rx_entry(ep, rx_entry); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; + } + + rx_entry->cq_entry.flags |= (FI_RMA | FI_WRITE); + rx_entry->cq_entry.len = ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count); + rx_entry->cq_entry.buf = rx_entry->iov[0].iov_base; + rx_entry->total_len = rx_entry->cq_entry.len; + + hdr_size = rxr_pkt_req_hdr_size(pkt_entry); + read_iov = (struct fi_rma_iov *)((char *)pkt_entry->pkt + hdr_size); + rx_entry->addr = pkt_entry->addr; + rx_entry->tx_id = rtw_hdr->tx_id; + rx_entry->rma_iov_count = rtw_hdr->read_iov_count; + memcpy(rx_entry->rma_iov, read_iov, + rx_entry->rma_iov_count * sizeof(struct fi_rma_iov)); + + rxr_pkt_entry_release_rx(ep, pkt_entry); + err = rxr_read_post_remote_read_or_queue(ep, RXR_RX_ENTRY, rx_entry); + if (OFI_UNLIKELY(err)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "RDMA post read or queue failed.\n"); + efa_eq_write_error(&ep->util_ep, err, err); + /* pkt_entry was already released above */ + rxr_release_rx_entry(ep, rx_entry); + } +} + +/* + * RTR packet functions + * init() functions for RTR packets + */ +void rxr_pkt_init_rtr(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + int pkt_type, int window, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rtr_hdr *rtr_hdr; + int i; + + assert(tx_entry->op == ofi_op_read_req); + rtr_hdr = (struct rxr_rtr_hdr *)pkt_entry->pkt; + rtr_hdr->rma_iov_count = tx_entry->rma_iov_count; + rxr_pkt_init_req_hdr(ep, tx_entry, pkt_type, pkt_entry); + rtr_hdr->data_len = tx_entry->total_len; + rtr_hdr->read_req_rx_id = tx_entry->rma_loc_rx_id; + rtr_hdr->read_req_window = window; + for (i = 0; i < 
tx_entry->rma_iov_count; ++i) { + rtr_hdr->rma_iov[i].addr = tx_entry->rma_iov[i].addr; + rtr_hdr->rma_iov[i].len = tx_entry->rma_iov[i].len; + rtr_hdr->rma_iov[i].key = tx_entry->rma_iov[i].key; + } + + pkt_entry->pkt_size = rxr_pkt_req_hdr_size(pkt_entry); + pkt_entry->x_entry = tx_entry; +} + +ssize_t rxr_pkt_init_short_rtr(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + rxr_pkt_init_rtr(ep, tx_entry, RXR_SHORT_RTR_PKT, tx_entry->total_len, pkt_entry); + return 0; +} + +ssize_t rxr_pkt_init_long_rtr(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + rxr_pkt_init_rtr(ep, tx_entry, RXR_LONG_RTR_PKT, tx_entry->rma_window, pkt_entry); + return 0; +} + +/* + * handle_sent() functions for RTR packet types + */ +void rxr_pkt_handle_rtr_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + + tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; + tx_entry->bytes_sent = 0; + tx_entry->state = RXR_TX_WAIT_READ_FINISH; +} + +/* + * handle_send_completion() function for RTR packet + */ +void rxr_pkt_handle_rtr_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + /* + * Unlike other protocols, for emulated read the tx_entry + * is released in rxr_cq_handle_rx_completion(), + * therefore there is nothing to be done here. + */ + return; +} + +/* + * handle_recv() functions for RTR packet + */ +void rxr_pkt_handle_rtr_recv(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rtr_hdr *rtr_hdr; + struct rxr_rx_entry *rx_entry; + struct rxr_tx_entry *tx_entry; + ssize_t err; + struct fi_msg msg = {0}; + + msg.addr = pkt_entry->addr; + rx_entry = rxr_ep_get_rx_entry(ep, &msg, 0, ~0, ofi_op_read_rsp, 0); + if (OFI_UNLIKELY(!rx_entry)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "RX entries exhausted.\n"); + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; + } + + rx_entry->addr = pkt_entry->addr; + rx_entry->bytes_received = 0; + rx_entry->bytes_copied = 0; + rx_entry->cq_entry.flags |= (FI_RMA | FI_READ); + rx_entry->cq_entry.len = ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count); + rx_entry->cq_entry.buf = rx_entry->iov[0].iov_base; + rx_entry->total_len = rx_entry->cq_entry.len; + + rtr_hdr = (struct rxr_rtr_hdr *)pkt_entry->pkt; + rx_entry->rma_initiator_rx_id = rtr_hdr->read_req_rx_id; + rx_entry->window = rtr_hdr->read_req_window; + rx_entry->iov_count = rtr_hdr->rma_iov_count; + err = rxr_rma_verified_copy_iov(ep, rtr_hdr->rma_iov, rtr_hdr->rma_iov_count, + FI_REMOTE_READ, rx_entry->iov, rx_entry->desc); + if (OFI_UNLIKELY(err)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "RMA address verification failed!\n"); + efa_eq_write_error(&ep->util_ep, FI_EINVAL, -FI_EINVAL); + rxr_release_rx_entry(ep, rx_entry); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; + } + + tx_entry = rxr_rma_alloc_readrsp_tx_entry(ep, rx_entry); + if (OFI_UNLIKELY(!tx_entry)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "Readrsp tx entry exhausted!\n"); + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + rxr_release_rx_entry(ep, rx_entry); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; + } + + err = rxr_pkt_post_ctrl_or_queue(ep, RXR_TX_ENTRY, tx_entry, RXR_READRSP_PKT, 0); + if (OFI_UNLIKELY(err)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, "Posting of readrsp packet failed! 
err=%ld\n", err); + efa_eq_write_error(&ep->util_ep, FI_EIO, err); + rxr_release_tx_entry(ep, tx_entry); + rxr_release_rx_entry(ep, rx_entry); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return; + } + + rx_entry->state = RXR_RX_WAIT_READ_FINISH; + rxr_pkt_entry_release_rx(ep, pkt_entry); +} + +/* + * RTA packet functions + */ +ssize_t rxr_pkt_init_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, + int pkt_type, struct rxr_pkt_entry *pkt_entry) +{ + struct fi_rma_iov *rma_iov; + struct rxr_rta_hdr *rta_hdr; + char *data; + size_t hdr_size, data_size; + int i; + + rta_hdr = (struct rxr_rta_hdr *)pkt_entry->pkt; + rta_hdr->msg_id = tx_entry->msg_id; + rta_hdr->rma_iov_count = tx_entry->rma_iov_count; + rta_hdr->atomic_datatype = tx_entry->atomic_hdr.datatype; + rta_hdr->atomic_op = tx_entry->atomic_hdr.atomic_op; + rta_hdr->tx_id = tx_entry->tx_id; + rxr_pkt_init_req_hdr(ep, tx_entry, pkt_type, pkt_entry); + rta_hdr->flags |= RXR_REQ_ATOMIC; + rma_iov = rta_hdr->rma_iov; + for (i=0; i < tx_entry->rma_iov_count; ++i) { + rma_iov[i].addr = tx_entry->rma_iov[i].addr; + rma_iov[i].len = tx_entry->rma_iov[i].len; + rma_iov[i].key = tx_entry->rma_iov[i].key; + } + + hdr_size = rxr_pkt_req_hdr_size(pkt_entry); + data = (char *)pkt_entry->pkt + hdr_size; + data_size = ofi_copy_from_iov(data, ep->mtu_size - hdr_size, + tx_entry->iov, tx_entry->iov_count, 0); + + pkt_entry->pkt_size = hdr_size + data_size; + pkt_entry->x_entry = tx_entry; + return 0; +} + +ssize_t rxr_pkt_init_write_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + rxr_pkt_init_rta(ep, tx_entry, RXR_WRITE_RTA_PKT, pkt_entry); + return 0; +} + +ssize_t rxr_pkt_init_dc_write_rta(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + rxr_pkt_init_rta(ep, tx_entry, RXR_DC_WRITE_RTA_PKT, pkt_entry); + return 0; +} + +ssize_t rxr_pkt_init_fetch_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + rxr_pkt_init_rta(ep, tx_entry, RXR_FETCH_RTA_PKT, pkt_entry); + return 0; +} + +ssize_t rxr_pkt_init_compare_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry) +{ + char *data; + size_t data_size; + + rxr_pkt_init_rta(ep, tx_entry, RXR_COMPARE_RTA_PKT, pkt_entry); + + /* rxr_pkt_init_rta() will copy data from tx_entry->iov to pkt entry + * the following append the data to be compared + */ + data = (char *)pkt_entry->pkt + pkt_entry->pkt_size; + data_size = ofi_copy_from_iov(data, ep->mtu_size - pkt_entry->pkt_size, + tx_entry->atomic_ex.comp_iov, + tx_entry->atomic_ex.comp_iov_count, 0); + assert(data_size == tx_entry->total_len); + pkt_entry->pkt_size += data_size; + return 0; +} + +void rxr_pkt_handle_write_rta_send_completion(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_tx_entry *tx_entry; + + tx_entry = (struct rxr_tx_entry *)pkt_entry->x_entry; + rxr_cq_handle_tx_completion(ep, tx_entry); +} + +int rxr_pkt_proc_write_rta(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry) +{ + struct iovec iov[RXR_IOV_LIMIT]; + struct rxr_rta_hdr *rta_hdr; + void *desc[RXR_IOV_LIMIT]; + char *data; + int iov_count, op, dt, i; + size_t dtsize, offset, hdr_size; + + rta_hdr = (struct rxr_rta_hdr *)pkt_entry->pkt; + op = rta_hdr->atomic_op; + dt = rta_hdr->atomic_datatype; + dtsize = ofi_datatype_size(dt); + + hdr_size = rxr_pkt_req_hdr_size(pkt_entry); + data = (char *)pkt_entry->pkt + hdr_size; + iov_count = rta_hdr->rma_iov_count; + rxr_rma_verified_copy_iov(ep, 
rta_hdr->rma_iov, iov_count, FI_REMOTE_WRITE, iov, desc); + + offset = 0; + for (i = 0; i < iov_count; ++i) { + ofi_atomic_write_handlers[op][dt](iov[i].iov_base, + data + offset, + iov[i].iov_len / dtsize); + offset += iov[i].iov_len; + } + + rxr_pkt_entry_release_rx(ep, pkt_entry); + return 0; +} + +struct rxr_rx_entry *rxr_pkt_alloc_rta_rx_entry(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry, int op) +{ + struct rxr_rx_entry *rx_entry; + struct rxr_rta_hdr *rta_hdr; + struct fi_msg msg = {0}; + + msg.addr = pkt_entry->addr; + rx_entry = rxr_ep_get_rx_entry(ep, &msg, 0, ~0, op, 0); + if (OFI_UNLIKELY(!rx_entry)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "RX entries exhausted.\n"); + return NULL; + } + + if (op == ofi_op_atomic) { + rx_entry->addr = pkt_entry->addr; + return rx_entry; + } + + rta_hdr = (struct rxr_rta_hdr *)pkt_entry->pkt; + rx_entry->atomic_hdr.atomic_op = rta_hdr->atomic_op; + rx_entry->atomic_hdr.datatype = rta_hdr->atomic_datatype; + + rx_entry->iov_count = rta_hdr->rma_iov_count; + rxr_rma_verified_copy_iov(ep, rta_hdr->rma_iov, rx_entry->iov_count, + FI_REMOTE_READ, rx_entry->iov, rx_entry->desc); + rx_entry->tx_id = rta_hdr->tx_id; + rx_entry->total_len = ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count); + /* + * prepare a buffer to hold response data. + * Atomic_op operates on 3 data buffers: + * local_data (input/output), + * request_data (input), + * response_data (output) + * The fact local data will be changed by atomic_op means + * response_data is not reproducible. + * Because sending response packet can fail due to + * -FI_EAGAIN, we need a buffer to hold response_data. + * The buffer will be release in rxr_handle_atomrsp_send_completion() + */ + rx_entry->atomrsp_data = ofi_buf_alloc(ep->rx_atomrsp_pool); + if (!rx_entry->atomrsp_data) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "atomic repsonse buffer pool exhausted.\n"); + rxr_release_rx_entry(ep, rx_entry); + return NULL; + } + + return rx_entry; +} + +int rxr_pkt_proc_dc_write_rta(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rx_entry *rx_entry; + struct rxr_rta_hdr *rta_hdr; + ssize_t err; + int ret; + + rx_entry = rxr_pkt_alloc_rta_rx_entry(ep, pkt_entry, ofi_op_atomic); + if (OFI_UNLIKELY(!rx_entry)) { + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return -FI_ENOBUFS; + } + + rta_hdr = (struct rxr_rta_hdr *)pkt_entry->pkt; + rx_entry->tx_id = rta_hdr->tx_id; + rx_entry->rxr_flags |= RXR_DELIVERY_COMPLETE_REQUESTED; + + ret = rxr_pkt_proc_write_rta(ep, pkt_entry); + if (OFI_UNLIKELY(ret)) { + FI_WARN(&rxr_prov, + FI_LOG_CQ, + "Error while processing the write rta packet\n"); + return ret; + } + + err = rxr_pkt_post_ctrl_or_queue(ep, + RXR_RX_ENTRY, + rx_entry, + RXR_RECEIPT_PKT, 0); + if (OFI_UNLIKELY(err)) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "Posting of receipt packet failed! 
err=%s\n", + fi_strerror(err)); + if (rxr_cq_handle_rx_error(ep, rx_entry, err)) + assert(0 && "Cannot handle rx error"); + return err; + } + + return ret; +} + +int rxr_pkt_proc_fetch_rta(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rx_entry *rx_entry; + char *data; + int op, dt, i; + size_t offset, dtsize; + ssize_t err; + + rx_entry = rxr_pkt_alloc_rta_rx_entry(ep, pkt_entry, ofi_op_atomic_fetch); + if(OFI_UNLIKELY(!rx_entry)) { + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + return -FI_ENOBUFS; + } + + op = rx_entry->atomic_hdr.atomic_op; + dt = rx_entry->atomic_hdr.datatype; + dtsize = ofi_datatype_size(rx_entry->atomic_hdr.datatype); + + data = (char *)pkt_entry->pkt + rxr_pkt_req_hdr_size(pkt_entry); + + offset = 0; + for (i = 0; i < rx_entry->iov_count; ++i) { + ofi_atomic_readwrite_handlers[op][dt](rx_entry->iov[i].iov_base, + data + offset, + rx_entry->atomrsp_data + offset, + rx_entry->iov[i].iov_len / dtsize); + offset += rx_entry->iov[i].iov_len; + } + + err = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_ATOMRSP_PKT, 0); + if (OFI_UNLIKELY(err)) { + if (rxr_cq_handle_rx_error(ep, rx_entry, err)) + assert(0 && "Cannot handle rx error"); + } + + rxr_pkt_entry_release_rx(ep, pkt_entry); + return 0; +} + +int rxr_pkt_proc_compare_rta(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rx_entry *rx_entry; + char *src_data, *cmp_data; + int op, dt, i; + size_t offset, dtsize; + ssize_t err; + + rx_entry = rxr_pkt_alloc_rta_rx_entry(ep, pkt_entry, ofi_op_atomic_compare); + if(OFI_UNLIKELY(!rx_entry)) { + efa_eq_write_error(&ep->util_ep, FI_ENOBUFS, -FI_ENOBUFS); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return -FI_ENOBUFS; + } + + op = rx_entry->atomic_hdr.atomic_op; + dt = rx_entry->atomic_hdr.datatype; + dtsize = ofi_datatype_size(rx_entry->atomic_hdr.datatype); + + src_data = (char *)pkt_entry->pkt + rxr_pkt_req_hdr_size(pkt_entry); + cmp_data = src_data + rx_entry->total_len; + + offset = 0; + for (i = 0; i < rx_entry->iov_count; ++i) { + ofi_atomic_swap_handlers[op - FI_CSWAP][dt](rx_entry->iov[i].iov_base, + src_data + offset, + cmp_data + offset, + rx_entry->atomrsp_data + offset, + rx_entry->iov[i].iov_len / dtsize); + offset += rx_entry->iov[i].iov_len; + } + + err = rxr_pkt_post_ctrl_or_queue(ep, RXR_RX_ENTRY, rx_entry, RXR_ATOMRSP_PKT, 0); + if (OFI_UNLIKELY(err)) { + efa_eq_write_error(&ep->util_ep, FI_EIO, err); + ofi_buf_free(rx_entry->atomrsp_data); + rxr_release_rx_entry(ep, rx_entry); + rxr_pkt_entry_release_rx(ep, pkt_entry); + return err; + } + + rxr_pkt_entry_release_rx(ep, pkt_entry); + return 0; +} diff --git a/prov/efa/src/rxr/rxr_pkt_type_req.h b/prov/efa/src/rxr/rxr_pkt_type_req.h new file mode 100644 index 00000000000..a07bdfa4ef8 --- /dev/null +++ b/prov/efa/src/rxr/rxr_pkt_type_req.h @@ -0,0 +1,735 @@ +/* + * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _RXR_PKT_TYPE_REQ_H +#define _RXR_PKT_TYPE_REQ_H + +/* + * This file contains the REQ packet type related structs and functions. + * REQ packets can be classified into 4 categories: + * RTM (Request To Message) is used by message operations + * RTW (Request To Write) is used by RMA write operations + * RTR (Request To Read) is used by RMA read operations + * RTA (Request To Atomic) is used by atomic operations + * + * Each REQ packet type needs to have the following: + * + * 1. a header struct + * 2. an init() function called by rxr_pkt_init_ctrl() + * 3. a handle_sent() function called by rxr_pkt_post_ctrl() + * 4. a handle_send_completion() function called by + * rxr_pkt_handle_send_completion() + * 5. a proc() function called by + * rxr_pkt_proc_req() + * + * Some REQ packet types are so similar that they can share + * some functions. + */ + +/* + * Utilities shared by all REQ packets + * + * Packet Header Flags + */ +#define RXR_REQ_OPT_RAW_ADDR_HDR BIT_ULL(0) +#define RXR_REQ_OPT_CQ_DATA_HDR BIT_ULL(1) +#define RXR_REQ_MSG BIT_ULL(2) +#define RXR_REQ_TAGGED BIT_ULL(3) +#define RXR_REQ_RMA BIT_ULL(4) +#define RXR_REQ_ATOMIC BIT_ULL(5) + +/* + * Extra Feature Flags + */ +#define RXR_REQ_FEATURE_RDMA_READ BIT_ULL(0) +#define RXR_REQ_FEATURE_DELIVERY_COMPLETE BIT_ULL(1) + +/* + * Utility structs and functions for + * REQ packet types + */ +struct rxr_req_opt_raw_addr_hdr { + uint32_t addr_len; + char raw_addr[0]; +}; + +struct rxr_req_opt_cq_data_hdr { + int64_t cq_data; +}; + +void *rxr_pkt_req_raw_addr(struct rxr_pkt_entry *pkt_entry); + +int64_t rxr_pkt_req_cq_data(struct rxr_pkt_entry *pkt_entry); + +size_t rxr_pkt_req_hdr_size(struct rxr_pkt_entry *pkt_entry); + +size_t rxr_pkt_req_base_hdr_size(struct rxr_pkt_entry *pkt_entry); + +size_t rxr_pkt_req_max_header_size(int pkt_type); + +size_t rxr_pkt_max_header_size(void); + +size_t rxr_pkt_req_max_data_size(struct rxr_ep *ep, fi_addr_t addr, int pkt_type); + +/* + * Structs and functions for RTM (Message) packet types + * There are 4 message protocols: + * Eager message protocol, + * Medium message protocol, + * Long message protocol, + * Read message protocol (message by read) + * Each protocol employs two packet types: non-tagged and tagged. + * Thus altogether there are 8 RTM packet types. 
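+ * For example, the eager protocol uses RXR_EAGER_MSGRTM_PKT (non-tagged) and + * RXR_EAGER_TAGRTM_PKT (tagged); the medium, long and read protocols follow the + * same naming pattern (RXR_MEDIUM_*, RXR_LONG_*, RXR_READ_*). The eager, medium + * and long protocols additionally define DC_ variants (e.g. RXR_DC_EAGER_MSGRTM_PKT), + * which are used when delivery complete is requested. 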
+ */ + +/* + * Utility structs and functions shared by all + * RTM packet types + */ +struct rxr_rtm_base_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + uint32_t msg_id; +}; + +static inline +struct rxr_rtm_base_hdr *rxr_get_rtm_base_hdr(void *pkt) +{ + return (struct rxr_rtm_base_hdr *)pkt; +} + +static inline +uint32_t rxr_pkt_msg_id(struct rxr_pkt_entry *pkt_entry) +{ + struct rxr_rtm_base_hdr *rtm_hdr; + + rtm_hdr = rxr_get_rtm_base_hdr(pkt_entry->pkt); + /* only msg and atomic request has msg_id */ + assert(rtm_hdr->flags & (RXR_REQ_MSG | RXR_REQ_ATOMIC)); + return rtm_hdr->msg_id; +} + +size_t rxr_pkt_rtm_total_len(struct rxr_pkt_entry *pkt_entry); + +static inline +uint64_t rxr_pkt_rtm_tag(struct rxr_pkt_entry *pkt_entry) +{ + size_t offset; + uint64_t *tagptr; + + /* + * In consideration of performance, this function did not cast header + * into different header types to get tag, but assume tag is always + * the last member of header. + */ + offset = rxr_pkt_req_base_hdr_size(pkt_entry) - sizeof(uint64_t); + tagptr = (uint64_t *)((char *)pkt_entry->pkt + offset); + return *tagptr; +} + +static inline +void rxr_pkt_rtm_settag(struct rxr_pkt_entry *pkt_entry, uint64_t tag) +{ + size_t offset; + uint64_t *tagptr; + + offset = rxr_pkt_req_base_hdr_size(pkt_entry) - sizeof(uint64_t); + /* tag is always the last member */ + tagptr = (uint64_t *)((char *)pkt_entry->pkt + offset); + *tagptr = tag; +} + +/* + * Header structs for each REQ packe type + */ +struct rxr_eager_msgrtm_hdr { + struct rxr_rtm_base_hdr hdr; +}; + +struct rxr_eager_tagrtm_hdr { + struct rxr_rtm_base_hdr hdr; + uint64_t tag; +}; + +struct rxr_dc_eager_rtm_base_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + uint32_t msg_id; + uint32_t tx_id; + uint32_t padding; +}; + +static inline +struct rxr_dc_eager_rtm_base_hdr *rxr_get_dc_eager_rtm_base_hdr(void *pkt) +{ + return (struct rxr_dc_eager_rtm_base_hdr *)pkt; +} + +struct rxr_dc_eager_msgrtm_hdr { + struct rxr_dc_eager_rtm_base_hdr hdr; +}; + +static inline +struct rxr_dc_eager_msgrtm_hdr *rxr_get_dc_eager_msgrtm_hdr(void *pkt) +{ + return (struct rxr_dc_eager_msgrtm_hdr *)pkt; +} + +struct rxr_dc_eager_tagrtm_hdr { + struct rxr_dc_eager_rtm_base_hdr hdr; + uint64_t tag; +}; + +static inline +struct rxr_dc_eager_tagrtm_hdr *rxr_get_dc_eager_tagrtm_hdr(void *pkt) +{ + return (struct rxr_dc_eager_tagrtm_hdr *)pkt; +} + +struct rxr_medium_rtm_base_hdr { + struct rxr_rtm_base_hdr hdr; + uint64_t data_len; + uint64_t offset; +}; + +struct rxr_dc_medium_rtm_base_hdr { + struct rxr_rtm_base_hdr hdr; + uint32_t tx_id; + uint32_t padding; + uint64_t data_len; + uint64_t offset; +}; + +struct rxr_medium_msgrtm_hdr { + struct rxr_medium_rtm_base_hdr hdr; +}; + +struct rxr_dc_medium_msgrtm_hdr { + struct rxr_dc_medium_rtm_base_hdr hdr; +}; + +struct rxr_medium_tagrtm_hdr { + struct rxr_medium_rtm_base_hdr hdr; + uint64_t tag; +}; + +struct rxr_dc_medium_tagrtm_hdr { + struct rxr_dc_medium_rtm_base_hdr hdr; + uint64_t tag; +}; + +static inline +struct rxr_medium_rtm_base_hdr *rxr_get_medium_rtm_base_hdr(void *pkt) +{ + return (struct rxr_medium_rtm_base_hdr *)pkt; +} + +static inline +struct rxr_dc_medium_rtm_base_hdr *rxr_get_dc_medium_rtm_base_hdr(void *pkt) +{ + return (struct rxr_dc_medium_rtm_base_hdr *)pkt; +} + +static inline +struct rxr_dc_medium_msgrtm_hdr *rxr_get_dc_medium_msgrtm_hdr(void *pkt) +{ + return (struct rxr_dc_medium_msgrtm_hdr *)pkt; +} + +static inline +struct rxr_dc_medium_tagrtm_hdr *rxr_get_dc_medium_tagrtm_hdr(void *pkt) +{ + 
return (struct rxr_dc_medium_tagrtm_hdr *)pkt; +} + +struct rxr_long_rtm_base_hdr { + struct rxr_rtm_base_hdr hdr; + uint64_t data_len; + uint32_t tx_id; + uint32_t credit_request; +}; + +static inline +struct rxr_long_rtm_base_hdr *rxr_get_long_rtm_base_hdr(void *pkt) +{ + return (struct rxr_long_rtm_base_hdr *)pkt; +} + +struct rxr_long_msgrtm_hdr { + struct rxr_long_rtm_base_hdr hdr; +}; + +struct rxr_long_tagrtm_hdr { + struct rxr_long_rtm_base_hdr hdr; + uint64_t tag; +}; + +struct rxr_read_rtm_base_hdr { + struct rxr_rtm_base_hdr hdr; + uint64_t data_len; + uint32_t tx_id; + uint32_t read_iov_count; +}; + +static inline +struct rxr_read_rtm_base_hdr *rxr_get_read_rtm_base_hdr(void *pkt) +{ + return (struct rxr_read_rtm_base_hdr *)pkt; +} + +struct rxr_read_msgrtm_hdr { + struct rxr_read_rtm_base_hdr hdr; +}; + +struct rxr_read_tagrtm_hdr { + struct rxr_read_rtm_base_hdr hdr; + uint64_t tag; +}; + +static inline +int rxr_read_rtm_pkt_type(int op) +{ + assert(op == ofi_op_tagged || op == ofi_op_msg); + return (op == ofi_op_tagged) ? RXR_READ_TAGRTM_PKT + : RXR_READ_MSGRTM_PKT; +} + +/* + * init() functions for RTM packets + */ +ssize_t rxr_pkt_init_eager_msgrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_dc_eager_msgrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_eager_tagrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_medium_msgrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_dc_eager_tagrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_dc_medium_msgrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_medium_tagrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_dc_medium_tagrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_long_msgrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_dc_long_msgrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_long_tagrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_dc_long_tagrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_read_msgrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_read_tagrtm(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); +/* + * handle_sent() functions for RTM packets + */ +static inline +void rxr_pkt_handle_eager_rtm_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + /* there is nothing to be done for eager RTM */ + return; +} + +void rxr_pkt_handle_medium_rtm_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_long_rtm_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +static inline +void rxr_pkt_handle_read_rtm_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ +} + +/* + * handle_send_completion() functions for RTM packet types + */ +void 
rxr_pkt_handle_eager_rtm_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_medium_rtm_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_long_rtm_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_dc_long_rtm_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +static inline +void rxr_pkt_handle_read_rtm_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ +} + +/* + * proc() functions for RTM packet types + */ +void rxr_pkt_rtm_init_rx_entry(struct rxr_pkt_entry *pkt_entry, + struct rxr_rx_entry *rx_entry); + +/* This function is called by both + * rxr_pkt_handle_rtm_recv() and + * rxr_msg_handle_unexp_match() + */ +ssize_t rxr_pkt_proc_matched_rtm(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_proc_rtm_rta(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); +/* + * This function handles zero-copy receives that do not require ordering + */ +void rxr_pkt_handle_zcpy_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); +/* + * This function is shared by all RTM packet types which handle + * reordering + */ +void rxr_pkt_handle_rtm_rta_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +/* Structs and functions for RTW packet types + * There are 3 write protocols + * Eager write protocol, + * Long write protocol and + * Read write protocol (write by read) + * Each protocol correspond to a packet type + */ + +/* + * Header structs + */ +struct rxr_rtw_base_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + /* end of rxr_base_hdr */ + uint32_t rma_iov_count; +}; + +static inline +struct rxr_rtw_base_hdr *rxr_get_rtw_base_hdr(void *pkt) +{ + return (struct rxr_rtw_base_hdr *)pkt; +} + +struct efa_rma_iov { + uint64_t addr; + size_t len; + uint64_t key; +}; + +struct rxr_eager_rtw_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + /* end of rxr_base_hdr */ + uint32_t rma_iov_count; + struct fi_rma_iov rma_iov[0]; +}; + +struct rxr_dc_eager_rtw_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + /* end of rxr_base_hdr */ + uint32_t rma_iov_count; + /* end of rxr_rtw_base_hdr */ + uint32_t tx_id; + uint32_t padding; + struct efa_rma_iov rma_iov[0]; +}; + +static inline +struct rxr_dc_eager_rtw_hdr *rxr_get_dc_eager_rtw_hdr(void *pkt) +{ + return (struct rxr_dc_eager_rtw_hdr *)pkt; +} + +struct rxr_long_rtw_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + /* end of rxr_base_hdr */ + uint32_t rma_iov_count; + uint64_t data_len; + uint32_t tx_id; + uint32_t credit_request; + struct fi_rma_iov rma_iov[0]; +}; + +struct rxr_read_rtw_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + /* end of rxr_base_hdr */ + uint32_t rma_iov_count; + uint64_t data_len; + uint32_t tx_id; + uint32_t read_iov_count; + struct fi_rma_iov rma_iov[0]; +}; + +/* + * init() functions for each RTW packet types + */ +ssize_t rxr_pkt_init_eager_rtw(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_long_rtw(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_read_rtw(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_dc_eager_rtw(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t 
rxr_pkt_init_dc_long_rtw(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +/* + * handle_sent() functions + */ +static inline +void rxr_pkt_handle_eager_rtw_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ + /* For eager RTW, there is nothing to be done here */ + return; +} + +void rxr_pkt_handle_long_rtw_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +static inline +void rxr_pkt_handle_read_rtw_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ +} + +/* + * handle_send_completion() functions + */ +void rxr_pkt_handle_eager_rtw_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_long_rtw_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_dc_long_rtw_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +static inline +void rxr_pkt_handle_read_rtw_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ +} + +/* + * handle_recv() functions + */ +void rxr_pkt_handle_eager_rtw_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_dc_eager_rtw_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_long_rtw_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_read_rtw_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +/* Structs and functions for RTR packet types + * There are 3 read protocols: + * Short protocol, + * Long read protocol and + * RDMA read protocol + * Each protocol corresponds to a packet type + */ + +/* + * Header structs + */ +struct rxr_rtr_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + /* end of rxr_base_hdr */ + uint32_t rma_iov_count; + uint64_t data_len; + uint32_t read_req_rx_id; + uint32_t read_req_window; + struct fi_rma_iov rma_iov[0]; +}; + +static inline +struct rxr_rtr_hdr *rxr_get_rtr_hdr(void *pkt) +{ + return (struct rxr_rtr_hdr *)pkt; +} + +/* + * init() functions for each RTR packet type + */ +ssize_t rxr_pkt_init_short_rtr(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_long_rtr(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +/* + * handle_sent() functions + */ +void rxr_pkt_handle_rtr_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +/* + * handle_send_completion() functions + */ +void rxr_pkt_handle_rtr_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); +/* + * handle_recv() functions + */ +void rxr_pkt_handle_rtr_recv(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +/* Structs and functions for RTA packet types + * There are 2 atomic protocols: + * write atomic protocol and + * read/compare atomic protocol. + * Each protocol corresponds to a packet type + */ +struct rxr_rta_hdr { + uint8_t type; + uint8_t version; + uint16_t flags; + uint32_t msg_id; + /* end of rtm_base_hdr, atomic packets need msg_id for reordering */ + uint32_t rma_iov_count; + uint32_t atomic_datatype; + uint32_t atomic_op; + uint32_t tx_id; + struct fi_rma_iov rma_iov[0]; +}; + +static inline +struct rxr_rta_hdr *rxr_get_rta_hdr(void *pkt) +{ + return (struct rxr_rta_hdr *)pkt; +} + +ssize_t rxr_pkt_init_write_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_dc_write_rta(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct rxr_pkt_entry *pkt_entry); + +ssize_t 
rxr_pkt_init_fetch_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, struct rxr_pkt_entry *pkt_entry); + +ssize_t rxr_pkt_init_compare_rta(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry, struct rxr_pkt_entry *pkt_entry); + +static inline +void rxr_pkt_handle_rta_sent(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry) +{ +} + +void rxr_pkt_handle_write_rta_send_completion(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +/* no action to be taken for compare_rta and fetch rta's send completion therefore + * there are not functions named rxr_pkt_handle_compare/fetch_rta_send_completion() + */ + +int rxr_pkt_proc_write_rta(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +int rxr_pkt_proc_dc_write_rta(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +int rxr_pkt_proc_fetch_rta(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +int rxr_pkt_proc_compare_rta(struct rxr_ep *ep, + struct rxr_pkt_entry *pkt_entry); + +void rxr_pkt_handle_rta_recv(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry); +#endif diff --git a/prov/efa/src/rxr/rxr_read.c b/prov/efa/src/rxr/rxr_read.c new file mode 100644 index 00000000000..3ba786f3375 --- /dev/null +++ b/prov/efa/src/rxr/rxr_read.c @@ -0,0 +1,627 @@ +/* + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "efa.h" +#include "rxr.h" +#include "rxr_rma.h" +#include "rxr_cntr.h" +#include "rxr_read.h" + +int rxr_locate_iov_pos(struct iovec *iov, int iov_count, size_t offset, + int *iov_idx, size_t *iov_offset) +{ + int i; + size_t curoffset; + + curoffset = 0; + for (i = 0; i < iov_count; ++i) { + if (offset >= curoffset && + offset < curoffset + iov[i].iov_len) { + *iov_idx = i; + *iov_offset = offset - curoffset; + return 0; + } + + curoffset += iov[i].iov_len; + } + + return -1; +} + +int rxr_locate_rma_iov_pos(struct fi_rma_iov *rma_iov, int rma_iov_count, size_t offset, + int *rma_iov_idx, size_t *rma_iov_offset) +{ + int i; + size_t curoffset; + + curoffset = 0; + for (i = 0; i < rma_iov_count; ++i) { + if (offset >= curoffset && + offset < curoffset + rma_iov[i].len) { + *rma_iov_idx = i; + *rma_iov_offset = offset - curoffset; + return 0; + } + + curoffset += rma_iov[i].len; + } + + return -1; +} + +/* + * rxr_read_prepare_pkt_entry_mr() ensures that pkt_entry's memory is registered. + * + * For a packet entry whose memory is not registered, it will reserve a pkt entry + * from rx_readcopy_pkt_pool and copy the data there. + * + * Return value: + * + * On success, return 0 + * On packet entry reservation failure, return -FI_EAGAIN + */ +static +ssize_t rxr_read_prepare_pkt_entry_mr(struct rxr_ep *ep, struct rxr_read_entry *read_entry) +{ + size_t pkt_offset; + struct rxr_pkt_entry *pkt_entry; + struct rxr_pkt_entry *pkt_entry_copy; + + assert(read_entry->context_type == RXR_READ_CONTEXT_PKT_ENTRY); + /* + * In this case, target buffer is data in a pkt_entry, so rma_iov_count must be 1. + */ + assert(read_entry->rma_iov_count == 1); + + pkt_entry = read_entry->context; + if (pkt_entry->mr) { + assert(read_entry->rma_iov[0].key == fi_mr_key(pkt_entry->mr)); + return 0; + } + + /* only OOO and unexpected packet entries' memory is not registered with the device */ + assert(pkt_entry->type == RXR_PKT_ENTRY_OOO || + pkt_entry->type == RXR_PKT_ENTRY_UNEXP); + + pkt_offset = (char *)read_entry->rma_iov[0].addr - (char *)pkt_entry->pkt; + assert(pkt_offset > sizeof(struct rxr_base_hdr)); + + pkt_entry_copy = rxr_pkt_entry_clone(ep, ep->rx_readcopy_pkt_pool, + pkt_entry, RXR_PKT_ENTRY_READ_COPY); + if (!pkt_entry_copy) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "readcopy pkt pool exhausted! Set FI_EFA_READCOPY_POOL_SIZE to a higher value!\n"); + return -FI_EAGAIN; + } + + rxr_pkt_entry_release_rx(ep, pkt_entry); + + assert(pkt_entry_copy->mr); + read_entry->context = pkt_entry_copy; + read_entry->rma_iov[0].addr = (uint64_t)pkt_entry_copy->pkt + pkt_offset; + read_entry->rma_iov[0].key = fi_mr_key(pkt_entry_copy->mr); + + return 0; +} + +/* + * rxr_read_mr_reg() registers the memory of the local buffer if the application did not + * provide a descriptor. + * It is called by rxr_read_post(). + * On success, it returns 0. + * If memory registration failed with -FI_ENOMEM, it will return -FI_EAGAIN. + * If memory registration failed with another error, it will return that error code. + */ +ssize_t rxr_read_mr_reg(struct rxr_ep *ep, struct rxr_read_entry *read_entry) +{ + size_t i; + int err; + + for (i = 0; i < read_entry->iov_count; ++i) { + if (read_entry->mr_desc[i] || read_entry->mr[i]) { + continue; + } + + err = fi_mr_reg(rxr_ep_domain(ep)->rdm_domain, + read_entry->iov[i].iov_base, read_entry->iov[i].iov_len, + FI_RECV, 0, 0, 0, &read_entry->mr[i], NULL); + + if (err) { + /* If registration failed with -FI_ENOMEM, we return -FI_EAGAIN. + * This read entry will be put into a queue. 
+ * + * The progress engine will progress other message transfers, which + * will release registrations. Thus, when the progress engine call this + * function again later, there will be registrations available. + * + * All registration opened here will be closed during release of + * the read_entry. + */ + FI_WARN(&rxr_prov, FI_LOG_MR, "Unable to register MR buf for read!\n"); + if (err == -FI_ENOMEM) + err = -FI_EAGAIN; + return err; + } + + read_entry->mr_desc[i] = fi_mr_desc(read_entry->mr[i]); + } + + return 0; +} + +/* rxr_read_alloc_entry allocates a read entry. + * It is called by rxr_read_post_or_queue(). + * Input: + * x_entry: can be a tx_entry or an rx_entry. + * If x_entry is tx_entry, application called fi_read(). + * If x_entry is rx_entry, read message protocol is being used. + * lower_ep_type: EFA_EP or SHM_EP + * Return: + * On success, return the pointer of allocated read_entry + * Otherwise, return NULL + */ +struct rxr_read_entry *rxr_read_alloc_entry(struct rxr_ep *ep, int entry_type, void *x_entry, + enum rxr_lower_ep_type lower_ep_type) +{ + struct rxr_tx_entry *tx_entry = NULL; + struct rxr_rx_entry *rx_entry = NULL; + struct rxr_read_entry *read_entry; + int i; + size_t total_iov_len, total_rma_iov_len; + + read_entry = ofi_buf_alloc(ep->read_entry_pool); + if (OFI_UNLIKELY(!read_entry)) { + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "RDMA entries exhausted\n"); + return NULL; + } + + read_entry->type = RXR_READ_ENTRY; + read_entry->read_id = ofi_buf_index(read_entry); + read_entry->state = RXR_RDMA_ENTRY_CREATED; + + if (entry_type == RXR_TX_ENTRY) { + tx_entry = (struct rxr_tx_entry *)x_entry; + assert(tx_entry->op == ofi_op_read_req); + + read_entry->context_type = RXR_READ_CONTEXT_TX_ENTRY; + read_entry->context = tx_entry; + read_entry->addr = tx_entry->addr; + + read_entry->iov_count = tx_entry->iov_count; + memcpy(read_entry->iov, tx_entry->iov, + tx_entry->iov_count * sizeof(struct iovec)); + + read_entry->rma_iov_count = tx_entry->rma_iov_count; + memcpy(read_entry->rma_iov, tx_entry->rma_iov, + tx_entry->rma_iov_count * sizeof(struct fi_rma_iov)); + + total_iov_len = ofi_total_iov_len(tx_entry->iov, tx_entry->iov_count); + total_rma_iov_len = ofi_total_rma_iov_len(tx_entry->rma_iov, tx_entry->rma_iov_count); + read_entry->total_len = MIN(total_iov_len, total_rma_iov_len); + + if (tx_entry->desc) { + memcpy(read_entry->mr_desc, tx_entry->desc, + read_entry->iov_count * sizeof(void *)); + } + + } else { + rx_entry = (struct rxr_rx_entry *)x_entry; + assert(rx_entry->op == ofi_op_write || rx_entry->op == ofi_op_msg || + rx_entry->op == ofi_op_tagged); + + read_entry->context_type = RXR_READ_CONTEXT_RX_ENTRY; + read_entry->context = rx_entry; + read_entry->addr = rx_entry->addr; + + read_entry->iov_count = rx_entry->iov_count; + memcpy(read_entry->iov, rx_entry->iov, + rx_entry->iov_count * sizeof(struct iovec)); + + read_entry->rma_iov_count = rx_entry->rma_iov_count; + memcpy(read_entry->rma_iov, rx_entry->rma_iov, + rx_entry->rma_iov_count * sizeof(struct fi_rma_iov)); + + total_iov_len = ofi_total_iov_len(rx_entry->iov, rx_entry->iov_count); + total_rma_iov_len = ofi_total_rma_iov_len(rx_entry->rma_iov, rx_entry->rma_iov_count); + read_entry->total_len = MIN(total_iov_len, total_rma_iov_len); + + if (rx_entry->desc) { + memcpy(read_entry->mr_desc, rx_entry->desc, + read_entry->iov_count * sizeof(void *)); + } + } + + memset(read_entry->mr, 0, read_entry->iov_count * sizeof(struct fid_mr *)); + + if (lower_ep_type == SHM_EP) { + assert(lower_ep_type == SHM_EP); + 
/* FI_MR_VIRT_ADDR is not being set, use 0-based offset instead. */ + if (!(shm_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR)) { + for (i = 0; i < read_entry->rma_iov_count; ++i) + read_entry->rma_iov[i].addr = 0; + } + } + + read_entry->lower_ep_type = lower_ep_type; + read_entry->bytes_submitted = 0; + read_entry->bytes_finished = 0; + return read_entry; +} + +void rxr_read_release_entry(struct rxr_ep *ep, struct rxr_read_entry *read_entry) +{ + int i, err; + + for (i = 0; i < read_entry->iov_count; ++i) { + if (read_entry->mr[i]) { + err = fi_close((struct fid *)read_entry->mr[i]); + if (err) { + FI_WARN(&rxr_prov, FI_LOG_MR, "Unable to close mr\n"); + rxr_read_handle_error(ep, read_entry, err); + } + } + } + +#ifdef ENABLE_EFA_POISONING + rxr_poison_mem_region((uint32_t *)read_entry, sizeof(struct rxr_read_entry)); +#endif + read_entry->state = RXR_RDMA_ENTRY_FREE; + ofi_buf_free(read_entry); +} + +static inline +int rxr_read_post_or_queue(struct rxr_ep *ep, struct rxr_read_entry *read_entry) +{ + int err; + + err = rxr_read_post(ep, read_entry); + if (err == -FI_EAGAIN) { + dlist_insert_tail(&read_entry->pending_entry, &ep->read_pending_list); + read_entry->state = RXR_RDMA_ENTRY_PENDING; + err = 0; + } else if(err) { + rxr_read_release_entry(ep, read_entry); + FI_WARN(&rxr_prov, FI_LOG_CQ, + "RDMA post read failed. errno=%d.\n", err); + } + + return err; +} + +int rxr_read_post_remote_read_or_queue(struct rxr_ep *ep, int entry_type, void *x_entry) +{ + struct rxr_peer *peer; + struct rxr_read_entry *read_entry; + int lower_ep_type; + + if (entry_type == RXR_TX_ENTRY) { + peer = rxr_ep_get_peer(ep, ((struct rxr_tx_entry *)x_entry)->addr); + } else { + assert(entry_type == RXR_RX_ENTRY); + peer = rxr_ep_get_peer(ep, ((struct rxr_rx_entry *)x_entry)->addr); + } + + lower_ep_type = (peer->is_local) ? SHM_EP : EFA_EP; + read_entry = rxr_read_alloc_entry(ep, entry_type, x_entry, lower_ep_type); + if (!read_entry) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "RDMA entries exhausted.\n"); + return -FI_ENOBUFS; + } + + return rxr_read_post_or_queue(ep, read_entry); +} + +int rxr_read_post_local_read_or_queue(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + size_t data_offset, + struct rxr_pkt_entry *pkt_entry, + char *data, size_t data_size) +{ + int err; + struct rxr_read_entry *read_entry; + + read_entry = ofi_buf_alloc(ep->read_entry_pool); + if (!read_entry) { + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "RDMA entries exhausted\n"); + return -FI_ENOBUFS; + } + + read_entry->read_id = ofi_buf_index(read_entry); + read_entry->lower_ep_type = EFA_EP; + read_entry->context_type = RXR_READ_CONTEXT_PKT_ENTRY; + read_entry->context = pkt_entry; + read_entry->state = RXR_RDMA_ENTRY_CREATED; + read_entry->addr = FI_ADDR_NOTAVAIL; + read_entry->total_len = data_size; + read_entry->bytes_submitted = 0; + read_entry->bytes_finished = 0; + + /* setup rma_iov */ + read_entry->rma_iov_count = 1; + read_entry->rma_iov[0].addr = (uint64_t)data; + read_entry->rma_iov[0].len = data_size; + read_entry->rma_iov[0].key = (pkt_entry->mr) ? 
fi_mr_key(pkt_entry->mr) : 0; + + /* setup iov */ + assert(pkt_entry->x_entry == rx_entry); + assert(rx_entry->desc && efa_ep_is_cuda_mr(rx_entry->desc[0])); + read_entry->iov_count = rx_entry->iov_count; + memcpy(read_entry->iov, rx_entry->iov, rx_entry->iov_count * sizeof(struct iovec)); + memcpy(read_entry->mr_desc, rx_entry->desc, rx_entry->iov_count * sizeof(void *)); + ofi_consume_iov_desc(read_entry->iov, read_entry->mr_desc, &read_entry->iov_count, data_offset); + if (read_entry->iov_count == 0) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "data_offset %ld out of range\n", + data_offset); + ofi_buf_free(read_entry); + return -FI_ETRUNC; + } + + assert(efa_ep_is_cuda_mr(read_entry->mr_desc[0])); + err = ofi_truncate_iov(read_entry->iov, &read_entry->iov_count, data_size); + if (err) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "data_offset %ld data_size %ld out of range\n", + data_offset, data_size); + ofi_buf_free(read_entry); + return -FI_ETRUNC; + } + + return rxr_read_post_or_queue(ep, read_entry); +} + +int rxr_read_init_iov(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct fi_rma_iov *read_iov) +{ + int i, err; + struct fid_mr *mr; + struct rxr_peer *peer; + + peer = rxr_ep_get_peer(ep, tx_entry->addr); + + for (i = 0; i < tx_entry->iov_count; ++i) { + read_iov[i].addr = (uint64_t)tx_entry->iov[i].iov_base; + read_iov[i].len = tx_entry->iov[i].iov_len; + } + + if (tx_entry->desc[0]) { + for (i = 0; i < tx_entry->iov_count; ++i) { + mr = (struct fid_mr *)tx_entry->desc[i]; + read_iov[i].key = fi_mr_key(mr); + } + } else { + /* note mr could be been set by an unsucessful rxr_ep_post_ctrl */ + if (!tx_entry->mr[0]) { + for (i = 0; i < tx_entry->iov_count; ++i) { + assert(!tx_entry->mr[i]); + + if (peer->is_local) + err = efa_mr_reg_shm(rxr_ep_domain(ep)->rdm_domain, + tx_entry->iov + i, + FI_REMOTE_READ, &tx_entry->mr[i]); + else + err = fi_mr_regv(rxr_ep_domain(ep)->rdm_domain, + tx_entry->iov + i, 1, + FI_REMOTE_READ, + 0, 0, 0, &tx_entry->mr[i], NULL); + if (err) { + FI_WARN(&rxr_prov, FI_LOG_MR, + "Unable to register MR buf %p as FI_REMOTE_READ", + tx_entry->iov[i].iov_base); + return err; + } + } + } + + for (i = 0; i < tx_entry->iov_count; ++i) { + assert(tx_entry->mr[i]); + read_iov[i].key = fi_mr_key(tx_entry->mr[i]); + } + } + + return 0; +} + +int rxr_read_post(struct rxr_ep *ep, struct rxr_read_entry *read_entry) +{ + int ret; + int iov_idx = 0, rma_iov_idx = 0; + bool self_comm; + size_t iov_offset = 0, rma_iov_offset = 0; + size_t total_iov_len, total_rma_iov_len, max_read_size; + struct rxr_pkt_entry *pkt_entry; + struct iovec iov; + struct fi_rma_iov rma_iov; + struct fi_msg_rma msg; + struct efa_ep *efa_ep; + struct rxr_peer *peer; + fi_addr_t shm_fiaddr = FI_ADDR_NOTAVAIL; + + assert(read_entry->iov_count > 0); + assert(read_entry->rma_iov_count > 0); + assert(read_entry->bytes_submitted < read_entry->total_len); + + if (read_entry->context_type == RXR_READ_CONTEXT_PKT_ENTRY) { + assert(read_entry->lower_ep_type == EFA_EP); + ret = rxr_read_prepare_pkt_entry_mr(ep, read_entry); + if (ret) + return ret; + } + + if (read_entry->lower_ep_type == EFA_EP) { + ret = rxr_read_mr_reg(ep, read_entry); + if (ret) + return ret; + } + + peer = rxr_ep_get_peer(ep, read_entry->addr); + + if (read_entry->lower_ep_type == SHM_EP) + shm_fiaddr = peer->shm_fiaddr; + + max_read_size = (read_entry->lower_ep_type == EFA_EP) ? 
+ efa_max_rdma_size(ep->rdm_ep) : SIZE_MAX; + assert(max_read_size > 0); + + ret = rxr_locate_iov_pos(read_entry->iov, read_entry->iov_count, + read_entry->bytes_submitted, + &iov_idx, &iov_offset); + assert(ret == 0); + + ret = rxr_locate_rma_iov_pos(read_entry->rma_iov, read_entry->rma_iov_count, + read_entry->bytes_submitted, + &rma_iov_idx, &rma_iov_offset); + assert(ret == 0); + + total_iov_len = ofi_total_iov_len(read_entry->iov, read_entry->iov_count); + total_rma_iov_len = ofi_total_rma_iov_len(read_entry->rma_iov, read_entry->rma_iov_count); + assert(read_entry->total_len == MIN(total_iov_len, total_rma_iov_len)); + + while (read_entry->bytes_submitted < read_entry->total_len) { + + if (ep->tx_pending == ep->max_outstanding_tx) + return -FI_EAGAIN; + + assert(iov_idx < read_entry->iov_count); + assert(iov_offset < read_entry->iov[iov_idx].iov_len); + assert(rma_iov_idx < read_entry->rma_iov_count); + assert(rma_iov_offset < read_entry->rma_iov[rma_iov_idx].len); + + iov.iov_base = (char *)read_entry->iov[iov_idx].iov_base + iov_offset; + iov.iov_len = read_entry->iov[iov_idx].iov_len - iov_offset; + + rma_iov.addr = (uintptr_t)read_entry->rma_iov[rma_iov_idx].addr + rma_iov_offset; + rma_iov.len = read_entry->rma_iov[rma_iov_idx].len - rma_iov_offset; + rma_iov.key = read_entry->rma_iov[rma_iov_idx].key; + + iov.iov_len = MIN(iov.iov_len, rma_iov.len); + if (read_entry->lower_ep_type == EFA_EP) + iov.iov_len = MIN(iov.iov_len, rxr_env.efa_read_segment_size); + iov.iov_len = MIN(iov.iov_len, max_read_size); + rma_iov.len = iov.iov_len; + + /* because fi_send uses a pkt_entry as context + * we had to use a pkt_entry as context too + */ + if (read_entry->lower_ep_type == SHM_EP) + pkt_entry = rxr_pkt_entry_alloc(ep, ep->tx_pkt_shm_pool); + else + pkt_entry = rxr_pkt_entry_alloc(ep, ep->tx_pkt_efa_pool); + + if (OFI_UNLIKELY(!pkt_entry)) + return -FI_EAGAIN; + + rxr_pkt_init_read_context(ep, read_entry, iov.iov_len, pkt_entry); + + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &iov; + msg.desc = &read_entry->mr_desc[iov_idx]; + msg.iov_count = 1; + msg.rma_iov = &rma_iov; + msg.rma_iov_count = 1; + msg.context = pkt_entry; + + if (read_entry->lower_ep_type == SHM_EP) { + msg.addr = shm_fiaddr; + ret = fi_readmsg(ep->shm_ep, &msg, 0); + } else { + efa_ep = container_of(ep->rdm_ep, struct efa_ep, util_ep.ep_fid); + msg.addr = read_entry->addr; + self_comm = (read_entry->context_type == RXR_READ_CONTEXT_PKT_ENTRY); + ret = efa_rma_post_read(efa_ep, &msg, 0, self_comm); + } + + if (OFI_UNLIKELY(ret)) { + rxr_pkt_entry_release_tx(ep, pkt_entry); + return ret; + } + + if (read_entry->context_type == RXR_READ_CONTEXT_PKT_ENTRY) { + assert(read_entry->lower_ep_type == EFA_EP); + /* read from self, no peer */ + ep->tx_pending++; + } else if (read_entry->lower_ep_type == EFA_EP) { + rxr_ep_inc_tx_pending(ep, peer); + } + + read_entry->bytes_submitted += iov.iov_len; + + iov_offset += iov.iov_len; + assert(iov_offset <= read_entry->iov[iov_idx].iov_len); + if (iov_offset == read_entry->iov[iov_idx].iov_len) { + iov_idx += 1; + iov_offset = 0; + } + + rma_iov_offset += rma_iov.len; + assert(rma_iov_offset <= read_entry->rma_iov[rma_iov_idx].len); + if (rma_iov_offset == read_entry->rma_iov[rma_iov_idx].len) { + rma_iov_idx += 1; + rma_iov_offset = 0; + } + } + + if (read_entry->total_len == total_iov_len) { + assert(iov_idx == read_entry->iov_count); + assert(iov_offset == 0); + } + + if (read_entry->total_len == total_rma_iov_len) { + assert(rma_iov_idx == read_entry->rma_iov_count); + 
assert(rma_iov_offset == 0); + } + + return 0; +} + +int rxr_read_handle_error(struct rxr_ep *ep, struct rxr_read_entry *read_entry, int ret) +{ + struct rxr_tx_entry *tx_entry; + struct rxr_rx_entry *rx_entry; + + if (read_entry->context_type == RXR_READ_CONTEXT_TX_ENTRY) { + tx_entry = read_entry->context; + ret = rxr_cq_handle_tx_error(ep, tx_entry, ret); + } else { + assert(read_entry->context_type == RXR_READ_CONTEXT_RX_ENTRY); + rx_entry = read_entry->context; + ret = rxr_cq_handle_rx_error(ep, rx_entry, ret); + } + + if (read_entry->state == RXR_RDMA_ENTRY_PENDING) + dlist_remove(&read_entry->pending_entry); + return ret; +} + diff --git a/prov/efa/src/rxr/rxr_read.h b/prov/efa/src/rxr/rxr_read.h new file mode 100644 index 00000000000..334648a0c69 --- /dev/null +++ b/prov/efa/src/rxr/rxr_read.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +#include <config.h> +#endif /* HAVE_CONFIG_H */ + +#ifndef _RXR_RDMA_H_ +#define _RXR_RDMA_H_ + +/* + * read can be used in 3 scenarios: + * + * 1. the application posted a read request. + * + * 2. the read message protocol is being used, and the receiver is going + * to post a read request. + * + * 3. a packet entry with data has been received, and the + * receiving buffer is on GPU memory. A read request is + * being posted to copy data to the receiving buffer. + * + * To distinguish them, we use a pointer as context. + * + * For 1, the tx_entry is used as context + * For 2, the rx_entry is used as context + * For 3, the pkt_entry is used as context + * + * We also store rxr_read_context_type in read_entry to specify + * the context type.
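+ *
+ * For example, in scenario 3 rxr_read_post_local_read_or_queue() stores the
+ * received pkt_entry as the context and sets the context type to
+ * RXR_READ_CONTEXT_PKT_ENTRY, so the completion path can tell that this
+ * read was copying packet data into the receive buffer.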
+ */ +enum rxr_read_context_type { + RXR_READ_CONTEXT_TX_ENTRY, + RXR_READ_CONTEXT_RX_ENTRY, + RXR_READ_CONTEXT_PKT_ENTRY, +}; + +enum rxr_read_entry_state { + RXR_RDMA_ENTRY_FREE = 0, + RXR_RDMA_ENTRY_CREATED, + RXR_RDMA_ENTRY_PENDING, + RXR_RDMA_ENTRY_SUBMITTED, +}; + +/* + * rxr_read_entry contains the information of a read request + */ +struct rxr_read_entry { + enum rxr_x_entry_type type; + int read_id; + enum rxr_lower_ep_type lower_ep_type; + + void *context; + enum rxr_read_context_type context_type; + + enum rxr_read_entry_state state; + + fi_addr_t addr; + + struct iovec iov[RXR_IOV_LIMIT]; + size_t iov_count; + struct fid_mr *mr[RXR_IOV_LIMIT]; + void *mr_desc[RXR_IOV_LIMIT]; + + struct fi_rma_iov rma_iov[RXR_IOV_LIMIT]; + size_t rma_iov_count; + + size_t total_len; + size_t bytes_submitted; /* bytes fi_read() succeeded */ + size_t bytes_finished; /* bytes received completion */ + + struct dlist_entry pending_entry; +}; + +struct rxr_read_entry *rxr_read_alloc_entry(struct rxr_ep *ep, int entry_type, void *x_entry, + enum rxr_lower_ep_type lower_ep_type); + +void rxr_read_release_entry(struct rxr_ep *ep, struct rxr_read_entry *read_entry); + +/* used by read message protocol and read write protocol */ +int rxr_locate_iov_pos(struct iovec *iov, int iov_count, size_t offset, + int *iov_idx, size_t *iov_offset); + +int rxr_read_init_iov(struct rxr_ep *ep, + struct rxr_tx_entry *tx_entry, + struct fi_rma_iov *read_iov); + +int rxr_read_post(struct rxr_ep *ep, struct rxr_read_entry *read_entry); + +int rxr_read_post_remote_read_or_queue(struct rxr_ep *ep, int entry_type, void *x_entry); + +int rxr_read_post_local_read_or_queue(struct rxr_ep *ep, + struct rxr_rx_entry *rx_entry, + size_t data_offset, + struct rxr_pkt_entry *pkt_entry, + char *data, size_t data_size); + +void rxr_read_handle_read_completion(struct rxr_ep *ep, struct rxr_pkt_entry *pkt_entry); + +int rxr_read_handle_error(struct rxr_ep *ep, struct rxr_read_entry *read_entry, int ret); + +#endif + diff --git a/prov/efa/src/rxr/rxr_rma.c b/prov/efa/src/rxr/rxr_rma.c index d02032d5212..3cf9f6b433d 100644 --- a/prov/efa/src/rxr/rxr_rma.c +++ b/prov/efa/src/rxr/rxr_rma.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. * All rights reserved. 
* * This software is available to you under a choice of one of two @@ -35,27 +35,37 @@ #include #include #include +#include "efa.h" #include "rxr.h" #include "rxr_rma.h" +#include "rxr_pkt_cmd.h" +#include "rxr_cntr.h" +#include "rxr_read.h" int rxr_rma_verified_copy_iov(struct rxr_ep *ep, struct fi_rma_iov *rma, - size_t count, uint32_t flags, struct iovec *iov) + size_t count, uint32_t flags, + struct iovec *iov, void **desc) { - struct util_domain *util_domain; + void *context; + struct efa_mr *efa_mr; + struct efa_ep *efa_ep; int i, ret; - util_domain = &rxr_ep_domain(ep)->util_domain; + efa_ep = container_of(ep->rdm_ep, struct efa_ep, util_ep.ep_fid); for (i = 0; i < count; i++) { - ret = ofi_mr_verify(&util_domain->mr_map, - rma[i].len, - (uintptr_t *)(&rma[i].addr), - rma[i].key, - flags); + fastlock_acquire(&efa_ep->domain->util_domain.lock); + ret = ofi_mr_map_verify(&efa_ep->domain->util_domain.mr_map, + (uintptr_t *)(&rma[i].addr), + rma[i].len, rma[i].key, flags, + &context); + efa_mr = context; + desc[i] = fi_mr_desc(&efa_mr->mr_fid); + fastlock_release(&efa_ep->domain->util_domain.lock); if (ret) { FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, - "MR verification failed (%s)\n", - fi_strerror(-ret)); + "MR verification failed (%s), addr: %lx key: %ld\n", + fi_strerror(-ret), rma[i].addr, rma[i].key); return -FI_EACCES; } @@ -64,14 +74,15 @@ int rxr_rma_verified_copy_iov(struct rxr_ep *ep, struct fi_rma_iov *rma, } return 0; } - /* Upon receiving a read request, Remote EP call this function to create * a tx entry for sending data back. */ -struct rxr_tx_entry *rxr_readrsp_tx_entry_init(struct rxr_ep *rxr_ep, - struct rxr_rx_entry *rx_entry) +struct rxr_tx_entry * +rxr_rma_alloc_readrsp_tx_entry(struct rxr_ep *rxr_ep, + struct rxr_rx_entry *rx_entry) { struct rxr_tx_entry *tx_entry; + struct fi_msg msg; tx_entry = ofi_buf_alloc(rxr_ep->readrsp_tx_entry_pool); if (OFI_UNLIKELY(!tx_entry)) { @@ -84,13 +95,18 @@ struct rxr_tx_entry *rxr_readrsp_tx_entry_init(struct rxr_ep *rxr_ep, dlist_insert_tail(&tx_entry->tx_entry_entry, &rxr_ep->tx_entry_list); #endif + msg.msg_iov = rx_entry->iov; + msg.iov_count = rx_entry->iov_count; + msg.addr = rx_entry->addr; + msg.desc = rx_entry->desc; + msg.context = NULL; + msg.data = 0; + /* * this tx_entry works similar to a send tx_entry thus its op was * set to ofi_op_msg. 
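	 * The requested data is sent back to the read initiator the same way
	 * the payload of an ordinary outgoing message would be.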
Note this tx_entry will not write a completion */ - rxr_generic_tx_entry_init(rxr_ep, tx_entry, rx_entry->iov, - rx_entry->iov_count, NULL, 0, rx_entry->addr, - 0, 0, NULL, ofi_op_msg, 0); + rxr_tx_entry_init(rxr_ep, tx_entry, &msg, ofi_op_msg, 0); tx_entry->cq_entry.flags |= FI_READ; /* rma_loc_rx_id is for later retrieve of rx_entry @@ -102,40 +118,256 @@ struct rxr_tx_entry *rxr_readrsp_tx_entry_init(struct rxr_ep *rxr_ep, tx_entry->rx_id = rx_entry->rma_initiator_rx_id; tx_entry->window = rx_entry->window; - /* this tx_entry does not send rts + /* this tx_entry does not send request * therefore should not increase msg_id */ tx_entry->msg_id = 0; return tx_entry; } -ssize_t rxr_generic_rma(struct fid_ep *ep, - const struct iovec *iov, size_t iov_count, - const struct fi_rma_iov *rma_iov, size_t rma_iov_count, - fi_addr_t addr, uint64_t data, void *context, uint32_t op, - uint64_t flags) +struct rxr_tx_entry * +rxr_rma_alloc_tx_entry(struct rxr_ep *rxr_ep, + const struct fi_msg_rma *msg_rma, + uint32_t op, + uint64_t flags) { - assert(iov_count <= RXR_IOV_LIMIT && rma_iov_count <= RXR_IOV_LIMIT); - int tag = 0; // RMA is not tagged + struct rxr_tx_entry *tx_entry; + struct fi_msg msg; + + tx_entry = ofi_buf_alloc(rxr_ep->tx_entry_pool); + if (OFI_UNLIKELY(!tx_entry)) { + FI_WARN(&rxr_prov, FI_LOG_EP_CTRL, "TX entries exhausted.\n"); + return NULL; + } + + msg.addr = msg_rma->addr; + msg.msg_iov = msg_rma->msg_iov; + msg.context = msg_rma->context; + msg.iov_count = msg_rma->iov_count; + msg.data = msg_rma->data; + msg.desc = msg_rma->desc; + rxr_tx_entry_init(rxr_ep, tx_entry, &msg, op, flags); - return rxr_tx(ep, iov, iov_count, rma_iov, rma_iov_count, addr, - tag, data, context, op, flags); + assert(msg_rma->rma_iov_count > 0); + assert(msg_rma->rma_iov); + tx_entry->rma_iov_count = msg_rma->rma_iov_count; + memcpy(tx_entry->rma_iov, msg_rma->rma_iov, + sizeof(struct fi_rma_iov) * msg_rma->rma_iov_count); + +#if ENABLE_DEBUG + dlist_insert_tail(&tx_entry->tx_entry_entry, &rxr_ep->tx_entry_list); +#endif + return tx_entry; } -ssize_t rxr_read(struct fid_ep *ep, void *buf, size_t len, void *desc, - fi_addr_t src_addr, uint64_t addr, uint64_t key, - void *context) +size_t rxr_rma_post_shm_write(struct rxr_ep *rxr_ep, struct rxr_tx_entry *tx_entry) { - struct iovec iov; + struct rxr_pkt_entry *pkt_entry; + struct fi_msg_rma msg; + struct rxr_peer *peer; + int i, err; - iov.iov_base = (void *)buf; - iov.iov_len = len; - return rxr_readv(ep, &iov, &desc, 1, src_addr, addr, key, context); + assert(tx_entry->op == ofi_op_write); + peer = rxr_ep_get_peer(rxr_ep, tx_entry->addr); + pkt_entry = rxr_pkt_entry_alloc(rxr_ep, rxr_ep->tx_pkt_shm_pool); + if (OFI_UNLIKELY(!pkt_entry)) + return -FI_EAGAIN; + + rxr_pkt_init_write_context(tx_entry, pkt_entry); + + /* If no FI_MR_VIRT_ADDR being set, have to use 0-based offset */ + if (!(shm_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR)) { + for (i = 0; i < tx_entry->iov_count; i++) + tx_entry->rma_iov[i].addr = 0; + } + + msg.msg_iov = tx_entry->iov; + msg.iov_count = tx_entry->iov_count; + msg.addr = peer->shm_fiaddr; + msg.rma_iov = tx_entry->rma_iov; + msg.rma_iov_count = tx_entry->rma_iov_count; + msg.context = pkt_entry; + msg.data = tx_entry->cq_entry.data; + + err = fi_writemsg(rxr_ep->shm_ep, &msg, tx_entry->fi_flags); + if (err) + rxr_pkt_entry_release_tx(rxr_ep, pkt_entry); + + return err; +} + +/* rma_read functions */ +ssize_t rxr_rma_post_efa_emulated_read(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry) +{ + int err, window, credits; + 
struct rxr_peer *peer; + struct rxr_rx_entry *rx_entry; + struct fi_msg msg = {0}; + + /* create an rx_entry to receive data + * use ofi_op_msg for its op. + * it does not write an rx completion. + */ + msg.msg_iov = tx_entry->iov; + msg.iov_count = tx_entry->iov_count; + msg.addr = tx_entry->addr; + rx_entry = rxr_ep_get_rx_entry(ep, &msg, 0, ~0, ofi_op_msg, 0); + if (!rx_entry) { + FI_WARN(&rxr_prov, FI_LOG_CQ, + "RX entries exhausted for read.\n"); + rxr_ep_progress_internal(ep); + return -FI_EAGAIN; + } + + /* + * this rx_entry does not know its tx_id, because the remote + * tx_entry has not been created yet. + * set tx_id to -1, and the correct one will be filled in + * rxr_cq_handle_readrsp() + */ + assert(rx_entry); + rx_entry->tx_id = -1; + rx_entry->cq_entry.flags |= FI_READ; + rx_entry->total_len = rx_entry->cq_entry.len; + + /* + * there will not be a CTS for fi_read, so we calculate the CTS + * window here and send it via REQ. + * meanwhile, set rx_entry->state to RXR_RX_RECV so that + * this rx_entry is ready to receive. + */ + + /* But if there is no available buffer, we do not even proceed; + * calling rxr_ep_progress_internal() might release some buffers. + */ + if (ep->available_data_bufs == 0) { + rxr_release_rx_entry(ep, rx_entry); + rxr_ep_progress_internal(ep); + return -FI_EAGAIN; + } + + rx_entry->state = RXR_RX_RECV; + /* rma_loc_tx_id is used in rxr_cq_handle_rx_completion() + * to locate the tx_entry for tx completion. + */ + rx_entry->rma_loc_tx_id = tx_entry->tx_id; +#if ENABLE_DEBUG + dlist_insert_tail(&rx_entry->rx_pending_entry, + &ep->rx_pending_list); + ep->rx_pending++; +#endif + /* + * this tx_entry does not need a rx_id, because it does not + * send any data. + * the rma_loc_rx_id and rma_window will be sent to remote EP + * via REQ + */ + tx_entry->rma_loc_rx_id = rx_entry->rx_id; + + if (tx_entry->total_len < ep->mtu_size - sizeof(struct rxr_readrsp_hdr)) { + err = rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry, RXR_SHORT_RTR_PKT, 0); + } else { + peer = rxr_ep_get_peer(ep, tx_entry->addr); + + rxr_pkt_calc_cts_window_credits(ep, peer, + tx_entry->total_len, + tx_entry->credit_request, + &window, + &credits); + + rx_entry->window = window; + rx_entry->credit_cts = credits; + tx_entry->rma_window = rx_entry->window; + err = rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry, RXR_LONG_RTR_PKT, 0); + } + + if (OFI_UNLIKELY(err)) { +#if ENABLE_DEBUG + dlist_remove(&rx_entry->rx_pending_entry); + ep->rx_pending--; +#endif + rxr_release_rx_entry(ep, rx_entry); + } + + return err; } -ssize_t rxr_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, - size_t iov_count, fi_addr_t src_addr, uint64_t addr, - uint64_t key, void *context) +ssize_t rxr_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags) +{ + ssize_t err; + struct rxr_ep *rxr_ep; + struct rxr_peer *peer; + struct rxr_tx_entry *tx_entry = NULL; + bool use_lower_ep_read; + + FI_DBG(&rxr_prov, FI_LOG_EP_DATA, + "read iov_len: %lu flags: %lx\n", + ofi_total_iov_len(msg->msg_iov, msg->iov_count), + flags); + + rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); + assert(msg->iov_count <= rxr_ep->tx_iov_limit); + + rxr_perfset_start(rxr_ep, perf_rxr_tx); + fastlock_acquire(&rxr_ep->util_ep.lock); + + if (OFI_UNLIKELY(is_tx_res_full(rxr_ep))) { + err = -FI_EAGAIN; + goto out; + } + + peer = rxr_ep_get_peer(rxr_ep, msg->addr); + + if (peer->flags & RXR_PEER_IN_BACKOFF) { + err = -FI_EAGAIN; + goto out; + } + + tx_entry = rxr_rma_alloc_tx_entry(rxr_ep, msg, ofi_op_read_req, flags); + if
(OFI_UNLIKELY(!tx_entry)) { + rxr_ep_progress_internal(rxr_ep); + err = -FI_EAGAIN; + goto out; + } + + use_lower_ep_read = false; + if (peer->is_local) { + assert(rxr_ep->use_shm); + use_lower_ep_read = true; + } else if (efa_both_support_rdma_read(rxr_ep, peer)) { + /* efa_both_support_rdma_read also checks rxr_env.use_device_rdma, + * so we do not check it here + */ + use_lower_ep_read = true; + } + + if (use_lower_ep_read) { + err = rxr_read_post_remote_read_or_queue(rxr_ep, RXR_TX_ENTRY, tx_entry); + if (OFI_UNLIKELY(err == -FI_ENOBUFS)) { + err = -FI_EAGAIN; + rxr_ep_progress_internal(rxr_ep); + goto out; + } + } else { + err = rxr_ep_set_tx_credit_request(rxr_ep, tx_entry); + if (OFI_UNLIKELY(err)) + goto out; + + err = rxr_rma_post_efa_emulated_read(rxr_ep, tx_entry); + } + +out: + if (OFI_UNLIKELY(err && tx_entry)) + rxr_release_tx_entry(rxr_ep, tx_entry); + + fastlock_release(&rxr_ep->util_ep.lock); + rxr_perfset_end(rxr_ep, perf_rxr_tx); + return err; +} + +ssize_t rxr_rma_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, + size_t iov_count, fi_addr_t src_addr, uint64_t addr, + uint64_t key, void *context) { struct fi_rma_iov rma_iov; struct fi_msg_rma msg; @@ -153,31 +385,151 @@ ssize_t rxr_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, msg.rma_iov = &rma_iov; msg.rma_iov_count = 1; - return rxr_readmsg(ep, &msg, 0); -} - -ssize_t rxr_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags) -{ - return rxr_generic_rma(ep, msg->msg_iov, msg->iov_count, - msg->rma_iov, msg->rma_iov_count, - msg->addr, msg->data, msg->context, - ofi_op_read_req, flags); + return rxr_rma_readmsg(ep, &msg, 0); } -ssize_t rxr_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, - fi_addr_t dest_addr, uint64_t addr, uint64_t key, - void *context) +ssize_t rxr_rma_read(struct fid_ep *ep, void *buf, size_t len, void *desc, + fi_addr_t src_addr, uint64_t addr, uint64_t key, + void *context) { struct iovec iov; iov.iov_base = (void *)buf; iov.iov_len = len; - return rxr_writev(ep, &iov, &desc, 1, dest_addr, addr, key, context); + return rxr_rma_readv(ep, &iov, &desc, 1, src_addr, addr, key, context); +} + +/* rma_write functions */ +ssize_t rxr_rma_post_write(struct rxr_ep *ep, struct rxr_tx_entry *tx_entry) +{ + ssize_t err; + struct rxr_peer *peer; + struct efa_domain *efa_domain; + bool delivery_complete_requested; + int ctrl_type; + size_t max_rtm_data_size; + struct rxr_domain *rxr_domain = rxr_ep_domain(ep); + + efa_domain = container_of(rxr_domain->rdm_domain, struct efa_domain, + util_domain.domain_fid); + + peer = rxr_ep_get_peer(ep, tx_entry->addr); + + if (peer->is_local) + return rxr_rma_post_shm_write(ep, tx_entry); + + delivery_complete_requested = tx_entry->fi_flags & FI_DELIVERY_COMPLETE; + if (delivery_complete_requested) { + tx_entry->rxr_flags |= RXR_DELIVERY_COMPLETE_REQUESTED; + /* + * Because delivery complete is defined as an extra + * feature, the receiver might not support it. + * + * The sender cannot send with FI_DELIVERY_COMPLETE + * if the peer is not able to handle it. + * + * If the sender does not know whether the peer + * can handle it, it needs to trigger + * a handshake packet from the peer. + * + * The handshake packet contains + * the information on whether the peer + * supports it or not.
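+		 *
+		 * Concretely: rxr_pkt_trigger_handshake() below solicits that
+		 * handshake, and until RXR_PEER_HANDSHAKE_RECEIVED is set this
+		 * function returns -FI_EAGAIN so the caller retries the write.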
+ */ + err = rxr_pkt_trigger_handshake(ep, tx_entry->addr, peer); + if (OFI_UNLIKELY(err)) + return err; + + if (!(peer->flags & RXR_PEER_HANDSHAKE_RECEIVED)) + return -FI_EAGAIN; + else if (!rxr_peer_support_delivery_complete(peer)) + return -FI_EOPNOTSUPP; + + max_rtm_data_size = rxr_pkt_req_max_data_size(ep, + tx_entry->addr, + RXR_DC_EAGER_RTW_PKT); + } else { + max_rtm_data_size = rxr_pkt_req_max_data_size(ep, + tx_entry->addr, + RXR_EAGER_RTW_PKT); + } + + /* Inter instance */ + if (tx_entry->total_len < max_rtm_data_size) { + ctrl_type = delivery_complete_requested ? + RXR_DC_EAGER_RTW_PKT : RXR_EAGER_RTW_PKT; + return rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry, ctrl_type, 0); + } + + if (tx_entry->total_len >= rxr_env.efa_min_read_write_size && + efa_both_support_rdma_read(ep, peer) && + (tx_entry->desc[0] || efa_is_cache_available(efa_domain))) { + err = rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry, RXR_READ_RTW_PKT, 0); + if (err != -FI_ENOMEM) + return err; + /* + * If read write protocol failed due to memory registration, fall back to use long + * message protocol + */ + } + + err = rxr_ep_set_tx_credit_request(ep, tx_entry); + if (OFI_UNLIKELY(err)) + return err; + + ctrl_type = delivery_complete_requested ? + RXR_DC_LONG_RTW_PKT : RXR_LONG_RTW_PKT; + tx_entry->rxr_flags |= RXR_LONGCTS_PROTOCOL; + return rxr_pkt_post_ctrl(ep, RXR_TX_ENTRY, tx_entry, ctrl_type, 0); } -ssize_t rxr_writev(struct fid_ep *ep, const struct iovec *iov, void **desc, - size_t iov_count, fi_addr_t dest_addr, uint64_t addr, - uint64_t key, void *context) +ssize_t rxr_rma_writemsg(struct fid_ep *ep, + const struct fi_msg_rma *msg, + uint64_t flags) +{ + ssize_t err; + struct rxr_peer *peer; + struct rxr_ep *rxr_ep; + struct rxr_tx_entry *tx_entry; + + FI_DBG(&rxr_prov, FI_LOG_EP_DATA, + "write iov_len %lu flags: %lx\n", + ofi_total_iov_len(msg->msg_iov, msg->iov_count), + flags); + + rxr_ep = container_of(ep, struct rxr_ep, util_ep.ep_fid.fid); + assert(msg->iov_count <= rxr_ep->tx_iov_limit); + + rxr_perfset_start(rxr_ep, perf_rxr_tx); + fastlock_acquire(&rxr_ep->util_ep.lock); + + peer = rxr_ep_get_peer(rxr_ep, msg->addr); + + if (peer->flags & RXR_PEER_IN_BACKOFF) { + err = -FI_EAGAIN; + goto out; + } + + tx_entry = rxr_rma_alloc_tx_entry(rxr_ep, msg, ofi_op_write, flags); + if (OFI_UNLIKELY(!tx_entry)) { + rxr_ep_progress_internal(rxr_ep); + err = -FI_EAGAIN; + goto out; + } + + err = rxr_rma_post_write(rxr_ep, tx_entry); + if (OFI_UNLIKELY(err)) { + rxr_release_tx_entry(rxr_ep, tx_entry); + } +out: + fastlock_release(&rxr_ep->util_ep.lock); + rxr_perfset_end(rxr_ep, perf_rxr_tx); + return err; +} + +ssize_t rxr_rma_writev(struct fid_ep *ep, const struct iovec *iov, void **desc, + size_t iov_count, fi_addr_t dest_addr, uint64_t addr, + uint64_t key, void *context) { struct fi_rma_iov rma_iov; struct fi_msg_rma msg; @@ -195,32 +547,23 @@ ssize_t rxr_writev(struct fid_ep *ep, const struct iovec *iov, void **desc, msg.rma_iov = &rma_iov; msg.rma_iov_count = 1; - return rxr_writemsg(ep, &msg, 0); + return rxr_rma_writemsg(ep, &msg, 0); } -ssize_t rxr_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg, - uint64_t flags) +ssize_t rxr_rma_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, + fi_addr_t dest_addr, uint64_t addr, uint64_t key, + void *context) { - ssize_t ret = 0; - - if (msg->data == 0) { - ret = rxr_generic_rma(ep, msg->msg_iov, msg->iov_count, - msg->rma_iov, msg->rma_iov_count, - msg->addr, 0, NULL, ofi_op_write, 0); - } else { - ret = rxr_generic_rma(ep, msg->msg_iov, 
msg->iov_count, - msg->rma_iov, msg->rma_iov_count, - msg->addr, msg->data, - msg->context, ofi_op_write, - FI_REMOTE_CQ_DATA); - } + struct iovec iov; - return ret; + iov.iov_base = (void *)buf; + iov.iov_len = len; + return rxr_rma_writev(ep, &iov, &desc, 1, dest_addr, addr, key, context); } -ssize_t rxr_writedata(struct fid_ep *ep, const void *buf, size_t len, - void *desc, uint64_t data, fi_addr_t dest_addr, - uint64_t addr, uint64_t key, void *context) +ssize_t rxr_rma_writedata(struct fid_ep *ep, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context) { struct iovec iov; struct fi_rma_iov rma_iov; @@ -234,7 +577,7 @@ ssize_t rxr_writedata(struct fid_ep *ep, const void *buf, size_t len, memset(&msg, 0, sizeof(msg)); msg.msg_iov = &iov; - msg.desc = desc; + msg.desc = &desc; msg.iov_count = 1; msg.addr = dest_addr; msg.context = context; @@ -242,12 +585,13 @@ ssize_t rxr_writedata(struct fid_ep *ep, const void *buf, size_t len, msg.rma_iov_count = 1; msg.data = data; - return rxr_writemsg(ep, &msg, 0); + return rxr_rma_writemsg(ep, &msg, FI_REMOTE_CQ_DATA); } -ssize_t rxr_inject(struct fid_ep *ep, const void *buf, size_t len, - fi_addr_t dest_addr, uint64_t addr, uint64_t key) +ssize_t rxr_rma_inject_write(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t addr, uint64_t key) { + struct fi_msg_rma msg; struct iovec iov; struct fi_rma_iov rma_iov; @@ -256,15 +600,22 @@ ssize_t rxr_inject(struct fid_ep *ep, const void *buf, size_t len, rma_iov.addr = addr; rma_iov.len = len; rma_iov.key = key; - return rxr_generic_rma(ep, &iov, 1, &rma_iov, 1, dest_addr, - 0, NULL, ofi_op_write, FI_INJECT | - RXR_NO_COMPLETION); + + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &iov; + msg.iov_count = 1; + msg.rma_iov = &rma_iov; + msg.rma_iov_count = 1; + msg.addr = dest_addr; + + return rxr_rma_writemsg(ep, &msg, FI_INJECT | RXR_NO_COMPLETION); } -ssize_t rxr_inject_data(struct fid_ep *ep, const void *buf, size_t len, - uint64_t data, fi_addr_t dest_addr, uint64_t addr, - uint64_t key) +ssize_t rxr_rma_inject_writedata(struct fid_ep *ep, const void *buf, size_t len, + uint64_t data, fi_addr_t dest_addr, uint64_t addr, + uint64_t key) { + struct fi_msg_rma msg; struct iovec iov; struct fi_rma_iov rma_iov; @@ -273,20 +624,29 @@ ssize_t rxr_inject_data(struct fid_ep *ep, const void *buf, size_t len, rma_iov.addr = addr; rma_iov.len = len; rma_iov.key = key; - return rxr_generic_rma(ep, &iov, 1, &rma_iov, 1, dest_addr, - data, NULL, ofi_op_write, FI_INJECT | - RXR_NO_COMPLETION | FI_REMOTE_CQ_DATA); + + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = &iov; + msg.iov_count = 1; + msg.rma_iov = &rma_iov; + msg.rma_iov_count = 1; + msg.addr = dest_addr; + msg.data = data; + + return rxr_rma_writemsg(ep, &msg, FI_INJECT | RXR_NO_COMPLETION | + FI_REMOTE_CQ_DATA); } struct fi_ops_rma rxr_ops_rma = { .size = sizeof(struct fi_ops_rma), - .read = rxr_read, - .readv = rxr_readv, - .readmsg = rxr_readmsg, - .write = rxr_write, - .writev = rxr_writev, - .writemsg = rxr_writemsg, - .inject = rxr_inject, - .writedata = rxr_writedata, - .injectdata = rxr_inject_data, + .read = rxr_rma_read, + .readv = rxr_rma_readv, + .readmsg = rxr_rma_readmsg, + .write = rxr_rma_write, + .writev = rxr_rma_writev, + .writemsg = rxr_rma_writemsg, + .inject = rxr_rma_inject_write, + .writedata = rxr_rma_writedata, + .injectdata = rxr_rma_inject_writedata, }; + diff --git a/prov/efa/src/rxr/rxr_rma.h b/prov/efa/src/rxr/rxr_rma.h index 
c33782042e0..3514f0c9e62 100644 --- a/prov/efa/src/rxr/rxr_rma.h +++ b/prov/efa/src/rxr/rxr_rma.h @@ -40,42 +40,13 @@ #include int rxr_rma_verified_copy_iov(struct rxr_ep *ep, struct fi_rma_iov *rma, - size_t count, uint32_t flags, struct iovec *iov); + size_t count, uint32_t flags, + struct iovec *iov, void **desc); -struct rxr_tx_entry *rxr_readrsp_tx_entry_init(struct rxr_ep *rxr_ep, - struct rxr_rx_entry *rx_entry); - -ssize_t rxr_read(struct fid_ep *ep, void *buf, size_t len, void *desc, - fi_addr_t src_addr, uint64_t addr, uint64_t key, - void *context); - -ssize_t rxr_readv(struct fid_ep *ep, const struct iovec *iov, void **desc, - size_t iov_count, fi_addr_t src_addr, uint64_t addr, - uint64_t key, void *context); - -ssize_t rxr_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, uint64_t flags); - -ssize_t rxr_write(struct fid_ep *ep, const void *buf, size_t len, void *desc, - fi_addr_t dest_addr, uint64_t addr, uint64_t key, - void *context); - -ssize_t rxr_writev(struct fid_ep *ep, const struct iovec *iov, void **desc, - size_t iov_count, fi_addr_t dest_addr, uint64_t addr, - uint64_t key, void *context); - -ssize_t rxr_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg, - uint64_t flags); - -ssize_t rxr_writedata(struct fid_ep *ep, const void *buf, size_t len, - void *desc, uint64_t data, fi_addr_t dest_addr, - uint64_t addr, uint64_t key, void *context); - -ssize_t rxr_inject(struct fid_ep *ep, const void *buf, size_t len, - fi_addr_t dest_addr, uint64_t addr, uint64_t key); - -ssize_t rxr_inject_data(struct fid_ep *ep, const void *buf, size_t len, - uint64_t data, fi_addr_t dest_addr, uint64_t addr, - uint64_t key); +/* read response related functions */ +struct rxr_tx_entry * +rxr_rma_alloc_readrsp_tx_entry(struct rxr_ep *rxr_ep, + struct rxr_rx_entry *rx_entry); extern struct fi_ops_rma rxr_ops_rma; diff --git a/prov/gni/include/gnix.h b/prov/gni/include/gnix.h index 326dbe3160a..50fc5a486a3 100644 --- a/prov/gni/include/gnix.h +++ b/prov/gni/include/gnix.h @@ -3,6 +3,7 @@ * Copyright (c) 2015-2018 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2015-2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2020 Triad National Security, LLC. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -178,7 +179,7 @@ /* No overhead secondary capabilities. These can be silently enabled by the * provider. */ -#define GNIX_EP_SEC_CAPS (FI_MULTI_RECV | FI_TRIGGER | FI_FENCE) +#define GNIX_EP_SEC_CAPS (FI_MULTI_RECV | FI_TRIGGER | FI_FENCE | FI_LOCAL_COMM | FI_REMOTE_COMM) /* Secondary capabilities that introduce overhead. Must be requested. */ #define GNIX_EP_SEC_CAPS_OH (FI_SOURCE | FI_RMA_EVENT | FI_SOURCE_ERR) diff --git a/prov/gni/include/gnix_cm.h b/prov/gni/include/gnix_cm.h index 5344caa0718..81fdea134af 100644 --- a/prov/gni/include/gnix_cm.h +++ b/prov/gni/include/gnix_cm.h @@ -1,7 +1,8 @@ /* * Copyright (c) 2016 Cray Inc. All rights reserved. * Copyright (c) 2017 Los Alamos National Security, LLC. All rights reserved. - * Copyright (c) 2019 Triad National Security, LLC. All rights reserved. + * Copyright (c) 2019-2020 Triad National Security, LLC. + * All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -117,7 +118,7 @@ _gnix_resolve_gni_ep_name(const char *ep_name, int idx, int ret = FI_SUCCESS; static size_t addr_size = sizeof(struct gnix_ep_name); - GNIX_TRACE(FI_LOG_TRACE, "\n"); + GNIX_TRACE(FI_LOG_EP_CTRL, "\n"); /*TODO (optimization): Just return offset into ep_name */ memcpy(addr, &ep_name[addr_size * idx], addr_size); @@ -138,7 +139,7 @@ _gnix_resolve_str_ep_name(const char *ep_name, int idx, int ret = FI_SUCCESS; static size_t addr_size = GNIX_FI_ADDR_STR_LEN; - GNIX_TRACE(FI_LOG_TRACE, "\n"); + GNIX_TRACE(FI_LOG_EP_CTRL, "\n"); ret = _gnix_ep_name_from_str(&ep_name[addr_size * idx], addr); return ret; diff --git a/prov/gni/include/gnix_freelist.h b/prov/gni/include/gnix_freelist.h index 7a2fbb8aaa1..dea00fd253d 100644 --- a/prov/gni/include/gnix_freelist.h +++ b/prov/gni/include/gnix_freelist.h @@ -1,6 +1,7 @@ /* * Copyright (c) 2015-2016 Cray Inc. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights reserved. + * Copyright (c) 2020 Triad National Security, LLC. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -128,7 +129,7 @@ static inline int _gnix_fl_alloc(struct dlist_entry **e, struct gnix_freelist *f if (fl->refill_size == 0) { ret = -FI_ECANCELED; - GNIX_DEBUG(FI_LOG_DEBUG, "Freelist not growable (refill " + GNIX_DEBUG(FI_LOG_EP_CTRL, "Freelist not growable (refill " "size is 0\n"); goto err; diff --git a/prov/gni/src/gnix_av.c b/prov/gni/src/gnix_av.c index 749c7537b96..3aeca6624c2 100644 --- a/prov/gni/src/gnix_av.c +++ b/prov/gni/src/gnix_av.c @@ -2,7 +2,7 @@ * Copyright (c) 2015-2017 Cray Inc. All rights reserved. * Copyright (c) 2015-2017 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2019 Triad National Security, LLC. + * Copyright (c) 2019-2020 Triad National Security, LLC. * All rights reserved. * * This software is available to you under a choice of one of two @@ -361,7 +361,7 @@ static int map_insert(struct gnix_fid_av *av_priv, const void *addr, ret_cnt = -FI_EINVAL; continue; } - GNIX_DEBUG(FI_LOG_DEBUG, "ep_name doesn't fit " + GNIX_DEBUG(FI_LOG_AV, "ep_name doesn't fit " "into the av context bits\n"); return -FI_EINVAL; /* TODO: should try to do cleanup */ @@ -745,7 +745,7 @@ DIRECT_FN const char *gnix_av_straddr(struct fid_av *av, struct gnix_fid_av *av_priv; if (!av || !addr || !buf || !len) { - GNIX_DEBUG(FI_LOG_DEBUG, "NULL parameter in gnix_av_straddr\n"); + GNIX_DEBUG(FI_LOG_AV, "NULL parameter in gnix_av_straddr\n"); return NULL; } diff --git a/prov/gni/src/gnix_cm.c b/prov/gni/src/gnix_cm.c index f30aa0cd4c7..f3c301e77ce 100644 --- a/prov/gni/src/gnix_cm.c +++ b/prov/gni/src/gnix_cm.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2015-2017 Cray Inc. All rights reserved. * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved. + * Copyright (c) 2020 Triad National Security, LLC. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -55,7 +56,7 @@ int _gnix_ep_name_to_str(struct gnix_ep_name *ep_name, char **out_buf) char *str; size_t len = GNIX_FI_ADDR_STR_LEN; - GNIX_TRACE(FI_LOG_TRACE, "\n"); + GNIX_TRACE(FI_LOG_EP_CTRL, "\n"); if (*out_buf == NULL) { str = calloc(len, sizeof(char)); @@ -90,10 +91,10 @@ int _gnix_ep_name_from_str(const char *addr, long tok_val; char *dup_addr; - GNIX_TRACE(FI_LOG_TRACE, "\n"); + GNIX_TRACE(FI_LOG_EP_CTRL, "\n"); if (!addr || !resolved_addr) { - GNIX_WARN(FI_LOG_WARN, "NULL parameter in " + GNIX_WARN(FI_LOG_EP_CTRL, "NULL parameter in " "__gnix_resolved_name_from_str"); return -FI_EINVAL; } @@ -105,34 +106,34 @@ int _gnix_ep_name_from_str(const char *addr, tok = strtok(dup_addr, ";"); if (!tok) { - GNIX_WARN(FI_LOG_WARN, "Invalid address.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n"); return -FI_EINVAL; } ret = memcmp(tok, "gni", 3); if (ret) { - GNIX_WARN(FI_LOG_WARN, "Invalid address.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n"); free(dup_addr); return -FI_EINVAL; } tok = strtok(NULL, ";");/*node*/ if (!tok) { - GNIX_WARN(FI_LOG_WARN, "Invalid address.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n"); free(dup_addr); return -FI_EINVAL; } tok = strtok(NULL, ";");/*service*/ if (!tok) { - GNIX_WARN(FI_LOG_WARN, "Invalid address.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n"); free(dup_addr); return -FI_EINVAL; } tok = strtok(NULL, ";");/*GNIX_AV_STR_ADDR_VERSION*/ if (!tok) { - GNIX_WARN(FI_LOG_WARN, "Invalid address.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n"); free(dup_addr); return -FI_EINVAL; } @@ -140,13 +141,13 @@ int _gnix_ep_name_from_str(const char *addr, /*device_addr*/ tok = strtok(NULL, ";"); if (!tok) { - GNIX_WARN(FI_LOG_WARN, "Invalid address.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n"); free(dup_addr); return -FI_EINVAL; } tok_val = strtol(tok, &endptr, 16); if (*endptr) { - GNIX_WARN(FI_LOG_WARN, "Invalid device_addr.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid device_addr.\n"); free(dup_addr); return -FI_EINVAL; } @@ -155,13 +156,13 @@ int _gnix_ep_name_from_str(const char *addr, /*cdm_id*/ tok = strtok(NULL, ";"); if (!tok) { - GNIX_WARN(FI_LOG_WARN, "Invalid address.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n"); free(dup_addr); return -FI_EINVAL; } tok_val = strtol(tok, &endptr, 16); if (*endptr) { - GNIX_WARN(FI_LOG_WARN, "Invalid cdm_id.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid cdm_id.\n"); free(dup_addr); return -FI_EINVAL; } @@ -170,13 +171,13 @@ int _gnix_ep_name_from_str(const char *addr, /*name_type*/ tok = strtok(NULL, ";"); if (!tok) { - GNIX_WARN(FI_LOG_WARN, "Invalid address.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n"); free(dup_addr); return -FI_EINVAL; } tok_val = strtol(tok, &endptr, 10); if (*endptr) { - GNIX_WARN(FI_LOG_WARN, "Invalid name_type.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid name_type.\n"); free(dup_addr); return -FI_EINVAL; } @@ -185,13 +186,13 @@ int _gnix_ep_name_from_str(const char *addr, /*cm_nic_cdm_id*/ tok = strtok(NULL, ";"); if (!tok) { - GNIX_WARN(FI_LOG_WARN, "Invalid address.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n"); free(dup_addr); return -FI_EINVAL; } tok_val = strtol(tok, &endptr, 16); if (*endptr) { - GNIX_WARN(FI_LOG_WARN, "Invalid cm_nic_cdm_id.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid cm_nic_cdm_id.\n"); free(dup_addr); return -FI_EINVAL; } @@ -200,13 +201,13 @@ int _gnix_ep_name_from_str(const char *addr, /*cookie*/ tok = strtok(NULL, ";"); if (!tok) { - 
GNIX_WARN(FI_LOG_WARN, "Invalid address.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n"); free(dup_addr); return -FI_EINVAL; } tok_val = strtol(tok, &endptr, 16); if (*endptr) { - GNIX_WARN(FI_LOG_WARN, "Invalid cookie.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid cookie.\n"); free(dup_addr); return -FI_EINVAL; } @@ -215,13 +216,13 @@ int _gnix_ep_name_from_str(const char *addr, /*rx_ctx_cnt*/ tok = strtok(NULL, ";"); if (!tok) { - GNIX_WARN(FI_LOG_WARN, "Invalid address.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid address.\n"); free(dup_addr); return -FI_EINVAL; } tok_val = strtol(tok, &endptr, 10); if (*endptr) { - GNIX_WARN(FI_LOG_WARN, "Invalid rx_ctx_cnt.\n"); + GNIX_WARN(FI_LOG_EP_CTRL, "Invalid rx_ctx_cnt.\n"); free(dup_addr); return -FI_EINVAL; } diff --git a/prov/gni/src/gnix_ep.c b/prov/gni/src/gnix_ep.c index cd09fc95ee0..c5e171571cd 100644 --- a/prov/gni/src/gnix_ep.c +++ b/prov/gni/src/gnix_ep.c @@ -2,7 +2,8 @@ * Copyright (c) 2015-2019 Cray Inc. All rights reserved. * Copyright (c) 2015-2018 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2019 Triad National Security, LLC. All rights reserved. + * Copyright (c) 2019-2020 Triad National Security, LLC. + * All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -2350,7 +2351,7 @@ DIRECT_FN int gnix_ep_open(struct fid_domain *domain, struct fi_info *info, ep_priv->info = fi_dupinfo(info); ep_priv->info->addr_format = info->addr_format; - GNIX_DEBUG(FI_LOG_DEBUG, "ep(%p) is using addr_format(%s)\n", ep_priv, + GNIX_DEBUG(FI_LOG_EP_CTRL, "ep(%p) is using addr_format(%s)\n", ep_priv, ep_priv->info->addr_format == FI_ADDR_STR ? "FI_ADDR_STR" : "FI_ADDR_GNI"); diff --git a/prov/gni/src/gnix_fabric.c b/prov/gni/src/gnix_fabric.c index 93304132c5c..b04346bef3f 100644 --- a/prov/gni/src/gnix_fabric.c +++ b/prov/gni/src/gnix_fabric.c @@ -724,7 +724,7 @@ static void gnix_fini(void) struct fi_provider gnix_prov = { .name = gnix_prov_name, .version = FI_VERSION(GNI_MAJOR_VERSION, GNI_MINOR_VERSION), - .fi_version = FI_VERSION(1, 8), + .fi_version = OFI_VERSION_LATEST, .getinfo = gnix_getinfo, .fabric = gnix_fabric_open, .cleanup = gnix_fini diff --git a/prov/gni/src/gnix_mbox_allocator.c b/prov/gni/src/gnix_mbox_allocator.c index d398e0055f9..fb8bd9435e0 100644 --- a/prov/gni/src/gnix_mbox_allocator.c +++ b/prov/gni/src/gnix_mbox_allocator.c @@ -125,6 +125,7 @@ static int __generate_file_name(size_t page_size, char **filename) int my_file_id; int size; int ret; + int file_name_size; if (!filename) { GNIX_WARN(FI_LOG_EP_CTRL, "filename pointer is NULL.\n"); @@ -151,7 +152,8 @@ static int __generate_file_name(size_t page_size, char **filename) goto err_snprintf; } - full_filename = malloc(size + 1); + file_name_size = size + 1; + full_filename = malloc(file_name_size); if (!full_filename) { error = strerror_r(errno, error_buf, sizeof(error_buf)); GNIX_WARN(FI_LOG_EP_CTRL, @@ -161,8 +163,8 @@ static int __generate_file_name(size_t page_size, char **filename) goto err_snprintf; } - sprintf(full_filename, "%s/%s.%d.%d", huge_page, basename, getpid(), - my_file_id); + snprintf(full_filename, file_name_size, "%s/%s.%d.%d", huge_page, basename, + getpid(), my_file_id); GNIX_DEBUG(FI_LOG_EP_CTRL, "Generated filename: %s\n", full_filename); diff --git a/prov/gni/src/gnix_msg.c b/prov/gni/src/gnix_msg.c index c835d2c577a..4b24d8702fb 100644 --- a/prov/gni/src/gnix_msg.c +++ b/prov/gni/src/gnix_msg.c @@ -2,7 +2,8 @@ * 
Copyright (c) 2015-2019 Cray Inc. All rights reserved. * Copyright (c) 2015-2018 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2019 Triad National Security, LLC. All rights reserved. + * Copyright (c) 2019-2020 Triad National Security, LLC. + * All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -407,11 +408,11 @@ static int __recv_completion_src( char *buffer; size_t buf_len; - GNIX_DBG_TRACE(FI_LOG_TRACE, "\n"); + GNIX_DBG_TRACE(FI_LOG_EP_DATA, "\n"); if ((req->msg.recv_flags & FI_COMPLETION) && ep->recv_cq) { if ((src_addr == FI_ADDR_NOTAVAIL) && - (req->msg.recv_flags & FI_SOURCE_ERR) != 0) { + (ep->caps & FI_SOURCE_ERR) != 0) { if (ep->domain->addr_format == FI_ADDR_STR) { buffer = malloc(GNIX_FI_ADDR_STR_LEN); rc = _gnix_ep_name_to_str(req->vc->gnix_ep_name, (char **)&buffer); @@ -2012,7 +2013,6 @@ static int __smsg_eager_msg_w_data(void *data, void *msg) struct gnix_tag_storage *unexp_queue; struct gnix_tag_storage *posted_queue; int tagged; - bool multi_recv = false; GNIX_DBG_TRACE(FI_LOG_EP_DATA, "\n"); @@ -2033,7 +2033,6 @@ static int __smsg_eager_msg_w_data(void *data, void *msg) if (req == NULL) { return -FI_ENOMEM; } - multi_recv = true; } req->addr = vc->peer_addr; @@ -2057,14 +2056,10 @@ static int __smsg_eager_msg_w_data(void *data, void *msg) GNIX_DEBUG(FI_LOG_EP_DATA, "Freeing req: %p\n", req); /* - * Dequeue and free the request if not - * matching a FI_MULTI_RECV buffer. + * Dequeue and free the request. */ - if (multi_recv == false) { - _gnix_remove_tag(posted_queue, req); - _gnix_fr_free(ep, req); - } - + _gnix_remove_tag(posted_queue, req); + _gnix_fr_free(ep, req); } else { /* Add new unexpected receive request. */ req = _gnix_fr_alloc(ep); @@ -2178,7 +2173,6 @@ static int __smsg_rndzv_start(void *data, void *msg) struct gnix_tag_storage *unexp_queue; struct gnix_tag_storage *posted_queue; int tagged; - bool multi_recv = false; GNIX_DBG_TRACE(FI_LOG_EP_DATA, "\n"); @@ -2198,7 +2192,6 @@ static int __smsg_rndzv_start(void *data, void *msg) if (req == NULL) { return -FI_ENOMEM; } - multi_recv = true; } req->addr = vc->peer_addr; @@ -2246,8 +2239,7 @@ static int __smsg_rndzv_start(void *data, void *msg) req, req->msg.recv_info[0].recv_addr, req->msg.send_info[0].send_len); - if (multi_recv == false) - _gnix_remove_tag(posted_queue, req); + _gnix_remove_tag(posted_queue, req); /* Queue request to initiate pull of source data. */ ret = _gnix_vc_queue_work_req(req); diff --git a/prov/gni/test/api.c b/prov/gni/test/api.c index c61304df8c6..5808e069383 100644 --- a/prov/gni/test/api.c +++ b/prov/gni/test/api.c @@ -2,6 +2,7 @@ * Copyright (c) 2015-2017 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2015-2017 Cray Inc. All rights reserved. + * Copyright (c) 2020 Triad National Security, LLC. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -70,24 +71,23 @@ static uint64_t mode_bits = ~FI_NOTIFY_FLAGS_ONLY; static struct fid_fabric *fab; static struct fid_domain *dom[NUMEPS]; -struct fi_gni_ops_domain *gni_domain_ops[NUMEPS]; +static struct fi_gni_ops_domain *gni_domain_ops[NUMEPS]; static struct fid_ep *ep[NUMEPS]; static struct fid_av *av[NUMEPS]; -void *ep_name[NUMEPS]; -fi_addr_t gni_addr[NUMEPS]; +static void *ep_name[NUMEPS]; +static fi_addr_t gni_addr[NUMEPS]; static struct fid_cq *msg_cq[NUMEPS]; static struct fi_info *fi[NUMEPS]; static struct fi_cq_attr cq_attr; -const char *api_cdm_id[NUMEPS] = { "5000", "5001" }; -struct fi_info *hints[NUMEPS]; +static struct fi_info *hints[NUMEPS]; #define BUF_SZ (1<<20) -char *target, *target_base; -char *source, *source_base; -char *uc_target; -char *uc_source; -struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS]; -uint64_t mr_key[NUMEPS]; +static char *target, *target_base; +static char *source, *source_base; +static char *uc_target; +static char *uc_source; +static struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS]; +static uint64_t mr_key[NUMEPS]; static struct fid_cntr *send_cntr[NUMEPS], *recv_cntr[NUMEPS]; static struct fi_cntr_attr cntr_attr = { diff --git a/prov/gni/test/api_cntr.c b/prov/gni/test/api_cntr.c index 3ed370a56c9..cf3a878cc49 100644 --- a/prov/gni/test/api_cntr.c +++ b/prov/gni/test/api_cntr.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2016-2017 Cray Inc. All rights reserved. + * Copyright (c) 2020 Triad National Security, LLC. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -67,22 +68,22 @@ static uint64_t mode_bits = ~FI_NOTIFY_FLAGS_ONLY; static struct fid_fabric *fab; static struct fid_domain *dom[NUMEPS]; -struct fi_gni_ops_domain *gni_domain_ops[NUMEPS]; +static struct fi_gni_ops_domain *gni_domain_ops[NUMEPS]; static struct fid_ep *ep[NUMEPS]; static struct fid_av *av[NUMEPS]; -void *ep_name[NUMEPS]; -fi_addr_t gni_addr[NUMEPS]; +static void *ep_name[NUMEPS]; +static fi_addr_t gni_addr[NUMEPS]; static struct fi_info *fi[NUMEPS]; -struct fi_info *hints[NUMEPS]; +static struct fi_info *hints[NUMEPS]; #define BUF_SZ (1<<20) -char *target, *target_base; -char *source, *source_base; -char *uc_target; -char *uc_source; -struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS]; -uint64_t mr_key[NUMEPS]; -uint64_t cntr_bind_flags; +static char *target, *target_base; +static char *source, *source_base; +static char *uc_target; +static char *uc_source; +static struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS]; +static uint64_t mr_key[NUMEPS]; +static uint64_t cntr_bind_flags; static struct fid_cntr *send_cntr[NUMEPS], *recv_cntr[NUMEPS]; static struct fid_cntr *write_cntr[NUMEPS], *read_cntr[NUMEPS]; diff --git a/prov/gni/test/api_cq.c b/prov/gni/test/api_cq.c index 6559c043f1b..26296f6ad72 100644 --- a/prov/gni/test/api_cq.c +++ b/prov/gni/test/api_cq.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved. * Copyright (c) 2015-2017 Cray Inc. All rights reserved. + * Copyright (c) 2020 Triad National Security, LLC. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -68,24 +69,24 @@ static uint64_t mode_bits = ~FI_NOTIFY_FLAGS_ONLY; static struct fid_fabric *fab; static struct fid_domain *dom[NUMEPS]; -struct fi_gni_ops_domain *gni_domain_ops[NUMEPS]; +static struct fi_gni_ops_domain *gni_domain_ops[NUMEPS]; static struct fid_ep *ep[NUMEPS]; static struct fid_av *av[NUMEPS]; -void *ep_name[NUMEPS]; -fi_addr_t gni_addr[NUMEPS]; +static void *ep_name[NUMEPS]; +static fi_addr_t gni_addr[NUMEPS]; static struct fid_cq *msg_cq[NUMEPS]; static struct fi_info *fi[NUMEPS]; static struct fi_cq_attr cq_attr; -struct fi_info *hints[NUMEPS]; +static struct fi_info *hints[NUMEPS]; #define BUF_SZ (1<<20) -char *target, *target_base; -char *source, *source_base; -char *uc_target; -char *uc_source; -struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS]; -uint64_t mr_key[NUMEPS]; -uint64_t cq_bind_flags; +static char *target, *target_base; +static char *source, *source_base; +static char *uc_target; +static char *uc_source; +static struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS]; +static uint64_t mr_key[NUMEPS]; +static uint64_t cq_bind_flags; void api_cq_bind(uint64_t flags) { diff --git a/prov/gni/test/av.c b/prov/gni/test/av.c index 824f33f76ff..f0f4b20ccf1 100644 --- a/prov/gni/test/av.c +++ b/prov/gni/test/av.c @@ -3,7 +3,7 @@ * All rights reserved. * Copyright (c) 2015-2017 Cray Inc. All rights reserved. * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2019 Triad National Security, LLC. + * Copyright (c) 2019-2020 Triad National Security, LLC. * All rights reserved. * * This software is available to you under a choice of one of two @@ -54,7 +54,7 @@ static struct fid_fabric *fab; static struct fid_domain *dom; static struct fi_info *hints; static struct fi_info *fi; -struct gnix_ep_name *fake_names; +static struct gnix_ep_name *fake_names; static struct fid_av *av; static struct gnix_fid_av *gnix_av; diff --git a/prov/gni/test/cancel.c b/prov/gni/test/cancel.c index a410d3fc350..eb24433c015 100644 --- a/prov/gni/test/cancel.c +++ b/prov/gni/test/cancel.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved. * Copyright (c) 2015-2017 Cray Inc. All rights reserved. + * Copyright (c) 2020 Triad National Security, LLC. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -63,16 +64,16 @@ static struct fid_ep *ep[2]; static struct fid_av *av; static struct fi_info *hints; static struct fi_info *fi; -void *ep_name[2]; -size_t gni_addr[2]; +static void *ep_name[2]; +static size_t gni_addr[2]; static struct fid_cq *msg_cq[2]; static struct fi_cq_attr cq_attr; #define BUF_SZ (8*1024) -char *target, *target_base; -char *source, *source_base; -struct fid_mr *rem_mr, *loc_mr; -uint64_t mr_key; +static char *target, *target_base; +static char *source, *source_base; +static struct fid_mr *rem_mr, *loc_mr; +static uint64_t mr_key; void cancel_setup(void) { diff --git a/prov/gni/test/cm.c b/prov/gni/test/cm.c index 1572405e399..0ce599c4d71 100644 --- a/prov/gni/test/cm.c +++ b/prov/gni/test/cm.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2016-2017 Cray Inc. All rights reserved. - * Copyright (c) 2019 Triad National Security, LLC. All rights reserved. + * Copyright (c) 2019-2020 Triad National Security, LLC. + * All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -83,7 +84,7 @@ static struct fi_info *cli_hints; static struct fi_info *cli_fi; static struct fid_eq *cli_eq; static struct fid_cq *cli_cq; -char *cli_cm_in_data = "Hola. Soy cliente."; +static char *cli_cm_in_data = "Hola. Soy cliente."; static struct fid_fabric *srv_fab; static struct fid_domain *srv_dom; @@ -93,7 +94,7 @@ static struct fi_info *srv_hints; static struct fi_info *srv_fi; static struct fid_eq *srv_eq; static struct fid_cq *srv_cq; -char *srv_cm_in_data = "Este es servidor."; +static char *srv_cm_in_data = "Este es servidor."; struct fi_eq_attr eq_attr = { .wait_obj = FI_WAIT_UNSPEC diff --git a/prov/gni/test/cntr.c b/prov/gni/test/cntr.c index bd6eb5b873e..51bdf9ce08e 100644 --- a/prov/gni/test/cntr.c +++ b/prov/gni/test/cntr.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved. * Copyright (c) 2015-2017 Cray Inc. All rights reserved. + * Copyright (c) 2020 Triad National Security, LLC. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -90,10 +91,10 @@ static struct fi_cntr_attr cntr_attr = {.events = FI_CNTR_EVENTS_COMP, .flags = 0}; #define BUF_SZ (64*1024) -char *target, *target_base; -char *source, *source_base; -struct fid_mr *rem_mr[NUM_EPS], *loc_mr[NUM_EPS]; -uint64_t mr_key[NUM_EPS]; +static char *target, *target_base; +static char *source, *source_base; +static struct fid_mr *rem_mr[NUM_EPS], *loc_mr[NUM_EPS]; +static uint64_t mr_key[NUM_EPS]; static inline void cntr_setup_eps(const uint64_t caps, uint32_t version, diff --git a/prov/gni/test/datagram.c b/prov/gni/test/datagram.c index 5f0f9c28850..dc5bf641168 100644 --- a/prov/gni/test/datagram.c +++ b/prov/gni/test/datagram.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved. * Copyright (c) 2015-2017 Cray Inc. All rights reserved. + * Copyright (c) 2020 Triad National Security, LLC. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -64,7 +65,7 @@ static struct fid_ep *ep; static struct fi_info *hints; static struct fi_info *fi; static struct gnix_fid_ep *ep_priv; -const char my_cdm_id[] = "3000"; +static const char my_cdm_id[] = "3000"; void dg_setup(void) { diff --git a/prov/gni/test/rdm_addr_str_sr.c b/prov/gni/test/rdm_addr_str_sr.c index 312d3bf0899..9f600618b03 100644 --- a/prov/gni/test/rdm_addr_str_sr.c +++ b/prov/gni/test/rdm_addr_str_sr.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2020 Triad National Security, LLC. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -69,7 +70,7 @@ static fi_addr_t gni_addr[NUMEPS]; static struct fid_cq *msg_cq[NUMEPS]; static struct fi_info *fi[NUMEPS]; static struct fi_cq_attr cq_attr; -struct fi_info *hints; +static struct fi_info *hints; static size_t addrlen = 0; #define BUF_SZ (1<<20) diff --git a/prov/gni/test/rdm_atomic.c b/prov/gni/test/rdm_atomic.c index 98560dc60ca..99d237ee361 100644 --- a/prov/gni/test/rdm_atomic.c +++ b/prov/gni/test/rdm_atomic.c @@ -1,7 +1,8 @@ /* * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved. * Copyright (c) 2015-2017 Cray Inc. All rights reserved. - * Copyright (c) 2019 Triad National Security, LLC. All rights reserved. 
+ * Copyright (c) 2019-2020 Triad National Security, LLC. + * All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -72,23 +73,23 @@ static uint64_t mode_bits = ~FI_NOTIFY_FLAGS_ONLY; static struct fid_fabric *fab; static struct fid_domain *dom[NUMEPS]; -struct fi_gni_ops_domain *gni_domain_ops[NUMEPS]; +static struct fi_gni_ops_domain *gni_domain_ops[NUMEPS]; static struct fid_ep *ep[NUMEPS]; static struct fid_av *av[NUMEPS]; static struct fi_info *hints; static struct fi_info *fi; -void *ep_name[NUMEPS]; -size_t gni_addr[NUMEPS]; +static void *ep_name[NUMEPS]; +static size_t gni_addr[NUMEPS]; static struct fid_cq *send_cq[NUMEPS]; static struct fid_cq *recv_cq[NUMEPS]; static struct fi_cq_attr cq_attr; #define BUF_SZ (64*1024) -char *target, *target_base; -char *source, *source_base; -char *uc_source, *uc_source_base; -struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS]; -uint64_t mr_key[NUMEPS]; +static char *target, *target_base; +static char *source, *source_base; +static char *uc_source, *uc_source_base; +static struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS]; +static uint64_t mr_key[NUMEPS]; static struct fid_cntr *write_cntr[NUMEPS], *read_cntr[NUMEPS]; static struct fid_cntr *rwrite_cntr; diff --git a/prov/gni/test/rdm_dgram_rma.c b/prov/gni/test/rdm_dgram_rma.c index 96b54aaf41e..f07cc8497d4 100644 --- a/prov/gni/test/rdm_dgram_rma.c +++ b/prov/gni/test/rdm_dgram_rma.c @@ -2,7 +2,8 @@ * Copyright (c) 2015-2017 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2015-2018 Cray Inc. All rights reserved. - * Copyright (c) 2019 Triad National Security, LLC. All rights reserved. + * Copyright (c) 2019-2020 Triad National Security, LLC. + * All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -70,25 +71,25 @@ static uint64_t mode_bits = ~FI_NOTIFY_FLAGS_ONLY; static struct fid_fabric *fab; static struct fid_domain *dom[2]; -struct fi_gni_ops_domain *gni_domain_ops[2]; +static struct fi_gni_ops_domain *gni_domain_ops[2]; static struct fid_ep *ep[2]; static struct fid_av *av[2]; static struct fi_info *hints; static struct fi_info *fi; -void *ep_name[2]; -size_t gni_addr[2]; +static void *ep_name[2]; +static size_t gni_addr[2]; static struct fid_cq *send_cq[2]; static struct fid_cq *recv_cq[2]; static struct fi_cq_attr cq_attr[2]; #define BUF_SZ (64*1024) -char *target, *target_base; -char *target2, *target2_base; -char *source, *source_base; -char *source2, *source2_base; -char *uc_source; -struct fid_mr *rem_mr[2], *loc_mr[2], *rem_mr2[2], *loc_mr2[2]; -uint64_t mr_key[2], mr_key2[2]; +static char *target, *target_base; +static char *target2, *target2_base; +static char *source, *source_base; +static char *source2, *source2_base; +static char *uc_source; +static struct fid_mr *rem_mr[2], *loc_mr[2], *rem_mr2[2], *loc_mr2[2]; +static uint64_t mr_key[2], mr_key2[2]; static struct fid_cntr *write_cntr[2], *read_cntr[2]; static struct fid_cntr *rwrite_cntr; diff --git a/prov/gni/test/rdm_dgram_stx.c b/prov/gni/test/rdm_dgram_stx.c index c57a7f707cd..c1bceade365 100644 --- a/prov/gni/test/rdm_dgram_stx.c +++ b/prov/gni/test/rdm_dgram_stx.c @@ -2,7 +2,8 @@ * Copyright (c) 2015-2017 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2015-2018 Cray Inc. All rights reserved. - * Copyright (c) 2019 Triad National Security, LLC. All rights reserved. 
+ * Copyright (c) 2019-2020 Triad National Security, LLC. + * All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -71,13 +72,13 @@ static uint64_t mode_bits = ~FI_NOTIFY_FLAGS_ONLY; static struct fid_fabric *fab; static struct fid_domain *dom[2]; -struct fi_gni_ops_domain *gni_domain_ops[2]; +static struct fi_gni_ops_domain *gni_domain_ops[2]; static struct fid_ep *ep[2]; static struct fid_av *av[2]; static struct fi_info *hints; static struct fi_info *fi; -void *ep_name[2]; -size_t gni_addr[2]; +static void *ep_name[2]; +static size_t gni_addr[2]; static struct fid_cq *send_cq[2]; static struct fid_cq *recv_cq[2]; static struct fi_cq_attr cq_attr[2]; @@ -85,11 +86,11 @@ static struct fid_stx *stx_ctx[2]; static struct fid_stx *stx_ctx_too_late; #define BUF_SZ (64*1024) -char *target, *target_base; -char *source, *source_base; -char *uc_source; -struct fid_mr *rem_mr[2], *loc_mr[2]; -uint64_t mr_key[2]; +static char *target, *target_base; +static char *source, *source_base; +static char *uc_source; +static struct fid_mr *rem_mr[2], *loc_mr[2]; +static uint64_t mr_key[2]; static struct fid_cntr *write_cntr[2], *read_cntr[2]; static struct fid_cntr *rwrite_cntr; diff --git a/prov/gni/test/rdm_multi_recv.c b/prov/gni/test/rdm_multi_recv.c index a3ed435f406..021e479d8b7 100644 --- a/prov/gni/test/rdm_multi_recv.c +++ b/prov/gni/test/rdm_multi_recv.c @@ -2,7 +2,8 @@ * Copyright (c) 2015-2017 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2015-2017 Cray Inc. All rights reserved. - * Copyright (c) 2019 Triad National Security, LLC. All rights reserved. + * Copyright (c) 2019-2020 Triad National Security, LLC. + * All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -73,24 +74,24 @@ static uint64_t mode_bits = ~FI_NOTIFY_FLAGS_ONLY; static struct fid_fabric *fab; static struct fid_domain *dom[NUMEPS]; -struct fi_gni_ops_domain *gni_domain_ops[NUMEPS]; +static struct fi_gni_ops_domain *gni_domain_ops[NUMEPS]; static struct fid_ep *ep[NUMEPS]; static struct fid_av *av[NUMEPS]; -void *ep_name[NUMEPS]; -fi_addr_t gni_addr[NUMEPS]; +static void *ep_name[NUMEPS]; +static fi_addr_t gni_addr[NUMEPS]; static struct fid_cq *msg_cq[NUMEPS]; static struct fi_info *fi[NUMEPS]; static struct fi_cq_attr cq_attr; -struct fi_info *hints; +static struct fi_info *hints; #define BUF_SZ (1<<20) #define BUF_RNDZV (1<<14) #define IOV_CNT (1<<3) -char *target, *target_base; -char *target2, *target2_base; -char *source, *source_base; -char *source2, *source2_base; +static char *target, *target_base; +static char *target2, *target2_base; +static char *source, *source_base; +static char *source2, *source2_base; struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS]; static struct fid_cntr *send_cntr[NUMEPS], *recv_cntr[NUMEPS]; diff --git a/prov/gni/test/rdm_rx_overrun.c b/prov/gni/test/rdm_rx_overrun.c index 2f53e4bb2ec..eb4da67c8f2 100644 --- a/prov/gni/test/rdm_rx_overrun.c +++ b/prov/gni/test/rdm_rx_overrun.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved. * Copyright (c) 2015-2017 Cray Inc. All rights reserved. + * Copyright (c) 2020 Triad National Security, LLC. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -80,7 +81,7 @@ static struct fi_cq_attr cq_attr; static int target[NUM_EPS]; static int source[NUM_EPS]; -struct fid_mr *rem_mr[NUM_EPS], *loc_mr[NUM_EPS]; +static struct fid_mr *rem_mr[NUM_EPS], *loc_mr[NUM_EPS]; static uint64_t mr_key[NUM_EPS]; static int max_eps = NUM_EPS; diff --git a/prov/gni/test/rdm_sr.c b/prov/gni/test/rdm_sr.c index 3f087480955..a645dc8ed01 100644 --- a/prov/gni/test/rdm_sr.c +++ b/prov/gni/test/rdm_sr.c @@ -2,7 +2,8 @@ * Copyright (c) 2015-2017 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2015-2018 Cray Inc. All rights reserved. - * Copyright (c) 2019 Triad National Security, LLC. All rights reserved. + * Copyright (c) 2019-2020 Triad National Security, LLC. + * All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -76,12 +77,12 @@ static struct fid_domain *dom[NUMEPS]; struct fi_gni_ops_domain *gni_domain_ops[NUMEPS]; static struct fid_ep *ep[NUMEPS]; static struct fid_av *av[NUMEPS]; -void *ep_name[NUMEPS]; -fi_addr_t gni_addr[NUMEPS]; +static void *ep_name[NUMEPS]; +static fi_addr_t gni_addr[NUMEPS]; static struct fid_cq *msg_cq[NUMEPS]; static struct fi_info *fi[NUMEPS]; static struct fi_cq_attr cq_attr; -const char *cdm_id[NUMEPS] = { "5000", "5001" }; +static const char *cdm_id[NUMEPS] = { "5000", "5001" }; struct fi_info *hints; static int using_bnd_ep = 0; static int dgram_should_fail; @@ -92,18 +93,18 @@ static int peer_src_known = 1; #define BUF_RNDZV (1<<14) #define IOV_CNT (1<<3) -char *target, *target_base; -char *target2, *target2_base; -char *source, *source_base; -char *source2, *source2_base; -struct iovec *src_iov, *dest_iov, *s_iov, *d_iov; -char *iov_src_buf, *iov_dest_buf, *iov_src_buf_base, *iov_dest_buf_base; -char *uc_target; -char *uc_source; -struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS]; -struct fid_mr *iov_dest_buf_mr[NUMEPS], *iov_src_buf_mr[NUMEPS]; -uint64_t iov_dest_buf_mr_key[NUMEPS]; -uint64_t mr_key[NUMEPS]; +static char *target, *target_base; +static char *target2, *target2_base; +static char *source, *source_base; +static char *source2, *source2_base; +static struct iovec *src_iov, *dest_iov, *s_iov, *d_iov; +static char *iov_src_buf, *iov_dest_buf, *iov_src_buf_base, *iov_dest_buf_base; +static char *uc_target; +static char *uc_source; +static struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS]; +static struct fid_mr *iov_dest_buf_mr[NUMEPS], *iov_src_buf_mr[NUMEPS]; +static uint64_t iov_dest_buf_mr_key[NUMEPS]; +static uint64_t mr_key[NUMEPS]; static struct fid_cntr *send_cntr[NUMEPS], *recv_cntr[NUMEPS]; static struct fi_cntr_attr cntr_attr = { diff --git a/prov/gni/test/rdm_tagged_sr.c b/prov/gni/test/rdm_tagged_sr.c index c6a4805a4cd..a36f97b756f 100644 --- a/prov/gni/test/rdm_tagged_sr.c +++ b/prov/gni/test/rdm_tagged_sr.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved. * Copyright (c) 2015-2017 Cray Inc. All rights reserved. + * Copyright (c) 2020 Triad National Security, LLC. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -72,20 +73,20 @@ static struct fid_ep *ep[2]; static struct fid_av *av; static struct fi_info *hints; static struct fi_info *fi; -void *ep_name[2]; -size_t gni_addr[2]; +static void *ep_name[2]; +static size_t gni_addr[2]; static struct fid_cq *msg_cq[2]; static struct fi_cq_attr cq_attr; #define BUF_SZ (1<<16) #define IOV_CNT (1<<3) -char *target, *target_base; -char *source, *source_base; -struct iovec *src_iov, *dest_iov; -char *iov_src_buf, *iov_dest_buf, *iov_src_buf_base, *iov_dest_buf_base; -struct fid_mr *rem_mr, *loc_mr; -uint64_t mr_key; +static char *target, *target_base; +static char *source, *source_base; +static struct iovec *src_iov, *dest_iov; +static char *iov_src_buf, *iov_dest_buf; +static struct fid_mr *rem_mr, *loc_mr; +static uint64_t mr_key; static void setup_dom(enum fi_progress pm, uint32_t version, int mr_mode) { diff --git a/prov/gni/test/sep.c b/prov/gni/test/sep.c index 28877469846..2fb6ab3b0a8 100644 --- a/prov/gni/test/sep.c +++ b/prov/gni/test/sep.c @@ -2,7 +2,8 @@ * Copyright (c) 2015-2018 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2015-2017 Cray Inc. All rights reserved. - * Copyright (c) 2019 Triad National Security, LLC. All rights reserved. + * Copyright (c) 2019-2020 Triad National Security, LLC. + * All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -67,19 +68,19 @@ static struct fid_domain *dom[NUMEPS]; static struct fid_av *av[NUMEPS]; static struct fid_av *t_av; static void *ep_name[TOTALEPS]; -fi_addr_t gni_addr[NUMEPS]; +static fi_addr_t gni_addr[NUMEPS]; static struct fi_cq_attr cq_attr; -struct fi_info *hints; +static struct fi_info *hints; static struct fi_info *fi[NUMEPS]; static struct fid_ep *sep[TOTALEPS]; -char *target, *target_base; -char *source, *source_base; -struct iovec *src_iov, *dest_iov; -char *iov_src_buf, *iov_dest_buf, *iov_src_buf_base, *iov_dest_buf_base; -struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS]; -struct fid_mr *iov_dest_buf_mr[NUMEPS], *iov_src_buf_mr[NUMEPS]; -uint64_t mr_key[NUMEPS]; +static char *target, *target_base; +static char *source, *source_base; +static struct iovec *src_iov, *dest_iov; +static char *iov_src_buf, *iov_dest_buf, *iov_src_buf_base, *iov_dest_buf_base; +static struct fid_mr *rem_mr[NUMEPS], *loc_mr[NUMEPS]; +static struct fid_mr *iov_dest_buf_mr[NUMEPS], *iov_src_buf_mr[NUMEPS]; +static uint64_t mr_key[NUMEPS]; static int ctx_cnt = NUMCONTEXTS; static int rx_ctx_bits; @@ -92,8 +93,8 @@ static struct fi_cntr_attr cntr_attr = { .events = FI_CNTR_EVENTS_COMP, .flags = 0 }; -struct fi_tx_attr tx_attr; -struct fi_rx_attr rx_attr; +static struct fi_tx_attr tx_attr; +static struct fi_rx_attr rx_attr; static uint64_t sends[NUMEPS] = {0}, recvs[NUMEPS] = {0}, send_errs[NUMEPS] = {0}, recv_errs[NUMEPS] = {0}; diff --git a/prov/gni/test/vc.c b/prov/gni/test/vc.c index 148a3a2fb75..fa61cbf7a05 100644 --- a/prov/gni/test/vc.c +++ b/prov/gni/test/vc.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights reserved * Copyright (c) 2015-2017 Cray Inc. All rights reserved. + * Copyright (c) 2020 Triad National Security, LLC. All rights reserved. 
* * * This software is available to you under a choice of one of two @@ -65,23 +66,22 @@ static struct fi_info *hints; static struct fi_info *fi; static struct fid_cq *cq[2]; static struct fi_cq_attr cq_attr; -void *ep_name[2]; -fi_addr_t gni_addr[2]; -struct gnix_av_addr_entry gnix_addr[2]; +static void *ep_name[2]; +static fi_addr_t gni_addr[2]; +static struct gnix_av_addr_entry gnix_addr[2]; /* Third EP with unqiue domain is used to test inter-CM connect. */ static struct fid_domain *dom3; static struct fid_ep *ep3; static struct fid_av *av3; static struct fid_cq *cq3; -void *ep_name3; -fi_addr_t gni_addr3; +static void *ep_name3; /* Register a target buffer with both domains for pings. */ -void *target_buf, *target_buf_base; -int target_len = 64; -struct fid_mr *rem_mr, *rem_mr3; -uint64_t mr_key, mr_key3; +static void *target_buf, *target_buf_base; +static int target_len = 64; +static struct fid_mr *rem_mr, *rem_mr3; +static uint64_t mr_key, mr_key3; static void vc_setup_common(uint32_t version, int mr_mode); diff --git a/prov/hook/hook_debug/include/hook_debug.h b/prov/hook/hook_debug/include/hook_debug.h index fd7e4621991..483d2ef8595 100644 --- a/prov/hook/hook_debug/include/hook_debug.h +++ b/prov/hook/hook_debug/include/hook_debug.h @@ -51,6 +51,7 @@ struct hook_debug_config { struct hook_debug_eq { struct hook_eq hook_eq; ofi_atomic64_t event_cntr[HOOK_DEBUG_EQ_EVENT_MAX]; + size_t eagain_count; }; struct hook_debug_cq { diff --git a/prov/hook/hook_debug/src/hook_debug.c b/prov/hook/hook_debug/src/hook_debug.c index 66ebc0b1926..3610e7f8466 100644 --- a/prov/hook/hook_debug/src/hook_debug.c +++ b/prov/hook/hook_debug/src/hook_debug.c @@ -89,33 +89,61 @@ static void hook_debug_trace_exit(struct fid *fid, struct fid *hfid, if (ret > 0) { FI_TRACE(hook_to_hprov(fid), subsys, "%s (fid: %p) returned: " - "%zd\n", fn, hfid, ret); + "%zd\n", fn, (void *) hfid, ret); goto out; } if (ret != -FI_EAGAIN || !eagain_count || !((*eagain_count)++ % HOOK_DEBUG_EAGAIN_LOG)) FI_TRACE(hook_to_hprov(fid), subsys, "%s (fid: %p) returned: " - "%zd (%s)\n", fn, fid, ret, fi_strerror(-ret)); + "%zd (%s)\n", fn, (void *) hfid, ret, fi_strerror(-ret)); out: if (eagain_count && ret != -FI_EAGAIN) *eagain_count = 0; } +static void +hook_debug_trace_exit_eq(struct hook_debug_eq *eq, const char *fn, ssize_t ret) +{ + hook_debug_trace_exit(&eq->hook_eq.eq.fid, &eq->hook_eq.heq->fid, + FI_LOG_EQ, fn, ret, &eq->eagain_count); +} + +static void +hook_debug_trace_exit_cq(struct hook_debug_cq *cq, const char *fn, ssize_t ret) +{ + hook_debug_trace_exit(&cq->hook_cq.cq.fid, &cq->hook_cq.hcq->fid, + FI_LOG_CQ, fn, ret, &cq->eagain_count); +} + +static void +hook_debug_trace_exit_cntr(struct hook_cntr *cntr, const char *fn, ssize_t ret) +{ + hook_debug_trace_exit(&cntr->cntr.fid, &cntr->hcntr->fid, + FI_LOG_CNTR, fn, ret, NULL); +} + +static void +hook_debug_trace_exit_ep(struct hook_debug_ep *ep, const char *fn, ssize_t ret, + size_t *eagain_count) +{ + hook_debug_trace_exit(&ep->hook_ep.ep.fid, &ep->hook_ep.hep->fid, + FI_LOG_EP_DATA, fn, ret, eagain_count); +} + static void hook_debug_rx_end(struct hook_debug_ep *ep, char *fn, ssize_t ret, void *mycontext) { struct hook_debug_txrx_entry *rx_entry; - hook_debug_trace_exit(&ep->hook_ep.ep.fid, &ep->hook_ep.hep->fid, - FI_LOG_EP_DATA, fn, ret, &ep->rx_eagain_count); + hook_debug_trace_exit_ep(ep, fn, ret, &ep->rx_eagain_count); if (config.track_recvs) { if (!ret) { ep->rx_outs++; FI_TRACE(hook_to_hprov(&ep->hook_ep.ep.fid), FI_LOG_EP_DATA, "ep: %p rx_outs: %zu\n", - 
ep->hook_ep.hep, ep->rx_outs); + (void *) ep->hook_ep.hep, ep->rx_outs); } else { rx_entry = mycontext; ofi_buf_free(rx_entry); @@ -203,15 +231,14 @@ static void hook_debug_tx_end(struct hook_debug_ep *ep, char *fn, { struct hook_debug_txrx_entry *tx_entry; - hook_debug_trace_exit(&ep->hook_ep.ep.fid, &ep->hook_ep.hep->fid, - FI_LOG_EP_DATA, fn, ret, &ep->tx_eagain_count); + hook_debug_trace_exit_ep(ep, fn, ret, &ep->tx_eagain_count); if (mycontext && config.track_sends) { if (!ret) { ep->tx_outs++; FI_TRACE(hook_to_hprov(&ep->hook_ep.ep.fid), FI_LOG_EP_DATA, "ep: %p tx_outs: %zu\n", - ep->hook_ep.hep, ep->tx_outs); + (void *) ep->hook_ep.hep, ep->tx_outs); } else { tx_entry = mycontext; ofi_buf_free(tx_entry); @@ -478,8 +505,7 @@ static void hook_debug_cq_process_entry(struct hook_debug_cq *mycq, struct fi_cq_tagged_entry *cq_entry; int i; - hook_debug_trace_exit(&mycq->hook_cq.cq.fid, &mycq->hook_cq.hcq->fid, - FI_LOG_CQ, fn, ret, &mycq->eagain_count); + hook_debug_trace_exit_cq(mycq, fn, ret); for (i = 0; i < ret; i++, buf += mycq->entry_size) { cq_entry = (struct fi_cq_tagged_entry *)buf; @@ -496,7 +522,7 @@ static void hook_debug_cq_process_entry(struct hook_debug_cq *mycq, rx_entry->ep->rx_outs--; FI_TRACE(hook_to_hprov(&mycq->hook_cq.cq.fid), FI_LOG_CQ, "ep: %p rx_outs: %zu\n", - rx_entry->ep->hook_ep.hep, + (void *) rx_entry->ep->hook_ep.hep, rx_entry->ep->rx_outs); ofi_buf_free(rx_entry); } @@ -509,7 +535,7 @@ static void hook_debug_cq_process_entry(struct hook_debug_cq *mycq, tx_entry->ep->tx_outs--; FI_TRACE(hook_to_hprov(&mycq->hook_cq.cq.fid), FI_LOG_CQ, "ep: %p tx_outs: %zu\n", - tx_entry->ep->hook_ep.hep, + (void *) tx_entry->ep->hook_ep.hep, tx_entry->ep->tx_outs); ofi_buf_free(tx_entry); } @@ -527,7 +553,19 @@ static ssize_t hook_debug_cq_read(struct fid_cq *cq, void *buf, size_t count) return ret; } -int hook_debug_cq_close(struct fid *fid) +static ssize_t hook_debug_cq_readfrom(struct fid_cq *cq, void *buf, size_t count, + fi_addr_t *src_addr) +{ + struct hook_debug_cq *mycq = container_of(cq, struct hook_debug_cq, + hook_cq.cq); + ssize_t ret; + + ret = fi_cq_readfrom(mycq->hook_cq.hcq, buf, count, src_addr); + hook_debug_cq_process_entry(mycq, "fi_cq_readfrom", ret, buf); + return ret; +} + +static int hook_debug_cq_close(struct fid *fid) { struct hook_debug_cq *mycq = container_of(fid, struct hook_debug_cq, hook_cq.cq.fid); @@ -563,11 +601,13 @@ static void hook_debug_cq_attr_log(struct hook_domain *dom, HOOK_DEBUG_TRACE(dom->fabric, FI_LOG_CQ, "\tsignaling_vector: %d\n", attr->signaling_vector); HOOK_DEBUG_TRACE(dom->fabric, FI_LOG_CQ, "\twait_cond: %s\n", "TBD"); - HOOK_DEBUG_TRACE(dom->fabric, FI_LOG_CQ, "\twait_set: %p\n", attr->wait_set); + HOOK_DEBUG_TRACE(dom->fabric, FI_LOG_CQ, "\twait_set: %p\n", + (void *) attr->wait_set); } -int hook_debug_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, - struct fid_cq **cq, void *context) +static int +hook_debug_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, + struct fid_cq **cq, void *context) { struct hook_domain *domain = container_of(domain_fid, struct hook_domain, domain); @@ -594,6 +634,9 @@ int hook_debug_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, if (ret) goto err; + FI_TRACE(hook_fabric_to_hprov(mycq->hook_cq.domain->fabric), FI_LOG_CQ, + "cq opened, fid: %p\n", (void *) &mycq->hook_cq.hcq->fid); + mycq->hook_cq.cq.fid.ops = &hook_debug_cq_fid_ops; mycq->hook_cq.cq.ops = &hook_debug_cq_ops; mycq->format = attr->format; @@ -626,21 +669,29 @@ static int 
hook_debug_ep_close(struct fid *fid) return ret; } -int hook_debug_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +static int hook_debug_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) { struct fid *hfid, *hbfid; struct hook_cntr *cntr; + struct hook_cq *cq; hfid = hook_to_hfid(fid); hbfid = hook_to_hfid(bfid); if (!hfid || !hbfid) return -FI_EINVAL; - switch (fid->fclass) { + switch (bfid->fclass) { + case FI_CLASS_CQ: + cq = container_of(bfid, struct hook_cq, cq.fid); + HOOK_DEBUG_TRACE(cq->domain->fabric, FI_LOG_EP_CTRL, + "cq: %p bind flags: %s\n", (void *) cq->hcq, + fi_tostr(&flags, FI_TYPE_CAPS)); + break; case FI_CLASS_CNTR: - cntr = container_of(fid, struct hook_cntr, cntr.fid); + cntr = container_of(bfid, struct hook_cntr, cntr.fid); HOOK_DEBUG_TRACE(cntr->domain->fabric, FI_LOG_EP_CTRL, - "cntr: %p bind flags: %s\n", cntr->hcntr, + "cntr: %p bind flags: %s\n", + (void *) cntr->hcntr, fi_tostr(&flags, FI_TYPE_CAPS)); break; } @@ -681,8 +732,9 @@ struct fi_ops_tagged hook_debug_tagged_ops = { .injectdata = hook_debug_tinjectdata, }; -int hook_debug_endpoint(struct fid_domain *domain, struct fi_info *info, - struct fid_ep **ep, void *context) +static int +hook_debug_endpoint(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context) { struct hook_debug_ep *myep; struct ofi_bufpool_attr bufpool_attr = { @@ -731,7 +783,7 @@ int hook_debug_endpoint(struct fid_domain *domain, struct fi_info *info, goto err; FI_TRACE(hook_to_hprov(&myep->hook_ep.ep.fid), FI_LOG_EP_CTRL, - "endpoint opened, fid: %p\n", &myep->hook_ep.hep->fid); + "endpoint opened, fid: %p\n", (void *) &myep->hook_ep.hep->fid); myep->hook_ep.ep.fid.ops = &hook_debug_ep_fid_ops; myep->hook_ep.ep.msg = &hook_debug_msg_ops; @@ -760,6 +812,7 @@ static ssize_t hook_debug_eq_read(struct fid_eq *eq, uint32_t *event, if (ret > 0) ofi_atomic_inc64(&myeq->event_cntr[*event]); + hook_debug_trace_exit_eq(myeq, "fi_eq_read", (ssize_t)ret); return ret; } @@ -775,6 +828,7 @@ static ssize_t hook_debug_eq_sread(struct fid_eq *eq, uint32_t *event, if (ret > 0) ofi_atomic_inc64(&myeq->event_cntr[*event]); + hook_debug_trace_exit_eq(myeq, "fi_eq_sread", (ssize_t)ret); return ret; } @@ -802,8 +856,9 @@ static int hook_debug_eq_close(struct fid *fid) static struct fi_ops_eq hook_debug_eq_ops; static struct fi_ops hook_debug_eq_fid_ops; -int hook_debug_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, - struct fid_eq **eq, void *context) +static int +hook_debug_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, + struct fid_eq **eq, void *context) { struct hook_debug_eq *myeq; int i, ret; @@ -813,8 +868,10 @@ int hook_debug_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, return -FI_ENOMEM; ret = hook_eq_init(fabric, attr, eq, context, &myeq->hook_eq); - if (ret) + if (ret) { free(myeq); + return ret; + } myeq->hook_eq.eq.ops = &hook_debug_eq_ops; myeq->hook_eq.eq.fid.ops = &hook_debug_eq_fid_ops; @@ -852,9 +909,9 @@ static int hook_debug_fabric(struct fi_fabric_attr *attr, struct hook_prov_ctx hook_debug_prov_ctx = { .prov = { - .version = FI_VERSION(1,0), + .version = OFI_VERSION_DEF_PROV, /* We're a pass-through provider, so the fi_version is always the latest */ - .fi_version = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + .fi_version = OFI_VERSION_LATEST, .name = "ofi_hook_debug", .getinfo = NULL, .fabric = hook_debug_fabric, @@ -868,8 +925,7 @@ static uint64_t hook_debug_cntr_read(struct fid_cntr *cntr) uint64_t ret; ret = fi_cntr_read(mycntr->hcntr); - 
hook_debug_trace_exit(&mycntr->cntr.fid, &mycntr->hcntr->fid, - FI_LOG_CNTR, "fi_cntr_read", (ssize_t)ret, NULL); + hook_debug_trace_exit_cntr(mycntr, "fi_cntr_read", (ssize_t)ret); return ret; } @@ -881,29 +937,28 @@ static int hook_debug_cntr_wait(struct fid_cntr *cntr, uint64_t threshold, int t HOOK_DEBUG_TRACE(mycntr->domain->fabric, FI_LOG_CNTR, "cntr: %p, threshold: %" PRIu64 ", timeout: %d\n", - mycntr->hcntr, threshold, timeout); + (void *) mycntr->hcntr, threshold, timeout); ret = fi_cntr_wait(mycntr->hcntr, threshold, timeout); - hook_debug_trace_exit(&mycntr->cntr.fid, &mycntr->hcntr->fid, - FI_LOG_CNTR, "fi_cntr_wait", (ssize_t)ret, NULL); + hook_debug_trace_exit_cntr(mycntr, "fi_cntr_wait", (ssize_t)ret); return ret; } static struct fi_ops_cntr hook_debug_cntr_ops; -int hook_debug_cntr_init(struct fid *fid) +static int hook_debug_cntr_init(struct fid *fid) { struct hook_cntr *mycntr = container_of(fid, struct hook_cntr, cntr.fid); HOOK_DEBUG_TRACE(mycntr->domain->fabric, FI_LOG_CNTR, - "fi_cntr_open: %p\n", mycntr->hcntr); + "fi_cntr_open: %p\n", (void *) mycntr->hcntr); mycntr->cntr.ops = &hook_debug_cntr_ops; return 0; } static struct fi_ops_domain hook_debug_domain_ops; -int hook_debug_domain_init(struct fid *fid) +static int hook_debug_domain_init(struct fid *fid) { struct fid_domain *domain = container_of(fid, struct fid_domain, fid); domain->ops = &hook_debug_domain_ops; @@ -933,7 +988,7 @@ HOOK_DEBUG_INI hook_debug_cq_ops = hook_cq_ops; hook_debug_cq_ops.read = hook_debug_cq_read; - hook_debug_cq_ops.readfrom = fi_no_cq_readfrom; + hook_debug_cq_ops.readfrom = hook_debug_cq_readfrom; hook_debug_cq_ops.sread = fi_no_cq_sread; hook_debug_cq_ops.sreadfrom = fi_no_cq_sreadfrom; diff --git a/prov/hook/perf/src/hook_perf.c b/prov/hook/perf/src/hook_perf.c index afa686f4ca0..4b7ca297013 100644 --- a/prov/hook/perf/src/hook_perf.c +++ b/prov/hook/perf/src/hook_perf.c @@ -898,9 +898,9 @@ static int hook_perf_fabric(struct fi_fabric_attr *attr, struct hook_prov_ctx hook_perf_ctx = { .prov = { - .version = FI_VERSION(1,0), + .version = OFI_VERSION_DEF_PROV, /* We're a pass-through provider, so the fi_version is always the latest */ - .fi_version = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + .fi_version = OFI_VERSION_LATEST, .name = "ofi_hook_perf", .getinfo = NULL, .fabric = hook_perf_fabric, diff --git a/prov/hook/src/hook.c b/prov/hook/src/hook.c index 58b6558f12b..ff101aaa0f8 100644 --- a/prov/hook/src/hook.c +++ b/prov/hook/src/hook.c @@ -267,9 +267,9 @@ static int hook_noop_fabric(struct fi_fabric_attr *attr, struct hook_prov_ctx hook_noop_ctx = { .prov = { - .version = FI_VERSION(1,0), + .version = OFI_VERSION_DEF_PROV, /* We're a pass-through provider, so the fi_version is always the latest */ - .fi_version = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), + .fi_version = OFI_VERSION_LATEST, .name = "ofi_hook_noop", .getinfo = NULL, .fabric = hook_noop_fabric, diff --git a/prov/hook/src/hook_domain.c b/prov/hook/src/hook_domain.c index 4e37d437ca4..004e19ee6ec 100644 --- a/prov/hook/src/hook_domain.c +++ b/prov/hook/src/hook_domain.c @@ -77,6 +77,7 @@ static int hook_mr_regv(struct fid *fid, const struct iovec *iov, attr.context = context; attr.auth_key_size = 0; attr.auth_key = NULL; + attr.iface = FI_HMEM_SYSTEM; return hook_mr_regattr(fid, &attr, flags, mr); } @@ -108,6 +109,14 @@ int hook_query_atomic(struct fid_domain *domain, enum fi_datatype datatype, return fi_query_atomic(dom->hdomain, datatype, op, attr, flags); } +static int hook_query_collective(struct 
fid_domain *domain, enum fi_collective_op coll, + struct fi_collective_attr *attr, uint64_t flags) +{ + struct hook_domain *dom = container_of(domain, struct hook_domain, domain); + + return fi_query_collective(dom->hdomain, coll, attr, flags); +} + struct fi_ops_domain hook_domain_ops = { .size = sizeof(struct fi_ops_domain), .av_open = hook_av_open, @@ -119,6 +128,7 @@ struct fi_ops_domain hook_domain_ops = { .stx_ctx = hook_stx_ctx, .srx_ctx = hook_srx_ctx, .query_atomic = hook_query_atomic, + .query_collective = hook_query_collective, }; diff --git a/prov/mlx/Makefile.include b/prov/mlx/Makefile.include deleted file mode 100644 index 260aeeefee3..00000000000 --- a/prov/mlx/Makefile.include +++ /dev/null @@ -1,34 +0,0 @@ -if HAVE_MLX -_mlx_files = prov/mlx/src/mlx.h \ - prov/mlx/src/mlx_av.c \ - prov/mlx/src/mlx_cm.c \ - prov/mlx/src/mlx_cq.c \ - prov/mlx/src/mlx_domain.c \ - prov/mlx/src/mlx_ep.c \ - prov/mlx/src/mlx_init.c \ - prov/mlx/src/mlx_tagged.c \ - prov/mlx/src/mlx_fabric.c \ - prov/mlx/src/mlx_callbacks.c - - -if HAVE_MLX_DL -pkglib_LTLIBRARIES += libmlx-fi.la -libmlx_fi_la_CPPFLAGS = $(AM_CPPFLAGS) $(mlx_CPPFLAGS) -libmlx_fi_la_SOURCES = $(_mlx_files) $(common_srcs) -libmlx_fi_la_LDFLAGS = \ - $(mlx_LDFLAGS) \ - -module -avoid-version -shared -export-dynamic -libmlx_fi_la_LIBADD = $(linkback) $(mlx_LIBS) -libmlx_fi_la_DEPENDENCIES = $(linkback) -else -src_libfabric_la_SOURCES += $(_mlx_files) -src_libfabric_la_CPPFLAGS += $(mlx_CPPFLAGS) -src_libfabric_la_LDFLAGS += $(mlx_LDFLAGS) -src_libfabric_la_LIBADD += $(mlx_LIBS) -endif - -prov_install_man_pages += man/man7/fi_mlx.7 - -endif #HAVE_MLX - -prov_dist_man_pages += man/man7/fi_mlx.7 diff --git a/prov/mlx/configure.m4 b/prov/mlx/configure.m4 deleted file mode 100644 index 1c13d58a6be..00000000000 --- a/prov/mlx/configure.m4 +++ /dev/null @@ -1,26 +0,0 @@ -dnl Configury specific to the libfabrics mlx provider - -dnl Called to configure this provider -dnl -dnl Arguments: -dnl -dnl $1: action if configured successfully -dnl $2: action if not configured successfully -dnl -AC_DEFUN([FI_MLX_CONFIGURE],[ - # Determine if we can support the mxm provider - mlx_happy=0 - AS_IF([test x"$enable_mlx" = x"yes"], - [FI_CHECK_PACKAGE([mlx], - [ucp/api/ucp.h], - [ucp], - [ucp_get_version_string], - [], - [$mlx_PREFIX], - [$mlx_LIBDIR], - [mlx_happy=1], - [mlx_happy=0]) - ]) - AS_IF([test $mlx_happy -eq 1], [$1], [$2]) -]) - diff --git a/prov/mlx/src/mlx.h b/prov/mlx/src/mlx.h deleted file mode 100644 index cdc06973cd0..00000000000 --- a/prov/mlx/src/mlx.h +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 2016 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenFabrics.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef _FI_MLX_H -#define _FI_MLX_H - - -#ifdef __cplusplus -extern "C" { -#endif - -#include "config.h" -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "ofi_enosys.h" -#include -#include -#include -#include - -#include -#include -#include -#include - -#define FI_MLX_FABRIC_NAME "mlx" -#define FI_MLX_DEFAULT_INJECT_SIZE 1024 -#define FI_MLX_DEFAULT_NS_PORT 12345 -#define FI_MLX_DEF_CQ_SIZE (1024) -#define FI_MLX_DEF_MR_CNT (1 << 16) - -#define FI_MLX_VERSION_MINOR 5 -#define FI_MLX_VERSION_MAJOR 1 -#define FI_MLX_VERSION (FI_VERSION(FI_MLX_VERSION_MAJOR, FI_MLX_VERSION_MINOR)) - -#define FI_MLX_RKEY_MAX_LEN (256) - -#define FI_MLX_MAX_NAME_LEN (1024) - -#define FI_MLX_CAPS (FI_SEND | FI_RECV | FI_TAGGED) -#define FI_MLX_MODE_REQUIRED (0ULL) -#define FI_MLX_MODE_SUPPORTED (FI_CONTEXT | FI_ASYNC_IOV) -#define FI_MLX_OP_FLAGS (FI_SEND | FI_RECV) -#define FI_MLX_ANY_SERVICE (0) -struct mlx_global_descriptor{ - ucp_config_t *config; - int use_ns; - int ns_port; - struct util_ns name_serv; - char *localhost; -}; - -struct mlx_fabric { - struct util_fabric u_fabric; -}; - -struct mlx_domain { - struct util_domain u_domain; - ucp_context_h context; - - struct ofi_bufpool *fast_path_pool; - fastlock_t fpp_lock; -}; - - -struct mlx_ep { - struct util_ep ep; - struct mlx_av *av; /*until AV is not implemented via utils*/ - ucp_worker_h worker; - short service; - void *addr; - size_t addr_len; -}; - -struct mlx_av { - struct fid_av av; - struct mlx_domain *domain; - struct mlx_ep *ep; - struct util_eq *eq; - int type; - int async; - size_t count; - size_t addr_len; -}; - -typedef enum mlx_req_type { - MLX_FI_REQ_UNINITIALIZED = 0, - MLX_FI_REQ_REGULAR = 0xFD, - MLX_FI_REQ_UNEXPECTED_ERR = 0xFE, - MLX_FI_REQ_UNEXPECTED = 0xFF, -} mlx_req_type_t; - -struct mlx_request { - mlx_req_type_t type; - - union { - struct fi_cq_tagged_entry tagged; - struct fi_cq_err_entry error; - } completion; - - struct util_cq* cq; - struct mlx_ep* ep; -}; - -OFI_DECLARE_CIRQUE(struct fi_cq_tagged_entry, mlx_comp_cirq); - -extern int mlx_errcode_translation_table[]; -#define MLX_TRANSLATE_ERRCODE(X) mlx_errcode_translation_table[(-X)+1] -extern struct fi_provider mlx_prov; -extern struct mlx_global_descriptor mlx_descriptor; -extern struct util_prov mlx_util_prov; - -extern struct fi_ops_cm mlx_cm_ops; -extern struct fi_ops_tagged mlx_tagged_ops; -extern struct fi_ops_mr mlx_mr_ops; -extern struct fi_fabric_attr mlx_fabric_attrs; - -int mlx_fabric_open( - struct fi_fabric_attr *attr, - struct fid_fabric **fabric, - void *context); - -int mlx_domain_open( - struct fid_fabric *fabric, struct fi_info *info, - struct fid_domain **fid, void *context); - -int mlx_ep_open( - struct fid_domain *domain, struct fi_info *info, - struct fid_ep **fid, void *context); - -int mlx_cq_open( - struct fid_domain *domain, struct fi_cq_attr *attr, - struct fid_cq **cq, void *context); - -int 
mlx_av_open( - struct fid_domain *domain, struct fi_av_attr *attr, - struct fid_av **av, void *context); - -int mlx_ns_is_service_wildcard(void *svc); -int mlx_ns_service_cmp(void *svc1, void *svc2); -/* Callbacks */ -void mlx_send_callback_no_compl( void *request, ucs_status_t status); -void mlx_send_callback( void *request, ucs_status_t status); -void mlx_recv_callback_no_compl(void *request, ucs_status_t status, - ucp_tag_recv_info_t *info); -void mlx_recv_callback( void *request, ucs_status_t status, - ucp_tag_recv_info_t *info); -#ifdef __cplusplus -} -#endif - -#endif diff --git a/prov/mlx/src/mlx_av.c b/prov/mlx/src/mlx_av.c deleted file mode 100644 index bb9e944c117..00000000000 --- a/prov/mlx/src/mlx_av.c +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Copyright (c) 2016 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenFabrics.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "mlx.h" - -static int mlx_av_write_event( - struct mlx_av *av, uint64_t data, - int err, void *context) -{ - struct fi_eq_err_entry entry; - size_t size; - uint64_t flags; - - entry.fid = &(av->av.fid); - entry.context = context; - entry.data = data; - - if (err) { - entry.err = err; - size = sizeof(struct fi_eq_err_entry); - flags = UTIL_FLAG_ERROR; - } else { - size = sizeof(struct fi_eq_entry); - flags = 0; - } - - fi_eq_write( - &(av->eq->eq_fid), FI_AV_COMPLETE, - &entry, size, flags); - return FI_SUCCESS; -} - -static int mlx_av_remove( - struct fid_av *fi_av, fi_addr_t *fi_addr, size_t count, - uint64_t flags) -{ - struct mlx_av *av; - int i; - - av = container_of(fi_av, struct mlx_av, av); - if ((av->async) && (!av->eq)) { - return -FI_ENOEQ; - } - - for (i = 0; i < count; ++i) { - ucp_ep_destroy((ucp_ep_h)(fi_addr[i])); - } - return FI_SUCCESS; -} - - -static inline int mlx_av_resolve_if_addr( - const struct sockaddr *saddr, - char **address) -{ - char peer_host[INET_ADDRSTRLEN] = {0}; - char peer_serv[INET_ADDRSTRLEN] = {0}; - int intserv, peer_host_len, peer_serv_len; - peer_host_len = peer_serv_len = INET_ADDRSTRLEN; - int rv; - - rv = getnameinfo(saddr, sizeof(struct sockaddr_in), - peer_host, peer_host_len, - peer_serv, peer_serv_len, - NI_NUMERICSERV|NI_NUMERICHOST); - if (0 != rv) { - FI_WARN( &mlx_prov, FI_LOG_CORE, - "Unable to resolve address: %s \n", - gai_strerror(rv)); - return -FI_EINVAL; - } - - intserv = atoi(peer_serv); - (*address) = ofi_ns_resolve_name( - &mlx_descriptor.name_serv, - peer_host, &intserv); - if (!(*address)) { - FI_WARN( &mlx_prov, FI_LOG_CORE, - "Unable to resolve address: %s:%s\n", - peer_host, peer_serv); - return -FI_EINVAL; - } - return FI_SUCCESS; -} - -static int mlx_av_insert( - struct fid_av *fi_av, const void *addr, size_t count, - fi_addr_t *fi_addr, uint64_t flags, void *context) -{ - struct mlx_av *av; - struct mlx_ep *ep; - size_t i; - ucs_status_t status = UCS_OK; - int added = 0; - - av = container_of(fi_av, struct mlx_av, av); - ep = av->ep; - - if ((av->async) && (!av->eq)) { - return -FI_ENOEQ; - } - - for (i = 0; i < count ; ++i) { - ucp_ep_params_t ep_params = { 0 }; - - if (mlx_descriptor.use_ns) { - if (mlx_av_resolve_if_addr( - (struct sockaddr*) - (&(((struct sockaddr_in *) addr)[i])), - (char**) &ep_params.address) != FI_SUCCESS) - break; - } else { - ep_params.address = (const ucp_address_t *) - (&(((const char *) addr)[i * av->addr_len])); - } - - ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; - FI_WARN(&mlx_prov, FI_LOG_CORE, - "Try to insert address #%zd, offset=%zd (size=%zd)" - " fi_addr=%p \naddr = %s\n", - i, i * av->addr_len, count, - fi_addr, &(((const char *) addr)[i * av->addr_len])); - - status = ucp_ep_create(ep->worker, &ep_params, - (ucp_ep_h *)(&(fi_addr[i]))); - if (mlx_descriptor.use_ns) { - free((void *) ep_params.address); - } - if (status == UCS_OK) { - FI_WARN(&mlx_prov, FI_LOG_CORE, "address inserted\n"); - added++; - } else { - if (av->eq) { - mlx_av_write_event( av, i, - MLX_TRANSLATE_ERRCODE(status), - context); - } - break; - } - } - - if (av->eq) { - mlx_av_write_event(av, added, 0, context); - count = 0; - } else { - count = added; - } - return count; -} - - -static int mlx_av_close(fid_t fid) -{ - struct mlx_av *fid_av; - fid_av = container_of(fid, struct mlx_av, av); - free (fid_av); - return FI_SUCCESS; -} - -static int mlx_av_bind(struct fid *fid, struct fid *bfid, uint64_t flags) -{ - struct mlx_av *av; - struct util_eq *eq; - - av = container_of(fid, struct 
mlx_av, av.fid); - if ((!(av->async)) || (bfid->fclass != FI_CLASS_EQ)){ - FI_WARN( &mlx_prov, FI_LOG_EP_CTRL, - "Try to bind not a EQ to AV, " - "or attemt to bind EQ and syncronious AV\n"); - return -FI_EINVAL; - } - eq = container_of(bfid, struct util_eq, eq_fid.fid); - av->eq = eq; - return FI_SUCCESS; -} - -static struct fi_ops mlx_fi_ops = { - .size = sizeof(struct fi_ops), - .close = mlx_av_close, - .bind = mlx_av_bind, -}; - -static struct fi_ops_av mlx_av_ops = { - .size = sizeof(struct fi_ops_av), - .insert = mlx_av_insert, - .remove = mlx_av_remove, -}; - -int mlx_av_open( - struct fid_domain *fi_domain, struct fi_av_attr *attr, - struct fid_av **fi_av, void *context) -{ - struct mlx_domain *domain; - struct mlx_av *av; - int type = FI_AV_MAP; - size_t count = 64; - domain = container_of(fi_domain, struct mlx_domain, u_domain.domain_fid); - - int is_async = 0; - if (attr) { - switch (attr->type) { - case FI_AV_MAP: - type = attr->type; - break; - case FI_AV_UNSPEC: - /* Set FI_AV_MAP by default */ - type = FI_AV_MAP; - break; - default: - return -EINVAL; - } - if (attr->flags & FI_EVENT){ - is_async = 1; - } - count = attr->count; - } - - av = (struct mlx_av *)calloc(1, sizeof(struct mlx_av)); - if (!av) - return -ENOMEM; - - av->domain = domain; - av->async = is_async; - av->type = type; - av->eq = NULL; - - if (mlx_descriptor.use_ns) { - av->addr_len = sizeof(struct sockaddr_in); - } else { - av->addr_len = FI_MLX_MAX_NAME_LEN; - } - - av->count = count; - av->av.fid.fclass = FI_CLASS_AV; - av->av.fid.context = context; - av->av.fid.ops = &mlx_fi_ops; - av->av.ops = &mlx_av_ops; - - *fi_av = &av->av; - return FI_SUCCESS; -} - - diff --git a/prov/mlx/src/mlx_callbacks.c b/prov/mlx/src/mlx_callbacks.c deleted file mode 100644 index ab1abc99005..00000000000 --- a/prov/mlx/src/mlx_callbacks.c +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2016 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenFabrics.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "mlx.h" - -/*using for fi_tinject path*/ -/*Using for selective completions scenario*/ -void mlx_send_callback_no_compl(void *request, ucs_status_t status) -{ - ucp_request_release(request); -} - -void mlx_send_callback(void *request, - ucs_status_t status) -{ - struct util_cq *cq; - struct mlx_request *mlx_req = request; - struct fi_cq_tagged_entry *t_entry; - struct util_cq_oflow_err_entry *err; - - cq = mlx_req->cq; - - if (status == UCS_ERR_CANCELED) { - ucp_request_release(request); - return; - } - - fastlock_acquire(&cq->cq_lock); - - t_entry = ofi_cirque_tail(cq->cirq); - *t_entry = (mlx_req->completion.tagged); - ofi_cirque_commit(cq->cirq); - - if (status != UCS_OK){ - t_entry->flags |= UTIL_FLAG_ERROR; - err = calloc(1, sizeof(struct util_cq_oflow_err_entry)); - if (!err) { - FI_WARN(&mlx_prov, FI_LOG_CQ, - "out of memory, cannot report CQ error\n"); - goto fn; - } - - err->comp = (mlx_req->completion.error); - err->comp.prov_errno = (int)status; - err->comp.err = MLX_TRANSLATE_ERRCODE(status); - err->comp.olen = 0; - slist_insert_tail(&err->list_entry, &cq->oflow_err_list); - } -fn: - mlx_req->type = MLX_FI_REQ_UNINITIALIZED; - fastlock_release(&cq->cq_lock); - ucp_request_release(request); -} - -/*Using for selective completions scenario*/ -void mlx_recv_callback_no_compl(void *request, - ucs_status_t status, - ucp_tag_recv_info_t *info) -{ - ucp_request_release(request); -} - -void mlx_recv_callback(void *request, - ucs_status_t status, - ucp_tag_recv_info_t *info) -{ - struct util_cq *cq; - struct mlx_request *mlx_req; - - mlx_req = (struct mlx_request*)request; - if (status == UCS_ERR_CANCELED) { - ucp_request_release(request); - return; - } - - cq = mlx_req->cq; - - mlx_req->completion.tagged.tag = info->sender_tag; - mlx_req->completion.tagged.len = info->length; - - if (status != UCS_OK) { - mlx_req->completion.error.prov_errno = (int)status; - mlx_req->completion.error.err = MLX_TRANSLATE_ERRCODE(status); - } - - fastlock_acquire(&cq->cq_lock); - if (mlx_req->type == MLX_FI_REQ_UNINITIALIZED) { - if (status != UCS_OK) { - mlx_req->completion.error.olen = info->length; - mlx_req->type = MLX_FI_REQ_UNEXPECTED_ERR; - } else { - mlx_req->type = MLX_FI_REQ_UNEXPECTED; - } - fastlock_release(&cq->cq_lock); - return; - } else { - if (status != UCS_OK) { - mlx_req->completion.error.olen = info->length - - mlx_req->completion.error.len; - } - - struct fi_cq_tagged_entry *t_entry; - t_entry = ofi_cirque_tail(cq->cirq); - *t_entry = (mlx_req->completion.tagged); - - if (status != UCS_OK) { - struct util_cq_oflow_err_entry *err; - t_entry->flags |= UTIL_FLAG_ERROR; - - err = calloc(1, sizeof(struct util_cq_oflow_err_entry)); - if (!err) { - FI_WARN(&mlx_prov, FI_LOG_CQ, - "out of memory, cannot report CQ error\n"); - mlx_req->type = MLX_FI_REQ_UNINITIALIZED; - goto fn; - } - - err->comp = (mlx_req->completion.error); - slist_insert_tail(&err->list_entry, &cq->oflow_err_list); - } - - if (cq->src){ - cq->src[ofi_cirque_windex((struct mlx_comp_cirq*)(cq->cirq))] = - FI_ADDR_NOTAVAIL; - } - - if (cq->wait) { - cq->wait->signal(cq->wait); - } - - mlx_req->type = MLX_FI_REQ_UNINITIALIZED; - ofi_cirque_commit(cq->cirq); - } -fn: - fastlock_release(&cq->cq_lock); - ucp_request_release(request); -} - diff --git a/prov/mlx/src/mlx_cm.c b/prov/mlx/src/mlx_cm.c deleted file mode 100644 index c0b7668fa7e..00000000000 --- a/prov/mlx/src/mlx_cm.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2016 Intel Corporation. All rights reserved. 
- * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenFabrics.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "mlx.h" -#include - -static int mlx_cm_getname_mlx_format( - fid_t fid, - void *addr, - size_t *addrlen) -{ - ucs_status_t status = UCS_OK; - void *addr_local = NULL; - size_t addr_len_local; - struct mlx_ep* ep; - int ofi_status = FI_SUCCESS; - - ep = container_of(fid, struct mlx_ep, ep.ep_fid.fid); - - status = ucp_worker_get_address( ep->worker, - (ucp_address_t **)&addr_local, - (size_t*) &addr_len_local ); - if (status != UCS_OK) { - FI_WARN( &mlx_prov, FI_LOG_CORE, - "ucp_worker_get_address error!!!\n"); - return MLX_TRANSLATE_ERRCODE(status); - } - - if (addr_len_local > FI_MLX_MAX_NAME_LEN) { - FI_WARN( &mlx_prov, FI_LOG_CORE, - "Address returned by UCX is too long %"PRIu64"\n", - addr_len_local); - return -FI_EINVAL; - } - - if ((*addrlen) < FI_MLX_MAX_NAME_LEN) { - FI_WARN( &mlx_prov, FI_LOG_CORE, - "Buffer storage for ep address is too small %"PRIu64 - " instead of %d [%s]\n", - *addrlen, FI_MLX_MAX_NAME_LEN, (char *)addr_local); - ofi_status = -FI_ETOOSMALL; - } - FI_INFO(&mlx_prov, FI_LOG_CORE, - "Loaded UCP address: [%"PRIu64"]%s\n", - addr_len_local, (char *)addr_local); - - if (addr_local != NULL) - memcpy(addr, addr_local, (((*addrlen) < addr_len_local) ? 
- (*addrlen) : addr_len_local)); - - *addrlen = FI_MLX_MAX_NAME_LEN; - ucp_worker_release_address( - ep->worker, - (ucp_address_t *)addr_local); - return ofi_status; -} - -static int mlx_cm_getname_ai_format( - fid_t fid, - void *addr, - size_t *addrlen) -{ - int ofi_status = FI_SUCCESS; - struct mlx_ep* ep = container_of(fid, struct mlx_ep, ep.ep_fid.fid); - - if (ep->addr) { - if (ep->addr_len > *addrlen) { - ofi_status = -FI_ETOOSMALL; - FI_WARN(&mlx_prov, FI_LOG_EP_CTRL, - "addrlen expected: %"PRIu64", got: %"PRIu64"\n", - ep->addr_len, *addrlen); - } else { - memcpy(addr, ep->addr, ep->addr_len); - } - *addrlen = ep->addr_len; - } else { - char *hostname = mlx_descriptor.localhost; - int service = (((getpid() & 0xFFFF))); - struct addrinfo hints = { - .ai_family = AF_INET, - .ai_socktype = SOCK_STREAM, - .ai_protocol = IPPROTO_TCP, - }; - struct addrinfo *res; - - if (getaddrinfo(hostname, NULL, &hints, &res) != 0) { - FI_WARN(&mlx_prov, FI_LOG_CORE, - "Unable to resolve hostname:%s\n", hostname); - return -FI_EAVAIL; - } - FI_INFO(&mlx_prov, FI_LOG_CORE, - "Loaded IPv4 address: [%jd]%s:%d\n", - (intmax_t) res->ai_addrlen, hostname, service); - - if (res->ai_addrlen > *addrlen) { - ofi_status = -FI_ETOOSMALL; - FI_WARN(&mlx_prov, FI_LOG_EP_CTRL, - "addrlen expected: %jd, got: %"PRIu64"\n", - (intmax_t) res->ai_addrlen, *addrlen); - } else { - memcpy(addr, res->ai_addr, res->ai_addrlen); - ((struct sockaddr_in *)addr)->sin_port = htons((short)service); - } - - *addrlen = res->ai_addrlen; - - freeaddrinfo(res); - } - - return ofi_status; -} - -static int mlx_cm_getname( - fid_t fid, - void *addr, - size_t *addrlen) -{ - int ofi_status = FI_SUCCESS; - if (mlx_descriptor.use_ns) { - ofi_status = mlx_cm_getname_ai_format(fid, addr, addrlen); - } else { - ofi_status = mlx_cm_getname_mlx_format(fid, addr, addrlen); - } - return ofi_status; -} - - - -struct fi_ops_cm mlx_cm_ops = { - .size = sizeof(struct fi_ops_cm), - .getname = mlx_cm_getname, - .getpeer = fi_no_getpeer, - .connect = fi_no_connect, - .listen = fi_no_listen, - .accept = fi_no_accept, - .reject = fi_no_reject, - .shutdown = fi_no_shutdown, -}; diff --git a/prov/mlx/src/mlx_domain.c b/prov/mlx/src/mlx_domain.c deleted file mode 100644 index e20f95fa324..00000000000 --- a/prov/mlx/src/mlx_domain.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2016 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenFabrics.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "mlx.h" - -static int mlx_domain_close(fid_t fid) -{ - struct mlx_domain *domain; - int status; - - domain = container_of( fid, - struct mlx_domain, - u_domain.domain_fid.fid); - - ucp_cleanup(domain->context); - status = ofi_domain_close( &(domain->u_domain)); - if (!status) { - ofi_bufpool_destroy(domain->fast_path_pool); - free(domain); - } - return status; -} - -static struct fi_ops mlx_fi_ops = { - .size = sizeof(struct fi_ops), - .close = mlx_domain_close, -}; - -struct fi_ops_domain mlx_domain_ops = { - .size = sizeof(struct fi_ops_domain), - .av_open = mlx_av_open, - .cq_open = mlx_cq_open, - .endpoint = mlx_ep_open, - .poll_open = fi_poll_create, -}; - - -struct fi_ops_mr mlx_mr_ops = { - .size = sizeof(struct fi_ops_mr), - .reg = fi_no_mr_reg, - .regv = fi_no_mr_regv, - .regattr = fi_no_mr_regattr, -}; - -int mlx_domain_open(struct fid_fabric *fabric, struct fi_info *info, - struct fid_domain **fid, void *context) -{ - ucs_status_t status = UCS_OK; - int ofi_status; - struct mlx_domain* domain; - const ucp_params_t params = { - .features = UCP_FEATURE_TAG, - .request_size = sizeof(struct mlx_request), - .request_init = NULL, - .request_cleanup = NULL, - .field_mask = UCP_PARAM_FIELD_FEATURES | - UCP_PARAM_FIELD_REQUEST_SIZE, - }; - - if (!info->domain_attr->name || - strcmp(info->domain_attr->name, FI_MLX_FABRIC_NAME)) { - return -FI_EINVAL; - } - - ofi_status = ofi_prov_check_info(&mlx_util_prov, - fabric->api_version, - info); - if (ofi_status) { - return ofi_status; - } - - domain = calloc(1, sizeof(struct mlx_domain)); - if (!domain) { - return -ENOMEM; - } - - ofi_status = ofi_domain_init(fabric, info, - &(domain->u_domain), context); - if (ofi_status) { - goto domain_free; - } - - status = ucp_init(¶ms, mlx_descriptor.config, - &(domain->context)); - if (status != UCS_OK) { - ofi_status = MLX_TRANSLATE_ERRCODE(status); - goto destroy_domain; - } - fastlock_init(&(domain->fpp_lock)); - - ofi_status = ofi_bufpool_create(&domain->fast_path_pool, - sizeof(struct mlx_request), - 16, 0, 1024, 0); - if (ofi_status) - goto cleanup_mlx; - - domain->u_domain.domain_fid.fid.ops = &mlx_fi_ops; - domain->u_domain.domain_fid.ops = &mlx_domain_ops; - domain->u_domain.domain_fid.mr = &mlx_mr_ops; - - *fid = &(domain->u_domain.domain_fid); - return FI_SUCCESS; - -cleanup_mlx: - ucp_cleanup(domain->context); -destroy_domain: - ofi_domain_close(&(domain->u_domain)); -domain_free: - free(domain); - if (!ofi_status) { - ofi_status = FI_ENETUNREACH; - } - return ofi_status; -} - diff --git a/prov/mlx/src/mlx_ep.c b/prov/mlx/src/mlx_ep.c deleted file mode 100644 index 6fef94bcbdd..00000000000 --- a/prov/mlx/src/mlx_ep.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (c) 2016 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenFabrics.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "mlx.h" - -static void mlx_ep_progress( struct util_ep *util_ep) -{ - struct mlx_ep *ep; - ep = container_of(util_ep, struct mlx_ep, ep); - ucp_worker_progress(ep->worker); -} - - -static ssize_t mlx_ep_cancel( fid_t fid, void *ctx) -{ - struct mlx_ep *ep; - void *req; - struct fi_context *context = (struct fi_context*)ctx; - - ep = container_of( fid, struct mlx_ep, ep.ep_fid.fid); - if (!ep->ep.domain) - return -EBADF; - if (!context) - return -EINVAL; - if (context->internal[0] == NULL) - return -FI_EINVAL; - - req = context->internal[0]; - ucp_request_cancel(ep->worker, req); - - return FI_SUCCESS; -} - -static int mlx_ep_getopt( fid_t fid, int level, int optname, - void *optval, size_t *optlen) -{ - return -ENOSYS; -} - -static int mlx_ep_setopt(fid_t fid, int level, int optname, - const void *optval, size_t optlen) -{ - return FI_SUCCESS; -} - -static int mlx_ep_close(fid_t fid) -{ - struct mlx_ep *ep; - ucs_status_t status = UCS_OK; - void *addr_local = NULL; - size_t addr_len_local; - - ep = container_of(fid, struct mlx_ep, ep.ep_fid.fid); - - if (mlx_descriptor.use_ns) { - status = ucp_worker_get_address( ep->worker, - (ucp_address_t **)&addr_local, - (size_t*) &addr_len_local ); - if (status != UCS_OK) - return MLX_TRANSLATE_ERRCODE(status); - - ofi_ns_del_local_name(&mlx_descriptor.name_serv, - &ep->service, addr_local); - - ucp_worker_release_address( - ep->worker, - (ucp_address_t *)addr_local); - } - - ucp_worker_flush(ep->worker); - ucp_worker_destroy(ep->worker); - - ofi_endpoint_close(&ep->ep); - free(ep); - return FI_SUCCESS; -} - -static int mlx_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) -{ - struct mlx_ep *ep; - struct util_cq *cq; - - ep = container_of(fid, struct mlx_ep, ep.ep_fid.fid); - int status = FI_SUCCESS; - - switch (bfid->fclass) { - case FI_CLASS_CQ: - cq = container_of(bfid, struct util_cq, cq_fid.fid); - status = ofi_ep_bind_cq(&ep->ep, cq, flags); - break; - case FI_CLASS_AV: - if (ep->av) { - FI_WARN( &mlx_prov, FI_LOG_EP_CTRL, - "AV already binded\n"); - status = -FI_EINVAL; - break; - } - ep->av = container_of(bfid, struct mlx_av, av.fid); - ep->av->ep = ep; - break; - default: - status = -FI_EINVAL; - break; - } - return status; -} - - -static int mlx_ep_control(fid_t fid, int command, void *arg) 
-{ - - struct mlx_ep *ep; - - ep = container_of(fid, struct mlx_ep, ep.ep_fid.fid); - switch (command) { - case FI_ENABLE: - if (!ep->ep.rx_cq || !ep->ep.tx_cq) - return -FI_ENOCQ; - if (!ep->av) - return -FI_EOPBADSTATE; /* TODO: Add FI_ENOAV */ - break; - default: - return -FI_ENOSYS; - } - return FI_SUCCESS; -} - -struct fi_ops_ep mlx_ep_ops = { - .size = sizeof(struct fi_ops_ep), - .cancel = mlx_ep_cancel, - .getopt = mlx_ep_getopt, - .setopt = mlx_ep_setopt, -}; - -static struct fi_ops mlx_fi_ops = { - .size = sizeof(struct fi_ops), - .close = mlx_ep_close, - .bind = mlx_ep_bind, - .control = mlx_ep_control, -}; - -int mlx_ep_open( struct fid_domain *domain, struct fi_info *info, - struct fid_ep **fid, void *context) -{ - struct mlx_ep *ep; - struct mlx_domain *u_domain; - int ofi_status = FI_SUCCESS; - ucs_status_t status = UCS_OK; - ucp_worker_params_t worker_params = { }; - worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE; - worker_params.thread_mode = UCS_THREAD_MODE_SINGLE; - u_domain = container_of( domain, struct mlx_domain, u_domain.domain_fid); - - void *addr_local = NULL; - size_t addr_len_local; - - - ep = (struct mlx_ep *) calloc(1, sizeof (struct mlx_ep)); - if (!ep) { - return -ENOMEM; - } - - ofi_status = ofi_endpoint_init(domain, &mlx_util_prov, info, - &ep->ep, context, mlx_ep_progress); - if (ofi_status) { - goto free_ep; - } - - status = ucp_worker_create( u_domain->context, - &worker_params, - &(ep->worker)); - if (status != UCS_OK) { - ofi_status = MLX_TRANSLATE_ERRCODE(status); - ofi_atomic_dec32(&(u_domain->u_domain.ref)); - goto free_ep; - } - - if (mlx_descriptor.use_ns) { - char tmpb [FI_MLX_MAX_NAME_LEN]={0}; - status = ucp_worker_get_address( ep->worker, - (ucp_address_t **)&addr_local, - (size_t*) &addr_len_local ); - if (status != UCS_OK) - return MLX_TRANSLATE_ERRCODE(status); - ep->service = (short)((getpid() & 0xFFFF )); - memcpy(tmpb,addr_local,addr_len_local); - FI_INFO(&mlx_prov, FI_LOG_CORE, - "PUBLISHED UCP address(size=%zd): [%hu] %s\n", - addr_len_local,ep->service,(char*)(addr_local)); - - ofi_ns_add_local_name(&mlx_descriptor.name_serv, - &ep->service, tmpb); - - ucp_worker_release_address( ep->worker, - (ucp_address_t *)addr_local); - } - - ep->ep.ep_fid.fid.ops = &mlx_fi_ops; - ep->ep.ep_fid.ops = &mlx_ep_ops; - ep->ep.ep_fid.cm = &mlx_cm_ops; - ep->ep.ep_fid.tagged = &mlx_tagged_ops; - ep->ep.flags = info->mode; - ep->ep.caps = u_domain->u_domain.info_domain_caps; - - *fid = &(ep->ep.ep_fid); - - return FI_SUCCESS; -free_ep: - free(ep); - return ofi_status; -} diff --git a/prov/mlx/src/mlx_fabric.c b/prov/mlx/src/mlx_fabric.c deleted file mode 100644 index f443da750e1..00000000000 --- a/prov/mlx/src/mlx_fabric.c +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright (c) 2016 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenFabrics.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. 
- * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "mlx.h" - -int mlx_fabric_close(struct fid *fid) -{ - int status; - - if (mlx_descriptor.use_ns) - ofi_ns_stop_server (&mlx_descriptor.name_serv); - - status = ofi_fabric_close( - container_of(fid, struct util_fabric, fabric_fid.fid)); - return status; -} - -static struct fi_ops mlx_fabric_fi_ops = { - .size = sizeof(struct fi_ops), - .close = mlx_fabric_close, - .bind = fi_no_bind, - .control = fi_no_control, - .ops_open = fi_no_ops_open, -}; - -static struct fi_ops_fabric mlx_fabric_ops = { - .size = sizeof(struct fi_ops_fabric), - .domain = mlx_domain_open, - .passive_ep = fi_no_passive_ep, - .eq_open = ofi_eq_create, - .wait_open = ofi_wait_fd_open, - .trywait = fi_no_trywait, -}; - -int mlx_ns_service_cmp(void *svc1, void *svc2) -{ - int service1 = *(int *)svc1, service2 = *(int *)svc2; - if (service1 == FI_MLX_ANY_SERVICE || - service2 == FI_MLX_ANY_SERVICE) - return 0; - return (service1 < service2) ? - -1 : (service1 > service2); -} - -int mlx_ns_is_service_wildcard(void *svc) -{ - return (*(int *)svc == FI_MLX_ANY_SERVICE); -} - -#define MLX_IGNORED_LO_ADDR "127.0.0.1" -static char* mlx_local_host_resolve() -{ - int status; - struct ifaddrs *ifaddr, *ifa; - char host[NI_MAXHOST]; - char *iface = NULL; - char *result = NULL; - - status = fi_param_get( &mlx_prov, "ns_iface", - &iface); - if (!status) { - iface = NULL; - } - - if (-1 == getifaddrs(&ifaddr)) { - FI_WARN( &mlx_prov, FI_LOG_CORE, - "Unable to resolve local host address"); - return NULL; - } - - for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { - /*Ignore not IPv$ ifaces*/ - if ((ifa->ifa_addr == NULL) || - (ifa->ifa_addr->sa_family != AF_INET)) { - continue; - } - - if (getnameinfo(ifa->ifa_addr, sizeof(struct sockaddr_in), - host, NI_MAXHOST, - NULL, 0, NI_NUMERICHOST) != 0) { - host[0] = '\0'; - continue; - } - - /*Skip loopback device*/ - if (strncmp(host, MLX_IGNORED_LO_ADDR, - strlen(MLX_IGNORED_LO_ADDR))==0) { - host[0] = '\0'; - continue; - } - - /* If iface name is specified */ - if (iface && strcmp(iface, ifa->ifa_name)!=0) { - host[0] = '\0'; - continue; - } - - result = strdup(host); - break; - } - if (result == NULL) { - FI_WARN( &mlx_prov, FI_LOG_CORE, - "No IPv4-compatible interface was found. 
(match mask:%s)", - iface?iface:"*"); - } - freeifaddrs(ifaddr); - return result; -} - -int mlx_ns_start () -{ - if (!mlx_descriptor.localhost) - mlx_descriptor.localhost = mlx_local_host_resolve(); - - if (!mlx_descriptor.localhost) { - FI_INFO(&mlx_prov, FI_LOG_CORE, - "Unable to resolve local host address:\n" - "\t - unable to start NS\n" - "\t - Please try MLX-address format"); - return -FI_EINVAL; - } - - mlx_descriptor.name_serv.hostname = mlx_descriptor.localhost; - mlx_descriptor.name_serv.port = (int) mlx_descriptor.ns_port; - mlx_descriptor.name_serv.name_len = FI_MLX_MAX_NAME_LEN; - mlx_descriptor.name_serv.service_len = sizeof(short); - mlx_descriptor.name_serv.service_cmp = mlx_ns_service_cmp; - mlx_descriptor.name_serv.is_service_wildcard = mlx_ns_is_service_wildcard; - - ofi_ns_init(&mlx_descriptor.name_serv); - ofi_ns_start_server(&mlx_descriptor.name_serv); - - return FI_SUCCESS; -} - -int mlx_fabric_open( - struct fi_fabric_attr *attr, - struct fid_fabric **fabric, - void *context) -{ - struct mlx_fabric *fabric_priv; - int status; - - FI_INFO( &mlx_prov, FI_LOG_CORE, "\n" ); - - if (strcmp(attr->name, FI_MLX_FABRIC_NAME)) - return -FI_ENODATA; - - fabric_priv = calloc(1, sizeof(struct mlx_fabric)); - if (!fabric_priv) { - return -FI_ENOMEM; - } - - status = ofi_fabric_init(&mlx_prov, &mlx_fabric_attrs, attr, - &(fabric_priv->u_fabric), context); - if (status) { - FI_INFO( &mlx_prov, FI_LOG_CORE, - "Error in ofi_fabric_init: %d\n", status); - free(fabric_priv); - return status; - } - - fabric_priv->u_fabric.fabric_fid.fid.ops = &mlx_fabric_fi_ops; - fabric_priv->u_fabric.fabric_fid.ops = &mlx_fabric_ops; - *fabric = &(fabric_priv->u_fabric.fabric_fid); - - if (mlx_descriptor.use_ns) { - if(mlx_ns_start() != FI_SUCCESS) { - free(fabric_priv); - return status; - } - } - - return FI_SUCCESS; -} diff --git a/prov/mlx/src/mlx_init.c b/prov/mlx/src/mlx_init.c deleted file mode 100644 index 99e3f89a4d3..00000000000 --- a/prov/mlx/src/mlx_init.c +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Copyright (c) 2016 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenFabrics.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "mlx.h" - - -int mlx_errcode_translation_table[(-UCS_ERR_LAST)+2] = { -FI_EOTHER }; - -struct mlx_global_descriptor mlx_descriptor = { - .config = NULL, - .use_ns = 0, - .ns_port = FI_MLX_DEFAULT_NS_PORT, - .localhost = NULL, -}; - -static int mlx_init_errcodes() -{ - MLX_TRANSLATE_ERRCODE (UCS_OK) = -FI_SUCCESS; - MLX_TRANSLATE_ERRCODE (UCS_INPROGRESS) = -FI_EINPROGRESS; - MLX_TRANSLATE_ERRCODE (UCS_ERR_NO_MESSAGE) = -FI_ENOMSG; - MLX_TRANSLATE_ERRCODE (UCS_ERR_NO_RESOURCE) = -FI_EINVAL; - MLX_TRANSLATE_ERRCODE (UCS_ERR_IO_ERROR) = -FI_EIO; - MLX_TRANSLATE_ERRCODE (UCS_ERR_NO_MEMORY) = -FI_ENOMEM; - MLX_TRANSLATE_ERRCODE (UCS_ERR_INVALID_PARAM) = -FI_EINVAL; - MLX_TRANSLATE_ERRCODE (UCS_ERR_UNREACHABLE) = -FI_ENETUNREACH; - MLX_TRANSLATE_ERRCODE (UCS_ERR_INVALID_ADDR) = -FI_EINVAL; - MLX_TRANSLATE_ERRCODE (UCS_ERR_NOT_IMPLEMENTED) = -FI_ENOSYS; - MLX_TRANSLATE_ERRCODE (UCS_ERR_MESSAGE_TRUNCATED) = -FI_EMSGSIZE; - MLX_TRANSLATE_ERRCODE (UCS_ERR_NO_PROGRESS) = -FI_EAGAIN; - MLX_TRANSLATE_ERRCODE (UCS_ERR_BUFFER_TOO_SMALL)= -FI_ETOOSMALL; - MLX_TRANSLATE_ERRCODE (UCS_ERR_NO_ELEM) = -FI_ENOENT; - MLX_TRANSLATE_ERRCODE (UCS_ERR_SOME_CONNECTS_FAILED) = -FI_EIO; - MLX_TRANSLATE_ERRCODE (UCS_ERR_NO_DEVICE) = -FI_ENODEV; - MLX_TRANSLATE_ERRCODE (UCS_ERR_BUSY) = -FI_EBUSY; - MLX_TRANSLATE_ERRCODE (UCS_ERR_CANCELED) = -FI_ECANCELED; - MLX_TRANSLATE_ERRCODE (UCS_ERR_SHMEM_SEGMENT) = -FI_EINVAL; - MLX_TRANSLATE_ERRCODE (UCS_ERR_ALREADY_EXISTS) = -EEXIST; - MLX_TRANSLATE_ERRCODE (UCS_ERR_OUT_OF_RANGE) = -FI_EINVAL; - MLX_TRANSLATE_ERRCODE (UCS_ERR_TIMED_OUT) = -FI_ETIMEDOUT; - MLX_TRANSLATE_ERRCODE (UCS_ERR_EXCEEDS_LIMIT) = -FI_E2BIG; - MLX_TRANSLATE_ERRCODE (UCS_ERR_UNSUPPORTED) = -FI_ENOSYS; - return 0; -} - - -struct fi_domain_attr mlx_domain_attrs = { - .domain = NULL, - .name = FI_MLX_FABRIC_NAME, - .threading = FI_THREAD_SAFE, - .control_progress = FI_PROGRESS_AUTO, - .data_progress = FI_PROGRESS_MANUAL, - .resource_mgmt = FI_RM_DISABLED, - .av_type = FI_AV_UNSPEC, - .mr_mode = OFI_MR_BASIC_MAP | FI_MR_BASIC, - .mr_key_size = -1, /*Should be setup after init*/ - .tx_ctx_cnt = 1, - .rx_ctx_cnt = 1, - .max_ep_tx_ctx = 1, - .max_ep_rx_ctx = 1, - .mr_cnt = FI_MLX_DEF_MR_CNT, -}; - -struct fi_rx_attr mlx_rx_attrs = { - .caps = FI_MLX_CAPS, - .mode = FI_MLX_MODE_REQUIRED, - .op_flags = FI_MLX_OP_FLAGS, - .msg_order = FI_ORDER_SAS, - .comp_order = FI_ORDER_NONE, - .total_buffered_recv = ~(0ULL), - .size = UINT64_MAX, - .iov_limit = 1 -}; - -struct fi_tx_attr mlx_tx_attrs = { - .caps = FI_MLX_CAPS, - .mode = FI_MLX_MODE_REQUIRED, - .op_flags = FI_MLX_OP_FLAGS, - .msg_order = FI_ORDER_SAS, - .comp_order = FI_ORDER_NONE, - .inject_size = FI_MLX_DEFAULT_INJECT_SIZE, /*Should be setup after init*/ - .size = UINT64_MAX, - .iov_limit = 1, - .rma_iov_limit = 0 -}; - -struct fi_fabric_attr mlx_fabric_attrs = { - .name = FI_MLX_FABRIC_NAME, - .prov_version = FI_MLX_VERSION, - .fabric = NULL -}; - -struct fi_ep_attr mlx_ep_attrs = { - .type = FI_EP_RDM, - .protocol = FI_PROTO_MLX, -#if defined(UCP_API_RELEASE) && (UCP_API_RELEASE <= 2947) -#warning "HPCX 1.9.7 have an issue with UCP_API_VERSION macro" - .protocol_version = (((UCP_API_MAJOR) << UCP_VERSION_MAJOR_SHIFT)| - ((UCP_API_MINOR) << UCP_VERSION_MINOR_SHIFT)), -#else - .protocol_version = (UCP_API_VERSION), -#endif - .max_msg_size = 0xFFFFFFFF, - .mem_tag_format = 0x0, - .tx_ctx_cnt = 1, - .rx_ctx_cnt = 1, -}; - - -struct fi_info mlx_info = { - .caps = FI_MLX_CAPS, - .mode = FI_MLX_MODE_REQUIRED, - .addr_format = FI_ADDR_MLX, - .src_addrlen = 
0, - .dest_addr = 0, - .tx_attr = &mlx_tx_attrs, - .rx_attr = &mlx_rx_attrs, - .ep_attr = &mlx_ep_attrs, - .domain_attr = &mlx_domain_attrs, - .fabric_attr = &mlx_fabric_attrs -}; - -struct util_prov mlx_util_prov = { - .prov = &mlx_prov, - .info = &mlx_info, - .flags = 0, -}; - - -static int mlx_getinfo ( - uint32_t version, const char *node, - const char *service, uint64_t flags, - const struct fi_info *hints, struct fi_info **info) -{ - int status = -ENODATA; - char *configfile_name = NULL; - int inject_thresh = -1; - - mlx_descriptor.config = NULL; - - status = fi_param_get( &mlx_prov, - "tinject_limit", - &inject_thresh); - if (!status) - inject_thresh = FI_MLX_DEFAULT_INJECT_SIZE; - - FI_INFO( &mlx_prov, FI_LOG_CORE, - "used inject size = %d \n", inject_thresh); - - status = fi_param_get( &mlx_prov, "config", &configfile_name); - if (!status) { - configfile_name = NULL; - } - - /* NS is disabled by default */ - status = fi_param_get( &mlx_prov, "ns_enable", - &mlx_descriptor.use_ns); - if (!status) { - mlx_descriptor.use_ns = 0; - } - status = fi_param_get( &mlx_prov, "ns_port", - &mlx_descriptor.ns_port); - if (!status) { - mlx_descriptor.ns_port = FI_MLX_DEFAULT_NS_PORT; - } - - - - status = ucp_config_read( NULL, - status? NULL: configfile_name, - &mlx_descriptor.config); - if (status != UCS_OK) { - FI_WARN( &mlx_prov, FI_LOG_CORE, - "MLX error: invalid config file\n\t%d (%s)\n", - status, ucs_status_string(status)); - } - - /*Setup some presets*/ - status = ucm_config_modify("MALLOC_HOOKS", "no"); - if (status != UCS_OK) { - FI_WARN( &mlx_prov, FI_LOG_CORE, - "MLX error: failed to switch off UCM memory hooks:\t%d (%s)\n", - status, ucs_status_string(status)); - } - - FI_INFO( &mlx_prov, FI_LOG_CORE, - "Loaded MLX version %s\n", - ucp_get_version_string()); - -#if ENABLE_DEBUG - if (mlx_descriptor.config && - fi_log_enabled( &mlx_prov, FI_LOG_INFO, FI_LOG_CORE)) { - ucp_config_print( mlx_descriptor.config, - stderr, "Used MLX configuration", (1<<4)-1); - } -#endif - - *info = NULL; - if (node || service) { - FI_WARN(&mlx_prov, FI_LOG_CORE, - "fi_getinfo with \"node != NULL \" or \"service != NULL \" is temporary not supported\n"); - node = service = NULL; - flags = 0; - } - - /* Only Pure MLX address and IPv4 are supported */ - if (hints) { - if (hints->addr_format <= FI_SOCKADDR_IN) { - mlx_descriptor.use_ns = 1; - mlx_info.addr_format = FI_SOCKADDR_IN; - } else { - mlx_info.addr_format = FI_ADDR_MLX; - } - } - - - status = util_getinfo( &mlx_util_prov, version, - service, node, flags, hints, info); - - return status; -} - -void mlx_cleanup(void) -{ - FI_INFO(&mlx_prov, FI_LOG_CORE, "provider goes cleanup sequence\n"); - if (mlx_descriptor.config) { - ucp_config_release(mlx_descriptor.config); - mlx_descriptor.config = NULL; - } -} - - -struct fi_provider mlx_prov = { - .name = FI_MLX_FABRIC_NAME, - .version = FI_MLX_VERSION, - .fi_version = FI_VERSION(1, 8), - .getinfo = mlx_getinfo, - .fabric = mlx_fabric_open, - .cleanup = mlx_cleanup, -}; - - -MLX_INI -{ - mlx_init_errcodes(); - fi_param_define( &mlx_prov, - "config", FI_PARAM_STRING, - "MLX configuration file name"); - - fi_param_define(&mlx_prov, - "tinject_limit", FI_PARAM_INT, - "Maximal tinject message size"); - - fi_param_define(&mlx_prov, - "ns_port", FI_PARAM_INT, - "MLX Name server port"); - - fi_param_define(&mlx_prov, - "ns_enable",FI_PARAM_BOOL, - "Enforce usage of name server for MLX provider"); - - fi_param_define(&mlx_prov, - "ns_iface",FI_PARAM_STRING, - "Specify IPv4 network interface for MLX provider's name 
server'"); - return &mlx_prov; -} diff --git a/prov/mlx/src/mlx_tagged.c b/prov/mlx/src/mlx_tagged.c deleted file mode 100644 index afb3f090aa5..00000000000 --- a/prov/mlx/src/mlx_tagged.c +++ /dev/null @@ -1,360 +0,0 @@ -/* - * Copyright (c) 2016 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenFabrics.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "mlx.h" - -#define __mlx_get_dstep_from_fi_addr(EP, ADDR) ((ucp_ep_h)(ADDR)) - -static ssize_t mlx_tagged_recvmsg( - struct fid_ep *ep, - const struct fi_msg_tagged *msg, - uint64_t flags) -{ - ucs_status_ptr_t status = NULL; - ucp_tag_recv_callback_t cbf; - struct mlx_ep *u_ep; - struct mlx_request *req; - struct util_cq *cq; - u_ep = container_of(ep, struct mlx_ep, ep.ep_fid); - - if (flags & FI_REMOTE_CQ_DATA) { - return -FI_EBADFLAGS; - } - - cbf = ((!(u_ep->ep.rx_op_flags & FI_SELECTIVE_COMPLETION)) - || (flags & FI_COMPLETION)) ? 
- mlx_recv_callback : mlx_recv_callback_no_compl; - - if (msg->iov_count == 1) { - status = ucp_tag_recv_nb(u_ep->worker, msg->msg_iov[0].iov_base, - msg->msg_iov[0].iov_len, - ucp_dt_make_contig(1), - msg->tag, (~(msg->ignore)), cbf); - } else { - return -FI_EINVAL; /*Do not return IOV for a while*/ - } - - if (UCS_PTR_IS_ERR(status)) { - FI_DBG( &mlx_prov,FI_LOG_CORE, - "Send operation returns error: %s", - ucs_status_string(*(ucs_status_t*)status)); - return MLX_TRANSLATE_ERRCODE(*(ucs_status_t*)status); - } - - req = (struct mlx_request *)status; - cq = u_ep->ep.rx_cq; - req->cq = cq; - req->ep =u_ep; - - if (msg->context) { - struct fi_context *_ctx = - ((struct fi_context *)(msg->context)); - _ctx->internal[0] = (void*)req; - } - req->completion.tagged.op_context = msg->context; - req->completion.tagged.flags = FI_RECV; - req->completion.tagged.buf = msg->msg_iov[0].iov_base; - req->completion.tagged.data = 0; - - if (req->type == MLX_FI_REQ_UNINITIALIZED) { - req->type = MLX_FI_REQ_REGULAR; - req->completion.tagged.tag = msg->tag; - req->completion.tagged.len = msg->msg_iov[0].iov_len; - goto fence; - } - - /*Unexpected path*/ - struct fi_cq_tagged_entry *t_entry; - fastlock_acquire(&cq->cq_lock); - t_entry = ofi_cirque_tail(cq->cirq); - *t_entry = (req->completion.tagged); - - if (req->type == MLX_FI_REQ_UNEXPECTED_ERR) { - struct util_cq_oflow_err_entry* err; - req->completion.error.olen -= req->completion.tagged.len; - t_entry->flags |= UTIL_FLAG_ERROR; - - err = calloc(1, sizeof(struct util_cq_oflow_err_entry)); - if (!err) { - FI_WARN(&mlx_prov, FI_LOG_CQ, - "out of memory, cannot report CQ error\n"); - fastlock_release(&cq->cq_lock); - return -FI_ENOMEM; - } - err->comp = (req->completion.error); - slist_insert_tail(&err->list_entry, &cq->oflow_err_list); - } - - ofi_cirque_commit(cq->cirq); - fastlock_release(&cq->cq_lock); - -fence: - if (flags & FI_FENCE) { - ucs_status_t cstatus; - cstatus = ucp_worker_flush(u_ep->worker); - if (status != UCS_OK) - return MLX_TRANSLATE_ERRCODE(cstatus); - } - return FI_SUCCESS; -} - -static ssize_t mlx_tagged_sendmsg( - struct fid_ep *ep, - const struct fi_msg_tagged *msg, - uint64_t flags) -{ - struct mlx_ep* u_ep; - ucp_send_callback_t cbf; - ucp_ep_h dst_ep; - ucs_status_ptr_t status = NULL; - ucs_status_t cstatus; - struct util_cq *cq; - ucp_tag_recv_info_t info; - - u_ep = container_of(ep, struct mlx_ep, ep.ep_fid); - dst_ep = __mlx_get_dstep_from_fi_addr(u_ep, msg->addr); - cq = u_ep->ep.tx_cq; - - if(flags & FI_REMOTE_CQ_DATA) { - return -FI_EBADFLAGS; - } - - cbf = ((!(u_ep->ep.tx_op_flags & FI_SELECTIVE_COMPLETION)) - || (flags & FI_COMPLETION)) ? 
- mlx_send_callback : mlx_send_callback_no_compl; - if (msg->iov_count == 1) { - if (flags & FI_TRANSMIT_COMPLETE) { - status = ucp_tag_send_sync_nb ( - dst_ep, - msg->msg_iov[0].iov_base, - msg->msg_iov[0].iov_len, - ucp_dt_make_contig(1), - msg->tag, cbf); - } else { - status = ucp_tag_send_nb( - dst_ep, - msg->msg_iov[0].iov_base, - msg->msg_iov[0].iov_len, - ucp_dt_make_contig(1), - msg->tag, cbf); - } - } else { - return -FI_EINVAL; /*Do not return IOV for a while*/ - } - - if (UCS_PTR_IS_ERR(status)) { - FI_DBG( &mlx_prov,FI_LOG_CORE, - "Send operation returns error: %s", - ucs_status_string(*(ucs_status_t*)status)); - return MLX_TRANSLATE_ERRCODE(*(ucs_status_t*)status); - } - - if ((flags & FI_INJECT) && (UCS_PTR_STATUS(status) == UCS_OK)) { - while (ucp_request_test(status, &info) != UCS_INPROGRESS) - ucp_worker_progress(u_ep->worker); - goto fence; - } - - if((u_ep->ep.tx_op_flags & FI_SELECTIVE_COMPLETION) - && !(flags & FI_COMPLETION)) { - goto fence; - } - - if (msg->context) { - struct fi_context* _ctx = - ((struct fi_context*)(msg->context)); - _ctx->internal[0] = status; - } - - if (UCS_PTR_STATUS(status) != UCS_OK) { - struct mlx_request *req; - req = (struct mlx_request *) status; - req->cq = cq; - req->ep = u_ep; - req->type = MLX_FI_REQ_REGULAR; - req->completion.tagged.op_context = msg->context; - req->completion.tagged.flags = FI_SEND; - req->completion.tagged.len = msg->msg_iov[0].iov_len; - req->completion.tagged.buf = msg->msg_iov[0].iov_base; - req->completion.tagged.data = 0; - req->completion.tagged.tag = msg->tag; - } else { - struct fi_cq_tagged_entry *t_entry; - fastlock_acquire(&cq->cq_lock); - t_entry = ofi_cirque_tail(cq->cirq); - t_entry->op_context = msg->context; - t_entry->flags = FI_SEND; - t_entry->len = msg->msg_iov[0].iov_len; - t_entry->buf = msg->msg_iov[0].iov_base; - t_entry->data = 0; - t_entry->tag = msg->tag; - ofi_cirque_commit(cq->cirq); - fastlock_release(&cq->cq_lock); - } - -fence: - if(flags & FI_FENCE) { - cstatus = ucp_worker_flush(u_ep->worker); - if(status != UCS_OK) { - return MLX_TRANSLATE_ERRCODE(cstatus); - } - } - return FI_SUCCESS; -} - - -static ssize_t mlx_tagged_inject( - struct fid_ep *ep, const void *buf, size_t len, - fi_addr_t dest_addr, uint64_t tag) -{ - struct mlx_ep* u_ep; - ucp_ep_h dst_ep; - ucs_status_ptr_t status = NULL; - ucp_tag_recv_info_t info; - - u_ep = container_of(ep, struct mlx_ep, ep.ep_fid); - dst_ep = __mlx_get_dstep_from_fi_addr(u_ep, dest_addr); - - status = ucp_tag_send_nb(dst_ep, buf, len, - ucp_dt_make_contig(1), - tag, mlx_send_callback_no_compl); - if (UCS_PTR_STATUS(status) == UCS_OK) - return FI_SUCCESS; - - if (UCS_PTR_IS_ERR(status)) { - FI_DBG( &mlx_prov,FI_LOG_CORE, - "Send operation returns error: %s", - ucs_status_string(*(ucs_status_t*)status)); - return MLX_TRANSLATE_ERRCODE(*(ucs_status_t*)status); - } - - /* `info` is left unitialized, because this is send operation */ - while (ucp_request_test(status, &info) != UCS_INPROGRESS) - ucp_worker_progress(u_ep->worker); - - return FI_SUCCESS; -} - -static ssize_t mlx_tagged_send( - struct fid_ep *ep, const void *buf, - size_t len, void *desc, - fi_addr_t dest_addr, - uint64_t tag, void *context) -{ - struct iovec iov = { - .iov_base = (void*)buf, - .iov_len = len, - }; - - struct fi_msg_tagged msg = { - .msg_iov = &iov, - .desc = desc, - .iov_count = 1, - .addr = dest_addr, - .tag = tag, - .context = context, - }; - - return mlx_tagged_sendmsg( ep, &msg, 0); -} - -static ssize_t mlx_tagged_sendv( - struct fid_ep *ep, const struct iovec 
*iov, - void **desc, - size_t count, fi_addr_t dest_addr, - uint64_t tag, void *context) -{ - struct fi_msg_tagged msg = { - .msg_iov = iov, - .desc = desc, - .iov_count = count, - .addr = dest_addr, - .tag = tag, - .context = context, - }; - - return mlx_tagged_sendmsg( ep, &msg, 0); -} - -static ssize_t mlx_tagged_recvv( - struct fid_ep *ep, const struct iovec *iov, void **desc, - size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context) -{ - struct fi_msg_tagged msg = { - .msg_iov = iov, - .desc = desc, - .iov_count = count, - .addr = src_addr, - .tag = tag, - .ignore = ignore, - .context = context, - }; - return mlx_tagged_recvmsg(ep, &msg, 0); -} - -static ssize_t mlx_tagged_recv( - struct fid_ep *ep, void *buf, size_t len, void *desc, - fi_addr_t src_addr, - uint64_t tag, - uint64_t ignore, - void *context) -{ - struct iovec iov = { - .iov_base = buf, - .iov_len = len, - }; - - struct fi_msg_tagged msg = { - .msg_iov = &iov, - .desc = desc, - .iov_count = 1, - .addr = src_addr, - .tag = tag, - .ignore = ignore, - .context = context, - }; - return mlx_tagged_recvmsg(ep, &msg, 0); -} - -struct fi_ops_tagged mlx_tagged_ops = { - .size = sizeof(struct fi_ops_tagged), - .recv = mlx_tagged_recv, - .recvv = mlx_tagged_recvv, - .recvmsg = mlx_tagged_recvmsg, - .send = mlx_tagged_send, - .senddata = fi_no_tagged_senddata, - .sendv = mlx_tagged_sendv, - .inject = mlx_tagged_inject, - .sendmsg = mlx_tagged_sendmsg, - .injectdata = fi_no_tagged_injectdata, -}; - diff --git a/prov/mrail/src/TODO b/prov/mrail/src/TODO deleted file mode 100644 index ec6f16658be..00000000000 --- a/prov/mrail/src/TODO +++ /dev/null @@ -1,28 +0,0 @@ -TODO: ------ - -Feature / issue Status ----------------------------------------------------- -CQ in-progress -AV in-progress -EP in-progress -small msgs (un-ordered) in-progress -support for multiple layering - -(above is needed for multi-rail - over ofi_rxm provider) -OFI_MULTI_RAIL env var - -fi_dupinfo issue - -App mode bit to make it aware - -of list of rails in fi_info -addressing: - - - FI_ADDR_STRV - - primary/failover -small msg ordering: - - - bounce buffers -large msg support: - - - use FI_VARIABLE_MSG -Memory registration - -RMA - -rail failure handling - -rail selection / striping algorithm - -Atomics - diff --git a/prov/mrail/src/mrail.h b/prov/mrail/src/mrail.h index 3b4e73e1b8b..6af26046687 100644 --- a/prov/mrail/src/mrail.h +++ b/prov/mrail/src/mrail.h @@ -54,9 +54,6 @@ #include #include -#define MRAIL_MAJOR_VERSION 1 -#define MRAIL_MINOR_VERSION 0 - #define MRAIL_MAX_INFO 100 #define MRAIL_PASSTHRU_TX_OP_FLAGS (FI_INJECT_COMPLETE | \ @@ -218,7 +215,7 @@ struct mrail_recv { uint64_t ignore; struct mrail_rndv_recv rndv; }; -DECLARE_FREESTACK(struct mrail_recv, mrail_recv_fs); +OFI_DECLARE_FREESTACK(struct mrail_recv, mrail_recv_fs); int mrail_cq_process_buf_recv(struct fi_cq_tagged_entry *comp, struct mrail_recv *recv); @@ -319,8 +316,8 @@ mrail_pop_recv(struct mrail_ep *mrail_ep) { struct mrail_recv *recv; ofi_ep_lock_acquire(&mrail_ep->util_ep); - recv = freestack_isempty(mrail_ep->recv_fs) ? NULL : - freestack_pop(mrail_ep->recv_fs); + recv = ofi_freestack_isempty(mrail_ep->recv_fs) ? 
NULL : + ofi_freestack_pop(mrail_ep->recv_fs); ofi_ep_lock_release(&mrail_ep->util_ep); return recv; } @@ -329,7 +326,7 @@ static inline void mrail_push_recv(struct mrail_recv *recv) { ofi_ep_lock_acquire(&recv->ep->util_ep); - freestack_push(recv->ep->recv_fs, recv); + ofi_freestack_push(recv->ep->recv_fs, recv); ofi_ep_lock_release(&recv->ep->util_ep); } diff --git a/prov/mrail/src/mrail_attr.c b/prov/mrail/src/mrail_attr.c index 18870515390..5b9d81413ec 100644 --- a/prov/mrail/src/mrail_attr.c +++ b/prov/mrail/src/mrail_attr.c @@ -94,7 +94,7 @@ struct fi_domain_attr mrail_domain_attr = { }; struct fi_fabric_attr mrail_fabric_attr = { - .prov_version = FI_VERSION(MRAIL_MAJOR_VERSION, MRAIL_MINOR_VERSION), + .prov_version = OFI_VERSION_DEF_PROV, .name = "ofi_mrail_fabric", }; diff --git a/prov/mrail/src/mrail_av.c b/prov/mrail/src/mrail_av.c index af2d7a91db9..f4d53ae29c4 100644 --- a/prov/mrail/src/mrail_av.c +++ b/prov/mrail/src/mrail_av.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2018-2020 Intel Corporation, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -141,7 +141,6 @@ static int mrail_av_insert(struct fid_av *av_fid, const void *addr, size_t count if (ret) { FI_WARN(&mrail_prov, FI_LOG_AV, \ "Unable to get rail fi_addr\n"); - index = FI_ADDR_NOTAVAIL; } else { assert(index == index_rail0); num_inserted++; @@ -195,6 +194,7 @@ int mrail_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, util_attr.addrlen = sizeof(struct mrail_peer_info); /* We just need a table to store the mapping */ util_attr.flags = 0; + util_attr.context_len = 0; if (attr->type == FI_AV_UNSPEC) attr->type = FI_AV_TABLE; diff --git a/prov/mrail/src/mrail_cq.c b/prov/mrail/src/mrail_cq.c index 481ca14ffee..4aab096eb70 100644 --- a/prov/mrail/src/mrail_cq.c +++ b/prov/mrail/src/mrail_cq.c @@ -387,6 +387,7 @@ static void mrail_save_ooo_recv(struct mrail_ep *mrail_ep, if (!ooo_recv) { FI_WARN(&mrail_prov, FI_LOG_CQ, "Cannot allocate ooo_recv\n"); assert(0); + return; } ooo_recv->entry.next = NULL; ooo_recv->seq_no = seq_no; diff --git a/prov/mrail/src/mrail_domain.c b/prov/mrail/src/mrail_domain.c index 4e46bf7e9fa..ded83ef287d 100644 --- a/prov/mrail/src/mrail_domain.c +++ b/prov/mrail/src/mrail_domain.c @@ -355,6 +355,7 @@ static struct fi_ops_domain mrail_domain_ops = { .stx_ctx = fi_no_stx_context, .srx_ctx = fi_no_srx_context, .query_atomic = fi_no_query_atomic, + .query_collective = fi_no_query_collective, }; int mrail_domain_open(struct fid_fabric *fabric, struct fi_info *info, diff --git a/prov/mrail/src/mrail_ep.c b/prov/mrail/src/mrail_ep.c index deaad5c8eca..cae3c004a8e 100644 --- a/prov/mrail/src/mrail_ep.c +++ b/prov/mrail/src/mrail_ep.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2018-2019 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -533,7 +534,7 @@ mrail_send_common(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, if (total_len < mrail_ep->rails[rail].info->tx_attr->inject_size) flags |= FI_INJECT; - FI_DBG(&mrail_prov, FI_LOG_EP_DATA, "Posting send of length: %" PRIu64 + FI_DBG(&mrail_prov, FI_LOG_EP_DATA, "Posting send of length: %zu" " dest_addr: 0x%" PRIx64 " tag: 0x%" PRIx64 " seq: %d" " on rail: %d\n", len, dest_addr, tag, peer_info->seq_no - 1, rail); diff --git a/prov/mrail/src/mrail_init.c b/prov/mrail/src/mrail_init.c index bc58b2874ac..5ebad48634e 100644 --- a/prov/mrail/src/mrail_init.c +++ b/prov/mrail/src/mrail_init.c @@ -101,17 +101,26 @@ static int mrail_parse_env_vars(void) mrail_num_config = i; } - fi_param_define(&mrail_prov, "addr_strc", FI_PARAM_STRING, "List of rail" - " addresses of format FI_ADDR_STR delimited by comma"); - ret = fi_param_get_str(&mrail_prov, "addr_strc", &addr_strc); + fi_param_define(&mrail_prov, "addr_strc", FI_PARAM_STRING, "Deprecated. " + "Replaced by FI_OFI_MRAIL_ADDR."); + + fi_param_define(&mrail_prov, "addr", FI_PARAM_STRING, "Comma separated list " + "of rail addresses (FI_ADDR_STR, host name, IP address, or " + "netdev interface name)"); + + ret = fi_param_get_str(&mrail_prov, "addr", &addr_strc); + if (ret) + ret = fi_param_get_str(&mrail_prov, "addr_strc", &addr_strc); if (ret) { - FI_WARN(&mrail_prov, FI_LOG_CORE, "Unable to read " - "OFI_MRAIL_ADDR_STRC env variable\n"); + FI_INFO(&mrail_prov, FI_LOG_CORE, "unable to read " + "FI_OFI_MRAIL_ADDR env variable\n"); return ret; } mrail_addr_strv = mrail_split_addr_strc(addr_strc); - if (!mrail_addr_strv) + if (!mrail_addr_strv) { + FI_WARN(&mrail_prov, FI_LOG_CORE, "unable to alloc memory\n"); return -FI_ENOMEM; + } /* * Local rank is used to set the default tx rail when fixed mapping @@ -271,10 +280,13 @@ static int mrail_get_core_info(uint32_t version, const char *node, const char *s size_t i; int ret = 0; int num_rails; + enum fi_log_level level = ((hints && hints->fabric_attr && + hints->fabric_attr->prov_name) ? + FI_LOG_WARN : FI_LOG_INFO); if (!mrail_addr_strv) { - FI_WARN(&mrail_prov, FI_LOG_FABRIC, - "OFI_MRAIL_ADDR_STRC env variable not set!\n"); + FI_LOG(&mrail_prov, level, FI_LOG_FABRIC, + "OFI_MRAIL_ADDR_STRC env variable not set!\n"); return -FI_ENODATA; } @@ -318,7 +330,36 @@ static int mrail_get_core_info(uint32_t version, const char *node, const char *s FI_DBG(&mrail_prov, FI_LOG_CORE, "--- Begin fi_getinfo for rail: %zd ---\n", i); - ret = fi_getinfo(version, NULL, NULL, OFI_GETINFO_INTERNAL, core_hints, &rail_info[i]); + if (!hints || !hints->caps) { + struct fi_info *tmp_info = NULL; + uint64_t saved_core_hints_caps = core_hints->caps; + /* + * Get the default caps that would be returned for empty + * hints, otherwise the returned caps would only contain + * those specifed in the hints (FI_SOURCE) and secondary + * capabilities. 
+ */ + core_hints->caps = 0; + ret = fi_getinfo(version, NULL, NULL, + OFI_GETINFO_INTERNAL, core_hints, + &tmp_info); + if (tmp_info) { + core_hints->caps = tmp_info->caps | + saved_core_hints_caps; + fi_freeinfo(tmp_info); + } else { + core_hints->caps = saved_core_hints_caps; + } + + ret = fi_getinfo(version, NULL, NULL, + OFI_GETINFO_INTERNAL, core_hints, + &rail_info[i]); + core_hints->caps = saved_core_hints_caps; + } else { + ret = fi_getinfo(version, NULL, NULL, + OFI_GETINFO_INTERNAL, core_hints, + &rail_info[i]); + } FI_DBG(&mrail_prov, FI_LOG_CORE, "--- End fi_getinfo for rail: %zd ---\n", i); @@ -385,8 +426,7 @@ static struct fi_info *mrail_get_prefix_info(struct fi_info *core_info, int id) fi->ep_attr->protocol = mrail_info.ep_attr->protocol; fi->ep_attr->protocol_version = mrail_info.ep_attr->protocol_version; - fi->fabric_attr->prov_version = FI_VERSION(MRAIL_MAJOR_VERSION, - MRAIL_MINOR_VERSION); + fi->fabric_attr->prov_version = OFI_VERSION_DEF_PROV; fi->domain_attr->mr_key_size = (num_rails * sizeof(struct mrail_addr_key)); fi->domain_attr->mr_mode |= FI_MR_RAW; @@ -484,8 +524,8 @@ static void mrail_fini(void) struct fi_provider mrail_prov = { .name = OFI_UTIL_PREFIX "mrail", - .version = FI_VERSION(MRAIL_MAJOR_VERSION, MRAIL_MINOR_VERSION), - .fi_version = FI_VERSION(1, 8), + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, .getinfo = mrail_getinfo, .fabric = mrail_fabric_open, .cleanup = mrail_fini diff --git a/prov/netdir/src/netdir.h b/prov/netdir/src/netdir.h index 347dc241e53..446faab3d13 100644 --- a/prov/netdir/src/netdir.h +++ b/prov/netdir/src/netdir.h @@ -48,8 +48,6 @@ extern "C" { #endif /* __cplusplus */ -#define OFI_ND_MAJOR_VERSION 1 -#define OFI_ND_MINOR_VERSION 0 #define ND_MSG_IOV_LIMIT (256) #define ND_MSG_INTERNAL_IOV_LIMIT (512) @@ -181,9 +179,9 @@ static inline int ofi_nd_hresult_2_fierror(HRESULT hr) #define OFI_ND_TIMEOUT_INIT(timeout) \ uint64_t sfinish = ((timeout) >= 0) ? \ - (fi_gettime_ms() + (timeout) * 10000) : -1; + (ofi_gettime_ms() + (timeout) * 10000) : -1; -#define OFI_ND_TIMEDOUT() ((sfinish > 0) ? fi_gettime_ms() >= sfinish : 0) +#define OFI_ND_TIMEDOUT() ((sfinish > 0) ? 
ofi_gettime_ms() >= sfinish : 0) #ifdef ENABLE_DEBUG # define NODEFAULT assert(0) diff --git a/prov/netdir/src/netdir_init.c b/prov/netdir/src/netdir_init.c index 1af1eca194b..8f718fb9307 100644 --- a/prov/netdir/src/netdir_init.c +++ b/prov/netdir/src/netdir_init.c @@ -47,8 +47,8 @@ const char ofi_nd_prov_name[] = "netdir"; struct fi_provider ofi_nd_prov = { .name = ofi_nd_prov_name, - .version = FI_VERSION(OFI_ND_MAJOR_VERSION, OFI_ND_MINOR_VERSION), - .fi_version = FI_VERSION(1, 8), + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, .getinfo = ofi_nd_getinfo, .fabric = ofi_nd_fabric, .cleanup = ofi_nd_fini @@ -138,7 +138,7 @@ static int ofi_nd_adapter_cb(const ND2_ADAPTER_INFO *adapter, const char *name) info->domain_attr->mr_cnt = OFI_ND_MAX_MR_CNT; info->fabric_attr->name = strdup(ofi_nd_prov_name); - info->fabric_attr->prov_version = FI_VERSION(OFI_ND_MAJOR_VERSION, OFI_ND_MINOR_VERSION); + info->fabric_attr->prov_version = OFI_VERSION_DEF_PROV; info->caps = OFI_ND_EP_CAPS | OFI_ND_DOMAIN_CAPS; info->addr_format = FI_SOCKADDR; diff --git a/prov/psm/src/psmx.h b/prov/psm/src/psmx.h index 85bebc68216..560c1145226 100644 --- a/prov/psm/src/psmx.h +++ b/prov/psm/src/psmx.h @@ -76,7 +76,6 @@ extern struct fi_provider psmx_prov; extern int psmx_am_compat_mode; -#define PSMX_VERSION (FI_VERSION(1, 8)) #define PSMX_OP_FLAGS (FI_INJECT | FI_MULTI_RECV | FI_COMPLETION | \ FI_TRIGGER | FI_INJECT_COMPLETE | \ @@ -288,7 +287,7 @@ struct psmx_fid_domain { * purpose. The tag-matching functions automatically treat these bits * as 0. This field is a bit mask, with reserved bits valued as "1". */ - uint64_t reserved_tag_bits; + uint64_t reserved_tag_bits; /* lock to prevent the sequence of psm_mq_ipeek and psm_mq_test be * interleaved in a multithreaded environment. 
@@ -535,7 +534,7 @@ struct psmx_fid_mr { uint64_t flags; uint64_t offset; size_t iov_count; - struct iovec iov[0]; /* must be the last field */ + struct iovec iov[]; /* must be the last field */ }; struct psmx_epaddr_context { diff --git a/prov/psm/src/psmx_domain.c b/prov/psm/src/psmx_domain.c index 277df1de7cc..d5e67a945a5 100644 --- a/prov/psm/src/psmx_domain.c +++ b/prov/psm/src/psmx_domain.c @@ -246,6 +246,7 @@ static struct fi_ops_domain psmx_domain_ops = { .stx_ctx = psmx_stx_ctx, .srx_ctx = fi_no_srx_context, .query_atomic = psmx_query_atomic, + .query_collective = fi_no_query_collective, }; static int psmx_key_compare(void *key1, void *key2) diff --git a/prov/psm/src/psmx_fabric.c b/prov/psm/src/psmx_fabric.c index f99d24b9ba8..0ce4c8fad14 100644 --- a/prov/psm/src/psmx_fabric.c +++ b/prov/psm/src/psmx_fabric.c @@ -79,7 +79,7 @@ static struct fi_ops_fabric psmx_fabric_ops = { static struct fi_fabric_attr psmx_fabric_attr = { .name = PSMX_FABRIC_NAME, - .prov_version = PSMX_VERSION, + .prov_version = OFI_VERSION_DEF_PROV, }; int psmx_fabric(struct fi_fabric_attr *attr, diff --git a/prov/psm/src/psmx_init.c b/prov/psm/src/psmx_init.c index 3e0a89e70e7..c0a93f44a5b 100644 --- a/prov/psm/src/psmx_init.c +++ b/prov/psm/src/psmx_init.c @@ -639,7 +639,7 @@ static int psmx_getinfo(uint32_t version, const char *node, const char *service, psmx_info->dest_addrlen = sizeof(*dest_addr); psmx_info->fabric_attr->name = strdup(PSMX_FABRIC_NAME); psmx_info->fabric_attr->prov_name = NULL; - psmx_info->fabric_attr->prov_version = PSMX_VERSION; + psmx_info->fabric_attr->prov_version = OFI_VERSION_DEF_PROV; psmx_info->tx_attr->caps = psmx_info->caps; psmx_info->tx_attr->mode = psmx_info->mode; @@ -696,8 +696,8 @@ static void psmx_fini(void) struct fi_provider psmx_prov = { .name = PSMX_PROV_NAME, - .version = PSMX_VERSION, - .fi_version = PSMX_VERSION, + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, .getinfo = psmx_getinfo, .fabric = psmx_fabric, .cleanup = psmx_fini diff --git a/prov/psm2/Makefile.include b/prov/psm2/Makefile.include index 7e4783ceb41..993d3eea977 100644 --- a/prov/psm2/Makefile.include +++ b/prov/psm2/Makefile.include @@ -22,6 +22,9 @@ _psm2_files = \ prov/psm2/src/psmx2_wait.c \ prov/psm2/src/psmx2_util.c +_psm2_cppflags = \ + -I$(top_srcdir)/prov/psm2/include + if HAVE_PSM2_SRC _psm2_files += \ prov/psm2/src/psm2_revision.c @@ -97,7 +100,7 @@ _psm2_nodist_files += \ prov/psm2/src/psm2/opa/opa_dwordcpy-x86_64-fast.S endif -_psm2_cppflags = \ +_psm2_cppflags += \ -I$(top_srcdir)/prov/psm2/src/psm2 \ -I$(top_srcdir)/prov/psm2/src/psm2/include \ -I$(top_srcdir)/prov/psm2/src/psm2/include/linux-i386 \ @@ -132,6 +135,7 @@ src_libfabric_la_LIBADD += libpsmx2.la src_libfabric_la_DEPENDENCIES += libpsmx2.la endif !HAVE_PSM2_DL +rdmainclude_HEADERS += prov/psm2/include/fi_ext_psm2.h prov_install_man_pages += man/man7/fi_psm2.7 endif HAVE_PSM2 diff --git a/prov/psm2/configure.m4 b/prov/psm2/configure.m4 index 3b7b6e95bb6..f46d5d3c035 100644 --- a/prov/psm2/configure.m4 +++ b/prov/psm2/configure.m4 @@ -9,7 +9,7 @@ dnl $2: action if not configured successfully dnl AC_DEFUN([FI_PSM2_CONFIGURE],[ # Determine if we can support the psm2 provider - psm2_ARCH=`uname -m | sed -e 's,\(i[456]86\|athlon$$\),i386,'` + psm2_ARCH=$host_cpu AM_CONDITIONAL([HAVE_PSM2_X86_64], [test x$psm2_ARCH = xx86_64]) AC_SUBST([HAVE_PSM2_X86_64]) AC_SUBST([psm2_ARCH]) diff --git a/prov/efa/src/efa_verbs/efa_verbs.h b/prov/psm2/include/fi_ext_psm2.h similarity index 74% rename from 
prov/efa/src/efa_verbs/efa_verbs.h rename to prov/psm2/include/fi_ext_psm2.h index 4d992cd77c4..3a48d83e17f 100644 --- a/prov/efa/src/efa_verbs/efa_verbs.h +++ b/prov/psm2/include/fi_ext_psm2.h @@ -1,11 +1,11 @@ /* - * Copyright (c) 2017-2018 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright (c) 2020 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: + * BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following @@ -30,19 +30,18 @@ * SOFTWARE. */ -#ifndef EFA_VERBS_H -#define EFA_VERBS_H +#ifndef FI_EXT_PSM2_H +#define FI_EXT_PSM2_H -#include -#include +#ifdef __cplusplus +extern "C" { +#endif -#include "efa-abi.h" -#include "efa_cmd.h" +/* Provider specific name for fi_set_val() / fi_get_val() */ +#define FI_PSM2_DISCONNECT (1U | FI_PROV_SPECIFIC) -int efa_device_init(void); -void efa_device_free(void); +#ifdef __cplusplus +} +#endif -struct efa_context **efa_device_get_context_list(int *num_ctx); -void efa_device_free_context_list(struct efa_context **list); - -#endif /* EFA_VERBS_H */ +#endif /* FI_EXT_PSM2_H */ diff --git a/prov/psm2/src/psmx2.h b/prov/psm2/src/psmx2.h index 655636bb6ea..da5b6cfe67b 100644 --- a/prov/psm2/src/psmx2.h +++ b/prov/psm2/src/psmx2.h @@ -72,6 +72,7 @@ extern "C" { #include "ofi_mem.h" #include "rbtree.h" #include "version.h" +#include "fi_ext_psm2.h" #ifdef FABRIC_DIRECT_ENABLED #define DIRECT_FN __attribute__((visibility ("default"))) @@ -83,31 +84,27 @@ extern "C" { extern struct fi_provider psmx2_prov; -#define PSMX2_VERSION (FI_VERSION(1, 8)) #define PSMX2_OP_FLAGS (FI_INJECT | FI_MULTI_RECV | FI_COMPLETION | \ FI_TRIGGER | FI_INJECT_COMPLETE | \ FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE) -#define PSMX2_PRI_CAPS (FI_TAGGED | FI_MSG | FI_RMA | FI_ATOMICS | \ - FI_NAMED_RX_CTX | FI_DIRECTED_RECV | \ - FI_SEND | FI_RECV | FI_READ | FI_WRITE | \ - FI_REMOTE_READ | FI_REMOTE_WRITE) - -#define PSMX2_SEC_CAPS (FI_MULTI_RECV | FI_SOURCE | FI_RMA_EVENT | \ - FI_TRIGGER | FI_LOCAL_COMM | FI_REMOTE_COMM | \ - FI_SOURCE_ERR | FI_SHARED_AV) - -#define PSMX2_CAPS (PSMX2_PRI_CAPS | PSMX2_SEC_CAPS | FI_REMOTE_CQ_DATA) +#define PSMX2_TX_CAPS (OFI_TX_MSG_CAPS | FI_TAGGED | OFI_TX_RMA_CAPS | FI_ATOMICS | \ + FI_NAMED_RX_CTX | FI_TRIGGER) +#define PSMX2_RX_CAPS (FI_SOURCE | FI_SOURCE_ERR | FI_RMA_EVENT | OFI_RX_MSG_CAPS | \ + FI_TAGGED | OFI_RX_RMA_CAPS | FI_ATOMICS | FI_DIRECTED_RECV | \ + FI_MULTI_RECV | FI_TRIGGER) +#define PSMX2_DOM_CAPS (FI_SHARED_AV | FI_LOCAL_COMM | FI_REMOTE_COMM) +#define PSMX2_CAPS (PSMX2_TX_CAPS | PSMX2_RX_CAPS | PSMX2_DOM_CAPS) -#define PSMX2_RMA_CAPS (PSMX2_CAPS & ~(FI_TAGGED | FI_MSG | FI_SEND | \ - FI_RECV | FI_DIRECTED_RECV | FI_MULTI_RECV)) +#define PSMX2_RMA_TX_CAPS (PSMX2_TX_CAPS & ~(FI_TAGGED | FI_MSG | FI_SEND)) +#define PSMX2_RMA_RX_CAPS (PSMX2_RX_CAPS & ~(FI_TAGGED | FI_MSG | FI_RECV | \ + FI_DIRECTED_RECV | FI_MULTI_RECV)) +#define PSMX2_RMA_CAPS (PSMX2_RMA_TX_CAPS | PSMX2_RMA_RX_CAPS | PSMX2_DOM_CAPS) #define PSMX2_SUB_CAPS (FI_SEND | FI_RECV | FI_READ | FI_WRITE | \ FI_REMOTE_READ | FI_REMOTE_WRITE) -#define PSMX2_DOM_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM) - #define PSMX2_ALL_TRX_CTXT ((void *)-1) #define PSMX2_MAX_MSG_SIZE ((0x1ULL << 32) - 
1) #define PSMX2_RMA_ORDER_SIZE (4096) @@ -479,7 +476,6 @@ struct psmx2_multi_recv { struct psmx2_fid_fabric { struct util_fabric util_fabric; - psm2_uuid_t uuid; struct util_ns name_server; /* list of all opened domains */ @@ -532,6 +528,8 @@ struct psmx2_trx_ctxt { ofi_atomic32_t poll_refcnt; int poll_active; + psm2_uuid_t uuid; + struct dlist_entry entry; }; @@ -544,6 +542,7 @@ struct psmx2_fid_domain { struct psmx2_fid_fabric *fabric; uint64_t mode; uint64_t caps; + psm2_uuid_t uuid; enum fi_mr_mode mr_mode; fastlock_t mr_lock; @@ -591,6 +590,11 @@ struct psmx2_fid_domain { psmx2_unlock_fn_t context_unlock_fn; psmx2_trylock_fn_t poll_trylock_fn; psmx2_unlock_fn_t poll_unlock_fn; + + /* parameters that can be set via domain_ops */ + struct { + int disconnect; + } params; }; #define PSMX2_EP_REGULAR 0 @@ -632,7 +636,8 @@ struct psmx2_cq_event { struct fi_cq_err_entry err; } cqe; int error; - int source_is_valid; + int8_t source_is_valid; + uint8_t source_sep_id; psm2_epaddr_t source; struct psmx2_fid_av *source_av; struct slist_entry list_entry; @@ -700,6 +705,7 @@ struct psmx2_av_addr { psm2_epid_t epid; uint8_t type; uint8_t sep_id; + uint8_t valid; }; struct psmx2_av_sep { @@ -769,6 +775,7 @@ struct psmx2_fid_ep { size_t min_multi_recv; uint32_t iov_seq_num; int service; + int sep_id; }; struct psmx2_sep_ctxt { @@ -827,9 +834,6 @@ struct psmx2_env { int prog_interval; char *prog_affinity; int multi_ep; - int max_trx_ctxt; - int free_trx_ctxt; - int num_devunits; int inject_size; int lock_level; int lazy_conn; @@ -839,6 +843,19 @@ struct psmx2_env { #endif }; +#define PSMX2_MAX_UNITS 4 +struct psmx2_hfi_info { + int max_trx_ctxt; + int free_trx_ctxt; + int num_units; + int num_active_units; + int active_units[PSMX2_MAX_UNITS]; + int unit_is_active[PSMX2_MAX_UNITS]; + int unit_nctxts[PSMX2_MAX_UNITS]; + int unit_nfreectxts[PSMX2_MAX_UNITS]; + char default_domain_name[PSMX2_MAX_UNITS * 8]; /* hfi1_0;hfi1_1;...;hfi1_n */ +}; + extern struct fi_ops_mr psmx2_mr_ops; extern struct fi_ops_cm psmx2_cm_ops; extern struct fi_ops_tagged psmx2_tagged_ops; @@ -863,6 +880,7 @@ extern struct fi_ops_msg psmx2_msg2_ops; extern struct fi_ops_rma psmx2_rma_ops; extern struct fi_ops_atomic psmx2_atomic_ops; extern struct psmx2_env psmx2_env; +extern struct psmx2_hfi_info psmx2_hfi_info; extern struct psmx2_fid_fabric *psmx2_active_fabric; /* @@ -979,7 +997,8 @@ int psmx2_domain_enable_ep(struct psmx2_fid_domain *domain, struct psmx2_fid_ep void psmx2_trx_ctxt_free(struct psmx2_trx_ctxt *trx_ctxt, int usage_flags); struct psmx2_trx_ctxt *psmx2_trx_ctxt_alloc(struct psmx2_fid_domain *domain, struct psmx2_ep_name *src_addr, - int sep_ctxt_idx, int usage_flags); + int sep_ctxt_idx, int usage_flags, + uint8_t *uuid); static inline int psmx2_ns_service_cmp(void *svc1, void *svc2) @@ -1013,7 +1032,7 @@ struct psmx2_cq_event *psmx2_cq_create_event(struct psmx2_fid_cq *cq, int psmx2_cq_poll_mq(struct psmx2_fid_cq *cq, struct psmx2_trx_ctxt *trx_ctxt, struct psmx2_cq_event *event, int count, fi_addr_t *src_addr); -int psmx2_epid_to_epaddr(struct psmx2_trx_ctxt *trx_ctxt, +void psmx2_epid_to_epaddr(struct psmx2_trx_ctxt *trx_ctxt, psm2_epid_t epid, psm2_epaddr_t *epaddr); int psmx2_av_add_trx_ctxt(struct psmx2_fid_av *av, struct psmx2_trx_ctxt *trx_ctxt); @@ -1033,7 +1052,6 @@ psm2_epaddr_t psmx2_av_translate_addr(struct psmx2_fid_av *av, psm2_epaddr_t epaddr; size_t idx; int ctxt; - int err; if (av_type == FI_AV_MAP) return (psm2_epaddr_t) addr; @@ -1041,7 +1059,7 @@ psm2_epaddr_t psmx2_av_translate_addr(struct 
psmx2_fid_av *av, av->domain->av_lock_fn(&av->lock, 1); idx = PSMX2_ADDR_IDX(addr); - assert(idx < av->hdr->last); + assert(idx < av->hdr->last && av->table[idx].valid); if (OFI_UNLIKELY(av->table[idx].type == PSMX2_EP_SCALABLE)) { if (OFI_UNLIKELY(!av->sep_info[idx].epids)) { @@ -1058,25 +1076,18 @@ psm2_epaddr_t psmx2_av_translate_addr(struct psmx2_fid_av *av, ctxt = PSMX2_ADDR_CTXT(addr, av->rx_ctx_bits); assert(ctxt < av->sep_info[idx].ctxt_cnt); - if (OFI_UNLIKELY(!av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt])) { - err = psmx2_epid_to_epaddr(trx_ctxt, - av->sep_info[idx].epids[ctxt], - &av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt]); - assert(!err); - } + if (OFI_UNLIKELY(!av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt])) + psmx2_epid_to_epaddr(trx_ctxt, + av->sep_info[idx].epids[ctxt], + &av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt]); epaddr = av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt]; } else { - if (OFI_UNLIKELY(!av->conn_info[trx_ctxt->id].epaddrs[idx])) { - err = psmx2_epid_to_epaddr(trx_ctxt, av->table[idx].epid, - &av->conn_info[trx_ctxt->id].epaddrs[idx]); - assert(!err); - } + if (OFI_UNLIKELY(!av->conn_info[trx_ctxt->id].epaddrs[idx])) + psmx2_epid_to_epaddr(trx_ctxt, av->table[idx].epid, + &av->conn_info[trx_ctxt->id].epaddrs[idx]); epaddr = av->conn_info[trx_ctxt->id].epaddrs[idx]; } -#ifdef NDEBUG - (void) err; -#endif av->domain->av_unlock_fn(&av->lock, 1); return epaddr; } @@ -1150,22 +1161,29 @@ static inline void psmx2_cntr_inc(struct psmx2_fid_cntr *cntr, int error) cntr->wait->signal(cntr->wait); } -fi_addr_t psmx2_av_translate_source(struct psmx2_fid_av *av, psm2_epaddr_t source); +fi_addr_t psmx2_av_translate_source(struct psmx2_fid_av *av, + psm2_epaddr_t source, int source_sep_id); -static inline void psmx2_get_source_name(psm2_epaddr_t source, struct psmx2_ep_name *name) +static inline void psmx2_get_source_name(psm2_epaddr_t source, + int source_sep_id, + struct psmx2_ep_name *name) { memset(name, 0, sizeof(*name)); psm2_epaddr_to_epid(source, &name->epid); - name->type = PSMX2_EP_REGULAR; + name->sep_id = source_sep_id; + name->type = source_sep_id ? PSMX2_EP_SCALABLE : PSMX2_EP_REGULAR; } -static inline void psmx2_get_source_string_name(psm2_epaddr_t source, char *name, size_t *len) +static inline void psmx2_get_source_string_name(psm2_epaddr_t source, + int source_sep_id, + char *name, size_t *len) { struct psmx2_ep_name ep_name; memset(&ep_name, 0, sizeof(ep_name)); psm2_epaddr_to_epid(source, &ep_name.epid); - ep_name.type = PSMX2_EP_REGULAR; + ep_name.sep_id = source_sep_id; + ep_name.type = source_sep_id ? PSMX2_EP_SCALABLE : PSMX2_EP_REGULAR; ofi_straddr(name, len, FI_ADDR_PSMX2, &ep_name); } @@ -1211,6 +1229,14 @@ static inline void psmx2_am_poll(struct psmx2_trx_ctxt *trx_ctxt) } } +static inline int psmx2_peer_match(struct dlist_entry *item, const void *arg) +{ + struct psmx2_epaddr_context *peer; + + peer = container_of(item, struct psmx2_epaddr_context, entry); + return (peer->epaddr == arg); +} + #ifdef __cplusplus } #endif diff --git a/prov/psm2/src/psmx2_atomic.c b/prov/psm2/src/psmx2_atomic.c index 108735f92f1..ba8a3876a6c 100644 --- a/prov/psm2/src/psmx2_atomic.c +++ b/prov/psm2/src/psmx2_atomic.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -808,6 +808,7 @@ ssize_t psmx2_atomic_write_generic(struct fid_ep *ep, psm2_epid_t psm2_epid; int am_flags = PSM2_AM_FLAG_ASYNC; int chunk_size, len; + int err; ep_priv = container_of(ep, struct psmx2_fid_ep, ep); @@ -873,9 +874,15 @@ ssize_t psmx2_atomic_write_generic(struct fid_ep *ep, args[3].u64 = key; args[4].u32w0 = datatype; args[4].u32w1 = op; - psm2_am_request_short(psm2_epaddr, - PSMX2_AM_ATOMIC_HANDLER, args, 5, - (void *)buf, len, am_flags, NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, + PSMX2_AM_ATOMIC_HANDLER, args, 5, + (void *)buf, len, am_flags, NULL, NULL); + if (err) { + free(req->tmpbuf); + psmx2_am_request_free(ep_priv->tx, req); + return psmx2_errno(err); + } + psmx2_am_poll(ep_priv->tx); return 0; } @@ -982,9 +989,15 @@ ssize_t psmx2_atomic_writev_generic(struct fid_ep *ep, args[3].u64 = key; args[4].u32w0 = datatype; args[4].u32w1 = op; - psm2_am_request_short(psm2_epaddr, - PSMX2_AM_ATOMIC_HANDLER, args, 5, - (void *)buf, len, am_flags, NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, + PSMX2_AM_ATOMIC_HANDLER, args, 5, + (void *)buf, len, am_flags, NULL, NULL); + if (err) { + free(req->tmpbuf); + psmx2_am_request_free(ep_priv->tx, req); + return psmx2_errno(err); + } + psmx2_am_poll(ep_priv->tx); return 0; } @@ -1097,6 +1110,7 @@ ssize_t psmx2_atomic_readwrite_generic(struct fid_ep *ep, psm2_epid_t psm2_epid; int am_flags = PSM2_AM_FLAG_ASYNC; int chunk_size, len; + int err; ep_priv = container_of(ep, struct psmx2_fid_ep, ep); @@ -1168,9 +1182,16 @@ ssize_t psmx2_atomic_readwrite_generic(struct fid_ep *ep, args[3].u64 = key; args[4].u32w0 = datatype; args[4].u32w1 = op; - psm2_am_request_short(psm2_epaddr, - PSMX2_AM_ATOMIC_HANDLER, args, 5, - (void *)buf, (buf?len:0), am_flags, NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, + PSMX2_AM_ATOMIC_HANDLER, args, 5, + (void *)buf, (buf?len:0), am_flags, NULL, + NULL); + if (err) { + free(req->tmpbuf); + psmx2_am_request_free(ep_priv->tx, req); + return psmx2_errno(err); + } + psmx2_am_poll(ep_priv->tx); return 0; } @@ -1341,9 +1362,16 @@ ssize_t psmx2_atomic_readwritev_generic(struct fid_ep *ep, args[3].u64 = key; args[4].u32w0 = datatype; args[4].u32w1 = op; - psm2_am_request_short(psm2_epaddr, - PSMX2_AM_ATOMIC_HANDLER, args, 5, - (void *)buf, (buf?len:0), am_flags, NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, + PSMX2_AM_ATOMIC_HANDLER, args, 5, + (void *)buf, (buf?len:0), am_flags, NULL, + NULL); + if (err) { + free(req->tmpbuf); + psmx2_am_request_free(ep_priv->tx, req); + return psmx2_errno(err); + } + psmx2_am_poll(ep_priv->tx); return 0; } @@ -1476,6 +1504,7 @@ ssize_t psmx2_atomic_compwrite_generic(struct fid_ep *ep, psm2_epid_t psm2_epid; int am_flags = PSM2_AM_FLAG_ASYNC; int chunk_size, len; + int err; ep_priv = container_of(ep, struct psmx2_fid_ep, ep); @@ -1548,10 +1577,16 @@ ssize_t psmx2_atomic_compwrite_generic(struct fid_ep *ep, args[3].u64 = key; args[4].u32w0 = datatype; args[4].u32w1 = op; - psm2_am_request_short(psm2_epaddr, - PSMX2_AM_ATOMIC_HANDLER, args, 5, - (void *)buf, len * 2, am_flags, - NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, + PSMX2_AM_ATOMIC_HANDLER, args, 5, + (void *)buf, len * 2, am_flags, + NULL, NULL); + if (err) { + free(req->tmpbuf); + psmx2_am_request_free(ep_priv->tx, req); + return psmx2_errno(err); + } + psmx2_am_poll(ep_priv->tx); return 0; } @@ -1745,9 +1780,15 @@ ssize_t psmx2_atomic_compwritev_generic(struct fid_ep *ep, args[3].u64 = key; args[4].u32w0 = 
datatype; args[4].u32w1 = op; - psm2_am_request_short(psm2_epaddr, - PSMX2_AM_ATOMIC_HANDLER, args, 5, - buf, len * 2, am_flags, NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, + PSMX2_AM_ATOMIC_HANDLER, args, 5, + buf, len * 2, am_flags, NULL, NULL); + if (err) { + free(req->tmpbuf); + psmx2_am_request_free(ep_priv->tx, req); + return psmx2_errno(err); + } + psmx2_am_poll(ep_priv->tx); return 0; } diff --git a/prov/psm2/src/psmx2_attr.c b/prov/psm2/src/psmx2_attr.c index 9f6ba0458bd..eedda590fb2 100644 --- a/prov/psm2/src/psmx2_attr.c +++ b/prov/psm2/src/psmx2_attr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -46,7 +46,7 @@ */ static struct fi_tx_attr psmx2_tx_attr = { - .caps = PSMX2_CAPS, /* PSMX2_RMA_CAPS */ + .caps = PSMX2_TX_CAPS, /* PSMX2_RMA_TX_CAPS */ .mode = FI_CONTEXT, /* 0 */ .op_flags = PSMX2_OP_FLAGS, .msg_order = PSMX2_MSG_ORDER, @@ -58,7 +58,7 @@ static struct fi_tx_attr psmx2_tx_attr = { }; static struct fi_rx_attr psmx2_rx_attr = { - .caps = PSMX2_CAPS, /* PSMX2_RMA_CAPS */ + .caps = PSMX2_RX_CAPS, /* PSMX2_RMA_RX_CAPS */ .mode = FI_CONTEXT, /* 0 */ .op_flags = PSMX2_OP_FLAGS, .msg_order = PSMX2_MSG_ORDER, @@ -72,9 +72,9 @@ static struct fi_ep_attr psmx2_ep_attr = { .type = FI_EP_RDM, /* FI_EP_DGRAM */ .protocol = FI_PROTO_PSMX2, .protocol_version = PSM2_VERNO, - .max_msg_size = PSMX2_MAX_MSG_SIZE, + .max_msg_size = PSMX2_MAX_MSG_SIZE & ~0x0FFF, .msg_prefix_size = 0, - .max_order_raw_size = PSMX2_MAX_MSG_SIZE, + .max_order_raw_size = PSMX2_RMA_ORDER_SIZE, .max_order_war_size = PSMX2_RMA_ORDER_SIZE, .max_order_waw_size = PSMX2_RMA_ORDER_SIZE, .mem_tag_format = FI_TAG_GENERIC, /* >>= 4 */ @@ -97,11 +97,11 @@ static struct fi_domain_attr psmx2_domain_attr = { .cq_data_size = 0, /* 4, 8 */ .cq_cnt = 65535, .ep_cnt = 65535, - .tx_ctx_cnt = 1, /* psmx2_env.free_trx_ctxt */ - .rx_ctx_cnt = 1, /* psmx2_env.free_trx_ctxt */ - .max_ep_tx_ctx = 1, /* psmx2_env.max_trx_ctxt */ - .max_ep_rx_ctx = 1, /* psmx2_env.max_trx_ctxt */ - .max_ep_stx_ctx = 1, /* psmx2_env.max_trx_ctxt */ + .tx_ctx_cnt = 1, /* psmx2_hfi_info.free_trx_ctxt */ + .rx_ctx_cnt = 1, /* psmx2_hfi_info.free_trx_ctxt */ + .max_ep_tx_ctx = 1, /* psmx2_hfi_info.max_trx_ctxt */ + .max_ep_rx_ctx = 1, /* psmx2_hfi_info.max_trx_ctxt */ + .max_ep_stx_ctx = 1, /* psmx2_hfi_info.max_trx_ctxt */ .max_ep_srx_ctx = 0, .cntr_cnt = 65535, .mr_iov_limit = 65535, @@ -115,7 +115,7 @@ static struct fi_domain_attr psmx2_domain_attr = { static struct fi_fabric_attr psmx2_fabric_attr = { .name = PSMX2_FABRIC_NAME, - .prov_version = PSMX2_VERSION, + .prov_version = OFI_VERSION_DEF_PROV, }; static struct fi_info psmx2_prov_info = { @@ -182,7 +182,7 @@ int psmx2_init_prov_info(const struct fi_info *hints, struct fi_info **info) } if (hints->domain_attr && hints->domain_attr->name && - strcasecmp(hints->domain_attr->name, domain_attr->name)) { + strncasecmp(hints->domain_attr->name, domain_attr->name, strlen(PSMX2_DOMAIN_NAME))) { FI_INFO(&psmx2_prov, FI_LOG_CORE, "Unknown domain name\n"); FI_INFO_NAME(&psmx2_prov, domain_attr, hints->domain_attr); return -FI_ENODATA; @@ -244,9 +244,9 @@ int psmx2_init_prov_info(const struct fi_info *hints, struct fi_info **info) info_new->ep_attr->type = ep_type; info_new->caps = PSMX2_RMA_CAPS; info_new->mode = 0; - info_new->tx_attr->caps = 
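/*
 * Illustrative sketch (not from the patch): the recurring change throughout
 * psmx2_atomic.c above is to stop ignoring the return value of
 * psm2_am_request_short() and instead release the request's bounce buffer and
 * the request itself, then return the translated error.  The stand-alone
 * program below shows only the shape of that cleanup path; request_alloc(),
 * post_short() and map_error() are hypothetical stand-ins, and the error code
 * mapping is simplified.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct request {
	void *tmpbuf;
};

/* Stand-ins for psmx2_am_request_alloc()/free() and psm2_am_request_short(). */
static struct request *request_alloc(void) { return calloc(1, sizeof(struct request)); }
static void request_free(struct request *req) { free(req); }
static int post_short(const void *buf, size_t len) { (void)buf; (void)len; return -1; /* simulate failure */ }
static int map_error(int transport_err) { (void)transport_err; return -EIO; }

static int send_op(const void *buf, size_t len)
{
	struct request *req = request_alloc();
	int err;

	if (!req)
		return -ENOMEM;

	req->tmpbuf = malloc(len);
	if (!req->tmpbuf) {
		request_free(req);
		return -ENOMEM;
	}
	memcpy(req->tmpbuf, buf, len);

	err = post_short(req->tmpbuf, len);
	if (err) {
		/* Same order as the patch: drop the bounce buffer, then the request. */
		free(req->tmpbuf);
		request_free(req);
		return map_error(err);
	}

	/* On success the request would stay alive until completion and be
	 * freed there (as in the real provider); the stub above always fails. */
	return 0;
}

int main(void)
{
	char payload[8] = "payload";
	printf("send_op -> %d\n", send_op(payload, sizeof(payload)));
	return 0;
}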
PSMX2_RMA_CAPS; + info_new->tx_attr->caps = PSMX2_RMA_TX_CAPS; info_new->tx_attr->mode = 0; - info_new->rx_attr->caps = PSMX2_RMA_CAPS; + info_new->rx_attr->caps = PSMX2_RMA_RX_CAPS; info_new->rx_attr->mode = 0; info_new->domain_attr->cq_data_size = 8; info_out = info_new; @@ -268,17 +268,13 @@ int psmx2_init_prov_info(const struct fi_info *hints, struct fi_info **info) "TAG60 instance included\n"); } - /* - * Special arrangement to help auto tag layout selection. - * See psmx2_alter_prov_info(). - */ - if (!hints || !(hints->caps & FI_REMOTE_CQ_DATA)) { + if (!hints || !hints->domain_attr || + !hints->domain_attr->cq_data_size) { info_new = fi_dupinfo(&psmx2_prov_info); if (info_new) { /* 64 bit tag, no CQ data */ info_new->addr_format = addr_format; info_new->ep_attr->type = ep_type; - info_new->caps &= ~FI_REMOTE_CQ_DATA; info_new->next = info_out; info_out = info_new; FI_INFO(&psmx2_prov, FI_LOG_CORE, @@ -304,22 +300,78 @@ static void psmx2_dup_addr(int format, struct psmx2_ep_name *addr, } } +static void psmx2_expand_default_unit(struct fi_info *info) +{ + struct fi_info *p, *next; + struct psmx2_ep_name *src_addr; + int i; + + p = info; + while (p) { + next = p->next; + src_addr = p->src_addr; + if (src_addr->unit == PSMX2_DEFAULT_UNIT) { + if (psmx2_hfi_info.num_active_units == 1) { + src_addr->unit = psmx2_hfi_info.active_units[0]; + } else { + for (i = 0; i < psmx2_hfi_info.num_active_units; i++) { + p->next = fi_dupinfo(p); + if (!p->next) { + FI_WARN(&psmx2_prov, FI_LOG_CORE, + "Failed to duplicate info for HFI unit %d\n", + psmx2_hfi_info.active_units[i]); + break; + } + p = p->next; + src_addr = p->src_addr; + src_addr->unit = psmx2_hfi_info.active_units[i]; + } + } + } + p->next = next; + p = next; + } +} + void psmx2_update_prov_info(struct fi_info *info, struct psmx2_ep_name *src_addr, struct psmx2_ep_name *dest_addr) { - for ( ; info; info = info->next) { - psmx2_dup_addr(info->addr_format, src_addr, - &info->src_addr, &info->src_addrlen); - psmx2_dup_addr(info->addr_format, dest_addr, - &info->dest_addr, &info->dest_addrlen); - - info->domain_attr->tx_ctx_cnt = psmx2_env.free_trx_ctxt; - info->domain_attr->rx_ctx_cnt = psmx2_env.free_trx_ctxt; - info->domain_attr->max_ep_tx_ctx = psmx2_env.max_trx_ctxt; - info->domain_attr->max_ep_rx_ctx = psmx2_env.max_trx_ctxt; - info->domain_attr->max_ep_stx_ctx = psmx2_env.max_trx_ctxt; - info->tx_attr->inject_size = psmx2_env.inject_size; + struct fi_info *p; + + for (p = info; p; p = p->next) { + psmx2_dup_addr(p->addr_format, src_addr, + &p->src_addr, &p->src_addrlen); + psmx2_dup_addr(p->addr_format, dest_addr, + &p->dest_addr, &p->dest_addrlen); + } + + psmx2_expand_default_unit(info); + + for (p = info; p; p = p->next) { + int unit = ((struct psmx2_ep_name *)p->src_addr)->unit; + + if (unit == PSMX2_DEFAULT_UNIT || !psmx2_env.multi_ep) { + p->domain_attr->tx_ctx_cnt = psmx2_hfi_info.free_trx_ctxt; + p->domain_attr->rx_ctx_cnt = psmx2_hfi_info.free_trx_ctxt; + p->domain_attr->max_ep_tx_ctx = psmx2_hfi_info.max_trx_ctxt; + p->domain_attr->max_ep_rx_ctx = psmx2_hfi_info.max_trx_ctxt; + p->domain_attr->max_ep_stx_ctx = psmx2_hfi_info.max_trx_ctxt; + } else { + p->domain_attr->tx_ctx_cnt = psmx2_hfi_info.unit_nfreectxts[unit]; + p->domain_attr->rx_ctx_cnt = psmx2_hfi_info.unit_nfreectxts[unit]; + p->domain_attr->max_ep_tx_ctx = psmx2_hfi_info.unit_nctxts[unit]; + p->domain_attr->max_ep_rx_ctx = psmx2_hfi_info.unit_nctxts[unit]; + p->domain_attr->max_ep_stx_ctx = psmx2_hfi_info.unit_nctxts[unit]; + } + + free(p->domain_attr->name); + 
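/*
 * Editorial note on the new max_msg_size value above: PSMX2_MAX_MSG_SIZE &
 * ~0x0FFF simply rounds the limit down to a 4 KB multiple by clearing the low
 * 12 bits.  A tiny self-contained check of that arithmetic; the example value
 * is made up, the real PSMX2_MAX_MSG_SIZE is provider-defined.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t max_msg_size = 0xFFFFFFFFULL;			/* example only */
	uint64_t aligned = max_msg_size & ~(uint64_t)0x0FFF;	/* clear bits 0..11 */

	printf("0x%llx -> 0x%llx (multiple of 4096: %s)\n",
	       (unsigned long long)max_msg_size,
	       (unsigned long long)aligned,
	       aligned % 4096 == 0 ? "yes" : "no");
	return 0;
}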
if (unit == PSMX2_DEFAULT_UNIT) + p->domain_attr->name = strdup(psmx2_hfi_info.default_domain_name); + else + asprintf(&p->domain_attr->name, "hfi1_%d", unit); + + p->tx_attr->inject_size = psmx2_env.inject_size; } } @@ -386,18 +438,8 @@ void psmx2_alter_prov_info(uint32_t api_version, if (hints && hints->caps && !(hints->caps & FI_TRIGGER)) info->caps &= ~FI_TRIGGER; - /* - * Special arrangement for auto tag layout selection. - * See psmx2_init_prov_info(). Set this flag to allow - * follow-up fi_getinfo() calls to pick the same tag - * layout by copying caps from this instance without - * setting the cq_data_size field. Notice that the flag - * may be cleared by ofi_alter_info(). - */ - if (info->domain_attr->cq_data_size) { - info->caps |= FI_REMOTE_CQ_DATA; + if (info->domain_attr->cq_data_size) cq_data_cnt++; - } cnt++; } diff --git a/prov/psm2/src/psmx2_av.c b/prov/psm2/src/psmx2_av.c index bb4fd615ce0..204fb95a0a7 100644 --- a/prov/psm2/src/psmx2_av.c +++ b/prov/psm2/src/psmx2_av.c @@ -169,12 +169,14 @@ static void psmx2_set_epaddr_context(struct psmx2_trx_ctxt *trx_ctxt, psm2_epid_t epid, psm2_epaddr_t epaddr) { struct psmx2_epaddr_context *context; + struct psmx2_epaddr_context *old_context = NULL; context = (void *)psm2_epaddr_getctxt(epaddr); if (context) { if (context->trx_ctxt != trx_ctxt || context->epid != epid) { FI_WARN(&psmx2_prov, FI_LOG_AV, "trx_ctxt or epid doesn't match\n"); + old_context = context; context = NULL; } } @@ -193,14 +195,15 @@ static void psmx2_set_epaddr_context(struct psmx2_trx_ctxt *trx_ctxt, context->epid = epid; context->epaddr = epaddr; psm2_epaddr_setctxt(epaddr, context); + free(old_context); trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2); dlist_insert_before(&context->entry, &trx_ctxt->peer_list); trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2); } -int psmx2_epid_to_epaddr(struct psmx2_trx_ctxt *trx_ctxt, - psm2_epid_t epid, psm2_epaddr_t *epaddr) +void psmx2_epid_to_epaddr(struct psmx2_trx_ctxt *trx_ctxt, + psm2_epid_t epid, psm2_epaddr_t *epaddr) { int err; psm2_error_t errors; @@ -212,7 +215,7 @@ int psmx2_epid_to_epaddr(struct psmx2_trx_ctxt *trx_ctxt, context = psm2_epaddr_getctxt(epconn.addr); if (context && context->epid == epid) { *epaddr = epconn.addr; - return 0; + return; } } @@ -220,13 +223,17 @@ int psmx2_epid_to_epaddr(struct psmx2_trx_ctxt *trx_ctxt, (int64_t) psmx2_env.conn_timeout * 1000000000LL); if (err == PSM2_OK || err == PSM2_EPID_ALREADY_CONNECTED) { psmx2_set_epaddr_context(trx_ctxt, epid, *epaddr); - return 0; + return; } - FI_WARN(&psmx2_prov, FI_LOG_AV, - "psm2_ep_connect retured error %s, remote epid=%lx.\n", - psm2_error_get_string(err), epid); - return psmx2_errno(err); + /* call fi_log() directly to always generate the output */ + fi_log(&psmx2_prov, FI_LOG_WARN, FI_LOG_AV, __func__, __LINE__, + "psm2_ep_connect returned error %s, remote epid=%lx."
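/*
 * Illustrative sketch (not from the patch): with the per-unit domain names
 * introduced above ("hfi1_0", "hfi1_1", ...), an application can steer
 * fi_getinfo() toward one HFI by naming the domain in the hints.  Only
 * standard libfabric calls are used; the provider and domain strings are
 * examples and assume a host that actually exposes a psm2 device.
 */
#include <stdio.h>
#include <string.h>
#include <rdma/fabric.h>

int main(void)
{
	struct fi_info *hints, *info = NULL;
	int ret;

	hints = fi_allocinfo();
	if (!hints)
		return 1;

	/* Ask for the psm2 provider and, specifically, the first HFI unit. */
	hints->fabric_attr->prov_name = strdup("psm2");
	hints->domain_attr->name = strdup("hfi1_0");

	ret = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, &info);
	if (ret)
		fprintf(stderr, "fi_getinfo: %s\n", fi_strerror(-ret));
	else
		printf("matched domain: %s\n", info->domain_attr->name);

	if (info)
		fi_freeinfo(info);
	fi_freeinfo(hints);	/* also releases the strdup'd strings */
	return ret ? 1 : 0;
}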
+ "If it is a timeout error, try setting FI_PSM2_CONN_TIMEOUT " + "to a larger value (current: %d seconds).\n", + psm2_error_get_string(err), epid, psmx2_env.conn_timeout); + + abort(); } /* @@ -332,11 +339,9 @@ int psmx2_av_query_sep(struct psmx2_fid_av *av, psm2_amarg_t args[3]; int error; - if (!av->conn_info[trx_ctxt->id].epaddrs[idx]) { + if (!av->conn_info[trx_ctxt->id].epaddrs[idx]) psmx2_epid_to_epaddr(trx_ctxt, av->table[idx].epid, &av->conn_info[trx_ctxt->id].epaddrs[idx]); - assert(av->conn_info[trx_ctxt->id].epaddrs[idx]); - } psmx2_am_init(trx_ctxt); /* check AM handler installation */ @@ -347,9 +352,12 @@ int psmx2_av_query_sep(struct psmx2_fid_av *av, args[0].u32w1 = av->table[idx].sep_id; args[1].u64 = (uint64_t)(uintptr_t)&av->sep_info[idx]; args[2].u64 = (uint64_t)(uintptr_t)&status; - psm2_am_request_short(av->conn_info[trx_ctxt->id].epaddrs[idx], - PSMX2_AM_SEP_HANDLER, args, 3, NULL, - 0, 0, NULL, NULL); + error = psm2_am_request_short(av->conn_info[trx_ctxt->id].epaddrs[idx], + PSMX2_AM_SEP_HANDLER, args, 3, NULL, + 0, 0, NULL, NULL); + + if (error) + return error; /* * make sure AM is progressed promptly. don't call @@ -474,11 +482,13 @@ STATIC int psmx2_av_insert(struct fid_av *av, const void *addr, av_priv->table[idx].type = ep_name->type; av_priv->table[idx].epid = ep_name->epid; av_priv->table[idx].sep_id = ep_name->sep_id; + av_priv->table[idx].valid = 1; free(ep_name); } else { av_priv->table[idx].type = names[i].type; av_priv->table[idx].epid = names[i].epid; av_priv->table[idx].sep_id = names[i].sep_id; + av_priv->table[idx].valid = 1; } av_priv->sep_info[idx].ctxt_cnt = 1; av_priv->sep_info[idx].epids = NULL; @@ -620,6 +630,7 @@ static int psmx2_av_disconnect_addr(int trx_ctxt_id, psm2_epid_t epid, psm2_epaddr_t epaddr) { struct psmx2_epaddr_context *epaddr_context; + struct psmx2_trx_ctxt *trx_ctxt; psm2_error_t errors; int err; @@ -633,15 +644,24 @@ static int psmx2_av_disconnect_addr(int trx_ctxt_id, psm2_epid_t epid, if (!epaddr_context) return -FI_EINVAL; - if (trx_ctxt_id != epaddr_context->trx_ctxt->id) + trx_ctxt = epaddr_context->trx_ctxt; + if (trx_ctxt_id != trx_ctxt->id) return -FI_EINVAL; if (epid != epaddr_context->epid) return -FI_EINVAL; - err = psm2_ep_disconnect2(epaddr_context->trx_ctxt->psm2_ep, 1, &epaddr, + trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2); + dlist_remove_first_match(&trx_ctxt->peer_list, + psmx2_peer_match, epaddr); + trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2); + + psm2_epaddr_setctxt(epaddr, NULL); + + err = psm2_ep_disconnect2(trx_ctxt->psm2_ep, 1, &epaddr, NULL, &errors, PSM2_EP_DISCONNECT_FORCE, 0); + free(epaddr_context); return psmx2_errno(err); } @@ -677,6 +697,7 @@ STATIC int psmx2_av_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, if (!err) av_priv->conn_info[j].epaddrs[idx] = NULL; } + av_priv->table[idx].epid = 0; } else { if (!av_priv->sep_info[idx].epids) continue; @@ -696,7 +717,10 @@ STATIC int psmx2_av_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, av_priv->conn_info[j].sepaddrs[idx][k] = NULL; } } + free(av_priv->sep_info[idx].epids); + av_priv->sep_info[idx].epids = NULL; } + av_priv->table[idx].valid = 0; } av_priv->domain->av_unlock_fn(&av_priv->lock, 1); @@ -711,6 +735,7 @@ STATIC int psmx2_av_map_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t cou struct psmx2_fid_av *av_priv; struct psmx2_trx_ctxt *trx_ctxt; psm2_error_t *errors; + int i; av_priv = container_of(av, struct psmx2_fid_av, av); @@ -725,6 +750,17 @@ STATIC int 
psmx2_av_map_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t cou if (!errors) return -FI_ENOMEM; + trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2); + for (i = 0; i < count; i++) { + dlist_remove_first_match(&trx_ctxt->peer_list, + psmx2_peer_match, + (psm2_epaddr_t)(fi_addr[i])); + } + trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2); + + for (i = 0; i < count; i++) + psm2_epaddr_setctxt((psm2_epaddr_t)(fi_addr[i]), NULL); + psm2_ep_disconnect2(trx_ctxt->psm2_ep, count, (psm2_epaddr_t *)fi_addr, NULL, errors, PSM2_EP_DISCONNECT_FORCE, 0); @@ -755,6 +791,11 @@ STATIC int psmx2_av_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, goto out; } + if (!av_priv->table[idx].valid) { + err = -FI_EINVAL; + goto out; + } + name.type = av_priv->table[idx].type; name.epid = av_priv->table[idx].epid; name.sep_id = av_priv->table[idx].sep_id; @@ -797,11 +838,13 @@ STATIC int psmx2_av_map_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, return 0; } -fi_addr_t psmx2_av_translate_source(struct psmx2_fid_av *av, psm2_epaddr_t source) +fi_addr_t psmx2_av_translate_source(struct psmx2_fid_av *av, + psm2_epaddr_t source, int source_sep_id) { psm2_epid_t epid; fi_addr_t ret; int i, j, found; + int ep_type = source_sep_id ? PSMX2_EP_SCALABLE : PSMX2_EP_REGULAR; if (av->type == FI_AV_MAP) return (fi_addr_t) source; @@ -813,12 +856,26 @@ fi_addr_t psmx2_av_translate_source(struct psmx2_fid_av *av, psm2_epaddr_t sourc ret = FI_ADDR_NOTAVAIL; found = 0; for (i = av->hdr->last - 1; i >= 0 && !found; i--) { + if (!av->table[i].valid) + continue; + if (av->table[i].type == PSMX2_EP_REGULAR) { + if (ep_type == PSMX2_EP_SCALABLE) + continue; if (av->table[i].epid == epid) { ret = (fi_addr_t)i; found = 1; } } else { + /* + * scalable endpoint must match sep_id exactly. + * regular endpoint can match a context of any + * scalable endpoint. 
+ */ + if (ep_type == PSMX2_EP_SCALABLE && + av->table[i].sep_id != source_sep_id) + continue; + if (!av->sep_info[i].epids) { for (j = 0; j < av->max_trx_ctxt; j++) { if (av->conn_info[j].trx_ctxt) @@ -830,6 +887,7 @@ fi_addr_t psmx2_av_translate_source(struct psmx2_fid_av *av, psm2_epaddr_t sourc if (!av->sep_info[i].epids) continue; } + for (j=0; jsep_info[i].ctxt_cnt; j++) { if (av->sep_info[i].epids[j] == epid) { ret = fi_rx_addr((fi_addr_t)i, j, @@ -860,6 +918,8 @@ void psmx2_av_remove_conn(struct psmx2_fid_av *av, av->domain->av_lock_fn(&av->lock, 1); for (i = 0; i < av->hdr->last; i++) { + if (!av->table[i].valid) + continue; if (av->table[i].type == PSMX2_EP_REGULAR) { if (av->table[i].epid == epid && av->conn_info[trx_ctxt->id].epaddrs[i] == epaddr) @@ -919,6 +979,7 @@ static int psmx2_av_close(fid_t fid) free(av->hdr); } + free(av->sep_info); out: free(av); return 0; @@ -1036,7 +1097,7 @@ int psmx2_av_open(struct fid_domain *domain, struct fi_av_attr *attr, if (av_type == FI_AV_MAP) conn_size = 0; else - conn_size = psmx2_env.max_trx_ctxt * sizeof(struct psmx2_av_conn); + conn_size = psmx2_hfi_info.max_trx_ctxt * sizeof(struct psmx2_av_conn); av_priv = (struct psmx2_fid_av *) calloc(1, sizeof(*av_priv) + conn_size); if (!av_priv) @@ -1099,7 +1160,7 @@ int psmx2_av_open(struct fid_domain *domain, struct fi_av_attr *attr, av_priv->count = count; av_priv->flags = flags; av_priv->rx_ctx_bits = rx_ctx_bits; - av_priv->max_trx_ctxt = psmx2_env.max_trx_ctxt; + av_priv->max_trx_ctxt = psmx2_hfi_info.max_trx_ctxt; av_priv->addr_format = domain_priv->addr_format; av_priv->type = av_type; diff --git a/prov/psm2/src/psmx2_cntr.c b/prov/psm2/src/psmx2_cntr.c index 0e77aa802de..d8ce11ec6b0 100644 --- a/prov/psm2/src/psmx2_cntr.c +++ b/prov/psm2/src/psmx2_cntr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/prov/psm2/src/psmx2_cq.c b/prov/psm2/src/psmx2_cq.c index f3d877b98cc..b9a01c16ffb 100644 --- a/prov/psm2/src/psmx2_cq.c +++ b/prov/psm2/src/psmx2_cq.c @@ -248,10 +248,12 @@ static inline int psmx2_cq_any_complete(struct psmx2_fid_cq *poll_cq, if (is_recv) { psm2_epaddr_t source = PSMX2_STATUS_PEER(status); + int source_sep_id = (flags & FI_REMOTE_CQ_DATA) ? 
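/*
 * Illustrative sketch (not from the patch): for the scalable-endpoint
 * matching above, the translated source address encodes a receive-context
 * index on top of the AV index, which is exactly what the public fi_rx_addr()
 * helper computes from rx_ctx_bits.  The index values below are made up.
 */
#include <stdio.h>
#include <rdma/fabric.h>

int main(void)
{
	fi_addr_t base = 5;	/* AV index of the peer's scalable endpoint (example) */
	int rx_index = 3;	/* target receive context (example) */
	int rx_ctx_bits = 4;	/* as requested via fi_av_attr.rx_ctx_bits */

	/* fi_rx_addr() packs the rx context index into the upper bits of the address. */
	fi_addr_t addr = fi_rx_addr(base, rx_index, rx_ctx_bits);

	printf("fi_addr for (av index %llu, rx ctx %d) = 0x%llx\n",
	       (unsigned long long)base, rx_index, (unsigned long long)addr);
	return 0;
}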
0 : data; if (event == event_in) { if (src_addr) { - src_addr[0] = psmx2_av_translate_source(av, source); + src_addr[0] = psmx2_av_translate_source(av, source, + source_sep_id); if (src_addr[0] == FI_ADDR_NOTAVAIL) { *event_saved = 0; event = psmx2_cq_alloc_event(comp_cq); @@ -264,16 +266,21 @@ static inline int psmx2_cq_any_complete(struct psmx2_fid_cq *poll_cq, event->error = !!event->cqe.err.err; if (av->addr_format == FI_ADDR_STR) { event->cqe.err.err_data_size = PSMX2_ERR_DATA_SIZE; - psmx2_get_source_string_name(source, (void *)&comp_cq->error_data, - &event->cqe.err.err_data_size); + psmx2_get_source_string_name( + source, source_sep_id, + (void *)&comp_cq->error_data, + &event->cqe.err.err_data_size); } else { - psmx2_get_source_name(source, (void *)&comp_cq->error_data); + psmx2_get_source_name( + source, source_sep_id, + (void *)&comp_cq->error_data); event->cqe.err.err_data_size = sizeof(struct psmx2_ep_name); } } } } else { event->source_is_valid = 1; + event->source_sep_id = source_sep_id; event->source = source; event->source_av = av; } @@ -433,11 +440,9 @@ psmx2_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry if (ep->recv_cq) { op_context = fi_context; buf = PSMX2_CTXT_USER(fi_context); - data = 0; - if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req)))) { + data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req)); + if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req)))) flags |= FI_REMOTE_CQ_DATA; - data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req)); - } err = psmx2_cq_rx_complete( status_data->poll_cq, ep->recv_cq, ep->av, req, op_context, buf, flags, data, @@ -457,11 +462,9 @@ psmx2_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry if (ep->recv_cq) { op_context = fi_context; buf = PSMX2_CTXT_USER(fi_context); - data = 0; - if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req)))) { + data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req)); + if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req)))) flags |= FI_REMOTE_CQ_DATA; - data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req)); - } err = psmx2_cq_rx_complete( status_data->poll_cq, ep->recv_cq, ep->av, req, op_context, buf, flags, data, @@ -481,11 +484,9 @@ psmx2_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry } PSMX2_EP_PUT_OP_CONTEXT(ep, fi_context); if (OFI_UNLIKELY(ep->recv_cq && PSMX2_STATUS_ERROR(req))) { - data = 0; - if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req)))) { + data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req)); + if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req)))) flags |= FI_REMOTE_CQ_DATA; - data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req)); - } err = psmx2_cq_rx_complete( status_data->poll_cq, ep->recv_cq, ep->av, req, NULL, NULL, flags, data, @@ -505,9 +506,12 @@ psmx2_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry } PSMX2_EP_PUT_OP_CONTEXT(ep, fi_context); if (OFI_UNLIKELY(ep->recv_cq && PSMX2_STATUS_ERROR(req))) { + data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req)); + if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req)))) + flags |= FI_REMOTE_CQ_DATA; err = psmx2_cq_rx_complete( status_data->poll_cq, ep->recv_cq, ep->av, - req, NULL, NULL, flags, 0, + req, NULL, NULL, flags, data, entry, status_data->src_addr, &event_saved); if (OFI_UNLIKELY(err)) return err; @@ -619,11 +623,9 @@ psmx2_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry if (ep->recv_cq) { op_context = fi_context; buf = multi_recv_req->buf + multi_recv_req->offset; - data = 0; - if 
(PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req)))) { + data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req)); + if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(req)))) flags |= FI_REMOTE_CQ_DATA; - data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(req)); - } if (multi_recv_req->offset + PSMX2_STATUS_RCVLEN(req) + multi_recv_req->min_buf_size > multi_recv_req->len) flags |= FI_MULTI_RECV; /* buffer used up */ @@ -955,10 +957,12 @@ static inline int psmx2_cq_any_complete(struct psmx2_fid_cq *poll_cq, if (is_recv) { psm2_epaddr_t source = PSMX2_STATUS_PEER(status); + int source_sep_id = (flags & FI_REMOTE_CQ_DATA) ? 0 : data; if (event == event_in) { if (src_addr) { - src_addr[*read_count] = psmx2_av_translate_source(av, source); + src_addr[*read_count] = + psmx2_av_translate_source(av, source, source_sep_id); if (src_addr[*read_count] == FI_ADDR_NOTAVAIL) { event = psmx2_cq_alloc_event(comp_cq); if (!event) @@ -970,10 +974,14 @@ static inline int psmx2_cq_any_complete(struct psmx2_fid_cq *poll_cq, event->error = !!event->cqe.err.err; if (av->addr_format == FI_ADDR_STR) { event->cqe.err.err_data_size = PSMX2_ERR_DATA_SIZE; - psmx2_get_source_string_name(source, (void *)&comp_cq->error_data, - &event->cqe.err.err_data_size); + psmx2_get_source_string_name( + source, source_sep_id, + (void *)&comp_cq->error_data, + &event->cqe.err.err_data_size); } else { - psmx2_get_source_name(source, (void *)&comp_cq->error_data); + psmx2_get_source_name( + source, source_sep_id, + (void *)&comp_cq->error_data); event->cqe.err.err_data_size = sizeof(struct psmx2_ep_name); } @@ -982,6 +990,7 @@ static inline int psmx2_cq_any_complete(struct psmx2_fid_cq *poll_cq, } } else { event->source_is_valid = 1; + event->source_sep_id = source_sep_id; event->source = source; event->source_av = av; } @@ -1136,12 +1145,9 @@ int psmx2_cq_poll_mq(struct psmx2_fid_cq *cq, op_context = fi_context; buf = PSMX2_CTXT_USER(fi_context); flags = psmx2_comp_flags[context_type]; - if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status)))) { + data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status)); + if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status)))) flags |= FI_REMOTE_CQ_DATA; - data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status)); - } else { - data = 0; - } err = psmx2_cq_rx_complete( cq, ep->recv_cq, ep->av, status, op_context, buf, flags, data, @@ -1162,12 +1168,9 @@ int psmx2_cq_poll_mq(struct psmx2_fid_cq *cq, op_context = fi_context; buf = PSMX2_CTXT_USER(fi_context); flags = psmx2_comp_flags[context_type]; - if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status)))) { + data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status)); + if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status)))) flags |= FI_REMOTE_CQ_DATA; - data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status)); - } else { - data = 0; - } err = psmx2_cq_rx_complete( cq, ep->recv_cq, ep->av, status, op_context, buf, flags, data, @@ -1191,12 +1194,9 @@ int psmx2_cq_poll_mq(struct psmx2_fid_cq *cq, op_context = NULL; buf = NULL; flags = psmx2_comp_flags[context_type]; - if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status)))) { + data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status)); + if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status)))) flags |= FI_REMOTE_CQ_DATA; - data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status)); - } else { - data = 0; - } err = psmx2_cq_rx_complete( cq, ep->recv_cq, ep->av, status, op_context, buf, flags, data, @@ -1220,9 +1220,12 @@ int psmx2_cq_poll_mq(struct psmx2_fid_cq *cq, op_context = NULL; buf = NULL; flags = 
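/*
 * Illustrative fragment (not from the patch): on the application side, the CQ
 * changes above surface in two ways -- fi_cq_readfrom() may report
 * FI_ADDR_NOTAVAIL when the source cannot be translated, and the error
 * completion then carries the raw source name in err_data.  The helper below
 * shows that read pattern with standard CQ calls; the cq handle is assumed to
 * already exist and be bound to a receive endpoint, so there is no main().
 */
#include <stdio.h>
#include <rdma/fabric.h>
#include <rdma/fi_errno.h>
#include <rdma/fi_eq.h>

/* 'cq' is assumed to be an open CQ created with FI_CQ_FORMAT_TAGGED. */
static void drain_one(struct fid_cq *cq)
{
	struct fi_cq_tagged_entry comp;
	struct fi_cq_err_entry err_entry = { 0 };
	fi_addr_t src;
	ssize_t ret;

	ret = fi_cq_readfrom(cq, &comp, 1, &src);
	if (ret == 1) {
		if (src == FI_ADDR_NOTAVAIL)
			printf("completion from a peer that is not in the AV\n");
		else
			printf("completion from fi_addr 0x%llx\n",
			       (unsigned long long)src);
	} else if (ret == -FI_EAVAIL) {
		/* The source name (struct psmx2_ep_name, or its string form
		 * for FI_ADDR_STR) is returned through err_data. */
		if (fi_cq_readerr(cq, &err_entry, 0) == 1)
			printf("error completion, err=%d, err_data_size=%zu\n",
			       err_entry.err, err_entry.err_data_size);
	}
}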
psmx2_comp_flags[context_type]; + data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status)); + if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status)))) + flags |= FI_REMOTE_CQ_DATA; err = psmx2_cq_rx_complete( cq, ep->recv_cq, ep->av, - status, op_context, buf, flags, 0, + status, op_context, buf, flags, data, event_in, count, &read_count, &read_more, src_addr); if (err) @@ -1347,12 +1350,9 @@ int psmx2_cq_poll_mq(struct psmx2_fid_cq *cq, op_context = fi_context; buf = multi_recv_req->buf + multi_recv_req->offset; flags = psmx2_comp_flags[context_type]; - if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status)))) { + data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status)); + if (PSMX2_HAS_IMM(PSMX2_GET_FLAGS(PSMX2_STATUS_TAG(status)))) flags |= FI_REMOTE_CQ_DATA; - data = PSMX2_GET_CQDATA(PSMX2_STATUS_TAG(status)); - } else { - data = 0; - } if (multi_recv_req->offset + PSMX2_STATUS_RCVLEN(status) + multi_recv_req->min_buf_size > multi_recv_req->len) flags |= FI_MULTI_RECV; /* buffer used up */ @@ -1609,16 +1609,21 @@ STATIC ssize_t psmx2_cq_readfrom(struct fid_cq *cq, void *buf, size_t count, if (event) { if (!event->error) { if (src_addr && event->source_is_valid) { - source = psmx2_av_translate_source(event->source_av, - event->source); + source = psmx2_av_translate_source( + event->source_av, event->source, + event->source_sep_id); if (source == FI_ADDR_NOTAVAIL) { if (cq_priv->domain->addr_format == FI_ADDR_STR) { event->cqe.err.err_data_size = PSMX2_ERR_DATA_SIZE; - psmx2_get_source_string_name(event->source, - (void *)&cq_priv->error_data, - &event->cqe.err.err_data_size); + psmx2_get_source_string_name( + event->source, event->source_sep_id, + (void *)&cq_priv->error_data, + &event->cqe.err.err_data_size); } else { - psmx2_get_source_name(event->source, (void *)&cq_priv->error_data); + psmx2_get_source_name( + event->source, + event->source_sep_id, + (void *)&cq_priv->error_data); event->cqe.err.err_data_size = sizeof(struct psmx2_ep_name); } event->cqe.err.err_data = &cq_priv->error_data; @@ -1820,6 +1825,12 @@ static int psmx2_cq_close(fid_t fid) free(item); } + while (!slist_empty(&cq->event_queue)) { + entry = slist_remove_head(&cq->event_queue); + item = container_of(entry, struct psmx2_cq_event, list_entry); + free(item); + } + fastlock_destroy(&cq->lock); if (cq->wait) { diff --git a/prov/psm2/src/psmx2_domain.c b/prov/psm2/src/psmx2_domain.c index 741e79dc955..d7c327945e8 100644 --- a/prov/psm2/src/psmx2_domain.c +++ b/prov/psm2/src/psmx2_domain.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -203,10 +203,51 @@ static int psmx2_domain_close(fid_t fid) return 0; } +static int psmx2_domain_get_val(struct fid *fid, int var, void *val) +{ + struct psmx2_fid_domain *domain; + + if (!val) + return -FI_EINVAL; + + domain = container_of(fid, struct psmx2_fid_domain, + util_domain.domain_fid.fid); + + switch (var) { + case FI_PSM2_DISCONNECT: + *(uint32_t *)val = domain->params.disconnect; + break; + default: + return -FI_EINVAL; + } + return 0; +} + +static int psmx2_domain_set_val(struct fid *fid, int var, void *val) +{ + struct psmx2_fid_domain *domain; + + if (!val) + return -FI_EINVAL; + + domain = container_of(fid, struct psmx2_fid_domain, + util_domain.domain_fid.fid); + + switch (var) { + case FI_PSM2_DISCONNECT: + domain->params.disconnect = *(uint32_t *)val; + break; + default: + return -FI_EINVAL; + } + return 0; +} + DIRECT_FN STATIC int psmx2_domain_control(fid_t fid, int command, void *arg) { struct fi_mr_map_raw *map; + struct fi_fid_var *var; switch (command) { case FI_MAP_RAW_MR: @@ -220,6 +261,14 @@ STATIC int psmx2_domain_control(fid_t fid, int command, void *arg) /* Nothing to do here */ break; + case FI_GET_VAL: + var = arg; + return psmx2_domain_get_val(fid, var->name, var->val); + + case FI_SET_VAL: + var = arg; + return psmx2_domain_set_val(fid, var->name, var->val); + default: return -FI_ENOSYS; } @@ -246,6 +295,7 @@ static struct fi_ops_domain psmx2_domain_ops = { .stx_ctx = psmx2_stx_ctx, .srx_ctx = fi_no_srx_context, .query_atomic = psmx2_query_atomic, + .query_collective = fi_no_query_collective, }; static int psmx2_key_compare(void *key1, void *key2) @@ -309,7 +359,7 @@ int psmx2_domain_open(struct fid_fabric *fabric, struct fi_info *info, util_fabric.fabric_fid); if (!info->domain_attr->name || - strcmp(info->domain_attr->name, PSMX2_DOMAIN_NAME)) { + strncmp(info->domain_attr->name, PSMX2_DOMAIN_NAME, strlen(PSMX2_DOMAIN_NAME))) { err = -FI_EINVAL; goto err_out; } @@ -320,6 +370,21 @@ int psmx2_domain_open(struct fid_fabric *fabric, struct fi_info *info, goto err_out; } + psmx2_get_uuid(domain_priv->uuid); + if (info->ep_attr && info->ep_attr->auth_key) { + if (info->ep_attr->auth_key_size != sizeof(psm2_uuid_t)) { + FI_WARN(&psmx2_prov, FI_LOG_DOMAIN, + "Invalid auth_key_len %"PRIu64 + ", should be %"PRIu64".\n", + info->ep_attr->auth_key_size, + sizeof(psm2_uuid_t)); + err = -FI_EINVAL; + goto err_out_free_domain; + } + memcpy(domain_priv->uuid, info->ep_attr->auth_key, + sizeof(psm2_uuid_t)); + } + err = ofi_domain_init(fabric, info, &domain_priv->util_domain, context); if (err) goto err_out_free_domain; @@ -335,6 +400,7 @@ int psmx2_domain_open(struct fid_fabric *fabric, struct fi_info *info, domain_priv->progress_thread_enabled = (info->domain_attr->data_progress == FI_PROGRESS_AUTO); domain_priv->addr_format = info->addr_format; + domain_priv->params.disconnect = psmx2_env.disconnect; if (info->addr_format == FI_ADDR_STR) src_addr = psmx2_string_to_ep_name(info->src_addr); @@ -375,7 +441,6 @@ int psmx2_domain_open(struct fid_fabric *fabric, struct fi_info *info, domain_priv->av_lock_fn = psmx2_lock_disabled; domain_priv->trx_ctxt_lock_fn = psmx2_lock_disabled; domain_priv->trigger_queue_lock_fn = psmx2_lock_disabled; - domain_priv->peer_lock_fn = psmx2_lock_disabled; domain_priv->sep_lock_fn = psmx2_lock_disabled; domain_priv->trigger_lock_fn = psmx2_lock_disabled; domain_priv->cq_lock_fn = psmx2_lock_disabled; @@ -386,7 +451,6 @@ int psmx2_domain_open(struct fid_fabric *fabric, struct 
fi_info *info, domain_priv->av_unlock_fn = psmx2_lock_disabled; domain_priv->trx_ctxt_unlock_fn = psmx2_lock_disabled; domain_priv->trigger_queue_unlock_fn = psmx2_lock_disabled; - domain_priv->peer_unlock_fn = psmx2_lock_disabled; domain_priv->sep_unlock_fn = psmx2_lock_disabled; domain_priv->trigger_unlock_fn = psmx2_lock_disabled; domain_priv->cq_unlock_fn = psmx2_lock_disabled; @@ -394,11 +458,15 @@ int psmx2_domain_open(struct fid_fabric *fabric, struct fi_info *info, domain_priv->context_unlock_fn = psmx2_lock_disabled; domain_priv->poll_unlock_fn = psmx2_lock_disabled; + /* Enable lock accessed by the disconnection thread */ + domain_priv->peer_lock_fn = psmx2_lock_enabled; + domain_priv->peer_unlock_fn = psmx2_unlock_enabled; + /* * If FI_RMA or FI_ATOMIC caps are enabled, then locks are - * required for the CQ, am_req_poll, & rma_queue + * required for the CQ, am_req_pool, & rma_queue * due to the PSM2 Recv thread. - * NOTE: am_req_poll & rma_queue are only used when FI_RMA + * NOTE: am_req_pool & rma_queue are only used when FI_RMA * and FI_ATOMIC capabilities are enabled. */ if ((info->caps & FI_RMA) || (info->caps & FI_ATOMIC)) { @@ -409,6 +477,32 @@ int psmx2_domain_open(struct fid_fabric *fabric, struct fi_info *info, domain_priv->am_req_pool_unlock_fn = psmx2_unlock_enabled; domain_priv->rma_queue_unlock_fn = psmx2_unlock_enabled; } + + /* + * Locks accessed by the progress thread are required because + * they are outside the scope of domain access serialization + * implied by FI_THREAD_DOMAIN. + */ + if (domain_priv->progress_thread_enabled) { + domain_priv->trx_ctxt_lock_fn = psmx2_lock_enabled; + domain_priv->poll_trylock_fn = psmx2_trylock_enabled; + domain_priv->cq_lock_fn = psmx2_lock_enabled; + domain_priv->trx_ctxt_unlock_fn = psmx2_unlock_enabled; + domain_priv->poll_unlock_fn = psmx2_unlock_enabled; + domain_priv->cq_unlock_fn = psmx2_unlock_enabled; + if (info->caps & FI_TRIGGER) { + domain_priv->trigger_queue_lock_fn = psmx2_lock_enabled; + domain_priv->trigger_lock_fn = psmx2_lock_enabled; + domain_priv->av_lock_fn = psmx2_lock_enabled; + domain_priv->mr_lock_fn = psmx2_lock_enabled; + domain_priv->context_lock_fn = psmx2_lock_enabled; + domain_priv->trigger_queue_unlock_fn = psmx2_unlock_enabled; + domain_priv->trigger_unlock_fn = psmx2_unlock_enabled; + domain_priv->av_unlock_fn = psmx2_unlock_enabled; + domain_priv->mr_unlock_fn = psmx2_unlock_enabled; + domain_priv->context_unlock_fn = psmx2_unlock_enabled; + } + } break; default: /* Otherwise, enable all locks */ diff --git a/prov/psm2/src/psmx2_ep.c b/prov/psm2/src/psmx2_ep.c index 20d4c71ca91..b924d25b88e 100644 --- a/prov/psm2/src/psmx2_ep.c +++ b/prov/psm2/src/psmx2_ep.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
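/*
 * Illustrative fragment (not from the patch): the new FI_GET_VAL/FI_SET_VAL
 * handling in psmx2_domain_control() above lets an application toggle the
 * provider's disconnect-on-AV-remove behavior through fi_control() on the
 * domain fid.  struct fi_fid_var and the two control commands are standard;
 * FI_PSM2_DISCONNECT is the provider-specific variable this patch services,
 * and the fallback definition plus the open 'domain' handle are assumptions
 * made only so the snippet compiles on its own.
 */
#include <stdint.h>
#include <stdio.h>
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

#ifndef FI_PSM2_DISCONNECT
#define FI_PSM2_DISCONNECT 1	/* placeholder; normally from the psm2 extension header */
#endif

/* 'domain' is assumed to be a domain opened on the psm2 provider. */
static int enable_disconnect(struct fid_domain *domain)
{
	uint32_t value = 1;
	struct fi_fid_var var = {
		.name = FI_PSM2_DISCONNECT,
		.val = &value,
	};
	int ret;

	ret = fi_control(&domain->fid, FI_SET_VAL, &var);
	if (ret) {
		fprintf(stderr, "FI_SET_VAL failed: %d\n", ret);
		return ret;
	}

	/* Read the setting back through the same mechanism. */
	value = 0;
	ret = fi_control(&domain->fid, FI_GET_VAL, &var);
	if (!ret)
		printf("disconnect = %u\n", value);
	return ret;
}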
You may choose to be licensed under the terms of the GNU @@ -526,24 +526,6 @@ int psmx2_ep_open_internal(struct psmx2_fid_domain *domain_priv, else ep_cap = FI_TAGGED; - if (info && info->ep_attr && info->ep_attr->auth_key) { - if (info->ep_attr->auth_key_size != sizeof(psm2_uuid_t)) { - FI_WARN(&psmx2_prov, FI_LOG_EP_CTRL, - "Invalid auth_key_len %"PRIu64 - ", should be %"PRIu64".\n", - info->ep_attr->auth_key_size, - sizeof(psm2_uuid_t)); - goto errout; - } - if (memcmp(domain_priv->fabric->uuid, info->ep_attr->auth_key, - sizeof(psm2_uuid_t))) { - FI_WARN(&psmx2_prov, FI_LOG_EP_CTRL, - "Invalid auth_key: %s\n", - psmx2_uuid_to_string((void *)info->ep_attr->auth_key)); - goto errout; - } - } - ep_priv = (struct psmx2_fid_ep *) calloc(1, sizeof *ep_priv); if (!ep_priv) { err = -FI_ENOMEM; @@ -625,6 +607,7 @@ int psmx2_ep_open(struct fid_domain *domain, struct fi_info *info, struct psmx2_trx_ctxt *trx_ctxt = NULL; int err = -FI_EINVAL; int usage_flags = PSMX2_TX_RX; + uint8_t *uuid = NULL; domain_priv = container_of(domain, struct psmx2_fid_domain, util_domain.domain_fid.fid); @@ -655,9 +638,21 @@ int psmx2_ep_open(struct fid_domain *domain, struct fi_info *info, src_addr = info->src_addr; } + if (info && info->ep_attr && info->ep_attr->auth_key) { + if (info->ep_attr->auth_key_size != sizeof(psm2_uuid_t)) { + FI_WARN(&psmx2_prov, FI_LOG_EP_CTRL, + "Invalid auth_key_len %"PRIu64 + ", should be %"PRIu64".\n", + info->ep_attr->auth_key_size, + sizeof(psm2_uuid_t)); + goto errout; + } + uuid = info->ep_attr->auth_key; + } + if (usage_flags) { trx_ctxt = psmx2_trx_ctxt_alloc(domain_priv, src_addr, -1, - usage_flags); + usage_flags, uuid); if (!trx_ctxt) goto errout; } else { @@ -758,7 +753,9 @@ int psmx2_stx_ctx(struct fid_domain *domain, struct fi_tx_attr *attr, goto errout; } - trx_ctxt = psmx2_trx_ctxt_alloc(domain_priv, NULL, -1, PSMX2_TX); + /* no auth_key is provided, use NULL to pick the default uuid */ + trx_ctxt = psmx2_trx_ctxt_alloc(domain_priv, NULL, -1, PSMX2_TX, + NULL); if (!trx_ctxt) { err = -FI_ENOMEM; goto errout_free_stx; @@ -941,6 +938,7 @@ int psmx2_sep_open(struct fid_domain *domain, struct fi_info *info, size_t ctxt_cnt = 1; size_t ctxt_size; int err = -FI_EINVAL; + uint8_t *uuid = NULL; int i; domain_priv = container_of(domain, struct psmx2_fid_domain, @@ -949,18 +947,28 @@ int psmx2_sep_open(struct fid_domain *domain, struct fi_info *info, goto errout; if (info && info->ep_attr) { - if (info->ep_attr->tx_ctx_cnt > psmx2_env.max_trx_ctxt) { + if (info->ep_attr->auth_key_size != sizeof(psm2_uuid_t)) { + FI_WARN(&psmx2_prov, FI_LOG_EP_CTRL, + "Invalid auth_key_len %"PRIu64 + ", should be %"PRIu64".\n", + info->ep_attr->auth_key_size, + sizeof(psm2_uuid_t)); + goto errout; + } + uuid = info->ep_attr->auth_key; + + if (info->ep_attr->tx_ctx_cnt > psmx2_hfi_info.max_trx_ctxt) { FI_WARN(&psmx2_prov, FI_LOG_EP_CTRL, "tx_ctx_cnt %"PRIu64" exceed limit %d.\n", info->ep_attr->tx_ctx_cnt, - psmx2_env.max_trx_ctxt); + psmx2_hfi_info.max_trx_ctxt); goto errout; } - if (info->ep_attr->rx_ctx_cnt > psmx2_env.max_trx_ctxt) { + if (info->ep_attr->rx_ctx_cnt > psmx2_hfi_info.max_trx_ctxt) { FI_WARN(&psmx2_prov, FI_LOG_EP_CTRL, "rx_ctx_cnt %"PRIu64" exceed limit %d.\n", info->ep_attr->rx_ctx_cnt, - psmx2_env.max_trx_ctxt); + psmx2_hfi_info.max_trx_ctxt); goto errout; } ctxt_cnt = info->ep_attr->tx_ctx_cnt; @@ -1000,7 +1008,7 @@ int psmx2_sep_open(struct fid_domain *domain, struct fi_info *info, for (i = 0; i < ctxt_cnt; i++) { trx_ctxt = psmx2_trx_ctxt_alloc(domain_priv, src_addr, 
(ctxt_cnt > 1) ? i : -1, - PSMX2_TX_RX); + PSMX2_TX_RX, uuid); if (!trx_ctxt) { err = -FI_ENOMEM; goto errout_free_ctxt; @@ -1032,6 +1040,8 @@ int psmx2_sep_open(struct fid_domain *domain, struct fi_info *info, ((uintptr_t)sep_priv & 0xFFFF); sep_priv->id = ofi_atomic_inc32(&domain_priv->sep_cnt); + for (i = 0; i < ctxt_cnt; i++) + sep_priv->ctxts[i].ep->sep_id = sep_priv->id; domain_priv->sep_lock_fn(&domain_priv->sep_lock, 1); dlist_insert_before(&sep_priv->entry, &domain_priv->sep_list); diff --git a/prov/psm2/src/psmx2_fabric.c b/prov/psm2/src/psmx2_fabric.c index cc93d36392c..fcd8398c031 100644 --- a/prov/psm2/src/psmx2_fabric.c +++ b/prov/psm2/src/psmx2_fabric.c @@ -78,7 +78,7 @@ static struct fi_ops_fabric psmx2_fabric_ops = { static struct fi_fabric_attr psmx2_fabric_attr = { .name = PSMX2_FABRIC_NAME, - .prov_version = PSMX2_VERSION, + .prov_version = OFI_VERSION_DEF_PROV, }; int psmx2_fabric(struct fi_fabric_attr *attr, @@ -105,9 +105,11 @@ int psmx2_fabric(struct fi_fabric_attr *attr, fastlock_init(&fabric_priv->domain_lock); dlist_init(&fabric_priv->domain_list); - psmx2_get_uuid(fabric_priv->uuid); if (psmx2_env.name_server) { - fabric_priv->name_server.port = psmx2_uuid_to_port(fabric_priv->uuid); + psm2_uuid_t uuid; + + psmx2_get_uuid(uuid); + fabric_priv->name_server.port = psmx2_uuid_to_port(uuid); fabric_priv->name_server.name_len = sizeof(struct psmx2_ep_name); fabric_priv->name_server.service_len = sizeof(int); fabric_priv->name_server.service_cmp = psmx2_ns_service_cmp; diff --git a/prov/psm2/src/psmx2_init.c b/prov/psm2/src/psmx2_init.c index 168773ddfc8..09ebbea3f39 100644 --- a/prov/psm2/src/psmx2_init.c +++ b/prov/psm2/src/psmx2_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2020 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -39,6 +39,8 @@ static int psmx2_init_count = 0; static int psmx2_lib_initialized = 0; static pthread_mutex_t psmx2_lib_mutex; +struct psmx2_hfi_info psmx2_hfi_info; + struct psmx2_env psmx2_env = { .name_server = 1, .tagged_rma = 1, @@ -49,9 +51,6 @@ struct psmx2_env psmx2_env = { .prog_interval = -1, .prog_affinity = NULL, .multi_ep = 0, - .max_trx_ctxt = 1, - .free_trx_ctxt = 1, - .num_devunits = 1, .inject_size = 64, .lock_level = 2, .lazy_conn = 0, @@ -71,9 +70,29 @@ int psmx2_tag_layout_locked = 0; static void psmx2_init_env(void) { + char *ompi_job_key; + psm2_uuid_t uuid = {}; + unsigned long long *u = (unsigned long long *)uuid; + if (getenv("OMPI_COMM_WORLD_RANK") || getenv("PMI_RANK") || getenv("PMIX_RANK")) psmx2_env.name_server = 0; + /* + * Check for Open MPI job key. If set, convert it to the default uuid + * string. This will be overridden by the FI_PSM2_UUID variable, and + * both will have lower priority than the auth_key passed via ep_attr. 
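/*
 * Illustrative fragment (not from the patch): with the auth_key validation
 * moved into domain/endpoint open above, the key an application supplies is a
 * 16-byte PSM2 UUID passed through the standard fi_info ep_attr fields.  The
 * helper below only shows that application-side setup; the hints handle is
 * assumed to come from fi_allocinfo()/fi_getinfo() and the key bytes are an
 * example.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <rdma/fabric.h>
#include <rdma/fi_errno.h>

/* Attach a 16-byte job key (a PSM2 UUID) to the hints before fi_getinfo()/fi_endpoint(). */
static int set_auth_key(struct fi_info *hints, const uint8_t uuid[16])
{
	hints->ep_attr->auth_key = malloc(16);
	if (!hints->ep_attr->auth_key)
		return -FI_ENOMEM;

	memcpy(hints->ep_attr->auth_key, uuid, 16);
	/* The provider rejects any size other than sizeof(psm2_uuid_t), i.e. 16. */
	hints->ep_attr->auth_key_size = 16;
	return 0;
}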
+ */ + ompi_job_key = getenv("OMPI_MCA_orte_precondition_transports"); + if (ompi_job_key) { + FI_INFO(&psmx2_prov, FI_LOG_CORE, + "Open MPI job key: %s.\n", ompi_job_key); + if (sscanf(ompi_job_key, "%016llx-%016llx", &u[0], &u[1]) == 2) + psmx2_env.uuid = strdup(psmx2_uuid_to_string(uuid)); + else + FI_INFO(&psmx2_prov, FI_LOG_CORE, + "Invalid Open MPI job key format.\n"); + } + fi_param_get_bool(&psmx2_prov, "name_server", &psmx2_env.name_server); fi_param_get_bool(&psmx2_prov, "tagged_rma", &psmx2_env.tagged_rma); fi_param_get_str(&psmx2_prov, "uuid", &psmx2_env.uuid); @@ -251,8 +270,8 @@ static int psmx2_init_lib(void) return ret; } -#if !HAVE_PSM2_INFO_QUERY #define PSMX2_SYSFS_PATH "/sys/class/infiniband/hfi1" +#if !HAVE_PSM2_INFO_QUERY static int psmx2_read_sysfs_int(int unit, char *entry) { char path[64]; @@ -276,27 +295,41 @@ static int psmx2_unit_active(int unit) } #endif -#define PSMX2_MAX_UNITS 4 -static int psmx2_active_units[PSMX2_MAX_UNITS]; -static int psmx2_num_active_units; - -static void psmx2_update_hfi_info(void) +static int psmx2_update_hfi_info(void) { - int i; + unsigned short i; int nctxts = 0; int nfreectxts = 0; int hfi_unit = -1; int multirail = 0; char *s; + char unit_name[8]; + uint32_t cnt = 0; + int tmp_nctxts, tmp_nfreectxts; + int offset = 0; #if HAVE_PSM2_INFO_QUERY int unit_active; int ret; - int tmp_cnt; psm2_info_query_arg_t args[1]; #endif - assert(psmx2_env.num_devunits <= PSMX2_MAX_UNITS); + if (psmx2_hfi_info.num_units > 0) + return 0; + +#if HAVE_PSM2_INFO_QUERY + if (psm2_info_query(PSM2_INFO_QUERY_NUM_UNITS, &cnt, 0, NULL) || !cnt) +#else + if (psm2_ep_num_devunits(&cnt) || !cnt) +#endif + { + FI_INFO(&psmx2_prov, FI_LOG_CORE, + "no PSM2 device is found.\n"); + return -FI_ENODEV; + } + psmx2_hfi_info.num_units = cnt; + + assert(psmx2_hfi_info.num_units <= PSMX2_MAX_UNITS); s = getenv("HFI_UNIT"); if (s) @@ -306,8 +339,8 @@ static void psmx2_update_hfi_info(void) if (s) multirail = atoi(s); - psmx2_num_active_units = 0; - for (i = 0; i < psmx2_env.num_devunits; i++) { + psmx2_hfi_info.num_active_units = 0; + for (i = 0; i < psmx2_hfi_info.num_units; i++) { #if HAVE_PSM2_INFO_QUERY args[0].unit = i; ret = psm2_info_query(PSM2_INFO_QUERY_UNIT_STATUS, &unit_active, 1, args); @@ -333,24 +366,22 @@ static void psmx2_update_hfi_info(void) } if (PSM2_OK != psm2_info_query(PSM2_INFO_QUERY_NUM_FREE_CONTEXTS, - &tmp_cnt, 1, args) || (tmp_cnt < 0)) + &tmp_nfreectxts, 1, args) || (tmp_nfreectxts < 0)) { FI_WARN(&psmx2_prov, FI_LOG_CORE, "Failed to read number of free contexts from HFI unit %d\n", i); continue; } - nfreectxts += tmp_cnt; if (PSM2_OK != psm2_info_query(PSM2_INFO_QUERY_NUM_CONTEXTS, - &tmp_cnt, 1, args) || (tmp_cnt < 0)) + &tmp_nctxts, 1, args) || (tmp_nctxts < 0)) { FI_WARN(&psmx2_prov, FI_LOG_CORE, "Failed to read number of contexts from HFI unit %d\n", i); continue; } - nctxts += tmp_cnt; #else if (!psmx2_unit_active(i)) { FI_INFO(&psmx2_prov, FI_LOG_CORE, @@ -365,10 +396,25 @@ static void psmx2_update_hfi_info(void) continue; } - nctxts += psmx2_read_sysfs_int(i, "nctxts"); - nfreectxts += psmx2_read_sysfs_int(i, "nfreectxts"); + tmp_nctxts = psmx2_read_sysfs_int(i, "nctxts"); + tmp_nfreectxts = psmx2_read_sysfs_int(i, "nfreectxts"); #endif - psmx2_active_units[psmx2_num_active_units++] = i; + + nctxts += tmp_nctxts; + nfreectxts += tmp_nfreectxts; + + psmx2_hfi_info.unit_is_active[i] = 1; + psmx2_hfi_info.unit_nctxts[i] = tmp_nctxts; + psmx2_hfi_info.unit_nfreectxts[i] = tmp_nfreectxts; + 
psmx2_hfi_info.active_units[psmx2_hfi_info.num_active_units++] = i; + + snprintf(unit_name, sizeof(unit_name), "hfi1_%hu", i); + if (psmx2_hfi_info.num_active_units > 1) + offset = snprintf(psmx2_hfi_info.default_domain_name, + sizeof(psmx2_hfi_info.default_domain_name), ";"); + snprintf(psmx2_hfi_info.default_domain_name, + sizeof(psmx2_hfi_info.default_domain_name) - offset, + "%s", unit_name); if (multirail) break; @@ -377,28 +423,91 @@ static void psmx2_update_hfi_info(void) FI_INFO(&psmx2_prov, FI_LOG_CORE, "hfi1 units: total %d, active %d; " "hfi1 contexts: total %d, free %d\n", - psmx2_env.num_devunits, psmx2_num_active_units, + psmx2_hfi_info.num_units, psmx2_hfi_info.num_active_units, nctxts, nfreectxts); if (psmx2_env.multi_ep) { - psmx2_env.max_trx_ctxt = nctxts; - psmx2_env.free_trx_ctxt = nfreectxts; - } else if (nfreectxts == 0) { - psmx2_env.free_trx_ctxt = nfreectxts; + psmx2_hfi_info.max_trx_ctxt = nctxts; + psmx2_hfi_info.free_trx_ctxt = nfreectxts; + } else { + psmx2_hfi_info.max_trx_ctxt = 1; + psmx2_hfi_info.free_trx_ctxt = (nfreectxts == 0) ? 0 : 1; } FI_INFO(&psmx2_prov, FI_LOG_CORE, "Tx/Rx contexts: %d in total, %d available.\n", - psmx2_env.max_trx_ctxt, psmx2_env.free_trx_ctxt); + psmx2_hfi_info.max_trx_ctxt, psmx2_hfi_info.free_trx_ctxt); + + return 0; } int psmx2_get_round_robin_unit(int idx) { - return psmx2_num_active_units ? - psmx2_active_units[idx % psmx2_num_active_units] : + return psmx2_hfi_info.num_active_units ? + psmx2_hfi_info.active_units[idx % psmx2_hfi_info.num_active_units] : -1; } +static void psmx2_update_hfi_nic_info(struct fi_info *info) +{ + char *path; + char buffer[80]; + char *s; + ssize_t n; + unsigned int a, b, c, d; + int unit; + + for ( ; info; info = info->next) { + unit = ((struct psmx2_ep_name *)info->src_addr)->unit; + + if (unit == PSMX2_DEFAULT_UNIT) + continue; + + if (!info->nic) { + info->nic = ofi_nic_dup(NULL); + if (!info->nic) { + FI_WARN(&psmx2_prov, FI_LOG_CORE, + "Failed to allocate nic info for HFI unit %d\n", unit); + continue; + } + } + + if (asprintf(&path, "%s_%d/%s", PSMX2_SYSFS_PATH, unit, "device") < 0) { + FI_WARN(&psmx2_prov, FI_LOG_CORE, + "Failed to read nic info for HFI unit %d\n", unit); + continue; + } + + n = readlink(path, buffer, 80); + free(path); + + if (n < 0) { + FI_WARN(&psmx2_prov, FI_LOG_CORE, + "Failed to read nic info for HFI unit %d\n", unit); + continue; + } + + buffer[n] = '\0'; + if ((s = strrchr(buffer, '/'))) + s++; + else + s = buffer; + + n = sscanf(s, "%x:%x:%x.%x", &a, &b, &c, &d); + if (n < 4) { + FI_WARN(&psmx2_prov, FI_LOG_CORE, + "Failed to read nic info for HFI unit %d\n", unit); + continue; + } + + info->nic->bus_attr->bus_type = FI_BUS_PCI; + info->nic->bus_attr->attr.pci.domain_id = (uint16_t) a; + info->nic->bus_attr->attr.pci.bus_id = (uint8_t) b; + info->nic->bus_attr->attr.pci.device_id = (uint8_t) c; + info->nic->bus_attr->attr.pci.function_id = (uint8_t) d; + } +} + static int psmx2_getinfo(uint32_t api_version, const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct fi_info **info) @@ -410,7 +519,6 @@ static int psmx2_getinfo(uint32_t api_version, const char *node, size_t len; void *addr; uint32_t fmt; - uint32_t cnt = 0; FI_INFO(&psmx2_prov, FI_LOG_CORE,"\n"); @@ -420,20 +528,10 @@ static int psmx2_getinfo(uint32_t api_version, const char *node, if (psmx2_init_lib()) goto err_out; -#if HAVE_PSM2_INFO_QUERY - if (psm2_info_query(PSM2_INFO_QUERY_NUM_UNITS, &cnt, 0, NULL) || !cnt) -#else - if (psm2_ep_num_devunits(&cnt) || !cnt) 
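/*
 * Illustrative sketch (not from the patch): psmx2_update_hfi_nic_info() above
 * resolves the unit's sysfs "device" symlink, keeps the last path component,
 * and pulls the PCI domain/bus/device/function out with sscanf("%x:%x:%x.%x")
 * to fill the fi_nic bus attributes.  Stand-alone demo of just the string
 * handling, using an example symlink target.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Example of what readlink() on .../hfi1_0/device might return. */
	char buffer[] = "../../../0000:18:00.0";
	unsigned int a, b, c, d;
	char *s;

	/* Keep only the last path component, as the provider does. */
	s = strrchr(buffer, '/');
	s = s ? s + 1 : buffer;

	if (sscanf(s, "%x:%x:%x.%x", &a, &b, &c, &d) == 4)
		printf("domain=%04x bus=%02x device=%02x function=%x\n", a, b, c, d);
	else
		printf("unrecognized PCI address: %s\n", s);
	return 0;
}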
-#endif - { - FI_INFO(&psmx2_prov, FI_LOG_CORE, - "no PSM2 device is found.\n"); + if (psmx2_update_hfi_info()) goto err_out; - } - psmx2_env.num_devunits = cnt; - psmx2_update_hfi_info(); - if (!psmx2_num_active_units) { + if (!psmx2_hfi_info.num_active_units) { FI_INFO(&psmx2_prov, FI_LOG_CORE, "no PSM2 device is active.\n"); goto err_out; @@ -483,6 +581,20 @@ static int psmx2_getinfo(uint32_t api_version, const char *node, } } + /* Check that the src address contains valid unit */ + if (src_addr->unit != PSMX2_DEFAULT_UNIT) { + if (src_addr->unit < 0 || src_addr->unit >= PSMX2_MAX_UNITS) { + FI_INFO(&psmx2_prov, FI_LOG_CORE, + "invalid source address: unit %d out of range\n", src_addr->unit); + goto err_out; + } + if (!psmx2_hfi_info.unit_is_active[src_addr->unit]) { + FI_INFO(&psmx2_prov, FI_LOG_CORE, + "invalid source address: unit %d is inactive\n", src_addr->unit); + goto err_out; + } + } + /* Resovle dest address using "node", "service" pair */ if (!dest_addr && node && !(flags & FI_SOURCE)) { psm2_uuid_t uuid; @@ -513,7 +625,7 @@ static int psmx2_getinfo(uint32_t api_version, const char *node, } } - /* Update prov info with resovled addresses and environment settings */ + /* Update prov info with resolved addresses and hfi info */ psmx2_update_prov_info(prov_info, src_addr, dest_addr); /* Remove prov info that don't match the hints */ @@ -522,7 +634,13 @@ static int psmx2_getinfo(uint32_t api_version, const char *node, /* Apply hints to the prov info */ psmx2_alter_prov_info(api_version, hints, prov_info); + + /* Set fi_nic structure */ + psmx2_update_hfi_nic_info(prov_info); + *info = prov_info; + free(src_addr); + free(dest_addr); return 0; err_out: @@ -557,8 +675,8 @@ static void psmx2_fini(void) struct fi_provider psmx2_prov = { .name = PSMX2_PROV_NAME, - .version = PSMX2_VERSION, - .fi_version = FI_VERSION(1, 8), + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, .getinfo = psmx2_getinfo, .fabric = psmx2_fabric, .cleanup = psmx2_fini diff --git a/prov/psm2/src/psmx2_msg.c b/prov/psm2/src/psmx2_msg.c index 52ae7188146..2384ab23bd1 100644 --- a/prov/psm2/src/psmx2_msg.c +++ b/prov/psm2/src/psmx2_msg.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses.
You may choose to be licensed under the terms of the GNU @@ -211,7 +211,10 @@ ssize_t psmx2_send_generic(struct fid_ep *ep, const void *buf, size_t len, assert(av); psm2_epaddr = psmx2_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); - PSMX2_SET_TAG(psm2_tag, 0, data, PSMX2_TYPE_MSG | PSMX2_IMM_BIT_SET(have_data)); + if (have_data) + PSMX2_SET_TAG(psm2_tag, 0, data, PSMX2_TYPE_MSG | PSMX2_IMM_BIT); + else + PSMX2_SET_TAG(psm2_tag, 0, ep_priv->sep_id, PSMX2_TYPE_MSG); if ((flags & PSMX2_NO_COMPLETION) || (ep_priv->send_selective_completion && !(flags & FI_COMPLETION))) @@ -353,10 +356,12 @@ ssize_t psmx2_sendv_generic(struct fid_ep *ep, const struct iovec *iov, assert(av); psm2_epaddr = psmx2_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); - if (flags & FI_REMOTE_CQ_DATA) + if (flags & FI_REMOTE_CQ_DATA) { msg_flags |= PSMX2_IMM_BIT; - - PSMX2_SET_TAG(psm2_tag, 0ULL, data, msg_flags); + PSMX2_SET_TAG(psm2_tag, 0ULL, data, msg_flags); + } else { + PSMX2_SET_TAG(psm2_tag, 0ULL, ep_priv->sep_id, msg_flags); + } if ((flags & PSMX2_NO_COMPLETION) || (ep_priv->send_selective_completion && !(flags & FI_COMPLETION))) diff --git a/prov/psm2/src/psmx2_rma.c b/prov/psm2/src/psmx2_rma.c index eec369ad387..db7873e0bae 100644 --- a/prov/psm2/src/psmx2_rma.c +++ b/prov/psm2/src/psmx2_rma.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -577,6 +577,7 @@ static ssize_t psmx2_rma_self(int am_cmd, void psmx2_am_ack_rma(struct psmx2_am_request *req) { psm2_amarg_t args[8]; + int err; if ((req->op & PSMX2_AM_OP_MASK) != PSMX2_AM_REQ_WRITE_LONG) return; @@ -585,9 +586,12 @@ void psmx2_am_ack_rma(struct psmx2_am_request *req) args[0].u32w1 = req->error; args[1].u64 = (uint64_t)(uintptr_t)req->write.context; - psm2_am_request_short(req->write.peer_addr, - PSMX2_AM_RMA_HANDLER, args, 2, NULL, 0, - PSM2_AM_FLAG_NOREPLY, NULL, NULL); + err = psm2_am_request_short(req->write.peer_addr, + PSMX2_AM_RMA_HANDLER, args, 2, NULL, 0, + PSM2_AM_FLAG_NOREPLY, NULL, NULL); + if (err) + FI_INFO(&psmx2_prov, FI_LOG_EP_DATA, + "failed to send am_ack: err %d.\n", err); } #if !HAVE_PSM2_MQ_FP_MSG @@ -636,6 +640,8 @@ ssize_t psmx2_read_generic(struct fid_ep *ep, void *buf, size_t len, psm2_epid_t psm2_epid; psm2_mq_req_t psm2_req; psm2_mq_tag_t psm2_tag, psm2_tagsel; + size_t req_refcnt = 0; + int err; ep_priv = container_of(ep, struct psmx2_fid_ep, ep); @@ -684,17 +690,25 @@ ssize_t psmx2_read_generic(struct fid_ep *ep, void *buf, size_t len, if (psmx2_env.tagged_rma && len > chunk_size) { PSMX2_SET_TAG(psm2_tag, (uint64_t)req, 0, PSMX2_RMA_TYPE_READ); PSMX2_SET_MASK(psm2_tagsel, PSMX2_MATCH_ALL, PSMX2_RMA_TYPE_MASK); - psm2_mq_irecv2(ep_priv->tx->psm2_mq, psm2_epaddr, - &psm2_tag, &psm2_tagsel, 0, buf, len, - (void *)&req->fi_context, &psm2_req); + err = psm2_mq_irecv2(ep_priv->tx->psm2_mq, psm2_epaddr, + &psm2_tag, &psm2_tagsel, 0, buf, len, + (void *)&req->fi_context, &psm2_req); + if (err) { + psmx2_am_request_free(ep_priv->tx, req); + return psmx2_errno(err); + } PSMX2_AM_SET_OP(args[0].u32w0, PSMX2_AM_REQ_READ_LONG); args[0].u32w1 = len; args[1].u64 = (uint64_t)req; args[2].u64 = addr; args[3].u64 = key; - psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, - args, 4, NULL, 0, 0, NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, + 
args, 4, NULL, 0, 0, NULL, NULL); + if (err) { + /* req in use, don't free */ + return psmx2_errno(err); + } psmx2_am_poll(ep_priv->tx); return 0; } @@ -706,20 +720,31 @@ ssize_t psmx2_read_generic(struct fid_ep *ep, void *buf, size_t len, args[0].u32w1 = chunk_size; args[2].u64 = addr; args[4].u64 = offset; - psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, - args, 5, NULL, 0, 0, NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, + args, 5, NULL, 0, 0, NULL, NULL); + if (err) { + if (!req_refcnt) + psmx2_am_request_free(ep_priv->tx, req); + return psmx2_errno(err); + } psmx2_am_poll(ep_priv->tx); addr += chunk_size; len -= chunk_size; offset += chunk_size; + req_refcnt++; } PSMX2_AM_SET_FLAG(args[0].u32w0, PSMX2_AM_EOM); args[0].u32w1 = len; args[2].u64 = addr; args[4].u64 = offset; - psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, - args, 5, NULL, 0, 0, NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, + args, 5, NULL, 0, 0, NULL, NULL); + if (err) { + if (!req_refcnt) + psmx2_am_request_free(ep_priv->tx, req); + return psmx2_errno(err); + } psmx2_am_poll(ep_priv->tx); return 0; } @@ -742,6 +767,8 @@ ssize_t psmx2_readv_generic(struct fid_ep *ep, const struct iovec *iov, size_t total_len, long_len = 0, short_len; void *long_buf = NULL; int i; + size_t req_refcnt = 0; + int err; ep_priv = container_of(ep, struct psmx2_fid_ep, ep); @@ -819,39 +846,68 @@ ssize_t psmx2_readv_generic(struct fid_ep *ep, const struct iovec *iov, args[0].u32w1 = chunk_size; args[2].u64 = addr; args[4].u64 = offset; - psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, - args, 5, NULL, 0, 0, NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, + args, 5, NULL, 0, 0, NULL, NULL); + if (err) { + if (!req_refcnt) { + free(req->tmpbuf); + psmx2_am_request_free(ep_priv->tx, req); + } + return psmx2_errno(err); + } psmx2_am_poll(ep_priv->tx); addr += chunk_size; short_len -= chunk_size; offset += chunk_size; + req_refcnt++; } - if (!long_len) - PSMX2_AM_SET_FLAG(args[0].u32w0, PSMX2_AM_EOM); - args[0].u32w1 = short_len; - args[2].u64 = addr; - args[4].u64 = offset; - psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, - args, 5, NULL, 0, 0, NULL, NULL); - psmx2_am_poll(ep_priv->tx); + if (short_len) { + if (!long_len) + PSMX2_AM_SET_FLAG(args[0].u32w0, PSMX2_AM_EOM); + args[0].u32w1 = short_len; + args[2].u64 = addr; + args[4].u64 = offset; + err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, + args, 5, NULL, 0, 0, NULL, NULL); + if (err) { + if (!req_refcnt) { + free(req->tmpbuf); + psmx2_am_request_free(ep_priv->tx, req); + } + return psmx2_errno(err); + } + psmx2_am_poll(ep_priv->tx); + req_refcnt++; + } /* Use the long protocol for the last segment */ if (long_len) { PSMX2_SET_TAG(psm2_tag, (uint64_t)req, 0, PSMX2_RMA_TYPE_READ); PSMX2_SET_MASK(psm2_tagsel, PSMX2_MATCH_ALL, PSMX2_RMA_TYPE_MASK); - psm2_mq_irecv2(ep_priv->tx->psm2_mq, psm2_epaddr, - &psm2_tag, &psm2_tagsel, 0, - long_buf, long_len, - (void *)&req->fi_context, &psm2_req); + err = psm2_mq_irecv2(ep_priv->tx->psm2_mq, psm2_epaddr, + &psm2_tag, &psm2_tagsel, 0, + long_buf, long_len, + (void *)&req->fi_context, &psm2_req); + if (err) { + if (!req_refcnt) { + free(req->tmpbuf); + psmx2_am_request_free(ep_priv->tx, req); + } + return psmx2_errno(err); + } PSMX2_AM_SET_OP(args[0].u32w0, PSMX2_AM_REQ_READ_LONG); args[0].u32w1 = long_len; args[1].u64 = (uint64_t)req; args[2].u64 = addr + short_len; args[3].u64 = key; - 
psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, - args, 4, NULL, 0, 0, NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, + args, 4, NULL, 0, 0, NULL, NULL); + if (err) { + /* req in use, don't free */ + return psmx2_errno(err); + } psmx2_am_poll(ep_priv->tx); } @@ -937,6 +993,8 @@ ssize_t psmx2_write_generic(struct fid_ep *ep, const void *buf, size_t len, psm2_mq_tag_t psm2_tag; void *psm2_context; int no_event; + size_t req_refcnt = 0; + int err; ep_priv = container_of(ep, struct psmx2_fid_ep, ep); @@ -1022,13 +1080,22 @@ ssize_t psmx2_write_generic(struct fid_ep *ep, const void *buf, size_t len, psm2_context = (void *)&req->fi_context; } - psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, args, - nargs, NULL, 0, am_flags, NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, + args, nargs, NULL, 0, am_flags, + NULL, NULL); + if (err) { + free(req->tmpbuf); + psmx2_am_request_free(ep_priv->tx, req); + return psmx2_errno(err); + } psmx2_am_poll(ep_priv->tx); - psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, 0, - &psm2_tag, buf, len, psm2_context, &psm2_req); - + err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, 0, + &psm2_tag, buf, len, psm2_context, &psm2_req); + if (err) { + /* req in use, don't free */ + return psmx2_errno(err); + } return 0; } @@ -1039,13 +1106,21 @@ ssize_t psmx2_write_generic(struct fid_ep *ep, const void *buf, size_t len, args[1].u64 = (uint64_t)(uintptr_t)req; args[2].u64 = addr; args[3].u64 = key; - psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, args, - nargs, (void *)buf, chunk_size, am_flags, - NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, + args, nargs, (void *)buf, + chunk_size, am_flags, NULL, NULL); + if (err) { + if (!req_refcnt) { + free(req->tmpbuf); + psmx2_am_request_free(ep_priv->tx, req); + } + return psmx2_errno(err); + } psmx2_am_poll(ep_priv->tx); buf = (const uint8_t *)buf + chunk_size; addr += chunk_size; len -= chunk_size; + req_refcnt++; } args[0].u32w1 = len; @@ -1059,8 +1134,16 @@ ssize_t psmx2_write_generic(struct fid_ep *ep, const void *buf, size_t len, } else { PSMX2_AM_SET_FLAG(args[0].u32w0, PSMX2_AM_EOM); } - psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, args, nargs, - (void *)buf, len, am_flags, NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, + args, nargs, (void *)buf, len, am_flags, + NULL, NULL); + if (err) { + if (!req_refcnt) { + free(req->tmpbuf); + psmx2_am_request_free(ep_priv->tx, req); + } + return psmx2_errno(err); + } psmx2_am_poll(ep_priv->tx); return 0; } @@ -1086,6 +1169,8 @@ ssize_t psmx2_writev_generic(struct fid_ep *ep, const struct iovec *iov, size_t total_len, len, len_sent; uint8_t *buf, *p; int i; + size_t req_refcnt = 0; + int err; ep_priv = container_of(ep, struct psmx2_fid_ep, ep); @@ -1162,8 +1247,14 @@ ssize_t psmx2_writev_generic(struct fid_ep *ep, const struct iovec *iov, } else { PSMX2_AM_SET_FLAG(args[0].u32w0, PSMX2_AM_EOM); } - psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, args, nargs, - (void *)buf, len, am_flags, NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, + args, nargs, (void *)buf, len, + am_flags, NULL, NULL); + if (err) { + free(req->tmpbuf); + psmx2_am_request_free(ep_priv->tx, req); + return psmx2_errno(err); + } psmx2_am_poll(ep_priv->tx); return 0; } @@ -1220,14 +1311,24 @@ ssize_t psmx2_writev_generic(struct fid_ep *ep, const struct iovec *iov, psm2_context = (void *)&req->fi_context; } - 
psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, args, - nargs, NULL, 0, am_flags, NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, + args, nargs, NULL, 0, am_flags, + NULL, NULL); + if (err) { + if (!req_refcnt) + psmx2_am_request_free(ep_priv->tx, req); + return psmx2_errno(err); + } psmx2_am_poll(ep_priv->tx); - psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, 0, - &psm2_tag, iov[i].iov_base, iov[i].iov_len, - psm2_context, &psm2_req); - + err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, + 0, &psm2_tag, iov[i].iov_base, + iov[i].iov_len, psm2_context, + &psm2_req); + if (err) { + /* req in use, don't free */ + return psmx2_errno(err); + } return 0; } @@ -1241,14 +1342,21 @@ ssize_t psmx2_writev_generic(struct fid_ep *ep, const struct iovec *iov, args[1].u64 = (uint64_t)(uintptr_t)req; args[2].u64 = addr; args[3].u64 = key; - psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, args, - nargs, (void *)buf, chunk_size, am_flags, - NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, + args, nargs, (void *)buf, + chunk_size, am_flags, + NULL, NULL); + if (err) { + if (!req_refcnt) + psmx2_am_request_free(ep_priv->tx, req); + return psmx2_errno(err); + } psmx2_am_poll(ep_priv->tx); buf += chunk_size; addr += chunk_size; len -= chunk_size; len_sent += chunk_size; + req_refcnt++; } args[0].u32w1 = len; @@ -1264,12 +1372,19 @@ ssize_t psmx2_writev_generic(struct fid_ep *ep, const struct iovec *iov, PSMX2_AM_SET_FLAG(args[0].u32w0, PSMX2_AM_EOM); } } - psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, args, nargs, - (void *)buf, len, am_flags, NULL, NULL); + err = psm2_am_request_short(psm2_epaddr, PSMX2_AM_RMA_HANDLER, + args, nargs, (void *)buf, len, + am_flags, NULL, NULL); + if (err) { + if (!req_refcnt) + psmx2_am_request_free(ep_priv->tx, req); + return psmx2_errno(err); + } psmx2_am_poll(ep_priv->tx); addr += len; len_sent += len; + req_refcnt++; } return 0; diff --git a/prov/psm2/src/psmx2_tagged.c b/prov/psm2/src/psmx2_tagged.c index 4c16773c114..bb99b6a3714 100644 --- a/prov/psm2/src/psmx2_tagged.c +++ b/prov/psm2/src/psmx2_tagged.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -535,8 +535,12 @@ ssize_t psmx2_tagged_send_generic(struct fid_ep *ep, assert(av); psm2_epaddr = psmx2_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); - PSMX2_SET_TAG(psm2_tag, tag, (uint32_t)data, - PSMX2_TYPE_TAGGED | PSMX2_IMM_BIT_SET(have_data)); + if (have_data) + PSMX2_SET_TAG(psm2_tag, tag, (uint32_t)data, + PSMX2_TYPE_TAGGED | PSMX2_IMM_BIT); + else + PSMX2_SET_TAG(psm2_tag, tag, (uint32_t)ep_priv->sep_id, + PSMX2_TYPE_TAGGED); if ((flags & PSMX2_NO_COMPLETION) || (ep_priv->send_selective_completion && !(flags & FI_COMPLETION))) @@ -602,7 +606,7 @@ psmx2_tagged_send_specialized(struct fid_ep *ep, const void *buf, fi_addr_t dest_addr, uint64_t tag, void *context, int enable_completion, int av_map, - int has_data, uint64_t data) + int have_data, uint64_t data) { struct psmx2_fid_ep *ep_priv; psm2_epaddr_t psm2_epaddr; @@ -622,10 +626,10 @@ psmx2_tagged_send_specialized(struct fid_ep *ep, const void *buf, psm2_epaddr = psmx2_av_translate_addr(ep_priv->av, ep_priv->tx, dest_addr, FI_AV_TABLE); } - if (has_data) + if (have_data) PSMX2_SET_TAG(psm2_tag, tag, data, PSMX2_TYPE_TAGGED | PSMX2_IMM_BIT); else - PSMX2_SET_TAG(psm2_tag, tag, 0, PSMX2_TYPE_TAGGED); + PSMX2_SET_TAG(psm2_tag, tag, ep_priv->sep_id, PSMX2_TYPE_TAGGED); if (enable_completion) { fi_context = context; @@ -734,7 +738,7 @@ static inline ssize_t psmx2_tagged_inject_specialized(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag, int av_map, - int has_data, uint64_t data) + int have_data, uint64_t data) { struct psmx2_fid_ep *ep_priv; psm2_epaddr_t psm2_epaddr; @@ -755,10 +759,10 @@ psmx2_tagged_inject_specialized(struct fid_ep *ep, const void *buf, psm2_epaddr = psmx2_av_translate_addr(ep_priv->av, ep_priv->tx, dest_addr, FI_AV_TABLE); } - if (has_data) + if (have_data) PSMX2_SET_TAG(psm2_tag, tag, data, PSMX2_TYPE_TAGGED | PSMX2_IMM_BIT); else - PSMX2_SET_TAG(psm2_tag, tag, 0, PSMX2_TYPE_TAGGED); + PSMX2_SET_TAG(psm2_tag, tag, ep_priv->sep_id, PSMX2_TYPE_TAGGED); err = psm2_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, 0, &psm2_tag, buf, len); @@ -895,9 +899,10 @@ ssize_t psmx2_tagged_sendv_generic(struct fid_ep *ep, assert(av); psm2_epaddr = psmx2_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); - PSMX2_SET_TAG(psm2_tag, tag, (uint32_t)data, - msg_flags | PSMX2_IMM_BIT_SET(have_data)); - + if (have_data) + PSMX2_SET_TAG(psm2_tag, tag, (uint32_t)data, msg_flags | PSMX2_IMM_BIT); + else + PSMX2_SET_TAG(psm2_tag, tag, (uint32_t)ep_priv->sep_id, msg_flags); if ((flags & PSMX2_NO_COMPLETION) || (ep_priv->send_selective_completion && !(flags & FI_COMPLETION))) diff --git a/prov/psm2/src/psmx2_trx_ctxt.c b/prov/psm2/src/psmx2_trx_ctxt.c index 726a9e7dd02..31c5fcd741c 100644 --- a/prov/psm2/src/psmx2_trx_ctxt.c +++ b/prov/psm2/src/psmx2_trx_ctxt.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -47,32 +47,38 @@ int psmx2_trx_ctxt_cnt = 0; */ struct disconnect_args { - psm2_ep_t ep; - psm2_epaddr_t epaddr; + struct psmx2_trx_ctxt *trx_ctxt; + psm2_epaddr_t epaddr; }; static void *disconnect_func(void *args) { struct disconnect_args *disconn = args; + struct psmx2_trx_ctxt *trx_ctxt = disconn->trx_ctxt; + struct psmx2_epaddr_context *epaddr_context; psm2_error_t errors; FI_INFO(&psmx2_prov, FI_LOG_CORE, - "psm2_ep: %p, epaddr: %p\n", disconn->ep, disconn->epaddr); + "psm2_ep: %p, epaddr: %p\n", trx_ctxt->psm2_ep, disconn->epaddr); - psm2_ep_disconnect2(disconn->ep, 1, &disconn->epaddr, NULL, + trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2); + dlist_remove_first_match(&trx_ctxt->peer_list, + psmx2_peer_match, disconn->epaddr); + trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2); + if (trx_ctxt->ep && trx_ctxt->ep->av) + psmx2_av_remove_conn(trx_ctxt->ep->av, trx_ctxt, disconn->epaddr); + + epaddr_context = psm2_epaddr_getctxt(disconn->epaddr); + psm2_epaddr_setctxt(disconn->epaddr, NULL); + free(epaddr_context); + + psm2_ep_disconnect2(trx_ctxt->psm2_ep, 1, &disconn->epaddr, NULL, &errors, PSM2_EP_DISCONNECT_FORCE, 0); + free(args); return NULL; } -static int psmx2_peer_match(struct dlist_entry *item, const void *arg) -{ - struct psmx2_epaddr_context *peer; - - peer = container_of(item, struct psmx2_epaddr_context, entry); - return (peer->epaddr == arg); -} - int psmx2_am_trx_ctxt_handler(psm2_am_token_t token, psm2_amarg_t *args, int nargs, void *src, uint32_t len, void *hctx) { @@ -93,16 +99,14 @@ int psmx2_am_trx_ctxt_handler(psm2_am_token_t token, psm2_amarg_t *args, * we can't call psm2_ep_disconnect from the AM * handler. instead, create a thread to do the work. * the performance of this operation is not important. + * + * also put the av cleanup operations into the thread + * to avoid deadlock because the AM handler may be + * called with the av lock held. 
*/ disconn = malloc(sizeof(*disconn)); if (disconn) { - trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2); - dlist_remove_first_match(&trx_ctxt->peer_list, - psmx2_peer_match, epaddr); - trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2); - if (trx_ctxt->ep && trx_ctxt->ep->av) - psmx2_av_remove_conn(trx_ctxt->ep->av, trx_ctxt, epaddr); - disconn->ep = trx_ctxt->psm2_ep; + disconn->trx_ctxt = trx_ctxt; disconn->epaddr = epaddr; pthread_create(&disconnect_thread, NULL, disconnect_func, disconn); @@ -124,6 +128,7 @@ void psmx2_trx_ctxt_disconnect_peers(struct psmx2_trx_ctxt *trx_ctxt) struct psmx2_epaddr_context *peer; struct dlist_entry peer_list; psm2_amarg_t arg; + int err; arg.u32w0 = PSMX2_AM_REQ_TRX_CTXT_DISCONNECT; @@ -138,9 +143,17 @@ void psmx2_trx_ctxt_disconnect_peers(struct psmx2_trx_ctxt *trx_ctxt) dlist_foreach_safe(&peer_list, item, tmp) { peer = container_of(item, struct psmx2_epaddr_context, entry); - FI_INFO(&psmx2_prov, FI_LOG_CORE, "epaddr: %p\n", peer->epaddr); - psm2_am_request_short(peer->epaddr, PSMX2_AM_TRX_CTXT_HANDLER, - &arg, 1, NULL, 0, 0, NULL, NULL); + if (trx_ctxt->domain->params.disconnect) { + FI_INFO(&psmx2_prov, FI_LOG_CORE, "epaddr: %p\n", peer->epaddr); + err = psm2_am_request_short(peer->epaddr, + PSMX2_AM_TRX_CTXT_HANDLER, + &arg, 1, NULL, 0, 0, NULL, + NULL); + if (err) + FI_INFO(&psmx2_prov, FI_LOG_CORE, + "failed to send disconnect, err %d\n", + err); + } psm2_epaddr_setctxt(peer->epaddr, NULL); free(peer); } @@ -183,8 +196,7 @@ void psmx2_trx_ctxt_free(struct psmx2_trx_ctxt *trx_ctxt, int usage_flags) dlist_remove(&trx_ctxt->entry); trx_ctxt->domain->trx_ctxt_unlock_fn(&trx_ctxt->domain->trx_ctxt_lock, 1); - if (psmx2_env.disconnect) - psmx2_trx_ctxt_disconnect_peers(trx_ctxt); + psmx2_trx_ctxt_disconnect_peers(trx_ctxt); if (trx_ctxt->am_initialized) psmx2_am_fini(trx_ctxt); @@ -224,7 +236,8 @@ void psmx2_trx_ctxt_free(struct psmx2_trx_ctxt *trx_ctxt, int usage_flags) struct psmx2_trx_ctxt *psmx2_trx_ctxt_alloc(struct psmx2_fid_domain *domain, struct psmx2_ep_name *src_addr, int sep_ctxt_idx, - int usage_flags) + int usage_flags, + uint8_t *uuid) { struct psmx2_trx_ctxt *trx_ctxt; struct psm2_ep_open_opts opts; @@ -234,12 +247,16 @@ struct psmx2_trx_ctxt *psmx2_trx_ctxt_alloc(struct psmx2_fid_domain *domain, int asked_flags = usage_flags & PSMX2_TX_RX; int compatible_flags = ~asked_flags & PSMX2_TX_RX; + if (!uuid) + uuid = domain->uuid; + /* Check existing allocations first if only Tx or Rx is needed */ if (compatible_flags) { domain->trx_ctxt_lock_fn(&domain->trx_ctxt_lock, 1); dlist_foreach(&domain->trx_ctxt_list, item) { trx_ctxt = container_of(item, struct psmx2_trx_ctxt, entry); - if (compatible_flags == trx_ctxt->usage_flags) { + if (compatible_flags == trx_ctxt->usage_flags && + !memcmp(uuid, trx_ctxt->uuid, sizeof(psm2_uuid_t))) { trx_ctxt->usage_flags |= asked_flags; domain->trx_ctxt_unlock_fn(&domain->trx_ctxt_lock, 1); FI_INFO(&psmx2_prov, FI_LOG_CORE, @@ -252,10 +269,10 @@ struct psmx2_trx_ctxt *psmx2_trx_ctxt_alloc(struct psmx2_fid_domain *domain, domain->trx_ctxt_unlock_fn(&domain->trx_ctxt_lock, 1); } - if (psmx2_trx_ctxt_cnt >= psmx2_env.max_trx_ctxt) { + if (psmx2_trx_ctxt_cnt >= psmx2_hfi_info.max_trx_ctxt) { FI_WARN(&psmx2_prov, FI_LOG_CORE, "number of Tx/Rx contexts exceeds limit (%d).\n", - psmx2_env.max_trx_ctxt); + psmx2_hfi_info.max_trx_ctxt); return NULL; } @@ -276,8 +293,9 @@ struct psmx2_trx_ctxt *psmx2_trx_ctxt_alloc(struct psmx2_fid_domain *domain, } psm2_ep_open_opts_get_defaults(&opts); + 
memcpy(trx_ctxt->uuid, uuid, sizeof(psm2_uuid_t)); FI_INFO(&psmx2_prov, FI_LOG_CORE, - "uuid: %s\n", psmx2_uuid_to_string(domain->fabric->uuid)); + "uuid: %s\n", psmx2_uuid_to_string(uuid)); opts.unit = src_addr ? src_addr->unit : PSMX2_DEFAULT_UNIT; opts.port = src_addr ? src_addr->port : PSMX2_DEFAULT_PORT; @@ -291,7 +309,7 @@ struct psmx2_trx_ctxt *psmx2_trx_ctxt_alloc(struct psmx2_fid_domain *domain, "sep %d: ep_open_opts: unit=%d\n", sep_ctxt_idx, opts.unit); } - err = psm2_ep_open(domain->fabric->uuid, &opts, + err = psm2_ep_open(uuid, &opts, &trx_ctxt->psm2_ep, &trx_ctxt->psm2_epid); if (err != PSM2_OK) { FI_WARN(&psmx2_prov, FI_LOG_CORE, @@ -301,7 +319,7 @@ struct psmx2_trx_ctxt *psmx2_trx_ctxt_alloc(struct psmx2_fid_domain *domain, /* When round-robin fails, retry w/o explicit assignment */ opts.unit = -1; - err = psm2_ep_open(domain->fabric->uuid, &opts, + err = psm2_ep_open(uuid, &opts, &trx_ctxt->psm2_ep, &trx_ctxt->psm2_epid); if (err != PSM2_OK) { FI_WARN(&psmx2_prov, FI_LOG_CORE, diff --git a/prov/psm2/src/psmx2_util.c b/prov/psm2/src/psmx2_util.c index 382ba172269..71ce68cbf0a 100644 --- a/prov/psm2/src/psmx2_util.c +++ b/prov/psm2/src/psmx2_util.c @@ -79,7 +79,7 @@ char *psmx2_uuid_to_string(psm2_uuid_t uuid) { static char s[40]; - sprintf(s, + snprintf(s, sizeof(s), "%02hhX%02hhX%02hhX%02hhX-" "%02hhX%02hhX-%02hhX%02hhX-%02hhX%02hhX-" "%02hhX%02hhX%02hhX%02hhX%02hhX%02hhX", diff --git a/prov/psm2/src/psmx2_wait.c b/prov/psm2/src/psmx2_wait.c index c53024240eb..62df2d6dafc 100644 --- a/prov/psm2/src/psmx2_wait.c +++ b/prov/psm2/src/psmx2_wait.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU diff --git a/prov/psm2/src/version.h b/prov/psm2/src/version.h index 50187977e15..f99d4ccbfaa 100644 --- a/prov/psm2/src/version.h +++ b/prov/psm2/src/version.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -47,7 +47,7 @@ #endif #define PSMX2_PROV_NAME "psm2" -#define PSMX2_DOMAIN_NAME "psm2" +#define PSMX2_DOMAIN_NAME "hfi1" #define PSMX2_FABRIC_NAME "psm2" #define PSMX2_DEFAULT_UUID "00FF00FF-0000-0000-0000-00FF00FF00FF" diff --git a/prov/psm3/.gitignore b/prov/psm3/.gitignore new file mode 100644 index 00000000000..993024e61e6 --- /dev/null +++ b/prov/psm3/.gitignore @@ -0,0 +1,5 @@ +libpsm3-fi.map +libpsm3-fi.pc + +libpsm3-fi-*.tar.bz2 +libpsm3-fi-*.tar.gz diff --git a/prov/psm3/Makefile.am b/prov/psm3/Makefile.am new file mode 100644 index 00000000000..08f214d18db --- /dev/null +++ b/prov/psm3/Makefile.am @@ -0,0 +1,248 @@ +# +# Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2017-2018 Intel Corporation, Inc. All right reserved. +# Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All rights reserved. 
+# (C) Copyright 2020 Hewlett Packard Enterprise Development LP +# +# Makefile.am for libpsm3-fi +EXTRA_DIST = + +AM_CPPFLAGS = \ + -I$(srcdir)/inc \ + -D_GNU_SOURCE -D__USE_XOPEN2K8 \ + -DSYSCONFDIR=\"$(sysconfdir)\" \ + -DRDMADIR=\"@rdmadir@\" \ + -DPROVDLDIR=\"$(pkglibdir)\" +if HAVE_PSM3_SRC +AM_CPPFLAGS += -I$(srcdir)/psm3 +endif + +noinst_LTLIBRARIES = +libfabric_pkglibdir = $(libdir)/libfabric +libfabric_pkglib_LTLIBRARIES = + +if EMBEDDED +noinst_LTLIBRARIES += src/libpsm3-fi.la +else +libfabric_pkglib_LTLIBRARIES += src/libpsm3-fi.la +endif + +ACLOCAL_AMFLAGS = -I config +AM_CFLAGS = -Wall + +if HAVE_LD_VERSION_SCRIPT + libpsm3_fi_version_script = -Wl,--version-script=$(builddir)/libpsm3-fi.map +else !HAVE_LD_VERSION_SCRIPT + libpsm3_fi_version_script = +endif !HAVE_LD_VERSION_SCRIPT + +# rdmaincludedir = $(includedir)/rdma + +# rdmainclude_HEADERS = + +# internal utility functions shared by in-tree providers: +common_srcs = \ + shared/hmem.c \ + shared/hmem_rocr.c \ + shared/hmem_cuda.c \ + shared/hmem_ze.c \ + shared/common.c \ + shared/enosys.c \ + shared/rbtree.c \ + shared/tree.c \ + shared/fasthash.c \ + shared/indexer.c \ + shared/mem.c \ + shared/iov.c \ + shared/shared/ofi_str.c \ + util/src/util_atomic.c \ + util/src/util_attr.c \ + util/src/util_av.c \ + util/src/util_buf.c \ + util/src/util_coll.c \ + util/src/util_cq.c \ + util/src/util_cntr.c \ + util/src/util_domain.c \ + util/src/util_ep.c \ + util/src/util_eq.c \ + util/src/util_fabric.c \ + util/src/util_main.c \ + util/src/util_mem_hooks.c \ + util/src/util_mem_monitor.c \ + util/src/util_mr_cache.c \ + util/src/util_mr_map.c \ + util/src/util_ns.c \ + util/src/util_pep.c \ + util/src/util_poll.c \ + util/src/util_shm.c \ + util/src/util_wait.c \ + util/src/cuda_mem_monitor.c \ + util/src/rocr_mem_monitor.c + +if MACOS +common_srcs += shared/unix/osd.c +common_srcs += inc/osx/osd.h +common_srcs += inc/unix/osd.h +endif + +if FREEBSD +common_srcs += shared/unix/osd.c +common_srcs += inc/freebsd/osd.h +common_srcs += inc/unix/osd.h +endif + +if LINUX +common_srcs += shared/unix/osd.c +common_srcs += shared/linux/osd.c +if HAVE_LINUX_PERF_RDPMC +if !HAVE_PSM3_SRC +common_srcs += shared/linux/rdpmc.c #seems to be a copy of psm3/psm_perf.c +endif +endif +common_srcs += inc/linux/rdpmc.h +common_srcs += inc/linux/osd.h +common_srcs += inc/unix/osd.h +endif + +# ensure dl-built providers link back to libfabric +# linkback = src/libfabric.la + +bin_SCRIPTS = + +nodist_src_libpsm3_fi_la_SOURCES = +src_libpsm3_fi_la_SOURCES = \ + inc/ofi_hmem.h \ + inc/ofi.h \ + inc/ofi_abi.h \ + inc/ofi_atom.h \ + inc/ofi_enosys.h \ + inc/ofi_file.h \ + inc/ofi_hook.h \ + inc/ofi_indexer.h \ + inc/ofi_iov.h \ + inc/ofi_list.h \ + inc/ofi_bitmask.h \ + inc/shared/ofi_str.h \ + inc/ofi_lock.h \ + inc/ofi_mem.h \ + inc/ofi_osd.h \ + inc/ofi_proto.h \ + inc/ofi_recvwin.h \ + inc/ofi_rbuf.h \ + inc/ofi_shm.h \ + inc/ofi_signal.h \ + inc/ofi_epoll.h \ + inc/ofi_tree.h \ + inc/ofi_util.h \ + inc/ofi_atomic.h \ + inc/ofi_mr.h \ + inc/ofi_net.h \ + inc/ofi_perf.h \ + inc/ofi_coll.h \ + inc/fasthash.h \ + inc/rbtree.h \ + inc/uthash.h \ + inc/ofi_prov.h \ + inc/rdma/providers/fi_log.h \ + inc/rdma/providers/fi_prov.h \ + inc/rdma/fabric.h \ + inc/rdma/fi_atomic.h \ + inc/rdma/fi_cm.h \ + inc/rdma/fi_collective.h \ + inc/rdma/fi_domain.h \ + inc/rdma/fi_eq.h \ + inc/rdma/fi_rma.h \ + inc/rdma/fi_endpoint.h \ + inc/rdma/fi_errno.h \ + inc/rdma/fi_tagged.h \ + inc/rdma/fi_trigger.h \ + src/psmx3.h \ + src/psmx3_am.c \ + src/psmx3_atomic.c \ + 
src/psmx3_attr.c \ + src/psmx3_av.c \ + src/psmx3_cm.c \ + src/psmx3_cntr.c \ + src/psmx3_cq.c \ + src/psmx3_domain.c \ + src/psmx3_ep.c \ + src/psmx3_fabric.c \ + src/psmx3_init.c \ + src/psmx3_mr.c \ + src/psmx3_msg.c \ + src/psmx3_rma.c \ + src/psmx3_tagged.c \ + src/psmx3_trigger.h \ + src/psmx3_trx_ctxt.c \ + src/psmx3_util.c \ + src/psmx3_wait.c \ + src/version.h \ + $(common_srcs) + +src_libpsm3_fi_la_CPPFLAGS = $(AM_CPPFLAGS) +src_libpsm3_fi_la_DEPENDENCIES = libpsm3-fi.map +src_libpsm3_fi_la_LDFLAGS = +src_libpsm3_fi_la_LIBADD = + +src_libpsm3_fi_la_LDFLAGS += \ + -export-dynamic \ + $(libpsm3_fi_version_script) + +chksum_srcs = $(src_libpsm3_fi_la_SOURCES) +if HAVE_PSM3_SRC +src_libpsm3_fi_la_SOURCES += src/psm3_revision.c + +include psm3/Makefile.include +src_libpsm3_fi_la_LIBADD += libpsm2.la +src_libpsm3_fi_la_DEPENDENCIES += libpsm2.la + +else !HAVE_PSM3_SRC +src_libpsm3_fi_la_LDFLAGS += -lpsm2 +endif !HAVE_PSM3_SRC + +if !EMBEDDED +src_libpsm3_fi_la_LDFLAGS += -version-info 15:2:14 +endif + +prov_install_man_pages = man/man7/fi_psm3.7 + +prov_dist_man_pages = man/man7/fi_psm3.7 + +man_MANS = $(prov_install_man_pages) + +EXTRA_DIST += \ + libpsm3-fi.spec.in \ + config/distscript.pl \ + $(prov_dist_man_pages) + +pkgconfigdir = $(libdir)/pkgconfig +pkgconfig_DATA = libpsm3-fi.pc + +chksum_srcs += $(EXTRA_DIST) $(pkgconfig_DATA) + +all-local: + @echo "Building src checksum..."; \ + chksum=`cat $(chksum_srcs) | sha1sum | cut -d' ' -f 1`; \ + if ! grep -q $$chksum src/psm3_revision.c 2>/dev/null; then \ + sed -i "/define PSMX3_SRC_CHECKSUM/s/\".*\"/\"$$chksum\"/" src/psm3_revision.c; \ + echo "SRC checksum updated to $$chksum"; \ + else \ + echo "SRC checksum not changed: $$chksum"; \ + fi; \ + timestamp=`date`; \ + sed -i "/define PSMX3_BUILD_TIMESTAMP/s/\".*\"/\"$$timestamp\"/" src/psm3_revision.c; \ + echo "Updated build timestamp: $$timestamp" + +nroff: + @for file in $(prov_install_man_pages); do \ + source=`echo $$file | sed -e 's@/man[0-9]@@'`; \ + perl $(top_srcdir)/config/md2nroff.pl --source=$(top_srcdir)/$$source.md; \ + done + +dist-hook: libpsm3-fi.spec + cp libpsm3-fi.spec $(distdir) + perl $(top_srcdir)/config/distscript.pl "$(distdir)" "$(PACKAGE_VERSION)" + +rpm: dist + LDFLAGS=-Wl,--build-id rpmbuild -ta libpsm3-fi-$(PACKAGE_VERSION).tar.bz2 + diff --git a/prov/psm3/Makefile.include b/prov/psm3/Makefile.include new file mode 100644 index 00000000000..93e021a5e26 --- /dev/null +++ b/prov/psm3/Makefile.include @@ -0,0 +1,331 @@ +if HAVE_PSM3 +_psm3_files = \ + prov/psm3/src/version.h \ + prov/psm3/src/psmx3.h \ + prov/psm3/src/psmx3_am.c \ + prov/psm3/src/psmx3_atomic.c \ + prov/psm3/src/psmx3_attr.c \ + prov/psm3/src/psmx3_av.c \ + prov/psm3/src/psmx3_cm.c \ + prov/psm3/src/psmx3_cntr.c \ + prov/psm3/src/psmx3_cq.c \ + prov/psm3/src/psmx3_domain.c \ + prov/psm3/src/psmx3_ep.c \ + prov/psm3/src/psmx3_fabric.c \ + prov/psm3/src/psmx3_init.c \ + prov/psm3/src/psmx3_mr.c \ + prov/psm3/src/psmx3_msg.c \ + prov/psm3/src/psmx3_rma.c \ + prov/psm3/src/psmx3_tagged.c \ + prov/psm3/src/psmx3_trigger.h \ + prov/psm3/src/psmx3_trx_ctxt.c \ + prov/psm3/src/psmx3_util.c \ + prov/psm3/src/psmx3_wait.c + +_psm3_cppflags = \ + -I$(top_srcdir)/prov/psm3 \ + -I$(top_srcdir)/prov/psm3/include + +chksum_srcs = $(_psm3_files) + +if HAVE_PSM3_SRC +_psm3_cflags = -mavx2 +#include prov/psm3/psm3/Makefile.include +_psm3_files += \ + prov/psm3/src/psm3_revision.c +chksum_srcs += \ + prov/psm3/src/psm3_revision.c +_psm3_cppflags += \ + -I$(top_srcdir)/prov/psm3/psm3 \ + 
-I$(top_srcdir)/prov/psm3/psm3/ptl_ips/ \ + -I$(top_srcdir)/prov/psm3/psm3/include/ \ + -I$(top_srcdir)/prov/psm3/psm3/include/linux-i386/ \ + -I$(top_srcdir)/prov/psm3/psm3/mpspawn \ + -I$(top_srcdir)/prov/psm3/psm3/opa \ + -D_GNU_SOURCE=1 + +noinst_LTLIBRARIES += libopa.la libuuid.la \ + libptl_am.la libptl_ips.la libptl_self.la \ + libpsm_hal_gen1.la libpsm3i.la + +libptl_am_la_SOURCES = \ + prov/psm3/psm3/ptl_am/am_config.h \ + prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c \ + prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h \ + prov/psm3/psm3/ptl_am/am_reqrep.c \ + prov/psm3/psm3/ptl_am/am_reqrep_shmem.c \ + prov/psm3/psm3/ptl_am/cmarw.h \ + prov/psm3/psm3/ptl_am/cmarwu.c \ + prov/psm3/psm3/ptl_am/psm_am_internal.h \ + prov/psm3/psm3/ptl_am/ptl.c \ + prov/psm3/psm3/ptl_am/ptl_fwd.h +libptl_am_la_CPPFLAGS = \ + -I$(top_srcdir)/prov/psm3/psm3/ptl_am/ \ + $(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags) +libptl_am_la_CFLAGS = \ + $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) + +libptl_ips_la_SOURCES = \ + prov/psm3/psm3/ptl_ips/ips_config.h \ + prov/psm3/psm3/ptl_ips/ips_crc32.c \ + prov/psm3/psm3/ptl_ips/ips_epstate.c \ + prov/psm3/psm3/ptl_ips/ips_epstate.h \ + prov/psm3/psm3/ptl_ips/ips_expected_proto.h \ + prov/psm3/psm3/ptl_ips/ips_opp_path_rec.c \ + prov/psm3/psm3/ptl_ips/ips_path_rec.c \ + prov/psm3/psm3/ptl_ips/ips_path_rec.h \ + prov/psm3/psm3/ptl_ips/ips_proto.c \ + prov/psm3/psm3/ptl_ips/ips_proto.h \ + prov/psm3/psm3/ptl_ips/ips_proto_am.c \ + prov/psm3/psm3/ptl_ips/ips_proto_am.h \ + prov/psm3/psm3/ptl_ips/ips_proto_connect.c \ + prov/psm3/psm3/ptl_ips/ips_proto_dump.c \ + prov/psm3/psm3/ptl_ips/ips_proto_expected.c \ + prov/psm3/psm3/ptl_ips/ips_proto_header.h \ + prov/psm3/psm3/ptl_ips/ips_proto_help.h \ + prov/psm3/psm3/ptl_ips/ips_proto_internal.h \ + prov/psm3/psm3/ptl_ips/ips_proto_mq.c \ + prov/psm3/psm3/ptl_ips/ips_proto_params.h \ + prov/psm3/psm3/ptl_ips/ips_proto_recv.c \ + prov/psm3/psm3/ptl_ips/ips_recvhdrq.c \ + prov/psm3/psm3/ptl_ips/ips_recvhdrq.h \ + prov/psm3/psm3/ptl_ips/ips_recvq.c \ + prov/psm3/psm3/ptl_ips/ips_recvq.h \ + prov/psm3/psm3/ptl_ips/ips_scb.c \ + prov/psm3/psm3/ptl_ips/ips_scb.h \ + prov/psm3/psm3/ptl_ips/ips_stats.h \ + prov/psm3/psm3/ptl_ips/ips_subcontext.h \ + prov/psm3/psm3/ptl_ips/ips_tid.c \ + prov/psm3/psm3/ptl_ips/ips_tid.h \ + prov/psm3/psm3/ptl_ips/ips_tidcache.c \ + prov/psm3/psm3/ptl_ips/ips_tidcache.h \ + prov/psm3/psm3/ptl_ips/ips_tidflow.c \ + prov/psm3/psm3/ptl_ips/ips_tidflow.h \ + prov/psm3/psm3/ptl_ips/ips_writehdrq.c \ + prov/psm3/psm3/ptl_ips/ips_writehdrq.h \ + prov/psm3/psm3/ptl_ips/ptl.c \ + prov/psm3/psm3/ptl_ips/ptl_fwd.h \ + prov/psm3/psm3/ptl_ips/ptl_ips.h \ + prov/psm3/psm3/ptl_ips/ptl_rcvthread.c +libptl_ips_la_CPPFLAGS = \ + -I$(top_srcdir)/prov/psm3/psm3/ptl_ips/ \ + $(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags) +libptl_ips_la_CFLAGS = \ + $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) +libptl_ips_la_DEPENDENCIES = \ + libopa.la + +libptl_self_la_SOURCES = \ + prov/psm3/psm3/ptl_self/ptl.c \ + prov/psm3/psm3/ptl_self/ptl_fwd.h +libptl_self_la_CPPFLAGS = \ + -I$(top_srcdir)/prov/psm3/psm3/ptl_self/ \ + $(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags) +libptl_self_la_CFLAGS = \ + $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) + +libuuid_la_SOURCES = \ + prov/psm3/psm3/libuuid/pack.c \ + prov/psm3/psm3/libuuid/parse.c \ + prov/psm3/psm3/libuuid/psm_uuid.c \ + prov/psm3/psm3/libuuid/psm_uuid.h \ + prov/psm3/psm3/libuuid/unpack.c \ + prov/psm3/psm3/libuuid/unparse.c +# prov/psm3/psm3/libuuid/compare.c # Omitted as 
it is not needed to build lib +libuuid_la_CPPFLAGS = \ + -I$(top_srcdir)/prov/psm3/psm3/libuuid/ \ + $(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags) +libuuid_la_CFLAGS = \ + $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) + +libopa_la_SOURCES = \ + prov/psm3/psm3/opa/opa_debug.c \ + prov/psm3/psm3/opa/opa_dwordcpy-x86_64.c \ + prov/psm3/psm3/opa/opa_service.c \ + prov/psm3/psm3/opa/opa_sysfs.c \ + prov/psm3/psm3/opa/opa_syslog.c \ + prov/psm3/psm3/opa/opa_time.c \ + prov/psm3/psm3/opa/opa_utils.c \ + prov/psm3/psm3/include/opa_byteorder.h \ + prov/psm3/psm3/include/opa_debug.h \ + prov/psm3/psm3/include/opa_intf.h \ + prov/psm3/psm3/include/opa_queue.h \ + prov/psm3/psm3/include/opa_revision.h \ + prov/psm3/psm3/include/opa_service.h \ + prov/psm3/psm3/include/opa_udebug.h \ + prov/psm3/psm3/include/opa_user.h \ + prov/psm3/psm3/include/psm2_mock_testing.h \ + prov/psm3/psm3/include/rbtree.h \ + prov/psm3/psm3/include/linux-i386/bit_ops.h \ + prov/psm3/psm3/include/linux-i386/sysdep.h \ + prov/psm3/psm3/mpspawn/mpspawn_stats.h +libopa_la_CPPFLAGS = \ + $(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags) +libopa_la_CFLAGS = \ + $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) + +libpsm_hal_gen1_la_SOURCES = \ + prov/psm3/psm3/psm_hal_gen1/hfi1_deprecated_gen1.h \ + prov/psm3/psm3/psm_hal_gen1/opa_common_gen1.h \ + prov/psm3/psm3/psm_hal_gen1/opa_i2cflash_gen1.c \ + prov/psm3/psm3/psm_hal_gen1/opa_proto_gen1.c \ + prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.c \ + prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.h \ + prov/psm3/psm3/psm_hal_gen1/opa_user_gen1.h \ + prov/psm3/psm3/psm_hal_gen1/opa_utils_gen1.c \ + prov/psm3/psm3/psm_hal_gen1/psm_gdrcpy.c \ + prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.c \ + prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.h \ + prov/psm3/psm3/psm_hal_gen1/psm_hal_inline_i.h \ + prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.h +libpsm_hal_gen1_la_CPPFLAGS = \ + -I$(top_srcdir)/prov/psm3/psm3/psm_hal_gen1/ \ + $(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags) +libpsm_hal_gen1_la_CFLAGS = \ + $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) + +libpsm3i_la_SOURCES = \ + prov/psm3/psm3/psm.c \ + prov/psm3/psm3/psm_am.c \ + prov/psm3/psm3/psm_am_internal.h \ + prov/psm3/psm3/psm_config.h \ + prov/psm3/psm3/psm_context.c \ + prov/psm3/psm3/psm_context.h \ + prov/psm3/psm3/psm_diags.c \ + prov/psm3/psm3/psm_ep.c \ + prov/psm3/psm3/psm_ep.h \ + prov/psm3/psm3/psm_ep_connect.c \ + prov/psm3/psm3/psm_error.c \ + prov/psm3/psm3/psm_error.h \ + prov/psm3/psm3/psm_gdrcpy.h \ + prov/psm3/psm3/psm_help.h \ + prov/psm3/psm3/psm_lock.h \ + prov/psm3/psm3/psm_log.h \ + prov/psm3/psm3/psm_memcpy.c \ + prov/psm3/psm3/psm_mock.c \ + prov/psm3/psm3/psm_mpool.c \ + prov/psm3/psm3/psm_mpool.h \ + prov/psm3/psm3/psm_mq.c \ + prov/psm3/psm3/psm_mq_internal.h \ + prov/psm3/psm3/psm_mq_recv.c \ + prov/psm3/psm3/psm_mq_utils.c \ + prov/psm3/psm3/psm_netutils.h \ + prov/psm3/psm3/psm_perf.c \ + prov/psm3/psm3/psm_perf.h \ + prov/psm3/psm3/psm_rndv_mod.c \ + prov/psm3/psm3/psm_rndv_mod.h \ + prov/psm3/psm3/psm_stats.c \ + prov/psm3/psm3/psm_stats.h \ + prov/psm3/psm3/psm_sysbuf.c \ + prov/psm3/psm3/psm_sysbuf.h \ + prov/psm3/psm3/psm_timer.c \ + prov/psm3/psm3/psm_timer.h \ + prov/psm3/psm3/psm_user.h \ + prov/psm3/psm3/psm_utils.c \ + prov/psm3/psm3/psm_utils.h \ + prov/psm3/psm3/psm_verbs_ep.c \ + prov/psm3/psm3/psm_verbs_ep.h \ + prov/psm3/psm3/psm_verbs_mr.c \ + prov/psm3/psm3/psm_verbs_mr.h \ + prov/psm3/psm3/psm_udp_ep.c \ + prov/psm3/psm3/psm_udp_ep.h \ + prov/psm3/psm3/psmi_wrappers.c \ + 
prov/psm3/psm3/psmi_wrappers.h \ + prov/psm3/psm3/psm2.h \ + prov/psm3/psm3/psm2_am.h \ + prov/psm3/psm3/psm2_hal.c \ + prov/psm3/psm3/psm2_hal.h \ + prov/psm3/psm3/psm2_hal_inlines_i.h \ + prov/psm3/psm3/psm2_hal_inlines_d.h \ + prov/psm3/psm3/psm2_hal_inline_t.h \ + prov/psm3/psm3/psm2_mq.h \ + prov/psm3/psm3/ptl.h +libpsm3i_la_CPPFLAGS = \ + -I$(top_srcdir)/prov/psm3/psm3/include/ \ + $(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags) +libpsm3i_la_CFLAGS = \ + $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) + +libpsm3i_la_LIBADD = \ + libopa.la \ + libuuid.la \ + libptl_am.la \ + libptl_ips.la \ + libptl_self.la \ + libpsm_hal_gen1.la + +libpsm3i_la_DEPENDENCIES = \ + libopa.la \ + libuuid.la \ + libptl_am.la \ + libptl_ips.la \ + libptl_self.la \ + libpsm_hal_gen1.la + +EXTRA_DIST += \ + prov/psm3/psm3/include/rbtree.c \ + prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.c \ + prov/psm3/psm3/opa/opa_dwordcpy-x86_64-fast.S + +chksum_srcs += \ + $(libptl_am_la_SOURCES) $(libptl_ips_la_SOURCES) $(libptl_self_la_SOURCES) \ + $(libuuid_la_SOURCES) $(libopa_la_SOURCES) $(libpsm_hal_gen1_la_SOURCES) \ + $(libpsm3i_la_SOURCES) $(EXTRA_DIST) + +_psm3_LIBS = libpsm3i.la +libpsm3_la_DEPENDENCIES = libpsm3i.la + +all-local: + @echo "Building src checksum..."; \ + chksum=`cat $(chksum_srcs) | sha1sum | cut -d' ' -f 1`; \ + if ! grep -q $$chksum prov/psm3/src/psm3_revision.c 2>/dev/null; then \ + sed -i "/define PSMX3_SRC_CHECKSUM/s/\".*\"/\"$$chksum\"/" prov/psm3/src/psm3_revision.c; \ + echo "SRC checksum updated to $$chksum"; \ + else \ + echo "SRC checksum not changed: $$chksum"; \ + fi; \ + timestamp=`date`; \ + sed -i "/define PSMX3_BUILD_TIMESTAMP/s/\".*\"/\"$$timestamp\"/" prov/psm3/src/psm3_revision.c; \ + echo "Updated build timestamp: $$timestamp" + +endif HAVE_PSM3_SRC + +if HAVE_PSM3_DL +pkglib_LTLIBRARIES += libpsm3-fi.la +libpsm3_fi_la_SOURCES = $(_psm3_files) $(common_srcs) +libpsm3_fi_la_CFLAGS = $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) +libpsm3_fi_la_CPPFLAGS = $(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags) +libpsm3_fi_la_LDFLAGS = \ + -module -avoid-version -shared -export-dynamic \ + -export-symbols-regex ^fi_prov_ini $(psm3_LDFLAGS) +libpsm3_fi_la_LIBADD = $(linkback) $(psm3_LIBS) $(_psm3_LIBS) +libpsm3_fi_la_DEPENDENCIES = $(linkback) +else !HAVE_PSM3_DL +noinst_LTLIBRARIES += libpsm3.la +libpsm3_la_SOURCES = $(_psm3_files) +libpsm3_la_CFLAGS = $(src_libfabric_la_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) +libpsm3_la_CPPFLAGS = $(src_libfabric_la_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags) +libpsm3_la_LDFLAGS = $(psm3_LDFLAGS) +libpsm3_la_LIBADD = $(psm3_LIBS) $(_psm3_LIBS) +src_libfabric_la_LIBADD += libpsm3.la +src_libfabric_la_DEPENDENCIES += libpsm3.la + +.libs/libpsm3_full.lo: $(libpsm3_la_OBJECTS) $(libpsm3_la_DEPENDENCIES) $(EXTRA_libpsm3_la_DEPENDENCIES) + $(AM_V_CCLD)$(libpsm3_la_LINK) -r $(am_libpsm3_la_rpath) $(libpsm3_la_OBJECTS) libpsm3i.la + +.libs/libpsm3_exp.o: .libs/libpsm3_full.lo + @objcopy --keep-global-symbol=fi_psm3_ini .libs/libpsm3_full.o .libs/libpsm3_exp.o + +libpsm3.la: .libs/libpsm3_exp.o + $(AM_V_CCLD)$(libpsm3_la_LINK) $(am_libpsm3_la_rpath) $(libpsm3_la_OBJECTS) $(libpsm3_la_LIBADD) $(LIBS); \ + rm -f .libs/libpsm3.a libpsm3.a; \ + ar cru .libs/libpsm3.a .libs/libpsm3_exp.o + +endif !HAVE_PSM3_DL + +prov_install_man_pages += man/man7/fi_psm3.7 + +endif HAVE_PSM3 + +prov_dist_man_pages += man/man7/fi_psm3.7 + diff --git a/prov/psm3/README b/prov/psm3/README new file mode 100644 index 00000000000..bd185a2237d --- /dev/null +++ b/prov/psm3/README @@ -0,0 
+1,11 @@ +This is derived from libfabric source. See libfabric for README, AUTHORS, COPYING +and other notices. + + +To Build PSM3 OFI Provider: +1. ./configure +2. make -j + +To Build PSM3 OFI Provider RPM: +1. ./configure +2. make rpm diff --git a/prov/psm3/autogen.sh b/prov/psm3/autogen.sh new file mode 100755 index 00000000000..e4a8eed41f9 --- /dev/null +++ b/prov/psm3/autogen.sh @@ -0,0 +1,10 @@ +#! /bin/sh + +if test ! -f src/psmx3.h; then + echo You really need to run this script in the prov psm3 directory in git + exit 1 +fi + +set -x +autoreconf -ivf + diff --git a/prov/psm3/config b/prov/psm3/config new file mode 120000 index 00000000000..899f6989820 --- /dev/null +++ b/prov/psm3/config @@ -0,0 +1 @@ +../../config \ No newline at end of file diff --git a/prov/psm3/configure.ac b/prov/psm3/configure.ac new file mode 100644 index 00000000000..d5c6a14a652 --- /dev/null +++ b/prov/psm3/configure.ac @@ -0,0 +1,663 @@ +dnl +dnl Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. +dnl Copyright (c) 2019 Intel, Inc. All rights reserved. +dnl Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved. +dnl +dnl Process this file with autoconf to produce a configure script. + +AC_PREREQ([2.60]) +AC_INIT([libpsm3-fi], [1.11.2], [ofiwg@lists.openfabrics.org]) +AC_CONFIG_SRCDIR([src/psmx3.h]) +AC_CONFIG_AUX_DIR(config) +AC_CONFIG_MACRO_DIR(config) +AC_CONFIG_HEADERS(config.h) +AM_INIT_AUTOMAKE([1.11 dist-bzip2 foreign -Wall -Werror subdir-objects parallel-tests tar-pax]) +m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([no])]) +dnl --- m4_include(config/fi_check_package.m4) +AC_DEFINE([HAVE_PSM3], [1], [Build libfabric PSM3 provider]) +AC_DEFINE([HAVE_PSM3_DL], [1], [Build libfabric PSM3 provider]) + +dnl Override Default flags +CPPFLAGS="-D_DEFAULT_SOURCE -D_SVID_SOURCE -D_BSD_SOURCE" +AS_IF([test ! 
-z "$CC" && test "x$CC" == "xicc"], + [ dnl ICC + CFLAGS="-Werror -xATOM_SSE4.2 -DPSM_AVX512 -fpic -fPIC -D_GNU_SOURCE -DPACK_STRUCT_STL=packed," + LDFLAGS="-Wc,-static-intel" + ], [ dnl GCC/other + CFLAGS="-Werror -mavx2 -fpic -fPIC -funwind-tables -Wformat -Wformat-security" + ]) + +AC_ARG_ENABLE([psm-src], + [AS_HELP_STRING([--enable-psm-src], + [Enable Monolithic provider @<:@default=yes@:>@])], + [], + [enable_psm_src=yes]) +AS_IF([test "x$enable_psm_src" != "xno"], [psm_src=1], [psm_src=0]) +AM_CONDITIONAL([HAVE_PSM3_SRC], [test "x$enable_psm_src" != "xno"], [build PSM3 src into provider]) +AC_DEFINE_UNQUOTED([HAVE_PSM3_SRC], $psm_src, [PSM3 source is built-in]) + +PSM_HAL_CNT=1 +PSM_HAL_INST=gen1 + +AC_ARG_ENABLE([psm-ud], + [AS_HELP_STRING([--enable-psm-ud], + [Enable Verbs UD support @<:@default=yes@:>@])], + [], + [enable_psm_ud=yes]) +AC_ARG_ENABLE([psm-rc], + [AS_HELP_STRING([--enable-psm-rc], + [Enable Verbs RC support (requires UD support) @<:@default=yes@:>@])], + [], + [enable_psm_rc=yes]) +AC_ARG_WITH([psm3-rv], + [AS_HELP_STRING([--with-psm3-rv], + [Enable RV module use @<:@default=check@:>@])]) +AS_IF([test x$with_psm3_rv = xno], + [ + CPPFLAGS="$CPPFLAGS -URNDV_MOD_MR" + ],[ + AS_IF([test "x$with_psm3_rv" = "x"], + [ + psm3_rv_check=1 + with_psm3_rv=/usr/include/uapi + ]) + _FI_CHECK_PACKAGE_HEADER([psm3_rv], + [$with_psm3_rv/rv/rv_user_ioctls.h], + [], + [psm3_rv_happy=1], + [psm3_rv_happy=0]) + AS_IF([test "$psm3_rv_happy" -eq 0 && test "$psm3_rv_check" -eq 0], + [ + AC_MSG_ERROR([RV Module headers requested but not found.]) + ], + [ + AS_IF([test "$psm3_rv_happy" -eq 1], + [ + CPPFLAGS="$CPPFLAGS -DRNDV_MOD_MR -I$with_psm3_rv" + ], [ + CPPFLAGS="$CPPFLAGS -URNDV_MOD_MR" + ]) + ]) + ]) +AC_ARG_WITH([psm-headers], + [AC_HELP_STRING([--with-psm-headers=DIR], + [Provide path to where the psm headers are installed for split build. @<:@default=no@:>@])], + [], [with_psm_headers="no"]) +if test "$with_psm_headers" != "" && test "$with_psm_headers" != "no"; then + CPPFLAGS="$CPPFLAGS -I$with_psm_headers" + AC_CHECK_HEADER(psm2.h, [], + AC_MSG_ERROR([PSM Headers requested but not found.])) +fi +AC_ARG_ENABLE([psm-rdma-read], + [AS_HELP_STRING([--enable-psm-rdma-read], + [Enable RDMA READ (requires UD and RC support) @<:@default=no@:>@])], + [], + [enable_psm_rdma_read=no]) + +AS_IF([test "x$enable_psm_src" == "xyes" && test "x$enable_psm_ud" == "xyes"], + [ + CPPFLAGS="$CPPFLAGS -DPSM_UD" + AS_IF([test "x$enable_psm_rc" == "xyes"], + [ + CPPFLAGS="$CPPFLAGS -DUSE_RC" + AS_IF([test "x$enable_psm_rdma_read" == "yes"],[CPPFLAGS="$CPPFLAGS -DUSE_RDMA_READ"]) + ], + [ + CPPFLAGS="$CPPFLAGS -UUSE_RC" + ]) + AS_IF([test "x$enable_psm_rndv_mod" == "xyes"], + [CPPFLAGS="$CPPFLAGS -DRNDV_MOD_MR"], + [CPPFLAGS="$CPPFLAGS -URNDV_MOD_MR"]) + ]) +AS_IF([test "x$enable_psm_src" == "xyes"], + [ + AC_SEARCH_LIBS([shm_open], [rt], [], [AC_MSG_ERROR([unable to find shm_open() in librt])]) + AC_SEARCH_LIBS([dlopen], [dl], [], [AC_MSG_ERROR([unable to find dlopen() in libdl])]) + AC_SEARCH_LIBS([numa_node_of_cpu], [numa], [], [AC_MSG_ERROR([unable to find numa_node_of_cpu() in libnuma])]) + AS_IF([test "x$enable_psm_ud" == "xyes"], + [AC_SEARCH_LIBS([ibv_get_device_list], [ibverbs], [], + [AC_MSG_ERROR([unable to find ibv_get_device_list() in libibverbs])]) + ], []) + + AS_IF([test ! -z "$PSM2_MOCK_TESTING"], [CPPFLAGS="$CPPFLAGS -DPSM2_MOCK_TESTING=1"], []) + AS_IF([test ! -z "$PSM_FI"], [CPPFLAGS="$CPPFLAGS -DPSM_FI"], []) + AS_IF([test ! 
-z "$PSM_DEBUG"], + [ + CFLAGS="-O0 -g3 $CFLAGS" + CPPFLAGS="$CPPFLAGS -DPSM_DEBUG -D_HFI_DEBUGGING -funit-at-a-time -Wp,-D_FORTIFY_SOURCE=2" + ], + [CFLAGS="-O3 -g3 $CFLAGS"]) + + AS_IF([test ! -z "$PSM_COVERAGE"], + [ + CFLAGS="$CFLAGS -O -fprofile-arcs -ftest-coverage" + LDFLAGS="$LDFLAGS -fprofile-arcs" + ], []) + + AS_IF([test ! -z "$PSM_LOG"], + [ + CPPFLAGS="$CPPFLAGS -DPSM_LOG" + AS_IF([test ! -z "$PSM_LOG_FAST_IO"], + [CPPFLAGS="$CPPFLAGS -DPSM_LOG"], []) + ], []) + AS_IF([test ! -z "$PSM_PERF"], [CPPFLAGS="$CPPFLAGS -DRDPMC_PERF_FRAMEWORK"], []) + AS_IF([test ! -z "$PSM_HEAP_DEBUG"], [CPPFLAGS="$CPPFLAGS -DPSM_HEAP_DEBUG"], []) + AS_IF([test ! -z "$PSM_PROFILE"], [CPPFLAGS="$CPPFLAGS -DPSM_PROFILE"], []) + AS_IF([test ! -z "$PSM_CPPFLAGS"], [CPPFLAGS="$CPPFLAGS $PSM_CPPFLAGS"], []) + AS_IF([test ! -z "$PSM_CFLAGS"], [CFLAGS="$CFLAGS $PSM_CFLAGS"], []) + ]) + +AM_CONDITIONAL([HAVE_PSM3_ADDITIONAL_GLOBALS], [test ! -z "$PSM2_ADDITIONAL_GLOBALS"], []) +AM_COND_IF([HAVE_PSM3_ADDITIONAL_GLOBALS], [PSM3_ADDITIONAL_GLOBALS="$PSM2_ADDITIONAL_GLOBALS"],[]) + + +psm3_happy=1 + +AC_CANONICAL_HOST + +macos=0 +linux=0 +freebsd=0 + +case $host_os in +*darwin*) + macos=1 + ;; +*linux*) + linux=1 + ;; +*freebsd*) + freebsd=1 + ;; +*) + AC_MSG_ERROR([libfabric only builds on Linux, OS X, and FreeBSD]) + ;; +esac + +AM_CONDITIONAL([MACOS], [test "x$macos" = "x1"]) +AM_CONDITIONAL([LINUX], [test "x$linux" = "x1"]) +AM_CONDITIONAL([FREEBSD], [test "x$freebsd" = "x1"]) + +base_c_warn_flags="-Wall -Wundef -Wpointer-arith" +debug_c_warn_flags="-Wextra -Wno-unused-parameter -Wno-sign-compare -Wno-missing-field-initializers" +debug_c_other_flags="-fstack-protector-strong" +picky_c_warn_flags="-Wno-long-long -Wmissing-prototypes -Wstrict-prototypes -Wcomment -pedantic" + +AC_ARG_WITH([build_id], + [AC_HELP_STRING([--with-build_id], + [Enable build_id annotation @<:@default=no@:>@])], + [], [with_build_id=no]) +AS_IF([test x"$with_build_id" = x"no"], [with_build_id=""]) +AC_DEFINE_UNQUOTED([BUILD_ID],["$with_build_id"], + [adds build_id to version if it was defined]) + +# Override autoconf default CFLAG settings (e.g. "-g -O2") while still +# allowing the user to explicitly set CFLAGS="" +: ${CFLAGS="-fvisibility=hidden ${base_c_warn_flags} $CFLAGS"} +: ${CPPFLAGS="$CPPFLAGS"} + +# AM_PROG_AS would set CFLAGS="-g -O2" by default if not set already so it +# should not be called earlier +AM_PROG_AS() + +# AM PROG_AR did not exist pre AM 1.11.x (where x is somewhere >0 and +# <3), but it is necessary in AM 1.12.x. 
+m4_ifdef([AM_PROG_AR], [AM_PROG_AR]) + +AC_ARG_WITH([valgrind], + AC_HELP_STRING([--with-valgrind], + [Enable valgrind annotations @<:@default=no@:>@])) + +if test "$with_valgrind" != "" && test "$with_valgrind" != "no"; then + AC_DEFINE([INCLUDE_VALGRIND], 1, + [Define to 1 to enable valgrind annotations]) + if test -d $with_valgrind; then + CPPFLAGS="$CPPFLAGS -I$with_valgrind/include" + fi +fi + +AC_ARG_ENABLE([atomics], + [AS_HELP_STRING([--enable-atomics], + [Enable atomics support @<:@default=yes@:>@]) + ], + [], + [enable_atomics=yes]) + +dnl Checks for programs +AC_PROG_CC_C99 +AS_IF([test "$ac_cv_prog_cc_c99" = "no"], + [AC_MSG_WARN([Libfabric requires a C99-compliant compiler]) + AC_MSG_ERROR([Cannot continue])]) +AM_PROG_CC_C_O +AC_PROG_CPP + +AC_ARG_ENABLE([debug], + [AS_HELP_STRING([--enable-debug], + [Enable debugging @<:@default=no@:>@]) + ], + [], + [enable_debug=no]) + +AS_IF([test x"$enable_debug" != x"no"], + [dbg=1 + # See if all the flags in $debug_c_other_flags work + good_flags= + CFLAGS_save="$CFLAGS" + for flag in $debug_c_other_flags; do + AC_MSG_CHECKING([to see if compiler supports $flag]) + CFLAGS="$flag $CFLAGS_save" + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[]], [[int i = 3;]])], + [AC_MSG_RESULT([yes]) + good_flags="$flag $good_flags"], + [AC_MSG_RESULT([no])]) + done + debug_c_other_flags=$good_flags + unset good_flags + + CFLAGS="-g -O0 ${base_c_warn_flags} ${debug_c_warn_flags} ${debug_c_other_flags} ${CFLAGS_save}" + unset CFLAGS_save], + [dbg=0 + CFLAGS="-DNDEBUG $CFLAGS"]) + +AC_DEFINE_UNQUOTED([ENABLE_DEBUG],[$dbg], + [defined to 1 if libfabric was configured with --enable-debug, 0 otherwise]) + +dnl Checks for header files. +AC_HEADER_STDC + +dnl Check for compiler features +AC_C_TYPEOF + +LT_INIT +LT_OUTPUT + +dnl dlopen support is optional +AC_ARG_WITH([dlopen], + AC_HELP_STRING([--with-dlopen], + [dl-loadable provider support @<:@default=yes@:>@]), + ) + +if test "$freebsd" == "0"; then +AS_IF([test x"$with_dlopen" != x"no"], [ +AC_CHECK_LIB(dl, dlopen, [], + AC_MSG_ERROR([dlopen not found. libfabric requires libdl.])) +]) +fi + +dnl handle picky option +AC_ARG_ENABLE([picky], + [AC_HELP_STRING([--enable-picky], + [Enable developer-level compiler pickyness when building @<:@default=no@:>@])]) +AS_IF([test x"$enable_picky" = x"yes" && test x"$GCC" = x"yes"], + [AS_IF([test x"$enable_debug" = x"no"], + [CFLAGS="${base_c_warn_flags} ${debug_c_warn_flags} ${debug_c_other_flags} ${picky_c_warn_flags} $CFLAGS"], + [CFLAGS="${picky_c_warn_flags} $CFLAGS"]) + ]) + +dnl Checks for libraries +AC_CHECK_LIB(pthread, pthread_mutex_init, [], + AC_MSG_ERROR([pthread_mutex_init() not found. 
libfabric requires libpthread.])) + +AC_CHECK_FUNC([pthread_spin_init], + [have_spinlock=1], + [have_spinlock=0]) + +dnl shm_open not used in the common code on os-x + +AC_DEFINE_UNQUOTED([PT_LOCK_SPIN], [$have_spinlock], + [Define to 1 if pthread_spin_init is available.]) + +AC_ARG_ENABLE([epoll], + [AS_HELP_STRING([--disable-epoll], + [Disable epoll if available@<:@default=no@:>@])], + [], + [enable_epoll=auto] +) + +AS_IF([test x"$enable_epoll" != x"no"], + [AC_CHECK_FUNCS([epoll_create]) + if test "$ac_cv_func_epoll_create" = yes; then + AC_DEFINE([HAVE_EPOLL], [1], [Define if you have epoll support.]) + fi] +) + +AC_CHECK_HEADER([linux/perf_event.h], + [AC_CHECK_DECL([__builtin_ia32_rdpmc], + [ + AC_TRY_LINK([#include ], + [__builtin_ia32_rdpmc(0);], + [linux_perf_rdpmc=1], + [linux_perf_rdpmc=0]) + ], + [linux_perf_rdpmc=0], + [#include ])], + [linux_perf_rdpmc=0]) +AC_DEFINE_UNQUOTED(HAVE_LINUX_PERF_RDPMC, [$linux_perf_rdpmc], + [Whether we have __builtin_ia32_rdpmc() and linux/perf_event.h file or not]) +AM_CONDITIONAL([HAVE_LINUX_PERF_RDPMC], [test "x$linux_perf_rdpmc" = "x1"]) + +dnl Check for gcc atomic intrinsics +AS_IF([test x"$enable_atomics" != x"no"], + AC_MSG_CHECKING(compiler support for c11 atomics) + AC_TRY_LINK([#include ], + [atomic_int a; + atomic_init(&a, 0); + #ifdef __STDC_NO_ATOMICS__ + #error c11 atomics are not supported + #else + return 0; + #endif + ], + [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_ATOMICS, 1, [Set to 1 to use c11 atomic functions]) + ], + [AC_MSG_RESULT(no)]) + + + AC_MSG_CHECKING(compiler support for c11 atomic `least` types) + AC_TRY_LINK([#include ], + [atomic_int_least32_t a; + atomic_int_least64_t b; + ], + [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_ATOMICS_LEAST_TYPES, 1, + [Set to 1 to use c11 atomic `least` types]) + ], + [ + AC_MSG_RESULT(no) + ]), +[ + AC_MSG_RESULT(configure: atomics support for c11 is disabled) +]) + +dnl Check for gcc built-in atomics +AS_IF([test x"$enable_atomics" != x"no"], + AC_MSG_CHECKING(compiler support for built-in atomics) + AC_TRY_LINK([#include ], + [int32_t a; + __sync_add_and_fetch(&a, 0); + __sync_sub_and_fetch(&a, 0); + #if defined(__PPC__) && !defined(__PPC64__) + #error compiler built-in atomics are not supported on PowerPC 32-bit + #else + return 0; + #endif + ], + [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BUILTIN_ATOMICS, 1, [Set to 1 to use built-in intrincics atomics]) + ], + [AC_MSG_RESULT(no)]), +[ + AC_MSG_RESULT(configure: atomics support built-in is disabled) +]) + +dnl Check for gcc memory model aware built-in atomics +dnl If supported check to see if not internal to compiler +LIBS_save=$LIBS +AC_SEARCH_LIBS([__atomic_load_8], [atomic]) +AS_IF([test x"$enable_atomics" != x"no"], + AC_MSG_CHECKING(compiler support for built-in memory model aware atomics) + AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include ]], + [[uint64_t d; + uint64_t s; + uint64_t c; + uint64_t r; + r = __atomic_fetch_add(&d, s, __ATOMIC_SEQ_CST); + r = __atomic_load_8(&d, __ATOMIC_SEQ_CST); + __atomic_exchange(&d, &s, &r, __ATOMIC_SEQ_CST); + __atomic_compare_exchange(&d,&c,&s,0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); + #if defined(__PPC__) && !defined(__PPC64__) + #error compiler built-in memory model aware atomics are not supported on PowerPC 32-bit + #else + return 0; + #endif + ]])], + [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BUILTIN_MM_ATOMICS, 1, [Set to 1 to use built-in intrinsics memory model aware atomics]) + ], + [ + AC_MSG_RESULT(no) + LIBS=$LIBS_save + ]), +[ + AC_MSG_RESULT(configure: -latomic key is disabled) + 
LIBS=$LIBS_save +]) +unset LIBS_save + +dnl Check for gcc cpuid intrinsics +AC_MSG_CHECKING(compiler support for cpuid) +AC_TRY_LINK([ + #include + #include ], + [ + int a, b, c, d; + __cpuid_count(0, 0, a, b, c, d); + ], + [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_CPUID, 1, [Set to 1 to use cpuid]) + ], + [AC_MSG_RESULT(no)]) + +if test "$with_valgrind" != "" && test "$with_valgrind" != "no"; then +AC_CHECK_HEADER(valgrind/memcheck.h, [], + AC_MSG_ERROR([valgrind requested but not found.])) +fi + +AC_CACHE_CHECK(whether ld accepts --version-script, ac_cv_version_script, + [if test -n "`$LD --help < /dev/null 2>/dev/null | grep version-script`"; then + ac_cv_version_script=yes + else + ac_cv_version_script=no + fi]) + +AC_ARG_ENABLE([embedded], + [AS_HELP_STRING([--enable-embedded], + [Enable embedded support (turns off symbol versioning) @<:@default=no@:>@]) + ], + [ac_asm_symver_support=0 + icc_symver_hack=1], + [enable_embedded=no]) +AM_CONDITIONAL([EMBEDDED], [test x"$enable_embedded" = x"yes"]) + +AM_CONDITIONAL(HAVE_LD_VERSION_SCRIPT, test "$ac_cv_version_script" = "yes") + +dnl Disable symbol versioning when -ipo is in CFLAGS or ipo is disabled by icc. +dnl The gcc equivalent ipo (-fwhole-program) seems to work fine. +AS_CASE([$CFLAGS], + [*-ipo*],[ + AC_MSG_NOTICE([disabling symbol versioning support with -ipo CFLAG]) + icc_symver_hack=1 + ac_asm_symver_support=0 + ], + [] +) + +dnl Check for symbol versioning compiler + linker support. +dnl If icc + ipo, then print disabled and skip check +AC_MSG_CHECKING(for .symver assembler support) +AS_IF([test "$icc_symver_hack"], + [AC_MSG_RESULT(disabled)], +[ + +AC_TRY_LINK([], + [__asm__(".symver main_, main@ABIVER_1.0");], + [ + AC_MSG_RESULT(yes) + ac_asm_symver_support=1 + ], + [ + AC_MSG_RESULT(no) + ac_asm_symver_support=0 + ]) + +]) dnl AS_IF icc_symver_hack + +AC_DEFINE_UNQUOTED([HAVE_SYMVER_SUPPORT], [$ac_asm_symver_support], + [Define to 1 if compiler/linker support symbol versioning.]) + +AC_MSG_CHECKING(for __alias__ attribute support) +AC_TRY_LINK( + [ + int foo(int arg); + int foo(int arg) { return arg + 3; }; + int foo2(int arg) __attribute__ (( __alias__("foo"))); + ], + [ /* empty main */ ], + [ + AC_MSG_RESULT(yes) + ac_prog_cc_alias_symbols=1 + ], + [ + AC_MSG_RESULT(no) + ac_prog_cc_alias_symbols=0 + ]) + +AC_DEFINE_UNQUOTED([HAVE_ALIAS_ATTRIBUTE], [$ac_prog_cc_alias_symbols], + [Define to 1 if the linker supports alias attribute.]) +AC_CHECK_FUNCS([getifaddrs]) + +dnl Check for ethtool support +AC_MSG_CHECKING(ethtool support) +AC_TRY_LINK([ + #include + #include + #include + #include + #include ], + [ + unsigned long ioctl_req = SIOCETHTOOL; + struct ethtool_cmd cmd = { + .cmd = ETHTOOL_GSET, + }; + long speed = cmd.speed; + ], + [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_ETHTOOL, 1, [Set to 1 to use ethtool]) + + ], + [AC_MSG_RESULT(no)]) + +dnl Check for ethtool SPEED_UNKNOWN macro (suppoirted in the linux +dnl kernel >= 3.2) and ethtool_cmd_speed function declarations +dnl supported in the linux kernel >= 2.6.26 +AC_CHECK_DECLS([ethtool_cmd_speed, SPEED_UNKNOWN], [], [], + [#include ]) + +dnl Check for userfault fd support +have_uffd=0 +AC_CHECK_HEADERS([linux/userfaultfd.h], + [AC_CHECK_DECL([__NR_userfaultfd], + [have_uffd=1], + [], + [[#include ]])], + [], []) + +AS_IF([test $have_uffd -eq 1], + [AC_MSG_CHECKING([for userfaultfd unmap support]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ + #include + #include + #include + #include + #include + #include + ]], + [[ + int fd; + struct uffdio_api api_obj; + api_obj.api = 
UFFD_API; + api_obj.features = UFFD_FEATURE_EVENT_UNMAP | + UFFD_FEATURE_EVENT_REMOVE | + UFFD_FEATURE_EVENT_REMAP; + fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + return ioctl(fd, UFFDIO_API, &api_obj); + ]]) + ], + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no]) + have_uffd=0])]) + +AC_DEFINE_UNQUOTED([HAVE_UFFD_UNMAP], [$have_uffd], + [Define to 1 if platform supports userfault fd unmap]) + +dnl Check support to intercept syscalls +AC_CHECK_HEADERS_ONCE(elf.h sys/auxv.h) + +dnl Check support to clock_gettime +have_clock_gettime=0 + +AC_SEARCH_LIBS([clock_gettime],[rt], + [have_clock_gettime=1], + []) + +AC_DEFINE_UNQUOTED(HAVE_CLOCK_GETTIME, [$have_clock_gettime], + [Define to 1 if clock_gettime is available.]) +AM_CONDITIONAL(HAVE_CLOCK_GETTIME, [test $have_clock_gettime -eq 1]) + +dnl Check for CUDA runtime libraries. +AC_ARG_WITH([cuda], + [AC_HELP_STRING([--with-cuda=DIR], + [Provide path to where the CUDA development + and runtime libraries are installed.])], + [], []) +AS_IF([test ! -z "$PSM_CUDA"], [with_cuda=/usr/local/cuda]) +have_libcuda=0 +AS_IF([test x"$with_cuda" != x"no"], + [FI_CHECK_PACKAGE([cuda], + [cuda_runtime.h], + [cudart], + [cudaMemcpy], + [-lcuda], + [$with_cuda], + [], + [have_libcuda=1], + [], + [])], + []) +AS_IF([test "$with_cuda" = "yes" && test "$have_libcuda" = "0" ], + [AC_MSG_ERROR([CUDA support requested but CUDA runtime not available.])], + []) +AC_DEFINE_UNQUOTED([HAVE_LIBCUDA], [$have_libcuda], [Whether we have CUDA runtime or not]) +if test $have_libcuda -eq 1; then + cuda_CPPFLAGS="$cuda_CPPFLAGS -DPSM_CUDA -DNVIDIA_GPU_DIRECT" +fi + +CPPFLAGS="$CPPFLAGS $cuda_CPPFLAGS" +LDFLAGS="$LDFLAGS $cuda_LDFLAGS" +LIBS="$LIBS $cuda_LIBS" + +dnl Provider-specific checks +dnl FI_PROVIDER_INIT +dnl FI_PROVIDER_SETUP([psm3]) +dnl FI_PROVIDER_FINI +dnl Configure the .pc file +#FI_PROVIDER_SETUP_PC + +AC_SUBST(PSM_HAL_CNT) +AC_SUBST(PSM_HAL_INST) + +AM_COND_IF([HAVE_PSM3_SRC], + [ + IFS_VERSION="${RELEASE_TAG:-$(git describe --dirty --always --abbrev=8 --broken --tags 2>/dev/null \ + || git describe --dirty --always --abbrev=8 --broken 2>/dev/null || echo 'unknown commit')}" + GIT_HASH="$(git log --oneline --format='%H' -1)" + ]) +AC_SUBST(IFS_VERSION) +AC_SUBST(GIT_HASH) +dnl Set during Make. 
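The CUDA probe above only verifies that cuda_runtime.h and cudaMemcpy() from libcudart are usable, then turns on the PSM_CUDA / NVIDIA_GPU_DIRECT defines. As a minimal sketch of the kind of call site those defines enable (not taken from the provider source; the helper name and the src_is_gpu flag are made up for illustration):

/* Illustrative only: a copy helper that falls back to plain memcpy() when
 * the build was configured without CUDA support.  PSM_CUDA comes from the
 * configure logic above; everything else here is hypothetical. */
#include <string.h>
#ifdef PSM_CUDA
#include <cuda_runtime.h>
#endif

static int example_copy_to_host(void *dst, const void *src, size_t len,
                                int src_is_gpu)
{
#ifdef PSM_CUDA
        if (src_is_gpu)
                return cudaMemcpy(dst, src, len,
                                  cudaMemcpyDeviceToHost) == cudaSuccess ? 0 : -1;
#endif
        (void)src_is_gpu;       /* unused in a non-CUDA build */
        memcpy(dst, src, len);
        return 0;
}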
+dnl AC_SUBST(BUILD_TIMESTAMP) +dnl AC_SUBST(SRC_CHECKSUM) + +AC_SUBST(PSM3_ADDITIONAL_GLOBALS) + +AC_CONFIG_FILES([Makefile libpsm3-fi.spec libpsm3-fi.map libpsm3-fi.pc]) +AM_COND_IF([HAVE_PSM3_SRC], + [AC_CONFIG_FILES([psm3/psm2_hal_inlines_i.h psm3/psm2_hal_inlines_d.h src/psm3_revision.c])]) +AC_OUTPUT diff --git a/prov/psm3/configure.m4 b/prov/psm3/configure.m4 new file mode 100644 index 00000000000..47d206cbff0 --- /dev/null +++ b/prov/psm3/configure.m4 @@ -0,0 +1,158 @@ +dnl Configury specific to the libfabric PSM3 provider + +dnl Called to configure this provider +dnl +dnl Arguments: +dnl +dnl $1: action if configured successfully +dnl $2: action if not configured successfully +dnl +AC_DEFUN([FI_PSM3_CONFIGURE],[ + # Determine if we can support the psm3 provider + psm3_ARCH=$host_cpu + AM_CONDITIONAL([HAVE_PSM3_X86_64], [test x$psm3_ARCH = xx86_64]) + AC_SUBST([HAVE_PSM3_X86_64]) + AC_SUBST([psm3_ARCH]) + + enable_psm3_src=yes + AM_CONDITIONAL([HAVE_PSM3_SRC], [test "x$enable_psm3_src" != "xno"], [build PSM3 src into provider]) + AC_DEFINE([HAVE_PSM3_SRC], [1], [PSM3 source is built-in]) + + PSM_HAL_CNT=1 + PSM_HAL_INST=gen1 + + psm3_happy=1 + AS_IF([test x"$enable_psm3" != x"no"], + [ + FI_CHECK_PACKAGE([psm3_rt], + [sys/mman.h], + [rt], + [shm_open], + [], + [$psm3_PREFIX], + [$psm3_LIBDIR], + [], + [psm3_happy=0]) +ifelse(' + FI_CHECK_PACKAGE([psm3_dl], + [dlfcn.h], + [dl], + [dlopen], + [], + [$psm3_PREFIX], + [$psm3_LIBDIR], + [psm3_dl_happy=1], + [psm3_happy=0]) +')dnl + FI_CHECK_PACKAGE([psm3_numa], + [numa.h], + [numa], + [numa_node_of_cpu], + [], + [$psm3_PREFIX], + [$psm3_LIBDIR], + [psm3_numa_happy=1], + [psm3_happy=0]) + + FI_CHECK_PACKAGE([psm3_ibv], + [infiniband/verbs.h], + [ibverbs], + [ibv_get_device_list], + [], + [$psm3_PREFIX], + [$psm3_LIBDIR], + [psm3_ibv_happy=1], + [psm3_happy=0]) + + AC_MSG_CHECKING([for -msse4.2 support]) + save_CFLAGS=$CFLAGS + CFLAGS="$CFLAGS -msse4.2" + AC_LINK_IFELSE( + [AC_LANG_PROGRAM( + [#include ], + [unsigned int crc = 0; + crc = _mm_crc32_u32(crc, 0); + return crc == 0;]) + ],[ + AC_MSG_RESULT([yes]) + psm3_crc_happy=1 + ARCH_CFLAGS="-msse4.2" + ],[ + psm3_happy=0 + AC_MSG_RESULT([no]) + AC_MSG_NOTICE([psm3 requires minimum of avx instruction set to build]) + ]) + CFLAGS=$save_CFLAGS + + AC_MSG_CHECKING([for -mavx support]) + save_CFLAGS=$CFLAGS + CFLAGS="$CFLAGS -mavx" + AC_LINK_IFELSE( + [AC_LANG_PROGRAM( + [#include ], + [unsigned long long *vec_a = {1,2,3,4}; + __m256i *sp = (__m256i *)vec_a; + __m256i vec = _mm256_load_si256(sp); + return 0;]) + ],[ + AC_MSG_RESULT([yes]) + psm3_256_happy=1 + ARCH_CFLAGS="-mavx" + ],[ + psm3_happy=0 + AC_MSG_RESULT([no]) + AC_MSG_NOTICE([psm3 requires minimum of avx instruction set to build]) + ]) + CFLAGS=$save_CFLAGS + + AS_IF([test x$with_psm3_rv = xno], + [psm3_CPPFLAGS="$psm3_CPPFLAGS -URNDV_MOD_MR"], + [ + AS_IF([test "x$with_psm3_rv" = "x"], + [ + psm3_rv_check=1 + with_psm3_rv=/usr/include/uapi + ]) + save_CPPFLAGS=$CPPFLAGS + CPPFLAGS="$CPPFLAGS -I$with_psm3_rv" + _FI_CHECK_PACKAGE_HEADER([psm3_rv], + [rv/rv_user_ioctls.h], + [], + [psm3_rv_happy=1], + [psm3_rv_happy=0]) + CPPFLAGS=$save_CPPFLAGS + AS_IF([test "$psm3_rv_happy" -eq 0 && test "$psm3_rv_check" -eq 0], + [ + psm3_happy=0 + AC_MSG_ERROR([RV Module headers requested but not found.]) + ],[ + AS_IF([test "$psm3_rv_happy" -eq 1], + [psm3_CPPFLAGS="$psm3_CPPFLAGS -DRNDV_MOD_MR -I$with_psm3_rv"], + [psm3_CPPFLAGS="$psm3_CPPFLAGS -URNDV_MOD_MR"]) + ]) + ]) + AS_IF([test $psm3_happy -eq 1], [ + 
AC_CONFIG_FILES([prov/psm3/psm3/psm2_hal_inlines_i.h \ + prov/psm3/psm3/psm2_hal_inlines_d.h \ + prov/psm3/src/psm3_revision.c]) + ]) + ],[psm3_happy=0]) + + AS_IF([test $psm3_happy -eq 1], [$1], [$2]) + + psm3_CFLAGS="$ARCH_CFLAGS" + psm3_CPPFLAGS="$psm3_CPPFLAGS $psm3_rt_CPPFLAGS $psm3_dl_CPPFLAGS $psm3_numa_CPPFLAGS $psm3_ibv_CPPFLAGS" + psm3_LDFLAGS="$psm3_LDFLAGS $psm3_rt_LDFLAGS $psm3_dl_LDFLAGS $psm3_numa_LDFLAGS $psm3_ibv_LDFLAGS" + psm3_LIBS="$psm3_LIBS $psm3_rt_LIBS $psm3_dl_LIBS $psm3_numa_LIBS $psm3_ibv_LIBS" + AC_SUBST(psm3_CFLAGS) + AC_SUBST(psm3_CPPFLAGS) + AC_SUBST(psm3_LDFLAGS) + AC_SUBST(psm3_LIBS) + AC_SUBST(PSM_HAL_CNT) + AC_SUBST(PSM_HAL_INST) + +]) + +AC_ARG_WITH([psm3-rv], + [AS_HELP_STRING([--with-psm3-rv], + [Enable RV module use @<:@default=check@:>@])]) diff --git a/prov/psm3/inc b/prov/psm3/inc new file mode 120000 index 00000000000..fcffffbed8d --- /dev/null +++ b/prov/psm3/inc @@ -0,0 +1 @@ +../../include \ No newline at end of file diff --git a/prov/psm3/libpsm3-fi.map.in b/prov/psm3/libpsm3-fi.map.in new file mode 100644 index 00000000000..b6672359af9 --- /dev/null +++ b/prov/psm3/libpsm3-fi.map.in @@ -0,0 +1,6 @@ +PSM3_FI_1.0 { + global: + fi_prov_ini; + @PSM3_ADDITIONAL_GLOBALS@ + local: *; +}; diff --git a/prov/psm3/libpsm3-fi.pc.in b/prov/psm3/libpsm3-fi.pc.in new file mode 100644 index 00000000000..c664ed200b8 --- /dev/null +++ b/prov/psm3/libpsm3-fi.pc.in @@ -0,0 +1,14 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: libpsm3-fi +Description: OFI-WG libfabric PSM3 provider +URL: https://github.com/ofiwg/libfabric.git +Version: @VERSION@ +Requires: libfabric +Cflags: -I${includedir} +Libs: -L${libdir} -lpsm3-fi +Libs.private: +Requires.private: diff --git a/prov/psm3/libpsm3-fi.spec.in b/prov/psm3/libpsm3-fi.spec.in new file mode 100644 index 00000000000..aca6ef7ffb4 --- /dev/null +++ b/prov/psm3/libpsm3-fi.spec.in @@ -0,0 +1,56 @@ +%{!?configopts: %global configopts LDFLAGS=-Wl,--build-id} +%{!?provider: %define provider psm3} +%{!?provider_formal: %define provider_formal PSM3} + +Name: lib%{provider}-fi +Version: @VERSION@ +Release: 1%{?dist} +Summary: Dynamic %{provider_formal} provider for Libfabric + +Group: System Environment/Libraries +License: GPLv2 or BSD +Url: http://www.github.com/ofiwg/libfabric +Source: http://www.github.org/ofiwg/%{name}/releases/download/%{provider}-v{%version}/%{name}-%{version}.tar.bz2 +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) +Requires: libfabric +%if 0%{?suse_version} >= 1 +Provides: lib${provider}-fi1 = %{version}-%{release} +%endif + +%description +This RPM provides the %{provider_formal} provider as a "plugin" to an existing +libfabric installation. This plugin will override older %{provider_formal} +provider functionality in the existing libfabric installation. + +%prep +%setup -q -n %{name}-%{version} + +%build +%configure %{configopts} +%make_build + +%install +rm -rf %{buildroot} + +%make_install installdirs +# remove unpackaged files from the buildroot +rm -f %{buildroot}%{_libdir}/*.la + +%clean +rm -rf %{buildroot} + +%post -p /sbin/ldconfig +%postun -p /sbin/ldconfig + +%files +%defattr(-,root,root,-) +%{_libdir}/libfabric/%{name}* +%doc README +%exclude %{_libdir}/libfabric/*.a +%exclude %{_libdir}/libfabric/*.la +%exclude %{_libdir}/pkgconfig +%exclude %{_mandir} + +%changelog +* Wed May 24 2017 Open Fabrics Interfaces Working Group +- First release of specfile for packaging a single dl provider. 
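The map file above deliberately keeps a single global symbol, fi_prov_ini, which is the entry point libfabric's core resolves after dlopen()ing an externally packaged ("dl") provider, and the spec file installs the resulting plugin under %{_libdir}/libfabric/ where the core library looks for loadable providers. As a rough, hedged illustration of that convention only (this is not the PSM3 source; the header path, callbacks, and version numbers below are placeholders taken from the usual libfabric provider API):

/* Sketch of a libfabric dl-provider entry point.  Assumes the installed
 * provider-facing header rdma/providers/fi_prov.h, which supplies
 * struct fi_provider and the FI_EXT_INI macro; all values are placeholders. */
#include <stddef.h>
#include <rdma/fabric.h>
#include <rdma/providers/fi_prov.h>

static void example_cleanup(void)
{
	/* release provider-wide resources when libfabric unloads the plugin */
}

static struct fi_provider example_prov = {
	.name = "psm3",
	.version = FI_VERSION(1, 0),     /* placeholder provider version */
	.fi_version = FI_VERSION(1, 11), /* placeholder libfabric API version */
	.getinfo = NULL,                 /* a real provider wires these to its callbacks */
	.fabric = NULL,
	.cleanup = example_cleanup,
};

/* FI_EXT_INI expands to the externally visible fi_prov_ini() declaration,
 * i.e. the one symbol kept global by libpsm3-fi.map.in above. */
FI_EXT_INI
{
	return &example_prov;
}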
diff --git a/prov/psm3/man b/prov/psm3/man new file mode 120000 index 00000000000..ee201c19319 --- /dev/null +++ b/prov/psm3/man @@ -0,0 +1 @@ +../../man \ No newline at end of file diff --git a/prov/psm3/psm3/.gitignore b/prov/psm3/psm3/.gitignore new file mode 100644 index 00000000000..d12089a2694 --- /dev/null +++ b/prov/psm3/psm3/.gitignore @@ -0,0 +1,2 @@ +psm2_hal_inlines_d.h +psm2_hal_inlines_i.h diff --git a/prov/psm3/psm3/Makefile.include b/prov/psm3/psm3/Makefile.include new file mode 100644 index 00000000000..9dad7d06a0b --- /dev/null +++ b/prov/psm3/psm3/Makefile.include @@ -0,0 +1,220 @@ + +_CPPFLAGS = \ + -I./psm3/ -I./psm3/ptl_ips/ \ + -I./psm3/include -I./psm3/include/linux-i386 \ + -I./psm3/mpspawn -I./psm3/opa \ + -D_GNU_SOURCE=1 \ + $(AM_CPPFLAGS) + +noinst_LTLIBRARIES += libopa.la libuuid.la \ + libptl_am.la libptl_ips.la libptl_self.la \ + libpsm_hal_gen1.la libpsm2.la + +libptl_am_la_SOURCES = \ + psm3/ptl_am/am_config.h \ + psm3/ptl_am/am_cuda_memhandle_cache.c \ + psm3/ptl_am/am_cuda_memhandle_cache.h \ + psm3/ptl_am/am_reqrep.c \ + psm3/ptl_am/am_reqrep_shmem.c \ + psm3/ptl_am/cmarw.h \ + psm3/ptl_am/cmarwu.c \ + psm3/ptl_am/psm_am_internal.h \ + psm3/ptl_am/ptl.c \ + psm3/ptl_am/ptl_fwd.h +libptl_am_la_CPPFLAGS = \ + -I./psm3/ptl_am/ \ + $(_CPPFLAGS) + +libptl_ips_la_SOURCES = \ + psm3/ptl_ips/ips_config.h \ + psm3/ptl_ips/ips_crc32.c \ + psm3/ptl_ips/ips_epstate.c \ + psm3/ptl_ips/ips_epstate.h \ + psm3/ptl_ips/ips_expected_proto.h \ + psm3/ptl_ips/ips_opp_path_rec.c \ + psm3/ptl_ips/ips_path_rec.c \ + psm3/ptl_ips/ips_path_rec.h \ + psm3/ptl_ips/ips_proto.c \ + psm3/ptl_ips/ips_proto.h \ + psm3/ptl_ips/ips_proto_am.c \ + psm3/ptl_ips/ips_proto_am.h \ + psm3/ptl_ips/ips_proto_connect.c \ + psm3/ptl_ips/ips_proto_dump.c \ + psm3/ptl_ips/ips_proto_expected.c \ + psm3/ptl_ips/ips_proto_header.h \ + psm3/ptl_ips/ips_proto_help.h \ + psm3/ptl_ips/ips_proto_internal.h \ + psm3/ptl_ips/ips_proto_mq.c \ + psm3/ptl_ips/ips_proto_params.h \ + psm3/ptl_ips/ips_proto_recv.c \ + psm3/ptl_ips/ips_recvhdrq.c \ + psm3/ptl_ips/ips_recvhdrq.h \ + psm3/ptl_ips/ips_recvq.c \ + psm3/ptl_ips/ips_recvq.h \ + psm3/ptl_ips/ips_scb.c \ + psm3/ptl_ips/ips_scb.h \ + psm3/ptl_ips/ips_stats.h \ + psm3/ptl_ips/ips_subcontext.h \ + psm3/ptl_ips/ips_tid.c \ + psm3/ptl_ips/ips_tid.h \ + psm3/ptl_ips/ips_tidcache.c \ + psm3/ptl_ips/ips_tidcache.h \ + psm3/ptl_ips/ips_tidflow.c \ + psm3/ptl_ips/ips_tidflow.h \ + psm3/ptl_ips/ips_writehdrq.c \ + psm3/ptl_ips/ips_writehdrq.h \ + psm3/ptl_ips/ptl.c \ + psm3/ptl_ips/ptl_fwd.h \ + psm3/ptl_ips/ptl_ips.h \ + psm3/ptl_ips/ptl_rcvthread.c +libptl_ips_la_CPPFLAGS = \ + -I./psm3/ptl_ips/ \ + $(_CPPFLAGS) +libptl_ips_la_DEPENDENCIES = \ + libopa.la + +libptl_self_la_SOURCES = \ + psm3/ptl_self/ptl.c \ + psm3/ptl_self/ptl_fwd.h +libptl_self_la_CPPFLAGS = \ + -I./psm3/ptl_self/ \ + $(_CPPFLAGS) + +libuuid_la_SOURCES = \ + psm3/libuuid/pack.c \ + psm3/libuuid/parse.c \ + psm3/libuuid/psm_uuid.c \ + psm3/libuuid/psm_uuid.h \ + psm3/libuuid/unpack.c \ + psm3/libuuid/unparse.c +# psm3/libuuid/compare.c # Omitted as it is not needed to build lib +libuuid_la_CPPFLAGS = \ + -I./psm3/libuuid/ \ + $(_CPPFLAGS) + +libopa_la_SOURCES = \ + psm3/opa/opa_debug.c \ + psm3/opa/opa_dwordcpy-x86_64.c \ + psm3/opa/opa_service.c \ + psm3/opa/opa_sysfs.c \ + psm3/opa/opa_syslog.c \ + psm3/opa/opa_time.c \ + psm3/opa/opa_utils.c \ + psm3/include/opa_byteorder.h \ + psm3/include/opa_debug.h \ + psm3/include/opa_intf.h \ + psm3/include/opa_queue.h \ + psm3/include/opa_revision.h \ + 
psm3/include/opa_service.h \ + psm3/include/opa_udebug.h \ + psm3/include/opa_user.h \ + psm3/include/psm2_mock_testing.h \ + psm3/include/rbtree.h \ + psm3/include/linux-i386/bit_ops.h \ + psm3/include/linux-i386/sysdep.h \ + psm3/mpspawn/mpspawn_stats.h \ + psm3/opa/opa_dwordcpy-x86_64-fast.S +libopa_la_CPPFLAGS = \ + $(_CPPFLAGS) + +libpsm_hal_gen1_la_SOURCES = \ + psm3/psm_hal_gen1/hfi1_deprecated_gen1.h \ + psm3/psm_hal_gen1/opa_common_gen1.h \ + psm3/psm_hal_gen1/opa_i2cflash_gen1.c \ + psm3/psm_hal_gen1/opa_proto_gen1.c \ + psm3/psm_hal_gen1/opa_service_gen1.c \ + psm3/psm_hal_gen1/opa_service_gen1.h \ + psm3/psm_hal_gen1/opa_user_gen1.h \ + psm3/psm_hal_gen1/opa_utils_gen1.c \ + psm3/psm_hal_gen1/psm_gdrcpy.c \ + psm3/psm_hal_gen1/psm_hal_gen1.c \ + psm3/psm_hal_gen1/psm_hal_gen1.h \ + psm3/psm_hal_gen1/psm_hal_inline_i.h \ + psm3/psm_hal_gen1/psm_hal_gen1_spio.h +libpsm_hal_gen1_la_CPPFLAGS = \ + -I./psm3/psm_hal_gen1/ \ + $(_CPPFLAGS) + +libpsm2_la_SOURCES = \ + psm3/psm.c \ + psm3/psm_am.c \ + psm3/psm_am_internal.h \ + psm3/psm_config.h \ + psm3/psm_context.c \ + psm3/psm_context.h \ + psm3/psm_diags.c \ + psm3/psm_ep.c \ + psm3/psm_ep.h \ + psm3/psm_ep_connect.c \ + psm3/psm_error.c \ + psm3/psm_error.h \ + psm3/psm_gdrcpy.h \ + psm3/psm_help.h \ + psm3/psm_lock.h \ + psm3/psm_log.h \ + psm3/psm_memcpy.c \ + psm3/psm_mock.c \ + psm3/psm_mpool.c \ + psm3/psm_mpool.h \ + psm3/psm_mq.c \ + psm3/psm_mq_internal.h \ + psm3/psm_mq_recv.c \ + psm3/psm_mq_utils.c \ + psm3/psm_netutils.h \ + psm3/psm_perf.c \ + psm3/psm_perf.h \ + psm3/psm_rndv_mod.c \ + psm3/psm_rndv_mod.h \ + psm3/psm_stats.c \ + psm3/psm_stats.h \ + psm3/psm_sysbuf.c \ + psm3/psm_sysbuf.h \ + psm3/psm_timer.c \ + psm3/psm_timer.h \ + psm3/psm_user.h \ + psm3/psm_utils.c \ + psm3/psm_utils.h \ + psm3/psm_verbs_ep.c \ + psm3/psm_verbs_ep.h \ + psm3/psm_verbs_mr.c \ + psm3/psm_verbs_mr.h \ + psm3/psm_udp_ep.c \ + psm3/psm_udp_ep.h \ + psm3/psmi_wrappers.c \ + psm3/psmi_wrappers.h \ + psm3/psm2.h \ + psm3/psm2_am.h \ + psm3/psm2_hal.c \ + psm3/psm2_hal.h \ + psm3/psm2_hal_inlines_i.h \ + psm3/psm2_hal_inlines_d.h \ + psm3/psm2_hal_inline_t.h \ + psm3/psm2_mq.h \ + psm3/ptl.h +libpsm2_la_CPPFLAGS = \ + $(_CPPFLAGS) + +libpsm2_la_LIBADD = \ + libopa.la \ + libuuid.la \ + libptl_am.la \ + libptl_ips.la \ + libptl_self.la \ + libpsm_hal_gen1.la + +libpsm2_la_DEPENDENCIES = \ + libopa.la \ + libuuid.la \ + libptl_am.la \ + libptl_ips.la \ + libptl_self.la \ + libpsm_hal_gen1.la + +EXTRA_DIST += \ + psm3/include/rbtree.c \ + psm3/psm_hal_gen1/psm_hal_gen1_spio.c + +chksum_srcs += \ + $(libptl_am_la_SOURCES) $(libptl_ips_la_SOURCES) $(libptl_self_la_SOURCES) \ + $(libuuid_la_SOURCES) $(libopa_la_SOURCES) $(libpsm_hal_gen1_la_SOURCES) \ + $(libpsm2_la_SOURCES) diff --git a/prov/psm3/psm3/include/linux-i386/bit_ops.h b/prov/psm3/psm3/include/linux-i386/bit_ops.h new file mode 100644 index 00000000000..d272e755d9b --- /dev/null +++ b/prov/psm3/psm3/include/linux-i386/bit_ops.h @@ -0,0 +1,98 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _HFI_i386_BIT_OPS_H +#define _HFI_i386_BIT_OPS_H + +static __inline__ void ips_clear_bit(int nr, volatile unsigned long *addr) +{ + asm volatile (LOCK_PREFIX "btrl %1,%0" : "=m"(*addr) : "dIr"(nr)); +} + +static __inline__ void ips_change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile (LOCK_PREFIX "btcl %1,%0" : "=m"(*addr) : "dIr"(nr)); +} + +static __inline__ int ips_test_and_set_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile (LOCK_PREFIX "btsl %2,%1\n\tsbbl %0,%0" : "=r"(oldbit), + "=m"(*addr) : "dIr"(nr) : "memory"); + return oldbit; +} + +static __inline__ void ips___clear_bit(int nr, volatile unsigned long *addr) +{ + asm volatile ("btrl %1,%0" : "=m" (*addr) : "dIr"(nr)); +} + +static __inline__ void ips___change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile ("btcl %1,%0" : "=m" (*addr) : "dIr"(nr)); +} + +static __inline__ int ips___test_and_set_bit(int nr, + volatile unsigned long *addr) +{ + int oldbit; + + asm volatile ("btsl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), + "=m"(*addr) : "dIr"(nr) : "memory"); + return oldbit; +} + +#endif /* _HFI_i386_BIT_OPS_H */ diff --git a/prov/psm3/psm3/include/linux-i386/sysdep.h b/prov/psm3/psm3/include/linux-i386/sysdep.h new file mode 100644 index 00000000000..bfd5746455a --- /dev/null +++ b/prov/psm3/psm3/include/linux-i386/sysdep.h @@ -0,0 +1,171 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ + +#ifndef _HFI_i386_SYSDEP_H +#define _HFI_i386_SYSDEP_H + +typedef struct cpuid { + unsigned eax, ebx, ecx, edx; +} cpuid_t; + +static __inline__ void +get_cpuid(const unsigned func, const unsigned subfunc, cpuid_t *id) +{ + unsigned a, b, c, d; + + asm (" \ + mov %4, %%eax \n\ + mov %5, %%ecx \n\ + cpuid \n\ + mov %%eax, %0 \n\ + mov %%ebx, %1 \n\ + mov %%ecx, %2 \n\ + mov %%edx, %3 \n\ + " : "=g" (a), "=g" (b), "=g" (c), "=g" (d) + : "g" (func), "g" (subfunc) + : "%eax", "%ebx", "%ecx", "%edx" + ); + + id->eax = a; + id->ebx = b; + id->ecx = c; + id->edx = d; +} + +static __inline__ uint64_t get_cycles(void) +{ + uint64_t v; + uint32_t a, d; + + asm volatile ("rdtsc" : "=a" (a), "=d"(d)); + v = ((uint64_t) a) | (((uint64_t) d) << 32); + + return v; +} + +#ifndef LOCK_PREFIX +#define LOCK_PREFIX "lock " +#endif + +static __inline__ void ips_barrier() +{ + asm volatile ("" : : : "memory"); +} + +static __inline__ void ips_mb() +{ + asm volatile ("mfence" : : : "memory"); +} + +/* gcc-3.4 has a bug with this function body at -O0 */ +static +#if defined(__GNUC__) && __GNUC__ == 3 && __GNUC_MINOR__ == 4 +#else +__inline__ +#endif +void ips_rmb() +{ + asm volatile ("" : : : "memory"); +} + +static __inline__ void ips_wmb() +{ + asm volatile ("sfence" : : : "memory"); +} + +static __inline__ void ips_sync_writes() +{ + asm volatile ("sfence" : : : "memory"); +} + +static __inline__ void ips_sync_reads() +{ + asm volatile ("lfence" : : : "memory"); +} + +static __inline__ uint32_t ips_cmpxchg(volatile uint32_t *ptr, + uint32_t old_val, uint32_t new_val) +{ + uint32_t prev; + struct xchg_dummy { + uint32_t a[100]; + }; + + asm volatile (LOCK_PREFIX "cmpxchgl %1,%2" : "=a"(prev) + : "q"(new_val), "m"(*(struct xchg_dummy *)ptr), "0"(old_val) + : "memory"); + + return prev; +} + +typedef struct { + volatile int32_t counter; +} ips_atomic_t; + +#define ips_atomic_set(v, i) (((v)->counter) = (i)) +#define ips_atomic_cmpxchg(p, oval, nval) \ + ips_cmpxchg((volatile uint32_t *) &((p)->counter), oval, nval) + +#if 0 +static __inline__ int32_t +ips_cmpxchg(volatile int32_t *p, int32_t old_value, int32_t new_value) +{ + asm volatile ("lock cmpxchg %2, %0" : + "+m" (*p), "+a"(old_value) : "r"(new_value) : "memory"); + return old_value; +} +#endif + +#endif /* _HFI_i386_SYSDEP_H */ diff --git a/prov/psm3/psm3/include/opa_byteorder.h b/prov/psm3/psm3/include/opa_byteorder.h new file mode 100644 index 00000000000..bc909c18953 --- /dev/null +++ b/prov/psm3/psm3/include/opa_byteorder.h @@ -0,0 +1,265 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef OPA_BYTEORDER_H +#define OPA_BYTEORDER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#ifndef __BYTE_ORDER +# error "BYTE_ORDER undefined" +#endif + +typedef __u16 __le16; +typedef __u16 __be16; +typedef __u32 __le32; +typedef __u32 __be32; +typedef __u64 __le64; +typedef __u64 __be64; + +static __inline__ __u16 __hfi_fswab16(__u16) + __attribute__ ((always_inline)); +static __inline__ __u32 __hfi_fswab32(__u32) + __attribute__ ((always_inline)); +static __inline__ __u64 __hfi_fswab64(__u64) + __attribute__ ((always_inline)); + +static __inline__ __u16 __hfi_fswab16(__u16 x) { + return ((x & (__u16) 0x00ffU) << 8) + | ((x & (__u16) 0xff00U) >> 8); +} static __inline__ __u32 __hfi_fswab32(__u32 x) { + return ((x & (__u32) 0x000000ffUL) << 24) + | ((x & (__u32) 0x0000ff00UL) << 8) + | ((x & (__u32) 0x00ff0000UL) >> 8) + | ((x & (__u32) 0xff000000UL) >> 24); +} + +static __inline__ __u64 __hfi_fswab64(__u64 x) { + return ((x & (__u64) 0x00000000000000ffULL) << 56) + | ((x & (__u64) 0x000000000000ff00ULL) << 40) + | ((x & (__u64) 0x0000000000ff0000ULL) << 24) + | ((x & (__u64) 0x00000000ff000000ULL) << 8) + | ((x & (__u64) 0x000000ff00000000ULL) >> 8) + | ((x & (__u64) 0x0000ff0000000000ULL) >> 24) + | ((x & (__u64) 0x00ff000000000000ULL) >> 40) + | ((x & (__u64) 0xff00000000000000ULL) >> 56); +} + +static __inline__ __u16 __cpu_to_le16(__le16) + __attribute__ ((always_inline)); +static __inline__ __u32 __cpu_to_le32(__le32) + __attribute__ ((always_inline)); +static __inline__ __u64 __cpu_to_le64(__le64) + __attribute__ ((always_inline)); + +static __inline__ __u16 __le16_to_cpu(__le16) + __attribute__ ((always_inline)); +static __inline__ __u32 __le32_to_cpu(__le32) + __attribute__ ((always_inline)); +static __inline__ __u64 __le64_to_cpu(__le64) + __attribute__ ((always_inline)); + +static __inline__ __u16 __cpu_to_be16(__be16) + __attribute__ ((always_inline)); +static __inline__ __u32 __cpu_to_be32(__be32) + __attribute__ 
((always_inline)); +static __inline__ __u64 __cpu_to_be64(__be64) + __attribute__ ((always_inline)); + +static __inline__ __u16 __be16_to_cpu(__be16) + __attribute__ ((always_inline)); +static __inline__ __u32 __be32_to_cpu(__be32) + __attribute__ ((always_inline)); +static __inline__ __u64 __be64_to_cpu(__be64) + __attribute__ ((always_inline)); + +#if __BYTE_ORDER == __LITTLE_ENDIAN + +/* + * __cpu_to_le* routines + */ +static __inline__ __le16 __cpu_to_le16(__u16 x) { + return x; +} + +static __inline__ __le32 __cpu_to_le32(__u32 x) { + return x; +} + +static __inline__ __le64 __cpu_to_le64(__u64 x) { + return x; +} + +/* + * __le*_to_cpu routines + */ +static __inline__ __u16 __le16_to_cpu(__le16 x) { + return x; +} + +static __inline__ __u32 __le32_to_cpu(__le32 x) { + return x; +} + +static __inline__ __u64 __le64_to_cpu(__le64 x) { + return x; +} + +/* + * __cpu_to_be* routines + */ +static __inline__ __be16 __cpu_to_be16(__u16 x) { + return __hfi_fswab16(x); +} + +static __inline__ __be32 __cpu_to_be32(__u32 x) { + return __hfi_fswab32(x); +} + +static __inline__ __be64 __cpu_to_be64(__u64 x) { + return __hfi_fswab64(x); +} + +/* + * __be*_to_cpu routines + */ +static __inline__ __u16 __be16_to_cpu(__be16 x) { + return __hfi_fswab16(x); +} + +static __inline__ __u32 __be32_to_cpu(__be32 x) { + return __hfi_fswab32(x); +} + +static __inline__ __u64 __be64_to_cpu(__be64 x) { + return __hfi_fswab64(x); +} + +#elif __BYTE_ORDER == __BIG_ENDIAN + +/* + * __cpu_to_le* routines + */ +static __inline__ __le16 __cpu_to_le16(__u16 x) { + return __hfi_fswab16(x); +} + +static __inline__ __le32 __cpu_to_le32(__u32 x) { + return __hfi_fswab32(x); +} + +static __inline__ __le64 __cpu_to_le64(__u64 x) { + return __hfi_fswab64(x); +} + +/* + * __le*_to_cpu routines + */ +static __inline__ __u16 __le16_to_cpu(__le16 x) { + return __hfi_fswab16(x); +} + +static __inline__ __u32 __le32_to_cpu(__le32 x) { + return __hfi_fswab32(x); +} + +static __inline__ __u64 __le64_to_cpu(__le64 x) { + return __hfi_fswab64(x); +} + +/* + * __cpu_to_be* routines + */ +static __inline__ __be16 __cpu_to_be16(__u16 x) { + return x; +} + +static __inline__ __be32 __cpu_to_be32(__u32 x) { + return x; +} + +static __inline__ __be64 __cpu_to_be64(__u64 x) { + return x; +} + +/* + * __be*_to_cpu routines + */ +static __inline__ __u16 __be16_to_cpu(__be16 x) { + return x; +} + +static __inline__ __u32 __be32_to_cpu(__be32 x) { + return x; +} + +static __inline__ __u64 __be64_to_cpu(__be64 x) { + return x; +} + +#else +# error "unsupported BYTE_ORDER: " #BYTE_ORDER +#endif + +#ifdef __cplusplus +} /* extern "C" */ +#endif +#endif /* OPA_BYTEORDER_H */ diff --git a/prov/psm3/psm3/include/opa_debug.h b/prov/psm3/psm3/include/opa_debug.h new file mode 100644 index 00000000000..ce6ef1051a5 --- /dev/null +++ b/prov/psm3/psm3/include/opa_debug.h @@ -0,0 +1,116 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef OPA_DEBUG_H +#define OPA_DEBUG_H + +// See opa_udebug.h for macros and comments about these settings + +#ifndef _HFI_DEBUGGING /* debugging enabled or not */ +#define _HFI_DEBUGGING 1 +#endif + +#if _HFI_DEBUGGING + +/* + * Mask values for debugging. The scheme allows us to compile out any + * of the debug tracing stuff, and if compiled in, to enable or disable + * dynamically. This can be set at modprobe time also: + * modprobe hfi.ko hfi_debug=7 + */ + +#define __HFI_INFO 0x1 /* generic low verbosity stuff */ +#define __HFI_DBG 0x2 /* generic debug */ +#define __HFI_TRSAMPLE 0x8 /* generate trace buffer sample entries */ +/* leave some low verbosity spots open */ +/* Debug messages specific to UD */ +#define __HFI_UDDBG 0x10 +/* Debug messages related to the connection protocol. 
*/ +#define __HFI_CONNDBG 0x20 +#define __HFI_VERBDBG 0x40 /* very verbose debug */ +#define __HFI_PKTDBG 0x80 /* print packet data */ +/* print process startup (init)/exit messages and important env vars */ +#define __HFI_PROCDBG 0x100 +/* print MR, mmap/nopage stuff, not using VDBG any more */ +#define __HFI_MMDBG 0x200 +/* low-level environment variables */ +#define __HFI_ENVDBG 0x400 +#define __HFI_EPKTDBG 0x800 /* print error packet data */ +#define __HFI_CCADBG 0x1000 /* print CCA related events */ +#else /* _HFI_DEBUGGING */ + +/* + * define all of these even with debugging off, for the few places that do + * if(hfi_debug & _HFI_xyzzy), but in a way that will make the + * compiler eliminate the code + */ + +#define __HFI_INFO 0x0 /* generic low verbosity stuff */ +#define __HFI_DBG 0x0 /* generic debug */ +#define __HFI_TRSAMPLE 0x0 /* generate trace buffer sample entries */ +#define __HFI_UDDBG 0x0 +#define __HFI_CONNDBG 0x0 +#define __HFI_VERBDBG 0x0 /* very verbose debug */ +#define __HFI_PKTDBG 0x0 /* print packet data */ +#define __HFI_PROCDBG 0x0 /* print process startup (init)/exit messages */ +/* print MR, mmap/nopage stuff, not using VDBG any more */ +#define __HFI_MMDBG 0x0 +#define __HFI_CCADBG 0x0 /* print CCA related events */ + +#endif /* _HFI_DEBUGGING */ + +#define __HFI_VERBOSEDBG __HFI_VERBDBG + +#endif /* OPA_DEBUG_H */ diff --git a/prov/psm3/psm3/include/opa_intf.h b/prov/psm3/psm3/include/opa_intf.h new file mode 100644 index 00000000000..725418765e4 --- /dev/null +++ b/prov/psm3/psm3/include/opa_intf.h @@ -0,0 +1,98 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef OPA_INTF_H +#define OPA_INTF_H + +#include +#include +#include + +#ifdef __inline__ +#undef __inline__ +#endif +#define __inline__ inline __attribute__((always_inline, unused)) + +#include "sysdep.h" +#include "bit_ops.h" + +/* these aren't implemented for user mode, which is OK until we multi-thread */ +typedef struct _atomic { + uint32_t counter; +} atomic_t; /* no atomic_t type in user-land */ +#define atomic_set(a, v) ((a)->counter = (v)) +#define atomic_inc_return(a) (++(a)->counter) + +#if defined(__GNUC__) +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1L) +#endif +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0L) +#endif +#ifndef if_pt +#define if_pt(cond) if (likely(cond)) +#endif +#ifndef if_pf +#define if_pf(cond) if (unlikely(cond)) +#endif +#define _Pragma_unlikely +#define _Pragma_likely +#else +#error "Unsupported compiler" +#endif + +#define yield() sched_yield() +#endif /* OPA_INTF_H */ diff --git a/prov/psm3/psm3/include/opa_queue.h b/prov/psm3/psm3/include/opa_queue.h new file mode 100644 index 00000000000..f3d9595455a --- /dev/null +++ b/prov/psm3/psm3/include/opa_queue.h @@ -0,0 +1,512 @@ +/* + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + * $FreeBSD: src/sys/sys/queue.h,v 1.32.2.7 2002/04/17 14:21:02 des Exp $ + */ + +#ifndef OPA_QUEUE_H_ +#define OPA_QUEUE_H_ + +/* + * This file defines five types of data structures: singly-linked lists, + * singly-linked tail queues, lists, tail queues, and circular queues. + * + * A singly-linked list is headed by a single forward pointer. The elements + * are singly linked for minimum space and pointer manipulation overhead at + * the expense of O(n) removal for arbitrary elements. New elements can be + * added to the list after an existing element or at the head of the list. + * Elements being removed from the head of the list should use the explicit + * macro for this purpose for optimum efficiency. A singly-linked list may + * only be traversed in the forward direction. Singly-linked lists are ideal + * for applications with large datasets and few or no removals or for + * implementing a LIFO queue. + * + * A singly-linked tail queue is headed by a pair of pointers, one to the + * head of the list and the other to the tail of the list. The elements are + * singly linked for minimum space and pointer manipulation overhead at the + * expense of O(n) removal for arbitrary elements. New elements can be added + * to the list after an existing element, at the head of the list, or at the + * end of the list. Elements being removed from the head of the tail queue + * should use the explicit macro for this purpose for optimum efficiency. + * A singly-linked tail queue may only be traversed in the forward direction. + * Singly-linked tail queues are ideal for applications with large datasets + * and few or no removals or for implementing a FIFO queue. + * + * A list is headed by a single forward pointer (or an array of forward + * pointers for a hash table header). The elements are doubly linked + * so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before + * or after an existing element or at the head of the list. A list + * may only be traversed in the forward direction. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. A tail queue may be traversed in either direction. + * + * A circle queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or after + * an existing element, at the head of the list, or at the end of the list. 
+ * A circle queue may be traversed in either direction, but has a more + * complex end of list detection. + * + * For details on the use of these macros, see the queue(3) manual page. + * + * + * SLIST LIST STAILQ TAILQ CIRCLEQ + * _HEAD + + + + + + * _HEAD_INITIALIZER + + + + + + * _ENTRY + + + + + + * _INIT + + + + + + * _EMPTY + + + + + + * _FIRST + + + + + + * _NEXT + + + + + + * _PREV - - - + + + * _LAST - - + + + + * _FOREACH + + + + + + * _FOREACH_REVERSE - - - + + + * _INSERT_HEAD + + + + + + * _INSERT_BEFORE - + - + + + * _INSERT_AFTER + + + + + + * _INSERT_TAIL - - + + + + * _REMOVE_HEAD + - + - - + * _REMOVE + + + + + + * + */ + +/* + * Singly-linked List declarations. + */ +#define SLIST_HEAD(name, type) \ +struct name { \ + struct type *slh_first; /* first element */ \ +} + +#define SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define SLIST_ENTRY(type) \ +struct { \ + struct type *sle_next; /* next element */ \ +} + +/* + * Singly-linked List functions. + */ +#define SLIST_EMPTY(head) ((head)->slh_first == NULL) + +#define SLIST_FIRST(head) ((head)->slh_first) + +#define SLIST_FOREACH(var, head, field) \ + for ((var) = SLIST_FIRST((head)); \ + (var); \ + (var) = SLIST_NEXT((var), field)) + +#define SLIST_INIT(head) do { \ + SLIST_FIRST((head)) = NULL; \ +} while (0) + +#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ + SLIST_NEXT((slistelm), field) = (elm); \ +} while (0) + +#define SLIST_INSERT_HEAD(head, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ + SLIST_FIRST((head)) = (elm); \ +} while (0) + +#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#define SLIST_REMOVE(head, elm, type, field) do { \ + if (SLIST_FIRST((head)) == (elm)) { \ + SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = SLIST_FIRST((head)); \ + while (SLIST_NEXT(curelm, field) != (elm)) \ + curelm = SLIST_NEXT(curelm, field); \ + SLIST_NEXT(curelm, field) = \ + SLIST_NEXT(SLIST_NEXT(curelm, field), field); \ + } \ +} while (0) + +#define SLIST_REMOVE_HEAD(head, field) do { \ + SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ +} while (0) + +/* + * Singly-linked Tail queue declarations. + */ +#define STAILQ_HEAD(name, type) \ +struct name { \ + struct type *stqh_first;/* first element */ \ + struct type **stqh_last;/* addr of last next element */ \ +} + +#define STAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).stqh_first } + +#define STAILQ_ENTRY(type) \ +struct { \ + struct type *stqe_next; /* next element */ \ +} + +/* + * Singly-linked Tail queue functions. 
+ */ +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) + +#define STAILQ_FIRST(head) ((head)->stqh_first) + +#define STAILQ_FOREACH(var, head, field) \ + for ((var) = STAILQ_FIRST((head)); \ + (var); \ + (var) = STAILQ_NEXT((var), field)) + +#define STAILQ_INIT(head) do { \ + STAILQ_FIRST((head)) = NULL; \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_NEXT((tqelm), field) = (elm); \ +} while (0) + +#define STAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_FIRST((head)) = (elm); \ +} while (0) + +#define STAILQ_INSERT_TAIL(head, elm, field) do { \ + STAILQ_NEXT((elm), field) = NULL; \ + *(head)->stqh_last = (elm); \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_LAST(head, type, field) \ + (STAILQ_EMPTY(head) ? \ + NULL : \ + ((struct type *) \ + ((char *)((head)->stqh_last) - offsetof(struct type, field)))) + +#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) + +#define STAILQ_REMOVE(head, elm, type, field) do { \ + if (STAILQ_FIRST((head)) == (elm)) { \ + STAILQ_REMOVE_HEAD(head, field); \ + } \ + else { \ + struct type *curelm = STAILQ_FIRST((head)); \ + while (STAILQ_NEXT(curelm, field) != (elm)) \ + curelm = STAILQ_NEXT(curelm, field); \ + if ((STAILQ_NEXT(curelm, field) = \ + STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((curelm), field);\ + } \ +} while (0) + +#define STAILQ_REMOVE_HEAD(head, field) do { \ + if ((STAILQ_FIRST((head)) = \ + STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \ + if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +/* + * List declarations. + */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define LIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define LIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +/* + * List functions. 
+ */ + +#define LIST_EMPTY(head) ((head)->lh_first == NULL) + +#define LIST_FIRST(head) ((head)->lh_first) + +#define LIST_FOREACH(var, head, field) \ + for ((var) = LIST_FIRST((head)); \ + (var); \ + (var) = LIST_NEXT((var), field)) + +#define LIST_INIT(head) do { \ + LIST_FIRST((head)) = NULL; \ +} while (0) + +#define LIST_INSERT_AFTER(listelm, elm, field) do { \ + if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ + LIST_NEXT((listelm), field)->field.le_prev = \ + &LIST_NEXT((elm), field); \ + LIST_NEXT((listelm), field) = (elm); \ + (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ +} while (0) + +#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + LIST_NEXT((elm), field) = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ +} while (0) + +#define LIST_INSERT_HEAD(head, elm, field) do { \ + if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ + LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ + LIST_FIRST((head)) = (elm); \ + (elm)->field.le_prev = &LIST_FIRST((head)); \ +} while (0) + +#define LIST_NEXT(elm, field) ((elm)->field.le_next) + +#define LIST_REMOVE(elm, field) do { \ + if (LIST_NEXT((elm), field) != NULL) \ + LIST_NEXT((elm), field)->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = LIST_NEXT((elm), field); \ +} while (0) + +/* + * Tail queue declarations. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ +} + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first } + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ +} + +/* + * Tail queue functions. 
+ */ +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) + +#define TAILQ_FIRST(head) ((head)->tqh_first) + +#define TAILQ_FOREACH(var, head, field) \ + for ((var) = TAILQ_FIRST((head)); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_INIT(head) do { \ + TAILQ_FIRST((head)) = NULL; \ + (head)->tqh_last = &TAILQ_FIRST((head)); \ +} while (0) + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_NEXT((listelm), field) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ +} while (0) + +#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + TAILQ_NEXT((elm), field) = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ +} while (0) + +#define TAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ + TAILQ_FIRST((head))->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_FIRST((head)) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ +} while (0) + +#define TAILQ_INSERT_TAIL(head, elm, field) do { \ + TAILQ_NEXT((elm), field) = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ +} while (0) + +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) + +#define TAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) + +#define TAILQ_REMOVE(head, elm, field) do { \ + if ((TAILQ_NEXT((elm), field)) != NULL) \ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ +} while (0) + +/* + * Circular queue declarations. + */ +#define CIRCLEQ_HEAD(name, type) \ +struct name { \ + struct type *cqh_first; /* first element */ \ + struct type *cqh_last; /* last element */ \ +} + +#define CIRCLEQ_HEAD_INITIALIZER(head) \ + { (void *)&(head), (void *)&(head) } + +#define CIRCLEQ_ENTRY(type) \ +struct { \ + struct type *cqe_next; /* next element */ \ + struct type *cqe_prev; /* previous element */ \ +} + +/* + * Circular queue functions. 
+ */ +#define CIRCLEQ_EMPTY(head) ((head)->cqh_first == (void *)(head)) + +#define CIRCLEQ_FIRST(head) ((head)->cqh_first) + +#define CIRCLEQ_FOREACH(var, head, field) \ + for ((var) = CIRCLEQ_FIRST((head)); \ + (var) != (void *)(head) || ((var) = NULL); \ + (var) = CIRCLEQ_NEXT((var), field)) + +#define CIRCLEQ_FOREACH_REVERSE(var, head, field) \ + for ((var) = CIRCLEQ_LAST((head)); \ + (var) != (void *)(head) || ((var) = NULL); \ + (var) = CIRCLEQ_PREV((var), field)) + +#define CIRCLEQ_INIT(head) do { \ + CIRCLEQ_FIRST((head)) = (void *)(head); \ + CIRCLEQ_LAST((head)) = (void *)(head); \ +} while (0) + +#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = CIRCLEQ_NEXT((listelm), field); \ + CIRCLEQ_PREV((elm), field) = (listelm); \ + if (CIRCLEQ_NEXT((listelm), field) == (void *)(head)) \ + CIRCLEQ_LAST((head)) = (elm); \ + else \ + CIRCLEQ_PREV(CIRCLEQ_NEXT((listelm), field), field) = (elm);\ + CIRCLEQ_NEXT((listelm), field) = (elm); \ +} while (0) + +#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = (listelm); \ + CIRCLEQ_PREV((elm), field) = CIRCLEQ_PREV((listelm), field); \ + if (CIRCLEQ_PREV((listelm), field) == (void *)(head)) \ + CIRCLEQ_FIRST((head)) = (elm); \ + else \ + CIRCLEQ_NEXT(CIRCLEQ_PREV((listelm), field), field) = (elm);\ + CIRCLEQ_PREV((listelm), field) = (elm); \ +} while (0) + +#define CIRCLEQ_INSERT_HEAD(head, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = CIRCLEQ_FIRST((head)); \ + CIRCLEQ_PREV((elm), field) = (void *)(head); \ + if (CIRCLEQ_LAST((head)) == (void *)(head)) \ + CIRCLEQ_LAST((head)) = (elm); \ + else \ + CIRCLEQ_PREV(CIRCLEQ_FIRST((head)), field) = (elm); \ + CIRCLEQ_FIRST((head)) = (elm); \ +} while (0) + +#define CIRCLEQ_INSERT_TAIL(head, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = (void *)(head); \ + CIRCLEQ_PREV((elm), field) = CIRCLEQ_LAST((head)); \ + if (CIRCLEQ_FIRST((head)) == (void *)(head)) \ + CIRCLEQ_FIRST((head)) = (elm); \ + else \ + CIRCLEQ_NEXT(CIRCLEQ_LAST((head)), field) = (elm); \ + CIRCLEQ_LAST((head)) = (elm); \ +} while (0) + +#define CIRCLEQ_LAST(head) ((head)->cqh_last) + +#define CIRCLEQ_NEXT(elm, field) ((elm)->field.cqe_next) + +#define CIRCLEQ_PREV(elm, field) ((elm)->field.cqe_prev) + +#define CIRCLEQ_REMOVE(head, elm, field) do { \ + if (CIRCLEQ_NEXT((elm), field) == (void *)(head)) \ + CIRCLEQ_LAST((head)) = CIRCLEQ_PREV((elm), field); \ + else \ + CIRCLEQ_PREV(CIRCLEQ_NEXT((elm), field), field) = \ + CIRCLEQ_PREV((elm), field); \ + if (CIRCLEQ_PREV((elm), field) == (void *)(head)) \ + CIRCLEQ_FIRST((head)) = CIRCLEQ_NEXT((elm), field); \ + else \ + CIRCLEQ_NEXT(CIRCLEQ_PREV((elm), field), field) = \ + CIRCLEQ_NEXT((elm), field); \ +} while (0) + +#endif /* !OPA_QUEUE_H_ */ diff --git a/prov/psm3/psm3/include/opa_revision.h b/prov/psm3/psm3/include/opa_revision.h new file mode 100644 index 00000000000..4a288219d68 --- /dev/null +++ b/prov/psm3/psm3/include/opa_revision.h @@ -0,0 +1,64 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef OPA_REVISION_H +#define OPA_REVISION_H + +/* Those variables are defined in the _revision.c file +which is dynamically generated during building of the library */ +extern char psmi_hfi_IFS_version[]; +extern char psmi_hfi_build_timestamp[]; +extern char psmi_hfi_sources_checksum[]; +extern char psmi_hfi_git_checksum[]; + +#endif /* OPA_REVISION_H */ diff --git a/prov/psm3/psm3/include/opa_service.h b/prov/psm3/psm3/include/opa_service.h new file mode 100644 index 00000000000..20bdfc96966 --- /dev/null +++ b/prov/psm3/psm3/include/opa_service.h @@ -0,0 +1,97 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef OPA_SERVICE_H +#define OPA_SERVICE_H + +/* This file contains all the lowest level routines calling into sysfs */ +/* and qib driver. All other calls are based on these routines. */ + +#include + +#include "opa_intf.h" +#include "opa_udebug.h" +#include "opa_byteorder.h" + +/* upper and lower bounds for HFI port numbers */ +#define HFI_MIN_PORT 1 +#define HFI_MAX_PORT 1 + +/* any unit id to match. */ +#define PSM3_NIC_ANY ((long)-1) +/* any port num to match. */ +#define PSM3_NIC_PORT_ANY ((long)0) + + +/* sysfs helper routines (only those currently used are exported; + * try to avoid using others) */ + +/* Initializes the following sysfs helper routines. + sysfs_init() returns 0 on success, non-zero on an error: */ +int sysfs_init(const char *dflt_hfi_class_path); + +const char *sysfs_unit_path(int unit_id); +const char *sysfs_unit_dev_name(int unit_id); +int sysfs_find_unit(const char *name); +/* Complementary */ +void sysfs_fini(void); + +/* read a string value into buff, no more than size bytes. + returns the number of bytes read */ +size_t hfi_sysfs_unit_port_read(uint32_t unit, uint32_t port, const char *attr, + char *buff, size_t size); + + +int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit); + +#endif /* OPA_SERVICE_H */ diff --git a/prov/psm3/psm3/include/opa_udebug.h b/prov/psm3/psm3/include/opa_udebug.h new file mode 100644 index 00000000000..ee81d6f430e --- /dev/null +++ b/prov/psm3/psm3/include/opa_udebug.h @@ -0,0 +1,232 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef OPA_UDEBUG_H +#define OPA_UDEBUG_H + +#include +#include "opa_debug.h" + +// To have a a message be unconditionally output for all builds, regardless of +// env variables, use _HFI_ERROR or _HFI_UNIT_ERROR +// All other logging macros are under the control of the user via env variables +// and build options can disable them +// +// Other logging calls are only enabled if _HFI_DEBUGGING is defined, +// in which case _HFI_INFO is also enabled by default (but env can disable it). +// All others controlled by env variable. +// +// Currently opa_debug.h always defines _HFI_DEBUGGING and it is included by +// opa_udebug.h, so logging is presently enabled in all builds. At some point +// may want to explore a performance optimization and disable logging macros +// for lower level debug messages in non-debug builds. +// +// See psmi_handle_error in psm_error.h. Use of it's PSMI_EP_NO_RETURN option +// can unconditionally output a message and abort. + +extern unsigned hfi_debug; +const char *hfi_get_unit_name(int unit); +extern char *__progname; + +static const char hfi_ident_tag[] = "PSM3_IDENTIFY"; +char *hfi_get_mylabel(); +int hfi_get_myrank(); // -1 if unknown +int hfi_get_myrank_count(); // -1 if unknown +int hfi_get_mylocalrank(); // -1 if unknown +int hfi_get_mylocalrank_count(); // -1 if unknown + +#if _HFI_DEBUGGING + +extern char *__hfi_mylabel; +void hfi_set_mylabel(char *); +extern FILE *__hfi_dbgout; + +#define _HFI_UNIT_ERROR(unit, fmt, ...) \ + do { \ + _Pragma_unlikely \ + printf("%s.%s: " fmt, __hfi_mylabel, __progname, \ + ##__VA_ARGS__); \ + } while (0) + +#define _HFI_ERROR(fmt, ...) \ + do { \ + _Pragma_unlikely \ + printf("%s.%s: " fmt, __hfi_mylabel, __progname, \ + ##__VA_ARGS__); \ + } while (0) + +#define _HFI_INFO(fmt, ...) 
\ + do { \ + _Pragma_unlikely \ + if (unlikely(hfi_debug&__HFI_INFO)) \ + printf("%s.%s: " fmt, __hfi_mylabel, __func__, \ + ##__VA_ARGS__); \ + } while (0) + +#define __HFI_PKTDBG_ON unlikely(hfi_debug & __HFI_PKTDBG) + +#define __HFI_DBG_WHICH(which, fmt, ...) \ + do { \ + _Pragma_unlikely \ + if (unlikely(hfi_debug&(which))) \ + fprintf(__hfi_dbgout, "%s.%s: " fmt, __hfi_mylabel, __func__, \ + ##__VA_ARGS__); \ + } while (0) + +#define __HFI_DBG_WHICH_NOFUNC(which, fmt, ...) \ + do { \ + _Pragma_unlikely \ + if (unlikely(hfi_debug&(which))) \ + fprintf(__hfi_dbgout, "%s: " fmt, __hfi_mylabel, \ + ##__VA_ARGS__); \ + } while (0) + +#define _HFI_DBG(fmt, ...) __HFI_DBG_WHICH(__HFI_DBG, fmt, ##__VA_ARGS__) +#define _HFI_UDDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_UDDBG, fmt, ##__VA_ARGS__) +#define _HFI_CONNDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_CONNDBG, fmt, ##__VA_ARGS__) +#define _HFI_VDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_VERBDBG, fmt, ##__VA_ARGS__) +#define _HFI_PDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_PKTDBG, fmt, ##__VA_ARGS__) +#define _HFI_EPDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_EPKTDBG, fmt, ##__VA_ARGS__) +#define _HFI_PRDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_PROCDBG, fmt, ##__VA_ARGS__) +#define _HFI_ENVDBG(lev, fmt, ...) \ + __HFI_DBG_WHICH_NOFUNC( \ + (lev == 0) ? __HFI_INFO : \ + (lev > 1 ? __HFI_ENVDBG : (__HFI_PROCDBG|__HFI_ENVDBG)),\ + "env " fmt, ##__VA_ARGS__) +#define _HFI_MMDBG(fmt, ...) __HFI_DBG_WHICH(__HFI_MMDBG, fmt, ##__VA_ARGS__) +#define _HFI_CCADBG(fmt, ...) __HFI_DBG_WHICH(__HFI_CCADBG, fmt, ##__VA_ARGS__) + +/* + * Use these macros (_HFI_DBG_ON and _HFI_DBG_ALWAYS) together + * for a scope of code preparing debug info for printing; e.g. + * if (_HFI_DBG_ON) { + * // put your code here + * _HFI_DBG_ALWAYS(print your results here); + * } + */ +#define _HFI_DBG_ON unlikely(hfi_debug & __HFI_DBG) +#define _HFI_DBG_ALWAYS(fmt, ...) \ + do { \ + _Pragma_unlikely \ + fprintf(__hfi_dbgout, "%s: " fmt, __hfi_mylabel, \ + ##__VA_ARGS__); \ + } while (0) + +#define _HFI_UDDBG_ON unlikely(hfi_debug & __HFI_UDDBG) +#define _HFI_UDDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) + +#define _HFI_CONNDBG_ON unlikely(hfi_debug & __HFI_CONNDBG) +#define _HFI_CONNDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) + +#define _HFI_VDBG_ON unlikely(hfi_debug & __HFI_VERBDBG) +#define _HFI_VDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) + +#define _HFI_PDBG_ON unlikely(hfi_debug & __HFI_PKTDBG) +#define _HFI_PDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) + +#define _HFI_PRDBG_ON unlikely(hfi_debug & __HFI_PROCDBG) +#define _HFI_PRDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) + +#define _HFI_CCADBG_ON unlikely(hfi_debug & __HFI_CCADBG) +#define _HFI_CCADBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) + +#define _HFI_INFO_ON unlikely(hfi_debug & __HFI_INFO) +#define _HFI_INFO_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) + +#else /* ! _HFI_DEBUGGING */ + +#define _HFI_UNIT_ERROR(unit, fmt, ...) \ + do { \ + printf("%s: " fmt, "", ##__VA_ARGS__); \ + } while (0) + +#define _HFI_ERROR(fmt, ...) \ + do { \ + printf("%s: " fmt, "", ##__VA_ARGS__); \ + } while (0) + +#define _HFI_INFO(fmt, ...) + +#define __HFI_PKTDBG_ON 0 + +#define _HFI_DBG(fmt, ...) +#define _HFI_PDBG(fmt, ...) +#define _HFI_EPDBG(fmt, ...) +#define _HFI_PRDBG(fmt, ...) +#define _HFI_ENVDBG(lev, fmt, ...) +#define _HFI_UDDBG(fmt, ...) +#define _HFI_CONNDBG(fmt, ...) +#define _HFI_VDBG(fmt, ...) +#define _HFI_MMDBG(fmt, ...) +#define _HFI_CCADBG(fmt, ...) 
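+// When _HFI_DEBUGGING is not defined, the *_ON tests below are constant 0
+// and the *_ALWAYS macros expand to nothing, so code written as
+//   if (_HFI_DBG_ON) { ... _HFI_DBG_ALWAYS(...); }
+// compiles away entirely; only _HFI_ERROR and _HFI_UNIT_ERROR still print.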
+ +#define _HFI_DBG_ON 0 +#define _HFI_DBG_ALWAYS(fmt, ...) +#define _HFI_UDDBG_ON 0 +#define _HFI_UDDBG_ALWAYS(fmt, ...) +#define _HFI_CONNDBG_ON 0 +#define _HFI_CONNDBG_ALWAYS(fmt, ...) +#define _HFI_VDBG_ON 0 +#define _HFI_VDBG_ALWAYS(fmt, ...) +#define _HFI_PRDBG_ON 0 +#define _HFI_PRDBG_ALWAYS(fmt, ...) +#define _HFI_CCADBG_ON 0 +#define _HFI_CCADBG_ALWAYS(fmt, ...) +#define _HFI_INFO_ON 0 +#define _HFI_INFO_ALWAYS(fmt, ...) + +#endif /* _HFI_DEBUGGING */ + +#endif /* OPA_UDEBUG_H */ diff --git a/prov/psm3/psm3/include/opa_user.h b/prov/psm3/psm3/include/opa_user.h new file mode 100644 index 00000000000..730b619a3f2 --- /dev/null +++ b/prov/psm3/psm3/include/opa_user.h @@ -0,0 +1,261 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef OPA_USER_H +#define OPA_USER_H + +/* This file contains all of the data structures and routines that are + publicly visible and usable (to low level infrastructure code; it is + not expected that any application, or even normal application-level library, + will ever need to use any of this). + + Additional entry points and data structures that are used by these routines + may be referenced in this file, but they should not be generally available; + they are visible here only to allow use in inlined functions. Any variable, + data structure, or function that starts with a leading "_" is in this + category. 
+*/ + +/* Include header files we need that are unlikely to otherwise be needed by */ +/* programs. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "opa_intf.h" +#include "opa_byteorder.h" +#include "opa_udebug.h" +#include "opa_service.h" + +#define HFI_TF_NFLOWS 32 + +// The sender uses an RDMA Write with Immediate. The immediate data +// carries the receiver's desc genc and idx from which the receiver can +// locate the ips_tid_recv_desc +// we have 16 bits of genc and 5 bits of desc_idx (max of HFI_TF_NFLOWS). +// leaving up to 11 bits for dest_rv_idx for RNDV_MOD (we use 9) +// so desc_idx could grow to 7 bits if needed +#define RV_INDEX_BITS 9 +#define RDMA_PACK_IMMED(desc_genc, desc_idx, dest_rv_idx) \ + ((((uint32_t)(desc_genc))&0xffff) \ + | ((((uint32_t)(desc_idx))&0x7f) << 16) \ + | ((dest_rv_idx) << (32-RV_INDEX_BITS))) +#define RDMA_UNPACK_IMMED_GENC(immed) ((immed) & 0xffff) +#define RDMA_UNPACK_IMMED_IDX(immed) (((immed) >> 16) & 0x7f) +#define RDMA_UNPACK_IMMED_RV_IDX(immed) ((immed) >> (32-RV_INDEX_BITS)) +#define RDMA_IMMED_DESC_MASK 0x7fffff // mask for desc genc and desc idx + +// source of the immediate callback +#define RDMA_IMMED_USER_RC 0 // from a user space RC QP +#define RDMA_IMMED_RV 1 // from RV module kernel QP + +/* IB - LRH header consts */ +#define HFI_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ +#define HFI_LRH_SC_SHIFT 12 +#define HFI_LRH_SC_MASK 0xf +#define HFI_LRH_SL_SHIFT 4 +#define HFI_LRH_SL_MASK 0xf +#define HFI_LRH_PKTLEN_MASK 0xfff + +/* IB - BTH header consts */ +// bth[0] +#define HFI_BTH_OPCODE_SHIFT 24 +#define HFI_BTH_OPCODE_MASK 0xff +// bth[1] +#define HFI_BTH_FLOWID_SHIFT 11 +#define HFI_BTH_FLOWID_MASK 0x1f +// bth[2] +#define HFI_BTH_SEQ_SHIFT 0 +#define HFI_BTH_SEQ_MASK 0x7ff +#define HFI_BTH_GEN_SHIFT 11 +#define HFI_BTH_GEN_MASK 0xfffff +#define HFI_BTH_ACK_SHIFT 31 + +/* KDETH header consts */ +#define HFI_KHDR_OFFSET_MASK 0x7fff +#define HFI_KHDR_OM_SHIFT 15 +#define HFI_KHDR_TID_SHIFT 16 +#define HFI_KHDR_TID_MASK 0x3ff +#define HFI_KHDR_TIDCTRL_SHIFT 26 +#define HFI_KHDR_TIDCTRL_MASK 0x3 +#define HFI_KHDR_INTR_SHIFT 28 +#define HFI_KHDR_SH_SHIFT 29 +#define HFI_KHDR_KVER_SHIFT 30 +#define HFI_KHDR_KVER_MASK 0x3 + +#define HFI_KHDR_MSGSEQ_MASK 0xffff +#define HFI_KHDR_TINYLEN_MASK 0xf +#define HFI_KHDR_TINYLEN_SHIFT 16 + +#define GET_HFI_KHDR_TIDCTRL(val) \ + (((val) >> HFI_KHDR_TIDCTRL_SHIFT) & \ + HFI_KHDR_TIDCTRL_MASK) + +#ifdef PSM_CUDA +extern int is_driver_gpudirect_enabled; + +#define PSMI_IS_DRIVER_GPUDIRECT_ENABLED likely(is_driver_gpudirect_enabled) +#define PSMI_IS_DRIVER_GPUDIRECT_DISABLED unlikely(!is_driver_gpudirect_enabled) +#endif + +/* hfi kdeth header format */ +struct hfi_kdeth { + __u32 kdeth0; + + union { + struct { + __u16 job_key; // unused for UD/UDP + __u16 hcrc; // unused for UD/UDP + }; + __u32 kdeth1; + }; +} PACK_SUFFIX; + +/* misc. 
*/ +#define HFI_CRC_SIZE_IN_BYTES 4 + +//#define HFI_DEFAULT_SERVICE_ID 0 /* let rv module decide */ +#define HFI_DEFAULT_SERVICE_ID 0x1000125500000001ULL +#define HFI_DEFAULT_P_KEY 0 /* use slot 0 as default */ + +#if 0 +#define HFI_PERMISSIVE_LID 0xFFFF +#define HFI_AETH_CREDIT_SHIFT 24 +#define HFI_AETH_CREDIT_MASK 0x1F +#define HFI_AETH_CREDIT_INVAL 0x1F +#define HFI_PSN_MASK 0xFFFFFF +#define HFI_MSN_MASK 0xFFFFFF +#define HFI_QPN_MASK 0xFFFFFF +#define HFI_MULTICAST_LID_BASE 0xC000 +#define HFI_MULTICAST_QPN 0xFFFFFF +#endif + +/* Receive Header Queue: receive type (from hfi) */ +#define RCVHQ_RCV_TYPE_EXPECTED 0 +#define RCVHQ_RCV_TYPE_EAGER 1 +#define RCVHQ_RCV_TYPE_NON_KD 2 +#define RCVHQ_RCV_TYPE_ERROR 3 + +/* OPA PSM assumes that the message header is always 56 bytes. */ +#define HFI_MESSAGE_HDR_SIZE 56 + +/* interval timing routines */ +/* Convert a count of cycles to elapsed nanoseconds */ +/* this is only accurate for reasonably large numbers of cycles (at least tens) +*/ +static __inline__ uint64_t cycles_to_nanosecs(uint64_t) + __attribute__ ((always_inline)); +/* convert elapsed nanoseconds to elapsed cycles */ +/* this is only accurate for reasonably large numbers of nsecs (at least tens) +*/ +static __inline__ uint64_t nanosecs_to_cycles(uint64_t) + __attribute__ ((always_inline)); + +/* Statistics maintained by the driver */ +const char *hfi_get_next_name(char **names); +int hfi_get_stats_names_count(void); +/* Counters maintained in the chip, globally, and per-prot */ +int hfi_get_ctrs_unit_names_count(int unitno); +int hfi_get_ctrs_port_names_count(int unitno); +/* Convert Timeout value from usec to + * timeout_mult where usec = 4.096usec * 2^timeout_mult + */ +uint8_t timeout_usec_to_mult(uint64_t timeout_us); + +uint64_t hfi_get_single_unitctr(int unit, const char *attr, uint64_t *s); +int hfi_get_single_portctr(int unit, int port, const char *attr, uint64_t *c); +void hfi_release_names(char *namep); + +/* Syslog wrapper + + level is one of LOG_EMERG, LOG_ALERT, LOG_CRIT, LOG_ERR, LOG_WARNING, + LOG_NOTICE, LOG_INFO, LOG_DEBUG. + + prefix should be a short string to describe which part of the software stack + is using syslog, i.e. "PSM", "mpi", "mpirun". +*/ +void hfi_syslog(const char *prefix, int to_console, int level, + const char *format, ...) + __attribute__((format(printf, 4, 5))); + +void hfi_vsyslog(const char *prefix, int to_console, int level, + const char *format, va_list ap); + +/* + * Copy routine that may copy a byte multiple times but optimized for througput + * This is not safe to use for PIO routines where we want a guarantee that a + * byte is only copied/moved across the bus once. 
+ */ +void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src, + uint32_t ndwords); + +extern uint32_t __hfi_pico_per_cycle; /* only for use in these functions */ + +/* this is only accurate for reasonably large numbers of cycles (at least tens) */ +static __inline__ uint64_t cycles_to_nanosecs(uint64_t cycs) +{ + return (__hfi_pico_per_cycle * cycs) / 1000ULL; +} + +/* this is only accurate for reasonably large numbers of nsecs (at least tens) */ +static __inline__ uint64_t nanosecs_to_cycles(uint64_t ns) +{ + return (ns * 1000ULL) / __hfi_pico_per_cycle; +} + +#endif /* OPA_USER_H */ diff --git a/prov/psm3/psm3/include/psm2_mock_testing.h b/prov/psm3/psm3/include/psm2_mock_testing.h new file mode 100644 index 00000000000..d1e9bff44c8 --- /dev/null +++ b/prov/psm3/psm3/include/psm2_mock_testing.h @@ -0,0 +1,176 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef PSM2_MOCK_TESTING_H +#define PSM2_MOCK_TESTING_H + +/* PSM2_MOCK_TESTING being defined flips a couple of switches so that a + * testable version of libpsm2.so is built. It'll make properly annotated + * static functions be non-static, visible to the outside. Also, all mockable + * functions will be replaced with function pointers which will originally + * point to the actual implementation. 
However, those function pointers might + * be reset by the test code, thus allowing for mocking selected PSM2 functions + * for the purpose of the test. + * + * So far the following utilities have been introduced for enabling a + * conditional compilation of the testable vs. production version of the library: + * - ustatic: toggles function visibility + * - MOCKABLE(): decorates function name so that it is visible after being mocked + * - MOCK_DCL_EPILOGUE(): declares a function pointer which will be the seam + * for mocking a function + * - MOCK_DEF_EPILOGUE(): defines a function pointer which will be the seam + * for mocking a function + * + * If the declaration and definition of a static function @c foo reside in + * different files, this would be the common use case: + * + * @code + * // somefile.c: + * int MOCKABLE(foo)(); + * MOCK_DCL_EPILOGUE(foo); + * + * // otherfile.c: + * int MOCKABLE(foo)() { + * printf("I am the original foo!\n"); + * } + * MOCK_DEF_EPILOGUE(foo); + * @endcode + * + * If the production version of the library is being built, the following code + * would result: + * @code + * // somefile.c: + * int foo(); + * + * // otherfile.c: + * int foo() { + * printf("I am the original foo!\n"); + * } + * @endcode + * + * On the other hand, if a testable version of the libary is being build, it + * would produce the following code: + * @code + * // somefile.c: + * int foo_original_(); + * extern typeof(& foo_original_) foo; + * + * // otherfile.c: + * int foo_original_() { + * printf("I am the original foo!\n"); + * } + * typeof(& foo_original_) foo = foo_original_; + * @endcode + * + * If the function to be mocked is a static function residing in the header, + * the following syntax would be used: + * @code + * // somefile.c: + * ustatic int MOCKABLE(foo)() { + * printf("I am the original foo!\n"); + * } + * MOCK_DCL_EPILOGUE(foo); + * MOCK_DEF_EPILOGUE(foo); + * @endcode + * + * If the production version of the library is being built, the following code + * would result: + * @code + * // somefile.c: + * static int foo() { + * printf("I am the original foo!\n"); + * } + * @endcode + * + * Similarly, if a testable version of the libary is being build, it would + * produce the following code: + * @code + * // somefile.c: + * int foo_original_(); + * extern typeof(& foo_original_) foo; + * typeof(& foo_original_) foo = foo_original_; + * @endcode + */ +#ifndef PSM2_MOCK_TESTING + +/* If no testing is being done, ustatic resolves to regular "static" */ +#define ustatic static +/* If no testing is being done, no indirection is introduced */ +#define MOCKABLE(fname) fname +/* If no testing is being done, no declaration epilogue is needed */ +#define MOCK_DCL_EPILOGUE(fname) +/* If no testing is being done, no definition epilogue is needed */ +#define MOCK_DEF_EPILOGUE(fname) + +#else /* ndef PSM2_MOCK_TESTING */ + +/* For the testable version, all _ustatic_ function will NOT be static */ +#define ustatic +/* TODO override inline directives in the same fashion as static */ +/* For the testable version, the actual implementation function is renamed */ +#define MOCKABLE(x) x ## _original_ +/* For the testable version, we declare the function pointer which will be the + * point of indirection for calls to that function. It must be delared after + * the declaration of the actual function happens. 
+ */ +#define MOCK_DCL_EPILOGUE(x) extern typeof(& x ## _original_) x; +/* For the testable version, we define the function pointer which will be the + * point of indirection for calls to that function. It must be delared after + * the definition of the actual function happens. + */ +#define MOCK_DEF_EPILOGUE(x) typeof(& x ## _original_) x = x ## _original_; + +#endif /* ndef PSM2_MOCK_TESTING */ + +#endif /* PSM2_MOCK_TESTING_H */ + diff --git a/prov/psm3/psm3/include/rbtree.c b/prov/psm3/psm3/include/rbtree.c new file mode 100644 index 00000000000..b79f135296f --- /dev/null +++ b/prov/psm3/psm3/include/rbtree.c @@ -0,0 +1,743 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + +/* + * Abstract: + * Implementation of quick map, a binary tree where the caller always provides + * all necessary storage. + * + * Environment: + * All + * + * $Revision$ + */ + + +/***************************************************************************** +* +* Map +* +* Map is an associative array. By providing a key, the caller can retrieve +* an object from the map. All objects in the map have an associated key, +* as specified by the caller when the object was inserted into the map. 
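+*
+* The ordering and payload types are supplied by the file that includes
+* this implementation; a minimal sketch of that contract (all names below
+* are illustrative only) is:
+* @code
+* #define RBTREE_MI_PL  struct my_item_payload  // per-node payload
+* #define RBTREE_MAP_PL struct my_map_payload   // per-map payload
+* #include "rbtree.h"
+* #define RBTREE_CMP(a, b)      my_cmp((a), (b)) // -1, 0 or 1
+* #define RBTREE_ASSERT(x)      assert(x)
+* #define RBTREE_MAP_COUNT(pl)  ((pl)->count)
+* #include "rbtree.c"
+* @endcode
+*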
+* In addition to random access, the caller can traverse the map much like +* a linked list, either forwards from the first object or backwards from +* the last object. The objects in the map are always traversed in +* order since the nodes are stored sorted. +* +* This implementation of Map uses a red black tree verified against +* Cormen-Leiserson-Rivest text, McGraw-Hill Edition, fourteenth +* printing, 1994. +* +*****************************************************************************/ + +#include /* for memset declaration */ + +// RBTREE_CMP should be a comparator, i.e. RBTREE_CMP(a, b) should evaluate to +// -1, 0, or 1 depending on if a < b, a == b, or a > b, respectively. +#ifdef RBTREE_CMP + +#if defined(RBTREE_GET_LEFTMOST) || defined(RBTREE_GET_RIGHTMOST) +#error Cannot define both RBTREE_CMP and RBTREE_GET_(LEFT|RIGHT)MOST +#endif + +#elif !defined ( RBTREE_GET_LEFTMOST ) || \ + ! defined ( RBTREE_GET_RIGHTMOST ) || \ + ! defined ( RBTREE_MAP_COUNT ) || \ + ! defined ( RBTREE_ASSERT ) +#error "You must define RBTREE_GET_LEFTMOST and RBTREE_GET_RIGHTMOST and \ + RBTREE_MAP_COUNT and RBTREE_ASSERT before including rbtree.c" + +#endif /* RBTREE_CMP */ + +#define IN /* nothing */ + +/****************************************************************************** +******************************************************************************* +************** ************ +************** IMPLEMENTATION OF QUICK MAP ************ +************** ************ +******************************************************************************* +******************************************************************************/ + +/* Forward declarations: */ +static void ips_cl_qmap_init( + IN cl_qmap_t *p_map, + IN cl_map_item_t* const root, + IN cl_map_item_t* const nil); +static void ips_cl_qmap_insert_item( + IN cl_qmap_t* const p_map, + IN cl_map_item_t* const p_item); +static void ips_cl_qmap_remove_item( + IN cl_qmap_t* const p_map, + IN cl_map_item_t* const p_item); +static cl_map_item_t* ips_cl_qmap_successor( + IN cl_qmap_t* const p_map, + IN const cl_map_item_t* p_item); + + +#ifndef RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR +static cl_map_item_t* ips_cl_qmap_predecessor( + IN cl_qmap_t* const p_map, + IN const cl_map_item_t* p_item); +#endif + +#if defined(RBTREE_GET_LEFTMOST) +static cl_map_item_t* ips_cl_qmap_search( + IN cl_qmap_t* const p_map, + IN unsigned long start, + IN unsigned long end); +#else +static cl_map_item_t* ips_cl_qmap_searchv( + cl_qmap_t* const p_map, + const RBTREE_MI_PL *key); +#endif + +/* + * Get the root. + */ +static inline cl_map_item_t* +__cl_map_root( + IN const cl_qmap_t* const p_map ) +{ + RBTREE_ASSERT( p_map ); + return( p_map->root->p_left ); +} + + +/* + * Returns whether a given item is on the left of its parent. + */ +static int +__cl_map_is_left_child( + IN const cl_map_item_t* const p_item ) +{ + RBTREE_ASSERT( p_item ); + RBTREE_ASSERT( p_item->p_up ); + RBTREE_ASSERT( p_item->p_up != p_item ); + + return( p_item->p_up->p_left == p_item ); +} + + +/* + * Retrieve the pointer to the parent's pointer to an item. + */ +static cl_map_item_t** +__cl_map_get_parent_ptr_to_item( + IN cl_map_item_t* const p_item ) +{ + RBTREE_ASSERT( p_item ); + RBTREE_ASSERT( p_item->p_up ); + RBTREE_ASSERT( p_item->p_up != p_item ); + + if( __cl_map_is_left_child( p_item ) ) + return( &p_item->p_up->p_left ); + + RBTREE_ASSERT( p_item->p_up->p_right == p_item ); + return( &p_item->p_up->p_right ); +} + + +/* + * Rotate a node to the left. 
This rotation affects the least number of links + * between nodes and brings the level of C up by one while increasing the depth + * of A one. Note that the links to/from W, X, Y, and Z are not affected. + * + * R R + * | | + * A C + * / \ / \ + * W C A Z + * / \ / \ + * B Z W B + * / \ / \ + * X Y X Y + */ +static void +__cl_map_rot_left( + IN cl_qmap_t* const p_map, + IN cl_map_item_t* const p_item ) +{ + cl_map_item_t **pp_root; + + RBTREE_ASSERT( p_map ); + RBTREE_ASSERT( p_item ); + RBTREE_ASSERT( p_item->p_right != p_map->nil_item ); + + pp_root = __cl_map_get_parent_ptr_to_item( p_item ); + + /* Point R to C instead of A. */ + *pp_root = p_item->p_right; + /* Set C's parent to R. */ + (*pp_root)->p_up = p_item->p_up; + + /* Set A's right to B */ + p_item->p_right = (*pp_root)->p_left; + /* + * Set B's parent to A. We trap for B being NIL since the + * caller may depend on NIL not changing. + */ + if( (*pp_root)->p_left != p_map->nil_item ) + (*pp_root)->p_left->p_up = p_item; + + /* Set C's left to A. */ + (*pp_root)->p_left = p_item; + /* Set A's parent to C. */ + p_item->p_up = *pp_root; +} + + +/* + * Rotate a node to the right. This rotation affects the least number of links + * between nodes and brings the level of A up by one while increasing the depth + * of C one. Note that the links to/from W, X, Y, and Z are not affected. + * + * R R + * | | + * C A + * / \ / \ + * A Z W C + * / \ / \ + * W B B Z + * / \ / \ + * X Y X Y + */ +static void +__cl_map_rot_right( + IN cl_qmap_t* const p_map, + IN cl_map_item_t* const p_item ) +{ + cl_map_item_t **pp_root; + + RBTREE_ASSERT( p_map ); + RBTREE_ASSERT( p_item ); + RBTREE_ASSERT( p_item->p_left != p_map->nil_item ); + + /* Point R to A instead of C. */ + pp_root = __cl_map_get_parent_ptr_to_item( p_item ); + (*pp_root) = p_item->p_left; + /* Set A's parent to R. */ + (*pp_root)->p_up = p_item->p_up; + + /* Set C's left to B */ + p_item->p_left = (*pp_root)->p_right; + /* + * Set B's parent to C. We trap for B being NIL since the + * caller may depend on NIL not changing. + */ + if( (*pp_root)->p_right != p_map->nil_item ) + (*pp_root)->p_right->p_up = p_item; + + /* Set A's right to C. */ + (*pp_root)->p_right = p_item; + /* Set C's parent to A. */ + p_item->p_up = *pp_root; +} + +/* + * Balance a tree starting at a given item back to the root. 
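+ * This is the standard red-black insert fixup: while the parent is red,
+ * recolor when the uncle is also red, otherwise rotate about the
+ * grandparent; the caller then forces the root back to black.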
+ */ +static void +__cl_map_ins_bal( + IN cl_qmap_t* const p_map, + IN cl_map_item_t* p_item ) +{ + cl_map_item_t* p_grand_uncle; + + RBTREE_ASSERT( p_map ); + RBTREE_ASSERT( p_item ); + RBTREE_ASSERT( p_item != p_map->root ); + + while( p_item->p_up->color == CL_MAP_RED ) + { + if( __cl_map_is_left_child( p_item->p_up ) ) + { + p_grand_uncle = p_item->p_up->p_up->p_right; + RBTREE_ASSERT( p_grand_uncle ); + if( p_grand_uncle->color == CL_MAP_RED ) + { + p_grand_uncle->color = CL_MAP_BLACK; + p_item->p_up->color = CL_MAP_BLACK; + p_item->p_up->p_up->color = CL_MAP_RED; + p_item = p_item->p_up->p_up; + continue; + } + + if( !__cl_map_is_left_child( p_item ) ) + { + p_item = p_item->p_up; + __cl_map_rot_left( p_map, p_item ); + } + p_item->p_up->color = CL_MAP_BLACK; + p_item->p_up->p_up->color = CL_MAP_RED; + __cl_map_rot_right( p_map, p_item->p_up->p_up ); + } + else + { + p_grand_uncle = p_item->p_up->p_up->p_left; + RBTREE_ASSERT( p_grand_uncle ); + if( p_grand_uncle->color == CL_MAP_RED ) + { + p_grand_uncle->color = CL_MAP_BLACK; + p_item->p_up->color = CL_MAP_BLACK; + p_item->p_up->p_up->color = CL_MAP_RED; + p_item = p_item->p_up->p_up; + continue; + } + + if( __cl_map_is_left_child( p_item ) ) + { + p_item = p_item->p_up; + __cl_map_rot_right( p_map, p_item ); + } + p_item->p_up->color = CL_MAP_BLACK; + p_item->p_up->p_up->color = CL_MAP_RED; + __cl_map_rot_left( p_map, p_item->p_up->p_up ); + } + } +} + +static void ips_cl_qmap_init( + IN cl_qmap_t *p_map, + IN cl_map_item_t* const root, + IN cl_map_item_t* const nil_item) +{ + RBTREE_ASSERT( p_map ); + RBTREE_ASSERT( root ); + RBTREE_ASSERT( nil_item ); + + memset(p_map,0,sizeof(cl_qmap_t)); + + p_map->root = root; + + /* setup the RB tree map */ + p_map->nil_item = nil_item; + + p_map->root->p_up = p_map->root; + p_map->root->p_left = p_map->nil_item; + p_map->root->p_right = p_map->nil_item; + p_map->root->color = CL_MAP_BLACK; + + p_map->nil_item->p_up = p_map->nil_item; + p_map->nil_item->p_left = p_map->nil_item; + p_map->nil_item->p_right = p_map->nil_item; + p_map->nil_item->color = CL_MAP_BLACK; +} + +static void +ips_cl_qmap_insert_item( + IN cl_qmap_t* const p_map, + IN cl_map_item_t* const p_item ) +{ + cl_map_item_t *p_insert_at, *p_comp_item; + int compare_res = 0; + + RBTREE_ASSERT( p_map ); + RBTREE_ASSERT( p_item ); + RBTREE_ASSERT( p_map->root->p_up == p_map->root ); + RBTREE_ASSERT( p_map->root->color != CL_MAP_RED ); + RBTREE_ASSERT( p_map->nil_item->color != CL_MAP_RED ); + + /* Find the insertion location. */ + p_insert_at = p_map->root; + p_comp_item = __cl_map_root( p_map ); + + while( p_comp_item != p_map->nil_item ) + { + p_insert_at = p_comp_item; + + /* Traverse the tree until the correct insertion point is found. */ +#ifdef RBTREE_GET_LEFTMOST + if( RBTREE_GET_LEFTMOST(&p_item->payload) < RBTREE_GET_LEFTMOST(&p_insert_at->payload) ) +#else + if(RBTREE_CMP(&p_item->payload, &p_insert_at->payload) < 0) +#endif + { + p_comp_item = p_insert_at->p_left; + compare_res = 1; + } else { + p_comp_item = p_insert_at->p_right; + compare_res = -1; + } + } + + RBTREE_ASSERT( p_insert_at != p_map->nil_item ); + RBTREE_ASSERT( p_comp_item == p_map->nil_item ); + + /* Insert the item. 
*/ + p_item->p_left = p_map->nil_item; + p_item->p_right = p_map->nil_item; + p_item->color = CL_MAP_RED; + if( p_insert_at == p_map->root ) + { + p_insert_at->p_left = p_item; + } + else if( compare_res > 0 ) /* key < p_insert_at->key */ + { + p_insert_at->p_left = p_item; + } + else + { + p_insert_at->p_right = p_item; + } + /* Increase the count. */ + RBTREE_MAP_COUNT(&p_map->payload)++; + + p_item->p_up = p_insert_at; + + /* + * We have added depth to this section of the tree. + * Rebalance as necessary as we retrace our path through the tree + * and update colors. + */ + __cl_map_ins_bal( p_map, p_item ); + + __cl_map_root( p_map )->color = CL_MAP_BLACK; + + /* + * Note that it is not necessary to re-color the nil node black because all + * red color assignments are made via the p_up pointer, and nil is never + * set as the value of a p_up pointer. + */ +} + +static void +__cl_map_del_bal( + IN cl_qmap_t* const p_map, + IN cl_map_item_t* p_item ) +{ + cl_map_item_t *p_uncle; + + while( (p_item->color != CL_MAP_RED) && (p_item->p_up != p_map->root) ) + { + if( __cl_map_is_left_child( p_item ) ) + { + p_uncle = p_item->p_up->p_right; + + if( p_uncle->color == CL_MAP_RED ) + { + p_uncle->color = CL_MAP_BLACK; + p_item->p_up->color = CL_MAP_RED; + __cl_map_rot_left( p_map, p_item->p_up ); + p_uncle = p_item->p_up->p_right; + } + + if( p_uncle->p_right->color != CL_MAP_RED ) + { + if( p_uncle->p_left->color != CL_MAP_RED ) + { + p_uncle->color = CL_MAP_RED; + p_item = p_item->p_up; + continue; + } + + p_uncle->p_left->color = CL_MAP_BLACK; + p_uncle->color = CL_MAP_RED; + __cl_map_rot_right( p_map, p_uncle ); + p_uncle = p_item->p_up->p_right; + } + p_uncle->color = p_item->p_up->color; + p_item->p_up->color = CL_MAP_BLACK; + p_uncle->p_right->color = CL_MAP_BLACK; + __cl_map_rot_left( p_map, p_item->p_up ); + break; + } + else + { + p_uncle = p_item->p_up->p_left; + + if( p_uncle->color == CL_MAP_RED ) + { + p_uncle->color = CL_MAP_BLACK; + p_item->p_up->color = CL_MAP_RED; + __cl_map_rot_right( p_map, p_item->p_up ); + p_uncle = p_item->p_up->p_left; + } + + if( p_uncle->p_left->color != CL_MAP_RED ) + { + if( p_uncle->p_right->color != CL_MAP_RED ) + { + p_uncle->color = CL_MAP_RED; + p_item = p_item->p_up; + continue; + } + + p_uncle->p_right->color = CL_MAP_BLACK; + p_uncle->color = CL_MAP_RED; + __cl_map_rot_left( p_map, p_uncle ); + p_uncle = p_item->p_up->p_left; + } + p_uncle->color = p_item->p_up->color; + p_item->p_up->color = CL_MAP_BLACK; + p_uncle->p_left->color = CL_MAP_BLACK; + __cl_map_rot_right( p_map, p_item->p_up ); + break; + } + } + p_item->color = CL_MAP_BLACK; +} + +static void +ips_cl_qmap_remove_item( + IN cl_qmap_t* const p_map, + IN cl_map_item_t* const p_item ) +{ + cl_map_item_t *p_child, *p_del_item; + + RBTREE_ASSERT( p_map ); + RBTREE_ASSERT( p_item ); + + if( p_item == p_map->nil_item ) + return; + + if( (p_item->p_right == p_map->nil_item) || (p_item->p_left == p_map->nil_item ) ) + { + /* The item being removed has children on at most on side. */ + p_del_item = p_item; + } + else + { + /* + * The item being removed has children on both side. + * We select the item that will replace it. After removing + * the substitute item and rebalancing, the tree will have the + * correct topology. Exchanging the substitute for the item + * will finalize the removal. 
+ */ + p_del_item = ips_cl_qmap_successor(p_map, p_item); + RBTREE_ASSERT( p_del_item != p_map->nil_item ); + } + + RBTREE_MAP_COUNT(&p_map->payload)--; + + /* Get the pointer to the new root's child, if any. */ + if( p_del_item->p_left != p_map->nil_item ) + p_child = p_del_item->p_left; + else + p_child = p_del_item->p_right; + + /* + * This assignment may modify the parent pointer of the nil node. + * This is inconsequential. + */ + p_child->p_up = p_del_item->p_up; + (*__cl_map_get_parent_ptr_to_item( p_del_item )) = p_child; + + if( p_del_item->color != CL_MAP_RED ) + __cl_map_del_bal( p_map, p_child ); + + /* + * Note that the splicing done below does not need to occur before + * the tree is balanced, since the actual topology changes are made by the + * preceding code. The topology is preserved by the color assignment made + * below (reader should be reminded that p_del_item == p_item in some cases). + */ + if( p_del_item != p_item ) + { + /* + * Finalize the removal of the specified item by exchanging it with + * the substitute which we removed above. + */ + p_del_item->p_up = p_item->p_up; + p_del_item->p_left = p_item->p_left; + p_del_item->p_right = p_item->p_right; + (*__cl_map_get_parent_ptr_to_item( p_item )) = p_del_item; + p_item->p_right->p_up = p_del_item; + p_item->p_left->p_up = p_del_item; + p_del_item->color = p_item->color; + } + + RBTREE_ASSERT( p_map->nil_item->color != CL_MAP_RED ); +} + +static cl_map_item_t * +ips_cl_qmap_successor( + IN cl_qmap_t* const p_map, + IN const cl_map_item_t* p_item ) +{ + cl_map_item_t *p_tmp; + + p_tmp = p_item->p_right; + if (p_tmp != p_map->nil_item) { + while (p_tmp->p_left != p_map->nil_item) + p_tmp = p_tmp->p_left; + return p_tmp; + } else { + p_tmp = p_item->p_up; + while (p_tmp->p_right == p_item) { + p_item = p_tmp; + p_tmp = p_tmp->p_up; + } + if (p_tmp == p_map->root) + return p_map->nil_item; + return p_tmp; + } +} + +// When includer defines RBTREE_CMP, ips_cl_qmap_search() is not emitted. +// When this happens, ips_cl_qmap_predecessor() may not be called. +// Combined with -Werror -Wunused-function, libpsm2 fails to build. +// So provide macro to control emitting this function +#ifndef RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR +static cl_map_item_t * +ips_cl_qmap_predecessor( + IN cl_qmap_t* const p_map, + IN const cl_map_item_t* p_item ) +{ + cl_map_item_t *p_tmp; + + p_tmp = p_item->p_left; + if (p_tmp != p_map->nil_item) { + while (p_tmp->p_right != p_map->nil_item) + p_tmp = p_tmp->p_right; + return p_tmp; + } else { + p_tmp = p_item->p_up; + while (p_tmp->p_left == p_item) { + p_item = p_tmp; + p_tmp = p_tmp->p_up; + } + if (p_tmp == p_map->root) + return p_map->nil_item; + return p_tmp; + } +} +#endif /* RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR */ + +#if defined(RBTREE_GET_LEFTMOST) +/* + * return the first node with buffer overlapping or zero. + */ +static cl_map_item_t * +ips_cl_qmap_search(cl_qmap_t * const p_map, + unsigned long start, unsigned long end) +{ + cl_map_item_t *p_item, *p_tmp; + + RBTREE_ASSERT( p_map ); + p_item = __cl_map_root(p_map); + + while (p_item != p_map->nil_item) { + if (start > RBTREE_GET_LEFTMOST(&p_item->payload)) { + p_tmp = p_item->p_right; + if (p_tmp != p_map->nil_item) { + p_item = p_tmp; + continue; + } + + /* + * p_item is on immediate left side of 'start'. + */ + if (start >= RBTREE_GET_RIGHTMOST(&p_item->payload)) { + /* + * p_item is on immediate right + * side of 'start'. 
+ */ + p_item = ips_cl_qmap_successor(p_map, p_item); + if (p_item != p_map->nil_item && + end <= RBTREE_GET_LEFTMOST(&p_item->payload)) + p_item = p_map->nil_item; + } + } else if (start < RBTREE_GET_LEFTMOST(&p_item->payload)) { + p_tmp = p_item->p_left; + if (p_tmp != p_map->nil_item) { + p_item = p_tmp; + continue; + } + + /* + * p_tmp is on immediate left side of 'start'. + */ + p_tmp = ips_cl_qmap_predecessor(p_map, p_item); + if (p_tmp == p_map->nil_item || + (start >= RBTREE_GET_RIGHTMOST(&p_tmp->payload))) { + /* + * p_item is on immediate right + * side of 'start'. + */ + if (end <= RBTREE_GET_LEFTMOST(&p_item->payload)) + p_item = p_map->nil_item; + } else + p_item = p_tmp; + } + + break; + } + + + return p_item; +} +#else /* defined(...LEFTMOST) || defined(...RIGHTMOST) */ +static cl_map_item_t * +ips_cl_qmap_searchv(cl_qmap_t * const p_map, const RBTREE_MI_PL *key) +{ + RBTREE_ASSERT( p_map ); + cl_map_item_t *p_item = __cl_map_root(p_map); + + while (p_item != p_map->nil_item) { + if (RBTREE_CMP(key, &p_item->payload) > 0) { + p_item = p_item->p_right; + } else if (RBTREE_CMP(key, &p_item->payload) < 0) { + p_item = p_item->p_left; + } else { + break; + } + } + + return p_item; +} +#endif /* defined(...LEFTMOST) || defined(...RIGHTMOST) */ diff --git a/prov/psm3/psm3/include/rbtree.h b/prov/psm3/psm3/include/rbtree.h new file mode 100644 index 00000000000..13245b0d456 --- /dev/null +++ b/prov/psm3/psm3/include/rbtree.h @@ -0,0 +1,90 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef __RBTREE_H__ + +#define __RBTREE_H__ + +#include + +#ifndef RBTREE_MAP_PL +#error "You must define RBTREE_MAP_PL before including rbtree.h" +#endif + +#ifndef RBTREE_MI_PL +#error "You must define RBTREE_MI_PL before including rbtree.h" +#endif + +/* + * Red-Black tid cache definition. + */ +typedef struct _cl_map_item { + struct _cl_map_item *p_left; /* left pointer */ + struct _cl_map_item *p_right; /* right pointer */ + struct _cl_map_item *p_up; /* up pointer */ + uint16_t color; /* red-black color */ + + RBTREE_MI_PL payload; +} cl_map_item_t; + +typedef struct _cl_qmap { + cl_map_item_t *root; /* root node pointer */ + cl_map_item_t *nil_item; /* terminator node pointer */ + + RBTREE_MAP_PL payload; +} cl_qmap_t; + +#define CL_MAP_RED 0 +#define CL_MAP_BLACK 1 + +#endif diff --git a/prov/psm3/psm3/libuuid/pack.c b/prov/psm3/psm3/libuuid/pack.c new file mode 100644 index 00000000000..801b89177c9 --- /dev/null +++ b/prov/psm3/psm3/libuuid/pack.c @@ -0,0 +1,69 @@ +/* + * Internal routine for packing UUID's + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ */ + +#include +#include +#include "psm_user.h" +#include "psm_uuid.h" + +void uuid_pack(const struct uuid *uu, uuid_t ptr) +{ + uint32_t tmp; + unsigned char *out = ptr; + + tmp = uu->time_low; + out[3] = (unsigned char) tmp; + tmp >>= 8; + out[2] = (unsigned char) tmp; + tmp >>= 8; + out[1] = (unsigned char) tmp; + tmp >>= 8; + out[0] = (unsigned char) tmp; + + tmp = uu->time_mid; + out[5] = (unsigned char) tmp; + tmp >>= 8; + out[4] = (unsigned char) tmp; + + tmp = uu->time_hi_and_version; + out[7] = (unsigned char) tmp; + tmp >>= 8; + out[6] = (unsigned char) tmp; + + tmp = uu->clock_seq; + out[9] = (unsigned char) tmp; + tmp >>= 8; + out[8] = (unsigned char) tmp; + + memcpy(out+10, uu->node, 6); +} + diff --git a/prov/psm3/psm3/libuuid/parse.c b/prov/psm3/psm3/libuuid/parse.c new file mode 100644 index 00000000000..dd8c2587ba9 --- /dev/null +++ b/prov/psm3/psm3/libuuid/parse.c @@ -0,0 +1,78 @@ +/* + * parse.c --- UUID parsing + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ */ + +#include +#include +#include +#include + +#include "psm_user.h" +#include "psm_uuid.h" + +int uuid_parse(const char *in, uuid_t uu) +{ + struct uuid uuid; + int i; + const char *cp; + char buf[3]; + + if (strlen(in) != 36) + return -1; + for (i=0, cp = in; i <= 36; i++,cp++) { + if ((i == 8) || (i == 13) || (i == 18) || + (i == 23)) { + if (*cp == '-') + continue; + else + return -1; + } + if (i== 36) + if (*cp == 0) + continue; + if (!isxdigit(*cp)) + return -1; + } + uuid.time_low = strtoul(in, NULL, 16); + uuid.time_mid = strtoul(in+9, NULL, 16); + uuid.time_hi_and_version = strtoul(in+14, NULL, 16); + uuid.clock_seq = strtoul(in+19, NULL, 16); + cp = in+24; + buf[2] = 0; + for (i=0; i < 6; i++) { + buf[0] = *cp++; + buf[1] = *cp++; + uuid.node[i] = strtoul(buf, NULL, 16); + } + + uuid_pack(&uuid, uu); + return 0; +} diff --git a/prov/psm3/psm3/libuuid/psm_uuid.c b/prov/psm3/psm3/libuuid/psm_uuid.c new file mode 100644 index 00000000000..4db29a69bff --- /dev/null +++ b/prov/psm3/psm3/libuuid/psm_uuid.c @@ -0,0 +1,114 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ + +#include +#include +#include +#include "psm_user.h" +#include "psm_uuid.h" + +static void psmi_make_drand_uuid(psm2_uuid_t uuid_out) +{ + struct drand48_data drand48_data; + int i; + long int rnum; + srand48_r((get_cycles() + getpid()) % LONG_MAX, &drand48_data); + for(i=0; i < 16; i++) { + lrand48_r(&drand48_data, &rnum); + uuid_out[i] = rnum % UCHAR_MAX; + } +} + +/* Since libuuid can call srand, we will generate our own uuids */ +void +__psm2_uuid_generate(psm2_uuid_t uuid_out) +{ + PSM2_LOG_MSG("entering"); + /* Prefer using urandom, fallback to drand48_r */ + struct stat urandom_stat; + size_t nbytes; + int fd; + if(stat("/dev/urandom", &urandom_stat) != 0) { + psmi_make_drand_uuid(uuid_out); + return; + } + + fd = open("/dev/urandom", O_RDONLY); + if(fd == -1) { + psmi_make_drand_uuid(uuid_out); + } else { + nbytes = read(fd, (char *) uuid_out, 16); + if(nbytes != 16) { + psmi_make_drand_uuid(uuid_out); + } + close(fd); + } + PSM2_LOG_MSG("leaving"); + return; +} +PSMI_API_DECL(psm2_uuid_generate) + +void +psmi_uuid_unparse(const uuid_t uu, char *out) +{ + uuid_unparse_lower(uu, out); +} + +int +psmi_uuid_parse(const char *in, uuid_t uu) +{ + return uuid_parse(in, uu); +} + diff --git a/prov/psm3/psm3/libuuid/psm_uuid.h b/prov/psm3/psm3/libuuid/psm_uuid.h new file mode 100644 index 00000000000..09df044d9ca --- /dev/null +++ b/prov/psm3/psm3/libuuid/psm_uuid.h @@ -0,0 +1,78 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _PSM_UUID_H +#define _PSM_UUID_H +struct uuid { + uint32_t time_low; + uint16_t time_mid; + uint16_t time_hi_and_version; + uint16_t clock_seq; + uint8_t node[6]; +}; + +typedef unsigned char uuid_t[16]; + +int psmi_uuid_parse(const char *in, psm2_uuid_t uu); +void psmi_uuid_unparse(const psm2_uuid_t uuid, char *out); +int psmi_uuid_compare(const psm2_uuid_t uuA, const psm2_uuid_t uuB); +int uuid_compare(const uuid_t uu1, const uuid_t uu2); +void uuid_pack(const struct uuid *uu, uuid_t ptr); +void uuid_unparse(const uuid_t uu, char *out); +void uuid_unparse_upper(const uuid_t uu, char *out); +void uuid_unparse_lower(const uuid_t uu, char *out); +void uuid_unpack(const uuid_t in, struct uuid *uu); +int uuid_parse(const char *in, uuid_t uu); +#endif diff --git a/prov/psm3/psm3/libuuid/unpack.c b/prov/psm3/psm3/libuuid/unpack.c new file mode 100644 index 00000000000..26e4394c80c --- /dev/null +++ b/prov/psm3/psm3/libuuid/unpack.c @@ -0,0 +1,63 @@ +/* + * Internal routine for unpacking UUID + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ */ + +#include +#include +#include "psm_user.h" +#include "psm_uuid.h" + +void uuid_unpack(const uuid_t in, struct uuid *uu) +{ + const uint8_t *ptr = in; + uint32_t tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + tmp = (tmp << 8) | *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_low = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_mid = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_hi_and_version = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->clock_seq = tmp; + + memcpy(uu->node, ptr, 6); +} + diff --git a/prov/psm3/psm3/libuuid/unparse.c b/prov/psm3/psm3/libuuid/unparse.c new file mode 100644 index 00000000000..d8593797ad5 --- /dev/null +++ b/prov/psm3/psm3/libuuid/unparse.c @@ -0,0 +1,75 @@ +/* + * unparse.c -- convert a UUID to string + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
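A small illustration of the byte order the unpack routine above assumes: the packed form is big-endian, so the value recovered into time_low does not depend on host endianness. uuid_unpack_example is illustrative only.

static void uuid_unpack_example(void)
{
	/* only the first four bytes matter here; the rest default to zero */
	uuid_t packed = { 0x01, 0x02, 0x03, 0x04 };
	struct uuid uu;

	uuid_unpack(packed, &uu);
	/* uu.time_low is 0x01020304 on both little- and big-endian hosts */
}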
+ */ + +#include + +#include "psm_user.h" +#include "psm_uuid.h" + +static const char *fmt_lower = + "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x"; + +static const char *fmt_upper = + "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X"; + +#ifdef UUID_UNPARSE_DEFAULT_UPPER +#define FMT_DEFAULT fmt_upper +#else +#define FMT_DEFAULT fmt_lower +#endif + +static void uuid_unparse_x(const uuid_t uu, char *out, const char *fmt) +{ + struct uuid uuid; + + uuid_unpack(uu, &uuid); + sprintf(out, fmt, + uuid.time_low, uuid.time_mid, uuid.time_hi_and_version, + uuid.clock_seq >> 8, uuid.clock_seq & 0xFF, + uuid.node[0], uuid.node[1], uuid.node[2], + uuid.node[3], uuid.node[4], uuid.node[5]); +} + +void uuid_unparse_lower(const uuid_t uu, char *out) +{ + uuid_unparse_x(uu, out, fmt_lower); +} + +void uuid_unparse_upper(const uuid_t uu, char *out) +{ + uuid_unparse_x(uu, out, fmt_upper); +} + +void uuid_unparse(const uuid_t uu, char *out) +{ + uuid_unparse_x(uu, out, FMT_DEFAULT); +} diff --git a/prov/psm3/psm3/mpspawn/mpspawn_stats.h b/prov/psm3/psm3/mpspawn/mpspawn_stats.h new file mode 100644 index 00000000000..36be6f20f5a --- /dev/null +++ b/prov/psm3/psm3/mpspawn/mpspawn_stats.h @@ -0,0 +1,135 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ + +#ifndef _MPSPAWN_STATS_H +#define _MPSPAWN_STATS_H + +#include + +#define MPSPAWN_STATS_VERSION 1 + +typedef enum { + MPSPAWN_STATS_TYPE_DOUBLE = 0x1, +#define MPSPAWN_STATS_TYPE_DOUBLE 0x1 + MPSPAWN_STATS_TYPE_HEADER = 0x2, +#define MPSPAWN_STATS_TYPE_HEADER 0x2 + MPSPAWN_STATS_REDUCTION_MAX = 0x1000, +#define MPSPAWN_STATS_REDUCTION_MAX 0x1000 + MPSPAWN_STATS_REDUCTION_MIN = 0x2000, +#define MPSPAWN_STATS_REDUCTION_MIN 0x2000 + MPSPAWN_STATS_REDUCTION_MEDIAN = 0x4000, +#define MPSPAWN_STATS_REDUCTION_MEDIAN 0x4000 + MPSPAWN_STATS_SKIP_IF_ZERO = 0x8000 +#define MPSPAWN_STATS_SKIP_IF_ZERO 0x8000 +} mpspawn_stats_flags; + +#define MPSPAWN_STATS_REDUCTION_ALL (MPSPAWN_STATS_REDUCTION_MAX | \ + MPSPAWN_STATS_REDUCTION_MIN | MPSPAWN_STATS_REDUCTION_MEDIAN) + +#define MPSPAWN_STATS_DOUBLE_TO_U64(arg) (*((uint64_t *) &(arg))) +#define MPSPAWN_NAN_U64 ((uint64_t) ~0ULL) +#define MPSPAWN_ISNAN_U64(x) (((uint64_t)(x)) == MPSPAWN_NAN_U64) + +#define MPSPAWN_NAN ((uint64_t) ~0ULL) /* NAN) */ +#define MPSPAWN_ISNAN(x) (isnan(x)) + +#if 0 // unused code, specific to QLogic MPI + +struct mpspawn_stats_add_args; /* client->mpspawn stats registration */ +struct mpspawn_stats_req_args; /* mpspawn->client fn callback stats request */ +struct mpspawn_stats_init_args; /* mpspawn->client "downcall" to register */ + +/* Clients implement this function to fill in mpspawn request for stats */ +typedef void (*mpspawn_stats_req_fn) (struct mpspawn_stats_req_args *); +/* mpspawn implements this function to allow clients to register new stats */ +typedef void (*mpspawn_stats_add_fn) (struct mpspawn_stats_add_args *); +/* mpspawn implements this function to map rank indexes into epaddr structs */ +struct psm2_epaddr; +typedef struct psm2_epaddr *(*mpspawn_map_epaddr_fn) (int rank); + +typedef struct mpspawn_stats_req_args { + int version; + int num; + uint64_t *stats; + uint16_t *flags; + void *context; +} mpspawn_stats_req_args_t; + +typedef +struct mpspawn_stats_add_args { + int version; + int num; + char *header; + char **desc; + uint16_t *flags; + mpspawn_stats_req_fn req_fn; + void *context; +} mpspawn_stats_add_args_t; + +typedef +struct mpspawn_stats_init_args { + int version; + psm2_mq_t mq; /* initialized mq endpoint */ + int num_epaddr; /* number of endpoints in job */ + mpspawn_stats_add_fn add_fn; /* function for client to add stats */ + mpspawn_map_epaddr_fn epaddr_map_fn; + const char *stats_types; /* stats type string mpirun -M */ +} mpspawn_stats_init_args_t; + +/* Function in psm exposed to register stats */ +void *psmi_stats_register(struct mpspawn_stats_init_args *args); + +#endif +#endif diff --git a/prov/psm3/psm3/opa/opa_debug.c b/prov/psm3/psm3/opa/opa_debug.c new file mode 100644 index 00000000000..3ac88e2cb6e --- /dev/null +++ b/prov/psm3/psm3/opa/opa_debug.c @@ -0,0 +1,455 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "opa_user.h" +#include "../psm_log.h" + +unsigned hfi_debug = __HFI_INFO; +char *__hfi_mylabel = NULL; +int __hfi_myrank = -1; +int __hfi_myrank_count = -1; +int __hfi_mylocalrank = -1; +int __hfi_mylocalrank_count = -1; +FILE *__hfi_dbgout; +static void init_hfi_mylabel(void) __attribute__ ((constructor)); +static void init_hfi_backtrace(void) __attribute__ ((constructor)); +static void init_hfi_dbgfile(void) __attribute__ ((constructor)); +static void fini_hfi_backtrace(void) __attribute__ ((destructor)); +static void fini_hfi_mylabel(void) __attribute__ ((destructor)); +static struct sigaction SIGSEGV_old_act; +static struct sigaction SIGBUS_old_act; +static struct sigaction SIGILL_old_act; +static struct sigaction SIGABRT_old_act; +static struct sigaction SIGINT_old_act; +static struct sigaction SIGTERM_old_act; +#ifdef PSM3_BRAKE_DEBUG +static void hfi_brake_debug(void) __attribute__ ((constructor)); + +/* + How to use hfi_break_debug code: + + 1. Build psm with PSM3_BRAKE_DEBUG set in the environment. + 2. Create a script for your test case (e.g. mpistress?). In the script + make sure to choose a PSM3 brake file that corresponds to a network + file system that is common to all hosts where you will run your code. + Also, in the script, make sure to propagate the "PSM3_BRAKE_FILE_NAME" + env var to all hosts. + 3. Bring up 3 putty sessions to one of the hosts that your script uses. + 4. In putty session number 1, touch the PSM3_BRAKE_FILE and sync. + 5. In putty session number 1, start the script. 
You should see messages + of the form: +-bash-4.2$ ./mpistress.0304.sc +:5716 remove the file: "/nfs/user/PSM3_BRAKE" to continue +:5717 remove the file: "/nfs/user/PSM3_BRAKE" to continue +:3456 remove the file: "/nfs/user/PSM3_BRAKE" to continue +:3457 remove the file: "/nfs/user/PSM3_BRAKE" to continue + + Note that the hostname and process id are shown for all of the processes that are started + by your script. + 6. In putty session 2, bring up gdb, and debug the program that is referenced in your script. + For example: /usr/mpi/gcc/openmpi-1.10.2-ofi/tests/intel/mpi_stress + 7. In putty session 2 / gdb, attach to one of the processes that is shown in putty session 1. + 8. Note, at this point, you have only one gdb session. I leave it as an exercise to the reader to + determine how to bring up multiple gdb sessions. + 9. In putty session 3, rm the PSM3_BRAKE_FILE. + 10. You are now debugging a live session of psm. + */ + +static void hfi_brake_debug(void) +{ + struct stat buff; + char hostname[80]; + const char *hfi_brake_file_name = getenv("PSM3_BRAKE_FILE_NAME"); + gethostname(hostname, 80); + hostname[sizeof(hostname) - 1] = '\0'; + + if (!hfi_brake_file_name) + hfi_brake_file_name = "/tmp/PSM3_BRAKE_FILE"; + printf("%s:%d remove the file: \"%s\" to continue\n",hostname,getpid(),hfi_brake_file_name); + while (0 == stat(hfi_brake_file_name, &buff)) + { + printf("%s:pid%d remove the file: \"%s\" to continue\n",hostname,getpid(),hfi_brake_file_name); + sleep(10); + } + printf("%s:pid%d continuing.\n",hostname,getpid()); +} +#endif + +static void init_hfi_mylabel(void) +{ + char lbl[1024]; + char hostname[80]; + char *e; + /* By default, try to come up with a decent default label, it will be + * overridden later. Try getting rank, if that's not available revert to + * pid. 
*/ + gethostname(hostname, 80); + lbl[0] = '\0'; + hostname[sizeof(hostname) - 1] = '\0'; + +#if 0 + /* DEBUG: Used to selectively test possible NIC selection, + * shared context and shm-only settings */ + unsetenv("PSC_MPI_NODE_RANK"); + unsetenv("PSC_MPI_PPN"); + unsetenv("MPI_LOCALRANKID"); + unsetenv("MPI_LOCALRANKS"); +#endif + + if ((((e = getenv("PMI_SIZE")) && *e)) // MPICH & IMPI + || (((e = getenv("OMPI_COMM_WORLD_SIZE")) && *e)) // OMPI + || (((e = getenv("MPI_NRANKS")) && *e)) // Platform MPI + || (((e = getenv("MPIRUN_NPROCS")) && *e)) // older MPICH + // N/A || (((e = getenv("PSC_MPI_TBD")) && *e)) // pathscale MPI + || (((e = getenv("SLURM_NTASKS")) && *e)) // SLURM + || (((e = getenv("SLURM_NPROCS")) && *e)) // older SLURM + ) { + char *ep; + unsigned long val; + val = strtoul(e, &ep, 10); + if (ep != e) /* valid conversion */ + __hfi_myrank_count = val; + } + + if ((((e = getenv("MPI_LOCALRANKID")) && *e)) // MPICH and IMPI + || (((e = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) && *e)) // OMPI + || (((e = getenv("MPI_LOCALRANKID")) && *e)) // Platform MPI + // N/A | (((e = getenv("MPIRUN_TBD")) && *e)) // older MPICH + || (((e = getenv("PSC_MPI_NODE_RANK")) && *e)) // pathscale MPI + || (((e = getenv("SLURM_LOCALID")) && *e)) // SLURM + ) { + char *ep; + unsigned long val; + val = strtoul(e, &ep, 10); + if (ep != e) /* valid conversion */ + __hfi_mylocalrank = val; + } + + if ((((e = getenv("MPI_LOCALNRANKS")) && *e)) // MPICH and IMPI + || (((e = getenv("OMPI_COMM_WORLD_LOCAL_SIZE")) && *e)) // OMPI + || (((e = getenv("MPI_LOCALNRANKS")) && *e)) // Platform MPI + // N/A || (((e = getenv("MPIRUN_TBD")) && *e)) // older MPICH + || (((e = getenv("PSC_MPI_PPN")) && *e)) // pathscale MPI + || (((e = getenv("SLURM_NTASKS_PER_NODE")) && *e)) // SLURM + ) { + char *ep; + unsigned long val; + val = strtoul(e, &ep, 10); + if (ep != e) /* valid conversion */ + __hfi_mylocalrank_count = val; + } + + if ((((e = getenv("PMI_RANK")) && *e)) // MPICH and *_SIZE + || (((e = getenv("OMPI_COMM_WORLD_RANK")) && *e)) // OMPI and *_SIZE + || (((e = getenv("MPI_RANKID")) && *e)) // Platform MPI and *_NRANKS + || (((e = getenv("MPIRUN_RANK")) && *e)) // older MPICH and *_NPROCS + || (((e = getenv("PSC_MPI_RANK")) && *e)) // pathscale MPI + || (((e = getenv("SLURM_TASKID")) && *e)) // SLURM + || (((e = getenv("SLURM_PROCID")) && *e)) // SLURM + ) { + char *ep; + unsigned long val; + val = strtoul(e, &ep, 10); + if (ep != e) { /* valid conversion */ + snprintf(lbl, 1024, "%s:rank%lu", hostname, val); + __hfi_myrank = val; + } + } + if (lbl[0] == '\0') + snprintf(lbl, 1024, "%s:pid%u", hostname, getpid()); + __hfi_mylabel = strdup(lbl); +} + +static void fini_hfi_mylabel(void) +{ + if(__hfi_mylabel != NULL) + free(__hfi_mylabel); +} + +/* FIXME: This signal handler does not conform to the posix standards described + in 'man 7 signal' due to it calling unsafe functions. + + See 'CALLS UNSAFE FUNCTION' notes below for examples. + */ +static void hfi_sighdlr(int sig, siginfo_t *p1, void *ucv) +{ + /* we make these static to try and avoid issues caused + by stack overflow that might have gotten us here. */ + static void *backaddr[128]; /* avoid stack usage */ + static char buf[150], hname[64], fname[128]; + static int i, j, fd, id; + extern char *__progname; + PSM2_LOG_DECLARE_BT_BUFFER(); + + /* CALLS UNSAFE FUNCTION when PSM_LOG is defined. */ + PSM2_LOG_BT(100,__FUNCTION__); + /* If this is a SIGINT do not display backtrace. 
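The label built above is what the debug output prepends to every message; a minimal consumer, assuming the hfi_get_mylabel() accessor defined later in this file (log_with_label itself is hypothetical):

#include <stdio.h>

static void log_with_label(const char *msg)
{
	/* e.g. "node01:rank17: ..." under an MPI or SLURM launcher,
	 * "node01:pid9182: ..." for a standalone process */
	fprintf(stderr, "%s: %s\n", hfi_get_mylabel(), msg);
}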
Just invoke exit + handlers */ + if ((sig == SIGINT) || (sig == SIGTERM)) + /* CALLS UNSAFE FUNCTION (exit) */ + exit(1); + + /* CALLS UNSAFE FUNCTION (snprintf) */ + id = snprintf(buf, sizeof(buf), + "\n%.60s:pid%u terminated with signal %d", __progname, + getpid(), sig); + if (ucv) { + static ucontext_t *uc; + uc = (ucontext_t *) ucv; + id += snprintf(buf + id, sizeof(buf) - id, " at PC=%lx SP=%lx", +#if defined(__x86_64__) + (unsigned long)uc->uc_mcontext.gregs[REG_RIP], + (unsigned long)uc->uc_mcontext.gregs[REG_RSP]); +#elif defined(__i386__) + (unsigned long)uc->uc_mcontext.gregs[REG_EIP], + (unsigned long)uc->uc_mcontext.gregs[REG_ESP]); +#else + 0ul, 0ul); +#warning No stack pointer or instruction pointer for this arch +#endif + } + id += snprintf(buf + id, sizeof(buf) - id, ". Backtrace:\n"); + /* CALLS UNSAFE FUNCTION (fprintf) */ + fprintf(stderr, "%.*s", id, buf); + + i = backtrace(backaddr, sizeof(backaddr) / sizeof(backaddr[0])); + if (i > 2) /* skip ourselves and backtrace */ + j = 2, i -= j; + else + j = 0; + + backtrace_symbols_fd(backaddr + j, i, 2); + (void)fsync(2); + + /* Try to write it to a file as well, in case the rest doesn't make it + out. Do it second, in case we get a second failure (more likely). + We might eventually want to print some more of the registers to the + btr file, to aid debugging, but not for now. Truncate the program + name if overly long, so we always get pid and (at least part of) + hostname. */ + /* CALLS UNSAFE FUNCTION (gethostname) */ + (void)gethostname(hname, sizeof(hname)); + hname[sizeof(hname) - 1] = '\0'; + snprintf(fname, sizeof(fname), "%s.80s-%u,%.32s.btr", __progname, + getpid(), hname); + if ((fd = open(fname, O_CREAT | O_WRONLY, 0644)) >= 0) { + /* CALLS UNSAFE FUNCTION (fdopen) */ + FILE *fp = fdopen(fd, "w"); + if (fp) + fprintf(fp, "%.*s", id, buf); + backtrace_symbols_fd(backaddr + j, i, fd); + if (fp) + /* CALLS UNSAFE FUNCTION (fclose) */ + fclose(fp); + } + switch (sig){ + case SIGSEGV: + (*SIGSEGV_old_act.sa_sigaction)(sig,p1,ucv); + break; + case SIGBUS: + (*SIGBUS_old_act.sa_sigaction)(sig,p1,ucv); + break; + case SIGILL: + (*SIGILL_old_act.sa_sigaction)(sig,p1,ucv); + break; + case SIGABRT: + (*SIGABRT_old_act.sa_sigaction)(sig,p1,ucv); + break; + default: + break; + } + exit(1); /* not _exit(), want atexit handlers to get run */ +} + +/* We do this as a constructor so any user program that sets signal handlers + for these will override our settings, but we still get backtraces if they + don't. +*/ +static void init_hfi_backtrace(void) +{ + /* we need to track memory corruption */ + static struct sigaction act; /* easier than memset */ + act.sa_sigaction = hfi_sighdlr; + act.sa_flags = SA_SIGINFO; + + if (getenv("PSM3_BACKTRACE")) { + /* permanent, although probably + undocumented way to disable backtraces. */ + (void)sigaction(SIGSEGV, &act, &SIGSEGV_old_act); + (void)sigaction(SIGBUS, &act, &SIGBUS_old_act); + (void)sigaction(SIGILL, &act, &SIGILL_old_act); + (void)sigaction(SIGABRT, &act, &SIGABRT_old_act); + (void)sigaction(SIGINT, &act, &SIGINT_old_act); + (void)sigaction(SIGTERM, &act, &SIGTERM_old_act); + } +} + +/* if PSM3_DEBUG_FILENAME is set in the environment, then all the + debug prints (not info and error) will go to that file. + %h is expanded to the hostname, and %p to the pid, if present. 
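A worked example of the filename expansion described above; the hostname, pid, and path are made up.

/*
 * Assuming hostname "node01" and pid 4242,
 *
 *   PSM3_DEBUG_FILENAME=/tmp/psm3-%h-%p.log
 *
 * opens /tmp/psm3-node01-4242.log for append; if the file cannot be
 * opened, debug output falls back to stdout.
 */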
*/ +static void init_hfi_dbgfile(void) +{ + char *fname = getenv("PSM3_DEBUG_FILENAME"); + char *exph, *expp, tbuf[1024]; + FILE *newf; + + if (!fname) { + __hfi_dbgout = stdout; + return; + } + exph = strstr(fname, "%h"); /* hostname */ + expp = strstr(fname, "%p"); /* pid */ + if (exph || expp) { + int baselen; + char hname[256], pid[12]; + if (exph) { + *hname = hname[sizeof(hname) - 1] = 0; + gethostname(hname, sizeof(hname) - 1); + if (!*hname) + strcpy(hname, "[unknown]"); + } + if (expp) + snprintf(pid, sizeof(pid), "%d", getpid()); + if (exph && expp) { + if (exph < expp) { + baselen = exph - fname; + snprintf(tbuf, sizeof(tbuf), "%.*s%s%.*s%s%s", + baselen, fname, hname, + (int)(expp - (exph + 2)), exph + 2, + pid, expp + 2); + } else { + baselen = expp - fname; + snprintf(tbuf, sizeof(tbuf), "%.*s%s%.*s%s%s", + baselen, fname, pid, + (int)(exph - (expp + 2)), expp + 2, + hname, exph + 2); + } + } else if (exph) { + baselen = exph - fname; + snprintf(tbuf, sizeof(tbuf), "%.*s%s%s", + baselen, fname, hname, exph + 2); + } else { + baselen = expp - fname; + snprintf(tbuf, sizeof(tbuf), "%.*s%s%s", + baselen, fname, pid, expp + 2); + } + fname = tbuf; + } + newf = fopen(fname, "a"); + if (!newf) { + _HFI_ERROR + ("Unable to open \"%s\" for debug output, using stdout: %s\n", + fname, strerror(errno)); + __hfi_dbgout = stdout; + } else { + __hfi_dbgout = newf; + setlinebuf(__hfi_dbgout); + } +} + +void hfi_set_mylabel(char *label) +{ + __hfi_mylabel = label; +} + +char *hfi_get_mylabel() +{ + return __hfi_mylabel; +} + +int hfi_get_myrank() +{ + return __hfi_myrank; +} + +int hfi_get_myrank_count() +{ + return __hfi_myrank_count; +} + +int hfi_get_mylocalrank() +{ + return __hfi_mylocalrank; +} + +int hfi_get_mylocalrank_count() +{ + return __hfi_mylocalrank_count; +} + +static void fini_hfi_backtrace(void) +{ + if (getenv("PSM3_BACKTRACE")) { + (void)sigaction(SIGSEGV, &SIGSEGV_old_act, NULL); + (void)sigaction(SIGBUS, &SIGBUS_old_act, NULL); + (void)sigaction(SIGILL, &SIGILL_old_act, NULL); + (void)sigaction(SIGABRT, &SIGABRT_old_act, NULL); + (void)sigaction(SIGINT, &SIGINT_old_act, NULL); + (void)sigaction(SIGTERM, &SIGTERM_old_act, NULL); + } +} diff --git a/prov/psm3/psm3/opa/opa_dwordcpy-x86_64-fast.S b/prov/psm3/psm3/opa/opa_dwordcpy-x86_64-fast.S new file mode 100644 index 00000000000..12fe9a3e200 --- /dev/null +++ b/prov/psm3/psm3/opa/opa_dwordcpy-x86_64-fast.S @@ -0,0 +1,84 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifdef __CET__ +#include +#endif + + .globl hfi_dwordcpy + .file "opa_dwordcpy-x86_64-fast.S" + .text + .p2align 4,,15 + // standard C calling convention, rdi is dest, rsi is source, rdx is count + // does not return any value +hfi_dwordcpy: + .type hfi_dwordcpy, @function +#ifdef _CET_ENDBR + _CET_ENDBR +#endif + movl %edx,%ecx + shrl $1,%ecx + andl $1,%edx + cld + rep + movsq + movl %edx,%ecx + rep + movsd + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/prov/psm3/psm3/opa/opa_dwordcpy-x86_64.c b/prov/psm3/psm3/opa/opa_dwordcpy-x86_64.c new file mode 100644 index 00000000000..a41a40f984f --- /dev/null +++ b/prov/psm3/psm3/opa/opa_dwordcpy-x86_64.c @@ -0,0 +1,315 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
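The assembly routine above follows the SysV AMD64 calling convention (rdi = dest, rsi = src, rdx = dword count) and copies with rep movsq plus one trailing movsd for an odd count. A portable C reference with the same contract, useful for checking the fast path, might look like this (hfi_dwordcpy_ref is hypothetical):

#include <stdint.h>

void hfi_dwordcpy_ref(volatile uint32_t *dest, const uint32_t *src,
		      uint32_t ndwords)
{
	/* copy ndwords 32-bit words; no return value, no overlap handling */
	while (ndwords--)
		*dest++ = *src++;
}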
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include +#include +#include "opa_intf.h" +#include "psm_user.h" + +#if defined(__x86_64__) && defined(HAVE_PSM3_DWORD_FAST) +#define hfi_dwordcpy hfi_dwordcpy_safe +#define hfi_qwordcpy hfi_qwordcpy_safe +#endif + +void hfi_dwordcpy(volatile uint32_t *dest, const uint32_t *src, uint32_t ndwords) +{ + uint_fast32_t ndw = ndwords; + const uint64_t *src64[4]; + volatile uint64_t *dst64[4]; + src64[0] = (const uint64_t *) src; + dst64[0] = (volatile uint64_t *) dest; + + while (ndw >= 8) { + *dst64[0] = *src64[0]; + src64[1] = src64[0] + 1; + src64[2] = src64[0] + 2; + src64[3] = src64[0] + 3; + ndw -= 8; + dst64[1] = dst64[0] + 1; + dst64[2] = dst64[0] + 2; + dst64[3] = dst64[0] + 3; + *dst64[1] = *src64[1]; + *dst64[2] = *src64[2]; + *dst64[3] = *src64[3]; + src64[0] += 4; + dst64[0] += 4; + } + if (ndw) { + src = (const uint32_t *) src64[0]; + dest = (volatile uint32_t *) dst64[0]; + + switch (ndw) { + case 7: *dest++ = *src++; + /* fall through */ + case 6: *dest++ = *src++; + /* fall through */ + case 5: *dest++ = *src++; + /* fall through */ + case 4: *dest++ = *src++; + /* fall through */ + case 3: *dest++ = *src++; + /* fall through */ + case 2: *dest++ = *src++; + /* fall through */ + case 1: *dest++ = *src++; + } + + } +} + +void hfi_qwordcpy(volatile uint64_t *dest, const uint64_t *src, uint32_t nqwords) +{ + uint_fast32_t nqw = nqwords; + const uint64_t *src64[4]; + volatile uint64_t *dst64[4]; + src64[0] = src; + dst64[0] = dest; + + while (nqw >= 8) { + *dst64[0] = *src64[0]; + src64[1] = src64[0] + 1; + src64[2] = src64[0] + 2; + src64[3] = src64[0] + 3; + dst64[1] = dst64[0] + 1; + dst64[2] = dst64[0] + 2; + dst64[3] = dst64[0] + 3; + *dst64[1] = *src64[1]; + *dst64[2] = *src64[2]; + *dst64[3] = *src64[3]; + src64[0] += 4; + dst64[0] += 4; + + *dst64[0] = *src64[0]; + src64[1] = src64[0] + 1; + src64[2] = src64[0] + 2; + src64[3] = src64[0] + 3; + dst64[1] = dst64[0] + 1; + dst64[2] = dst64[0] + 2; + dst64[3] = dst64[0] + 3; + *dst64[1] = *src64[1]; + *dst64[2] = *src64[2]; + *dst64[3] = *src64[3]; + src64[0] += 4; + dst64[0] += 4; + + nqw -= 8; + } + if (nqw) { + switch (nqw) { + case 7: *(dst64[0])++ = *(src64[0])++; + /* fall through */ + case 6: *(dst64[0])++ = *(src64[0])++; + /* fall through */ + case 5: *(dst64[0])++ = *(src64[0])++; + /* fall through */ + case 4: *(dst64[0])++ = *(src64[0])++; + /* fall through */ + case 3: *(dst64[0])++ = *(src64[0])++; + /* fall through */ + case 2: *(dst64[0])++ = *(src64[0])++; + /* fall through */ + case 1: *(dst64[0])++ = *(src64[0])++; + } + } +} + +#ifdef PSM_AVX512 +void hfi_pio_blockcpy_512(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) +{ + volatile __m512i *dp = 
(volatile __m512i *) dest; + const __m512i *sp = (const __m512i *) src; + + psmi_assert((dp != NULL) && (sp != NULL)); + psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); + + if ((((uintptr_t) sp) & 0x3f) == 0x0) { + /* source and destination are both 64 byte aligned */ + do { + __m512i tmp0 = _mm512_load_si512(sp); + _mm512_store_si512((__m512i *)dp, tmp0); + } while ((--nblock) && (++dp) && (++sp)); + } else { + /* only destination is 64 byte aligned - use unaligned loads */ + do { + __m512i tmp0 = _mm512_loadu_si512(sp); + _mm512_store_si512((__m512i *)dp, tmp0); + } while ((--nblock) && (++dp) && (++sp)); + } +} +#endif + +void hfi_pio_blockcpy_256(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) +{ + volatile __m256i *dp = (volatile __m256i *) dest; + const __m256i *sp = (const __m256i *) src; + + psmi_assert((dp != NULL) && (sp != NULL)); + psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); + + if ((((uintptr_t) sp) & 0x1f) == 0x0) { + /* source and destination are both 32 byte aligned */ + do { + __m256i tmp0 = _mm256_load_si256(sp); + __m256i tmp1 = _mm256_load_si256(sp + 1); + _mm256_store_si256((__m256i *)dp, tmp0); + _mm256_store_si256((__m256i *)(dp + 1), tmp1); + } while ((--nblock) && (dp = dp+2) && (sp = sp+2)); + } else { + /* only destination is 32 byte aligned - use unaligned loads */ + do { + __m256i tmp0 = _mm256_loadu_si256(sp); + __m256i tmp1 = _mm256_loadu_si256(sp + 1); + _mm256_store_si256((__m256i *)dp, tmp0); + _mm256_store_si256((__m256i *)(dp + 1), tmp1); + } while ((--nblock) && (dp = dp+2) && (sp = sp+2)); + } +} + +void hfi_pio_blockcpy_128(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) +{ + volatile __m128i *dp = (volatile __m128i *) dest; + const __m128i *sp = (const __m128i *) src; + + psmi_assert((dp != NULL) && (sp != NULL)); + psmi_assert((((uintptr_t) dp) & 0x3f) == 0x0); + + if ((((uintptr_t) sp) & 0xf) == 0x0) { + /* source and destination are both 16 byte aligned */ + do { + __m128i tmp0 = _mm_load_si128(sp); + __m128i tmp1 = _mm_load_si128(sp + 1); + __m128i tmp2 = _mm_load_si128(sp + 2); + __m128i tmp3 = _mm_load_si128(sp + 3); + _mm_store_si128((__m128i *)dp, tmp0); + _mm_store_si128((__m128i *)(dp + 1), tmp1); + _mm_store_si128((__m128i *)(dp + 2), tmp2); + _mm_store_si128((__m128i *)(dp + 3), tmp3); + } while ((--nblock) && (dp = dp+4) && (sp = sp+4)); + } else { + /* only destination is 16 byte aligned - use unaligned loads */ + do { + __m128i tmp0 = _mm_loadu_si128(sp); + __m128i tmp1 = _mm_loadu_si128(sp + 1); + __m128i tmp2 = _mm_loadu_si128(sp + 2); + __m128i tmp3 = _mm_loadu_si128(sp + 3); + _mm_store_si128((__m128i *)dp, tmp0); + _mm_store_si128((__m128i *)(dp + 1), tmp1); + _mm_store_si128((__m128i *)(dp + 2), tmp2); + _mm_store_si128((__m128i *)(dp + 3), tmp3); + } while ((--nblock) && (dp = dp+4) && (sp = sp+4)); + } +} + +void hfi_pio_blockcpy_64(volatile uint64_t *dest, const uint64_t *src, uint32_t nblock) +{ + const uint64_t *src64[4]; + volatile uint64_t *dst64[4]; + src64[0] = src; + dst64[0] = dest; + + psmi_assert((dst64[0] != NULL) && (src64[0] != NULL)); + psmi_assert((((uintptr_t) dest) & 0x3f) == 0x0); + + do { + *dst64[0] = *src64[0]; + src64[1] = src64[0] + 1; + src64[2] = src64[0] + 2; + src64[3] = src64[0] + 3; + dst64[1] = dst64[0] + 1; + dst64[2] = dst64[0] + 2; + dst64[3] = dst64[0] + 3; + *dst64[1] = *src64[1]; + *dst64[2] = *src64[2]; + *dst64[3] = *src64[3]; + src64[0] += 4; + dst64[0] += 4; + + *dst64[0] = *src64[0]; + src64[1] = src64[0] + 1; + src64[2] = src64[0] + 2; + src64[3] = 
src64[0] + 3; + dst64[1] = dst64[0] + 1; + dst64[2] = dst64[0] + 2; + dst64[3] = dst64[0] + 3; + *dst64[1] = *src64[1]; + *dst64[2] = *src64[2]; + *dst64[3] = *src64[3]; + src64[0] += 4; + dst64[0] += 4; + } while (--nblock); +} + +void MOCKABLE(psmi_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars) +{ + +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(vdest) || PSMI_IS_CUDA_MEM((void *) vsrc))) { + PSMI_CUDA_CALL(cuMemcpy, + (CUdeviceptr)vdest, (CUdeviceptr)vsrc, nchars); + return; + } +#endif + memcpy(vdest, vsrc, nchars); + return; + + +} +MOCK_DEF_EPILOGUE(psmi_mq_mtucpy); + +void psmi_mq_mtucpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars) +{ + memcpy(vdest, vsrc, nchars); + return; +} diff --git a/prov/psm3/psm3/opa/opa_service.c b/prov/psm3/psm3/opa/opa_service.c new file mode 100644 index 00000000000..eeda438e9cd --- /dev/null +++ b/prov/psm3/psm3/opa/opa_service.c @@ -0,0 +1,59 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* This file contains hfi service routine interface used by the low + level hfi protocol code. 
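Usage sketch for the block-copy helpers above: nblock counts 64-byte blocks, the destination must be 64-byte aligned, and only the source may be unaligned (the helpers switch to unaligned loads in that case). copy_pio_payload and its arguments are illustrative.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static void copy_pio_payload(volatile uint64_t *pio_buf,
			     const uint64_t *payload, size_t bytes)
{
	assert(bytes % 64 == 0);			/* whole 64-byte blocks only */
	assert(((uintptr_t)pio_buf & 0x3f) == 0);	/* destination alignment */

	hfi_pio_blockcpy_64(pio_buf, payload, bytes / 64);
}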
*/ + +#include "opa_service.h" +#include "psmi_wrappers.h" + diff --git a/prov/psm3/psm3/opa/opa_sysfs.c b/prov/psm3/psm3/opa/opa_sysfs.c new file mode 100644 index 00000000000..59fcaaa2423 --- /dev/null +++ b/prov/psm3/psm3/opa/opa_sysfs.c @@ -0,0 +1,504 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +/* This file contains a simple sysfs interface used by the low level + hfi protocol code. It also implements the interface to hfifs. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "psm_config.h" + +#include "opa_service.h" + +static char sysfs_paths[PSMI_MAX_RAILS][PATH_MAX]; +static int sysfs_path_count = -1; +static long sysfs_page_size; +#define SYSFS_DIR "/sys/class/infiniband/" + +int filter_dir(const struct dirent *item) { + if (item->d_name[0] == '.') return 0; + return 1; +} + +int sysfs_init(const char *dflt_hfi_class_path) +{ + char *hfi_env; + + if (NULL != (hfi_env = getenv("PSM3_SYSFS_PATH"))) + { + snprintf(sysfs_paths[0], PATH_MAX, "%s", hfi_env); + sysfs_path_count = 1; + } + if (sysfs_path_count < 1) { + struct dirent **d = NULL; + int i, n = scandir(SYSFS_DIR, &d, filter_dir, alphasort); + sysfs_path_count = 0; + for (i = 0; i < n; i++) { + if (d[i] != NULL) { + if (sysfs_path_count < PSMI_MAX_RAILS) { + struct stat s; + snprintf(sysfs_paths[sysfs_path_count], PATH_MAX, SYSFS_DIR "%s", d[i]->d_name); + if (stat(sysfs_paths[sysfs_path_count], &s) || !S_ISDIR(s.st_mode)) { + memset(sysfs_paths[sysfs_path_count], 0, PATH_MAX); + } else { + sysfs_path_count++; + } + } else { + _HFI_INFO("Max " SYSFS_DIR " device count (%d) reached: Skipping %s\n", PSMI_MAX_RAILS, d[i]->d_name); + } + free(d[i]); + } + } + if (d) free(d); + } + + + if (!sysfs_page_size) + sysfs_page_size = sysconf(_SC_PAGESIZE); + + if (_HFI_DBG_ON) { + int i; + _HFI_DBG("Found %u devices:\n", sysfs_path_count); + for (i = 0; i < sysfs_path_count; i++) { + _HFI_DBG(" Device[%u]: %s\n", i, sysfs_paths[i]); + } + } + + + return sysfs_path_count >= 1 ? 0 : -1; +} + +void sysfs_fini(void) +{ + memset(sysfs_paths, 0, sizeof(sysfs_paths)); + sysfs_path_count = -1; +} + +const char *sysfs_unit_path(int unit_id) +{ + if (sysfs_path_count > 0 && unit_id < sysfs_path_count) { + return sysfs_paths[unit_id]; + } + return NULL; +} + +const char *sysfs_unit_dev_name(int unit_id) +{ + if (unit_id >= 0 && unit_id < sysfs_path_count) { + char *dev_name = strrchr(sysfs_paths[unit_id], '/'); + if (dev_name && *dev_name) + return dev_name+1; + } + return ""; // make it easier to use in output messages +} + +// accepts a unit number (>=0) or a case insenstive unit name +// there must be no trailing whitespace +// will accept unit number in decimal or hex (0x prefix required) +int sysfs_find_unit(const char *name) +{ + int i; + long unit; + char *end; + + if (! name || ! 
*name) + return -1; + + // unit specified by name + for (i=0; i< sysfs_path_count; i++) { + const char *dev_name = sysfs_unit_dev_name(i); + if (dev_name && *dev_name && 0 == strcasecmp(dev_name, name)) + return i; + } + + // unit specified by number + unit = strtol(name, &end, 10); + if (end == NULL || *end != 0) { + unit = strtol(name, &end, 16); + if (end == NULL || *end != 0) + return -1; + } + if (unit >= 0 && unit < sysfs_path_count) + return unit; + + // invalid + return -1; +} + + +int hfi_sysfs_unit_open(uint32_t unit, const char *attr, int flags) +{ + int saved_errno; + char buf[1024]; + int fd; + const char *unitpath = sysfs_unit_path(unit); + + if (unitpath == NULL) { + _HFI_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr, + unit, "unit id not valid"); + return -1; + } + + snprintf(buf, sizeof(buf), "%s/%s", unitpath, attr); + fd = open(buf, flags); + saved_errno = errno; + + if (fd == -1) { + _HFI_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr, + unit, strerror(errno)); + _HFI_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + +static int hfi_sysfs_unit_open_for_node(uint32_t unit, int flags) +{ + int saved_errno; + char buf[1024]; + int fd; + const char *unitpath = sysfs_unit_path(unit); + + if (unitpath == NULL) { + _HFI_DBG("Failed to open attribute numa_node of unit %d: %s\n", + unit, "unit id not valid"); + return -1; + } + + snprintf(buf, sizeof(buf), "%s/device/numa_node", unitpath); + fd = open(buf, flags); + saved_errno = errno; + + if (fd == -1) { + _HFI_DBG("Failed to open attribute numa_node of unit %d: %s\n", + unit, strerror(errno)); + _HFI_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + +int hfi_sysfs_port_open(uint32_t unit, uint32_t port, const char *attr, + int flags) +{ + int saved_errno; + char buf[1024]; + int fd; + const char *unitpath = sysfs_unit_path(unit); + + if (unitpath == NULL) { + _HFI_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr, + unit, "unit id not valid"); + return -1; + } + snprintf(buf, sizeof(buf), "%s/ports/%u/%s", unitpath, port, attr); + fd = open(buf, flags); + saved_errno = errno; + + if (fd == -1) { + _HFI_DBG("Failed to open attribute '%s' of unit %d:%d: %s\n", + attr, unit, port, strerror(errno)); + _HFI_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + + +static int read_page(int fd, char **datap) +{ + char *data = NULL; + int saved_errno; + int ret = -1; + + data = malloc(sysfs_page_size); + saved_errno = errno; + + if (!data) { + _HFI_DBG("Could not allocate memory: %s\n", strerror(errno)); + goto bail; + } + + ret = read(fd, data, sysfs_page_size); + saved_errno = errno; + + if (ret == -1) { + _HFI_DBG("Read of attribute failed: %s\n", strerror(errno)); + goto bail; + } + +bail: + if (ret == -1) { + free(data); + } else { + if (ret < sysfs_page_size) + data[ret] = 0; + else + data[sysfs_page_size-1] = 0; + *datap = data; + } + + errno = saved_errno; + return ret; +} + +/* + * On return, caller must free *datap. + */ +int hfi_sysfs_unit_read(uint32_t unit, const char *attr, char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = hfi_sysfs_unit_open(unit, attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +/* read a string value into buff, no more than size bytes. 
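Caller-side sketch for hfi_sysfs_unit_read() above: the helper allocates the page-sized buffer and NUL-terminates it, so the caller owns and frees it on success. print_node_guid is hypothetical; node_guid is a typical attribute under /sys/class/infiniband/<dev>/.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

static void print_node_guid(uint32_t unit)
{
	char *value = NULL;

	if (hfi_sysfs_unit_read(unit, "node_guid", &value) != -1) {
		/* attribute text usually ends with its own newline */
		printf("unit %u node_guid: %s", unit, value);
		free(value);
	}
}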
+ returns the number of bytes read */ +size_t hfi_sysfs_unit_port_read(uint32_t unit, uint32_t port, const char *attr, + char *buff, size_t size) +{ + int fd = -1; + size_t rv = -1; + + fd = hfi_sysfs_port_open(unit, port, attr, O_RDONLY); + + if (fd == -1) + return rv; + + rv = read(fd, buff, size); + + close(fd); + + if (rv < size) + buff[rv] = 0; + else + buff[size-1] = 0; + + return rv; +} + +/* + * On return, caller must free *datap. + */ +int hfi_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr, + char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = hfi_sysfs_port_open(unit, port, attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + + +int hfi_sysfs_unit_read_s64(uint32_t unit, const char *attr, + int64_t *valp, int base) +{ + char *data=NULL, *end; + int saved_errno; + long long val; + int ret; + + ret = hfi_sysfs_unit_read(unit, attr, &data); + saved_errno = errno; + + if (ret == -1) { + goto bail; + } + + val = strtoll(data, &end, base); + saved_errno = errno; + + if (!*data || !(*end == '\0' || isspace(*end))) { + ret = -1; + goto bail; + } + + *valp = val; + ret = 0; + +bail: + if (data) + free(data); + errno = saved_errno; + return ret; +} + +static int hfi_sysfs_unit_read_node(uint32_t unit, char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = hfi_sysfs_unit_open_for_node(unit, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + if (ret == -1) + *datap = NULL; + + saved_errno = errno; + close(fd); +bail: + errno = saved_errno; + return ret; +} + +int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit) +{ + char *data=NULL, *end; + int saved_errno; + long long val; + int64_t ret = -1; + + saved_errno = errno; + if (hfi_sysfs_unit_read_node(unit, &data) == -1) { + goto bail; + } + + val = strtoll(data, &end, 0); + saved_errno = errno; + + if (!*data || !(*end == '\0' || isspace(*end))) { + ret = -1; + goto bail; + } + + ret = (int64_t) val; +bail: + free(data); + errno = saved_errno; + return ret; +} + +int hfi_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr, + int64_t *valp, int base) +{ + char *data, *end; + int saved_errno; + long long val; + int ret; + + ret = hfi_sysfs_port_read(unit, port, attr, &data); + saved_errno = errno; + + if (ret == -1) { + goto bail; + } + + val = strtoll(data, &end, base); + saved_errno = errno; + + if (!*data || !(*end == '\0' || isspace(*end))) { + ret = -1; + goto bail; + } + + *valp = val; + ret = 0; + +bail: + free(data); + errno = saved_errno; + return ret; +} diff --git a/prov/psm3/psm3/opa/opa_syslog.c b/prov/psm3/psm3/opa/opa_syslog.c new file mode 100644 index 00000000000..b52551d32aa --- /dev/null +++ b/prov/psm3/psm3/opa/opa_syslog.c @@ -0,0 +1,113 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#define __USE_GNU +#include +#include +#include +#include +#include + +#include "opa_user.h" + +#define SYSLOG_MAXLEN 512 + +extern char *__hfi_mylabel; + +void +hfi_vsyslog(const char *prefix, int to_console, int level, + const char *format, va_list ap) +{ + char logprefix[SYSLOG_MAXLEN]; + size_t len; + + if (to_console) { + char hostname[80]; + va_list ap_cons; + va_copy(ap_cons, ap); + len = strlen(format); + gethostname(hostname, sizeof(hostname)); + hostname[sizeof(hostname) - 1] = '\0'; + + if (__hfi_mylabel) + fprintf(stderr, "%s: ", __hfi_mylabel); + else + fprintf(stderr, "%s: ", hostname); + + vfprintf(stderr, format, ap_cons); + if (format[len] != '\n') + fprintf(stderr, "\n"); + fflush(stderr); + va_end(ap_cons); + } + + len = snprintf(logprefix, sizeof(logprefix), + "(nic/%s)[%d]: %s", prefix ? prefix : "nic", + (int)getpid(), format); + + vsyslog(level | LOG_USER, logprefix, ap); + + return; +} + +void +hfi_syslog(const char *prefix, int to_console, int level, + const char *format, ...) +{ + va_list ap; + va_start(ap, format); + hfi_vsyslog(prefix, to_console, level, format, ap); + va_end(ap); +} diff --git a/prov/psm3/psm3/opa/opa_time.c b/prov/psm3/psm3/opa/opa_time.c new file mode 100644 index 00000000000..33de9959b22 --- /dev/null +++ b/prov/psm3/psm3/opa/opa_time.c @@ -0,0 +1,299 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
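Example call into the wrapper above: the prefix becomes the "(nic/<prefix>)[pid]:" tag in syslog, and a nonzero to_console also echoes the message to stderr with the per-process label. warn_alloc_failure is a hypothetical caller.

#include <stddef.h>
#include <syslog.h>

static void warn_alloc_failure(size_t len)
{
	hfi_syslog("memory", 1, LOG_ERR,
		   "Couldn't allocate %zu bytes", len);
}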
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#define __USE_GNU +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opa_user.h" + +#ifdef min +#undef min +#endif +#define min(a, b) ((a) < (b) ? (a) : (b)) + +#ifdef max +#undef max +#endif +#define max(a, b) ((a) > (b) ? (a) : (b)) + +/* init the cycle counter to picosecs/cycle conversion automatically */ +/* at program startup, if it's using timing functions. */ +static void init_picos_per_cycle(void) __attribute__ ((constructor)); +static int hfi_timebase_isvalid(uint32_t pico_per_cycle); +static uint32_t hfi_timebase_from_cpuinfo(uint32_t old_pico_per_cycle); + +/* in case two of our mechanisms fail */ +#define SAFEDEFAULT_PICOS_PER_CYCLE 500 + +uint32_t __hfi_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE; + +/* This isn't perfect, but it's close enough for rough timing. We want this + to work on systems where the cycle counter isn't the same as the clock + frequency. + __hfi_pico_per_cycle isn't going to lead to completely accurate + conversions from timestamps to nanoseconds, but it's close enough for + our purposes, which is mainly to allow people to show events with nsecs + or usecs if desired, rather than cycles. We use it in some performance + analysis, but it has to be done with care, since cpuspeed can change, + different cpu's can have different speeds, etc. + + Some architectures don't have their TSC-equivalent running at anything + related to the processor speed (e.g. G5 Power systems use a fixed + 33 MHz frequency). 
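+
+   As a rough illustration: at the safe default of 500 picos/cycle (a
+   2 GHz counter), 1,000,000 cycles correspond to about 500 microseconds;
+   in general, nanoseconds ~= cycles * __hfi_pico_per_cycle / 1000.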
+*/ + +#define MIN_TEST_TIME_IN_PICOS (100000000000LL) /* 100 milliseconds */ + +static int timebase_debug; /* off by default */ + +#define timebase_warn_always(fmt, ...) \ + hfi_syslog("timebase", 1, LOG_ERR, fmt, ##__VA_ARGS__) +#define timebase_warn(fmt, ...) if (timebase_debug) \ + timebase_warn_always(fmt, ##__VA_ARGS__) + +static int hfi_timebase_isvalid(uint32_t pico_per_cycle) +{ +#if defined(__x86_64__) || defined(__i386__) + /* If pico-per-cycle is less than 200, the clock speed would be greater + * than 5 GHz. Similarly, we minimally support a 1GHz clock. + * Allow some slop, because newer kernels with HPET can be a few + * units off, and we don't want to spend the startup time needlessly */ + if (pico_per_cycle >= 198 && pico_per_cycle <= 1005) + return 1; +#endif + else + return 0; +} + +/* + * Method #1: + * + * Derive the pico-per-cycle by trying to correlate the difference between two + * reads of the tsc counter to gettimeofday. + */ +static void init_picos_per_cycle() +{ + struct timeval tvs, tve; + int64_t usec = 0; + uint64_t ts, te; + int64_t delta; + uint32_t picos = 0; + int trials = 0; + int retry = 0; + cpu_set_t cpuset, cpuset_saved; + int have_cpuset = 1; + + /* + * Make sure we try to calculate the cycle time without being migrated. + */ + CPU_ZERO(&cpuset_saved); + if (sched_getaffinity(0, sizeof(cpuset), &cpuset_saved)) + have_cpuset = 0; + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + if (have_cpuset && sched_setaffinity(0, sizeof(cpuset), &cpuset)) + have_cpuset = 0; + + /* + * If we set affinity correctly, give the scheduler another change to put + * us on processor 0 + */ + if (have_cpuset) + sched_yield(); + +retry_pico_test: + if (++retry == 10) { + __hfi_pico_per_cycle = hfi_timebase_from_cpuinfo(picos); + goto reset_cpu_mask; /* Reset CPU mask before exiting */ + } + + usec = 0; + gettimeofday(&tvs, NULL); + ts = get_cycles(); + while (usec < MIN_TEST_TIME_IN_PICOS) { /* wait for at least 100 millisecs */ + trials++; + usleep(125); + gettimeofday(&tve, NULL); + usec = 1000000LL * (tve.tv_usec - tvs.tv_usec) + + 1000000000000LL * (tve.tv_sec - tvs.tv_sec); + if (usec < 0) { + timebase_warn + ("RTC timebase, gettimeofday is negative (!) %lld\n", + (long long)usec); + goto retry_pico_test; + } + } + te = get_cycles(); + delta = te - ts; + picos = (uint32_t) (usec / delta); + + if (!hfi_timebase_isvalid(picos)) { + cpu_set_t cpuget; + int affinity_valid = + !sched_getaffinity(0, sizeof(cpuget), &cpuget); + if (affinity_valid && !CPU_ISSET(0, &cpuget)) + affinity_valid = 0; + timebase_warn + ("Failed to get valid RTC timebase, gettimeofday delta=%lld, " + "rtc delta=%lld, picos_per_cycle=%d affinity_valid=%s (trial %d/10)\n", + (long long)usec, (long long)delta, picos, + affinity_valid ? "YES" : "NO", retry); + goto retry_pico_test; + } + + /* If we've had to retry even once, let that be known */ + if (retry > 1) + timebase_warn("Clock is %d picos/cycle found in %d trials and " + "%.3f seconds (retry=%d)\n", picos, trials, + (double)usec / 1.0e12, retry); + + __hfi_pico_per_cycle = picos; + +reset_cpu_mask: + /* Restore affinity */ + if (have_cpuset) { + sched_setaffinity(0, sizeof(cpuset), &cpuset_saved); + /* + * Give a chance to other processes that also set affinity to 0 for + * doing this test. + */ + sched_yield(); + } +} + +/* + * Method #2: + * + * Derive the pico-per-cycle from /proc instead of using sleep trick + * that relies on scheduler. 
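+ *
+ * For example, a /proc/cpuinfo line such as "cpu MHz : 2400.000"
+ * (hypothetical value, for illustration only) maps to
+ * 1000000 / 2400 ~ 416 picos/cycle in the conversion below.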
+ */ +static uint32_t hfi_timebase_from_cpuinfo(uint32_t old_pico_per_cycle) +{ + /* we only validate once */ + uint32_t new_pico_per_cycle = old_pico_per_cycle; + uint32_t max_bet_new_old_pico, min_bet_new_old_pico; + + char hostname[80]; + gethostname(hostname, 80); + hostname[sizeof(hostname) - 1] = '\0'; + + if (getenv("PSM3_DEBUG_TIMEBASE")) + timebase_debug = 1; + + /* If the old one is valid, don't bother with this mechanism */ + if (hfi_timebase_isvalid(old_pico_per_cycle)) + return old_pico_per_cycle; + +#if defined(__x86_64__) || defined(__i386__) + { + FILE *fp = fopen("/proc/cpuinfo", "r"); + char input[255]; + char *p = NULL; + + if (!fp) + goto fail; + + while (!feof(fp) && fgets(input, 255, fp)) { + if (strstr(input, "cpu MHz")) { + p = strchr(input, ':'); + if (p) + { + double MHz = atof(p + 1); + if (MHz != 0.0) + new_pico_per_cycle = + (uint32_t) (1000000. / MHz); + } + break; + } + } + fclose(fp); + if (!p) + goto fail; + } +#endif + + max_bet_new_old_pico = max(new_pico_per_cycle, old_pico_per_cycle); + min_bet_new_old_pico = min(new_pico_per_cycle, old_pico_per_cycle); + /* If there's no change (within a small range), just return the old one */ + if ((max_bet_new_old_pico - min_bet_new_old_pico) < 5) + return old_pico_per_cycle; + + if (hfi_timebase_isvalid(new_pico_per_cycle)) { + timebase_warn_always + ("RTC timebase, using %d picos/cycle from /proc " + "instead of the detected %d picos/cycle\n", + new_pico_per_cycle, old_pico_per_cycle); + return new_pico_per_cycle; + } + +fail: + new_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE; + timebase_warn_always + ("Problem obtaining CPU time base, detected to be %d " + "pico/cycle, adjusted to safe default %d picos/cycle", + old_pico_per_cycle, new_pico_per_cycle); + return new_pico_per_cycle; +} diff --git a/prov/psm3/psm3/opa/opa_utils.c b/prov/psm3/psm3/opa/opa_utils.c new file mode 100644 index 00000000000..1abe60ecbd6 --- /dev/null +++ b/prov/psm3/psm3/opa/opa_utils.c @@ -0,0 +1,196 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* This file contains hfi service routine interface used by the low */ +/* level hfi protocol code. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opa_user.h" + +/* keep track whether we disabled mmap in malloc */ +int __hfi_malloc_no_mmap = 0; + +const char *hfi_get_next_name(char **names) +{ + char *p, *start; + + p = start = *names; + while (*p != '\0' && *p != '\n') { + p++; + } + if (*p == '\n') { + *p = '\0'; + p++; + *names = p; + return start; + } else + return NULL; +} + +void hfi_release_names(char *namep) +{ + /* names were initialised in the data section before. Now + * they are allocated when hfi_hfifs_read() is called. Allocation + * for names is done only once at init time. Should we eventually + * have an "stats_type_unregister" type of routine to explicitly + * deallocate memory and free resources ? + */ +#if 0 + if (namep != NULL) + free(namep); +#endif +} + + +/* + * Add a constructor function to disable mmap if asked to do so by the user + */ +static void init_mallopt_disable_mmap(void) __attribute__ ((constructor)); + +static void init_mallopt_disable_mmap(void) +{ + char *env = getenv("PSM3_DISABLE_MMAP_MALLOC"); + + if (env && *env) { + if (mallopt(M_MMAP_MAX, 0) && mallopt(M_TRIM_THRESHOLD, -1)) { + __hfi_malloc_no_mmap = 1; + } + } + + return; +} + +/* Convert Timeout value from usec to + * timeout_mult where usec = 4.096usec * 2^timeout_mult + */ +uint8_t timeout_usec_to_mult(uint64_t timeout_us) +{ + /* all values are rounded up, comments reflect exact value */ + if (timeout_us <= 4) + return 0; /* 4.096 us */ + else if (timeout_us <= 8) + return 1; /* 8.192 us */ + else if (timeout_us <= 16) + return 2; /* 16.384 us */ + else if (timeout_us <= 32) + return 3; /* 32.768 us */ + else if (timeout_us <= 65) + return 4; /* 65.536 us */ + else if (timeout_us <= 131) + return 5; /* 131.072 us */ + else if (timeout_us <= 262) + return 6; /* 262.144 us */ + else if (timeout_us <= 524) + return 7; /* 524.288 us */ + else if (timeout_us <= 1048) + return 8; /* 1048.576 us */ + else if (timeout_us <= 2097) + return 9; /* 2.097 ms */ + else if (timeout_us <= 4194) + return 10; /* 4.197 ms */ + else if (timeout_us <= 8388) + return 11; /* 8.388 ms */ + else if (timeout_us <= 16777) + return 12; /* 16.777 ms */ + else if (timeout_us <= 33554) + return 13; /* 33.554 ms */ + else if (timeout_us <= 67108) + return 14; /* 67.1 ms */ + else if (timeout_us <= 134217) + return 15; /* 134.2 ms */ + else if (timeout_us <= 268435) + return 16; /* 268.4 ms */ + else if (timeout_us <= 536870) + return 17; /* 536.8 ms */ + else if (timeout_us <= 1073741) + return 18;/* 1.073 s */ + else if (timeout_us <= 2147483) + 
return 19;/* 2.148 s */ + else if (timeout_us <= 4294967) + return 20;/* 4.294 s */ + else if (timeout_us <= 8589934) + return 21;/* 8.589 s */ + else if (timeout_us <= 17179869) + return 22;/* 17.179 s */ + else if (timeout_us <= 34359738) + return 23;/* 34.359 s */ + else if (timeout_us <= 68719476) + return 24;/* 68.719 s */ + else if (timeout_us <= 137438953ll) + return 25;/* 2.2 minutes */ + else if (timeout_us <= 274877906ll) + return 26; /* 4.5 minutes */ + else if (timeout_us <= 549755813ll) + return 27; /* 9 minutes */ + else if (timeout_us <= 1099511628ll) + return 28; /* 18 minutes */ + else if (timeout_us <= 2199023256ll) + return 29; /* 0.6 hr */ + else if (timeout_us <= 4398046511ll) + return 30; /* 1.2 hr */ + else + return 31; /* 2.4 hr */ +} diff --git a/prov/psm3/psm3/psm.c b/prov/psm3/psm3/psm.c new file mode 100644 index 00000000000..8a4ce4bd441 --- /dev/null +++ b/prov/psm3/psm3/psm.c @@ -0,0 +1,1210 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ + +#include +#include "psm_user.h" +#include "psm2_hal.h" +#include "opa_revision.h" +#include "psm_mq_internal.h" + +static int psmi_verno_major = PSM2_VERNO_MAJOR; +static int psmi_verno_minor = PSM2_VERNO_MINOR; +static int psmi_verno = PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR); +static int psmi_verno_client_val; +int psmi_epid_ver; +int psmi_allow_routers; + +// Special psmi_refcount values +#define PSMI_NOT_INITIALIZED 0 +#define PSMI_FINALIZED -1 + +// PSM2 doesn't support transitioning out of the PSMI_FINALIZED state +// once psmi_refcount is set to PSMI_FINALIZED, any further attempts to change +// psmi_refcount should be treated as an error +static int psmi_refcount = PSMI_NOT_INITIALIZED; + +/* Global lock used for endpoint creation and destroy + * (in functions psm2_ep_open and psm2_ep_close) and also + * for synchronization with recv_thread (so that recv_thread + * will not work on an endpoint which is in a middle of closing). */ +psmi_lock_t psmi_creation_lock; + +sem_t *sem_affinity_shm_rw = NULL; +int psmi_affinity_shared_file_opened = 0; +int psmi_affinity_semaphore_open = 0; +uint64_t *shared_affinity_ptr; +char *sem_affinity_shm_rw_name; +char *affinity_shm_name; + +uint32_t psmi_cpu_model; + +#ifdef PSM_CUDA +int is_cuda_enabled; +int is_gdr_copy_enabled; +int device_support_gpudirect; +int gpu_p2p_supported = 0; +int my_gpu_device = 0; +int cuda_lib_version; +int is_driver_gpudirect_enabled; +int is_cuda_primary_context_retain = 0; +uint32_t cuda_thresh_rndv; +uint32_t gdr_copy_threshold_send; +uint32_t gdr_copy_threshold_recv; + +void *psmi_cuda_lib; +CUresult (*psmi_cuInit)(unsigned int Flags ); +CUresult (*psmi_cuCtxDetach)(CUcontext c); +CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c); +CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); +CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); +CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); +CUresult (*psmi_cuDeviceCanAccessPeer)(int *canAccessPeer, CUdevice dev, CUdevice peerDev); +CUresult (*psmi_cuDeviceGet)(CUdevice* device, int ordinal); +CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); +CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); +CUresult (*psmi_cuDeviceGetCount)(int* count); +CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); +CUresult (*psmi_cuStreamDestroy)(CUstream phStream); +CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); +CUresult (*psmi_cuEventDestroy)(CUevent hEvent); +CUresult (*psmi_cuEventQuery)(CUevent hEvent); +CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); +CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); +CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); +CUresult (*psmi_cuMemFreeHost)(void* p); +CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); +CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); +CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); +CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); +CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); +CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); +CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr); +CUresult 
(*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); +CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr); +CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); +CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active); +CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); +CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); +CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); +#endif + +/* + * Bit field that contains capability set. + * Each bit represents different capability. + * It is supposed to be filled with logical OR + * on conditional compilation basis + * along with future features/capabilities. + */ +uint64_t psm2_capabilities_bitset = PSM2_MULTI_EP_CAP | PSM2_LIB_REFCOUNT_CAP; + +int psmi_verno_client() +{ + return psmi_verno_client_val; +} + +/* This function is used to determine whether the current library build can + * successfully communicate with another library that claims to be version + * 'verno'. + * + * PSM 2.x is always ABI compatible, but this checks to see if two different + * versions of the library can coexist. + */ +int psmi_verno_isinteroperable(uint16_t verno) +{ + if (PSMI_VERNO_GET_MAJOR(verno) != PSM2_VERNO_MAJOR) + return 0; + + return 1; +} + +int MOCKABLE(psmi_isinitialized)() +{ + return (psmi_refcount > 0); +} +MOCK_DEF_EPILOGUE(psmi_isinitialized); + +#ifdef PSM_CUDA +int psmi_cuda_lib_load() +{ + psm2_error_t err = PSM2_OK; + char *dlerr; + + PSM2_LOG_MSG("entering"); + _HFI_VDBG("Loading CUDA library.\n"); + + psmi_cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY); + if (!psmi_cuda_lib) { + dlerr = dlerror(); + _HFI_ERROR("Unable to open libcuda.so. Error %s\n", + dlerr ? 
dlerr : "no dlerror()"); + goto fail; + } + + psmi_cuDriverGetVersion = dlsym(psmi_cuda_lib, "cuDriverGetVersion"); + + if (!psmi_cuDriverGetVersion) { + _HFI_ERROR + ("Unable to resolve symbols in CUDA libraries.\n"); + goto fail; + } + + PSMI_CUDA_CALL(cuDriverGetVersion, &cuda_lib_version); + if (cuda_lib_version < 7000) { + _HFI_ERROR("Please update CUDA driver, required minimum version is 7.0\n"); + goto fail; + } + + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuInit); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetCurrent); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxDetach); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxSetCurrent); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerGetAttribute); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerSetAttribute); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceCanAccessPeer); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetAttribute); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGet); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetCount); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamCreate); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamDestroy); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventCreate); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventDestroy); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventQuery); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventRecord); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventSynchronize); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostAlloc); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemFreeHost); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpy); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoD); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoH); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyHtoD); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoHAsync); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyHtoDAsync); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcGetMemHandle); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcOpenMemHandle); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcCloseMemHandle); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemGetAddressRange); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxGetState); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxRetain); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxRelease); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetDevice); + + PSM2_LOG_MSG("leaving"); + return err; +fail: + if (psmi_cuda_lib) + dlclose(psmi_cuda_lib); + err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to load CUDA library.\n"); + return err; +} + +int psmi_cuda_initialize() +{ + psm2_error_t err = PSM2_OK; + int num_devices, dev; + + PSM2_LOG_MSG("entering"); + _HFI_VDBG("Enabling CUDA support.\n"); + + err = psmi_cuda_lib_load(); + if (err != PSM2_OK) + goto fail; + + PSMI_CUDA_CALL(cuInit, 0); + + /* Check if CUDA context is available. If not, we are not allowed to + * launch any CUDA API calls */ + PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt); + if (ctxt == NULL) { + _HFI_INFO("Unable to find active CUDA context\n"); + is_cuda_enabled = 0; + err = PSM2_OK; + return err; + } + + CUdevice current_device; + CUcontext primary_ctx; + PSMI_CUDA_CALL(cuCtxGetDevice, ¤t_device); + int is_ctx_active; + unsigned ctx_flags; + PSMI_CUDA_CALL(cuDevicePrimaryCtxGetState, current_device, &ctx_flags, + &is_ctx_active); + if (!is_ctx_active) { + /* There is an issue where certain CUDA API calls create + * contexts but does not make it active which cause the + * driver API call to fail with error 709 */ + PSMI_CUDA_CALL(cuDevicePrimaryCtxRetain, &primary_ctx, + current_device); + is_cuda_primary_context_retain = 1; + } + + /* Check if all devices support Unified Virtual Addressing. 
*/ + PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); + + device_support_gpudirect = 1; + + for (dev = 0; dev < num_devices; dev++) { + CUdevice device; + PSMI_CUDA_CALL(cuDeviceGet, &device, dev); + int unifiedAddressing; + PSMI_CUDA_CALL(cuDeviceGetAttribute, + &unifiedAddressing, + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, + device); + + if (unifiedAddressing !=1) { + _HFI_ERROR("CUDA device %d does not support Unified Virtual Addressing.\n", dev); + goto fail; + } + + int major; + PSMI_CUDA_CALL(cuDeviceGetAttribute, + &major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device); + if (major < 3) { + device_support_gpudirect = 0; + _HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev); + } + + if (device != current_device) { + int canAccessPeer = 0; + PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer, + current_device, device); + + if (canAccessPeer != 1) + _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev); + else + gpu_p2p_supported |= (1 << device); + } else { + /* Always support p2p on the same GPU */ + my_gpu_device = device; + gpu_p2p_supported |= (1 << device); + } + } + + union psmi_envvar_val env_enable_gdr_copy; + psmi_getenv("PSM3_GDRCOPY", + "Enable (set envvar to 1) for gdr copy support in PSM (Enabled by default)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)1, &env_enable_gdr_copy); + is_gdr_copy_enabled = env_enable_gdr_copy.e_int; + + union psmi_envvar_val env_cuda_thresh_rndv; + psmi_getenv("PSM3_CUDA_THRESH_RNDV", + "RNDV protocol is used for message sizes greater than the threshold \n", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)CUDA_THRESH_RNDV, &env_cuda_thresh_rndv); + cuda_thresh_rndv = env_cuda_thresh_rndv.e_int; + + if (cuda_thresh_rndv < 0 || cuda_thresh_rndv > CUDA_THRESH_RNDV) + cuda_thresh_rndv = CUDA_THRESH_RNDV; + + union psmi_envvar_val env_gdr_copy_thresh_send; + psmi_getenv("PSM3_GDRCOPY_THRESH_SEND", + "GDR Copy is turned off on the send side" + " for message sizes greater than the threshold \n", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)GDR_COPY_THRESH_SEND, &env_gdr_copy_thresh_send); + gdr_copy_threshold_send = env_gdr_copy_thresh_send.e_int; + + if (gdr_copy_threshold_send < 8 || gdr_copy_threshold_send > cuda_thresh_rndv) + gdr_copy_threshold_send = GDR_COPY_THRESH_SEND; + + union psmi_envvar_val env_gdr_copy_thresh_recv; + psmi_getenv("PSM3_GDRCOPY_THRESH_RECV", + "GDR Copy is turned off on the recv side" + " for message sizes greater than the threshold \n", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)GDR_COPY_THRESH_RECV, &env_gdr_copy_thresh_recv); + gdr_copy_threshold_recv = env_gdr_copy_thresh_recv.e_int; + + if (gdr_copy_threshold_recv < 8) + gdr_copy_threshold_recv = GDR_COPY_THRESH_RECV; + + PSM2_LOG_MSG("leaving"); + return err; +fail: + err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to initialize PSM3 CUDA support.\n"); + return err; +} +#endif + +psm2_error_t __psm2_init(int *major, int *minor) +{ + psm2_error_t err = PSM2_OK; + union psmi_envvar_val env_tmask; + + psmi_stats_initialize(); + + psmi_mem_stats_register(); + + psmi_log_initialize(); + + PSM2_LOG_MSG("entering"); + + /* When PSM_PERF is enabled, the following code causes the + PMU to be programmed to measure instruction cycles of the + TX/RX speedpaths of PSM. 
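+	   The counters programmed here are later written out by
+	   GENERIC_PERF_DUMP(stderr) when psm2_finalize() runs (see below in
+	   this file).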
*/ + GENERIC_PERF_INIT(); + GENERIC_PERF_SET_SLOT_NAME(PSM_TX_SPEEDPATH_CTR, "TX"); + GENERIC_PERF_SET_SLOT_NAME(PSM_RX_SPEEDPATH_CTR, "RX"); + + if (psmi_refcount > 0) { + psmi_refcount++; + goto update; + } + + if (psmi_refcount == PSMI_FINALIZED) { + err = PSM2_IS_FINALIZED; + goto fail; + } + + if (major == NULL || minor == NULL) { + err = PSM2_PARAM_ERR; + goto fail; + } + + psmi_init_lock(&psmi_creation_lock); + +#ifdef PSM_DEBUG + if (!getenv("PSM3_NO_WARN")) + fprintf(stderr, + "!!! WARNING !!! YOU ARE RUNNING AN INTERNAL-ONLY PSM *DEBUG* BUILD.\n"); +#endif + +#ifdef PSM_PROFILE + if (!getenv("PSM3_NO_WARN")) + fprintf(stderr, + "!!! WARNING !!! YOU ARE RUNNING AN INTERNAL-ONLY PSM *PROFILE* BUILD.\n"); +#endif + +#ifdef PSM_FI + /* Make sure we complain if fault injection is enabled */ + if (getenv("PSM3_FI") && !getenv("PSM3_NO_WARN")) + fprintf(stderr, + "!!! WARNING !!! YOU ARE RUNNING WITH FAULT INJECTION ENABLED!\n"); +#endif /* #ifdef PSM_FI */ + + /* Make sure, as an internal check, that this version knows how to detect + * compatibility with other library versions it may communicate with */ + if (psmi_verno_isinteroperable(psmi_verno) != 1) { + err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "psmi_verno_isinteroperable() not updated for current version!"); + goto fail; + } + + /* The only way to not support a client is if the major number doesn't + * match */ + if (*major != PSM2_VERNO_MAJOR && *major != PSM2_VERNO_COMPAT_MAJOR) { + err = psmi_handle_error(NULL, PSM2_INIT_BAD_API_VERSION, + "This library does not implement version %d.%d", + *major, *minor); + goto fail; + } + + /* Make sure we don't keep track of a client that claims a higher version + * number than we are */ + psmi_verno_client_val = + min(PSMI_VERNO_MAKE(*major, *minor), psmi_verno); + + /* Check to see if we need to set Architecture flags to something + * besides big core Xeons */ + cpuid_t id; + psmi_cpu_model = CPUID_MODEL_UNDEFINED; + + /* First check to ensure Genuine Intel */ + get_cpuid(0x0, 0, &id); + if(id.ebx == CPUID_GENUINE_INTEL_EBX + && id.ecx == CPUID_GENUINE_INTEL_ECX + && id.edx == CPUID_GENUINE_INTEL_EDX) + { + /* Use cpuid with EAX=1 to get processor info */ + get_cpuid(0x1, 0, &id); + psmi_cpu_model = CPUID_GENUINE_INTEL; + } + + if( (psmi_cpu_model == CPUID_GENUINE_INTEL) && + (id.eax & CPUID_FAMILY_MASK) == CPUID_FAMILY_XEON) + { + psmi_cpu_model = ((id.eax & CPUID_MODEL_MASK) >> 4) | + ((id.eax & CPUID_EXMODEL_MASK) >> 12); + } + + psmi_refcount++; + /* hfi_debug lives in libhfi.so */ + psmi_getenv("PSM3_TRACEMASK", + "Mask flags for tracing", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_ULONG_FLAGS, + (union psmi_envvar_val)hfi_debug, &env_tmask); + hfi_debug = (long)env_tmask.e_ulong; + + /* The "real thing" is done in hfi_proto.c as a constructor function, but + * we getenv it here to report what we're doing with the setting */ + { + extern int __hfi_malloc_no_mmap; + union psmi_envvar_val env_mmap; + char *env = getenv("PSM3_DISABLE_MMAP_MALLOC"); + int broken = (env && *env && !__hfi_malloc_no_mmap); + psmi_getenv("PSM3_DISABLE_MMAP_MALLOC", + broken ? 
"Skipping mmap disable for malloc()" : + "Disable mmap for malloc()", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_YESNO, + (union psmi_envvar_val)0, &env_mmap); + if (broken) + _HFI_ERROR + ("Couldn't successfully disable mmap in mallocs " + "with mallopt()\n"); + } + + { + union psmi_envvar_val env_epid_ver; + psmi_getenv("PSM3_ADDR_FMT", + "Used to force PSM3 to use a particular version of EPID", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)PSMI_EPID_VERNO_DEFAULT, &env_epid_ver); + psmi_epid_ver = env_epid_ver.e_int; + if (psmi_epid_ver > PSMI_MAX_EPID_VERNO_SUPPORTED) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + " The max epid version supported in this version of PSM3 is %d \n" + "Please upgrade PSM3 \n", + PSMI_MAX_EPID_VERNO_SUPPORTED); + goto fail; + } else if (psmi_epid_ver < PSMI_MIN_EPID_VERNO_SUPPORTED) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + " Invalid value provided through PSM3_ADDR_FMT \n"); + goto fail; + } + } + { + union psmi_envvar_val env_allow_routers; + psmi_getenv("PSM3_ALLOW_ROUTERS", + "Disable check for Ethernet subnet equality between nodes\n" + " allows routers between nodes and assumes single network plane for multi-rail\n", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, &env_allow_routers); + psmi_allow_routers = env_allow_routers.e_int; + } + + if (getenv("PSM3_DIAGS")) { + _HFI_INFO("Running diags...\n"); + psmi_diags(); + } + + psmi_multi_ep_init(); + +#ifdef PSM_FI + psmi_faultinj_init(); +#endif /* #ifdef PSM_FI */ + + psmi_epid_init(); + + int rc = psmi_hal_initialize(); + + if (rc) + { + err = PSM2_INTERNAL_ERR; + goto fail; + } + +#ifdef PSM_CUDA + union psmi_envvar_val env_enable_cuda; + psmi_getenv("PSM3_CUDA", + "Enable (set envvar to 1) for cuda support in PSM (Disabled by default)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, &env_enable_cuda); + is_cuda_enabled = env_enable_cuda.e_int; + + if (PSMI_IS_CUDA_ENABLED) { + err = psmi_cuda_initialize(); + if (err != PSM2_OK) + goto fail; + } +#endif + +update: + if (psmi_parse_identify()) { + Dl_info info_psm; + char ofed_delta[100] = ""; + strcat(strcat(ofed_delta," built for IFS OFA DELTA "),psmi_hfi_IFS_version); + printf("%s %s PSM3 v%d.%d%s\n" + "%s %s location %s\n" + "%s %s build date %s\n" + "%s %s src checksum %s\n" + "%s %s git checksum %s\n" +#ifdef RNDV_MOD_MR + "%s %s built against rv interface v%d.%d\n" +#endif + "%s %s Global Rank %d (%d total) Local Rank %d (%d total)\n" + , hfi_get_mylabel(), hfi_ident_tag, + PSM2_VERNO_MAJOR,PSM2_VERNO_MINOR, + (strcmp(psmi_hfi_IFS_version,"") != 0) ? ofed_delta +#ifdef PSM_CUDA + : "-cuda", +#else + : "", +#endif + hfi_get_mylabel(), hfi_ident_tag, dladdr(psm2_init, &info_psm) ? + info_psm.dli_fname : "PSM3 path not available", + hfi_get_mylabel(), hfi_ident_tag, psmi_hfi_build_timestamp, + hfi_get_mylabel(), hfi_ident_tag, psmi_hfi_sources_checksum, + hfi_get_mylabel(), hfi_ident_tag, + (strcmp(psmi_hfi_git_checksum,"") != 0) ? 
+ psmi_hfi_git_checksum : "", +#ifdef RNDV_MOD_MR + hfi_get_mylabel(), hfi_ident_tag, + psm2_rv_get_user_major_bldtime_version(), + psm2_rv_get_user_minor_bldtime_version(), +#endif + hfi_get_mylabel(), hfi_ident_tag, + hfi_get_myrank(), hfi_get_myrank_count(), + hfi_get_mylocalrank(), + hfi_get_mylocalrank_count() + ); + } + + *major = (int)psmi_verno_major; + *minor = (int)psmi_verno_minor; +fail: + _HFI_DBG("psmi_refcount=%d,err=%u\n", psmi_refcount, err); + + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_init) + +static +psm2_error_t psmi_get_psm2_config(psm2_mq_t mq, + psm2_epaddr_t epaddr, + uint32_t *out) +{ + psm2_error_t rv = PSM2_INTERNAL_ERR; + + *out = 0; + if (&mq->ep->ptl_ips == epaddr->ptlctl) + { + rv = PSM2_OK; + *out |= PSM2_INFO_QUERY_CONFIG_IPS; +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED) + { + *out |= PSM2_INFO_QUERY_CONFIG_CUDA; + if (PSMI_IS_GDR_COPY_ENABLED) + *out |= PSM2_INFO_QUERY_CONFIG_GDR_COPY; + } +#endif + *out |= PSM2_INFO_QUERY_CONFIG_PIO; + } + else if (&mq->ep->ptl_amsh == epaddr->ptlctl) + { + *out |= PSM2_INFO_QUERY_CONFIG_AMSH; + rv = PSM2_OK; + } + else if (&mq->ep->ptl_self == epaddr->ptlctl) + { + *out |= PSM2_INFO_QUERY_CONFIG_SELF; + rv = PSM2_OK; + } + return rv; +} + +psm2_error_t __psm2_info_query(psm2_info_query_t q, void *out, + size_t nargs, psm2_info_query_arg_t args[]) +{ + static const size_t expected_arg_cnt[PSM2_INFO_QUERY_LAST] = + { + 0, /* PSM2_INFO_QUERY_NUM_UNITS */ + 0, /* PSM2_INFO_QUERY_NUM_PORTS */ + 1, /* PSM2_INFO_QUERY_UNIT_STATUS */ + 2, /* PSM2_INFO_QUERY_UNIT_PORT_STATUS */ + 1, /* PSM2_INFO_QUERY_NUM_FREE_CONTEXTS */ + 1, /* PSM2_INFO_QUERY_NUM_CONTEXTS */ + 2, /* PSM2_INFO_QUERY_CONFIG */ + 3, /* PSM2_INFO_QUERY_THRESH */ + 3, /* PSM2_INFO_QUERY_DEVICE_NAME */ + 2, /* PSM2_INFO_QUERY_MTU */ + 2, /* PSM2_INFO_QUERY_LINK_SPEED */ + 1, /* PSM2_INFO_QUERY_NETWORK_TYPE */ + 0, /* PSM2_INFO_QUERY_FEATURE_MASK */ + 2, /* PSM2_INFO_QUERY_UNIT_NAME */ + 2, /* PSM2_INFO_QUERY_UNIT_SYS_PATH */ + }; + psm2_error_t rv = PSM2_INTERNAL_ERR; + + if ((q < 0) || + (q >= PSM2_INFO_QUERY_LAST)) + return PSM2_IQ_INVALID_QUERY; + + if (nargs != expected_arg_cnt[q]) + return PSM2_PARAM_ERR; + + switch (q) + { + case PSM2_INFO_QUERY_NUM_UNITS: + *((uint32_t*)out) = psmi_hal_get_num_units_(); + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_NUM_PORTS: + *((uint32_t*)out) = psmi_hal_get_num_ports_(); + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_UNIT_STATUS: + *((uint32_t*)out) = psmi_hal_get_unit_active(args[0].unit); + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_UNIT_PORT_STATUS: + *((uint32_t*)out) = psmi_hal_get_port_active(args[0].unit, + args[1].port); + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_NUM_FREE_CONTEXTS: + *((uint32_t*)out) = psmi_hal_get_num_free_contexts(args[0].unit); + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_NUM_CONTEXTS: + *((uint32_t*)out) = psmi_hal_get_num_contexts(args[0].unit); + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_CONFIG: + { + psm2_mq_t mq = args[0].mq; + psm2_epaddr_t epaddr = args[1].epaddr; + rv = psmi_get_psm2_config(mq, epaddr, (uint32_t*)out); + } + break; + case PSM2_INFO_QUERY_THRESH: + { + psm2_mq_t mq = args[0].mq; + psm2_epaddr_t epaddr = args[1].epaddr; + enum psm2_info_query_thresh_et iqt = args[2].mstq; + + uint32_t config; + rv = psmi_get_psm2_config(mq, epaddr, &config); + if (rv == PSM2_OK) + { + *((uint32_t*)out) = 0; + /* Delegate the call to the ptl member function: */ + rv = epaddr->ptlctl->msg_size_thresh_query(iqt, (uint32_t*)out, mq, epaddr); + } + } + 
break; + case PSM2_INFO_QUERY_DEVICE_NAME: + { + char *hfiName = (char*)out; + psm2_mq_t mq = args[0].mq; + psm2_epaddr_t epaddr = args[1].epaddr; + size_t hfiNameLength = args[2].length; + uint32_t config; + + rv = psmi_get_psm2_config(mq, epaddr, &config); + if (rv == PSM2_OK) + { + if (snprintf(hfiName, hfiNameLength, "%s_%d", + psmi_hal_get_hfi_name(), + mq->ep->unit_id) + < hfiNameLength) + rv = PSM2_OK; + } + } + break; + case PSM2_INFO_QUERY_MTU: + { + psm2_mq_t mq = args[0].mq; + psm2_epaddr_t epaddr = args[1].epaddr; + uint32_t config; + + rv = psmi_get_psm2_config(mq, epaddr, &config); + if (rv == PSM2_OK) + { + // TBD - should get ipsaddr to find pr_mtu negotiated + *((uint32_t*)out) = mq->ep->mtu; + } + } + break; + case PSM2_INFO_QUERY_LINK_SPEED: + { + psm2_mq_t mq = args[0].mq; + psm2_epaddr_t epaddr = args[1].epaddr; + uint32_t config; + + rv = psmi_get_psm2_config(mq, epaddr, &config); + if (rv == PSM2_OK) + { + *((uint32_t*)out) = psmi_hal_get_port_rate(mq->ep->unit_id, + mq->ep->portnum); + } + } + break; + case PSM2_INFO_QUERY_NETWORK_TYPE: + { + char *networkType = (char*)out; + size_t networkTypeLength = args[0].length; + const char *const intelopa = "Intel(R) OPA"; + if (networkTypeLength >= strlen(intelopa)+1) + { + strcpy(networkType,intelopa); + rv = PSM2_OK; + } + } + break; + case PSM2_INFO_QUERY_FEATURE_MASK: + { +#ifdef PSM_CUDA + *((uint32_t*)out) = PSM2_INFO_QUERY_FEATURE_CUDA; +#else + *((uint32_t*)out) = 0; +#endif /* #ifdef PSM_CUDA */ + } + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_UNIT_NAME: + { + char *hfiName = (char*)out; + uint32_t unit = args[0].unit; + size_t hfiNameLength = args[1].length; + const char *pathName = sysfs_unit_path(unit); + char *unitName = NULL; + + if (!pathName) break; + + unitName = strrchr(sysfs_unit_path(unit),'/'); + if (!unitName) break; + + strncpy(hfiName, ++unitName, hfiNameLength); + hfiName[hfiNameLength-1] = '\0'; + rv = PSM2_OK; + } + break; + case PSM2_INFO_QUERY_UNIT_SYS_PATH: + { + char *hfiName = (char*)out; + uint32_t unit = args[0].unit; + size_t hfiNameLength = args[1].length; + const char *pathName = sysfs_unit_path(unit); + //char *unitName = NULL; + + if (!pathName) break; + + strncpy(hfiName, pathName, hfiNameLength); + hfiName[hfiNameLength-1] = '\0'; + rv = PSM2_OK; + } + break; + default: + break; + } + + return rv; +} +PSMI_API_DECL(psm2_info_query) + +uint64_t __psm2_get_capability_mask(uint64_t req_cap_mask) +{ + return (psm2_capabilities_bitset & req_cap_mask); +} +PSMI_API_DECL(psm2_get_capability_mask) + +psm2_error_t __psm2_finalize(void) +{ + struct psmi_eptab_iterator itor; + char *hostname; + psm2_ep_t ep; + + PSM2_LOG_MSG("entering"); + + _HFI_DBG("psmi_refcount=%d\n", psmi_refcount); + PSMI_ERR_UNLESS_INITIALIZED(NULL); + psmi_assert(psmi_refcount > 0); + psmi_refcount--; + + if (psmi_refcount > 0) { + return PSM2_OK; + } + + /* When PSM_PERF is enabled, the following line causes the + instruction cycles gathered in the current run to be dumped + to stderr. 
*/ + GENERIC_PERF_DUMP(stderr); + ep = psmi_opened_endpoint; + while (ep != NULL) { + psm2_ep_t saved_ep = ep->user_ep_next; + psm2_ep_close(ep, PSM2_EP_CLOSE_GRACEFUL, + 2 * PSMI_MIN_EP_CLOSE_TIMEOUT); + psmi_opened_endpoint = ep = saved_ep; + } + +#ifdef PSM_FI + psmi_faultinj_fini(); +#endif /* #ifdef PSM_FI */ + + /* De-allocate memory for any allocated space to store hostnames */ + psmi_epid_itor_init(&itor, PSMI_EP_HOSTNAME); + while ((hostname = psmi_epid_itor_next(&itor))) + psmi_free(hostname); + psmi_epid_itor_fini(&itor); + + psmi_epid_fini(); + + /* unmap shared mem object for affinity */ + if (psmi_affinity_shared_file_opened) { + /* + * Start critical section to decrement ref count and unlink + * affinity shm file. + */ + psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name); + + shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] -= 1; + if (shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] <= 0) { + _HFI_VDBG("Unlink shm file for NIC affinity as there are no more users\n"); + shm_unlink(affinity_shm_name); + } else { + _HFI_VDBG("Number of affinity shared memory users left=%ld\n", + shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION]); + } + + msync(shared_affinity_ptr, AFFINITY_SHMEMSIZE, MS_SYNC); + + /* End critical section */ + psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name); + + munmap(shared_affinity_ptr, AFFINITY_SHMEMSIZE); + psmi_free(affinity_shm_name); + affinity_shm_name = NULL; + psmi_affinity_shared_file_opened = 0; + } + + if (psmi_affinity_semaphore_open) { + _HFI_VDBG("Closing and Unlinking Semaphore: %s.\n", sem_affinity_shm_rw_name); + sem_close(sem_affinity_shm_rw); + sem_unlink(sem_affinity_shm_rw_name); + psmi_free(sem_affinity_shm_rw_name); + sem_affinity_shm_rw_name = NULL; + psmi_affinity_semaphore_open = 0; + } + + psmi_hal_finalize(); +#ifdef PSM_CUDA + if (is_cuda_primary_context_retain) { + /* + * This code will be called during deinitialization, and if + * CUDA is deinitialized before PSM, then + * CUDA_ERROR_DEINITIALIZED will happen here + */ + CUdevice device; + if (psmi_cuCtxGetDevice(&device) == CUDA_SUCCESS) + PSMI_CUDA_CALL(cuDevicePrimaryCtxRelease, device); + } +#endif + + psmi_refcount = PSMI_FINALIZED; + PSM2_LOG_MSG("leaving"); + psmi_log_fini(); + + psmi_stats_finalize(); + + psmi_heapdebug_finalize(); + + return PSM2_OK; +} +PSMI_API_DECL(psm2_finalize) + +/* + * Function exposed in >= 1.05 + */ +psm2_error_t +__psm2_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames) +{ + int i; + psm2_error_t err = PSM2_OK; + + PSM2_LOG_MSG("entering"); + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (nids == NULL || hostnames == NULL) { + err = PSM2_PARAM_ERR; + goto fail; + } + + for (i = 0; i < num; i++) { + if ((err = psmi_epid_set_hostname(nids[i], hostnames[i], 1))) + break; + } + +fail: + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_map_nid_hostname) + +void __psm2_epaddr_setlabel(psm2_epaddr_t epaddr, char const *epaddr_label) +{ + PSM2_LOG_MSG("entering"); + PSM2_LOG_MSG("leaving"); + return; /* ignore this function */ +} +PSMI_API_DECL(psm2_epaddr_setlabel) + +void __psm2_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt) +{ + + /* Eventually deprecate this API to use set/get opt as this is unsafe. 
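+	   The wrapper below is equivalent to calling
+	   psm2_setopt(PSM2_COMPONENT_CORE, epaddr, PSM2_CORE_OPT_EP_CTXT,
+	   ctxt, sizeof(void *)) directly, which is the preferred interface.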
*/ + PSM2_LOG_MSG("entering"); + psm2_setopt(PSM2_COMPONENT_CORE, (const void *)epaddr, + PSM2_CORE_OPT_EP_CTXT, (const void *)ctxt, sizeof(void *)); + PSM2_LOG_MSG("leaving"); +} +PSMI_API_DECL(psm2_epaddr_setctxt) + +void *__psm2_epaddr_getctxt(psm2_epaddr_t epaddr) +{ + psm2_error_t err; + uint64_t optlen = sizeof(void *); + void *result = NULL; + + PSM2_LOG_MSG("entering"); + /* Eventually deprecate this API to use set/get opt as this is unsafe. */ + err = psm2_getopt(PSM2_COMPONENT_CORE, (const void *)epaddr, + PSM2_CORE_OPT_EP_CTXT, (void *)&result, &optlen); + + PSM2_LOG_MSG("leaving"); + + if (err == PSM2_OK) + return result; + else + return NULL; +} +PSMI_API_DECL(psm2_epaddr_getctxt) + +psm2_error_t +__psm2_setopt(psm2_component_t component, const void *component_obj, + int optname, const void *optval, uint64_t optlen) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + switch (component) { + case PSM2_COMPONENT_CORE: + rv = psmi_core_setopt(component_obj, optname, optval, optlen); + PSM2_LOG_MSG("leaving"); + return rv; + break; + case PSM2_COMPONENT_MQ: + /* Use the deprecated MQ set/get opt for now which does not use optlen */ + rv = psm2_mq_setopt((psm2_mq_t) component_obj, optname, optval); + PSM2_LOG_MSG("leaving"); + return rv; + break; + case PSM2_COMPONENT_AM: + /* Hand off to active messages */ + rv = psmi_am_setopt(component_obj, optname, optval, optlen); + PSM2_LOG_MSG("leaving"); + return rv; + break; + case PSM2_COMPONENT_IB: + /* Hand off to IPS ptl to set option */ + rv = psmi_ptl_ips.setopt(component_obj, optname, optval, + optlen); + PSM2_LOG_MSG("leaving"); + return rv; + break; + } + + /* Unrecognized/unknown component */ + rv = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown component %u", + component); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_setopt); + +psm2_error_t +__psm2_getopt(psm2_component_t component, const void *component_obj, + int optname, void *optval, uint64_t *optlen) +{ + psm2_error_t rv; + + PSM2_LOG_MSG("entering"); + switch (component) { + case PSM2_COMPONENT_CORE: + rv = psmi_core_getopt(component_obj, optname, optval, optlen); + PSM2_LOG_MSG("leaving"); + return rv; + break; + case PSM2_COMPONENT_MQ: + /* Use the deprecated MQ set/get opt for now which does not use optlen */ + rv = psm2_mq_getopt((psm2_mq_t) component_obj, optname, optval); + PSM2_LOG_MSG("leaving"); + return rv; + break; + case PSM2_COMPONENT_AM: + /* Hand off to active messages */ + rv = psmi_am_getopt(component_obj, optname, optval, optlen); + PSM2_LOG_MSG("leaving"); + return rv; + break; + case PSM2_COMPONENT_IB: + /* Hand off to IPS ptl to set option */ + rv = psmi_ptl_ips.getopt(component_obj, optname, optval, + optlen); + PSM2_LOG_MSG("leaving"); + return rv; + break; + } + + /* Unrecognized/unknown component */ + rv = psmi_handle_error(NULL, PSM2_PARAM_ERR, "Unknown component %u", + component); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_getopt); + +psm2_error_t __psmi_poll_noop(ptl_t *ptl, int replyonly) +{ + PSM2_LOG_MSG("entering"); + PSM2_LOG_MSG("leaving"); + return PSM2_OK_NO_PROGRESS; +} +PSMI_API_DECL(psmi_poll_noop) + +psm2_error_t __psm2_poll(psm2_ep_t ep) +{ + psm2_error_t err1 = PSM2_OK, err2 = PSM2_OK; + psm2_ep_t tmp; + + PSM2_LOG_MSG("entering"); + + PSMI_ASSERT_INITIALIZED(); + + PSMI_LOCK(ep->mq->progress_lock); + + tmp = ep; + do { + err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0); /* poll reqs & reps */ + if (err1 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */ + 
PSMI_UNLOCK(ep->mq->progress_lock); + PSM2_LOG_MSG("leaving"); + return err1; + } + + err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0); /* get into ips_do_work */ + if (err2 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */ + PSMI_UNLOCK(ep->mq->progress_lock); + PSM2_LOG_MSG("leaving"); + return err2; + } + ep = ep->mctxt_next; + } while (ep != tmp); + + /* This is valid because.. + * PSM2_OK & PSM2_OK_NO_PROGRESS => PSM2_OK + * PSM2_OK & PSM2_OK => PSM2_OK + * PSM2_OK_NO_PROGRESS & PSM2_OK => PSM2_OK + * PSM2_OK_NO_PROGRESS & PSM2_OK_NO_PROGRESS => PSM2_OK_NO_PROGRESS */ + PSMI_UNLOCK(ep->mq->progress_lock); + PSM2_LOG_MSG("leaving"); + return (err1 & err2); +} +PSMI_API_DECL(psm2_poll) + +psm2_error_t __psmi_poll_internal(psm2_ep_t ep, int poll_amsh) +{ + psm2_error_t err1 = PSM2_OK_NO_PROGRESS; + psm2_error_t err2; + psm2_ep_t tmp; + + PSM2_LOG_MSG("entering"); + PSMI_LOCK_ASSERT(ep->mq->progress_lock); + + tmp = ep; + do { + if (poll_amsh) { + err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0); /* poll reqs & reps */ + if (err1 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */ + PSM2_LOG_MSG("leaving"); + return err1; + } + } + + err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0); /* get into ips_do_work */ + if (err2 > PSM2_OK_NO_PROGRESS) { /* some error unrelated to polling */ + PSM2_LOG_MSG("leaving"); + return err2; + } + + ep = ep->mctxt_next; + } while (ep != tmp); + PSM2_LOG_MSG("leaving"); + return (err1 & err2); +} +PSMI_API_DECL(psmi_poll_internal) +#ifdef PSM_PROFILE +/* These functions each have weak symbols */ +void psmi_profile_block() +{ + ; /* empty for profiler */ +} + +void psmi_profile_unblock() +{ + ; /* empty for profiler */ +} + +void psmi_profile_reblock(int did_no_progress) +{ + ; /* empty for profiler */ +} +#endif diff --git a/prov/psm3/psm3/psm2.h b/prov/psm3/psm3/psm2.h new file mode 100644 index 00000000000..0b880086d7b --- /dev/null +++ b/prov/psm3/psm3/psm2.h @@ -0,0 +1,1789 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef PSM2_H +#define PSM2_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * @file psm2.h + * @page psm2_main PSM2 API + * + * @brief PSM2 OPA Messaging Library + * + * The PSM2 OPA Messaging API, or PSM2 API, is Intel's low-level + * user-level communications interface for the OPA family of products. + * PSM2 users are enabled with mechanisms necessary to implement higher level + * communications interfaces in parallel environments. + * + * Since PSM2 targets clusters of multicore processors, it internally implements + * two levels of communication: intra-node shared memory communication and + * inter-node OPA communication. Both of these levels are encapsulated + * below the interface and the user is free to assume that intra-node and + * inter-node communication is transparently handled within PSM. + * + * @section compat Compatibility + * + * PSM2 can coexist with other QLogic/Pathscale software distributions, such as + * OpenIB/OpenFabrics, which allows applications to simultaneously target + * PSM-based and non PSM-based applications on a single node without changing + * any system-level configuration. However, PSM2 does not support running + * PSM-based and non PSM-based communication within the same user process. + * + * Except where noted, PSM2 does not assume an SPMD (single program, multiple + * data) parallel model and extends to MPMD (multiple program, multiple data) + * environments in specific areas. However, PSM2 assumes the runtime environment + * to be homogeneous on all nodes in bit width (32-bit or 64-bit) and endianness + * (little or big) and will fail at startup if any of these assumptions do not + * hold. For homogeneous systems PSM2 can run either in 32-bit or 64-bit + * environments. Even though both environments should expect similar + * performance from the API, PSM2 has chosen to favor 64-bit environments in + * some minor areas. + * + * @section ep_model Endpoint Communication Model + * + * PSM2 follows an endpoint communication model where an endpoint is defined as + * an object (or handle) instantiated to support sending and receiving messages + * to other endpoints. In order to prevent PSM2 from being tied to a particular + * parallel model (such as SPMD), control over the parallel layout of endpoints + * is retained by the user. Opening endpoints (@ref psm2_ep_open) and + * connecting endpoints to enable communication (@ref psm2_ep_connect) are two + * decoupled mechanisms. Users that do not dynamically change the number of + * endpoints beyond parallel startup will probably lump both mechanisms + * together at startup. 
Users that wish to manipulate the location and number + * of endpoints at runtime can do so by explicitly connecting sets or subsets + * of endpoints. + * + * As a side effect, this greater flexibility forces the user to cope with a + * two-stage initialization process. In the first stage of opening an endpoint + * (@ref psm2_ep_open), a user obtains an opaque handle to the endpoint and a + * globally distributable endpoint identifier (@ref psm2_epid_t). Prior to the + * second stage of connecting endpoints (@ref psm2_ep_connect), a user must + * distribute all relevent endpoint identifiers through an out-of-band + * mechanism. Once the endpoint identifiers are successfully distributed to + * all processes that wish to communicate, the user + * connects all endpoint identifiers to the locally opened endpoint + * (@ref psm2_ep_connect). In connecting the endpoints, the user obtains an + * opaque endpoint address (@ref psm2_epaddr_t), which is required for all PSM + * communication primitives. + * + * + * @section components PSM2 Components + * + * PSM2 exposes a single endpoint initialization model, but enables various + * levels of communication functionality and semantics through @e components. + * The first major component available in PSM2 is PSM2 Matched Queues + * (@ref psm2_mq), and the second is PSM2 Active Message (@ref psm2_am). + * + * Matched Queues (MQ) present a queue-based communication model with the + * distinction that queue consumers use a 3-tuple of metadata to match incoming + * messages against a list of preposted receive buffers. The MQ semantics are + * sufficiently akin to MPI to cover the entire MPI-1.2 standard. + * + * The Active Message (AM) component presents a request/reply model where + * the arrival of a message triggers the execution of consumer-provided + * handler code. This can be used to implement many one-sided and two-sided + * communications paradigms. + * + * With future releases of the PSM2 interface, more components will + * be exposed to accommodate users that implement parallel communication + * models that deviate from the Matched Queue semantics. For example, PSM + * plans to expose a connection management component to make it easier to + * handle endpoint management for clients without their own connection + * managers. + * + * + * @section progress PSM2 Communication Progress Guarantees + * + * PSM2 internally ensures progress of both intra-node and inter-node messages, + * but not autonomously. This means that while performance does not depend + * greatly on how the user decides to schedule communication progress, + * explicit progress calls are required for correctness. The @ref psm2_poll + * function is available to make progress over all PSM2 components in a generic + * manner. For more information on making progress over many communication + * operations in the MQ component, see the @ref mq_progress documentation. + * + * + * @section completion PSM2 Completion semantics + * + * PSM2 implements the MQ component, which documents its own + * message completion semantics (@ref mq_completion). + * + * + * @section error_handling PSM2 Error handling + * + * PSM2 exposes a list of user and runtime errors enumerated in @ref psm2_error. + * While most errors are fatal in that the user is not expected to be able to + * recover from them, PSM2 still allows some level of control. 
By + * default, PSM2 returns all errors to the user but as a convenience, allows + * users to either defer errors internally to PSM2 or to have PSM2 return all + * errors to the user (callers to PSM2 functions). PSM2 attempts to deallocate + * its resources as a best effort, but exits are always non-collective with + * respect to endpoints opened in other processes. The user is expected to be + * able to handle non-collective exits from any endpoint and in turn cleanly + * and independently terminate the parallel environment. Local error handling + * can be handled in three modes: + * + * Errors and error handling can be individually registered either globally or + * per-endpoint: + * @li @b Per-endpoint error handling captures errors for functions where the + * error scoping is determined to be over an endpoint. This includes all + * communication functions that include an EP or MQ handle as the first + * parameter. + * + * @li @b Global error handling captures errors for functions where a + * particular endpoint cannot be identified or for @ref psm2_ep_open, where + * errors (if any) occur before the endpoint is opened. + * + * Error handling is controlled by registering error handlers (@ref + * psm2_error_register_handler). The global error handler can + * be set at any time (even before @ref psm2_init), whereas a per-endpoint error + * handler can be set as soon as a new endpoint is successfully created. If a + * per-endpoint handle is not registered, the per-endpoint handler inherits + * from the global error handler at time of open. + * + * PSM2 predefines two different mechanisms for handling errors: + * + * @li PSM-internal error handler (@ref PSM2_ERRHANDLER_PSM_HANDLER) + * @li No-op PSM2 error handler where errors are returned + * (@ref PSM2_ERRHANDLER_NO_HANDLER) + * + * The default PSM-internal error handler effectively frees the user from + * explicitly handling the return values of ever PSM2 function but may not + * return to the user in a function determined to have caused a fatal error. + * + * The No-op PSM2 error handler bypasses all error handling functionality and + * always returns the error to the user. The user can then use @ref + * psm2_error_get_string to obtain a generic string from an error code (compared + * to a more detailed error message available through registering of error + * handlers). + * + * For even more control, users can register their own error handlers to have + * access to more precise error strings and selectively control when an when + * not to return to callers of PSM2 functions. All error handlers shown defer + * error handling to PSM2 for errors that are not recognized using @ref + * psm2_error_defer. Deferring an error from a custom error handler is + * equivalent to relying on the default error handler. + * + * @section env_var Environment variables + * + * Some PSM2 behaviour can be controlled via environment variables. + * + * @li @b PSM3_DEVICES. PSM2 implements three devices for communication which + * are, in order, @c self, @c shm and @c hfi. For PSM2 jobs that do not + * require shared-memory communications, @b PSM3_DEVICES can be specified as @c + * self, @c hfi. Similarly, for shared-memory only jobs, the @c hfi device + * can be disabled. It is up to the user to ensure that the endpoint ids + * passed in @ref psm2_ep_connect do not require a device that has been + * explicitly disabled by the user. In some instances, enabling only the + * devices that are required may improve performance. + * + * @li @b PSM2_TRACEMASK. 
Depending on the value of the tracemask, various parts + * of PSM2 will output debugging information. With a default value of @c 0x1, + * informative messages will be printed (this value should be considered a + * minimum). At @c 0x101, startup and finalization messages are added to the + * output. At @c 0x1c3, every communication event is logged and should hence + * be used for extreme debugging only. + * + * @li @b PSM3_MULTI_EP. By default, only one PSM2 endpoint may be opened in + * a process. With the correct setting of this environment variable, a process + * may open more than one PSM2 endpoint. In order to enable multiple endpoint + * per process support, the value of this environment variable should be set + * to "1" or "yes". + * + * @section thr_sfty Thread safety and reentrancy + * Unless specifically noted otherwise, all PSM2 functions should not be considered + * to be thread safe or reentrant. + */ + +/** @brief Local endpoint handle (opaque) + * @ingroup ep + * + * Handle returned to the user when a new local endpoint is created. The + * handle is a local handle to be used in all communication functions and is + * not intended to globally identify the opened endpoint in any way. + * + * All open endpoint handles can be globally identified using the endpoint id + * integral type (@ref psm2_epid_t) and all communication must use an endpoint + * address (@ref psm2_epaddr_t) that can be obtained by connecting a local + * endpoint to one or more endpoint identifiers. + * + * @remark The local endpoint handle is opaque to the user. */ +typedef struct psm2_ep *psm2_ep_t; + +/** @brief MQ handle (opaque) + * @ingroup mq + * + * Handle returned to the user when a new Matched queue is created (@ref + * psm2_mq_init). */ +typedef struct psm2_mq *psm2_mq_t; + +/*! @defgroup init PSM2 Initialization and Maintenance + * @{ + */ +#define PSM2_VERNO 0x0300 /*!< Header-defined Version number */ +#define PSM2_VERNO_MAJOR 0x03 /*!< Header-defined Major Version Number */ +#define PSM2_VERNO_MINOR 0x00 /*!< Header-defined Minor Version Number */ +#define PSM2_VERNO_COMPAT_MAJOR 0x02 /*! PSM2_VERNO_MAJOR) { + if (err) + fprintf(stderr, "PSM3 initialization failure: %s\n", + psm2_error_get_string(err)); + else + fprintf(stderr, "PSM3 loaded an unexpected/unsupported " + "version (%d.%d)\n", verno_major, verno_minor); + return -1; + } + + // We were able to initialize PSM2 but will defer all further error + // handling since most of the errors beyond this point will be fatal. + int err = psm2_error_register_handler(NULL, // Global handler + PSM2_ERRHANDLER_PSM_HANDLER); + if (err) { + fprintf(stderr, "Couldn't register global errhandler: %s\n", + psm2_error_get_string(err)); + return -1; + } + return 1; + } + @endcode + */ +psm2_error_t psm2_init(int *api_verno_major, int *api_verno_minor); + +/*! @brief PSM2 capabilities definitions + * + * Each capability is defined as a separate bit, + * i.e. next capabilities must be defined as + * consecutive bits : 0x2, 0x4 ... and so on. + */ +#define PSM2_MULTI_EP_CAP 0x1 /* Multiple Endpoints capability */ +#define PSM2_LIB_REFCOUNT_CAP 0x2 /* Library finalization is managed with reference count */ + +/** @brief PSM2 capabilities provider + * + * @param[in] req_cap_mask Requested capabilities are given as bit field. 
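+ *
+ * A minimal usage sketch (illustrative only, not part of the upstream
+ * header): checking whether this build of the library can open multiple
+ * endpoints per process before relying on @ref PSM3_MULTI_EP.
+ * @code{.c}
+   // Assumes psm2_init() has already completed successfully.
+   int supports_multi_ep(void)
+   {
+       return (psm2_get_capability_mask(PSM2_MULTI_EP_CAP) &
+               PSM2_MULTI_EP_CAP) != 0;
+   }
+   @endcode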
+ * + * @returns internal capabilities bit field ANDed with a requested bit mask */ +uint64_t psm2_get_capability_mask(uint64_t req_cap_mask); + +/** @brief Finalize PSM2 interface + * + * Single call to finalize PSM2 and close all unclosed endpoints + * + * @post The user guarantees not to make any further PSM2 calls, including @ref + * psm2_init. + * + * @returns PSM2_OK Always returns @c PSM2_OK */ +psm2_error_t psm2_finalize(void); + +/** @brief Error handling opaque token + * + * A token is required for users that register their own handlers and wish to + * defer further error handling to PSM. */ +typedef struct psm2_error_token *psm2_error_token_t; + +/** @brief Error handling function + * + * Users can handle errors explicitly instead of relying on PSM's own error + * handler. There is one global error handler and error handlers that can be + * individually set for each opened endpoint. By default, endpoints will + * inherit the global handler registered at the time of open. + * + * @param[in] ep Handle associated to the endpoint over which the error occurred + * or @c NULL if the error is being handled by the global error + * handler. + * @param[in] error PSM2 error identifier + * @param[in] error_string A descriptive error string of maximum length @ref + * PSM2_ERRSTRING_MAXLEN. + * @param[in] token Opaque PSM2 token associated with the particular event that + * generated the error. The token can be used to extract the + * error string and can be passed to @ref psm2_error_defer to + * defer any remaining or unhandled error handling to PSM. + * + * @post If the error handler returns, the error returned is propagated to the + * caller. */ +typedef psm2_error_t(*psm2_ep_errhandler_t) (psm2_ep_t ep, + const psm2_error_t error, + const char *error_string, + psm2_error_token_t token); + +#define PSM2_ERRHANDLER_DEFAULT ((psm2_ep_errhandler_t)-1) +/**< Obsolete names, only here for backwards compatibility */ +#define PSM2_ERRHANDLER_NOP ((psm2_ep_errhandler_t)-2) +/**< Obsolete names, only here for backwards compatibility */ + +#define PSM2_ERRHANDLER_PSM_HANDLER ((psm2_ep_errhandler_t)-1) +/**< PSM2 error handler as explained in @ref error_handling */ + +#define PSM2_ERRHANDLER_NO_HANDLER ((psm2_ep_errhandler_t)-2) +/**< Bypasses the default PSM2 error handler and returns all errors to the user + * (this is the default) */ + +#define PSM2_ERRSTRING_MAXLEN 512 /**< Maximum error string length. */ + +/** @brief PSM2 error handler registration + * + * Function to register error handlers on a global basis and on a per-endpoint + * basis. PSM2_ERRHANDLER_PSM_HANDLER and PSM2_ERRHANDLER_NO_HANDLER are special + * pre-defined handlers to respectively enable use of the default PSM-internal + * handler or the no-handler that disables registered error handling and + * returns all errors to the caller (both are documented in @ref + * error_handling). + * + * @param[in] ep Handle of the endpoint over which the error handler should be + * registered. With ep set to @c NULL, the behavior of the + * global error handler can be controlled. + * @param[in] errhandler Handler to register. Can be a user-specific error + * handling function or PSM2_ERRHANDLER_PSM_HANDLER or + * PSM2_ERRHANDLER_NO_HANDLER. 
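+ *
+ * An illustrative sketch (not part of the upstream header) of a user-defined
+ * global handler that logs the error and then defers any remaining handling
+ * back to PSM2 with @ref psm2_error_defer:
+ * @code{.c}
+   static psm2_error_t my_errhandler(psm2_ep_t ep, const psm2_error_t error,
+                                     const char *error_string,
+                                     psm2_error_token_t token)
+   {
+       fprintf(stderr, "PSM2 error on ep %p: %s\n", (void *)ep, error_string);
+       return psm2_error_defer(token);  // let PSM2 finish handling the error
+   }
+
+   // Register as the global handler (ep == NULL); endpoints opened later
+   // inherit it unless they register their own per-endpoint handler.
+   psm2_error_register_handler(NULL, my_errhandler);
+   @endcode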
+ * + * @remark When ep is set to @c NULL, this is the only function that can be + * called before @ref psm2_init + */ +psm2_error_t +psm2_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler); + +/** @brief PSM2 deferred error handler + * + * Function to handle fatal PSM2 errors if no error handler is installed or if + * the user wishes to defer further error handling to PSM. Depending on the + * type of error, PSM2 may or may not return from the function call. + * + * @param[in] err_token Error token initially passed to error handler + * + * @pre The user is calling into the function because it has decided that PSM + * should handle an error case. + * + * @post The function may or may not return depending on the error + */ +psm2_error_t psm2_error_defer(psm2_error_token_t err_token); + +/** @brief Get generic error string from error + * + * Function to return the default error string associated to a PSM2 error. + * + * While a more detailed and precise error string is usually available within + * error handlers, this function is available to obtain an error string out of + * an error handler context or when a no-op error handler is registered. + * + * @param[in] error PSM2 error + */ +const char *psm2_error_get_string(psm2_error_t error); + +/** @brief Option key/pair structure + * + * Currently only used in MQ. + */ +struct psm2_optkey { + uint32_t key; /**< Option key */ + void *value; /**< Option value */ +}; + +/*! @} */ + +/*! @defgroup ep PSM2 Device Endpoint Management + * @{ + */ + +/** @brief Endpoint ID + * + * Integral type of size 8 bytes that can be used by the user to globally + * identify a successfully opened endpoint. Although the contents of the + * endpoint id integral type remains opaque to the user, unique network id and + * OPA port number can be extracted using @ref psm2_epid_nid and @ref + * psm2_epid_context. + */ +typedef uint64_t psm2_epid_t; + +/** @brief Endpoint Address (opaque) + * + * Remote endpoint addresses are created when the user binds an endpoint ID + * to a particular endpoint handle using @ref psm2_ep_connect. A given endpoint + * address is only guaranteed to be valid over a single endpoint. + */ +typedef struct psm2_epaddr *psm2_epaddr_t; + +/** @brief PSM2 Unique UID + * + * PSM2 type equivalent to the DCE-1 uuid_t, used to uniquely identify an + * endpoint within a particular job. Since PSM2 does not participate in job + * allocation and management, users are expected to generate a unique ID to + * associate endpoints to a particular parallel or collective job. + * @see psm2_uuid_generate + */ +typedef uint8_t psm2_uuid_t[16]; + +/** @brief Get Endpoint identifier's Unique Network ID */ +uint64_t psm2_epid_nid(psm2_epid_t epid); + +/** @brief Get Endpoint identifier's OPA context number */ +uint64_t psm2_epid_context(psm2_epid_t epid); + +/** @brief Get Endpoint identifier's OPA port (deprecated, use + * @ref psm2_epid_context instead) */ +uint64_t psm2_epid_port(psm2_epid_t epid); + +/** @brief List the number of available OPA units + * + * Function used to determine the number of locally available OPA units. + * For @c N units, valid unit numbers in @ref psm2_ep_open are @c 0 to @c N-1. + * + * @returns PSM2_OK unless the user has not called @ref psm2_init + */ +psm2_error_t psm2_ep_num_devunits(uint32_t *num_units); + +/** @brief Utility to generate UUIDs for @ref psm2_ep_open + * + * This function is available as a utility for generating unique job-wide ids. 
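+ *
+ * A small sketch (illustrative only; my_rank, oob_broadcast and oob_receive
+ * are hypothetical helpers provided by the application or launcher) of one
+ * process generating the job key and distributing it out of band:
+ * @code{.c}
+   psm2_uuid_t job_uuid;
+
+   if (my_rank == 0) {
+       psm2_uuid_generate(job_uuid);
+       oob_broadcast(job_uuid, sizeof(psm2_uuid_t));   // hypothetical helper
+   } else {
+       oob_receive(job_uuid, sizeof(psm2_uuid_t));     // hypothetical helper
+   }
+   // Every process now passes the same job_uuid to psm2_ep_open().
+   @endcode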
+ * See discussion in @ref psm2_ep_open for further information. + * + * @remark This function does not require PSM2 to be initialized. + */ +void psm2_uuid_generate(psm2_uuid_t uuid_out); + +/* Affinity modes for the affinity member of struct psm2_ep_open_opts */ +#define PSM2_EP_OPEN_AFFINITY_SKIP 0 /**< Disable setting affinity */ +#define PSM2_EP_OPEN_AFFINITY_SET 1 /**< Enable setting affinity unless + already set */ +#define PSM2_EP_OPEN_AFFINITY_FORCE 2 /**< Enable setting affinity regardless + of current affinity setting */ + +/* Default values for some constants */ +#define PSM2_EP_OPEN_PKEY_DEFAULT 0xffffffffffffffffULL + /**< Default protection key */ + +/** @brief Endpoint Open Options + * + * These options are available for opening a PSM2 endpoint. Each is + * individually documented and setting each option to -1 or passing NULL as the + * options parameter in @ref psm2_ep_open instructs PSM2 to use + * implementation-defined defaults. + * + * Each option is documented in @ref psm2_ep_open + */ +struct psm2_ep_open_opts { + int64_t timeout; /**< timeout in nanoseconds to open device */ + int unit; /**< OPA Unit ID to open on */ + int affinity; /**< How PSM2 should set affinity */ + int shm_mbytes; /**< Megabytes used for intra-node, deprecated */ + int sendbufs_num; /**< Preallocated send buffers */ + uint64_t network_pkey; /**< Network Protection Key (v1.01) */ + int port; /**< IB port to use (1 to N) */ + int outsl; /**< IB SL to use when sending pkts */ + uint64_t service_id; /* IB Service ID to use for endpoint */ + psm2_path_res_t path_res_type; /* Path resolution type */ + int senddesc_num; /* Preallocated send descriptors */ + int imm_size; /* Immediate data size for endpoint */ +}; + +/** @brief OPA endpoint creation + * + * Function used to create a new local communication endpoint on an OPA + * adapter. The returned endpoint handle is required in all PSM2 communication + * operations, as PSM2 can manage communication over multiple endpoints. An + * opened endpoint has no global context until the user connects the endpoint + * to other global endpoints by way of @ref psm2_ep_connect. All local endpoint + * handles are globally identified by endpoint IDs (@ref psm2_epid_t) which are + * also returned when an endpoint is opened. It is assumed that the user can + * provide an out-of-band mechanism to distribute the endpoint IDs in order to + * establish connections between endpoints (@ref psm2_ep_connect for more + * information). + * + * @param[in] unique_job_key Endpoint key, to uniquely identify the endpoint in + * a parallel job. It is up to the user to ensure + * that the key is globally unique over a period long + * enough to prevent duplicate keys over the same set + * of endpoints (see comments below). + * + * @param[in] opts Open options of type @ref psm2_ep_open_opts + * (see @ref psm2_ep_open_opts_get_defaults). + * + * @param[out] ep User-supplied storage to return a pointer to the newly + * created endpoint. The returned pointer of type @ref psm2_ep_t + * is a local handle and cannot be used to globally identify the + * endpoint. + * @param[out] epid User-supplied storage to return the endpoint ID associated + * to the newly created local endpoint returned in the @c ep + * handle. The endpoint ID is an integral type suitable for + * uniquely identifying the local endpoint. + * + * PSM2 does not internally verify the consistency of the uuid, it is up to the + * user to ensure that the uid is unique enough not to collide with other + * currently-running jobs. 
Users can employ three mechanisms to obtain a uuid. + * + * 1. Use the supplied @ref psm2_uuid_generate utility + * + * 2. Use an OS or library-specific uuid generation utility, that complies with + * OSF DCE 1.1, such as @c uuid_generate on Linux or @c uuid_create on + * FreeBSD. + * (see http://www.opengroup.org/onlinepubs/009629399/uuid_create.htm) + * + * 3. Manually pack a 16-byte string using a utility such as /dev/random or + * other source with enough entropy and proper seeding to prevent two nodes + * from generating the same uuid_t. + * + * The following options are relevent when opening an endpoint: + * @li @c timeout establishes the number of nanoseconds to wait before + * failing to open a port (with -1, defaults to 15 secs). + * @li @c unit sets the OPA unit number to use to open a port (with + * -1, PSM2 determines the best unit to open the port). If @c + * PSM3_NIC is set in the environment, this setting is ignored. + * @li @c affinity enables or disables PSM2 setting processor affinity. The + * option can be controlled to either disable (@ref + * PSM2_EP_OPEN_AFFINITY_SKIP) or enable the affinity setting + * only if it is already unset (@ref + * PSM2_EP_OPEN_AFFINITY_SET) or regardless of affinity being + * set or not (@ref PSM2_EP_OPEN_AFFINITY_FORCE). + * If @c PSM3_NO_CPUAFFINITY is set in the environment, this + * setting is ignored. + * @li @c shm_mbytes sets a maximum number of megabytes that can be allocated + * to each local endpoint ID connected through this + * endpoint (with -1, defaults to 10 MB). + * @li @c sendbufs_num sets the number of send buffers that can be + * pre-allocated for communication (with -1, defaults to + * 512 buffers of MTU size). + * @li @c network_pkey sets the protection key to employ for point-to-point + * PSM2 communication. Unless a specific value is used, + * this parameter should be set to + * PSM2_EP_OPEN_PKEY_DEFAULT. + * + * @warning By default, PSM2 limits the user to calling @ref psm2_ep_open only + * once per process and subsequent calls will fail. In order to enable creation + * of multiple endoints per process, one must properly set the environment variable + * @ref PSM3_MULTI_EP before calling @ref psm2_init. + * + * @code{.c} + // In order to open an endpoint and participate in a job, each endpoint has + // to be distributed a unique 16-byte UUID key from an out-of-band source. + // Presumably this can come from the parallel spawning utility either + // indirectly through an implementors own spawning interface or as in this + // example, the UUID is set as a string in an environment variable + // propagated to all endpoints in the job. + + int try_to_open_psm2_endpoint(psm2_ep_t *ep, // output endpoint handle + psm2_epid_t *epid, // output endpoint identifier + int unit) // unit of our choice + { + struct psm2_ep_open_opts epopts; + psm2_uuid_t job_uuid; + char *c; + + // Let PSM2 assign its default values to the endpoint options. + psm2_ep_open_opts_get_defaults(&epopts); + + // We want a stricter timeout and a specific unit + epopts.timeout = 15*1e9; // 15 second timeout + epopts.unit = unit; // We want a specific unit, -1 would let PSM + // choose the unit for us. + epopts.port = port; // We want a specific unit, <= 0 would let PSM + // choose the port for us. + // We've already set affinity, don't let PSM2 do so if it wants to. 
+	if (epopts.affinity == PSM2_EP_OPEN_AFFINITY_SET)
+	    epopts.affinity = PSM2_EP_OPEN_AFFINITY_SKIP;
+
+	// ENDPOINT_UUID is set to the same value in the environment of all the
+	// processes that wish to communicate over PSM2 and was generated by
+	// the process spawning utility
+	c = getenv("ENDPOINT_UUID");
+	if (c && *c)
+	    implementor_string_to_16byte_packing(c, job_uuid);
+	else {
+	    fprintf(stderr, "Can't find UUID for endpoint\n");
+	    return -1;
+	}
+
+	// Assume we don't want to handle errors here.
+	psm2_ep_open(job_uuid, &epopts, ep, epid);
+	return 1;
+   }
+   @endcode
+ */
+psm2_error_t
+psm2_ep_open(const psm2_uuid_t unique_job_key,
+	     const struct psm2_ep_open_opts *opts, psm2_ep_t *ep,
+	     psm2_epid_t *epid);
+
+/** @brief Endpoint open default options.
+ *
+ * Function used to initialize the set of endpoint options to their default
+ * values for use in @ref psm2_ep_open.
+ *
+ * @param[out] opts Endpoint Open options.
+ *
+ * @warning For portable operation, users should always call this function
+ * prior to calling @ref psm2_ep_open.
+ *
+ * @return PSM2_OK If result could be updated
+ * @return PSM2_INIT_NOT_INIT If PSM2 has not been initialized.
+ */
+psm2_error_t
+psm2_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts);
+
+/** @brief Endpoint shared memory query
+ *
+ * Function used to determine if a remote endpoint shares memory with a
+ * currently opened local endpoint.
+ *
+ * @param[in] ep Endpoint handle
+ * @param[in] epid Endpoint ID
+ *
+ * @param[out] result Result is non-zero if the remote endpoint shares memory with the local
+ *                    endpoint @c ep, or zero otherwise.
+ *
+ * @return PSM2_OK If result could be updated
+ * @return PSM2_EPID_UNKNOWN If the epid is not recognized
+ */
+psm2_error_t
+psm2_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result);
+
+/** @brief Close endpoint
+ * @param[in] ep PSM2 endpoint handle
+ * @param[in] mode One of @ref PSM2_EP_CLOSE_GRACEFUL or @ref PSM2_EP_CLOSE_FORCE
+ * @param[in] timeout How long to wait in nanoseconds if mode is
+ *                    PSM2_EP_CLOSE_GRACEFUL, 0 waits forever. If @c mode is
+ *                    @ref PSM2_EP_CLOSE_FORCE, this parameter is ignored.
+ *
+ * The following errors are returned, others are handled by the per-endpoint
+ * error handler:
+ *
+ * @return PSM2_OK Endpoint was successfully closed without force or
+ *                 successfully closed with force within the supplied timeout.
+ * @return PSM2_EP_CLOSE_TIMEOUT Endpoint could not be successfully closed
+ *                               within timeout.
+ */
+psm2_error_t psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout);
+
+#define PSM2_EP_CLOSE_GRACEFUL 0 /**< Graceful mode in @ref psm2_ep_close */
+#define PSM2_EP_CLOSE_FORCE 1 /**< Forceful mode in @ref psm2_ep_close */
+
+/** @brief Provide mappings for network id to hostname
+ *
+ * Since PSM2 does not assume or rely on the availability of an external
+ * networkid-to-hostname mapping service, users can provide one or more of
+ * these mappings. The @ref psm2_map_nid_hostname function allows a list of
+ * network ids to be associated with hostnames.
+ *
+ * This function is not mandatory for correct operation but may allow PSM2 to
+ * provide better diagnostics when remote endpoints are unavailable and can
+ * otherwise only be identified by their network id.
+ *
+ * @param[in] num Number of elements in @c nid and @c hostnames arrays
+ * @param[in] nids User-provided array of network ids (i.e. OPA LIDs),
+ *                 should be obtained by calling @ref psm2_epid_nid on each
+ *                 epid.
+ * @param[in] hostnames User-provided array of hostnames (array of + * NUL-terimated strings) where each hostname index + * maps to the provided nid hostname. + * + * @warning Duplicate nids may be provided in the input @c nids array, only + * the first corresponding hostname will be remembered. + * + * @pre The user may or may not have already provided a hostname mappings. + * @post The user may free any dynamically allocated memory passed to the + * function. + * + */ +psm2_error_t +psm2_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames); + +/** @brief Connect one or more remote endpoints to a local endpoint + * + * Function to non-collectively establish a connection to a set of endpoint IDs + * and translate endpoint IDs into endpoint addresses. Establishing a remote + * connection with a set of remote endpoint IDs does not imply a collective + * operation and the user is free to connect unequal sets on each process. + * Similarly, a given endpoint address does not imply that a pairwise + * communication context exists between the local endpoint and remote endpoint. + * + * @param[in] ep PSM2 endpoint handle + * + * @param[in] num_of_epid The number of endpoints to connect to, which + * also establishes the number of elements contained in + * all of the function's array-based parameters. + * + * @param[in] array_of_epid User-allocated array that contains @c num_of_epid + * valid endpoint identifiers. Each endpoint id (or + * epid) has been obtained through an out-of-band + * mechanism and each endpoint must have been opened + * with the same uuid key. + * + * @param[in] array_of_epid_mask User-allocated array that contains + * @c num_of_epid integers. This array of masks + * allows users to select which of the epids in @c + * array_of_epid should be connected. If the integer + * at index i is zero, psm does not attempt to connect + * to the epid at index i in @c array_of_epid. If + * this parameter is NULL, psm will try to connect to + * each epid. + * + * @param[out] array_of_errors User-allocated array of at least @c num_of_epid + * elements. If the function does not return + * PSM2_OK, this array can be consulted for each + * endpoint not masked off by @c array_of_epid_mask + * to know why the endpoint could not be connected. + * Endpoints that could not be connected because of + * an unrelated failure will be marked as @ref + * PSM2_EPID_UNKNOWN. If the function returns + * PSM2_OK, the errors for all endpoints will also + * contain PSM2_OK. + * + * @param[out] array_of_epaddr User-allocated array of at least @c num_of_epid + * elements of type psm2_epaddr_t. Each + * successfully connected endpoint is updated with + * an endpoint address handle that corresponds to + * the endpoint id at the same index in @c + * array_of_epid. Handles are only updated if the + * endpoint could be connected and if its error in + * array_of_errors is PSM2_OK. + * + * @param[in] timeout Timeout in nanoseconds after which connection attempts + * will be abandoned. Setting this value to 0 disables + * timeout and waits until all endpoints have been + * successfully connected or until an error is detected. + * + * @pre The user has opened a local endpoint and obtained a list of endpoint + * IDs to connect to a given endpoint handle using an out-of-band + * mechanism not provided by PSM. + * + * @post If the connect is successful, @c array_of_epaddr is updated with valid + * endpoint addresses. 
+ *
+ * @post If unsuccessful, the user can query the return status of each
+ *       individual remote endpoint in @c array_of_errors.
+ *
+ * @post The user can call into @ref psm2_ep_connect many times with the same
+ *       endpoint ID and the function is guaranteed to return the same output
+ *       parameters.
+ *
+ * @post PSM2 does not keep any reference to the arrays passed into the
+ *       function and the caller is free to deallocate them.
+ *
+ * The error value with the highest importance is returned by
+ * the function if some portion of the communication failed. Users should
+ * always refer to individual errors in @c array_of_errors whenever the
+ * function cannot return PSM2_OK.
+ *
+ * @returns PSM2_OK The entire set of endpoint IDs were successfully connected
+ *                  and endpoint addresses are available for all endpoint IDs.
+ *
+ * @code{.c}
+   int connect_endpoints(psm2_ep_t ep, int numep,
+                         const psm2_epid_t *array_of_epid,
+                         psm2_epaddr_t **array_of_epaddr_out)
+   {
+       psm2_error_t *errors = (psm2_error_t *) calloc(numep, sizeof(psm2_error_t));
+       if (errors == NULL)
+           return -1;
+
+       psm2_epaddr_t *all_epaddrs =
+                (psm2_epaddr_t *) calloc(numep, sizeof(psm2_epaddr_t));
+
+       if (all_epaddrs == NULL) {
+           free(errors);
+           return -1;
+       }
+
+       psm2_ep_connect(ep, numep, array_of_epid,
+                       NULL, // We want to connect all epids, no mask needed
+                       errors,
+                       all_epaddrs,
+                       30 * 1e9); // 30 second timeout, 0 waits forever
+       *array_of_epaddr_out = all_epaddrs;
+       free(errors);
+       return 1;
+   }
+   @endcode
+ */
+psm2_error_t
+psm2_ep_connect(psm2_ep_t ep, int num_of_epid, const psm2_epid_t *array_of_epid,
+                const int *array_of_epid_mask, psm2_error_t *array_of_errors,
+                psm2_epaddr_t *array_of_epaddr, int64_t timeout);
+
+/* @brief Disconnect one or more remote endpoints from a local endpoint.
+*
+* Function to non-collectively disconnect a connection to a set of endpoint
+* addresses and free the endpoint addresses. After disconnecting, the
+* application cannot send messages to the remote processes and PSM2 is
+* restored back to the state before calling psm2_ep_connect. The application
+* must call psm2_ep_connect to establish the connections again.
+*
+* This function is equivalent to calling psm2_ep_disconnect2() with mode ==
+* PSM2_EP_DISCONNECT_GRACEFUL.
+*
+* @param[in] ep PSM2 endpoint handle
+*
+* @param[in] num_of_epaddr The number of endpoint addresses to disconnect from,
+*                          which also indicates the number of elements contained
+*                          in all of the function's array-based parameters.
+*
+* @param[in] array_of_epaddr User-allocated array that contains num_of_epaddr
+*                            valid endpoint addresses. Each endpoint address (or
+*                            epaddr) has been obtained through a previous
+*                            psm2_ep_connect call.
+*
+* @param[in] array_of_epaddr_mask User-allocated array that contains
+*                                 num_of_epaddr integers. This array of masks
+*                                 allows users to select which of the
+*                                 epaddresses in array_of_epaddr should be
+*                                 disconnected. If the integer at index i is
+*                                 zero, PSM2 does not attempt to disconnect from
+*                                 the epaddr at index i in array_of_epaddr. If
+*                                 this parameter is NULL, PSM2 tries to
+*                                 disconnect all epaddr in array_of_epaddr.
+*
+* @param[out] array_of_errors User-allocated array of at least num_of_epaddr
+*                             elements. If the function does not return PSM2_OK,
+*                             this array can be consulted for each endpoint
+*                             address not masked off by array_of_epaddr_mask to
+*                             know why the endpoint could not be disconnected.
+*                             Any endpoint address that could not be
+*                             disconnected because of an unrelated failure is
+*                             marked as PSM2_EPID_UNKNOWN.
If the function +* returns PSM2_OK, the errors for all endpoint +* addresses also contain PSM2_OK. +* +* @param[in] timeout Timeout in nanoseconds after which disconnection attempts +* are abandoned. Setting this value to 0 disables timeout and +* waits until all endpoints have been successfully +* disconnected or until an error is detected. +* +* @pre You have established the connections with previous psm2_ep_connect calls. +* +* @post If the disconnect is successful, the corresponding epaddr in +* array_of_epaddr is reset to NULL pointer. +* +* @post If unsuccessful, you can query the return status of each individual +* remote endpoint in array_of_errors. +* +* @post PSM2 does not keep any reference to the arrays passed into the function +* and the caller is free to deallocate them. +* +* @post The error value with the highest importance is returned by the function +* if some portion of the communication failed. Refer to individual errors +* in array_of_errors whenever the function cannot return PSM2_OK. +* +* @returns PSM2_OK The entire set of endpoint IDs were successfully disconnected +* and endpoint addresses are freed by PSM2. +* +* @code{.c} +int disconnect_endpoints(psm2_ep_t ep, int num_epaddr, + const psm2_epaddr_t *array_of_epaddr) +{ + psm2_error_t *errors = + (psm2_error_t *)calloc(num_epaddr, sizeof(psm2_error_t)); + if (errors == NULL) + return -1; + psm2_ep_disconnect( + ep, num_epaddr, array_of_epaddr, + NULL, // We want to disconnect all epaddrs, no mask needed, + errors, + 30 * e9); // 30 second timeout, <1 ns is forever + free(errors); + return 1; +} +@endcode +*/ +psm2_error_t psm2_ep_disconnect(psm2_ep_t ep, int num_of_epaddr, + psm2_epaddr_t *array_of_epaddr, + const int *array_of_epaddr_mask, + psm2_error_t *array_of_errors, int64_t timeout); + +/* @brief Disconnect one or more remote endpoints from a local endpoint. +* +* Function to non-collectively disconnect a connection to a set of endpoint +* addresses and free the endpoint addresses. After disconnecting, the +* application cannot send messages to the remote processes and PSM2 is +* restored back to the state before calling psm2_ep_connect. The application +* must call psm2_ep_connect to establish the connections again. +* +* @param[in] ep PSM2 endpoint handle +* +* @param[in] num_of_epaddr The number of endpoint addresses to disconnect from, +* which also indicates the number of elements contained +* in all of the function’s array-based parameters. +* +* @param[in] array_of_epaddr User-allocated array that contains num_of_epaddr +* valid endpoint addresses. Each endpoint address (or +* epaddr) has been obtained through a previous +* psm2_ep_connect call. +* +* @param[in] array_of_epaddr_mask User-allocated array that contains +* num_of_epaddr integers. This array of masks +* allows users to select which of the +* epaddresses in array_of_epaddr should be +* disconnected. If the integer at index i is +* zero, PSM2 does not attempt to disconnect to +* the epaddr at index i in array_of_epaddr. If +* this parameter is NULL, PSM2 tries to +* disconnect all epaddr in array_of_epaddr. +* +* @param[out] array_of_errors User-allocated array of at least num_of_epaddr +* elements. If the function does not return PSM2_OK, +* this array can be consulted for each endpoint +* address not masked off by array_of_epaddr_mask to +* know why the endpoint could not be disconnected. +* Any endpoint address that could not be +* disconnected because of an unrelated failure is +* marked as PSM2_EPID_UNKNOWN. 
If the function
+*                             returns PSM2_OK, the errors for all endpoint
+*                             addresses also contain PSM2_OK.
+*
+* @param[in] mode One of @ref PSM2_EP_DISCONNECT_GRACEFUL or @ref PSM2_EP_DISCONNECT_FORCE
+*
+* @param[in] timeout Timeout in nanoseconds after which disconnection attempts
+*                    are abandoned. Setting this value to 0 disables timeout and
+*                    waits until all endpoints have been successfully
+*                    disconnected or until an error is detected. Supplying a
+*                    negative value here sets the disconnection mode to "force".
+*
+* @pre You have established the connections with previous psm2_ep_connect calls.
+*
+* @post If the disconnect is successful, the corresponding epaddr in
+*       array_of_epaddr is reset to NULL pointer.
+*
+* @post If unsuccessful, you can query the return status of each individual
+*       remote endpoint in array_of_errors.
+*
+* @post PSM2 does not keep any reference to the arrays passed into the function
+*       and the caller is free to deallocate them.
+*
+* @post The error value with the highest importance is returned by the function
+*       if some portion of the communication failed. Refer to individual errors
+*       in array_of_errors whenever the function cannot return PSM2_OK.
+*
+* @returns PSM2_OK The entire set of endpoint IDs were successfully disconnected
+*                  and endpoint addresses are freed by PSM2.
+*
+* @code{.c}
+int disconnect_endpoints(psm2_ep_t ep, int num_epaddr,
+                         const psm2_epaddr_t *array_of_epaddr)
+{
+    psm2_error_t *errors =
+        (psm2_error_t *)calloc(num_epaddr, sizeof(psm2_error_t));
+    if (errors == NULL)
+        return -1;
+    psm2_ep_disconnect2(
+        ep, num_epaddr, array_of_epaddr,
+        NULL, // We want to disconnect all epaddrs, no mask needed
+        errors,
+        PSM2_EP_DISCONNECT_GRACEFUL,
+        30 * 1e9); // 30 second timeout, 0 waits forever
+    free(errors);
+    return 1;
+}
+@endcode
+*/
+psm2_error_t psm2_ep_disconnect2(psm2_ep_t ep, int num_of_epaddr,
+                                 psm2_epaddr_t *array_of_epaddr,
+                                 const int *array_of_epaddr_mask,
+                                 psm2_error_t *array_of_errors,
+                                 int mode, int64_t timeout);
+
+#define PSM2_EP_DISCONNECT_GRACEFUL PSM2_EP_CLOSE_GRACEFUL /**< Graceful mode in @ref psm2_ep_disconnect2 */
+#define PSM2_EP_DISCONNECT_FORCE PSM2_EP_CLOSE_FORCE /**< Forceful mode in @ref psm2_ep_disconnect2 */
+
+/** @brief Ensure endpoint communication progress
+ *
+ * Function to ensure progress for all PSM2 components instantiated on an
+ * endpoint (currently, this only includes the MQ component). The function
+ * never blocks and is typically required in two cases:
+ *
+ * @li Allowing all PSM2 components instantiated over a given endpoint to make
+ *     communication progress. Refer to @ref mq_progress for a detailed
+ *     discussion on MQ-level progress issues.
+ *
+ * @li Cases where users write their own synchronization primitives that
+ *     depend on remote communication (such as spinning on a memory location
+ *     whose value depends on ongoing communication).
+ *
+ * The poll function doesn't block, but the user can rely on the @ref
+ * PSM2_OK_NO_PROGRESS return value to control polling behaviour in terms of
+ * frequency (poll until an event happens) or execution environment (poll for a
+ * while but yield to other threads if CPUs are oversubscribed).
+ *
+ * @returns PSM2_OK Some communication events were progressed
+ * @returns PSM2_OK_NO_PROGRESS Polling did not yield any communication progress
+ *
+ */
+psm2_error_t psm2_poll(psm2_ep_t ep);
+
+/** @brief Set a user-determined ep address label.
+ * + * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect + * @param[in] epaddr_label_string User-allocated string to print when + * identifying endpoint in error handling or other verbose + * printing. The NULL-terminated string must be allocated by + * the user since PSM2 only keeps a pointer to the label. If + * users do not explicitly set a label for each endpoint, + * endpoints will identify themselves as hostname:port. + */ +void psm2_epaddr_setlabel(psm2_epaddr_t epaddr, + const char *epaddr_label_string); + +/** @brief Set a user-determined ep address context. + * + * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect + * @param[in] ctxt Opaque user defined state to associate with an endpoint + * address. This state can be retrieved via + * @ref psm2_epaddr_getctxt. + */ +void +psm2_epaddr_setctxt(psm2_epaddr_t epaddr, void *ctxt); + +/** @brief Get the user-determined ep address context. Users can associate an + * opaque context with each endpoint via @ref psm2_epaddr_setctxt. + * + * @param[in] epaddr Endpoint address, obtained from @ref psm2_ep_connect. + */ +void *psm2_epaddr_getctxt(psm2_epaddr_t epaddr); + +/* Below are all component specific options. The component object for each of + * the options is also specified. + */ + +/* PSM2_COMPONENT_CORE options */ +/* PSM2 debug level */ +#define PSM2_CORE_OPT_DEBUG 0x101 + /**< [@b uint32_t ] Set/Get the PSM2 debug level. This option can be set + * before initializing the PSM2 library. + * + * component object: (null) + * option value: PSM2 Debug mask to set or currently active debug level. + */ + +/* PSM2 endpoint address context */ +#define PSM2_CORE_OPT_EP_CTXT 0x102 + /**< [@b uint32_t ] Set/Get the context associated with a PSM2 endpoint + * address (psm2_epaddr_t). + * + * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. + * option value: Context associated with PSM2 endpoint address. + */ + +/* PSM2_COMPONENT_IB options */ +/* Default service level to use to communicate with remote endpoints */ +#define PSM2_IB_OPT_DF_SL 0x201 + /**< [@b uint32_t ] Default OPA SL to use for all remote communication. + * If unset defaults to Service Level 0. + * + * component object: Opened PSM2 endpoint id (@ref psm2_ep_t). + * option value: Default IB SL to use for endpoint. (0 <= SL < 15) + */ + +/* Set IB service level to use for communication to an endpoint */ +#define PSM2_IB_OPT_EP_SL 0x202 + /**< [@b uint32_t ] OPA SL to use for communication to specified + * remote endpoint. + * + * component object: PSM2 endpoint (@ ref psm2_epaddr_t) address. + * option value: SL used to communicate with remote endpoint. (0 <= SL < 15) + */ + +/* PSM2_COMPONENT_MQ options (deprecates psm2_mq_set|getopt) */ +/* MQ options that can be set in psm2_mq_init and psm2_{set,get}_opt */ +#define PSM2_MQ_OPT_RNDV_IB_SZ 0x301 + /**< [@b uint32_t ] Size at which to start enabling rendezvous + * messaging for OPA messages (if unset, defaults to values + * between 56000 and 72000 depending on the system configuration) + * + * component object: PSM2 Matched Queue (@ref psm2_mq_t). + * option value: Size at which to switch to rendezvous protocol. + */ +#define PSM2_MQ_RNDV_HFI_SZ PSM2_MQ_OPT_RNDV_IB_SZ +#define PSM2_MQ_RNDV_IPATH_SZ PSM2_MQ_OPT_RNDV_IB_SZ + +#define PSM2_MQ_OPT_RNDV_SHM_SZ 0x302 +#define PSM2_MQ_RNDV_SHM_SZ PSM2_MQ_OPT_RNDV_SHM_SZ + /**< [@b uint32_t ] Size at which to start enabling + * rendezvous messaging for shared memory (intra-node) messages (If + * unset, defaults to 64000 bytes). 
+ * + * component object: PSM2 Matched Queue (@ref psm2_mq_t). + * option value: Size at which to switch to rendezvous protocol. + */ + +#define PSM2_MQ_OPT_SYSBUF_MYBYTES 0x303 +#define PSM2_MQ_MAX_SYSBUF_MBYTES PSM2_MQ_OPT_SYSBUF_MYBYTES + /**< [@b uint32_t ] Maximum number of bytes to allocate for unexpected + * messages. + * + * component object: PSM2 Matched Queue (@ref psm2_mq_t). + * option value: Deprecated; this option has no effect. + */ + +/* PSM2_COMPONENT_AM options */ +#define PSM2_AM_OPT_FRAG_SZ 0x401 +#define PSM2_AM_MAX_FRAG_SZ PSM2_AM_OPT_FRAG_SZ +/*!< [@b uint32_t ] Maximum active message fragment size that can be sent + * for a given endpoint or across all endpoints. This value can only be + * queried. + * + * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then + * option value is the smalles fragment size across all + * active endpoints. + * option value: Maximum active message fragment size in bytes. + */ + +#define PSM2_AM_OPT_NARGS 0x402 +#define PSM2_AM_MAX_NARGS PSM2_AM_OPT_NARGS + +/*!< [@b uint32_t ] Maximum number of message arguments that can be sent + * for a given endpoint or across all endpoints. This value can only be + * queried. + * + * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then + * option value is the smalles fragment size across all + * active endpoints. + * option value: Maximum number of active message arguments. + */ + +#define PSM2_AM_OPT_HANDLERS 0x403 +#define PSM2_AM_MAX_HANDLERS PSM2_AM_OPT_HANDLERS +/*!< [@b uint32_t ] Maximum number of message handlers that can be registered + * for a given endpoint or across all endpoints. This value can only be + * queried. + * + * component object: PSM2 endpoint (@ref psm2_epaddr_t) address. If NULL then + * option value is the smalles fragment size across all + * active endpoints. + * option value: Maximum number of active message handlers. + */ + +/** @brief Set an option for a PSM2 component + * + * Function to set the value of a PSM2 component option + * + * @param[in] component Type of PSM2 component for which to set the option + * @param[in] component_obj Opaque component specify object to apply the set + * operation on. These are passed uninterpreted to the + * appropriate component for interpretation. + * @param[in] optname Name of component option to set. These are component + * specific and passed uninterpreted to the appropriate + * component for interpretation. + * @param[in] optval Pointer to storage that contains the value to be updated + * for the supplied option. It is up to the user to + * ensure that the pointer points to a memory location with a + * correct size and format. + * @param[in] optlen Size of the memory region pointed to by optval. + * + * @returns PSM2_OK if option could be set. + * @returns PSM2_PARAM_ERR if the component or optname are not valid. + * @returns PSM2_OPT_READONLY if the option to be set is a read-only option. + * + */ +psm2_error_t +psm2_setopt(psm2_component_t component, const void *component_obj, + int optname, const void *optval, uint64_t optlen); + +/** @brief Get an option for a PSM2 component + * + * Function to get the value of a PSM2 component option + * + * @param[in] component Type of PSM2 component for which to get the option + * @param[in] component_obj Opaque component specify object to apply the get + * operation on. These are passed uninterpreted to the + * appropriate component for interpretation. + * @param[in] optname Name of component option to get. 
These are component + * specific and passed uninterpreted to the appropriate + * component for interpretation. + * @param[out] optval Pointer to storage that contains the value to be updated + * for the supplied option. It is up to the user to + * ensure that the pointer points to a valid memory region. + * @param[in,out] optlen This is a value result parameter initially containing + * the size of the memory region pointed to by optval and + * modified to return the actual size of optval. + * + * @returns PSM2_OK if option value could be retrieved successfully. + * @returns PSM2_PARAM_ERR if the component or optname are not valid. + * @returns PSM2_NO_MEMORY if the memory region optval is of insufficient size. + * optlen contains the required memory region size for + * optname value. + * + */ +psm2_error_t +psm2_getopt(psm2_component_t component, const void *component_obj, + int optname, void *optval, uint64_t *optlen); + +/** @brief Datatype for end-point information */ +typedef struct psm2_epinfo { + psm2_ep_t ep; /**< The ep for this end-point*/ + psm2_epid_t epid; /**< The epid for this end-point */ + psm2_uuid_t uuid; /**< The UUID for this end-point */ + uint16_t jkey; /**< The job key for this end-point */ + char uuid_str[64]; /**< String representation of the UUID for this end-point */ +} psm2_epinfo_t; + +/** @brief Datatype for end-point connection */ +typedef struct psm2_epconn { + psm2_epaddr_t addr; /**< The epaddr for this connection */ + psm2_ep_t ep; /**< The ep for this connection */ + psm2_mq_t mq; /**< The mq for this connection */ +} psm2_epconn_t; + +/** @brief Query PSM2 for end-point information. + * + * Function to query PSM2 for end-point information. This allows retrieval of + * end-point information in cases where the caller does not have access to the + * results of psm2_ep_open(). In the default single-rail mode PSM2 will use + * a single endpoint. If either multi-rail mode or multi-endpoint mode is + * enabled, PSM2 will use multiple endpoints. + * + * @param[in,out] num_of_epinfo On input, sizes the available number of entries + * in array_of_epinfo. On output, specifies the + * returned number of entries in array_of_epinfo. + * @param[out] array_of_epinfo Returns end-point information structures. + * + * @pre PSM2 is initialized and the end-point has been opened. + * + * @returns PSM2_OK indicates success. + * @returns PSM2_PARAM_ERR if input num_if_epinfo is less than or equal to zero. + * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point is closed or does not exist. + */ +psm2_error_t psm2_ep_query(int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo); + +/** @brief Query PSM2 for end-point connections. + * + * Function to query PSM2 for end-point connections. This allows retrieval of + * end-point connections in cases where the caller does not have access to the + * results of psm2_ep_connect(). The epid values can be found using + * psm2_ep_query() so that each PSM2 process can determine its own epid. These + * values can then be distributed across the PSM2 process so that each PSM + * process knows the epid for all other PSM2 processes. + * + * @param[in] epid The epid of a PSM2 process. + * @param[out] epconn The connection information for that PSM2 process. + * + * @pre PSM2 is initialized and the end-point has been connected to this epid. + * + * @returns PSM2_OK indicates success. + * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point is closed or does not exist. + * @returns PSM2_EPID_UNKNOWN if the epid value is not known to PSM. 
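+ *
+ * Illustrative sketch (not part of the upstream header); remote_epid is
+ * assumed to have been obtained out of band from the remote process:
+ * @code{.c}
+   psm2_epconn_t epconn;
+
+   if (psm2_ep_epid_lookup(remote_epid, &epconn) == PSM2_OK) {
+       // epconn.addr is the connected endpoint address for remote_epid,
+       // epconn.ep and epconn.mq are the local endpoint and matched queue
+       // that the connection belongs to.
+   }
+   @endcode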
+ */ +psm2_error_t psm2_ep_epid_lookup(psm2_epid_t epid, psm2_epconn_t *epconn); + +/** @brief Query given PSM2 end-point for its connections. + * + * The need for this function comes with 'multi-ep' feature. + * Function is similar to (@ref psm2_ep_epid_lookup). + * It differs in that an extra parameter which identifies + * the end-point [ep] must be provided which limits the lookup to that single ep. + * + * @returns PSM2_OK indicates success. + * @returns PSM2_EP_WAS_CLOSED if PSM2 end-point [ep] is closed or does not exist. + * @returns PSM2_EPID_UNKNOWN if the [epid] value is not known to PSM. + * @returns PSM2_PARAM_ERR if output [epconn] is NULL. + */ +psm2_error_t psm2_ep_epid_lookup2(psm2_ep_t ep, psm2_epid_t epid, psm2_epconn_t *epconn); + +/** @brief Get PSM2 epid for given epaddr. + * + * @param[in] epaddr The endpoint address. + * @param[out] epid The epid of a PSM2 process. + * + * @returns PSM2_OK indicates success. + * @returns PSM2_PARAM_ERR if input [epaddr] or output [epid] is NULL. + */ +psm2_error_t psm2_epaddr_to_epid(psm2_epaddr_t epaddr, psm2_epid_t *epid); + +/*! @} */ + +/*! @addtogroup init PSM2 Information Query + * @{ + */ + +/** @brief Enumeration for info query APIs + * + * Note that calling the function: + * + @code{.c} + psm2_error_t psm2_info_query(psm2_info_query_t, void *out, + size_t nargs, psm2_info_query_arg_t []); + @endcode + * + * Takes a variable number of input arguments, per the initial psm2_info_query_t + * + * Below, there is an explanation of the number, type and order of the + * required input arguments, as well as a definition of the type of the output. + */ +typedef enum psm2_info_query_et +{ +/*! Required input arguments 0 + Output parameter: uint32_t*, description: the number of units */ + PSM2_INFO_QUERY_NUM_UNITS, + +/*! Required input arguments: 0 + Output parameter: uint32_t*, description: the number of ports */ + PSM2_INFO_QUERY_NUM_PORTS, + +/*! Required input arguments: 1 + 1. type: uint32_t, description: the unit for which status is + desired (use: psm2_info_query_arg_t.unit). + Output parameter: uint32_t, description: zero, when the unit + is not active, non-zero when the unit is + active. */ + PSM2_INFO_QUERY_UNIT_STATUS, + +/*! Required input arguments: 2 + 1. type: uint32_t, description: the unit for which status is + desired (use: psm2_info_query_arg_t.unit). + 2. type: uint32_t, description: the port for which status is + desired (use: psm2_info_query_arg_t.port). + Output parameter: uint32_t, description: zero, when the unit + is not active, non-zero when the unit is + active. */ + PSM2_INFO_QUERY_UNIT_PORT_STATUS, + +/*! Required input arguments: 1 + 1. type: uint32_t, description: the unit for which the number of + free contexts is desired (use: psm2_info_query_arg_t.unit). + Output parameter: uint32_t, description: the number of free + contexts.. */ + PSM2_INFO_QUERY_NUM_FREE_CONTEXTS, + +/*! Required input arguments: 1 + 1. type: uint32_t, description: the unit for which the number of + contexts is desired (use: psm2_info_query_arg_t.unit). + Output parameter: uint32_t, description: the number of + contexts.. */ + PSM2_INFO_QUERY_NUM_CONTEXTS, + +/*! Required input arguments: 2 + 1. type: psm2_mq_t, description: the mq that is associated with the + connection for which configuration information is wanted. + (use: psm2_info_query_arg_t.mq). + 2. type: psm2_epaddr_t, description: the ep address that is + associated with the connection for which configuration + information is wanted (use: psm2_info_query_arg_t.epaddr). 
+ Output parameter: uint32_t, description: a bit mask containing bits defining the configuration. + see psm2_info_query_config for a description of the bits. */ + PSM2_INFO_QUERY_CONFIG, + +/*! Required input arguments: 3 + 1. type: psm2_mq_t, description: the mq that is associated with the + connection for which the msg size query information is wanted. + (use: psm2_info_query_arg_t.mq). + 2. type: psm2_epaddr_t, description: the ep address that is + associated with the connection for which the msg size query + information is wanted (use: psm2_info_query_arg_t.epaddr). + 3. type: enum psm2_info_query_thresh_et, the specific msg size query. + (use: psm2_info_query_arg_t.mstq). + + Output parameter: uint32_t, description: the message size threshold. */ + PSM2_INFO_QUERY_THRESH, + +/*! Required input arguments: 3 + 1. type: psm2_mq_t, description: the mq that is associated with the + connection for which the device name is wanted. + (use: psm2_info_query_arg_t.mq). + 2. type: psm2_epaddr_t, description: the ep address that is + associated with the connection for which device name is wanted. + (use: psm2_info_query_arg_t.epaddr). + 3. type: size_t, the length of the output buffer that will recieve + the device name (use: psm2_info_query_arg_t.length). + Output parameter: char *, description: the device name. */ + PSM2_INFO_QUERY_DEVICE_NAME, + +/*! Required input arguments: 2 + 1. type: psm2_mq_t, description: the mq that is associated with the + connection for which the mtu is wanted (use: psm2_info_query_arg_t.mq). + 2. type: psm2_epaddr_t, description: the ep address that is + associated with the connection for which mtu is wanted. + (use: psm2_info_query_arg_t.epaddr). + Output parameter: uint32_t, description: the mtu. */ + + PSM2_INFO_QUERY_MTU, + +/*! Required input arguments: 2 + 1. type: psm2_mq_t, description: the mq that is associated with the + connection for which the link speed is wanted (use: + psm2_info_query_arg_t.mq). + 2. type: psm2_epaddr_t, description: the ep address that is + associated with the connection for which link speed is wanted. + (use: psm2_info_query_arg_t.epaddr). + Output parameter: uint32_t, description: the link speed. */ + PSM2_INFO_QUERY_LINK_SPEED, + +/*! Required input arguments: 1 + 1. type: size_t, description: the length of the output buffer to receive + the network type (use: psm2_info_query_arg_t.length). + Output parameter: char*, description: the network type. */ + PSM2_INFO_QUERY_NETWORK_TYPE, + +/*! Required input arguments 0 + Output parameter: uint32_t*, description: a bit mask of the features in libpsm2. + See psm2_info_query_feature_mask below for bit mask definition. */ + PSM2_INFO_QUERY_FEATURE_MASK, + +/*! Required input arguments 2 + 1. type: uint32_t, description: the unit # of the device you want to + identify. + 2. type: size_t, description: the length of the output buffer that will + receive the device name. + Output parameter: char*, description: name of the device. */ + PSM2_INFO_QUERY_UNIT_NAME, + +/*! Required input arguments: 2 + 1. type: uint32_t, description: unit number for which the device + name is wanted. + (use: psm2_info_query_arg_t.unit). + 2. type: size_t, description: the length of the output buffer + that will recieve the sysfs path. + (use: psm2_info_query_arg_t.length). + Output parameter: char *, description: the sysfs path. */ + PSM2_INFO_QUERY_UNIT_SYS_PATH, + + PSM2_INFO_QUERY_LAST, /* must appear last, and the info query + constants are used as an index. 
*/ +} psm2_info_query_t; + +/** @brief Enumeration for info query config + */ +enum psm2_info_query_config +{ + /*! The following three are 'main configs': */ + PSM2_INFO_QUERY_CONFIG_IPS = (1 << 0), + PSM2_INFO_QUERY_CONFIG_AMSH = (1 << 1), + PSM2_INFO_QUERY_CONFIG_SELF = (1 << 2), + + /*! The following three are sub-configs of + the IPS main config: */ + + PSM2_INFO_QUERY_CONFIG_CUDA = (1 << 3), + PSM2_INFO_QUERY_CONFIG_PIO = (1 << 4), + PSM2_INFO_QUERY_CONFIG_DMA = (1 << 5), + + /*! The following is a sub-config of IPS & CUDA + main config: */ + + PSM2_INFO_QUERY_CONFIG_GDR_COPY = (1 << 6), +}; + +/** @brief Enumeration info query thresholds + */ +enum psm2_info_query_thresh_et +{ +/*! This is the start of the thresh queries for IPS config: */ + PSM2_INFO_QUERY_THRESH_IPS_START, + +/*! Not shown here are the specific queries supported by the CUDA + and GDR_COPY, sub-configs. + + But, those configs will need to include threshold queries in case the + config includes them. + + Note that for the case of gdr_copy the thresholds varies for the case + of the memory is gpu memory or not. */ + +/*! The following threshold queres are supported for the IPS config + only. */ + +/*! The PSM2_INFO_QUERY_THRESH_IPS_PIO_DMA threshold query indicates at + what message size the send transport transitions from PIO to DMA. + + Note that this threshold query may be meaningless if PIO or DMA is + disabled. */ + PSM2_INFO_QUERY_THRESH_IPS_PIO_DMA = PSM2_INFO_QUERY_THRESH_IPS_START, +/*! Messages with messages sizes less than or equal to the tiny threshold + will be sent by tiny message. */ + PSM2_INFO_QUERY_THRESH_IPS_TINY, +/*! Messages with messages sizes greater than tiny, but less than or equal + to frag size will be sent by short message. */ + PSM2_INFO_QUERY_THRESH_IPS_PIO_FRAG_SIZE, + PSM2_INFO_QUERY_THRESH_IPS_DMA_FRAG_SIZE, +/*! Messages that are greater than the frag_size, but less than RNDV will + be sent by eager message. + Messages with messages sizes greater than or equal to RNDV will be + sent by the rendezvous protocol message. */ + PSM2_INFO_QUERY_THRESH_IPS_RNDV, + PSM2_INFO_QUERY_THRESH_IPS_END = PSM2_INFO_QUERY_THRESH_IPS_RNDV, + +/*! Not shown here are the specific thresh queries supported by AMSH and + SELF configs: */ + PSM2_INFO_QUERY_THRESH_AMSH_START, + PSM2_INFO_QUERY_THRESH_AMSH_END = PSM2_INFO_QUERY_THRESH_AMSH_START, + + PSM2_INFO_QUERY_THRESH_SELF_START, + PSM2_INFO_QUERY_THRESH_SELF_END = PSM2_INFO_QUERY_THRESH_SELF_START, +}; + +enum psm2_info_query_feature_mask +{ + /*! The following bit means that the libpsm2 _can_ support cuda. + If the PSM2_INFO_QUERY_FEATURE_MASK request is made and + the PSM2_INFO_QUERY_FEATURE_CUDA bit is not present, thne cuda + is not supported. */ + PSM2_INFO_QUERY_FEATURE_CUDA = (1 << 0), +}; + +/** @brief Union for info query arg type + */ +typedef union psm2_info_query_arg +{ + uint32_t unit; + uint32_t port; + size_t length; + psm2_mq_t mq; + psm2_epaddr_t epaddr; + enum psm2_info_query_thresh_et mstq; +} psm2_info_query_arg_t; + +/** @brief PSM2 info query + * + * Function that allows a client to interrogate PSM2 for various information. + * + * @param[in] psm2_info_query_t What information is requested. + * @param[out] void * out, where the information will be delivered on a + * PSM2_OK return. + * @param[in] size_t nargs, the number of following arguments. + * @param[in] psm2_info_query_arg_t [], The arguments that are required for + * certain queries. 
See documentation + * at @ref psm2_info_query_t for what + * arguments are required for what + * queries as well as what the type + * the output is expected to be. + * + * @retval PSM2_OK The out buffer has successfully been written with the + * result of the query. + */ +psm2_error_t psm2_info_query(psm2_info_query_t, void *out, + size_t nargs, psm2_info_query_arg_t []); + +/*! @} */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif +#endif diff --git a/prov/psm3/psm3/psm2_am.h b/prov/psm3/psm3/psm2_am.h new file mode 100644 index 00000000000..a53777bdba1 --- /dev/null +++ b/prov/psm3/psm3/psm2_am.h @@ -0,0 +1,481 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef PSM2_AM_H +#define PSM2_AM_H + +#include +#include +#include + +#ifndef PACK_SUFFIX +/* XXX gcc only */ +#define PACK_SUFFIX __attribute__((packed)) +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * @file psm2_am.h + * @brief PSM2 Active Message. + * + * @page psm2_am Active Message Interface + * + * PSM2 implements an Active Message (AM) component that lives alongside the + * Matched Queues (MQ) component. The active message interface essentially + * provides a remote procedure call mechanism. 
A PSM2 process can generate a
+ * request to run an active message handler on a remote PSM2 process
+ * identified by its end-point address (epaddr). End-point address values
+ * are returned by PSM2 when connecting end-points using the psm2_ep_connect()
+ * function.
+ *
+ * An AM handler may make local state updates, and may generate at most
+ * one reply to be returned to the original requestor. This reply will cause
+ * a handler to be run on that requestor. The requestor handler may make
+ * local state updates but is not allowed to reply or request in that handler
+ * context. A request or reply can convey a small number of in-line arguments
+ * plus a short amount of data. A tight bound is placed on the number of
+ * in-line arguments to allow them to be packed into a header. A bound is
+ * placed on the size of the data payload so that the request or reply can
+ * be sent as a single packet within the MTU of the underlying communication
+ * transport. Longer payloads must be synthesized on top of the provided
+ * short request/reply mechanism by fragmentation and reassembly, or
+ * transported by some other means.
+ *
+ * Handlers are run in the process context of the targeted PSM2 process,
+ * either in its main thread of execution or in a progress thread. A handler
+ * may therefore be executed concurrently with the main thread of execution
+ * of the PSM2 process. PSM2 ensures that its own state is protected against this
+ * concurrent execution. However, a handler must make its own arrangements to
+ * protect its own state. Alternatively, the PSM2 progress thread can be
+ * disabled using the PSM3_RCVTHREAD environment variable if this is too
+ * onerous for the handler.
+ *
+ * PSM2 has an active progress model and requires that the PSM2 library is
+ * called in order to make progress. This can be achieved using the psm2_poll()
+ * function. A PSM2 implementation may provide passive progress through some
+ * other mechanism (e.g. a receive thread), but a PSM2 consumer must not assume
+ * this and must arrange to make active progress through calls into the PSM
+ * library. Note that the PSM2 AM interface is not MT-safe, the same as the other
+ * PSM interfaces, and that MT-safety must be provided by the consumer if required.
+ *
+ * The order in which AM requests are issued by an initiator to a particular
+ * target defines the order in which those AM requests will be executed on
+ * that target. Therefore the AM implementation will maintain the order
+ * of handler executions on a flow, and this also applies when progress
+ * threads are used. For multiple initiators issuing requests to a particular
+ * target, the handler executions will be interleaved in some sequentially
+ * consistent ordering.
+ */
+
+/*! @defgroup am PSM2 Active Message
+ *
+ * @{
+ */
+
+/** @brief Datatype for an index representing an active message handler */
+typedef uint32_t psm2_handler_t;
+
+/** @brief Datatype for a token for an active message handler.*/
+typedef void *psm2_am_token_t;
+
+/* PSM2 AM flags
+ * These flags may be combined using bitwise-or.
+ */
+#define PSM2_AM_FLAG_NONE    0 /**< No other PSM2 AM flags are needed. */
+#define PSM2_AM_FLAG_ASYNC   1 /**< No need to copy source data. */
+#define PSM2_AM_FLAG_NOREPLY 2 /**< The handler for this AM request is
+                                    guaranteed not to generate a reply. */
+
+/** @brief The psm2_amarg type represents the type of an AM argument.
This is + * a 64-bit type and is broken down into four 16-bit fields, two 32-bit + * fields or one 64-bit field for the convenience of code using the PSM2 AM + * interface. + */ +typedef +struct psm2_amarg { + union { + struct { + uint16_t u16w3; + uint16_t u16w2; + uint16_t u16w1; + uint16_t u16w0; + } PACK_SUFFIX; + struct { + uint32_t u32w1; + uint32_t u32w0; + } PACK_SUFFIX; + uint64_t u64w0; + uint64_t u64; + }; +} PACK_SUFFIX psm2_amarg_t; + +/** @brief The AM handler function type + * + * psm2_am_handler_fn_t is the datatype for an AM handler. PSM2 AM will call-back + * into an AM handler using this function prototype. The parameters and result + * of these handler functions are described here. + * + * @param[in] token This is an opaque token value passed into a handler. + * A request handler may send at most one reply back to the + * original requestor, and must pass this value as the token + * parameter to the psm2_am_reply_short() function. A reply + * handler is also passed a token value, but must not attempt + * to reply. + * @param[in] args A pointer to the arguments provided to this handler. + * @param[in] nargs The number of arguments. + * @param[in] src A pointer to the data payload provided to this handler. + * @param[in] len The length of the data payload in bytes. + * + * @returns 0 The handler should always return a result of 0. + */ +typedef +int (*psm2_am_handler_fn_t) (psm2_am_token_t token, + psm2_amarg_t *args, int nargs, + void *src, uint32_t len); + +/** @brief The AM handler function type with caller context + * + * psm2_am_handler_2_fn_t is the datatype for an AM handler that + * includes a user context. PSM2 AM will call-back into an AM handler using + * this function prototype. The parameters and result + * of these handler functions are described here. + * + * @param[in] token This is an opaque token value passed into a handler. + * A request handler may send at most one reply back to the + * original requestor, and must pass this value as the token + * parameter to the psm2_am_reply_short() function. A reply + * handler is also passed a token value, but must not attempt + * to reply. + * @param[in] args A pointer to the arguments provided to this handler. + * @param[in] nargs The number of arguments. + * @param[in] src A pointer to the data payload provided to this handler. + * @param[in] len The length of the data payload in bytes. + * @param[in] hctx The user context pointer provided at handler registration. + * + * @returns 0 The handler should always return a result of 0. + */ +typedef +int (*psm2_am_handler_2_fn_t) (psm2_am_token_t token, + psm2_amarg_t *args, int nargs, + void *src, uint32_t len, void *hctx); + +/** @brief Type for a completion call-back handler. + * + * A completion handler can be specified to give a call-back on the initiation + * side that an AM request or reply has completed on the target side. The + * call-back has a context pointer which is provided along with the call-back + * function pointer when the initiator generates the request or reply. This + * approach will typically give higher performance than using an AM request or + * reply to achieve the same effect, though note that no additional information + * can be passed from the target side back to the initiator side with the + * completion handler approach. + * + * @param[in] context A context pointer. + * @returns void This handler has no return result. 
+ */
+typedef
+void (*psm2_am_completion_fn_t) (void *context);
+
+/** @brief Register AM call-back handlers at the specified end-point.
+ *
+ * This function is used to register an array of handlers, and may be called
+ * multiple times to register additional handlers. The maximum number of
+ * handlers that can be registered is limited to the max_handlers value
+ * returned by psm2_am_get_parameters(). Handlers are associated with a PSM
+ * end-point. The handlers are allocated index numbers in the handler table
+ * for that end-point. The allocated index for the handler function in
+ * handlers[i] is returned in handlers_idx[i] for i in [0, num_handlers). These
+ * handler index values are used in the psm2_am_request_short() and
+ * psm2_am_reply_short() functions.
+ *
+ * @param[in] ep End-point value
+ * @param[in] handlers Array of handler functions
+ * @param[in] num_handlers Number of handlers (sizes the handlers and
+ *                         handlers_idx arrays)
+ * @param[out] handlers_idx Used to return handler index mapping table
+ *
+ * @returns PSM2_OK Indicates success
+ * @returns PSM2_EP_NO_RESOURCES Insufficient slots in the AM handler table
+ */
+psm2_error_t psm2_am_register_handlers(psm2_ep_t ep,
+                                       const psm2_am_handler_fn_t *
+                                       handlers, int num_handlers,
+                                       int *handlers_idx);
+
+/** @brief Register AM call-back handlers at the specified end-point.
+ *
+ * This function is used to register an array of handlers, and may be called
+ * multiple times to register additional handlers. The maximum number of
+ * handlers that can be registered is limited to the max_handlers value
+ * returned by psm2_am_get_parameters(). Handlers are associated with a PSM
+ * end-point. The handlers are allocated index numbers in the handler table
+ * for that end-point. The allocated index for the handler function in
+ * handlers[i] is returned in handlers_idx[i] for i in [0, num_handlers). These
+ * handler index values are used in the psm2_am_request_short() and
+ * psm2_am_reply_short() functions.
+ *
+ * @param[in] ep End-point value
+ * @param[in] handlers Array of handler functions
+ * @param[in] num_handlers Number of handlers (sizes the handlers and
+ *                         handlers_idx arrays)
+ * @param[in] hctx Array of void* pointers to user contexts for identifying the
+ *                 target ep that registered these handlers.
+ * @param[out] handlers_idx Used to return handler index mapping table
+ *
+ * @returns PSM2_OK Indicates success
+ * @returns PSM2_EP_NO_RESOURCES Insufficient slots in the AM handler table
+ */
+psm2_error_t psm2_am_register_handlers_2(psm2_ep_t ep,
+                                         const psm2_am_handler_2_fn_t *
+                                         handlers, int num_handlers,
+                                         void **hctx,
+                                         int *handlers_idx);
+
+/** @brief Unregister all AM call-back handlers for the specified end-point.
+ *
+ * This function is used to unregister all AM handlers registered to the
+ * specified end-point.
+ *
+ * @param[in] ep End-point value
+ *
+ */
+void psm2_am_unregister_handlers(psm2_ep_t ep);
+
+/** @brief Generate an AM request.
+ *
+ * This function generates an AM request causing an AM handler function to be
+ * called in the PSM2 process associated with the specified end-point address.
+ * The number of arguments is limited to max_nargs and the payload length in
+ * bytes to max_request_short returned by the psm2_am_get_parameters() function.
+ * If arguments are not required, set the number of arguments to 0 and the
+ * argument pointer will not be dereferenced.
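+ *
+ * As a minimal sketch of handler registration (illustrative only: @c ep is an
+ * already-opened endpoint, and @c my_reply_idx is assumed to hold the index of
+ * a previously registered reply handler; neither is defined by this header):
+ * @code{.c}
+   static int my_reply_idx;    // assumed: filled in by an earlier registration
+
+   static int my_request_handler(psm2_am_token_t token, psm2_amarg_t *args,
+                                 int nargs, void *src, uint32_t len)
+   {
+       // consume args/src here, then send at most one reply to the requestor
+       psm2_am_reply_short(token, my_reply_idx, NULL, 0, NULL, 0,
+                           PSM2_AM_FLAG_NONE, NULL, NULL);
+       return 0;
+   }
+
+   static void register_my_handlers(psm2_ep_t ep)
+   {
+       psm2_am_handler_fn_t handlers[] = { my_request_handler };
+       int idx[1];
+       psm2_am_register_handlers(ep, handlers, 1, idx);
+       // idx[0] is the handler index to pass to psm2_am_request_short()
+   }
+ @endcode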
If payload is not required, set
+ * the payload size to 0 and the payload pointer will not be dereferenced.
+ *
+ * Optionally a completion function and completion context pointer can be
+ * provided, and a local call-back will be made to that function passing in
+ * that context pointer once remote execution of the handler has completed. If
+ * the completion call-back is not required, the completion function should be
+ * specified as NULL and the pointer value will not be used.
+ *
+ * The allowed flags are any combination of the following combined with
+ * bitwise-or:
+ *   PSM2_AM_FLAG_NONE    - No flags
+ *   PSM2_AM_FLAG_ASYNC   - Indicates no need to copy source data
+ *   PSM2_AM_FLAG_NOREPLY - The handler for this AM request is guaranteed not to
+ *                          generate a reply
+ *
+ * The PSM2 AM implementation will not dereference the args pointer after return
+ * from this function. If PSM2_AM_FLAG_ASYNC is not provided, the PSM2 AM
+ * implementation will not dereference the src pointer after return from this
+ * function. This may require the implementation to take a copy of the payload
+ * if the request cannot be issued immediately. However, if PSM2_AM_FLAG_ASYNC
+ * is provided then a copy will not be taken and the PSM2 AM implementation
+ * retains ownership of the payload src memory until the request is locally
+ * complete. Local completion can be determined using the completion handler
+ * call-back, or through an AM handler associated with an AM reply.
+ *
+ * The PSM2_AM_FLAG_NOREPLY flag indicates ahead of time to the AM handler that
+ * a reply will not be generated. Use of this flag is optional, but it may
+ * enable a performance optimization in this case by indicating that reply
+ * state is not required.
+ *
+ * @param[in] epaddr End-point address to run handler on
+ * @param[in] handler Index of handler to run
+ * @param[in] args Array of arguments to be provided to the handler
+ * @param[in] nargs Number of arguments to be provided to the handler
+ * @param[in] src Pointer to the payload to be delivered to the handler
+ * @param[in] len Length of the payload in bytes
+ * @param[in] flags These are PSM2 AM flags and may be combined together with
+ *                  bitwise-or
+ * @param[in] completion_fn The completion function to be called locally when
+ *                          the remote handler is complete
+ * @param[in] completion_ctxt User-provided context pointer to be passed to the
+ *                            completion handler
+ *
+ * @returns PSM2_OK indicates success.
+ */
+psm2_error_t
+psm2_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler,
+                      psm2_amarg_t *args, int nargs, void *src,
+                      size_t len, int flags,
+                      psm2_am_completion_fn_t completion_fn,
+                      void *completion_ctxt);
+
+/** @brief Generate an AM reply.
+ *
+ * This function may only be called from an AM handler called due to an AM
+ * request. If the AM request uses the PSM2_AM_FLAG_NOREPLY flag, the AM
+ * handler must not call this function. Otherwise, the AM request handler may
+ * call psm2_am_reply_short() at most once, and must pass in the token value
+ * that it received in its own handler call-back.
+ *
+ * This function generates an AM reply causing an AM handler function to be
+ * called in the PSM2 process associated with the specified end-point address.
+ * The number of arguments is limited to max_nargs and the payload length in
+ * bytes to max_reply_short returned by the psm2_am_get_parameters() function.
+ * If arguments are not required, set the number of arguments to 0 and the
+ * argument pointer will not be dereferenced.
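+ *
+ * A minimal request sketch (illustrative only: @c target_epaddr is assumed to
+ * come from psm2_ep_connect() and @c idx0 from a prior
+ * psm2_am_register_handlers() call; the payload must not exceed
+ * max_request_short):
+ * @code{.c}
+   psm2_amarg_t arg;
+   arg.u64w0 = 42;                  // one in-line argument
+   char payload[] = "hello";
+   psm2_am_request_short(target_epaddr, idx0, &arg, 1,
+                         payload, sizeof(payload), PSM2_AM_FLAG_NONE,
+                         NULL, NULL);   // no completion call-back
+ @endcode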
If payload is not required, set
+ * the payload size to 0 and the payload pointer will not be dereferenced.
+ *
+ * Optionally a completion function and completion context pointer can be
+ * provided, and a local call-back will be made to that function passing in
+ * that context pointer once remote execution of the handler has completed. If
+ * the completion call-back is not required, the completion function should be
+ * specified as NULL and the pointer value will not be used.
+ *
+ * The allowed flags are any combination of the following combined with
+ * bitwise-or:
+ *   PSM2_AM_FLAG_NONE  - No flags
+ *   PSM2_AM_FLAG_ASYNC - Indicates no need to copy source data
+ *
+ * The PSM2 AM implementation will not dereference the args pointer after return
+ * from this function. If PSM2_AM_FLAG_ASYNC is not provided, the PSM2 AM
+ * implementation will not dereference the src pointer after return from this
+ * function. This may require the implementation to take a copy of the payload
+ * if the reply cannot be issued immediately. However, if PSM2_AM_FLAG_ASYNC is
+ * provided then a copy will not be taken and the PSM2 AM implementation retains
+ * ownership of the payload src memory until the reply is locally complete.
+ * Local completion can be determined using the completion handler call-back.
+ *
+ * @param[in] token Token value provided to the AM handler that is generating
+ *                  the reply.
+ * @param[in] handler Index of handler to run
+ * @param[in] args Array of arguments to be provided to the handler
+ * @param[in] nargs Number of arguments to be provided to the handler
+ * @param[in] src Pointer to the payload to be delivered to the handler
+ * @param[in] len Length of the payload in bytes
+ * @param[in] flags These are PSM2 AM flags and may be combined together with
+ *                  bitwise-or
+ * @param[in] completion_fn The completion function to be called locally when
+ *                          the remote handler is complete
+ * @param[in] completion_ctxt User-provided context pointer to be passed to the
+ *                            completion handler
+ *
+ * @returns PSM2_OK indicates success.
+ */
+psm2_error_t
+psm2_am_reply_short(psm2_am_token_t token, psm2_handler_t handler,
+                    psm2_amarg_t *args, int nargs, void *src,
+                    size_t len, int flags,
+                    psm2_am_completion_fn_t completion_fn,
+                    void *completion_ctxt);
+
+/** @brief Return the source end-point address for a token.
+ *
+ * This function is used to obtain the epaddr object representing the message
+ * initiator from a token passed by PSM2 to a message handler.
+ *
+ * @param[in] token Token value provided to the AM handler that is generating
+ *                  the reply.
+ * @param[out] epaddr_out Pointer to where the epaddr should be returned.
+ *
+ * @returns PSM2_OK indicates success.
+ * @returns PSM2_PARAM_ERR token is invalid or epaddr_out is NULL.
+ */
+psm2_error_t psm2_am_get_source(psm2_am_token_t token,
+                                psm2_epaddr_t *epaddr_out);
+
+/** @brief AM parameters
+ *
+ * This structure is used to return PSM2 AM implementation-specific parameter
+ * values back to the caller of the psm2_am_get_parameters() function. This
+ * API also specifies the minimum values for these parameters that an
+ * implementation must at least provide:
+ *   max_handlers >= 64,
+ *   max_nargs >= 2,
+ *   max_request_short >= 256 and
+ *   max_reply_short >= 256.
+ */
+struct psm2_am_parameters {
+    /** Maximum number of handlers that can be registered. */
+    uint32_t max_handlers;
+    /** Maximum number of arguments to an AM handler. */
+    uint32_t max_nargs;
+    /** Maximum number of bytes in a request payload.
*/ + uint32_t max_request_short; + /** Maximum number of bytes in a reply payload. */ + uint32_t max_reply_short; +}; + +/** @brief Get the AM parameter values + * + * This function retrieves the implementation-specific AM parameter values for + * the specified end-point. + * + * @param[in] ep The end-point value returned by psm2_ep_open(). + * @param[out] parameters Pointer to the struct where the parameters will be + * returned. + * @param[in] sizeof_parameters_in The size in bytes of the struct provided by + * the caller. + * @param[out] sizeof_parameters_out The size in bytes of the struct returned + * by PSM. + * + * @returns PSM2_OK indicates success. + */ +psm2_error_t +psm2_am_get_parameters(psm2_ep_t ep, + struct psm2_am_parameters *parameters, + size_t sizeof_parameters_in, + size_t *sizeof_parameters_out); + +/*! @} */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif +#endif diff --git a/prov/psm3/psm3/psm2_hal.c b/prov/psm3/psm3/psm2_hal.c new file mode 100644 index 00000000000..69ab0dbff24 --- /dev/null +++ b/prov/psm3/psm3/psm2_hal.c @@ -0,0 +1,422 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "psm_user.h" +#include "psm2_hal.h" + +#include "ptl_ips/ips_scb.h" + +static SLIST_HEAD(, _psmi_hal_instance) head_hi; + +/* define the current hal instance pointer */ +psmi_hal_instance_t *psmi_hal_current_hal_instance = NULL; + +/* psmi_hal_register_instance */ +void psmi_hal_register_instance(psmi_hal_instance_t *psm_hi) +{ +#define REJECT_IMPROPER_HI(MEMBER) if (!psm_hi->MEMBER) return + + /* If an attempt to register a hal instance contains a NULL func ptr, reject it. */ + /* To allow fast lookups, please keep this code segment alphabetized by hfp_* + func ptr member name: */ +#if PSMI_HAL_INST_CNT > 1 + + REJECT_IMPROPER_HI(hfp_close_context); + REJECT_IMPROPER_HI(hfp_context_open); + + + REJECT_IMPROPER_HI(hfp_finalize_); + + + REJECT_IMPROPER_HI(hfp_get_jkey); + + + REJECT_IMPROPER_HI(hfp_get_node_id); + REJECT_IMPROPER_HI(hfp_get_num_contexts); + REJECT_IMPROPER_HI(hfp_get_num_free_contexts); + + + REJECT_IMPROPER_HI(hfp_get_port_active); + REJECT_IMPROPER_HI(hfp_get_port_subnet); + + + REJECT_IMPROPER_HI(hfp_get_port_lid); + + + REJECT_IMPROPER_HI(hfp_get_port_rate); + + + REJECT_IMPROPER_HI(hfp_get_unit_active); + + + REJECT_IMPROPER_HI(hfp_spio_process_events); + REJECT_IMPROPER_HI(hfp_spio_transfer_frame); + +#endif // PSMI_HAL_INST_CNT > 1 + REJECT_IMPROPER_HI(hfp_get_default_pkey); + REJECT_IMPROPER_HI(hfp_get_num_ports); + REJECT_IMPROPER_HI(hfp_get_num_units); + REJECT_IMPROPER_HI(hfp_initialize); + +#ifndef PSM2_MOCK_TESTING + if (!sysfs_init(psm_hi->hfi_sys_class_path)) +#endif + SLIST_INSERT_HEAD(&head_hi, psm_hi, next_hi); +} + +static struct _psmi_hal_instance *psmi_hal_get_pi_inst(void); + +int psmi_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) +{ + va_list ap; + va_start(ap, k); + + int rv = 0; + struct _psmi_hal_instance *p = psmi_hal_get_pi_inst(); + + if (!p) + rv = -1; + else + { + switch(k) + { + case psmi_hal_pre_init_cache_func_get_num_units: + rv = p->params.num_units; + break; + case psmi_hal_pre_init_cache_func_get_num_ports: + rv = p->params.num_ports; + break; + case psmi_hal_pre_init_cache_func_get_unit_active: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + if (!p->params.unit_active_valid[unit]) { + p->params.unit_active_valid[unit] = 1; + p->params.unit_active[unit] = p->hfp_get_unit_active(unit); + } + rv = p->params.unit_active[unit]; + } + else + rv = -1; + } + break; + case psmi_hal_pre_init_cache_func_get_port_active: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + int port = va_arg(ap,int); + if ((port >= 1) && (port <= p->params.num_ports)) + { + if (!p->params.port_active_valid[unit*port]) { + p->params.port_active_valid[unit*port] = 1; + p->params.port_active[unit*port] = p->hfp_get_port_active(unit,port); + } + rv = p->params.port_active[unit*port]; + } + else + rv = -1; + } + else + rv = -1; + } + break; + case psmi_hal_pre_init_cache_func_get_num_contexts: + { + int unit = va_arg(ap,int); + if ((unit >= 0) && (unit < p->params.num_units)) + { + if (!p->params.num_contexts_valid[unit]) { + p->params.num_contexts_valid[unit] = 1; + p->params.num_contexts[unit] = p->hfp_get_num_contexts(unit); + } + rv = p->params.num_contexts[unit]; + } + else + rv = -1; + } + break; + case psmi_hal_pre_init_cache_func_get_num_free_contexts: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + if (!p->params.num_free_contexts_valid[unit]) { + p->params.num_free_contexts_valid[unit] = 1; + 
p->params.num_free_contexts[unit] = p->hfp_get_num_free_contexts(unit); + } + rv = p->params.num_free_contexts[unit]; + } + else + rv = -1; + } + break; + case psmi_hal_pre_init_cache_func_get_default_pkey: + rv = p->params.default_pkey; + break; + case psmi_hal_pre_init_cache_func_get_port_subnet: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + int port = va_arg(ap,int); + if ((port >= 1) && (port <= p->params.num_ports)) + { + if (!p->params.port_subnet_valid[unit*port]) { + rv = p->hfp_get_port_subnet(unit, port, + &p->params.port_subnet[unit*port], + &p->params.port_subnet_addr[unit*port], + &p->params.port_ip_addr[unit*port], + &p->params.port_netmask[unit*port], + &p->params.port_subnet_idx[unit*port], + &p->params.port_subnet_gid_hi[unit*port], + &p->params.port_subnet_gid_lo[unit*port]); + if (rv == 0) + p->params.port_subnet_valid[unit*port] = 1; + else + p->params.port_subnet_valid[unit*port] = -1; + } + uint64_t* subnet = va_arg(ap,uint64_t*); + uint64_t* addr = va_arg(ap,uint64_t*); + uint32_t* ip_addr = va_arg(ap,uint32_t*); + uint32_t* netmask = va_arg(ap,uint32_t*); + int* idx = va_arg(ap,int*); + uint64_t* hi = va_arg(ap,uint64_t*); + uint64_t* lo = va_arg(ap,uint64_t*); + rv = (p->params.port_subnet_valid[unit*port] ==1)? 0: -1; + if (subnet) *subnet = p->params.port_subnet[unit*port]; + if (addr) *addr = p->params.port_subnet_addr[unit*port]; + if (ip_addr) *ip_addr = p->params.port_ip_addr[unit*port]; + if (netmask) *netmask = p->params.port_netmask[unit*port]; + if (idx) *idx = p->params.port_subnet_idx[unit*port]; + if (hi) *hi = p->params.port_subnet_gid_hi[unit*port]; + if (lo) *lo = p->params.port_subnet_gid_lo[unit*port]; + } + else + rv = -1; + } + else + rv = -1; + } + break; + default: + rv = -1; + break; + } + } + + va_end(ap); + return rv; +} + +static void psmi_hal_free_cache(struct _psmi_hal_instance *p) +{ +#define FREE_HAL_CACHE(field) \ + do { \ + if (p->params.field) \ + psmi_free(p->params.field); \ + p->params.field = NULL; \ + } while (0) + + FREE_HAL_CACHE(unit_active); + FREE_HAL_CACHE(unit_active_valid); + FREE_HAL_CACHE(port_active); + FREE_HAL_CACHE(port_active_valid); + FREE_HAL_CACHE(num_contexts); + FREE_HAL_CACHE(num_contexts_valid); + FREE_HAL_CACHE(num_free_contexts); + FREE_HAL_CACHE(num_free_contexts_valid); + FREE_HAL_CACHE(port_subnet_valid); + FREE_HAL_CACHE(port_subnet); + FREE_HAL_CACHE(port_subnet_addr); + FREE_HAL_CACHE(port_ip_addr); + FREE_HAL_CACHE(port_netmask); + FREE_HAL_CACHE(port_subnet_idx); + FREE_HAL_CACHE(port_subnet_gid_hi); + FREE_HAL_CACHE(port_subnet_gid_lo); +#undef FREE_HAL_CACHE + p->params.sw_status = 0; +} + +static struct _psmi_hal_instance *psmi_hal_get_pi_inst(void) +{ + + if (psmi_hal_current_hal_instance) + return psmi_hal_current_hal_instance; + + if (SLIST_EMPTY(&head_hi)) + return NULL; + + /* At this point, assuming there are multiple HAL INSTANCES that are + registered, and two or more of the HAL INSTANCES are capable + of initialization on a host, the environment variable PSM3_HAL_PREF + allows the user to identify the one HAL INSTANCE that is desired to + be used. The default policy is, when the PSM3_HAL_PREF is not set, the + first hal instance that successfully initializes is used. 
*/ + + union psmi_envvar_val env_hi_pref; /* HAL instance preference */ + psmi_getenv("PSM3_HAL_PREF", + "Indicate preference for HAL instance (Default is use first HAL" + " instance to successfully initialize))", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)PSM_HAL_INSTANCE_ANY_GEN, &env_hi_pref); + + /* The hfp_get_num_units() call below, will not wait for the HFI driver + to come up and create device nodes in /dev/.) */ + struct _psmi_hal_instance *p; + SLIST_FOREACH(p, &head_hi, next_hi) + { + if ((env_hi_pref.e_int == PSM_HAL_INSTANCE_ANY_GEN) || + (p->type == env_hi_pref.e_int)) + { + const int valid_flags = PSM_HAL_PARAMS_VALID_DEFAULT_PKEY | + PSM_HAL_PARAMS_VALID_NUM_UNITS | + PSM_HAL_PARAMS_VALID_NUM_PORTS | PSM_HAL_PARAMS_VALID_CACHE; + + if ((p->params.sw_status & valid_flags) == valid_flags) + return p; + + int nunits = p->hfp_get_num_units(); + int nports = p->hfp_get_num_ports(); + int dflt_pkey = p->hfp_get_default_pkey(); + if (nunits > 0 && nports > 0 +#ifndef PSM2_MOCK_TESTING + && (0 == sysfs_init(p->hfi_sys_class_path)) +#endif + ) + { + p->params.num_units = nunits; + p->params.num_ports = nports; + p->params.default_pkey = dflt_pkey; + // unit is 0 to nunits-1 + // port is 1 to nports + // size extra entry for ports below, entry 0 unused +#define ALLOC_HAL_CACHE(field, type, cnt) \ + do { \ + p->params.field = (type *)psmi_calloc(PSMI_EP_NONE, UNDEFINED, cnt, sizeof(type)); \ + if (! p->params.field) goto fail_cache_alloc; \ + } while (0) + + ALLOC_HAL_CACHE(unit_active, int8_t, nunits); + ALLOC_HAL_CACHE(unit_active_valid, int8_t, nunits); + ALLOC_HAL_CACHE(port_active, int8_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_active_valid, int8_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(num_contexts, uint16_t, nunits); + ALLOC_HAL_CACHE(num_contexts_valid, uint16_t, nunits); + ALLOC_HAL_CACHE(num_free_contexts, uint16_t, nunits); + ALLOC_HAL_CACHE(num_free_contexts_valid, uint16_t, nunits); + ALLOC_HAL_CACHE(port_subnet_valid, int8_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_subnet, uint64_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_subnet_addr, uint64_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_ip_addr, uint32_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_netmask, uint32_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_subnet_idx, int, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_subnet_gid_hi, uint64_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_subnet_gid_lo, uint64_t, nunits*(nports+1)); + p->params.sw_status |= valid_flags; +#undef ALLOC_HAL_CACHE + return p; + } + } + } + return NULL; + +fail_cache_alloc: + psmi_hal_free_cache(p); + return NULL; +} + +/* psmi_hal_initialize */ +int psmi_hal_initialize(void) +{ + struct _psmi_hal_instance *p = psmi_hal_get_pi_inst(); + + if (!p) + return -PSM_HAL_ERROR_INIT_FAILED; + + int rv = p->hfp_initialize(p); + + if (!rv) + { + psmi_hal_current_hal_instance = p; + + + return rv; + } + return -PSM_HAL_ERROR_INIT_FAILED; +} + +int psmi_hal_finalize(void) +{ + struct _psmi_hal_instance *p = psmi_hal_current_hal_instance; + + int rv = psmi_hal_finalize_(); + psmi_hal_free_cache(p); + psmi_hal_current_hal_instance = NULL; + sysfs_fini(); + return rv; +} + + +#ifdef PSM2_MOCK_TESTING + +#include "psm_hal_gen1/opa_user_gen1.h" + + + +#endif diff --git a/prov/psm3/psm3/psm2_hal.h b/prov/psm3/psm3/psm2_hal.h new file mode 100644 index 00000000000..071a47edfdf --- /dev/null +++ b/prov/psm3/psm3/psm2_hal.h @@ -0,0 +1,367 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. 
When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef __PSM2_HAL_H__ + +#define __PSM2_HAL_H__ + +#include "psm_user.h" + +/* Forward declaration of PSM structs: */ +struct ips_recvhdrq_event; +struct ips_flow; +struct ips_scb; +struct ips_epinfo; +struct ips_message_header; + +/* Declare types: */ +typedef enum +{ + PSM_HAL_INSTANCE_ANY_GEN = 0, + PSM_HAL_INSTANCE_GEN1 = 1, + PSM_HAL_INSTANCE_GEN2 = 2, + PSM_HAL_INSTANCE_GEN3 = 3, + +#ifdef PSM2_MOCK_TESTING + PSM_HAL_INSTANCE_MOCK = 99, +#endif +} psmi_hal_instance_type; + +typedef enum +{ + /* Operation was successful. No error occurred. */ + PSM_HAL_ERROR_OK = 0, + /* The operation can not be done unless HAL is initialized first. */ + PSM_HAL_ERROR_NOT_INITIALIZED = 1, + /* No HAL INSTANCE has been registered. Initialization is impossible. */ + PSM_HAL_ERROR_NO_HI_REGISTERED = 2, + /* Initialization failure. */ + PSM_HAL_ERROR_INIT_FAILED = 3, + /* Can't open device file. */ + PSM_HAL_ERROR_CANNOT_OPEN_DEVICE = 4, + /* Can't open context. */ + PSM_HAL_ERROR_CANNOT_OPEN_CONTEXT = 5, + /* Context is not open. */ + PSM_HAL_ERROR_CONTEXT_IS_NOT_OPEN = 6, + /* General error. */ + PSM_HAL_ERROR_GENERAL_ERROR = 7, + /* Not implemented. */ + PSM_HAL_ERROR_NOT_IMPLEMENTED = 8, + /* Internal error. */ + PSM_HAL_ERROR_INTERNAL_ERROR = 9, + + /* HAL instances should not return errors less than the value + PSM_HAL_ERROR_RESERVED_BY_HAL_API. 
These errors are reserved by + the HAL API layer. */ + PSM_HAL_ERROR_RESERVED_BY_HAL_API = 1000, +} psmi_hal_errors; + + + +/* The following enum constants correspond to the bits in the + cap_mask member of the psmi_hal_params_t. */ +typedef enum +{ + PSM_HAL_CAP_GPUDIRECT_OT = (1UL << 16), +} psmi_hal_capability_bits; + +/* The following enum constants correspond to the bits in the + sw_status member of the psmi_hal_params_t. */ +typedef enum +{ + /* Request to start rx thread. */ + PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD = (1UL << 0), + /* Rx thread is started. */ + PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED = (1UL << 1), + PSM_HAL_PSMI_RUNTIME_INTR_ENABLED = (1UL << 2), + PSM_HAL_PARAMS_VALID_NUM_UNITS = (1UL << 4), + PSM_HAL_PARAMS_VALID_NUM_PORTS = (1UL << 5), + PSM_HAL_PARAMS_VALID_DEFAULT_PKEY = (1UL << 6), + PSM_HAL_PARAMS_VALID_CACHE = (1UL << 7), + +} psmi_hal_sw_status; + +/* The _psmi_hal_params structure stores values that remain constant for the entire life of + the process and this structure resides in the hal instance structure (below). + The values are settled after the context is opened. */ +typedef struct _psmi_hal_params +{ + uint32_t cap_mask; + uint32_t sw_status; + /* start cached members */ + uint16_t num_units; + uint16_t num_ports; + uint16_t default_pkey; + int8_t *unit_active,*unit_active_valid; + int8_t *port_active,*port_active_valid; + uint16_t *num_contexts,*num_contexts_valid; + uint16_t *num_free_contexts,*num_free_contexts_valid; + // information from port_get_subnet + int8_t *port_subnet_valid; + uint64_t *port_subnet; + uint64_t *port_subnet_addr; + uint32_t *port_ip_addr; + uint32_t *port_netmask; + int *port_subnet_idx; + uint64_t *port_subnet_gid_hi; + uint64_t *port_subnet_gid_lo; +} psmi_hal_params_t; + + +#define PSM_HAL_ALG_ACROSS 0 +#define PSM_HAL_ALG_WITHIN 1 +#define PSM_HAL_ALG_ACROSS_ALL 2 + + +typedef enum { + PSMI_HAL_POLL_TYPE_URGENT = 1 +} psmi_hal_poll_type; + +/* Forward declaration of incomplete struct type _psmi_hal_instance and + * psmi_hal_instance_t typedef: */ + +struct _psmi_hal_instance; +typedef struct _psmi_hal_instance psmi_hal_instance_t; + +struct _psmi_hal_instance +{ + SLIST_ENTRY(_psmi_hal_instance) next_hi; + psmi_hal_instance_type type; + const char *description; + const char *hfi_name; + const char *hfi_sys_class_path; + /* The params member should be read-only for HIC, and + written only by the HAL instance. */ + psmi_hal_params_t params; + /* Initialize the HAL INSTANCE. */ + int (*hfp_initialize)(psmi_hal_instance_t *); + /* Finalize the HAL INSTANCE. */ + int (*hfp_finalize_)(void); + + /* Returns the number of hfi units installed on ths host: + NOTE: hfp_get_num_units is a function that must + be callable before the hal instance is initialized. */ + int (*hfp_get_num_units)(void); + + /* Returns the number of ports on each hfi unit installed. + on ths host. + NOTE: hfp_get_num_ports is a function that must + be callable before the hal instance is initialized. */ + int (*hfp_get_num_ports)(void); + + /* Returns the default pkey: + NOTE: hfp_get_default_pkey is a function that must + be callable before the hal instance is initialized. */ + int (*hfp_get_default_pkey)(void); + + /* Given a unit number, returns 1 if any port on the unit is active. + returns 0 if no port on the unit is active. + returns -1 when an error occurred. + NOTE: hfp_get_unit_active is a function that must + be callable before the hal instance is initialized. 
*/ + int (*hfp_get_unit_active)(int unit); + + int (*hfp_get_port_active)(int unit,int port); + /* NOTE: hfp_get_num_contexts is a function that must + be callable before the hal instance is initialized. */ + int (*hfp_get_num_contexts)(int unit); + /* NOTE: hfp_get_num_free_contexts is a function that must + be callable before the hal instance is initialized. */ + int (*hfp_get_num_free_contexts)(int unit); + + /* Context open includes opening the device file, and get hw params. */ + int (*hfp_context_open)(int unit, + int port, + uint64_t open_timeout, + psm2_ep_t ep, + psm2_uuid_t const job_key, + psmi_context_t *psm_ctxt, + uint32_t cap_mask, + unsigned retryCnt); + + /* Close the context, including the device file. */ + int (*hfp_close_context)(psmi_hal_hw_context *); + + + int (*hfp_get_port_rate)(int unit, int port); + + + int (*hfp_get_port_lid)(int unit, int port); + int (*hfp_get_port_subnet)(int unit, int port, + uint64_t *subnet, uint64_t *addr, + uint32_t *ip_addr, uint32_t *netmask, + int *idx, uint64_t *hi, uint64_t *lo); + + + /* End of receive functions. */ + + + int (*hfp_spio_transfer_frame)(struct ips_proto *proto, + struct ips_flow *flow, struct ips_scb *scb, + uint32_t *payload, uint32_t length, + uint32_t isCtrlMsg, uint32_t cksum_valid, + uint32_t cksum, psmi_hal_hw_context +#ifdef PSM_CUDA + , uint32_t is_cuda_payload +#endif + ); + int (*hfp_spio_process_events)(const struct ptl *ptl); + int (*hfp_get_node_id)(int unit, int *nodep); + + + int (*hfp_get_jkey)(psmi_hal_hw_context); + +}; + +/* This is the current psmi_hal_instance, or, NULL if not initialized. + The HIC should not modify the contents of the HAL instance directly. */ +extern psmi_hal_instance_t *psmi_hal_current_hal_instance; + +/* Declare functions called by the HAL INSTANCES. */ +void psmi_hal_register_instance(psmi_hal_instance_t *); + +/* Declare functions that are called by the HIC: */ +/* All of these functions return a negative int value to + indicate failure, or >= 0 for success. */ + +/* Chooses one of the the psmi_hal_instances that have been + registered and then initializes it. + Returns: -PSM_HAL_ERROR_NOT_REGISTERED_HI if no HAL + INSTANCES are registered, or PSM_HAL_ERROR_INIT_FAILED when + another failure has occured during initialization. */ +int psmi_hal_initialize(void); + +int psmi_hal_finalize(void); + +#include "psm2_hal_inlines_d.h" + +enum psmi_hal_pre_init_cache_func_krnls +{ + psmi_hal_pre_init_cache_func_get_num_units, + psmi_hal_pre_init_cache_func_get_num_ports, + psmi_hal_pre_init_cache_func_get_unit_active, + psmi_hal_pre_init_cache_func_get_port_active, + psmi_hal_pre_init_cache_func_get_num_contexts, + psmi_hal_pre_init_cache_func_get_num_free_contexts, + psmi_hal_pre_init_cache_func_get_default_pkey, + psmi_hal_pre_init_cache_func_get_port_subnet, +}; + +int psmi_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...); + +#define PSMI_HAL_DISPATCH_PI(KERNEL,...) ( psmi_hal_pre_init_cache_func(psmi_hal_pre_init_cache_func_ ## KERNEL , ##__VA_ARGS__ ) ) + +#if PSMI_HAL_INST_CNT == 1 + +#define PSMI_HAL_DISPATCH(KERNEL,...) ( PSMI_HAL_CAT_INL_SYM(KERNEL) ( __VA_ARGS__ ) ) + +#else + +#define PSMI_HAL_DISPATCH(KERNEL,...) ( psmi_hal_current_hal_instance->hfp_ ## KERNEL ( __VA_ARGS__ )) + +#endif + +#define psmi_hal_get_num_units_(...) PSMI_HAL_DISPATCH_PI(get_num_units,##__VA_ARGS__) +#define psmi_hal_get_num_ports_(...) PSMI_HAL_DISPATCH_PI(get_num_ports,##__VA_ARGS__) +#define psmi_hal_get_unit_active(...) 
PSMI_HAL_DISPATCH_PI(get_unit_active,__VA_ARGS__) +#define psmi_hal_get_port_active(...) PSMI_HAL_DISPATCH_PI(get_port_active,__VA_ARGS__) +#define psmi_hal_get_num_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_contexts,__VA_ARGS__) +#define psmi_hal_get_num_free_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_free_contexts,__VA_ARGS__) +#define psmi_hal_get_default_pkey(...) PSMI_HAL_DISPATCH_PI(get_default_pkey,##__VA_ARGS__) +#define psmi_hal_get_port_subnet(...) PSMI_HAL_DISPATCH_PI(get_port_subnet,__VA_ARGS__) +#define psmi_hal_context_open(...) PSMI_HAL_DISPATCH(context_open,__VA_ARGS__) +#define psmi_hal_close_context(...) PSMI_HAL_DISPATCH(close_context,__VA_ARGS__) + + +#define psmi_hal_get_port_rate(...) PSMI_HAL_DISPATCH(get_port_rate,__VA_ARGS__) + + +#define psmi_hal_get_port_lid(...) PSMI_HAL_DISPATCH(get_port_lid,__VA_ARGS__) + + +#define psmi_hal_finalize_(...) PSMI_HAL_DISPATCH(finalize_,__VA_ARGS__) + + +#define psmi_hal_get_user_major_bldtime_version(...) PSMI_HAL_DISPATCH(get_user_major_bldtime_version,__VA_ARGS__) +#define psmi_hal_get_user_minor_bldtime_version(...) PSMI_HAL_DISPATCH(get_user_minor_bldtime_version,__VA_ARGS__) + + +#define psmi_hal_spio_transfer_frame(...) PSMI_HAL_DISPATCH(spio_transfer_frame,__VA_ARGS__) +#define psmi_hal_spio_process_events(...) PSMI_HAL_DISPATCH(spio_process_events,__VA_ARGS__) +#define psmi_hal_get_node_id(...) PSMI_HAL_DISPATCH(get_node_id,__VA_ARGS__) + + +#define psmi_hal_get_jkey(...) PSMI_HAL_DISPATCH(get_jkey,__VA_ARGS__) + + +#define psmi_hal_get_hal_instance_type() psmi_hal_current_hal_instance->type +#define psmi_hal_get_hal_instance_description() psmi_hal_current_hal_instance->description +#define psmi_hal_get_hfi_name() psmi_hal_current_hal_instance->hfi_name +#define psmi_hal_get_num_units() psmi_hal_current_hal_instance->params.num_units +#define psmi_hal_get_num_ports() psmi_hal_current_hal_instance->params.num_ports +#define psmi_hal_get_cap_mask() psmi_hal_current_hal_instance->params.cap_mask +#define psmi_hal_set_cap_mask(NEW_MASK) (psmi_hal_current_hal_instance->params.cap_mask = (NEW_MASK)) +#define psmi_hal_add_cap(CAP) (psmi_hal_current_hal_instance->params.cap_mask |= (CAP)) +#define psmi_hal_sub_cap(CAP) (psmi_hal_current_hal_instance->params.cap_mask &= (~(CAP))) +#define psmi_hal_has_cap(CAP) ((psmi_hal_get_cap_mask() & (CAP)) == (CAP)) + +#define psmi_hal_get_sw_status() psmi_hal_current_hal_instance->params.sw_status +#define psmi_hal_set_sw_status(NEW_STATUS) (psmi_hal_current_hal_instance->params.sw_status = (NEW_STATUS)) +#define psmi_hal_add_sw_status(STATUS) (psmi_hal_current_hal_instance->params.sw_status |= (STATUS)) +#define psmi_hal_sub_sw_status(STATUS) (psmi_hal_current_hal_instance->params.sw_status &= (~(STATUS))) +#define psmi_hal_has_sw_status(STATUS) ((psmi_hal_get_sw_status() & (STATUS)) == (STATUS)) + + +#include "psm2_hal_inlines_i.h" + +#endif /* #ifndef __PSM2_HAL_H__ */ diff --git a/prov/psm3/psm3/psm2_hal_inline_t.h b/prov/psm3/psm3/psm2_hal_inline_t.h new file mode 100644 index 00000000000..6d7cc966f12 --- /dev/null +++ b/prov/psm3/psm3/psm2_hal_inline_t.h @@ -0,0 +1,120 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* The psm2_hal_inline_t.h file serves as a template to allow all HAL + instances to easily and conveniently declare their HAL methods. 
*/ + +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(initialize) + (psmi_hal_instance_t *); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(finalize_) + (void); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_units) + (void); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_ports) + (void); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_unit_active) + (int unit); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_node_id) + (int unit, int *nodep); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_active) + (int unit, int port); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_contexts) + (int unit); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_num_free_contexts) + (int unit); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(close_context) + (psmi_hal_hw_context *); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(context_open) + (int unit, + int port, + uint64_t open_timeout, + psm2_ep_t ep, + psm2_uuid_t const job_key, + psmi_context_t *psm_ctxt, + uint32_t cap_mask, + unsigned); + + +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_rate) + (int unit, int port); + + +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_lid) + (int unit, int port); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_subnet) + (int unit, int port, uint64_t *subnet, uint64_t *addr, + uint32_t *ip_addr, uint32_t *netmask, + int *idx, uint64_t *hi, uint64_t *lo); + + +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_default_pkey) + (void); + + +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(spio_transfer_frame) + (struct ips_proto *proto, + struct ips_flow *flow, struct ips_scb *scb, + uint32_t *payload, uint32_t length, + uint32_t isCtrlMsg, uint32_t cksum_valid, + uint32_t cksum, psmi_hal_hw_context +#ifdef PSM_CUDA + , uint32_t is_cuda_payload +#endif + ); +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(spio_process_events) + (const struct ptl *ptl); + + +static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_jkey) + (psmi_hal_hw_context ctxt); + diff --git a/prov/psm3/psm3/psm2_hal_inlines_d.h.in b/prov/psm3/psm3/psm2_hal_inlines_d.h.in new file mode 100644 index 00000000000..99d4e4a6216 --- /dev/null +++ b/prov/psm3/psm3/psm2_hal_inlines_d.h.in @@ -0,0 +1,66 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2020 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2020 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#define PSMI_HAL_INST_CNT @PSM_HAL_CNT@ + +#if PSMI_HAL_INST_CNT == 1 + +#define PSMI_HAL_INLINE inline +#define PSMI_HAL_CAT_INL_SYM(KERNEL) hfp_@PSM_HAL_INST@_##KERNEL +#include "psm2_hal_inline_t.h" + +#else +#define PSMI_HAL_INLINE /* nothing */ + +#endif + diff --git a/prov/psm3/psm3/psm2_hal_inlines_i.h.in b/prov/psm3/psm3/psm2_hal_inlines_i.h.in new file mode 100644 index 00000000000..af20bdb40f6 --- /dev/null +++ b/prov/psm3/psm3/psm2_hal_inlines_i.h.in @@ -0,0 +1,58 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2020 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2020 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#if PSMI_HAL_INST_CNT == 1 + +#include "psm_hal_@PSM_HAL_INST@/psm_hal_inline_i.h" + +#endif diff --git a/prov/psm3/psm3/psm2_mq.h b/prov/psm3/psm3/psm2_mq.h new file mode 100644 index 00000000000..6e2afba6d8b --- /dev/null +++ b/prov/psm3/psm3/psm2_mq.h @@ -0,0 +1,1617 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef PSM2_MQ_H +#define PSM2_MQ_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * @file psm2_mq.h + * @brief PSM2 Matched Queues + * + * @page psm2_mq Matched Queues interface + * + * The Matched Queues (MQ) interface implements a queue-based communication + * model with the distinction that queue message consumers use a 3-tuple of + * metadata to match incoming messages against a list of preposted receive + * buffers. 
These semantics are consistent with those presented by MPI-1.2
+ * and all the features and side-effects of Message-Passing find their way into
+ * Matched Queues. There is currently a single MQ context.
+ * If need be, MQs may expose a function to allocate more than
+ * one MQ context in the future. Since an MQ is implicitly bound to a locally
+ * opened endpoint, all MQ functions use an MQ handle instead of an EP
+ * handle as a communication context.
+ *
+ * @section tagmatch MQ Tag Matching
+ *
+ * A successful MQ tag match requires an endpoint address (@ref psm2_epaddr_t)
+ * and a 3-tuple of tag objects. Two of the tag objects are provided by the
+ * receiver when posting a receive buffer (@ref psm2_mq_irecv) and the last is
+ * provided by the sender as part of every message sent (@ref psm2_mq_send and
+ * @ref psm2_mq_isend). Since MQ is a receiver-directed communication model,
+ * the tag matching done at the receiver involves matching the sent message's
+ * origin and send tag (@c stag) with the source endpoint address, tag (@c
+ * rtag), and tag selector (@c rtagsel) attached to every preposted receive
+ * buffer. The incoming @c stag is compared to the posted @c rtag but only for
+ * significant bits set to @c 1 in the @c rtagsel. The @c rtagsel can be used
+ * to mask off parts (or even all) of the bitwise comparison between sender and
+ * receiver tags. A successful match causes the message to be received into
+ * the buffer with which the tag is matched. If the incoming message is too
+ * large, it is truncated to the size of the posted receive buffer. The
+ * bitwise operation corresponding to a successful match and receipt of an
+ * expected message amounts to the following expression evaluating as true:
+ *
+ * @verbatim ((stag ^ rtag) & rtagsel) == 0 @endverbatim
+ *
+ * It is up to the user to encode (pack) tags into the 64-bit unsigned
+ * integers, including employing the @c rtagsel tag selector as a method to
+ * wildcard part or all of the bits significant in the tag matching operation.
+ * For example, MPI uses a triple based on context (MPI communicator), source
+ * rank, and send tag. The following code example shows how the triple can be
+ * packed into 64 bits:
+ *
+ * @code{.c}
+   //
+   // 64-bit send tag formed by packing the triple:
+   //
+   // ( context_id_16bits | source_rank_16bits | send_tag_32bits )
+   //
+   stag = ( (((context_id)&0xffffULL)<<48)| \
+            (((source_rank)&0xffffULL)<<32)| \
+            (((send_tag)&0xffffffffULL)) );
+ @endcode
+ *
+ * Similarly, the receiver applies the @c rtag matching bits and @c rtagsel
+ * masking bits against a list of send tags and returns the first successful
+ * match. Zero bits in the @c rtagsel can be used to indicate wildcarded bits
+ * in the 64-bit tag, which can be useful for implementing MPI's
+ * @c MPI_ANY_SOURCE and @c MPI_ANY_TAG.
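+ *
+ * As a minimal illustration only (this helper is not part of the PSM2 API),
+ * the match test above could be wrapped as follows:
+ *
+ * @code{.c}
+   // Hypothetical helper: returns non-zero when an incoming send tag
+   // matches a preposted receive tag under the given tag selector.
+   static int tag_matches(uint64_t stag, uint64_t rtag, uint64_t rtagsel)
+   {
+       return ((stag ^ rtag) & rtagsel) == 0;
+   }
+ @endcode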
Following the example bit splicing in + * the above @c stag example: + * + * @code{.c} + // Example MPI implementation where MPI_COMM_WORLD implemented as 0x3333 + + // MPI_Irecv source_rank=MPI_ANY_SOURCE, tag=7, comm=MPI_COMM_WORLD + rtag = 0x3333000000000007; + rtagsel = 0xffff0000ffffffff; + + // MPI_Irecv source_rank=3, tag=MPI_ANY_TAG, comm=MPI_COMM_WORLD + rtag = 0x3333000300000000; + rtagsel = 0xffffffff80000000; // can't ignore sign bit in tag + + // MPI_Irecv source_rank=MPI_ANY_SOURCE, tag=MPI_ANY_TAG, comm=MPI_COMM_WORLD + rtag = 0x3333000300000000; + rtagsel = 0xffff000080000000; // can't ignore sign bit in tag + @endcode + * + * + * Applications that do not follow tag matching semantics can simply always + * pass a value of @c 0 for @c rtagsel, which will always yield a successful + * match to the first preposted buffer. If a message cannot be matched to any + * of the preposted buffers, the message is delivered as an unexpected + * message. + * + * @section mq_receive MQ Message Reception + * + * MQ messages are either received as @e expected or @e unexpected: @li The + * received message is @e expected if the incoming message tag matches the + * combination of tag and tag selector of at least one of the user-provided + * receive buffers preposted with @ref psm2_mq_irecv. + * + * @li The received message is @e unexpected if the incoming message tag @b + * doesn't match any combination of tag and tag selector from all the + * user-provided receive buffers preposted with @ref psm2_mq_irecv. + * + * Unexpected messages are messages that the MQ library buffers until the + * user provides a receive buffer that can match the unexpected message. + * With Matched Queues and MPI alike, unexpected messages can occur as a + * side-effect of the programming model, whereby the arrival of messages can be + * slightly out of step with the ordering in which the user + * provides receive buffers. Unexpected messages can also be triggered by the + * difference between the rate at which a sender produces messages and the rate + * at which a paired receiver can post buffers and hence consume the messages. + * + * In all cases, too many @e unexpected messages will negatively affect + * performance. Users can employ some of the following mechanisms to reduce + * the effect of added memory allocations and copies that result from + * unexpected messages: + * @li If and when possible, receive buffers should be posted as early as + * possible and ideally before calling into the progress engine. + * @li Use of rendezvous messaging that can be controlled with + * @ref PSM2_MQ_RNDV_HFI_SZ and @ref PSM2_MQ_RNDV_SHM_SZ options. These + * options default to values determined to make effective use of + * bandwidth and are hence not advisable for all communication message + * sizes, but rendezvous messages inherently prevent unexpected + * messages by synchronizing the sender with the receiver beforehand. + * @li The amount of memory that is allocated to handle unexpected messages + * can be bounded by adjusting the Global @ref PSM2_MQ_MAX_SYSBUF_MBYTES + * option. + * @li MQ statistics, such as the amount of received unexpected messages and + * the aggregate amount of unexpected bytes are available in the @ref + * psm2_mq_stats structure. + * + * Whenever a match occurs, whether the message is expected or unexpected, it + * is generally up to the user to ensure that the message is not truncated. 
+ *
+ * Message truncation occurs when the size of the preposted buffer is less than
+ * the size of the incoming matched message. MQ will correctly handle
+ * message truncation by always copying the appropriate amount of bytes so as
+ * not to overwrite any data. While it is valid to send less data than the amount
+ * of data that has been preposted, messages that are truncated will be marked
+ * @ref PSM2_MQ_TRUNCATION as part of the error code in the message status
+ * structure (@ref psm2_mq_status_t or @ref psm2_mq_status2_t).
+ *
+ * @section mq_completion MQ Completion Semantics
+ *
+ * Message completion in Matched Queues follows local completion semantics.
+ * When sending an MQ message, it is deemed complete when MQ guarantees that
+ * the source data has been sent and that the entire input source data memory
+ * location can be safely overwritten. As with standard Message-Passing,
+ * MQ does not make any remote completion guarantees for sends. MQ does,
+ * however, allow a sender to synchronize with a receiver to send a synchronous
+ * message which is sent only after a matching receive buffer has been
+ * posted by the receiver (@ref PSM2_MQ_FLAG_SENDSYNC).
+ *
+ * A receive is deemed complete after it has matched its associated receive
+ * buffer with an incoming send and the data from the send has been
+ * completely delivered to the receive buffer.
+ *
+ * @section mq_progress MQ Progress Requirements
+ *
+ * Progress on MQs must be @e explicitly ensured by the user for correctness.
+ * The progress requirement holds even if certain areas of the MQ
+ * implementation require less network attention than others, or if progress
+ * may internally be guaranteed through interrupts. The main polling function,
+ * @ref psm2_poll, is the most general form of ensuring progress on a given
+ * endpoint. Calling @ref psm2_poll ensures that progress is made over all the
+ * MQs and other components instantiated over the endpoint passed to @ref
+ * psm2_poll.
+ *
+ * While @ref psm2_poll is the only way to directly ensure progress, other MQ
+ * functions will conditionally ensure progress depending on how they are used:
+ *
+ * @li @ref psm2_mq_wait employs polling and waits until the request is
+ * completed. For blocking communication operations where the caller is
+ * waiting on a single send or receive to complete, psm2_mq_wait usually
+ * provides the best responsiveness in terms of latency.
+ *
+ * @li @ref psm2_mq_test can test a particular request for completion, but @b
+ * never directly or indirectly ensures progress as it only tests the
+ * completion status of a request, nothing more. See functional documentation
+ * in @ref psm2_mq_test for a detailed discussion.
+ *
+ * @li @ref psm2_mq_ipeek ensures progress if and only if the MQ's completion
+ * queue is empty and will not ensure progress as long as the completion queue
+ * is non-empty. Users that always aggressively process all elements of the MQ
+ * completion queue as part of their own progress engine will indirectly always
+ * ensure MQ progress. The ipeek mechanism is the preferred way for
+ * ensuring progress when many non-blocking requests are in flight since ipeek
+ * returns requests in the order in which they complete. Depending on how the
+ * user initiates and completes communication, this may be preferable to
+ * calling other progress functions on individual requests.
+ */
+
+/*!
@defgroup mq PSM Matched Queues + * + * @{ + */ + +/** @brief Initialize the MQ component for MQ communication + * + * This function provides the Matched Queue handle necessary to perform all + * Matched Queue communication operations. + * + * @param[in] ep Endpoint over which to initialize Matched Queue + * @param[in] ignored + * @param[in] opts Set of options for Matched Queue + * @param[in] numopts Number of options passed + * @param[out] mq User-supplied storage to return the Matched Queue handle + * associated to the newly created Matched Queue. + * + * @remark This function can be called many times to retrieve the MQ handle + * associated to an endpoint, but options are only considered the first + * time the function is called. + * + * @post The user obtains a handle to an instantiated Match Queue. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK A new Matched Queue has been instantiated across all the + * members of the group. + * + * @code{.c} + int try_open_endpoint_and_initialize_mq( + psm2_ep_t *ep, // endpoint handle + psm2_epid_t *epid, // unique endpoint ID + psm2_uuid_t job_uuid, // unique job uuid, for ep_open + psm2_mq_t *mq, // MQ handle initialized on endpoint 'ep' + uint64_t communicator_bits) // Where we store our communicator or + // context bits in the 64-bit tag. + { + // Simplified open, see psm2_ep_open documentation for more info + psm2_ep_open(job_uuid, + NULL, // no options + ep, epid); + + // We initialize a matched queue by telling PSM the bits that are + // order-significant in the tag. Point-to-point ordering will not be + // maintained between senders where the communicator bits are not the + // same. + psm2_mq_init(ep, + communicator_bits, + NULL, // no other MQ options + 0, // 0 options passed + mq); // newly initialized matched Queue + + return 1; + } + @endcode + */ +psm2_error_t +psm2_mq_init(psm2_ep_t ep, uint64_t ignored, + const struct psm2_optkey *opts, int numopts, psm2_mq_t *mq); + +#define PSM2_MQ_ORDERMASK_NONE 0ULL + /**< This macro is reserved for future tag order masking support. */ + +#define PSM2_MQ_ORDERMASK_ALL 0xffffffffffffffffULL + /**< This macro is reserved for future tag order masking support. */ + +/** @brief Finalize (close) an MQ handle + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK A given Matched Queue has been freed and use of the future + * use of the handle produces undefined results. + */ +psm2_error_t +psm2_mq_finalize(psm2_mq_t mq); + +#define PSM2_MQ_TAG_ELEMENTS 4 + /**< Represents the number of 32-bit tag elements in the psm2_mq_tag_t + * type plus one extra element to keep alignment and padding + * as 16 bytes. */ + +/** @struct psm2_mq_tag + ** @brief MQ Message tag + * + * Extended message tag type introduced in PSM 2.0. The previous 64 bit tag + * values are replaced by a struct containing three 32 bit tag values for a + * total of 96 bits. Matching semantics are unchanged from the previous 64-bit + * matching scheme; the only difference is that 96 bits are matched instead of + * 64. For interoperability with existing PSM routines, 64 bit tags are + * extended to a 96 bit tag by setting the upper 32 bits (tag[2] or tag2) to + * zero. Other than this caveat, all of the existing routines using 64-bit + * tags are interchangeable with PSM2 routines using this psm2_mq_tag_t type. 
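+ * (Illustrative sketch only, based on the union members defined below: a
+ * legacy 64-bit tag can be widened by placing it in the lower two elements
+ * and clearing @c tag2.)
+ * @code{.c}
+   // Hypothetical helper, not part of the PSM2 API.
+   void extend_legacy_tag(uint64_t legacy_stag, psm2_mq_tag_t *t)
+   {
+       t->tag64 = legacy_stag; // occupies tag0 and tag1
+       t->tag2  = 0;           // upper 32 bits of the 96-bit tag
+   }
+ @endcode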
+ * For example, a message sent using @ref psm2_mq_send can be received using + * @ref psm2_mq_irecv2, provided the tags match as described above. + */ +typedef +//struct psm2_mq_tag { +union psm2_mq_tag { + uint32_t tag[PSM2_MQ_TAG_ELEMENTS]; /* No longer specifying + * alignment as it makes + * code break with newer + * compilers. */ + /**< 3 x 32bit array representation of @ref psm2_mq_tag */ + struct { + uint32_t tag0; /**< 1 of 3 uint32_t tag values */ + uint32_t tag1; /**< 2 of 3 uint32_t tag values */ + uint32_t tag2; /**< 3 of 3 uint32_t tag values */ + }; + struct { + uint64_t tag64; /**< uint64_t tag values */ + uint32_t res; /**< uint32_t reserved */ + }; +} psm2_mq_tag_t; + +/** @brief MQ Non-blocking operation status + * + * Message completion status for asynchronous communication operations. + * For wait and test functions, MQ fills in the structure upon completion. + * Upon completion, receive requests fill in every field of the status + * structure while send requests only return a valid error_code and context + * pointer. + */ +typedef +struct psm2_mq_status { + /** Sender's original message tag (receive reqs only) */ + uint64_t msg_tag; + /** Sender's original message length (receive reqs only) */ + uint32_t msg_length; + /** Actual number of bytes transfered (receive reqs only) */ + uint32_t nbytes; + /** MQ error code for communication operation */ + psm2_error_t error_code; + /**< User-associated context for send or receive */ + void *context; +} psm2_mq_status_t; + +/** @brief MQ Non-blocking operation status + * + * Message completion status for asynchronous communication operations. For + * wait and test functions, MQ fills in the structure upon completion. Upon + * completion, requests fill in every field of the status structure with the + * exception of the nbytes field, which is only valid for receives. Version 2 + * of the status type contains an @ref psm2_mq_tag_t type to represent the tag + * instead of a 64-bit integer value and is for use with PSM v2 routines. + */ + +typedef +struct psm2_mq_status2 { + /** Remote peer's epaddr */ + psm2_epaddr_t msg_peer; + /** Sender's original message tag */ + psm2_mq_tag_t msg_tag __attribute__ ((aligned(16)));/* Alignment added + * to preserve the + * layout as is + * expected by + * existent code */ + /** Sender's original message length */ + uint32_t msg_length; + /** Actual number of bytes transfered (receiver only) */ + uint32_t nbytes; + /** MQ error code for communication operation */ + psm2_error_t error_code; + /** User-associated context for send or receive */ + void *context; +} psm2_mq_status2_t; + +/** @brief PSM2 Communication handle (opaque) */ +typedef struct psm2_mq_req *psm2_mq_req_t; + + +/** @brief MQ Request Struct + * + * Message completion request for asynchronous communication operations. + * Upon completion, requests are filled with the valid data for the + * corresponding send/recv operation that was completed. This datatype + * contains the status data and is converted into the + * mq_status structures in wait/test functions. + */ +struct psm2_mq_req_user { + /* Tag matching vars */ + psm2_epaddr_t peer; + psm2_mq_tag_t tag __attribute__ ((aligned(16)));/* Alignment added + * to preserve the + * layout as is + * expected by + * existent code */ + psm2_mq_tag_t tagsel; /* used for receives */ + + /* Buffer attached to request. 
May be a system buffer for unexpected + * messages or a user buffer when an expected message */ + uint8_t *buf; + uint32_t buf_len; + uint32_t error_code; + + uint32_t recv_msglen; /* Message length we are ready to receive */ + uint32_t send_msglen; /* Message length from sender */ + + /* Used for request to send messages */ + void *context; /* user context associated to sends or receives */ + + uint64_t user_reserved[4]; +}; + +/*! @} */ +/*! @ingroup mq + * @defgroup mq_options PSM Matched Queue Options + * @{ + * + * MQ options can be modified at any point at runtime, unless otherwise noted. + * The following example shows how to retrieve the current message size at + * which messages are sent as synchronous. + * + * @code{.c} + uint32_t get_hfirv_size(psm2_mq_t mq) + { + uint32_t rvsize; + psm2_getopt(mq, PSM2_MQ_RNDV_HFI_SZ, &rvsize); + return rvsize; + } + @endcode + */ + +/** @brief Get an MQ option (Deprecated. Use psm2_getopt with PSM2_COMPONENT_MQ) + * + * Function to retrieve the value of an MQ option. + * + * @param[in] mq Matched Queue handle + * @param[in] option Index of option to retrieve. Possible values are: + * @li @ref PSM2_MQ_RNDV_HFI_SZ + * @li @ref PSM2_MQ_RNDV_SHM_SZ + * @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES + * + * @param[in] value Pointer to storage that can be used to store the value of + * the option to be set. It is up to the user to ensure that the + * pointer points to a memory location large enough to accommodate + * the value associated to the type. Each option documents the size + * associated to its value. + * + * @returns PSM2_OK if option could be retrieved. + * @returns PSM2_PARAM_ERR if the option is not a valid option number + */ +psm2_error_t psm2_mq_getopt(psm2_mq_t mq, int option, void *value); + +/** @brief Set an MQ option (Deprecated. Use psm2_setopt with PSM2_COMPONENT_MQ) + * + * Function to set the value of an MQ option. + * + * @param[in] mq Matched Queue handle + * @param[in] option Index of option to retrieve. Possible values are: + * @li @ref PSM2_MQ_RNDV_HFI_SZ + * @li @ref PSM2_MQ_RNDV_SHM_SZ + * @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES + * + * @param[in] value Pointer to storage that contains the value to be updated + * for the supplied option number. It is up to the user to + * ensure that the pointer points to a memory location with a + * correct size. + * + * @returns PSM2_OK if option could be retrieved. + * @returns PSM2_PARAM_ERR if the option is not a valid option number + * @returns PSM2_OPT_READONLY if the option to be set is a read-only option + * (currently no MQ options are read-only). + */ +psm2_error_t psm2_mq_setopt(psm2_mq_t mq, int option, const void *value); + +/*! @} */ +/*! @ingroup mq + * @{ + */ + +#define PSM2_MQ_FLAG_SENDSYNC 0x01 + /**< MQ Send Force synchronous send */ + +#define PSM2_MQ_REQINVALID ((psm2_mq_req_t)(NULL)) + /**< MQ request completion value */ + +#define PSM2_MQ_ANY_ADDR ((psm2_epaddr_t)NULL) + /**< MQ receive from any source epaddr */ + + +/** @brief MQ fast-path operation enumeration + * + * To provide for quick enqueing of send/receives from within an AM handler + * PSM2 provdes fast path send/recv options that will enqueue those ops + * into the MQ. The supported operations to call in fast path are enumerated + * in the @ref psm2_mq_fp_op enum. + */ +enum psm2_mq_fp_op { + PSM2_MQ_ISEND_FP = 1, + PSM2_MQ_IRECV_FP, +}; + +/** @brief Post a fast-path isend/irecv into the MQ + * + * Function to only enqueue fast-path non-blocking sends or non-blocking recvs + * into a particular MQ. 
These calls only work if the process already holds
+ * the MQ progress lock; this traditionally only applies to calls from
+ * a registered AM handler.
+ *
+ * This function helps to enable one-sided communication models from middleware
+ * such as OFI to provide fast >2KB message transfers for RMA operations.
+ *
+ * When posting irecvs, the @c tag and @c tagsel parameters are matched
+ * against the send tag of every incoming message received on the MQ,
+ * as described in @ref tagmatch.
+ *
+ * When posting isends, the user guarantees that the source data will remain
+ * unmodified until the send is locally completed through a call such as
+ * @ref psm2_mq_wait or @ref psm2_mq_test.
+ *
+ * Progress on the operations enqueued into the MQ may not occur until
+ * the next PSM2 progress API is invoked.
+ *
+ * @param[in] ep PSM2 endpoint
+ * @param[in] mq Matched Queue Handle
+ * @param[in] addr Destination EP address (used only on isends)
+ * @param[in] tag Send/Receive tag
+ * @param[in] tagsel Receive tag selector (used only on irecvs)
+ * @param[in] flags Send/Receive Flags
+ * @param[in] context User context pointer, available in @ref psm2_mq_status_t
+ *                    upon completion
+ * @param[in] buf Send/Receive buffer
+ * @param[in] len Send/Receive buffer length
+ * @param[in] fp_type Fast-path op requested
+ * @param[out] req PSM MQ Request handle created by the preposted receive, to
+ *                 be used for explicitly controlling message receive
+ *                 completion.
+ *
+ * @post The supplied buffer is given to MQ to match against incoming
+ *       messages unless it is cancelled via @ref psm2_mq_cancel @e before any
+ *       match occurs.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (@ref psm2_error_register_handler).
+ *
+ * @retval PSM2_OK The receive buffer has successfully been posted to the MQ.
+ */
+psm2_error_t
+psm2_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *tag,
+	       psm2_mq_tag_t *tagsel, uint32_t flags, void *buf, uint32_t len,
+	       void *context, enum psm2_mq_fp_op fp_type, psm2_mq_req_t *req);
+
+/** @brief Post a receive to a Matched Queue with tag selection criteria
+ *
+ * Function to receive a non-blocking MQ message by providing a preposted
+ * buffer. For every MQ message received on a particular MQ, the @c rtag and
+ * @c rtagsel parameters are matched against the incoming message's send tag as
+ * described in @ref tagmatch.
+ *
+ * @param[in] mq Matched Queue Handle
+ * @param[in] rtag Receive tag
+ * @param[in] rtagsel Receive tag selector
+ * @param[in] flags Receive flags (None currently supported)
+ * @param[in] buf Receive buffer
+ * @param[in] len Receive buffer length
+ * @param[in] context User context pointer, available in @ref psm2_mq_status_t
+ *                    upon completion
+ * @param[out] req PSM MQ Request handle created by the preposted receive, to
+ *                 be used for explicitly controlling message receive
+ *                 completion.
+ *
+ * @post The supplied receive buffer is given to MQ to match against incoming
+ *       messages unless it is cancelled via @ref psm2_mq_cancel @e before any
+ *       match occurs.
+ *
+ * @remark This function may be called simultaneously from multiple threads
+ *         as long as different MQ arguments are used in each of the calls.
+ *
+ * The following error code is returned.
Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The receive buffer has successfully been posted to the MQ. + */ +psm2_error_t +psm2_mq_irecv(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel, uint32_t flags, + void *buf, uint32_t len, void *context, psm2_mq_req_t *req); + +/** @brief Post a receive to a Matched Queue with source and tag selection + * criteria + * + * Function to receive a non-blocking MQ message by providing a preposted + * buffer. For every MQ message received on a particular MQ, the @c src, @c tag + * and @c tagsel parameters are used against the incoming message's send tag as + * described in @ref tagmatch. + * + * @param[in] mq Matched Queue Handle + * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR) + * @param[in] rtag Receive tag + * @param[in] rtagsel Receive tag selector + * @param[in] flags Receive flags (None currently supported) + * @param[in] buf Receive buffer + * @param[in] len Receive buffer length + * @param[in] context User context pointer, available in @ref psm2_mq_status2_t + * upon completion + * @param[out] req PSM MQ Request handle created by the preposted receive, to + * be used for explicitly controlling message receive + * completion. + * + * @post The supplied receive buffer is given to MQ to match against incoming + * messages unless it is cancelled via @ref psm2_mq_cancel @e before any + * match occurs. + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The receive buffer has successfully been posted to the MQ. + */ +psm2_error_t +psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag, + psm2_mq_tag_t *rtagsel, uint32_t flags, void *buf, uint32_t len, + void *context, psm2_mq_req_t *req); + +/** @brief Post a receive to a Matched Queue with matched request + * + * Function to receive a non-blocking MQ message by providing a preposted + * buffer. The provided request should already be matched using the @ref + * psm2_mq_improbe or @ref psm2_mq_improbe2 routines. It is an error to pass a + * request that has not already been matched by one of those routines. + * + * @param[in] mq Matched Queue Handle + * @param[in] flags Receive flags (None currently supported) + * @param[in] buf Receive buffer + * @param[in] len Receive buffer length + * @param[in] context User context pointer, available in @ref psm2_mq_status_t + * upon completion + * @param[inout] reqo PSM MQ Request handle matched previously by a matched + * probe routine (@ref psm2_mq_improbe or @ref + * psm2_mq_improbe2), also to be used for explicitly + * controlling message receive completion. + * + * @post The supplied receive buffer is given to MQ to deliver the matched + * message. + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The receive buffer has successfully been posted to the MQ. 
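+ *
+ * A possible usage sketch (illustrative only) that matches a message of
+ * unknown size with @ref psm2_mq_improbe and then receives it with
+ * @ref psm2_mq_imrecv:
+ *
+ * @code{.c}
+   void *matched_recv(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel)
+   {
+       psm2_mq_req_t req;
+       psm2_mq_status_t status;
+       void *buf = NULL;
+
+       if (psm2_mq_improbe(mq, rtag, rtagsel, &req, &status) == PSM2_OK) {
+           // The matched message's size is now known
+           buf = malloc(status.msg_length);
+           psm2_mq_imrecv(mq, 0, buf, status.msg_length, NULL, &req);
+           psm2_mq_wait(&req, &status);
+       }
+       return buf;
+   }
+ @endcode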
+ */ +psm2_error_t +psm2_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len, + void *context, psm2_mq_req_t *reqo); + +/** @brief Send a blocking MQ message + * + * Function to send a blocking MQ message, whereby the message is locally + * complete and the source data can be modified upon return. + * + * @param[in] mq Matched Queue Handle + * @param[in] dest Destination EP address + * @param[in] flags Message flags, currently: + * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message + * synchronously, meaning that the message will not be sent until + * the receiver acknowledges that it has matched the send with a + * receive buffer. + * @param[in] stag Message Send Tag + * @param[in] buf Source buffer pointer + * @param[in] len Length of message starting at @c buf. + * + * @post The source buffer is reusable and the send is locally complete. + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * @note This send function has been implemented to best suit MPI_Send. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The message has been successfully sent. + */ +psm2_error_t +psm2_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, + const void *buf, uint32_t len); + +/** @brief Send a blocking MQ message + * + * Function to send a blocking MQ message, whereby the message is locally + * complete and the source data can be modified upon return. + * + * @param[in] mq Matched Queue Handle + * @param[in] dest Destination EP address + * @param[in] flags Message flags, currently: + * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message + * synchronously, meaning that the message will not be sent until + * the receiver acknowledges that it has matched the send with a + * receive buffer. + * @param[in] stag Message Send Tag + * @param[in] buf Source buffer pointer + * @param[in] len Length of message starting at @c buf. + * + * @post The source buffer is reusable and the send is locally complete. + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * @note This send function has been implemented to best suit MPI_Send. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The message has been successfully sent. + */ +psm2_error_t +psm2_mq_send2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, + psm2_mq_tag_t *stag, const void *buf, uint32_t len); + +/** @brief Send a non-blocking MQ message + * + * Function to initiate the send of a non-blocking MQ message, whereby the + * user guarantees that the source data will remain unmodified until the send + * is locally completed through a call such as @ref psm2_mq_wait or @ref + * psm2_mq_test. + * + * @param[in] mq Matched Queue Handle + * @param[in] dest Destination EP address + * @param[in] flags Message flags, currently: + * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message + * synchronously, meaning that the message will not be sent until + * the receiver acknowledges that it has matched the send with a + * receive buffer. + * @param[in] stag Message Send Tag + * @param[in] buf Source buffer pointer + * @param[in] len Length of message starting at @c buf. 
+ * @param[in] context Optional user-provided pointer available in @ref + * psm2_mq_status_t when the send is locally completed. + * @param[out] req PSM MQ Request handle created by the non-blocking send, to + * be used for explicitly controlling message completion. + * + * @post The source buffer is not reusable and the send is not locally complete + * until its request is completed by either @ref psm2_mq_test or @ref + * psm2_mq_wait. + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * @note This send function has been implemented to suit MPI_Isend. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The message has been successfully initiated. + * + * @code{.c} + psm2_mq_req_t + non_blocking_send(const psm2_mq_t mq, psm2_epaddr_t dest_ep, + const void *buf, uint32_t len, + int context_id, int send_tag, const my_request_t *req) + { + psm2_mq_req_t req_mq; + // Set up our send tag, assume that "my_rank" is global and represents + // the rank of this process in the job + uint64_t tag = ( ((context_id & 0xffff) << 48) | + ((my_rank & 0xffff) << 32) | + ((send_tag & 0xffffffff)) ); + + psm2_mq_isend(mq, dest_ep, + 0, // no flags + tag, + buf, + len, + req, // this req is available in psm2_mq_status_t when one + // of the synchronization functions is called. + &req_mq); + return req_mq; + } + @endcode + */ +psm2_error_t +psm2_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, + const void *buf, uint32_t len, void *context, psm2_mq_req_t *req); + +/** @brief Send a non-blocking MQ message + * + * Function to initiate the send of a non-blocking MQ message, whereby the + * user guarantees that the source data will remain unmodified until the send + * is locally completed through a call such as @ref psm2_mq_wait or @ref + * psm2_mq_test. + * + * @param[in] mq Matched Queue Handle + * @param[in] dest Destination EP address + * @param[in] flags Message flags, currently: + * @li PSM2_MQ_FLAG_SENDSYNC tells PSM to send the message + * synchronously, meaning that the message will not be sent until + * the receiver acknowledges that it has matched the send with a + * receive buffer. + * @param[in] stag Message Send Tag, array of three 32-bit values. + * @param[in] buf Source buffer pointer + * @param[in] len Length of message starting at @c buf. + * @param[in] context Optional user-provided pointer available in @ref + * psm2_mq_status2_t when the send is locally completed. + * @param[out] req PSM MQ Request handle created by the non-blocking send, to + * be used for explicitly controlling message completion. + * + * @post The source buffer is not reusable and the send is not locally complete + * until its request is completed by either @ref psm2_mq_test or @ref + * psm2_mq_wait. + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * @note This send function has been implemented to suit MPI_Isend. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The message has been successfully initiated. 
+ * + * @code{.c} + psm2_mq_req_t + non_blocking_send(const psm2_mq_t mq, psm2_epaddr_t dest_ep, + const void *buf, uint32_t len, + int context_id, int send_tag, const my_request_t *req) + { + psm2_mq_req_t req_mq; + // Set up our send tag, assume that "my_rank" is global and represents + // the rank of this process in the job + psm2_mq_tag_t tag; + tag.tag[0] = send_tag; + tag.tag[1] = my_rank; + tag.tag[2] = context_id; + + psm2_mq_isend(mq, dest_ep, + 0, // no flags + &tag, + buf, + len, + req, // this req is available in psm2_mq_status2_t when one + // of the synchronization functions is called. + &req_mq); + return req_mq; + } + @endcode + */ +psm2_error_t +psm2_mq_isend2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, + psm2_mq_tag_t *stag, const void *buf, uint32_t len, void *context, + psm2_mq_req_t *req); + +/** @brief Try to Probe if a message is received matching tag selection + * criteria + * + * Function to verify if a message matching the supplied tag and tag selectors + * has been received. The message is not fully matched until the user + * provides a buffer with the successfully matching tag selection criteria + * through @ref psm2_mq_irecv. + * Probing for messages may be useful if the size of the + * message to be received is unknown, in which case its size will be + * available in the @c msg_length member of the returned @c status. + * + * Function ensures progress if matching request wasn’t found + * after the first attempt. + * + * @param[in] mq Matched Queue Handle + * @param[in] rtag Message receive tag + * @param[in] rtagsel Message receive tag selector + * @param[out] status Upon return, @c status is filled with information + * regarding the matching send. + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * The following error codes are returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL. + * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is + * unchanged. + */ +psm2_error_t +psm2_mq_iprobe(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel, + psm2_mq_status_t *status); + +/** @brief Try to Probe if a message is received matching source and tag + * selection criteria + * + * Function to verify if a message matching the supplied source, tag, and tag + * selectors has been received. The message is not fully matched until the + * user provides a buffer with the successfully matching tag selection criteria + * through @ref psm2_mq_irecv. Probing for messages may be useful if the size + * of the message to be received is unknown, in which case its size will be + * available in the @c msg_length member of the returned @c status. + * + * Function ensures progress if matching request wasn’t found + * after the first attempt. + * + * @param[in] mq Matched Queue Handle + * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR) + * @param[in] rtag Message receive tag + * @param[in] rtagsel Message receive tag selector + * @param[out] status Upon return, @c status is filled with information + * regarding the matching send. + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * The following error codes are returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). 
+ * + * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL. + * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is + * unchanged. + */ +psm2_error_t +psm2_mq_iprobe2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag, + psm2_mq_tag_t *rtagsel, psm2_mq_status2_t *status); + +/** @brief Try to Probe if a message is received matching tag selection + * criteria + * + * Function to verify if a message matching the supplied source, tag, and tag + * selectors has been received. If a match is successful, the message is + * removed from the matching queue and returned as a request object. The + * message can be received using @ref psm2_mq_imrecv. It is erroneous to use + * the request object returned by @ref psm2_mq_improbe for any purpose other + * than passing to @ref psm2_mq_imrecv. Probing for messages may be useful if + * the size of the message to be received is unknown, in which case its size + * will be available in the @c msg_length member of the returned @c status. + * + * Function ensures progress if matching request wasn’t found + * after the first attempt. + * + * @param[in] mq Matched Queue Handle + * @param[in] rtag Message receive tag + * @param[in] rtagsel Message receive tag selector + * @param[out] req PSM MQ Request handle, to be used for receiving the matched + * message. + * @param[out] status Upon return, @c status is filled with information + * regarding the matching send. + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * The following error codes are returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL. + * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is unchanged. + */ +psm2_error_t +psm2_mq_improbe(psm2_mq_t mq, uint64_t rtag, uint64_t rtagsel, psm2_mq_req_t *req, + psm2_mq_status_t *status); + +/** @brief Try to Probe if a message is received matching source and tag + * selection criteria + * + * Function to verify if a message matching the supplied tag and tag selectors + * has been received. If a match is successful, the message is removed from + * the matching queue and returned as a request object. The message can be + * received using @ref psm2_mq_imrecv. It is erroneous to use the request + * object returned by @ref psm2_mq_improbe for any purpose other than passing to + * @ref psm2_mq_imrecv. Probing for messages may be useful if the size of the + * message to be received is unknown, in which case its size will be available + * in the @c msg_length member of the returned @c status. + * + * Function ensures progress if matching request wasn’t found + * after the first attempt. + * + * @param[in] mq Matched Queue Handle + * @param[in] src Source (sender's) epaddr (may be PSM2_MQ_ANY_ADDR) + * @param[in] rtag Message receive tag + * @param[in] rtagsel Message receive tag selector + * @param[out] reqo PSM MQ Request handle, to be used for receiving the matched + * message. + * @param[out] status Upon return, @c status is filled with information + * regarding the matching send. + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * The following error codes are returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). 
+ * + * @retval PSM2_OK The iprobe is successful and status is updated if non-NULL. + * @retval PSM2_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is unchanged. + */ +psm2_error_t +psm2_mq_improbe2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *rtag, + psm2_mq_tag_t *rtagsel, psm2_mq_req_t *reqo, + psm2_mq_status2_t *status); + +/** @brief Query for non-blocking requests ready for completion. + * + * Function to query a particular MQ for non-blocking requests that are ready + * for completion. Requests "ready for completion" are not actually considered + * complete by MQ until they are returned to the MQ library through @ref + * psm2_mq_wait or @ref psm2_mq_test. + * + * If the user can deal with consuming request completions in the order in + * which they complete, this function can be used both for completions and for + * ensuring progress. The latter requirement is satisfied when the user + * peeks an empty completion queue as a side effect of always aggressively + * peeking and completing all an MQ's requests ready for completion. + * + * + * @param[in] mq Matched Queue Handle + * @param[in,out] req MQ non-blocking request + * @param[in] status Optional MQ status, can be NULL. + * + * @post The user has ensured progress if the function returns @ref + * PSM2_MQ_NO_COMPLETIONS + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * The following error codes are returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The peek is successful and @c req is updated with a request + * ready for completion. If @c status is non-NULL, it is also + * updated. + * + * @retval PSM2_MQ_NO_COMPLETIONS The peek is not successful, meaning that there + * are no further requests ready for completion. + * The contents of @c req and @c status remain + * unchanged. + * @code{.c} + // Example that uses ipeek_mq_ipeek to make progress instead of psm2_poll + // We return the amount of non-blocking requests that we've completed + int main_progress_loop(psm2_mq_t mq) + { + int num_completed = 0; + psm2_mq_req_t req; + psm2_mq_status_t status; + psm2_error_t err; + my_request_t *myreq; + + do { + err = psm2_mq_ipeek(mq, &req, + NULL); // No need for status in ipeek here + if (err == PSM2_MQ_NO_COMPLETIONS) + return num_completed; + else if (err != PSM2_OK) + goto errh; + num_completed++; + + // We obtained 'req' at the head of the completion queue. We can + // now free the request with PSM and obtain our original reques + // from the status' context + err = psm2_mq_test(&req, // will be marked as invalid + &status); // we need the status + myreq = (my_request_t *) status.context; + + // handle the completion for myreq whether myreq is a posted receive + // or a non-blocking send. + } + while (1); + } + @endcode + */ +psm2_error_t +psm2_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *req, psm2_mq_status_t *status); + +/** @brief Query for non-blocking requests ready for completion. + * + * Function to query a particular MQ for non-blocking requests that are ready + * for completion. Requests "ready for completion" are not actually considered + * complete by MQ until they are returned to the MQ library through @ref + * psm2_mq_wait or @ref psm2_mq_test. + * + * If the user can deal with consuming request completions in the order in + * which they complete, this function can be used both for completions and for + * ensuring progress. 
The latter requirement is satisfied when the user + * peeks an empty completion queue as a side effect of always aggressively + * peeking and completing all an MQ's requests ready for completion. + * + * + * @param[in] mq Matched Queue Handle + * @param[in,out] req MQ non-blocking request + * @param[in] status Optional MQ status, can be NULL. + * + * @post The user has ensured progress if the function returns @ref + * PSM2_MQ_NO_COMPLETIONS + * + * @remark This function may be called simultaneously from multiple threads + * as long as different MQ arguments are used in each of the calls. + * + * The following error codes are returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The peek is successful and @c req is updated with a request + * ready for completion. If @c status is non-NULL, it is also + * updated. + * + * @retval PSM2_MQ_NO_COMPLETIONS The peek is not successful, meaning that there + * are no further requests ready for completion. + * The contents of @c req and @c status remain + * unchanged. + * @code{.c} + // Example that uses ipeek_mq_ipeek to make progress instead of psm2_poll + // We return the amount of non-blocking requests that we've completed + int main_progress_loop(psm2_mq_t mq) + { + int num_completed = 0; + psm2_mq_req_t req; + psm2_mq_status2_t status; + psm2_error_t err; + my_request_t *myreq; + + do { + err = psm2_mq_ipeek2(mq, &req, + NULL); // No need for status in ipeek here + if (err == PSM2_MQ_NO_COMPLETIONS) + return num_completed; + else if (err != PSM2_OK) + goto errh; + num_completed++; + + // We obtained 'req' at the head of the completion queue. We can + // now free the request with PSM and obtain our original reques + // from the status' context + err = psm2_mq_test2(&req, // will be marked as invalid + &status); // we need the status + myreq = (my_request_t *) status.context; + + // handle the completion for myreq whether myreq is a posted receive + // or a non-blocking send. + } + while (1); + } + @endcode + */ +psm2_error_t +psm2_mq_ipeek2(psm2_mq_t mq, psm2_mq_req_t *req, psm2_mq_status2_t *status); + +/** @brief User defined Callback function handling copy of MQ request into user datatype + * + * Callback function used to convert an MQ request into a user's desired + * status structure. The user's callback function converts the MQ request into + * the provided status_array at the specified index. + * + * @param[in] req MQ External non-blocking Request structure + * @param[in] status_array Array of User defined status datatypes + * @param[in] entry_index Index in array where the converted request will be + * stored if successful + * + * The following error codes are returned. + * + * @retval < 0 The MQ conversion failed with a user defined error. + * + * @retval 0 The MQ was successfully processed, but was not saved + * in the provided @c status_array. + * + * @retval 1 The MQ was successfully processed and was saved in the + * @c status_array at the specified index. + * + * @retval >1 The MQ was successfully processed and was saved in the + * @c status_array at the specified index. This should + * be the last MQ converted in the batch, even if there + * are still spaces in @c status_array. + */ +typedef int (*psmi_mq_status_copy_user_t) (struct psm2_mq_req_user *req, + void *status_array, int entry_index); + +/** @brief Check and dequeue MQ requests into a user's status array using a callback. 
+ * + * Function to atomically check and dequeue MQ entries from the completed + * queue and copy the MQ requests into a user's status datatype through a + * status_copy callback function. + * + * Once the MQ request has been successfully converted by the callback, the + * MQ request is freed and the next entry is processed making the supplied + * Request pointer invalid. + * + * The variable "count" passed in will only be increased if the MQ request was + * successfully stored into the user's passed in array. Otherwise the count + * variable is unchanged. + * + * NOTE: a count of 0 passed into psm2_mq_ipeek_dequeue_multi will result in + * no MQ elements being processed. + * + * @param[in] mq Matched Queue Handle + * @param[in] status_array Array of User defined status datatypes + * @param[in] status_copy Callback function pointer to convert + * MQ to caller datatype + * @param[in/out] count [in]Size of status_array, [out]number of elements + * populated into status_array or user's error return code + * + * The following error codes are returned. + * + * @retval PSM2_OK The dequeue operation was successful and populated the + * full @c status_array up to @c count entries. The parameter + * @c count is equal to the count passed in by the user. + * + * @retval PSM2_MQ_NO_COMPLETIONS The dequeue operation was not able to read + * @c count entries into the @c status_array. The number + * of entries that were successfully written to the + * @c status_array is set in the @c count for the user. + * + * @retval PSM2_INTERNAL_ERR The @c status_copy failed to successfully + * copy the status entry into the user's datatype. + * @c count is set to the return code from the + * @c status_copy. + */ + psm2_error_t + psm2_mq_ipeek_dequeue_multi(psm2_mq_t mq, void *status_array, + psmi_mq_status_copy_user_t status_copy, int *count); + +/** @brief Check and dequeue the first request entry from the completed queue. + * + * Function to atomically check and dequeue the first entry from the completed + * queue. It must be paired with function psm2_mq_req_free, which returns the + * request to PSM2 library. + * + * @param[in] mq Matched Queue Handle + * @param[out] req PSM MQ Request handle, to be used for receiving the matched + * message. + * + * The following error codes are returned. + * + * @retval PSM2_OK The dequeue operation was successful and @c req is updated + * with a request ready for completion. + * + * @retval PSM2_MQ_NO_COMPLETIONS The dequeue operation was not successful, + * meaning that there are no further requests ready + * for completion. The contents of @c req remain + * unchanged. + */ +psm2_error_t +psm2_mq_ipeek_dequeue(psm2_mq_t mq, psm2_mq_req_t *req); + +/** @brief Return the request to PSM2 library. + * + * Function returns the request previously obtained via psm2_mq_ipeek_dequeue + * to the PSM2 library. + * + * @param[in] mq Matched Queue Handle + * @param[in] req PSM MQ Request handle to be returned to PSM2 library. + If @p req is NULL, no operation is performed. + * + * The following error codes are returned. + * + * @retval PSM2_OK Return of an object to PSM2 library pool was successful. + */ +psm2_error_t +psm2_mq_req_free(psm2_mq_t mq, psm2_mq_req_t req); + +/** @brief Wait until a non-blocking request completes + * + * Function to wait on requests created from either preposted receive buffers + * or non-blocking sends. 
This is the only blocking function in the MQ + * interface and will poll until the request is complete as per the progress + * semantics explained in @ref mq_progress. + * + * @param[in,out] request MQ non-blocking request + * @param[out] status Updated if non-NULL when request successfully completes + * + * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend + * or @ref psm2_mq_irecv and passes a pointer to enough storage to write + * the output of a @ref psm2_mq_status_t or NULL if status is to be + * ignored. + * + * @pre Since MQ will internally ensure progress while the user is + * suspended, the user need not ensure that progress is made prior to + * calling this function. + * + * @post The request is assigned the value @ref PSM2_MQ_REQINVALID and all + * associated MQ request storage is released back to the MQ library. + * + * @remark This function may be called simultaneously from multiple threads + * as long as the requests that are used in each of the calls are + * associated with different MQs. + * + * @remarks + * @li This function ensures progress on the endpoint as long as the request + * is incomplete. + * @li @c status can be NULL, in which case no status is written upon + * completion. + * @li If @c request is @ref PSM2_MQ_REQINVALID, the function returns + * immediately. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The request is complete or the value of @c was + * @ref PSM2_MQ_REQINVALID. + * + */ +psm2_error_t +psm2_mq_wait(psm2_mq_req_t *request, psm2_mq_status_t *status); + +/** @brief Wait until a non-blocking request completes + * + * Function to wait on requests created from either preposted receive buffers + * or non-blocking sends. This is the only blocking function in the MQ + * interface and will poll until the request is complete as per the progress + * semantics explained in @ref mq_progress. + * + * @param[in,out] request MQ non-blocking request + * @param[out] status Updated if non-NULL when request successfully completes + * + * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend + * or @ref psm2_mq_irecv and passes a pointer to enough storage to write + * the output of a @ref psm2_mq_status2_t or NULL if status is to be + * ignored. + * + * @pre Since MQ will internally ensure progress while the user is + * suspended, the user need not ensure that progress is made prior to + * calling this function. + * + * @post The request is assigned the value @ref PSM2_MQ_REQINVALID and all + * associated MQ request storage is released back to the MQ library. + * + * @remark This function may be called simultaneously from multiple threads + * as long as the requests that are used in each of the calls are + * associated with different MQs. + * + * @remarks + * @li This function ensures progress on the endpoint as long as the request + * is incomplete. + * @li @c status can be NULL, in which case no status is written upon + * completion. + * @li If @c request is @ref PSM2_MQ_REQINVALID, the function returns + * immediately. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The request is complete or the value of @c was + * @ref PSM2_MQ_REQINVALID. 
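+ *
+ * A minimal blocking-receive sketch (illustrative only), pairing
+ * @ref psm2_mq_irecv2 with @ref psm2_mq_wait2:
+ *
+ * @code{.c}
+   psm2_error_t blocking_recv2(psm2_mq_t mq, psm2_epaddr_t src,
+                               psm2_mq_tag_t *rtag, psm2_mq_tag_t *rtagsel,
+                               void *buf, uint32_t len,
+                               psm2_mq_status2_t *status)
+   {
+       psm2_mq_req_t req;
+       psm2_error_t err;
+
+       err = psm2_mq_irecv2(mq, src, rtag, rtagsel, 0, buf, len, NULL, &req);
+       if (err != PSM2_OK)
+           return err;
+       // psm2_mq_wait2 polls internally until the preposted buffer is matched
+       // and the data has been delivered.
+       return psm2_mq_wait2(&req, status);
+   }
+ @endcode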
+ * + */ +psm2_error_t +psm2_mq_wait2(psm2_mq_req_t *request, psm2_mq_status2_t *status); + +/** @brief Test if a non-blocking request is complete + * + * Function to test requests created from either preposted receive buffers or + * non-blocking sends for completion. Unlike @ref psm2_mq_wait, this function + * tests @c request for completion and @e never ensures progress directly or + * indirectly. It is up to the user to employ some of the progress functions + * described in @ref mq_progress to ensure progress if the user chooses to + * exclusively test requests for completion. + * + * Testing a request for completion @e never internally ensure progress in + * order to be useful to construct higher-level completion tests over arrays to + * test some, all or any request that has completed. For testing arrays of + * requests, it is preferable for performance reasons to only ensure progress + * once before testing a set of requests for completion. + * + * @param[in,out] request MQ non-blocking request + * @param[out] status Updated if non-NULL and the request successfully + * completes + * + * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend + * or @ref psm2_mq_irecv and passes a pointer to enough storage to write + * the output of a @ref psm2_mq_status_t or NULL if status is to be + * ignored. + * + * @pre The user has ensured progress on the Matched Queue if @ref + * psm2_mq_test is exclusively used for guaranteeing request completions. + * + * @post If the request is complete, the request is assigned the value @ref + * PSM2_MQ_REQINVALID and all associated MQ request storage is released + * back to the MQ library. If the request is incomplete, the contents of + * @c request is unchanged. + * + * @post The user will ensure progress on the Matched Queue if @ref + * psm2_mq_test is exclusively used for guaranteeing request completions. + * + * @remark This function may be called simultaneously from multiple threads + * as long as the requests that are used in each of the calls are + * associated with different MQs. + * + * The following two errors are always returned. Other errors are handled by + * the PSM error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The request is complete and @c request is set to @ref + * PSM2_MQ_REQINVALID or the value of @c was PSM2_MQ_REQINVALID + * + * @retval PSM2_MQ_NO_COMPLETIONS The request is not complete and @c request is + * unchanged. + * + * @code{.c} + // Function that returns the first completed request in an array + // of requests. + void * + user_testany(psm2_ep_t ep, psm2_mq_req_t *allreqs, int nreqs) + { + int i; + void *context = NULL; + + // Ensure progress only once + psm2_poll(ep); + + // Test for at least one completion and return it's context + psm2_mq_status_t stat; + for (i = 0; i < nreqs; i++) { + if (psm2_mq_test(&allreqs[i], &stat) == PSM2_OK) { + context = stat.context; + break; + } + } + return context; + } + @endcode + */ +psm2_error_t +psm2_mq_test(psm2_mq_req_t *request, psm2_mq_status_t *status); + +/** @brief Test if a non-blocking request is complete + * + * Function to test requests created from either preposted receive buffers or + * non-blocking sends for completion. Unlike @ref psm2_mq_wait, this function + * tests @c request for completion and @e never ensures progress directly or + * indirectly. 
It is up to the user to employ some of the progress functions + * described in @ref mq_progress to ensure progress if the user chooses to + * exclusively test requests for completion. + * + * Testing a request for completion @e never internally ensure progress in + * order to be useful to construct higher-level completion tests over arrays to + * test some, all or any request that has completed. For testing arrays of + * requests, it is preferable for performance reasons to only ensure progress + * once before testing a set of requests for completion. + * + * @param[in,out] request MQ non-blocking request + * @param[out] status Updated if non-NULL and the request successfully + * completes + * + * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend + * or @ref psm2_mq_irecv and passes a pointer to enough storage to write + * the output of a @ref psm2_mq_status2_t or NULL if status is to be + * ignored. + * + * @pre The user has ensured progress on the Matched Queue if @ref + * psm2_mq_test is exclusively used for guaranteeing request completions. + * + * @post If the request is complete, the request is assigned the value @ref + * PSM2_MQ_REQINVALID and all associated MQ request storage is released + * back to the MQ library. If the request is incomplete, the contents of + * @c request is unchanged. + * + * @post The user will ensure progress on the Matched Queue if @ref + * psm2_mq_test is exclusively used for guaranteeing request completions. + * + * @remark This function may be called simultaneously from multiple threads + * as long as the requests that are used in each of the calls are + * associated with different MQs. + * + * The following two errors are always returned. Other errors are handled by + * the PSM error handler (@ref psm2_error_register_handler). + * + * @retval PSM2_OK The request is complete and @c request is set to @ref + * PSM2_MQ_REQINVALID or the value of @c was PSM2_MQ_REQINVALID + * + * @retval PSM2_MQ_NO_COMPLETIONS The request is not complete and @c request is + * unchanged. + * + * @code{.c} + // Function that returns the first completed request in an array + // of requests. + void * + user_testany(psm2_ep_t ep, psm2_mq_req_t *allreqs, int nreqs) + { + int i; + void *context = NULL; + + // Ensure progress only once + psm2_poll(ep); + + // Test for at least one completion and return it's context + psm2_mq_status2_t stat; + for (i = 0; i < nreqs; i++) { + if (psm2_mq_test2(&allreqs[i], &stat) == PSM2_OK) { + context = stat.context; + break; + } + } + return context; + } + @endcode + */ +psm2_error_t +psm2_mq_test2(psm2_mq_req_t *request, psm2_mq_status2_t *status); + +/** @brief Cancel a preposted request + * + * Function to cancel a preposted receive request returned by @ref + * psm2_mq_irecv. It is currently illegal to cancel a send request initiated + * with @ref psm2_mq_isend. + * + * @pre The user has obtained a valid MQ request by calling @ref psm2_mq_isend. + * + * @post Whether the cancel is successful or not, the user returns the + * request to the library by way of @ref psm2_mq_test or @ref + * psm2_mq_wait. + * + * @remark This function may be called simultaneously from multiple threads + * as long as the requests that are used in each of the calls are + * associated with different MQs. 
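A minimal sketch of the cancel-and-release pattern described above; @c req is assumed to be a receive request obtained earlier from @ref psm2_mq_irecv:

 @code{.c}
   if (psm2_mq_cancel(&req) == PSM2_OK) {
       // The preposted buffer was removed before any match occurred; the
       // request storage must still be handed back to the library.
       psm2_mq_test(&req, NULL);
   } else {
       // PSM2_MQ_NO_COMPLETIONS: the buffer already matched an incoming
       // message, so simply wait for the receive to finish.
       psm2_mq_wait(&req, NULL);
   }
 @endcode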
+ * + * Only the two following errors can be returned directly, without being + * handled by the error handler (@ref psm2_error_register_handler): + * + * @retval PSM2_OK The request could be successfully cancelled such that the + * preposted receive buffer could be removed from the preposted + * receive queue before a match occurred. The associated @c + * request remains unchanged and the user must still return + * the storage to the MQ library. + * + * @retval PSM2_MQ_NO_COMPLETIONS The request could not be successfully cancelled + * since the preposted receive buffer has already + * matched an incoming message. The @c request + * remains unchanged. + * + */ +psm2_error_t psm2_mq_cancel(psm2_mq_req_t *req); + +/*! @brief MQ statistics structure */ +struct psm2_mq_stats { + /** Bytes received into a matched user buffer */ + uint64_t rx_user_bytes; + /** Messages received into a matched user buffer */ + uint64_t rx_user_num; + /** Bytes received into an unmatched system buffer */ + uint64_t rx_sys_bytes; + /** Messages received into an unmatched system buffer */ + uint64_t rx_sys_num; + + /** Total Messages transmitted (shm and hfi) */ + uint64_t tx_num; + /** Messages transmitted eagerly */ + uint64_t tx_eager_num; + /** Bytes transmitted eagerly */ + uint64_t tx_eager_bytes; + /** Messages transmitted using expected TID mechanism */ + uint64_t tx_rndv_num; + /** Bytes transmitted using expected TID mechanism */ + uint64_t tx_rndv_bytes; + /** Messages transmitted (shm only) */ + uint64_t tx_shm_num; + /** Messages received through shm */ + uint64_t rx_shm_num; + + /** Number of system buffers allocated */ + uint64_t rx_sysbuf_num; + /** Bytes allcoated for system buffers */ + uint64_t rx_sysbuf_bytes; + + /** rank in MPI_COMM_WORLD, while unchanging, easiest to put here */ + uint64_t comm_world_rank; + + /** Internally reserved for future use */ + uint64_t _reserved[15]; +}; + +#define PSM2_MQ_NUM_STATS 13 /**< How many stats are currently used in @ref psm2_mq_stats */ + +/*! @see psm2_mq_stats */ + typedef struct psm2_mq_stats psm2_mq_stats_t; + +/** @brief Retrieve statistics from an instantiated MQ */ + void + psm2_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats); + +/*! @} */ +#ifdef __cplusplus +} /* extern "C" */ +#endif +#endif diff --git a/prov/psm3/psm3/psm_am.c b/prov/psm3/psm3/psm_am.c new file mode 100644 index 00000000000..f1f3a450df8 --- /dev/null +++ b/prov/psm3/psm3/psm_am.c @@ -0,0 +1,346 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_am.h" +#include "psm_am_internal.h" +#include "psm_mq_internal.h" + +int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid); + +/* AM capabilities parameters are initialized once in psmi_am_init_internal + and copied out in __psm2_am_get_parameters. When debugging is enabled, + various assertions reference these parameters for sanity checking. */ +struct psm2_am_parameters psmi_am_parameters = { 0 }; + +static int _ignore_handler(PSMI_AM_ARGS_DEFAULT) +{ + return 0; +} + +int psmi_abort_handler(PSMI_AM_ARGS_DEFAULT) +{ + abort(); + return 0; +} + +static void psmi_am_min_parameters(struct psm2_am_parameters *dest, + struct psm2_am_parameters *src) +{ + dest->max_handlers = min(dest->max_handlers, src->max_handlers); + dest->max_nargs = min(dest->max_nargs, src->max_nargs); + dest->max_request_short = + min(dest->max_request_short, src->max_request_short); + dest->max_reply_short = + min(dest->max_reply_short, src->max_reply_short); +} + +psm2_error_t psmi_am_init_internal(psm2_ep_t ep) +{ + int i; + struct psm2_ep_am_handle_entry *am_htable; + struct psm2_am_parameters params; + + psmi_am_parameters.max_handlers = INT_MAX; + psmi_am_parameters.max_nargs = INT_MAX; + psmi_am_parameters.max_request_short = INT_MAX; + psmi_am_parameters.max_reply_short = INT_MAX; + + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { + ep->ptl_self.am_get_parameters(ep, ¶ms); + psmi_am_min_parameters(&psmi_am_parameters, ¶ms); + } + + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { + ep->ptl_ips.am_get_parameters(ep, ¶ms); + psmi_am_min_parameters(&psmi_am_parameters, ¶ms); + } + + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { + ep->ptl_amsh.am_get_parameters(ep, ¶ms); + psmi_am_min_parameters(&psmi_am_parameters, ¶ms); + } + + ep->am_htable = + psmi_malloc(ep, UNDEFINED, + sizeof(struct psm2_ep_am_handle_entry) * PSMI_AM_NUM_HANDLERS); + if (ep->am_htable == NULL) + return PSM2_NO_MEMORY; + + am_htable = (struct psm2_ep_am_handle_entry *) ep->am_htable; + for (i = 0; i < PSMI_AM_NUM_HANDLERS; i++) { + am_htable[i].hfn = _ignore_handler; + am_htable[i].hctx = NULL; + am_htable[i].version = PSM2_AM_HANDLER_V2; + } + + return PSM2_OK; + +} + +void psmi_am_fini_internal(psm2_ep_t ep) +{ + if(ep->am_htable != NULL) 
{ + psmi_free(ep->am_htable); + } +} + +psm2_error_t +__psm2_am_register_handlers(psm2_ep_t ep, + const psm2_am_handler_fn_t *handlers, + int num_handlers, int *handlers_idx) +{ + int i, j; + + psmi_assert_always(ep->am_htable != NULL); + + PSM2_LOG_MSG("entering"); + /* For now just assign any free one */ + for (i = 0, j = 0; (i < PSMI_AM_NUM_HANDLERS) && (j < num_handlers); i++) { + if (ep->am_htable[i].hfn == _ignore_handler) { + ep->am_htable[i].hfn = handlers[j]; + ep->am_htable[i].hctx = NULL; + ep->am_htable[i].version = PSM2_AM_HANDLER_V1; + handlers_idx[j] = i; + if (++j == num_handlers) /* all registered */ + break; + } + } + + if (j < num_handlers) { + /* Not enough free handlers, restore unused handlers */ + for (i = 0; i < j; i++) { + ep->am_htable[handlers_idx[i]].hfn = _ignore_handler; + ep->am_htable[handlers_idx[i]].hctx = NULL; + ep->am_htable[handlers_idx[i]].version = PSM2_AM_HANDLER_V2; + } + PSM2_LOG_MSG("leaving"); + return psmi_handle_error(ep, PSM2_EP_NO_RESOURCES, + "Insufficient " + "available AM handlers: registered %d of %d requested handlers", + j, num_handlers); + } + else { + PSM2_LOG_MSG("leaving"); + return PSM2_OK; + } +} +PSMI_API_DECL(psm2_am_register_handlers) + +psm2_error_t +__psm2_am_register_handlers_2(psm2_ep_t ep, + const psm2_am_handler_2_fn_t *handlers, + int num_handlers, void **hctx, int *handlers_idx) +{ + int i, j; + + psmi_assert_always(ep->am_htable != NULL); + + PSM2_LOG_MSG("entering"); + /* For now just assign any free one */ + for (i = 0, j = 0; (i < PSMI_AM_NUM_HANDLERS) && (j < num_handlers); i++) { + if (ep->am_htable[i].hfn == _ignore_handler) { + ep->am_htable[i].hfn = handlers[j]; + ep->am_htable[i].hctx = hctx[j]; + ep->am_htable[i].version = PSM2_AM_HANDLER_V2; + handlers_idx[j] = i; + if (++j == num_handlers) /* all registered */ + break; + } + } + + if (j < num_handlers) { + /* Not enough free handlers, restore unused handlers */ + for (i = 0; i < j; i++) { + ep->am_htable[handlers_idx[i]].hfn = _ignore_handler; + ep->am_htable[handlers_idx[i]].hctx = NULL; + ep->am_htable[handlers_idx[i]].version = PSM2_AM_HANDLER_V2; + } + PSM2_LOG_MSG("leaving"); + return psmi_handle_error(ep, PSM2_EP_NO_RESOURCES, + "Insufficient " + "available AM handlers: registered %d of %d requested handlers", + j, num_handlers); + } + else { + PSM2_LOG_MSG("leaving"); + return PSM2_OK; + } +} +PSMI_API_DECL(psm2_am_register_handlers_2) + +void +__psm2_am_unregister_handlers(psm2_ep_t ep) +{ + int i; + + PSM2_LOG_MSG("entering"); + for (i = 0; i < PSMI_AM_NUM_HANDLERS; i++) { + if (ep->am_htable[i].hfn != _ignore_handler) { + ep->am_htable[i].hfn = _ignore_handler; + ep->am_htable[i].hctx = NULL; + ep->am_htable[i].version = PSM2_AM_HANDLER_V2; + } + } + PSM2_LOG_MSG("leaving"); +} +PSMI_API_DECL(psm2_am_unregister_handlers) + +psm2_error_t +__psm2_am_request_short(psm2_epaddr_t epaddr, psm2_handler_t handler, + psm2_amarg_t *args, int nargs, void *src, size_t len, + int flags, psm2_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm2_error_t err; + ptl_ctl_t *ptlc = epaddr->ptlctl; + + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + psmi_assert(epaddr != NULL); + psmi_assert(handler >= 0 && handler < psmi_am_parameters.max_handlers); + psmi_assert(nargs >= 0 && nargs <= psmi_am_parameters.max_nargs); + psmi_assert(nargs > 0 ? args != NULL : 1); + psmi_assert(len >= 0 && len <= psmi_am_parameters.max_request_short); + psmi_assert(len > 0 ? 
src != NULL : 1); + + PSMI_LOCK(ptlc->ep->mq->progress_lock); + + err = ptlc->am_short_request(epaddr, handler, args, + nargs, src, len, flags, completion_fn, + completion_ctxt); + PSMI_UNLOCK(ptlc->ep->mq->progress_lock); + PSM2_LOG_MSG("leaving"); + + return err; +} +PSMI_API_DECL(psm2_am_request_short) + +psm2_error_t +__psm2_am_reply_short(psm2_am_token_t token, psm2_handler_t handler, + psm2_amarg_t *args, int nargs, void *src, size_t len, + int flags, psm2_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm2_error_t err; + struct psmi_am_token *tok; + psm2_epaddr_t epaddr; + ptl_ctl_t *ptlc; + + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + psmi_assert_always(token != NULL); + psmi_assert(handler >= 0 && handler < psmi_am_parameters.max_handlers); + psmi_assert(nargs >= 0 && nargs <= psmi_am_parameters.max_nargs); + psmi_assert(nargs > 0 ? args != NULL : 1); + psmi_assert(len >= 0 && len <= psmi_am_parameters.max_reply_short); + psmi_assert(len > 0 ? src != NULL : 1); + + tok = (struct psmi_am_token *)token; + epaddr = tok->epaddr_incoming; + ptlc = epaddr->ptlctl; + + /* No locking here since we are already within handler context and already + * locked */ + + err = ptlc->am_short_reply(token, handler, args, + nargs, src, len, flags, completion_fn, + completion_ctxt); + PSM2_LOG_MSG("leaving"); + + return err; +} +PSMI_API_DECL(psm2_am_reply_short) + +psm2_error_t __psm2_am_get_source(psm2_am_token_t token, psm2_epaddr_t *epaddr_out) +{ + struct psmi_am_token *tok; + + PSM2_LOG_MSG("entering"); + if (token == NULL || epaddr_out == NULL) { + PSM2_LOG_MSG("leaving"); + return psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid %s parameters", __FUNCTION__); + } + + tok = (struct psmi_am_token *)token; + *epaddr_out = tok->epaddr_incoming; + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} +PSMI_API_DECL(psm2_am_get_source) + +psm2_error_t +__psm2_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters, + size_t sizeof_parameters_in, + size_t *sizeof_parameters_out) +{ + size_t s; + + PSM2_LOG_MSG("entering"); + if (parameters == NULL) { + PSM2_LOG_MSG("leaving"); + return psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid %s parameters", __FUNCTION__); + } + + memset(parameters, 0, sizeof_parameters_in); + s = min(sizeof(psmi_am_parameters), sizeof_parameters_in); + memcpy(parameters, &psmi_am_parameters, s); + *sizeof_parameters_out = s; + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} +PSMI_API_DECL(psm2_am_get_parameters) diff --git a/prov/psm3/psm3/psm_am_internal.h b/prov/psm3/psm3/psm_am_internal.h new file mode 100644 index 00000000000..af151dc18c1 --- /dev/null +++ b/prov/psm3/psm3/psm_am_internal.h @@ -0,0 +1,108 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _PSM2_AM_INTERNAL_H +#define _PSM2_AM_INTERNAL_H + +#define PSMI_AM_MAX_ARGS 10 +#define PSMI_AM_NUM_HANDLERS 256 /* must be power of 2 */ + +#define PSMI_AM_ARGS_DEFAULT psm2_am_token_t token, \ + psm2_amarg_t *args, int nargs, \ + void *src, uint32_t len, \ + void *hctx + +enum psm2_am_handler_version +{ + PSM2_AM_HANDLER_V1 = 0, + PSM2_AM_HANDLER_V2, +}; + +struct psm2_ep_am_handle_entry +{ + void *hfn; + void *hctx; + enum psm2_am_handler_version version; +}; + +struct psmi_am_token { + psm2_epaddr_t epaddr_incoming; + uint32_t flags; + /* Can handler reply? i.e. Not OPCODE_AM_REQUEST_NOREPLY request */ + uint32_t can_reply; + + /* PTLs may add other stuff here */ +}; + +/* AM capabilities parameters are initialized once in psmi_am_init_internal + and copied out in __psm2_am_get_parameters. When debugging is enabled, + various assertions reference these parameters for sanity checking. */ +extern struct psm2_am_parameters psmi_am_parameters; + +PSMI_ALWAYS_INLINE(struct psm2_ep_am_handle_entry * + psm_am_get_handler_function(psm2_ep_t ep, + psm2_handler_t handler_idx)) +{ + int hidx = handler_idx & (PSMI_AM_NUM_HANDLERS - 1); + struct psm2_ep_am_handle_entry *hentry = &ep->am_htable[hidx]; + psmi_assert_always(hentry != NULL); + return hentry; +} + +/* PSM internal initialization */ +psm2_error_t psmi_am_init_internal(psm2_ep_t ep); +void psmi_am_fini_internal(psm2_ep_t ep); + +#endif diff --git a/prov/psm3/psm3/psm_config.h b/prov/psm3/psm3/psm_config.h new file mode 100644 index 00000000000..9e671d15e54 --- /dev/null +++ b/prov/psm3/psm3/psm_config.h @@ -0,0 +1,211 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2018 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2018 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef PSM_CONFIG_H +#define PSM_CONFIG_H + +/* + * The following flags can be used instead of `make` switches in order to + * change behavior achieved when using `make` without parameters. + */ + +#ifndef RDPMC_PERF_FRAMEWORK +/* #define RDPMC_PERF_FRAMEWORK */ +#endif + +#ifndef PSM2_MOCK_TESTING +/* #define PSM2_MOCK_TESTING */ +#endif + +#ifndef PSM_CUDA +/* #define PSM_CUDA */ +/* #define NVIDIA_GPU_DIRECT */ +#endif + +#ifndef PSM3_BRAKE_DEBUG +/* #define PSM3_BRAKE_DEBUG */ +#endif + +#ifndef PSM_DEBUG +/* #define PSM_DEBUG */ +/* #define _HFI_DEBUGGING 1 */ +/* #define _FORTIFY_SOURCE 2 */ +#endif + +#ifndef PSM_HEAP_DEBUG +/* #define PSM_HEAP_DEBUG */ +#endif + +#ifndef PSM_PROFILE +/* #define PSM_PROFILE */ +#endif + +#define PSMI_MIN_EP_CONNECT_TIMEOUT (2 * SEC_ULL) +#define PSMI_MIN_EP_CLOSE_TIMEOUT (1 * SEC_ULL) +#define PSMI_MAX_EP_CLOSE_TIMEOUT (2 * SEC_ULL) + +#define PSMI_MIN_EP_CLOSE_GRACE_INTERVAL (1 * SEC_ULL) +#define PSMI_MAX_EP_CLOSE_GRACE_INTERVAL (2 * SEC_ULL) + + +#define PSMI_MAX_RAILS 32 /* Max number of unique devices */ + /* also sets PSMX3_MAX_UNITS in psmx3.h */ +#define PSMI_MAX_QPS 32 /* Max number of total QPs (QPs/NIC * RAILs) */ + /* must be >= PSMI_MAX_RAILS */ + +#define AFFINITY_SHM_BASENAME "/psm3_nic_affinity_shm" +#define AFFINITY_SHMEMSIZE sysconf(_SC_PAGE_SIZE) +#define AFFINITY_SHM_REF_COUNT_LOCATION 0 +#define AFFINITY_SHM_HFI_INDEX_LOCATION 1 +#define SEM_AFFINITY_SHM_RW_BASENAME "/psm3_nic_affinity_shm_rw_mutex" + +#define PSMI_RCVTHREAD_FLAGS 0x1 +/**< + * Default setting for Receive thread + * + * 0x0 disables rcvthread by default + * 0x1 enables ips receive thread by default + */ + +/* + * Define one of these below. 
+ * + * Spinlock gives the best performance and makes sense with the progress thread + * only because the progress thread does a "trylock" and then goes back to + * sleep in a poll. + * + * Mutexlock should be used for experimentation while the more useful + * mutexlock-debug should be enabled during development to catch potential + * errors. + */ +#ifdef PSM_DEBUG +#define PSMI_LOCK_IS_MUTEXLOCK_DEBUG +#else +#define PSMI_LOCK_IS_SPINLOCK +/* #define PSMI_LOCK_IS_MUTEXLOCK */ +/* #define PSMI_LOCK_IS_MUTEXLOCK_DEBUG */ +/* #define PSMI_PLOCK_IS_NOLOCK */ +#endif + +#ifdef PSM_CUDA +/* XXX TODO: Getting the gpu page size from driver at init time */ +#define PSMI_GPU_PAGESIZE 65536 + +#define CUDA_SMALLHOSTBUF_SZ (256*1024) +#define CUDA_WINDOW_PREFETCH_DEFAULT 2 +#define GPUDIRECT_THRESH_RV 3 + +#define GDR_COPY_THRESH_SEND 32 +#define GDR_COPY_THRESH_RECV 64000 +/* All GPU transfers beyond this threshold use + * RNDV protocol. It is mostly a send side knob. + */ +#define CUDA_THRESH_RNDV 32768 +#endif + +#define MQ_HFI_THRESH_TINY 8 +#define MQ_HFI_THRESH_EGR_SDMA_XEON 34000 /* Eager Xeon blocking */ +#define MQ_HFI_THRESH_EGR_SDMA_PHI2 200000 /* Eager Phi2 blocking */ +#define MQ_HFI_THRESH_EGR_SDMA_SQ_XEON 16384 /* Eager Xeon non-blocking */ +#define MQ_HFI_THRESH_EGR_SDMA_SQ_PHI2 65536 /* Eager Phi2 non-blocking */ + +#define MQ_HFI_THRESH_RNDV_PHI2 200000 +#define MQ_HFI_THRESH_RNDV_XEON 64000 + +#define MQ_HFI_WINDOW_RNDV_PHI2 4194304 +#define MQ_HFI_WINDOW_RNDV_XEON 131072 + +#ifdef PSM_CUDA +#define MQ_HFI_WINDOW_RNDV_CUDA 2097152 +#endif + +#define MQ_SHM_THRESH_RNDV 16000 + +#define NUM_HASH_BUCKETS 64 +#define HASH_THRESHOLD 65 +#define NUM_HASH_CONFIGS 3 +#define NUM_MQ_SUBLISTS (NUM_HASH_CONFIGS + 1) + +#define REMOVE_ENTRY 1 + + +/* Keep timer stats */ +#define PSMI_TIMER_STATS 0 + + +/* Psm context */ +#define HAL_CONTEXT_OPEN_RETRY_MAX 3 + + +/* + * By default, PSMI_DEVICES_DEFAULT establishes the bind order a component is + * tested for reachability to each peer. First self, then shm and finally + * hfi. The order should really only affect endpoints that happen to be on + * the same node. PSM will correctly detect that two endpoints are on the same + * node even though they may be using different host interfaces. + */ +#define PSMI_DEVICES_DEFAULT "self,shm,nic" + +/* Lock */ +#define PSMI_USE_PTHREAD_SPINLOCKS 0 + +/* Utils */ +#define PSMI_EPID_TABSIZE_CHUNK 128 +#define PSMI_EPID_TABLOAD_FACTOR ((float)0.7) + +#define PSMI_EP_HOSTNAME_LEN 64 /* hostname only */ +#define PSMI_EP_NAME_LEN 96 /* hostname:LID:context:subcontext */ + +#define PSMI_FAULTINJ_SPEC_NAMELEN 32 + +#endif /* PSM_CONFIG_H */ diff --git a/prov/psm3/psm3/psm_context.c b/prov/psm3/psm3/psm_context.c new file mode 100644 index 00000000000..b48564e7f28 --- /dev/null +++ b/prov/psm3/psm3/psm_context.c @@ -0,0 +1,648 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + +#include +#include +#include "psm_user.h" +#include "psm2_hal.h" + +static int psmi_get_hfi_selection_algorithm(void); + +psm2_error_t psmi_context_interrupt_set(psmi_context_t *context, int enable) +{ + int poll_type; + int ret; + + if (!enable == !psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED)) + return PSM2_OK; + + if (enable) + poll_type = PSMI_HAL_POLL_TYPE_URGENT; + else + poll_type = 0; + + // we need the ep->verbs_ep and no way to get from psm_hw_ctxt to + // the ep. So we need a new function instead of just changing a HAL func + // if verbs_ep was the psm_hw_ctxt for UD HAL, this would not be necessary + ret = __psm2_ep_poll_type(poll_type, context->ep); + + if (ret != 0) + return PSM2_EP_NO_RESOURCES; + else { + if (enable) + psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED); + else + psmi_hal_sub_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED); + return PSM2_OK; + } +} + +int psmi_context_interrupt_isenabled(psmi_context_t *context) +{ + return psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED); +} + + +/* returns the 8-bit hash value of an uuid. */ +static inline +uint8_t +psmi_get_uuid_hash(psm2_uuid_t const uuid) +{ + int i; + uint8_t hashed_uuid = 0; + + for (i=0; i < sizeof(psm2_uuid_t); ++i) + hashed_uuid ^= *((uint8_t const *)uuid + i); + + return hashed_uuid; +} + +int psmi_get_current_proc_location() +{ + int core_id, node_id; + + core_id = sched_getcpu(); + if (core_id < 0) + return -EINVAL; + + node_id = numa_node_of_cpu(core_id); + if (node_id < 0) + return -EINVAL; + + return node_id; +} + +static void +psmi_spread_hfi_selection(psm2_uuid_t const job_key, long *unit_start, + long *unit_end, int nunits) +{ + { + /* else, we are going to look at: + (a hash of the job key plus the local rank id) mod nunits. 
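For example (illustrative numbers only): with a local rank of 0, a job-key hash of 45 and 4 units, *unit_start becomes (0 + 1 + 45) % 4 = 2 and *unit_end becomes 1, so the caller scans units 2, 3, 0, 1 in that order.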
*/ + + *unit_start = ((hfi_get_mylocalrank()+1) + + psmi_get_uuid_hash(job_key)) % nunits; + if (*unit_start > 0) + *unit_end = *unit_start - 1; + else + *unit_end = nunits-1; + } +} + +static int +psmi_create_and_open_affinity_shm(psm2_uuid_t const job_key) +{ + int shm_fd, ret; + int first_to_create = 0; + size_t shm_name_len = 256; + shared_affinity_ptr = NULL; + affinity_shm_name = NULL; + affinity_shm_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, shm_name_len); + + psmi_assert_always(affinity_shm_name != NULL); + snprintf(affinity_shm_name, shm_name_len, + AFFINITY_SHM_BASENAME".%d", + psmi_get_uuid_hash(job_key)); + shm_fd = shm_open(affinity_shm_name, O_RDWR | O_CREAT | O_EXCL, + S_IRUSR | S_IWUSR); + if ((shm_fd < 0) && (errno == EEXIST)) { + shm_fd = shm_open(affinity_shm_name, O_RDWR, S_IRUSR | S_IWUSR); + if (shm_fd < 0) { + _HFI_VDBG("Cannot open affinity shared mem fd:%s, errno=%d\n", + affinity_shm_name, errno); + return shm_fd; + } + } else if (shm_fd > 0) { + first_to_create = 1; + } else { + _HFI_VDBG("Cannot create affinity shared mem fd:%s, errno=%d\n", + affinity_shm_name, errno); + } + + ret = ftruncate(shm_fd, AFFINITY_SHMEMSIZE); + if ( ret < 0 ) { + _HFI_VDBG("Cannot truncate affinity shared mem fd:%s, errno=%d\n", + affinity_shm_name, errno); + if (shm_fd >= 0) close(shm_fd); + return ret; + } + + shared_affinity_ptr = (uint64_t *) mmap(NULL, AFFINITY_SHMEMSIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, shm_fd, 0); + if (shared_affinity_ptr == MAP_FAILED) { + _HFI_VDBG("Cannot mmap affinity shared memory. errno=%d\n", + errno); + close(shm_fd); + return -1; + } + close(shm_fd); + + psmi_affinity_shared_file_opened = 1; + + if (first_to_create) { + _HFI_VDBG("Creating shm to store NIC affinity per socket\n"); + + memset(shared_affinity_ptr, 0, AFFINITY_SHMEMSIZE); + + /* + * Once shm object is initialized, unlock others to be able to + * use it. + */ + psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name); + } else { + _HFI_VDBG("Opening shm object to read/write NIC affinity per socket\n"); + } + + /* + * Start critical section to increment reference count when creating + * or opening shm object. Decrement of ref count will be done before + * closing the shm. + */ + if (psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name)) { + _HFI_VDBG("Could not enter critical section to update shm refcount\n"); + return -1; + } + + shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] += 1; + + /* End critical section */ + psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name); + + return 0; +} + +/* + * Spread HFI selection between units if we find more than one within a socket. + */ +static void +psmi_spread_hfi_within_socket(long *unit_start, long *unit_end, int node_id, + int *saved_hfis, int found, psm2_uuid_t const job_key) +{ + int ret, shm_location; + + /* + * Take affinity lock and open shared memory region to be able to + * accurately determine which HFI to pick for this process. If any + * issues, bail by picking first known HFI. 
+ */ + if (!psmi_affinity_semaphore_open) + goto spread_hfi_fallback; + + ret = psmi_create_and_open_affinity_shm(job_key); + if (ret < 0) + goto spread_hfi_fallback; + + shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION + node_id; + if (shm_location > AFFINITY_SHMEMSIZE) + goto spread_hfi_fallback; + + /* Start critical section to read/write shm object */ + if (psmi_sem_timedwait(sem_affinity_shm_rw, sem_affinity_shm_rw_name)) { + _HFI_VDBG("Could not enter critical section to update NIC index\n"); + goto spread_hfi_fallback; + } + + *unit_start = *unit_end = saved_hfis[shared_affinity_ptr[shm_location]]; + shared_affinity_ptr[shm_location] = + (shared_affinity_ptr[shm_location] + 1) % found; + _HFI_VDBG("Selected NIC index= %ld, Next NIC=%ld, node = %d, local rank=%d, found=%d.\n", + *unit_start, shared_affinity_ptr[shm_location], node_id, + hfi_get_mylocalrank(), found); + + /* End Critical Section */ + psmi_sem_post(sem_affinity_shm_rw, sem_affinity_shm_rw_name); + + return; + +spread_hfi_fallback: + *unit_start = *unit_end = saved_hfis[0]; +} + +static void +psmi_create_affinity_semaphores(psm2_uuid_t const job_key) +{ + int ret; + sem_affinity_shm_rw_name = NULL; + size_t sem_len = 256; + + /* + * If already opened, no need to do anything else. + * This could be true for Multi-EP cases where a different thread has + * already created the semaphores. We don't need separate locks here as + * we are protected by the overall "psmi_creation_lock" which each + * thread will take in psm2_ep_open() + */ + if (psmi_affinity_semaphore_open) + return; + + sem_affinity_shm_rw_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, sem_len); + psmi_assert_always(sem_affinity_shm_rw_name != NULL); + snprintf(sem_affinity_shm_rw_name, sem_len, + SEM_AFFINITY_SHM_RW_BASENAME".%d", + psmi_get_uuid_hash(job_key)); + + ret = psmi_init_semaphore(&sem_affinity_shm_rw, sem_affinity_shm_rw_name, + S_IRUSR | S_IWUSR, 0); + if (ret) { + _HFI_VDBG("Cannot initialize semaphore: %s for read-write access to shm object.\n", + sem_affinity_shm_rw_name); + sem_close(sem_affinity_shm_rw); + psmi_free(sem_affinity_shm_rw_name); + sem_affinity_shm_rw_name = NULL; + return; + } + + _HFI_VDBG("Semaphore: %s created for read-write access to shm object.\n", + sem_affinity_shm_rw_name); + + psmi_affinity_semaphore_open = 1; + + return; +} + +// return set of units to consider and which to start at. +// caller will use 1st active unit which can be opened. +// caller will wrap around so it's valid for start > end +static +psm2_error_t +psmi_compute_start_and_end_unit(long unit_param,int nunitsactive,int nunits, + psm2_uuid_t const job_key, + long *unit_start,long *unit_end) +{ + unsigned short hfi_sel_alg = PSMI_UNIT_SEL_ALG_ACROSS; + int node_id, unit_id, found = 0; + int saved_hfis[nunits]; + + /* if the user did not set PSM3_NIC then ... */ + if (unit_param == PSM3_NIC_ANY) + { + if (nunitsactive > 1) { + // if NICs are on different subnets, and ! allow_routers + // we need to have all ranks default to the same subnet + // so force 1st active NIC in that case + uint64_t subnet; + int have_subnet = 0; + int have_eth = 0; + for (unit_id = 0; unit_id < nunits; unit_id++) { + uint64_t gid_hi, hi; + int is_eth = 0; + if (psmi_hal_get_unit_active(unit_id) <= 0) + continue; + if (0 != psmi_hal_get_port_subnet(unit_id, 1 /* VERBS_PORT*/, + &gid_hi, NULL, NULL, NULL, NULL, &hi, NULL)) + continue; // can't access NIC + is_eth = (gid_hi != hi); + if (! 
have_subnet) { + subnet = gid_hi; + have_subnet = 1; + have_eth = is_eth; + } else if (have_eth != is_eth + || (subnet != gid_hi + && (! is_eth || ! psmi_allow_routers))) { + // active units have different subnets + // caller will pick 1st active unit + *unit_start = 0; + *unit_end = nunits - 1; + return PSM2_OK; + } + } + } + + /* Get the actual selection algorithm from the environment: */ + hfi_sel_alg = psmi_get_hfi_selection_algorithm(); + /* If round-robin is selection algorithm and ... */ + if ((hfi_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS) && + /* there are more than 1 active units then ... */ + (nunitsactive > 1)) + { + /* + * Pick first HFI we find on same root complex + * as current task. If none found, fall back to + * load-balancing algorithm. + */ + node_id = psmi_get_current_proc_location(); + if (node_id >= 0) { + for (unit_id = 0; unit_id < nunits; unit_id++) { + if (psmi_hal_get_unit_active(unit_id) <= 0) + continue; + + int node_id_i; + + if (!psmi_hal_get_node_id(unit_id, &node_id_i)) { + if (node_id_i == node_id) { + saved_hfis[found] = unit_id; + found++; + } + } + } + + if (found > 1) { + psmi_create_affinity_semaphores(job_key); + psmi_spread_hfi_within_socket(unit_start, unit_end, + node_id, saved_hfis, + found, job_key); + } else if (found == 1) { + *unit_start = *unit_end = saved_hfis[0]; + } + } + + if (node_id < 0 || !found) { + psmi_spread_hfi_selection(job_key, unit_start, + unit_end, nunits); + } + } else if ((hfi_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS_ALL) && + (nunitsactive > 1)) { + psmi_spread_hfi_selection(job_key, unit_start, + unit_end, nunits); + } + else { // PSMI_UNIT_SEL_ALG_WITHIN or only 1 active unit + // caller will pick 1st active unit + *unit_start = 0; + *unit_end = nunits - 1; + } + } else if (unit_param >= 0) { + /* the user specified PSM3_NIC, we use it. */ + *unit_start = *unit_end = unit_param; + } else { + psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM3 can't open unit: %ld for reading and writing", + unit_param); + return PSM2_EP_DEVICE_FAILURE; + } + + return PSM2_OK; +} + +psm2_error_t +psmi_context_open(const psm2_ep_t ep, long unit_param, long port, + psm2_uuid_t const job_key, int64_t timeout_ns, + psmi_context_t *context) +{ + long open_timeout = 0, unit_start, unit_end, unit_id, unit_id_prev; + psm2_error_t err = PSM2_OK; + int nunits = psmi_hal_get_num_units(), nunitsactive=0; + + /* + * If shared contexts are enabled, try our best to schedule processes + * across one or many devices + */ + + /* if no units, then no joy. */ + if (nunits <= 0) + { + err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM3 no nic units are available"); + goto ret; + } + + /* Calculate the number of active units: */ + for (unit_id=0;unit_id < nunits;unit_id++) + { + if (psmi_hal_get_unit_active(unit_id) > 0) + nunitsactive++; + } + /* if no active units, then no joy. */ + if (nunitsactive == 0) + { + err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM3 no nic units are active"); + goto ret; + } + if (timeout_ns > 0) + open_timeout = (long)(timeout_ns / MSEC_ULL); + + + unit_start = 0; unit_end = nunits - 1; + err = psmi_compute_start_and_end_unit(unit_param, nunitsactive, + nunits, job_key, + &unit_start, &unit_end); + if (err != PSM2_OK) + return err; + + /* this is the start of a loop that starts at unit_start and goes to unit_end. + but note that the way the loop computes the loop control variable is by + an expression involving the mod operator. 
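For example, with nunits = 4, unit_start = 2 and unit_end = 1, the loop below probes units 2, 3, 0 and 1 in that order, skipping inactive units and stopping at the first one that opens successfully.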
*/ + int success = 0; + unit_id_prev = unit_id = unit_start; + do + { + /* close previous opened unit fd before attempting open of current unit. */ + if (context->psm_hw_ctxt) { + psmi_hal_close_context(&context->psm_hw_ctxt); + context->psm_hw_ctxt = 0; + } + + /* if the unit_id is not active, go to next one. */ + if (psmi_hal_get_unit_active(unit_id) <= 0) { + unit_id_prev = unit_id; + unit_id = (unit_id + 1) % nunits; + continue; + } + + /* open this unit. */ + int rv = psmi_hal_context_open(unit_id, port, open_timeout, + ep, job_key, context, + psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED), + HAL_CONTEXT_OPEN_RETRY_MAX); + + /* go to next unit if failed to open. */ + if (rv || context->psm_hw_ctxt == NULL) { + unit_id_prev = unit_id; + unit_id = (unit_id + 1) % nunits; + continue; + } + + success = 1; + break; + + } while (unit_id_prev != unit_end); + + if (!success) + { + err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM3 can't open nic unit: %ld",unit_param); + goto bail; + } + + context->ep = (psm2_ep_t) ep; + + /* Check backward compatibility bits here and save the info */ + if (psmi_hal_has_cap(PSM_HAL_CAP_GPUDIRECT_OT)) + { +#ifdef PSM_CUDA + is_driver_gpudirect_enabled = 1; +#else + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "FATAL ERROR: " + "CUDA version of rendezvous driver is loaded with non-CUDA version of " + "psm3 provider.\n"); +#endif + } +#ifdef PSM_CUDA + else + fprintf(stderr,"WARNING: running CUDA version of psm3 provider with non CUDA version of rendezvous driver.\n"); +#endif + _HFI_VDBG("hal_context_open() passed.\n"); + + /* Construct epid for this Endpoint */ + psmi_assert_always(PSMI_EPID_VERSION == PSMI_EPID_V3 + || PSMI_EPID_VERSION == PSMI_EPID_V4); + psmi_assert_always (ep->verbs_ep.context); + // TBD - if we put the verbs_ep in hw_ctxt we could push this to HAL + // verbs_ep_open has initialized: ep->unit_id, ep->portnum, + // ep->gid_hi, ep->gid_lo + if (ep->verbs_ep.link_layer == IBV_LINK_LAYER_ETHERNET) { + char buf[INET_ADDRSTRLEN]; + int netmask_bits = psmi_count_high_bits(ep->verbs_ep.ip_netmask); + if (netmask_bits < 0) { + err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM3 invalid netmask: %s", + psmi_ipv4_ntop(ep->verbs_ep.ip_netmask, buf, sizeof(buf))); + goto bail; + } + psmi_epid_ver = PSMI_EPID_V4; // overide default based on device + context->epid = PSMI_EPID_PACK_V4(ep->verbs_ep.ip_addr, + ep->verbs_ep.qp->qp_num, netmask_bits); + _HFI_VDBG("construct epid v4: 0x%"PRIx64" ip %s subnet_bits %u qp %d mtu %d\n", + context->epid, + psmi_ipv4_ntop(ep->verbs_ep.ip_addr, buf, sizeof(buf)), + netmask_bits, ep->verbs_ep.qp->qp_num, ep->mtu); + } else { + unsigned subnet = ep->gid_hi & 0xffff; + psmi_epid_ver = PSMI_EPID_V3; // overide default based on device + context->epid = PSMI_EPID_PACK_V3(ep->verbs_ep.port_attr.lid, + ep->verbs_ep.qp->qp_num, + subnet /*ep->gid_hi*/); + _HFI_VDBG("construct epid v3: 0x%"PRIx64" lid %d qp %d subnet 0x%x mtu %d\n", + context->epid, ep->verbs_ep.port_attr.lid, + ep->verbs_ep.qp->qp_num, subnet, ep->mtu); + } + + goto ret; + +bail: + _HFI_PRDBG("open failed: unit_id: %ld, err: %d (%s)\n", unit_id, err, strerror(errno)); + if (context->psm_hw_ctxt) { + psmi_hal_close_context(&context->psm_hw_ctxt); + context->psm_hw_ctxt = 0; + } +ret: + + _HFI_VDBG("psmi_context_open() return %d\n", err); + return err; +} + +psm2_error_t psmi_context_close(psmi_context_t *context) +{ + if (context->psm_hw_ctxt) { + psmi_hal_close_context(&context->psm_hw_ctxt); + 
context->psm_hw_ctxt = 0; + } + + return PSM2_OK; +} + +/* + * This function works whether a context is initialized or not in a psm2_ep. + * + * Returns one of + * + * PSM2_OK: Port status is ok (or context not initialized yet but still "ok") + * PSM2_OK_NO_PROGRESS: Cable pulled + * PSM2_EP_NO_NETWORK: No network, no lid, ... + * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc. + * The message follows the per-port status + * As of 7322-ready driver, need to check port-specific qword for IB + * as well as older unit-only. For now, we don't have the port interface + * defined, so just check port 0 qword for spi_status + */ +psm2_error_t psmi_context_check_status(const psmi_context_t *contexti) +{ + psm2_error_t err = PSM2_OK; + return err; +} + +static +int psmi_get_hfi_selection_algorithm(void) +{ + union psmi_envvar_val env_hfi1_alg; + int hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS; + + /* If a specific unit is set in the environment, use that one. */ + psmi_getenv("PSM3_NIC_SELECTION_ALG", + "NIC Device Selection Algorithm to use. Round Robin[RoundRobin or rr] (Default) " + ", Packed[p] or Round Robin All[RoundRobinAll or rra].", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"rr", &env_hfi1_alg); + + if (!strcasecmp(env_hfi1_alg.e_str, "Round Robin") + || !strcasecmp(env_hfi1_alg.e_str, "RoundRobin") + || !strcasecmp(env_hfi1_alg.e_str, "rr")) + hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS; + else if (!strcasecmp(env_hfi1_alg.e_str, "Packed") + || !strcasecmp(env_hfi1_alg.e_str, "p")) + hfi1_alg = PSMI_UNIT_SEL_ALG_WITHIN; + else if (!strcasecmp(env_hfi1_alg.e_str, "Round Robin All") + || !strcasecmp(env_hfi1_alg.e_str, "RoundRobinAll") + || !strcasecmp(env_hfi1_alg.e_str, "rra")) + hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS_ALL; + else { + _HFI_ERROR + ("Unknown NIC selection algorithm %s. Defaulting to Round Robin " + "allocation of NICs.\n", env_hfi1_alg.e_str); + hfi1_alg = PSMI_UNIT_SEL_ALG_ACROSS; + } + + return hfi1_alg; +} diff --git a/prov/psm3/psm3/psm_context.h b/prov/psm3/psm3/psm_context.h new file mode 100644 index 00000000000..c9387d1ac25 --- /dev/null +++ b/prov/psm3/psm3/psm_context.h @@ -0,0 +1,119 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _PSMI_IN_USER_H +#error psm_context.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSM_CONTEXT_H +#define _PSM_CONTEXT_H + +typedef +struct psmi_context { + + /* The following three member variables are used for sharing contexts among + subcontexts and they have the following common properties: + + a. They are all initialized below HAL layer when the context is opened. + b. If they are NULL that means no context is being shared among subcontexts, + non-NULL means a context is being shared among some number of subcontexts. + c. The initialization code is currently found in the gen1 hal instance. + */ + + void *spio_ctrl; + void *tid_ctrl; + void *tf_ctrl; + + /* end of shared context member variables. */ + + psmi_hal_hw_context psm_hw_ctxt; + + psm2_ep_t ep; /* psm ep handle */ + psm2_epid_t epid; /* psm integral ep id */ + psm2_error_t status_lasterr; + time_t networkLostTime; +} psmi_context_t; + +psm2_error_t +psmi_context_open(const psm2_ep_t ep, long unit_id, long port, + psm2_uuid_t const job_key, + int64_t timeout_ns, psmi_context_t *context); + +psm2_error_t psmi_context_close(psmi_context_t *context); + +/* Check status of context */ +psm2_error_t psmi_context_check_status(const psmi_context_t *context); + +psm2_error_t psmi_context_interrupt_set(psmi_context_t *context, int enable); +int psmi_context_interrupt_isenabled(psmi_context_t *context); + +/* + * round robin contexts across HFIs, then + * ports; this is the default. + * This option spreads the HFI selection within the local socket. + * If it is preferred to spread job over over entire set of + * HFIs within the system, see ALG_ACROSS_ALL below. + */ +#define PSMI_UNIT_SEL_ALG_ACROSS PSM_HAL_ALG_ACROSS + +#define PSMI_UNIT_SEL_ALG_ACROSS_ALL PSM_HAL_ALG_ACROSS_ALL + +/* + * use all contexts on an HFI (round robin + * active ports within), then next HFI + */ +#define PSMI_UNIT_SEL_ALG_WITHIN PSM_HAL_ALG_WITHIN + +#endif /* PSM_CONTEXT_H */ diff --git a/prov/psm3/psm3/psm_diags.c b/prov/psm3/psm3/psm_diags.c new file mode 100644 index 00000000000..8b4ba8a1821 --- /dev/null +++ b/prov/psm3/psm3/psm_diags.c @@ -0,0 +1,368 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +typedef void (*memcpy_fn_t) (void *dst, const void *src, size_t n); +static int psmi_test_memcpy(memcpy_fn_t, const char *name); +static int psmi_test_epid_table(int numelems); + +int psmi_diags(void); + +#define diags_assert(x) do { \ + if (!(x)) { \ + _HFI_ERROR("Diags assertion failure: %s\n", \ + #x); \ + goto fail; \ + } \ + } while (0) + +#define DIAGS_RETURN_PASS(str) \ + do { _HFI_INFO("%s: PASSED %s\n", __func__, str); return 0; } \ + while (0) +#define DIAGS_RETURN_FAIL(str) \ + do { _HFI_INFO("%s: FAILED %s\n", __func__, str); return 1; } \ + while (0) + +int psmi_diags(void) +{ + int ret = 0; + ret |= psmi_test_epid_table(2048); + ret |= psmi_test_memcpy((memcpy_fn_t) psmi_memcpyo, "psmi_memcpyo"); + /* ret |= psmi_test_memcpy((memcpy_fn_t) psmi_mq_mtucpy, "psmi_mq_mtucpy"); */ + + if (ret) + DIAGS_RETURN_FAIL(""); + else + DIAGS_RETURN_PASS(""); +} + +/* + * Hash table test + */ +#define NALLOC 1024 +static int psmi_test_epid_table(int numelems) +{ + ptl_ctl_t ctl; + psm2_epaddr_t *ep_array, epaddr, ep_alloc; + psm2_epid_t *epid_array, epid_tmp; + psm2_ep_t ep = (psm2_ep_t) (uintptr_t) 0xabcdef00; + struct psmi_epid_table *tab; + int i, j; + struct drand48_data drand48_data; + + ep_alloc = + (psm2_epaddr_t) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems, + sizeof(struct psm2_epaddr)); + ep_array = + (psm2_epaddr_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems, + sizeof(struct psm2_epaddr *)); + epid_array = + (psm2_epid_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems, + sizeof(psm2_epid_t)); + diags_assert(ep_alloc != NULL); + diags_assert(ep_array != NULL); + diags_assert(epid_array != NULL); + + srand48_r(12345678, &drand48_data); + + psmi_epid_init(); + tab = &psmi_epid_table; + ctl.ep = ep; + + for (i = 0; i < numelems; i++) { + epid_array[i] = i; + ep_alloc[i].ptlctl = &ctl; + ep_alloc[i].epid = epid_array[i]; + ep_array[i] = &ep_alloc[i]; + } + for (i = 0; i < numelems; i++) { + psmi_epid_add(ep, epid_array[i], ep_array[i]); + } + + /* Randomize epid_array */ + for (i = 0; i < numelems; i++) { + long int rand_result; + lrand48_r(&drand48_data, &rand_result); + j = (int)(rand_result % numelems); + epid_tmp = epid_array[i]; + epid_array[i] = epid_array[j]; + epid_array[j] = epid_tmp; + } + /* Lookup. 
*/ + for (i = 0; i < numelems; i++) { + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr != NULL); + diags_assert(epaddr->epid == epid_array[i]); + diags_assert(epaddr->ptlctl->ep == ep); + } + + /* Randomize epid_array again */ + for (i = 0; i < numelems; i++) { + long int rand_result; + lrand48_r(&drand48_data, &rand_result); + j = (int)(rand_result % numelems); + epid_tmp = epid_array[i]; + epid_array[i] = epid_array[j]; + epid_array[j] = epid_tmp; + } + /* Delete half */ + for (i = 0; i < numelems / 2; i++) { + epaddr = psmi_epid_remove(ep, epid_array[i]); + diags_assert(epaddr != NULL); + diags_assert(epaddr->epid == epid_array[i]); + diags_assert(epaddr->ptlctl->ep == ep); + } + /* Lookup other half -- expect non-NULL, then delete */ + for (i = numelems / 2; i < numelems; i++) { + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr != NULL); + diags_assert(epaddr->epid == epid_array[i]); + diags_assert(epaddr->ptlctl->ep == ep); + epaddr = psmi_epid_remove(ep, epid_array[i]); + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr == NULL); + } + /* Lookup whole thing, expect done */ + for (i = 0; i < numelems; i++) { + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr == NULL); + } + for (i = 0; i < tab->tabsize; i++) { + diags_assert(tab->table[i].entry == NULL || + tab->table[i].entry == EPADDR_DELETED); + } + + /* Make sure we're not leaking memory somewhere... */ + diags_assert(tab->tabsize > tab->tabsize_used && + tab->tabsize * PSMI_EPID_TABLOAD_FACTOR > + tab->tabsize_used); + + /* Only free on success */ + psmi_epid_fini(); + psmi_free(epid_array); + psmi_free(ep_array); + psmi_free(ep_alloc); + DIAGS_RETURN_PASS(""); + +fail: + /* Klocwork scan report memory leak. */ + psmi_epid_fini(); + if (epid_array) + psmi_free(epid_array); + if (ep_array) + psmi_free(ep_array); + if (ep_alloc) + psmi_free(ep_alloc); + DIAGS_RETURN_FAIL(""); +} + +/* + * Memcpy correctness test + */ +static int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n); +static void *memcpy_check_one(memcpy_fn_t fn, void *dst, void *src, size_t n); + +static int psmi_test_memcpy(memcpy_fn_t fn, const char *memcpy_name) +{ + const int CORNERS = 0; + const long long lo = 1; + const long long hi = 16 * 1024 * 1024; + const long long below = 32; + const long long above = 32; + long long n, m; + char buf[128]; + int ret = 0; + int memcpy_passed; + int memcpy_failed; + + memcpy_passed = 0; + memcpy_failed = 0; + + ret = memcpy_check_size(fn, &memcpy_passed, &memcpy_failed, 0); + if (ret < 0) + DIAGS_RETURN_FAIL("no heap space"); + + for (n = lo; n <= hi; n <<= 1) { + _HFI_INFO("%s %d align=0..16\n", memcpy_name, (int)n); + for (m = n - below; m <= n + above; m++) { + if (m == n) { + ret = + memcpy_check_size(fn, &memcpy_passed, + &memcpy_failed, n); + if (ret < 0) + DIAGS_RETURN_FAIL("no heap space"); + } else if (CORNERS && m >= lo && m <= hi && m > (n >> 1) + && m < max(n, ((n << 1) - below))) { + ret = + memcpy_check_size(fn, &memcpy_passed, + &memcpy_failed, + (size_t) m); + if (ret < 0) + DIAGS_RETURN_FAIL("no heap space"); + } + } + } + + int total = memcpy_passed + memcpy_failed; + if (total > 0) { + _HFI_INFO("%d memcpy tests with %d passed (%.2f%%) " + "and %d failed (%.2f%%)\n", + total, memcpy_passed, (100.0 * memcpy_passed) / total, + memcpy_failed, (100.0 * memcpy_failed) / total); + } + if (memcpy_failed) { + snprintf(buf, sizeof(buf), "%s %.2f%% of tests memcpy_failed", + memcpy_name, (100.0 * memcpy_failed) / total); + 
DIAGS_RETURN_FAIL(buf); + } else { + DIAGS_RETURN_PASS(memcpy_name); + } +} + +void *memcpy_check_one(memcpy_fn_t fn, void *dst, void *src, size_t n) +{ + int ok = 1; + unsigned int seed = (unsigned int) + ((uintptr_t) dst ^ (uintptr_t) src ^ (uintptr_t) n); + size_t i; + struct drand48_data drand48_data; + + if (!n) + return dst; + + memset(src, 0x55, n); + memset(dst, 0xaa, n); + srand48_r(seed, &drand48_data); + for (i = 0; i < n; i++) { + long int rand_result; + lrand48_r(&drand48_data, &rand_result); + ((uint8_t *) src)[i] = (((int)(rand_result & INT_MAX)) >> 16) & 0xff; + } + + fn(dst, src, n); + memset(src, 0, n); + srand48_r(seed, &drand48_data); + for (i = 0; i < n; i++) { + long int rand_result; + lrand48_r(&drand48_data, &rand_result); + int value = (int)(uint8_t) (((int)(rand_result % INT_MAX)) >> 16); + int v = (int)((uint8_t *) dst)[i]; + if (v != value) { + _HFI_ERROR + ("Error on index %llu : got %d instead of %d\n", + (unsigned long long)i, v, value); + ok = 0; + } + } + return ok ? dst : NULL; +} + +int memcpy_check_size(memcpy_fn_t fn, int *p, int *f, size_t n) +{ +#define num_aligns 16 +#define USE_MALLOC 0 +#define DEBUG 0 + uint8_t *src; + uint8_t *dst; + size_t size = n * 2 + num_aligns; + if (USE_MALLOC) { + src = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); + dst = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); + if (src == NULL || dst == NULL) { + if (src) psmi_free(src); + if (dst) psmi_free(dst); + return -1; + } + } else { + void *src_p = NULL, *dst_p = NULL; + if (posix_memalign(&src_p, 64, size) != 0 || + posix_memalign(&dst_p, 64, size) != 0) { + if (src_p) free(src_p); + if (dst_p) free(dst_p); + return -1; + } + src = (uint8_t *) src_p; + dst = (uint8_t *) dst_p; + } + + int src_align, dst_align; + for (src_align = 0; src_align < num_aligns; src_align++) { + for (dst_align = 0; dst_align < num_aligns; dst_align++) { + uint8_t *d = ((uint8_t *) dst) + dst_align; + uint8_t *s = ((uint8_t *) src) + src_align; + int ok = (memcpy_check_one(fn, d, s, n) != NULL); + if (DEBUG || !ok) { + _HFI_INFO("memcpy(%p, %p, %llu) : %s\n", d, s, + (unsigned long long)n, + ok ? "passed" : "failed"); + } + if (ok) { + (*p)++; + } else { + (*f)++; + } + } + } + if (USE_MALLOC) { + psmi_free(src); + psmi_free(dst); + } else { + free(src); + free(dst); + } + return 0; +} diff --git a/prov/psm3/psm3/psm_ep.c b/prov/psm3/psm3/psm_ep.c new file mode 100644 index 00000000000..29b60cc6664 --- /dev/null +++ b/prov/psm3/psm3/psm_ep.c @@ -0,0 +1,1794 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sched.h>		/* cpu_set */
+#include <ctype.h>		/* isalpha */
+#include <netdb.h>
+
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+#include "ips_proto_params.h"
+
+#ifdef PSM_CUDA
+#include "psm_gdrcpy.h"
+#endif
+/*
+ * Endpoint management
+ */
+psm2_ep_t psmi_opened_endpoint = NULL;
+int psmi_opened_endpoint_count = 0;
+static uint32_t *hfi_lids;
+static uint32_t nlids;
+
+static psm2_error_t psmi_ep_open_device(const psm2_ep_t ep,
+					const struct psm2_ep_open_opts *opts,
+					const psm2_uuid_t unique_job_key,
+					struct psmi_context *context,
+					psm2_epid_t *epid);
+
+/*
+ * Device management
+ *
+ * PSM uses "devices" as components to manage communication to self, to peers
+ * reachable via shared memory and finally to peers reachable only through
+ * hfi.
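+ *
+ * For example, a PSM3_DEVICES list such as "self,shm,nic" enables
+ * PTL_DEVID_SELF, PTL_DEVID_AMSH and PTL_DEVID_IPS in that order; see
+ * psmi_parse_devices() below for the accepted device name aliases.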
+ */ + +static psm2_error_t psmi_parse_devices(int devices[PTL_MAX_INIT], + const char *devstr); +static int psmi_device_is_enabled(const int devices[PTL_MAX_INIT], int devid); +int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid); + +psm2_error_t __psm2_ep_num_devunits(uint32_t *num_units_o) +{ + static int num_units = -1; + + PSM2_LOG_MSG("entering"); + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (num_units == -1) { + num_units = psmi_hal_get_num_units(); + if (num_units == -1) + num_units = 0; + } + + *num_units_o = (uint32_t) num_units; + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} +PSMI_API_DECL(psm2_ep_num_devunits) + +static int cmpfunc(const void *p1, const void *p2) +{ + uint64_t a = ((uint64_t *) p1)[0]; + uint64_t b = ((uint64_t *) p2)[0]; + if (a < b) + return -1; + if (a == b) + return 0; + return 1; +} + +// process PSM3_MULTIRAIL and PSM3_MULTIRAIL_MAP and return the +// list of unit/port in unit[0-*num_rails] and port[0-*num_rails] +// When *num_rails is returned as 0, multirail is not enabled and +// other mechanisms (PSM3_NIC, PSM3_NIC_SELECTION_ALG) must be +// used by the caller to select a single NIC for the process +static psm2_error_t +psmi_ep_multirail(int *num_rails, uint32_t *unit, uint16_t *port) +{ + uint32_t num_units; + uint64_t gid_hi; + unsigned i, j, count = 0; + int ret; + psm2_error_t err = PSM2_OK; + uint64_t gidh[PSMI_MAX_RAILS][3]; + union psmi_envvar_val env_multirail; + union psmi_envvar_val env_multirail_map; + int multirail_within_socket_used = 0; + int node_id = -1, found = 0; + + psmi_getenv("PSM3_MULTIRAIL", + "Use all available NICs in the system for communication.\n" + "0: Disabled (default),\n" + "1: Enable multirail across all available NICs,\n" + "2: Enable multirail within socket.\n" + "\t For multirail within a socket, we try to find at\n" + "\t least one NIC on the same socket as current task.\n" + "\t If none found, we continue to use other NICs within\n" + "\t the system.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, + &env_multirail); + if (!env_multirail.e_int) { + *num_rails = 0; + return PSM2_OK; + } + + if (env_multirail.e_int == 2) + multirail_within_socket_used = 1; + +/* + * map is in format: unit:port,unit:port,... + * where :port is optional (default of 1) and unit can be name or number + */ +#define MAX_MAP_LEN (PSMI_MAX_RAILS*128) + if (!psmi_getenv("PSM3_MULTIRAIL_MAP", + "NIC selections for each rail in format:\n" + " rail,rail,...\n" + "Where rail can be: unit:port or unit\n" + "When port is omitted, it defaults to 1\n" + "unit can be device name or unit number\n" + "default autoselects", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"", &env_multirail_map)) { + + char temp[MAX_MAP_LEN+1]; + char *s; + char *delim; + + strncpy(temp, env_multirail_map.e_str, MAX_MAP_LEN); + if (temp[MAX_MAP_LEN-1] != 0) + return psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM3_MULTIRAIL_MAP too long: '%s'", + env_multirail_map.e_str); + s = temp; + psmi_assert(*s); + do { + int u, p; + int skip_port = 0; + + if (! *s) // trailing ',' on 2nd or later loop + break; + if (count >= PSMI_MAX_RAILS) + return psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM3_MULTIRAIL_MAP exceeds %u rails: '%s'", + PSMI_MAX_RAILS, env_multirail_map.e_str); + + // parse unit + delim = strchr(s, ':'); + if (! delim) { + delim = strchr(s, ','); + skip_port = 1; + p = 1; + } + if (! 
delim && !skip_port)
+				return psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+					"PSM3_MULTIRAIL_MAP invalid format: '%s'",
+					env_multirail_map.e_str);
+			if (delim)
+				*delim = '\0';
+			u = sysfs_find_unit(s);
+			if (u < 0)
+				return psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+					"PSM3_MULTIRAIL_MAP invalid unit: '%s'", s);
+			if (delim)
+				s = delim+1;
+
+			// optionally parse port
+			if (! skip_port) {
+				delim = strchr(s, ',');
+				if (delim)
+					*delim = '\0';
+				p = psmi_parse_str_long(s);
+				if (p < 0)
+					return psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+						"PSM3_MULTIRAIL_MAP invalid port: '%s'", s);
+				if (delim)
+					s = delim+1;
+			}
+
+			unit[count] = u;
+			port[count] = p;
+			count++;
+		} while (delim);
+		*num_rails = count;
+
+/*
+ * Check if any of the ports is not usable.
+ */
+		for (i = 0; i < count; i++) {
+			_HFI_VDBG("rail %d: %u(%s) %u\n", i,
+				unit[i], sysfs_unit_dev_name(unit[i]), port[i]);
+			ret = psmi_hal_get_port_active(unit[i], port[i]);
+			if (ret <= 0)
+				return psmi_handle_error(NULL,
+						PSM2_EP_DEVICE_FAILURE,
+						"PSM3_MULTIRAIL_MAP: Unit/port: %d(%s):%d is not active.",
+						unit[i], sysfs_unit_dev_name(unit[i]),
+						port[i]);
+			ret = psmi_hal_get_port_lid(unit[i], port[i]);
+			if (ret <= 0)
+				return psmi_handle_error(NULL,
+						PSM2_EP_DEVICE_FAILURE,
+						"PSM3_MULTIRAIL_MAP: Couldn't get lid for unit %d(%s):%d",
+						unit[i], sysfs_unit_dev_name(unit[i]),
+						port[i]);
+			ret = psmi_hal_get_port_subnet(unit[i], port[i], NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+			if (ret == -1)
+				return psmi_handle_error(NULL,
+						PSM2_EP_DEVICE_FAILURE,
+						"PSM3_MULTIRAIL_MAP: Couldn't get subnet for unit %d(%s):%d",
+						unit[i], sysfs_unit_dev_name(unit[i]),
+						port[i]);
+		}
+		return PSM2_OK;
+	}
+
+	if ((err = psm2_ep_num_devunits(&num_units))) {
+		return err;
+	}
+	if (num_units > PSMI_MAX_RAILS) {
+		_HFI_INFO
+		    ("Found %d units, max %d units are supported, use %d\n",
+		     num_units, PSMI_MAX_RAILS, PSMI_MAX_RAILS);
+		num_units = PSMI_MAX_RAILS;
+	}
+
+	/*
+	 * PSM3_MULTIRAIL=2 functionality -
+	 * - Try to find at least one HFI in the same root
+	 *   complex. If none found, continue to run and
+	 *   use the remaining HFIs in the system.
+	 * - If we do find at least one HFI in the same root complex, we
+	 *   go ahead and add it to the list.
+	 */
+	if (multirail_within_socket_used) {
+		node_id = psmi_get_current_proc_location();
+		for (i = 0; i < num_units; i++) {
+			if (psmi_hal_get_unit_active(i) <= 0)
+				continue;
+			int node_id_i;
+
+			if (!psmi_hal_get_node_id(i, &node_id_i)) {
+				if (node_id_i == node_id) {
+					found = 1;
+					break;
+				}
+			}
+		}
+	}
+/*
+ * Get all the ports with a valid lid and gid, one per unit.
+ */
+	for (i = 0; i < num_units; i++) {
+		int node_id_i;
+
+		if (!psmi_hal_get_node_id(i, &node_id_i))
+		{
+			if (multirail_within_socket_used &&
+			    found && (node_id_i != node_id))
+				continue;
+		}
+
+		for (j = HFI_MIN_PORT; j <= HFI_MAX_PORT; j++) {
+			ret = psmi_hal_get_port_lid(i, j);
+			if (ret <= 0)
+				continue;
+			ret = psmi_hal_get_port_subnet(i, j, &gid_hi, NULL, NULL, NULL, NULL, NULL, NULL);
+			if (ret == -1)
+				continue;
+
+			gidh[count][0] = gid_hi;
+			gidh[count][1] = i;
+			gidh[count][2] = j;
+			count++;
+			break;
+		}
+	}
+
+/*
+ * Sort all the ports with gidh from small to big.
+ * This is for multiple fabrics, and we use the fabric with the
+ * smallest gid to make the master connection.
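+ * Each gidh[] entry holds {gid_hi, unit, port}, so sorting on the leading
+ * gid_hi field gives every process the same rail ordering.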
+ */ + qsort(gidh, count, sizeof(uint64_t) * 3, cmpfunc); + + for (i = 0; i < count; i++) { + unit[i] = (uint32_t) gidh[i][1]; + port[i] = (uint16_t) (uint32_t) gidh[i][2]; + } + *num_rails = count; + return PSM2_OK; +} + +// this is used to find devices with the same address as another process, +// implying intra-node comms. +#define MAX_GID_IDX 31 +static psm2_error_t +psmi_ep_devlids(uint32_t **lids, uint32_t *num_lids_o, + uint64_t my_gid_hi, uint64_t my_gid_lo, psm2_epid_t my_epid) +{ + uint32_t num_units; + int i; + psm2_error_t err = PSM2_OK; + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (hfi_lids == NULL) { + if ((err = psm2_ep_num_devunits(&num_units))) + goto fail; + hfi_lids = (uint32_t *) + psmi_calloc(PSMI_EP_NONE, UNDEFINED, + num_units * psmi_hal_get_num_ports(), sizeof(*hfi_lids)); + if (hfi_lids == NULL) { + err = psmi_handle_error(NULL, PSM2_NO_MEMORY, + "Couldn't allocate memory for dev_lids structure"); + goto fail; + } + + for (i = 0; i < num_units; i++) { + int j; + for (j = HFI_MIN_PORT; j <= HFI_MAX_PORT; j++) { + int lid = psmi_hal_get_port_lid(i, j); + int ret, idx = 0; + uint64_t gid_hi = 0, gid_lo = 0; + uint64_t actual_gid_hi = 0; + uint32_t ipaddr = 0; + + // if looking for IB/OPA lid, skip ports we can't get lid for + if (lid <= 0 && psmi_epid_version(my_epid) == PSMI_EPID_V3) + continue; + // we just need subnet and addr within subnet and idx + ret = psmi_hal_get_port_subnet(i, j, &gid_hi, &gid_lo, &ipaddr, NULL, &idx, &actual_gid_hi, NULL); + if (ret == -1) + continue; + if (my_gid_hi != gid_hi) { + _HFI_VDBG("LID %d, unit %d, port %d, mismatched " + "GID[%d] %llx:%llx and %llx:%llx\n", + lid, i, j, idx, + (unsigned long long)gid_hi, + (unsigned long long)gid_lo, + (unsigned long long)my_gid_hi, + (unsigned long long)my_gid_lo); + continue; + } + if (actual_gid_hi != gid_hi) { + if (_HFI_VDBG_ON) { + char buf[INET_ADDRSTRLEN]; + _HFI_VDBG("LID %d=>IPaddr %s, unit %d, port %d, matched " + "GID[%d] %llx:%llx and %llx:%llx\n", + lid, psmi_ipv4_ntop(ipaddr, buf, sizeof(buf)), i, j, idx, + (unsigned long long)gid_hi, + (unsigned long long)gid_lo, + (unsigned long long)my_gid_hi, + (unsigned long long)my_gid_lo); + } + + hfi_lids[nlids++] = (uint32_t) ipaddr; + } else { + _HFI_VDBG("LID %d, unit %d, port %d, matched " + "GID[%d] %llx:%llx and %llx:%llx\n", + lid, i, j, idx, + (unsigned long long)gid_hi, + (unsigned long long)gid_lo, + (unsigned long long)my_gid_hi, + (unsigned long long)my_gid_lo); + + hfi_lids[nlids++] = (uint16_t) lid; + } + } + } + if (nlids == 0) { + err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Couldn't get lid&gid from any unit/port"); + goto fail; + } + } + *lids = hfi_lids; + *num_lids_o = nlids; + +fail: + return err; +} + +static psm2_error_t +psmi_ep_verify_pkey(psm2_ep_t ep, uint16_t pkey, uint16_t *opkey, uint16_t* oindex) +{ + int i, ret; + psm2_error_t err; + + for (i = 0; i < 16; i++) { +// TBD - if we adjust HAL to take a hw_context for this function and +// put the verbs_ep inside the HAL hw context, we can eliminate this ifdef +// and simply call into HAL + _HFI_UDDBG("looking for pkey 0x%x\n", pkey); + ret = verbs_get_port_index2pkey(ep, ep->portnum, i); + if (ret < 0) { + err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Can't get a valid pkey value from pkey table\n"); + return err; + } + // pkey == 0 means get slot 0 + if (! pkey && ! 
i) + break; + if ((pkey & 0x7fff) == (uint16_t)(ret & 0x7fff)) { + break; + } + } + + /* if pkey does not match */ + if (i == 16) { + err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Wrong pkey 0x%x, please use PSM3_PKEY to specify a valid pkey\n", + pkey); + return err; + } + + if (((uint16_t)ret & 0x8000) == 0) { + err = psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Limited Member pkey 0x%x, please use PSM3_PKEY to specify a valid pkey\n", + (uint16_t)ret); + return err; + } + + /* return the final pkey */ + *opkey = (uint16_t)ret; + *oindex = (uint16_t)i; + + return PSM2_OK; +} + +uint64_t __psm2_epid_nid(psm2_epid_t epid) +{ + uint64_t rv; + + PSM2_LOG_MSG("entering"); + rv = (uint64_t) PSMI_EPID_GET_LID(epid); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_epid_nid) + + +/* Currently not exposed to users, we don't acknowledge the existence of + * service levels encoding within epids. This may require + * changing to expose SLs + */ +uint64_t psmi_epid_version(psm2_epid_t epid) +{ + return (uint64_t) PSMI_EPID_GET_EPID_VERSION(epid); +} + +uint64_t __psm2_epid_context(psm2_epid_t epid) +{ + uint64_t rv; + + PSM2_LOG_MSG("entering"); + rv = (uint64_t) PSMI_EPID_GET_CONTEXT(epid); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_epid_context) + +uint64_t __psm2_epid_port(psm2_epid_t epid) +{ + uint64_t rv; + PSM2_LOG_MSG("entering"); + rv = __psm2_epid_context(epid); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_epid_port) + +psm2_error_t __psm2_ep_query(int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo) +{ + psm2_error_t err = PSM2_OK; + int i; + psm2_ep_t ep; + + PSM2_LOG_MSG("entering"); + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (*num_of_epinfo <= 0) { + err = psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid psm2_ep_query parameters"); + PSM2_LOG_MSG("leaving"); + return err; + } + + if (psmi_opened_endpoint == NULL) { + err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED, + "PSM Endpoint is closed or does not exist"); + PSM2_LOG_MSG("leaving"); + return err; + } + + ep = psmi_opened_endpoint; + for (i = 0; i < *num_of_epinfo; i++) { + if (ep == NULL) + break; + array_of_epinfo[i].ep = ep; + array_of_epinfo[i].epid = ep->epid; + array_of_epinfo[i].jkey = ep->jkey; + memcpy(array_of_epinfo[i].uuid, + (void *)ep->uuid, sizeof(psm2_uuid_t)); + psmi_uuid_unparse(ep->uuid, array_of_epinfo[i].uuid_str); + ep = ep->user_ep_next; + } + *num_of_epinfo = i; + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_ep_query) + +psm2_error_t __psm2_ep_epid_lookup(psm2_epid_t epid, psm2_epconn_t *epconn) +{ + psm2_error_t err = PSM2_OK; + psm2_epaddr_t epaddr; + psm2_ep_t ep; + + PSM2_LOG_MSG("entering"); + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + /* Need to have an opened endpoint before we can resolve epids */ + if (psmi_opened_endpoint == NULL) { + err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED, + "PSM Endpoint is closed or does not exist"); + PSM2_LOG_MSG("leaving"); + return err; + } + + ep = psmi_opened_endpoint; + while (ep) { + epaddr = psmi_epid_lookup(ep, epid); + if (!epaddr) { + ep = ep->user_ep_next; + continue; + } + + /* Found connection for epid. Return info about endpoint to caller. 
*/
+		psmi_assert_always(epaddr->ptlctl->ep == ep);
+		epconn->addr = epaddr;
+		epconn->ep = ep;
+		epconn->mq = ep->mq;
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	err = psmi_handle_error(NULL, PSM2_EPID_UNKNOWN,
+				"Endpoint connection status unknown");
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_epid_lookup);
+
+psm2_error_t __psm2_ep_epid_lookup2(psm2_ep_t ep, psm2_epid_t epid, psm2_epconn_t *epconn)
+{
+	psm2_error_t err = PSM2_OK;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	/* Need to have an opened endpoint before we can resolve epids */
+	if (ep == NULL) {
+		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+					"PSM Endpoint is closed or does not exist");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	if (epconn == NULL) {
+		err = psmi_handle_error(ep, PSM2_PARAM_ERR,
+					"Invalid output parameter");
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	psm2_epaddr_t epaddr = psmi_epid_lookup(ep, epid);
+	if (epaddr) {
+		/* Found connection for epid. Return info about endpoint to caller. */
+		psmi_assert_always(epaddr->ptlctl->ep == ep);
+		epconn->addr = epaddr;
+		epconn->ep = ep;
+		epconn->mq = ep->mq;
+		PSM2_LOG_MSG("leaving");
+		return err;
+	}
+
+	err = psmi_handle_error(ep, PSM2_EPID_UNKNOWN,
+				"Endpoint connection status unknown");
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_epid_lookup2);
+
+psm2_error_t __psm2_epaddr_to_epid(psm2_epaddr_t epaddr, psm2_epid_t *epid)
+{
+	psm2_error_t err = PSM2_OK;
+	PSM2_LOG_MSG("entering");
+	if (epaddr && epid) {
+		*epid = epaddr->epid;
+	}
+	else {
+		err = psmi_handle_error(NULL, PSM2_PARAM_ERR,
+					"Invalid input epaddr or output epid parameter");
+	}
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_epaddr_to_epid);
+
+psm2_error_t
+__psm2_ep_epid_share_memory(psm2_ep_t ep, psm2_epid_t epid, int *result_o)
+{
+	int result = 0;
+	uint32_t num_lids = 0;
+	uint32_t epid_lid;
+	uint32_t *lids = NULL;
+	int i;
+	psm2_error_t err;
+
+	PSM2_LOG_MSG("entering");
+	psmi_assert_always(ep != NULL);
+	PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+	if ((!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) ||
+		(psmi_epid_version(epid) == PSMI_EPID_VERSION_SHM)) {
+		/* If we are in no-hfi mode, or the other process is,
+		 * the epid doesn't help us - so assume we're both on the same
+		 * machine and try to connect.
+		 */
+		result = 1;
+	} else {
+		epid_lid = (uint32_t) psm2_epid_nid(epid);
+		err = psmi_ep_devlids(&lids, &num_lids, ep->gid_hi, ep->gid_lo, ep->epid);
+		if (err) {
+			PSM2_LOG_MSG("leaving");
+			return err;
+		}
+		for (i = 0; i < num_lids; i++) {
+			if (epid_lid == lids[i]) {
+				/* we share memory if the lid is the same. */
+				result = 1;
+				break;
+			}
+		}
+	}
+	*result_o = result;
+	PSM2_LOG_MSG("leaving");
+	return PSM2_OK;
}
+PSMI_API_DECL(psm2_ep_epid_share_memory)
+
+psm2_error_t __psm2_ep_open_opts_get_defaults(struct psm2_ep_open_opts *opts)
+{
+	PSM2_LOG_MSG("entering");
+
+	PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+	if (!opts)
+		return PSM2_PARAM_ERR;
+
+	/* Set in order in the structure.
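+	 * The values below are only defaults; __psm2_ep_open_internal()
+	 * overrides them from the caller's opts_i and from the PSM3_*
+	 * environment variables.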
*/ + opts->timeout = 30000000000LL; /* 30 sec */ + opts->unit = PSM3_NIC_ANY; + opts->affinity = PSM2_EP_OPEN_AFFINITY_SET; + opts->shm_mbytes = 0; /* deprecated in psm2.h */ + opts->sendbufs_num = 1024; + opts->network_pkey = psmi_hal_get_default_pkey(); + opts->port = PSM3_NIC_PORT_ANY; + opts->outsl = PSMI_SL_DEFAULT; + opts->service_id = HFI_DEFAULT_SERVICE_ID; + opts->path_res_type = PSM2_PATH_RES_NONE; + opts->senddesc_num = 4096; + opts->imm_size = VERBS_SEND_MAX_INLINE; // PSM header size is 56 + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} +PSMI_API_DECL(psm2_ep_open_opts_get_defaults) + +psm2_error_t psmi_poll_noop(ptl_t *ptl, int replyonly); + +psm2_error_t +__psm2_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, + struct psm2_ep_open_opts const *opts_i, psm2_mq_t mq, + psm2_ep_t *epo, psm2_epid_t *epido) +{ + psm2_ep_t ep = NULL; + uint32_t num_units; + size_t len; + psm2_error_t err; + psm2_epaddr_t epaddr = NULL; + char buf[128], *p; + union psmi_envvar_val envvar_val; + size_t ptl_sizes; + struct psm2_ep_open_opts opts; + ptl_t *amsh_ptl, *ips_ptl, *self_ptl; + int i; + + /* First get the set of default options, we overwrite with the user's + * desired values afterwards */ + if ((err = psm2_ep_open_opts_get_defaults(&opts))) + goto fail; + + if (opts_i != NULL) { + if (opts_i->timeout != -1) + opts.timeout = opts_i->timeout; + if (opts_i->unit != -1) + opts.unit = opts_i->unit; + if (opts_i->affinity != -1) + opts.affinity = opts_i->affinity; + + if (opts_i->sendbufs_num != -1) + opts.sendbufs_num = opts_i->sendbufs_num; + + if (opts_i->network_pkey != psmi_hal_get_default_pkey()) + opts.network_pkey = opts_i->network_pkey; + + if (opts_i->port != 0) + opts.port = opts_i->port; + + if (opts_i->outsl != -1) + opts.outsl = opts_i->outsl; + + if (opts_i->service_id) + opts.service_id = (uint64_t) opts_i->service_id; + if (opts_i->path_res_type != PSM2_PATH_RES_NONE) + opts.path_res_type = opts_i->path_res_type; + + if (opts_i->senddesc_num) + opts.senddesc_num = opts_i->senddesc_num; + + if (opts_i->imm_size) + opts.imm_size = opts_i->imm_size; + } + + /* Get Service ID from environment */ + if (!psmi_getenv("PSM3_IB_SERVICE_ID", + "Service ID for RV module RC QP connection establishment", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_ULONG_FLAGS, // FLAGS only affects output: hex + (union psmi_envvar_val)HFI_DEFAULT_SERVICE_ID, + &envvar_val)) { + opts.service_id = (uint64_t) envvar_val.e_ulonglong; + } + + opts.path_res_type = PSM2_PATH_RES_NONE; + + /* If a specific unit is set in the environment, use that one. */ + // PSM3_NIC may be a unit name, number, "any" or -1 + if (!psmi_getenv("PSM3_NIC", "Device Unit number or name (-1 or 'any' autodetects)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"any", &envvar_val)) { + if (0 == strcasecmp(envvar_val.e_str, "any")) { + opts.unit = PSM3_NIC_ANY; + } else { + // convert name to a unit number since rest of APIs use number + opts.unit = sysfs_find_unit(envvar_val.e_str); + if (opts.unit < 0) { + err = psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Unit unknown %s", envvar_val.e_str); + goto fail; + } + } + } + + /* Get user specified port number to use. 
*/ + if (!psmi_getenv("PSM3_NIC_PORT", "NIC Port number (0 autodetects)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_LONG, + (union psmi_envvar_val)PSM3_NIC_PORT_ANY, + &envvar_val)) { + opts.port = envvar_val.e_long; + } + + /* Get service level from environment, path-query overrides it */ + if (!psmi_getenv + ("PSM3_NIC_SL", "NIC outging ServiceLevel number (default 0)", + PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_LONG, + (union psmi_envvar_val)PSMI_SL_DEFAULT, &envvar_val)) { + opts.outsl = envvar_val.e_long; + } + + /* Get network key from environment. MVAPICH and other vendor MPIs do not + * specify it on ep open and we may require it for vFabrics. + * path-query will override it. + */ + if (!psmi_getenv("PSM3_PKEY", + "PKey to use for endpoint (0=use slot 0)", + PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_ULONG_FLAGS, // show in hex + (union psmi_envvar_val)((unsigned int)(psmi_hal_get_default_pkey())), + &envvar_val)) { + opts.network_pkey = (uint64_t) envvar_val.e_ulong; + } + + /* BACKWARDS COMPATIBILITY: Open MPI likes to choose its own PKEY of + 0x7FFF. That's no longer a valid default, so override it if the + client was compiled against PSM v1 */ + if (PSMI_VERNO_GET_MAJOR(psmi_verno_client()) < 2 && + opts.network_pkey == 0x7FFF) { + opts.network_pkey = psmi_hal_get_default_pkey();; + } + + /* Get number of default send buffers from environment */ + if (!psmi_getenv("PSM3_NUM_SEND_BUFFERS", + "Number of send buffers to allocate [1024]", + PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)1024, &envvar_val)) { + opts.sendbufs_num = envvar_val.e_uint; + } + + /* Get immediate data size - transfers less than immediate data size do + * not consume a send buffer and require just a send descriptor. + */ + if (!psmi_getenv("PSM3_SEND_IMMEDIATE_SIZE", + "Immediate data send size not requiring a buffer [128]", + PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)128, &envvar_val)) { + opts.imm_size = envvar_val.e_uint; + } + + /* Get number of send descriptors - by default this is 4 times the number + * of send buffers - mainly used for short/inlined messages. 
+ */ + if (!psmi_getenv("PSM3_NUM_SEND_DESCRIPTORS", + "Number of send descriptors to allocate [4096]", + PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)4096, &envvar_val)) { + opts.senddesc_num = envvar_val.e_uint; + } + if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + if ((err = psm2_ep_num_devunits(&num_units)) != PSM2_OK) + goto fail; + } else + num_units = 0; + + /* do some error checking */ + if (opts.timeout < -1) { + err = psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid timeout value %lld", + (long long)opts.timeout); + goto fail; + } else if (num_units && (opts.unit < -1 || opts.unit >= (int)num_units)) { + err = psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid Device Unit ID %d (%d units found)", + opts.unit, num_units); + goto fail; + } else if ((opts.port < HFI_MIN_PORT || opts.port > HFI_MAX_PORT) && + opts.port != PSM3_NIC_PORT_ANY) { + err = psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid Device port number %d", + opts.port); + goto fail; + } else if (opts.affinity < 0 + || opts.affinity > PSM2_EP_OPEN_AFFINITY_FORCE) { + err = + psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid Affinity option: %d", + opts.affinity); + goto fail; + } else if (opts.outsl < PSMI_SL_MIN || opts.outsl > PSMI_SL_MAX) { + err = psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Invalid SL number: %lld", + (unsigned long long)opts.outsl); + goto fail; + } + + /* Allocate end point structure storage */ + ptl_sizes = + (psmi_device_is_enabled(devid_enabled, PTL_DEVID_SELF) ? + psmi_ptl_self.sizeof_ptl() : 0) + + (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS) ? + psmi_ptl_ips.sizeof_ptl() : 0) + + (psmi_device_is_enabled(devid_enabled, PTL_DEVID_AMSH) ? + psmi_ptl_amsh.sizeof_ptl() : 0); + if (ptl_sizes == 0) + return PSM2_EP_NO_DEVICE; + + ep = (psm2_ep_t) psmi_memalign(PSMI_EP_NONE, UNDEFINED, 64, + sizeof(struct psm2_ep) + ptl_sizes); + epaddr = (psm2_epaddr_t) psmi_calloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, + 1, sizeof(struct psm2_epaddr)); + if (ep == NULL || epaddr == NULL) { + err = psmi_handle_error(NULL, PSM2_NO_MEMORY, + "Couldn't allocate memory for %s structure", + ep == NULL ? "psm2_ep" : "psm2_epaddr"); + goto fail; + } + memset(ep, 0, sizeof(struct psm2_ep) + ptl_sizes); + + /* Copy PTL enabled status */ + for (i = 0; i < PTL_MAX_INIT; i++) + ep->devid_enabled[i] = devid_enabled[i]; + + /* Matched Queue initialization. We do this early because we have to + * make sure ep->mq exists and is valid before calling ips_do_work. + */ + ep->mq = mq; + + /* Get ready for PTL initialization */ + memcpy(&ep->uuid, (void *)unique_job_key, sizeof(psm2_uuid_t)); + ep->epaddr = epaddr; + ep->memmode = mq->memmode; + ep->hfi_num_sendbufs = opts.sendbufs_num; + ep->service_id = opts.service_id; + ep->path_res_type = opts.path_res_type; + ep->hfi_num_descriptors = opts.senddesc_num; + ep->hfi_imm_size = opts.imm_size; + ep->errh = psmi_errhandler_global; /* by default use the global one */ + ep->ptl_amsh.ep_poll = psmi_poll_noop; + ep->ptl_ips.ep_poll = psmi_poll_noop; + ep->connections = 0; + ep->rdmamode = psmi_parse_rdmamode(); // PSM3_RDMA + /* MR cache mode */ + // we need this early when creating the verbs_ep since it may affect + // if we open rv module. + // The value returned is a MR_CACHE_MODE_* selection + { + union psmi_envvar_val env_mr_cache_mode; + + if (! (ep->rdmamode & IPS_PROTOEXP_FLAG_ENABLED)) { + env_mr_cache_mode.e_uint = MR_CACHE_MODE_NONE; + } else if (IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)) { + // RDMA enabled in kernel mode. 
Must use rv MR cache + env_mr_cache_mode.e_uint = MR_CACHE_MODE_RV; + } else { + /* Behavior of user space MR Cache + * when 0, we merely share MRs for concurrently used buffers + */ + // mode 2 (user space MR w/cache) is purposely not documented + psmi_getenv("PSM3_MR_CACHE_MODE", + "Enable MR caching 0=user space MR no cache" +#ifdef RNDV_MOD_MR + ", 1=kernel MR w/cache [1]", +#else + "[0]", +#endif + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_UINT, +#ifdef RNDV_MOD_MR + (union psmi_envvar_val)MR_CACHE_MODE_KERNEL, +#else + (union psmi_envvar_val)MR_CACHE_MODE_NONE, +#endif + &env_mr_cache_mode); + if (! MR_CACHE_MODE_VALID(env_mr_cache_mode.e_uint) + || env_mr_cache_mode.e_uint == MR_CACHE_MODE_RV) + env_mr_cache_mode.e_uint = MR_CACHE_MODE_NONE; + } +#ifndef RNDV_MOD_MR + if (env_mr_cache_mode.e_uint == MR_CACHE_MODE_KERNEL) + env_mr_cache_mode.e_uint = MR_CACHE_MODE_NONE; +#endif + ep->mr_cache_mode = env_mr_cache_mode.e_uint; + } + + /* See how many iterations we want to spin before yielding */ + psmi_getenv("PSM3_YIELD_SPIN_COUNT", + "Spin poll iterations before yield", + PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD, + &envvar_val); + ep->yield_spin_cnt = envvar_val.e_uint; + + /* Set skip_affinity flag if PSM is not allowed to set affinity */ + if (opts.affinity == PSM2_EP_OPEN_AFFINITY_SKIP) + ep->skip_affinity = true; + + ptl_sizes = 0; + amsh_ptl = ips_ptl = self_ptl = NULL; + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { + amsh_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); + ptl_sizes += psmi_ptl_amsh.sizeof_ptl(); + } + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { + ips_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); + ptl_sizes += psmi_ptl_ips.sizeof_ptl(); + } + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { + self_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); + ptl_sizes += psmi_ptl_self.sizeof_ptl(); + } + + /* Get number of send WQEs + */ + psmi_getenv("PSM3_NUM_SEND_WQES", + "Number of send WQEs to allocate [4080]", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)VERBS_SEND_QP_ENTRIES, &envvar_val); + ep->hfi_num_send_wqes = envvar_val.e_uint; + + psmi_getenv("PSM3_SEND_REAP_THRESH", + "Number of outstanding send WQEs before reap CQEs [256]", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)VERBS_SEND_CQ_REAP, &envvar_val); + ep->hfi_send_reap_thresh = envvar_val.e_uint; + + psmi_getenv("PSM3_NUM_SEND_RDMA", + "Number of user space send RDMA to allow [128]", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)VERBS_NUM_SEND_RDMA, &envvar_val); + ep->hfi_num_send_rdma = envvar_val.e_uint; + + /* Get number of recv WQEs + */ + psmi_getenv("PSM3_NUM_RECV_WQES", + "Number of recv WQEs to allocate [4095]", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)VERBS_RECV_QP_ENTRIES, &envvar_val); + ep->hfi_num_recv_wqes = envvar_val.e_uint; + + /* Get number of recv CQEs + */ + psmi_getenv("PSM3_NUM_RECV_CQES", + "Number of recv CQEs to allocate\n" + "(0 will calculate as PSM3_NUM_RECV_WQES+1032 for PSM3_RDMA=0-2\n" + "and 4000 more than that for PSM3_RDMA=3]) [0]", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)0, &envvar_val); + ep->hfi_num_recv_cqes = envvar_val.e_uint; + + /* Get RC QP timeout and retry + */ + psmi_getenv("PSM3_QP_TIMEOUT", + "Number of microseconds for RC QP timeouts [536870]", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_ULONG, + 
(union psmi_envvar_val)VERBS_QP_TIMEOUT, &envvar_val); + ep->hfi_qp_timeout = timeout_usec_to_mult(envvar_val.e_ulong); + + psmi_getenv("PSM3_QP_RETRY", + "Limit on retries after RC QP timeout or RNR [7]", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)VERBS_QP_RETRY, &envvar_val); + ep->hfi_qp_retry = (envvar_val.e_uint <= VERBS_QP_MAX_RETRY)? + envvar_val.e_uint:VERBS_QP_MAX_RETRY; + /* Size of RV Cache - only used for MR_CACHE_MODE_RV or KERNEL, + * otherwise ignored + */ + psmi_getenv("PSM3_RV_MR_CACHE_SIZE", + "kernel space MR cache size" + " (MBs, 0 lets rv module decide) [0]", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)0, &envvar_val); + // TBD - min should be (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * + // chunk size (mq->hfi_base_window_rv after psmi_mq_initialize_defaults + // TBD actual window_sz may be larger than mq->hfi_base_window_rv + ep->rv_mr_cache_size = envvar_val.e_uint; + + psmi_getenv("PSM3_RV_QP_PER_CONN", + "Number of sets of RC QPs per RV connection (0 lets rv module decide) [0]", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)0, &envvar_val); + ep->rv_num_conn = envvar_val.e_uint; + + psmi_getenv("PSM3_RV_Q_DEPTH", + "Size of QPs and CQs per RV QP (0 lets rv module decide) [0]", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)0, &envvar_val); + ep->rv_q_depth = envvar_val.e_uint; + + psmi_getenv("PSM3_RV_RECONNECT_TIMEOUT", + "RV End-point minimum re-connection timeout in seconds. 0 for no connection recovery [30]", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)30, &envvar_val); + ep->rv_reconnect_timeout = envvar_val.e_uint; + + psmi_getenv("PSM3_RV_HEARTBEAT_INTERVAL", + "RV End-point heartbeat interval in milliseconds. 0 for no heartbeat [1000]", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)1000, &envvar_val); + ep->rv_hb_interval = envvar_val.e_uint; + + // HFI Interface. + if ((err = psmi_ep_open_device(ep, &opts, unique_job_key, + &(ep->context), &ep->epid))) + goto fail; + + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { + _HFI_UDDBG("my QPN=%u (0x%x) EPID=0x%"PRIx64" %s\n", + ep->verbs_ep.qp->qp_num, ep->verbs_ep.qp->qp_num, (uint64_t)ep->epid, + psmi_epaddr_fmt_addr(ep->epid)); + } + psmi_assert_always(ep->epid != 0); + ep->epaddr->epid = ep->epid; + + _HFI_VDBG("psmi_ep_open_device() passed\n"); + + /* Set our new label as soon as we know what it is */ + strncpy(buf, psmi_gethostname(), sizeof(buf) - 1); + buf[sizeof(buf) - 1] = '\0'; + + p = buf + strlen(buf); + + /* If our rank is set, use it (same as mylabel). 
If not, use context */ + /* hostname.rank# or hostname.# (context), or hostname.pid# + */ + if (hfi_get_myrank() >= 0) + len = snprintf(p, sizeof(buf) - strlen(buf), ":rank%d.", hfi_get_myrank()); + else + len = snprintf(p, sizeof(buf) - strlen(buf), ":"PSMI_EPID_CONTEXT_FMT".", + PSMI_EPID_GET_CONTEXT_VAL(ep->epid)); + *(p + len) = '\0'; + ep->context_mylabel = psmi_strdup(ep, buf); + if (ep->context_mylabel == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + /* hfi_set_mylabel(ep->context_mylabel); */ + + if ((err = psmi_epid_set_hostname(psm2_epid_nid(ep->epid), buf, 0))) + goto fail; + + _HFI_VDBG("start ptl device init...\n"); + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { + if ((err = psmi_ptl_self.init(ep, self_ptl, &ep->ptl_self))) + goto fail; + } + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { + if ((err = psmi_ptl_ips.init(ep, ips_ptl, &ep->ptl_ips))) + goto fail; + } + /* If we're shm-only, this device is enabled above */ + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { + if ((err = psmi_ptl_amsh.init(ep, amsh_ptl, &ep->ptl_amsh))) + goto fail; + } else { + /* We may have pre-attached as part of getting our rank for enabling + * shared contexts. */ + } + + _HFI_VDBG("finish ptl device init...\n"); + + /* + * Keep only IPS since only IPS support multi-rail, other devices + * are only setup once. IPS device can come to this function again. + */ + for (i = 0; i < PTL_MAX_INIT; i++) { + if (devid_enabled[i] != PTL_DEVID_IPS) { + devid_enabled[i] = -1; + } + } + + *epido = ep->epid; + *epo = ep; + + return PSM2_OK; + +fail: + if (ep != NULL) { + psmi_hal_close_context(&ep->context.psm_hw_ctxt); + psmi_free(ep); + } + if (epaddr != NULL) + psmi_free(epaddr); + return err; +} + +psm2_error_t +__psm2_ep_open(psm2_uuid_t const unique_job_key, + struct psm2_ep_open_opts const *opts_i, psm2_ep_t *epo, + psm2_epid_t *epido) +{ + psm2_error_t err; + psm2_mq_t mq; + psm2_epid_t epid; + psm2_ep_t ep, tmp; + uint32_t units[PSMI_MAX_QPS]; + uint16_t ports[PSMI_MAX_QPS]; + int i, num_rails = 0; + char *uname = "PSM3_NIC"; + char *pname = "PSM3_NIC_PORT"; + char uvalue[6], pvalue[6]; + int devid_enabled[PTL_MAX_INIT]; + union psmi_envvar_val devs; + int show_nics = 0; + + PSM2_LOG_MSG("entering"); + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (!epo || !epido) + return PSM2_PARAM_ERR; + + /* Allowing only one EP (unless explicitly enabled). */ + if (psmi_opened_endpoint_count > 0 && !psmi_multi_ep_enabled) { + PSM2_LOG_MSG("leaving"); + return PSM2_TOO_MANY_ENDPOINTS; + } + + /* Matched Queue initialization. We do this early because we have to + * make sure ep->mq exists and is valid before calling ips_do_work. + */ + err = psmi_mq_malloc(&mq); + PSMI_LOCK(psmi_creation_lock); + if (err != PSM2_OK) + goto fail; + + /* Set some of the MQ thresholds from the environment. + Do this before ptl initialization - the ptl may have other + constraints that will limit the MQ's settings. 
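+	   (These thresholds are read in psmi_mq_initialize_defaults() below.)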
*/ + err = psmi_mq_initialize_defaults(mq); + if (err != PSM2_OK) + goto fail; + + psmi_init_lock(&(mq->progress_lock)); + + /* See which ptl devices we want to use for this ep to be opened */ + psmi_getenv("PSM3_DEVICES", + "Ordered list of PSM-level devices", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)PSMI_DEVICES_DEFAULT, &devs); + + if ((err = psmi_parse_devices(devid_enabled, devs.e_str))) + goto fail; + + if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + show_nics = psmi_parse_identify(); + err = psmi_ep_multirail(&num_rails, units, ports); + if (err != PSM2_OK) + goto fail; + + /* If multi-rail is used, set the first ep unit/port */ + if (num_rails > 0) { + snprintf(uvalue, 6, "%1d", units[0]); + snprintf(pvalue, 6, "%1d", ports[0]); + setenv(uname, uvalue, 1); + setenv(pname, pvalue, 1); + } + } + +#ifdef PSM_CUDA + if (PSMI_IS_GDR_COPY_ENABLED) + hfi_gdr_open(); +#endif + + err = __psm2_ep_open_internal(unique_job_key, + devid_enabled, opts_i, mq, &ep, &epid); + if (err != PSM2_OK) + goto fail; + + if (psmi_opened_endpoint == NULL) { + psmi_opened_endpoint = ep; + } else { + tmp = psmi_opened_endpoint; + while (tmp->user_ep_next) + tmp = tmp->user_ep_next; + tmp->user_ep_next = ep; + } + psmi_opened_endpoint_count++; + ep->mctxt_prev = ep->mctxt_next = ep; + ep->mctxt_master = ep; + mq->ep = ep; + + if (show_nics) + printf("%s %s NIC %u (%s) Port %u\n", + hfi_get_mylabel(), hfi_ident_tag, + ep->unit_id, sysfs_unit_dev_name(ep->unit_id), + ep->portnum); + + /* Active Message initialization */ + err = psmi_am_init_internal(ep); + if (err != PSM2_OK) + goto fail; + + *epo = ep; + *epido = epid; + + if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + int j; + union psmi_envvar_val envvar_val; + + if (num_rails <= 0) { + // the NIC has now been selected for our process + // use the same NIC for any additional QPs below + num_rails = 1; + units[0] = ep->unit_id; + ports[0] = ep->portnum; + } + // When QP_PER_NIC >1, creates more than 1 QP on each NIC and then + // uses the multi-rail algorithms to spread the traffic across QPs + // This helps get better BW when there are relatively few processes/node + // care must be taken when combining this with user space RC QPs as + // scalability (memory footprint) issues can be multiplied + // This approach duplicates some per NIC resources (CQs, etc) but + // provides a simple approach + psmi_getenv("PSM3_QP_PER_NIC", + "Number of sets of QPs to open per NIC [1]", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)1, &envvar_val); + + if ((num_rails * envvar_val.e_uint) > PSMI_MAX_QPS) { + err = psmi_handle_error(NULL, PSM2_TOO_MANY_ENDPOINTS, + "PSM3_QP_PER_NIC (%u) * num_rails (%d) > Max Support QPs (%u)", + envvar_val.e_uint, num_rails, PSMI_MAX_QPS); + goto fail; + } + + for (j= 0; j< envvar_val.e_uint; j++) { + for (i = 0; i < num_rails; i++) { + _HFI_VDBG("rail %d unit %u port %u\n", i, units[i], ports[i]); + // did 0, 0 already above + if (i == 0 && j== 0) + continue; + snprintf(uvalue, 6, "%1d", units[i]); + snprintf(pvalue, 6, "%1d", ports[i]); + setenv(uname, uvalue, 1); + setenv(pname, pvalue, 1); + + /* Create slave EP */ + err = __psm2_ep_open_internal(unique_job_key, + devid_enabled, opts_i, mq, + &tmp, &epid); + if (err) + goto fail; + + /* Point back to shared resources on the master EP */ + tmp->am_htable = ep->am_htable; + + /* Link slave EP after master EP. 
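+				 * PSM_MCTXT_APPEND() places it in the master's mctxt ring,
+				 * which psm2_ep_close() later walks via mctxt_prev to tear
+				 * down every rail (see the diagram there).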
*/
+				PSM_MCTXT_APPEND(ep, tmp);
+				if (j == 0 && show_nics)
+					printf("%s %s NIC %u (%s) Port %u\n",
+						hfi_get_mylabel(), hfi_ident_tag,
+						tmp->unit_id, sysfs_unit_dev_name(tmp->unit_id),
+						tmp->portnum);
+			}
+		}
+	}
+
+	_HFI_VDBG("psm2_ep_open() OK....\n");
+
+fail:
+	fflush(stdout);
+	PSMI_UNLOCK(psmi_creation_lock);
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_open)
+
+psm2_error_t __psm2_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in)
+{
+	psm2_error_t err = PSM2_OK;
+
+	psmi_stats_ep_close();	// allow output of stats on 1st ep close if desired
+
+#if _HFI_DEBUGGING
+	uint64_t t_start = 0;
+	if (_HFI_PRDBG_ON) {
+		t_start = get_cycles();
+	}
+#endif
+
+#ifdef PSM_CUDA
+	/*
+	 * The close on the gdr fd needs to be called before the
+	 * close on the hfi fd as the gdr device will hold
+	 * a reference count on the hfi device which will make the close
+	 * on the hfi fd return without actually closing the fd.
+	 */
+	if (PSMI_IS_GDR_COPY_ENABLED)
+		hfi_gdr_close();
+#endif
+	union psmi_envvar_val timeout_intval;
+	psm2_ep_t tmp;
+	psm2_mq_t mmq;
+
+	PSM2_LOG_MSG("entering");
+	PSMI_ERR_UNLESS_INITIALIZED(ep);
+	psmi_assert_always(ep->mctxt_master == ep);
+
+	PSMI_LOCK(psmi_creation_lock);
+
+	psmi_am_fini_internal(ep);
+
+	if (psmi_opened_endpoint == NULL) {
+		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+					"PSM Endpoint is closed or does not exist");
+		PSM2_LOG_MSG("leaving");
+		PSMI_UNLOCK(psmi_creation_lock);
+		return err;
+	}
+
+	tmp = psmi_opened_endpoint;
+	while (tmp && tmp != ep) {
+		tmp = tmp->user_ep_next;
+	}
+	if (!tmp) {
+		err = psmi_handle_error(NULL, PSM2_EP_WAS_CLOSED,
+					"PSM Endpoint is closed or does not exist");
+		PSM2_LOG_MSG("leaving");
+		PSMI_UNLOCK(psmi_creation_lock);
+		return err;
+	}
+
+	psmi_getenv("PSM3_CLOSE_TIMEOUT",
+		    "End-point close timeout over-ride.",
+		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
+		    (union psmi_envvar_val)0, &timeout_intval);
+
+	if (getenv("PSM3_CLOSE_TIMEOUT")) {
+		timeout_in = timeout_intval.e_uint * SEC_ULL;
+	} else if (timeout_in > 0) {
+		/* The timeout parameter provides the minimum timeout. A heuristic
+		 * is used to scale up the timeout linearly with the number of
+		 * endpoints, and we allow one second per 100 endpoints. */
+		timeout_in = max(timeout_in, (ep->connections * SEC_ULL) / 100);
+	}
+
+	if (timeout_in > 0 && timeout_in < PSMI_MIN_EP_CLOSE_TIMEOUT)
+		timeout_in = PSMI_MIN_EP_CLOSE_TIMEOUT;
+
+	/* Infinite and excessive close time-outs are limited here to a max.
+	 * The "rationale" is that there is no point waiting around forever for
+	 * graceful termination. Normal (or forced) process termination should clean
+	 * up the context state correctly even if termination is not graceful. */
+	if (timeout_in <= 0 || timeout_in > PSMI_MAX_EP_CLOSE_TIMEOUT)
+		timeout_in = PSMI_MAX_EP_CLOSE_TIMEOUT;
+	_HFI_PRDBG("Closing endpoint %p with force=%s and to=%.2f seconds and "
+		   "%d connections\n",
+		   ep, mode == PSM2_EP_CLOSE_FORCE ? "YES" : "NO",
+		   (double)timeout_in / 1e9, (int)ep->connections);
+
+	/* XXX We currently cheat in the sense that we leave each PTL the allowed
+	 * timeout. There's no good way to do this until we change the PTL
+	 * interface to allow asynchronous finalization
+	 */
+
+
+	/* Check if transfer of receive thread ownership is needed before closing ep.
+	 * In case of PSM3_MULTI_EP support, the receive thread is created and assigned
+	 * to the first opened endpoint. The receive thread is killed when closing this
+	 * endpoint.
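+	 * Ownership is handed to ep->user_ep_next (when present) so the
+	 * remaining endpoints continue to make progress.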
*/
+	if (ep->user_ep_next != NULL) {
+		/* The receive thread will be transferred and assigned to ep->user_ep_next
+		 * only if the currently running receive thread (which will be killed) is
+		 * assigned to ep and none is assigned to ep->user_ep_next.
+		 */
+		if ((psmi_ptl_ips_rcvthread.is_enabled(ep->ptl_ips.ptl)) &&
+		    (!psmi_ptl_ips_rcvthread.is_enabled(ep->user_ep_next->ptl_ips.ptl)))
+			psmi_ptl_ips_rcvthread.transfer_ownership(ep->ptl_ips.ptl, ep->user_ep_next->ptl_ips.ptl);
+	}
+
+	/*
+	 * Before freeing the master ep itself,
+	 * remove it from the global linked list.
+	 * We do it here so that the atexit handler in the ptl_am directory
+	 * can search the global linked list and free the shared memory file.
+	 */
+	if (psmi_opened_endpoint == ep) {
+		/* Removing ep from global endpoint list. */
+		psmi_opened_endpoint = ep->user_ep_next;
+	} else {
+		tmp = psmi_opened_endpoint;
+		while (tmp->user_ep_next != ep) {
+			tmp = tmp->user_ep_next;
+		}
+		/* Removing ep from global endpoint list. */
+		tmp->user_ep_next = ep->user_ep_next;
+	}
+	psmi_opened_endpoint_count--;
+
+	/*
+	 * This do/while loop is used to close endpoints and free their memory.
+	 *
+	 * If the MULTIRAIL feature is disabled, this loop runs only once
+	 * and only the endpoint passed to psm2_ep_close will be closed/removed.
+	 *
+	 * If the MULTIRAIL feature is enabled, this loop runs multiple times
+	 * (depending on the number of rails). The order in which
+	 * endpoints will be closed is shown below:
+	 *
+	 *                     |--this is the master endpoint in case of multirail;
+	 *                     |  it is the endpoint passed to psm2_ep_close and
+	 *                     V  the only endpoint known to the user.
+	 *   +<-Ep0<-Ep1<-Ep2<-Ep3
+	 *   |__________________|	Ep3->mctxt_prev points to Ep2
+	 *   (3)  (2)  (1)  (4)		Ep2->mctxt_prev points to Ep1
+	 *    ^				Ep1->mctxt_prev points to Ep0
+	 *    |				Ep0->mctxt_prev points to Ep3 (master ep)
+	 *    |
+	 *    |---- order in which endpoints will be closed.
+	 *
+	 * Closing MULTIRAILs starts by closing the slaves (Ep2, Ep1, Ep0).
+	 * If MULTIRAIL is enabled, Ep3->mctxt_prev will point to Ep2; if the
+	 * feature is disabled, Ep3->mctxt_prev will point to Ep3 and the
+	 * do/while loop will have one pass.
+	 *
+	 * With MULTIRAIL enabled, Ep3, which is the master endpoint, will be
+	 * closed as the last one.
*/
+	mmq = ep->mq;
+	if (mmq) {
+		// in case mq_finalize not called, need to get stats out
+		// it will be a noop if called twice
+		psm2_mq_finalize(mmq);
+	}
+	tmp = ep->mctxt_prev;
+	do {
+		ep = tmp;
+		tmp = ep->mctxt_prev;
+
+		PSMI_LOCK(ep->mq->progress_lock);
+
+		PSM_MCTXT_REMOVE(ep);
+		if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH))
+			err =
+			    psmi_ptl_amsh.fini(ep->ptl_amsh.ptl, mode,
+					       timeout_in);
+
+		if ((err == PSM2_OK || err == PSM2_TIMEOUT) &&
+		    psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
+			err =
+			    psmi_ptl_ips.fini(ep->ptl_ips.ptl, mode,
+					      timeout_in);
+
+		/* If there are timeouts in the disconnect requests,
+		 * still make sure that we get to close the
+		 * endpoint and mark it closed */
+		if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
+			psmi_context_close(&ep->context);
+
+		psmi_epid_remove_all(ep);
+		psmi_free(ep->epaddr);
+		psmi_free(ep->context_mylabel);
+
+		PSMI_UNLOCK(ep->mq->progress_lock);
+
+		ep->mq = NULL;
+		__psm2_ep_free_verbs(ep);
+
+		psmi_free(ep);
+
+	} while ((err == PSM2_OK || err == PSM2_TIMEOUT) && tmp != ep);
+
+	if (mmq) {
+		psmi_destroy_lock(&(mmq->progress_lock));
+		err = psmi_mq_free(mmq);
+	}
+
+	if (hfi_lids)
+	{
+		psmi_free(hfi_lids);
+		hfi_lids = NULL;
+		nlids = 0;
+	}
+
+	PSMI_UNLOCK(psmi_creation_lock);
+
+	if (_HFI_PRDBG_ON) {
+		_HFI_PRDBG_ALWAYS("Closed endpoint in %.3f secs\n",
+				  (double)cycles_to_nanosecs(get_cycles() -
+				  t_start) / SEC_ULL);
+	}
+	PSM2_LOG_MSG("leaving");
+	return err;
+}
+PSMI_API_DECL(psm2_ep_close)
+
+static
+psm2_error_t
+psmi_ep_open_device(const psm2_ep_t ep,
+		    const struct psm2_ep_open_opts *opts,
+		    const psm2_uuid_t unique_job_key,
+		    struct psmi_context *context, psm2_epid_t *epid)
+{
+	psm2_error_t err = PSM2_OK;
+
+	/* Skip affinity. No affinity if:
+	 * 1. User explicitly sets no-affinity=YES in environment.
+	 * 2. User doesn't set affinity in environment and PSM is opened with
+	 *    option affinity skip.
+	 */
+	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+		union psmi_envvar_val env_rcvthread;
+		static int norcvthread;	/* only for first rail */
+
+		ep->out_sl = opts->outsl;
+
+		if ((err =
+		     psmi_context_open(ep, opts->unit, opts->port,
+				       unique_job_key, opts->timeout,
+				       context)) != PSM2_OK)
+			goto fail;
+
+		_HFI_DBG("[%d]use unit %d port %d\n", getpid(),
+			 ep->unit_id, 1);
+
+		/* At this point, we have the unit id and port number, so
+		 * check that the pkey is not 0x0/0x7fff/0xffff and matches one
+		 * of the pkeys in the table.
+		 */
+		if ((err =
+		     psmi_ep_verify_pkey(ep, (uint16_t) opts->network_pkey,
+					 &ep->network_pkey, &ep->network_pkey_index)) != PSM2_OK)
+			goto fail;
+
+		/* See if we want to activate support for receive thread */
+		psmi_getenv("PSM3_RCVTHREAD",
+			    "Enable Recv thread (0 disables thread)",
+			    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+			    // default to 0 for all but 1st rail
+			    (union psmi_envvar_val)(norcvthread++ ?
0 : + PSMI_RCVTHREAD_FLAGS), + &env_rcvthread); + + /* If enabled, use the polling capability to implement a receive + * interrupt thread that can handle urg packets */ + if (env_rcvthread.e_uint) { + psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD); +#ifdef PSMI_PLOCK_IS_NOLOCK + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "#define PSMI_PLOCK_IS_NOLOCK not functional yet " + "with RCVTHREAD on"); +#endif + } + + *epid = context->epid; + } else if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { + *epid = PSMI_EPID_PACK_SHM(getpid(), + PSMI_EPID_SHM_ONLY); /*is a only-shm epid */ + } else { + /* Self-only, meaning only 1 proc max */ + *epid = PSMI_EPID_PACK_SHM(0, + PSMI_EPID_SHM_ONLY); /*is a only-shm epid */ + } + +fail: + return err; +} + +/* Get a list of PTLs we want to use. The order is important, it affects + * whether node-local processes use shm or ips */ +static +psm2_error_t +psmi_parse_devices(int devices[PTL_MAX_INIT], const char *devstring) +{ + char *devstr = NULL; + char *b_new, *e, *ee, *b; + psm2_error_t err = PSM2_OK; + int len; + int i = 0; + + psmi_assert_always(devstring != NULL); + len = strlen(devstring) + 1; + + for (i = 0; i < PTL_MAX_INIT; i++) + devices[i] = -1; + + devstr = (char *)psmi_calloc(PSMI_EP_NONE, UNDEFINED, 2, len); + if (devstr == NULL) + goto fail; + + b_new = (char *)devstr; + e = b_new + len; + strncpy(e, devstring, len); + ee = e + len; + i = 0; + while (e < ee && *e && i < PTL_MAX_INIT) { + while (*e && !isalpha(*e)) + e++; + b = e; + while (*e && isalpha(*e)) + e++; + *e = '\0'; + if (*b) { + if (!strcasecmp(b, "self")) { + devices[i++] = PTL_DEVID_SELF; + b_new = strcpy(b_new, "self,"); + b_new += 5; + } else if (!strcasecmp(b, "shm") || + !strcasecmp(b, "shmem") || + !strcasecmp(b, "amsh")) { + devices[i++] = PTL_DEVID_AMSH; + strcpy(b_new, "amsh,"); + b_new += 5; + } else if (!strcasecmp(b, "hfi") || + !strcasecmp(b, "nic") || + !strcasecmp(b, "ipath") || + !strcasecmp(b, "ips")) { + devices[i++] = PTL_DEVID_IPS; + strcpy(b_new, "ips,"); + b_new += 4; + } else { + err = psmi_handle_error(NULL, PSM2_PARAM_ERR, + "%s set in environment variable PSM_PTL_DEVICES=\"%s\" " + "is not one of the recognized PTL devices (%s)", + b, devstring, + PSMI_DEVICES_DEFAULT); + goto fail; + } + e++; + } + } + if (b_new != devstr) /* we parsed something, remove trailing comma */ + *(b_new - 1) = '\0'; + + _HFI_PRDBG("PSM Device allocation order: %s\n", devstr); +fail: + if (devstr != NULL) + psmi_free(devstr); + return err; + +} + +static +int psmi_device_is_enabled(const int devid_enabled[PTL_MAX_INIT], int devid) +{ + int i; + for (i = 0; i < PTL_MAX_INIT; i++) + if (devid_enabled[i] == devid) + return 1; + return 0; +} + +int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid) +{ + return psmi_device_is_enabled(ep->devid_enabled, devid); +} diff --git a/prov/psm3/psm3/psm_ep.h b/prov/psm3/psm3/psm_ep.h new file mode 100644 index 00000000000..24d3fe49c3d --- /dev/null +++ b/prov/psm3/psm3/psm_ep.h @@ -0,0 +1,259 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
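The psmi_parse_devices routine above normalizes the PSM3_DEVICES-style string into an ordered id list, and the order matters because it decides whether node-local peers go through shm or ips. Below is a minimal standalone sketch of that normalization, using placeholder names (parse_devices, DEV_SELF/DEV_AMSH/DEV_IPS) rather than the provider's own symbols:

    #include <stdio.h>
    #include <string.h>
    #include <strings.h>

    /* Illustrative device ids; the real values come from the PTL headers. */
    enum { DEV_SELF, DEV_AMSH, DEV_IPS, DEV_MAX = 3 };

    /* Normalize a device string into an ordered id list, accepting the same
     * aliases psmi_parse_devices recognizes above. */
    static int parse_devices(const char *s, int out[DEV_MAX])
    {
        int n = 0;
        char buf[64], *tok, *save;
        snprintf(buf, sizeof(buf), "%s", s);
        for (tok = strtok_r(buf, ", ", &save); tok && n < DEV_MAX;
             tok = strtok_r(NULL, ", ", &save)) {
            if (!strcasecmp(tok, "self"))
                out[n++] = DEV_SELF;
            else if (!strcasecmp(tok, "shm") || !strcasecmp(tok, "shmem") ||
                     !strcasecmp(tok, "amsh"))
                out[n++] = DEV_AMSH;
            else if (!strcasecmp(tok, "hfi") || !strcasecmp(tok, "nic") ||
                     !strcasecmp(tok, "ipath") || !strcasecmp(tok, "ips"))
                out[n++] = DEV_IPS;
            else
                return -1;              /* unknown device name */
        }
        return n;
    }

    int main(void)
    {
        int devs[DEV_MAX];
        int n = parse_devices("self,shmem,nic", devs);
        /* Prints: 3 devices: 0 1 2  (self, amsh, ips in request order) */
        printf("%d devices:", n);
        for (int i = 0; i < n; i++)
            printf(" %d", devs[i]);
        printf("\n");
        return 0;
    }

In the provider itself, an unrecognized name instead falls through to the PSM2_PARAM_ERR path shown earlier.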
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + +#ifndef _PSMI_IN_USER_H +#error psm2_ep.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_EP_H +#define _PSMI_EP_H + +#include "psm_verbs_ep.h" + +/* + * EPIDs encode the basic information needed to establish + * datagram traffic so that PSM connection establishment can + * negotiate and exchange the rest. 
+ * + * EPID includes: EPID format version, network address, queue ID within NIC + */ + + +#define PSMI_SL_DEFAULT 0 +#define PSMI_SL_MIN 0 +#define PSMI_SL_MAX 31 +// IB/OPA: +// 0-2: ver = 3 +// 3-7: spare +// 8-31: QPN +// 32-47: lid [note, IB & OPA100 only support 16 bit LIDs] +// 48-63: subnet prefix low 16 bits +#define PSMI_EPID_PACK_V3(lid, qpn, subnet_id) \ + (((((uint64_t)PSMI_EPID_V3)&0x7)<<0) | \ + ((((uint64_t)qpn)&0xffffff)<<8) | \ + ((((uint64_t)lid)&0xffff)<<32) | \ + ((((uint64_t)subnet_id)&0xffff)<<48)) +// Eth: +// 0-2: ver = 4 +// 3-7: subnet (number of high bits in IP addr representing IP subnet) +// 8-31: UD QPN or UDP socket +// 32-63: IPv4 address +#define PSMI_EPID_PACK_V4(ip, qpn, subnet_bits) \ + (((((uint64_t)PSMI_EPID_V4)&0x7)<<0) | \ + ((((uint64_t)subnet_bits)&0x1f)<<3) | \ + ((((uint64_t)qpn)&0xffffff)<<8) | \ + ((((uint64_t)ip)&0xffffffff)<<32)) + +// shm and self: +// 0-2: ver = 0 +// 3: shm-only flag (1) +// 4-31: spare +// 32-63: pid +#define PSMI_EPID_PACK_SHM(process_id, shmbool) \ + (((((uint64_t)process_id)&0xffffffff)<<32) | \ + ((((uint64_t)shmbool)&0x1)<<3) | \ + ((((uint64_t)PSMI_EPID_VERSION_SHM)&0x7)<<0)) + +#define PSMI_EPID_GET_EPID_VERSION(epid) (((epid)>>0)&0x7) +#define PSMI_EPID_GET_LID_V3(epid) (((epid)>>32)&0xffff) // lid +#define PSMI_EPID_GET_LID_V4(epid) (((epid)>>32)&0xffffffff) // ip +#define PSMI_EPID_GET_CONTEXT(epid) (((epid)>>8)&0xffffff) // qpn/sock +#define PSMI_EPID_GET_SUBNET_ID_V3(epid) (((epid)>>48)&0xffff) +#define PSMI_EPID_GET_SUBNET_ID_V4(epid) (psmi_bit_count_to_mask(((epid)>>3)&0x1f) & PSMI_EPID_GET_LID_V4(epid)) // subnetwork +#define PSMI_EPID_GET_SUBNET_ID(epid) ((PSMI_EPID_GET_EPID_VERSION(epid) == PSMI_EPID_V3) ? \ + (uint32_t)PSMI_EPID_GET_SUBNET_ID_V3(epid) \ + : (uint32_t)PSMI_EPID_GET_SUBNET_ID_V4(epid)) +#define PSMI_EPID_CONTEXT_FMT "%d" +#define PSMI_EPID_GET_CONTEXT_VAL(epid) (int)PSMI_EPID_GET_CONTEXT(epid) + +#define PSM_MCTXT_APPEND(head, node) \ + node->mctxt_prev = head->mctxt_prev; \ + node->mctxt_next = head; \ + head->mctxt_prev->mctxt_next = node; \ + head->mctxt_prev = node; \ + node->mctxt_master = head +#define PSM_MCTXT_REMOVE(node) \ + node->mctxt_prev->mctxt_next = node->mctxt_next; \ + node->mctxt_next->mctxt_prev = node->mctxt_prev; \ + node->mctxt_next = node->mctxt_prev = node; \ + node->mctxt_master = NULL + +struct psm2_ep { + psm2_epid_t epid; /**> This endpoint's Endpoint ID */ + psm2_epaddr_t epaddr; /**> This ep's ep address */ + psm2_mq_t mq; /**> only 1 MQ */ + struct psm2_verbs_ep verbs_ep; + + int unit_id; + uint16_t portnum; + uint16_t out_sl; + // mtu is PSM payload allowed by local HW, + // mtu may be further reduced via PSM3_MTU by ips_proto_init + // for UD/UDP, mtu is reduced by PSM hdr size + uint16_t mtu; /* out_sl-->vl-->mtu in sysfs */ + uint16_t network_pkey; /**> Pkey */ + uint16_t network_pkey_index; /**> Pkey index */ + int did_syslog; + psm2_uuid_t uuid; + uint16_t jkey; + uint64_t service_id; /* OPA service ID */ + psm2_path_res_t path_res_type; /* Path resolution for endpoint */ + psm2_ep_errhandler_t errh; + int devid_enabled[PTL_MAX_INIT]; + int memmode; /**> min, normal, large memory mode */ + + uint32_t hfi_num_sendbufs;/**> Number of allocated send buffers */ + uint32_t hfi_num_descriptors;/** Number of allocated scb descriptors*/ + uint32_t hfi_num_send_wqes;/** Number of allocated SQ WQEs for send*/ + uint32_t hfi_num_send_rdma;/** Number of concurrent RDMA*/ + uint32_t hfi_send_reap_thresh;/** when to reap SQ compleitions*/ + uint32_t 
hfi_num_recv_wqes;/** Number of allocated RQ WQEs*/ + uint32_t hfi_num_recv_cqes;/** Number of allocated RQ CQEs*/ + uint8_t hfi_qp_timeout;/** RC QP timeout, IB enum */ + uint8_t hfi_qp_retry;/** RC QP retry limit */ + uint8_t rdmamode; /** PSM3_RDMA */ + uint8_t mr_cache_mode; /** PSM3_MR_CACHE_MODE */ + uint8_t rv_num_conn; /** PSM3_RV_QP_PER_CONN */ + uint32_t rv_mr_cache_size; /** PSM3_RV_MR_CACHE_SIZE */ + uint32_t rv_q_depth; /** PSM3_RV_Q_DEPTH */ + uint32_t rv_reconnect_timeout; /* PSM3_RV_RECONNECT_TIMEOUT */ + uint32_t rv_hb_interval; /* PSM3_RV_HEARTBEAT_INTERVAL */ + uint32_t hfi_imm_size; /** Immediate data size */ + uint32_t connections; /**> Number of connections */ + + psmi_context_t context; + char *context_mylabel; + uint32_t yield_spin_cnt; + + /* EP link-lists */ + struct psm2_ep *user_ep_next; + + /* EP link-lists for multi-context. */ + struct psm2_ep *mctxt_prev; + struct psm2_ep *mctxt_next; + struct psm2_ep *mctxt_master; + + /* Active Message handler table */ + struct psm2_ep_am_handle_entry *am_htable; + + uint64_t gid_hi; + uint64_t gid_lo; + + ptl_ctl_t ptl_amsh; + ptl_ctl_t ptl_ips; + ptl_ctl_t ptl_self; + + /* All ptl data is allocated inline below */ + uint8_t ptl_base_data[0] __attribute__ ((aligned(64))); + bool skip_affinity; +}; + +struct mqq { + psm2_mq_req_t first; + psm2_mq_req_t last; +}; + +typedef +union psmi_seqnum { + struct { + uint32_t psn_seq:11; + uint32_t psn_gen:20; + }; + struct { + uint32_t psn_num:31; + }; + uint32_t psn_val; +} psmi_seqnum_t; + +/* + * PSM end point address. One per connection and per rail. + */ +struct psm2_epaddr { + psm2_epid_t epid; /* peer's epid */ + ptl_ctl_t *ptlctl; /* The control structure for the ptl */ + struct ips_proto *proto; /* only for ips protocol */ + void *usr_ep_ctxt; /* User context associated with endpoint */ +}; + +#ifndef PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD +# define PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD 250 +#endif + +/* + * Users of BLOCKUNTIL should check the value of err upon return + */ +#define PSMI_BLOCKUNTIL(ep, err, cond) do { \ + int spin_cnt = 0; \ + PSMI_PROFILE_BLOCK(); \ + while (!(cond)) { \ + err = psmi_poll_internal(ep, 1); \ + if (err == PSM2_OK_NO_PROGRESS) { \ + PSMI_PROFILE_REBLOCK(1); \ + if (++spin_cnt == (ep)->yield_spin_cnt) { \ + spin_cnt = 0; \ + PSMI_YIELD((ep)->mq->progress_lock); \ + } \ + } \ + else if (err == PSM2_OK) { \ + PSMI_PROFILE_REBLOCK(0); \ + spin_cnt = 0; \ + } \ + else \ + break; \ + } \ + PSMI_PROFILE_UNBLOCK(); \ +} while (0) + +#endif /* _PSMI_EP_H */ diff --git a/prov/psm3/psm3/psm_ep_connect.c b/prov/psm3/psm3/psm_ep_connect.c new file mode 100644 index 00000000000..122994d973b --- /dev/null +++ b/prov/psm3/psm3/psm_ep_connect.c @@ -0,0 +1,619 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
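The EPID packing macros in psm_ep.h above document the bit layout in comments; a worked round-trip makes it concrete. The standalone sketch below copies the V4 (Ethernet) shifts; pack_v4 and high_bits_mask are local stand-ins, and the assumption that psmi_bit_count_to_mask builds a netmask from the high-bit count is inferred from the surrounding comments:

    #include <stdio.h>
    #include <stdint.h>

    #define EPID_V4 4   /* "ver = 4" for Ethernet epids, per the comment above */

    /* Assumed behavior of psmi_bit_count_to_mask: top n bits set. */
    static uint32_t high_bits_mask(unsigned n)
    {
        return n ? (uint32_t)(~0u << (32 - n)) : 0;
    }

    /* Same shifts as PSMI_EPID_PACK_V4. */
    static uint64_t pack_v4(uint32_t ip, uint32_t qpn, unsigned subnet_bits)
    {
        return (((uint64_t)EPID_V4     & 0x7)        << 0)  |
               (((uint64_t)subnet_bits & 0x1f)       << 3)  |
               (((uint64_t)qpn         & 0xffffff)   << 8)  |
               (((uint64_t)ip          & 0xffffffff) << 32);
    }

    int main(void)
    {
        uint32_t ip  = 0xC0A80102;            /* 192.168.1.2 */
        uint32_t qpn = 0x1234;
        uint64_t epid = pack_v4(ip, qpn, 24); /* /24 subnet */

        printf("ver    = %llu\n", (unsigned long long)((epid >> 0) & 0x7));        /* 4 */
        printf("qpn    = 0x%llx\n", (unsigned long long)((epid >> 8) & 0xffffff)); /* 0x1234 */
        printf("ip     = 0x%llx\n", (unsigned long long)((epid >> 32) & 0xffffffff));
        printf("subnet = 0x%x\n",
               (uint32_t)(high_bits_mask((epid >> 3) & 0x1f) &
                          (uint32_t)((epid >> 32) & 0xffffffff)));  /* 0xc0a80100 */
        return 0;
    }

The GET macros above are exactly these shifts in reverse, so the subnet id recovered for this example is 192.168.1.0/24.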
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid); + +#if _HFI_DEBUGGING +PSMI_ALWAYS_INLINE( +char *psmi_getdevice(int type)) +{ + switch (type) { + case PTL_DEVID_IPS: + return "ips"; + case PTL_DEVID_AMSH: + return "amsh"; + case PTL_DEVID_SELF: + return "self"; + default: + return "ips"; + } +} +#endif + +psm2_error_t +__psm2_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epid, + int const *array_of_epid_mask, /* can be NULL */ + psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr, + int64_t timeout) +{ + psm2_error_t err = PSM2_OK; + ptl_ctl_t *ptlctl; + ptl_t *ptl; + int i, j, dup_idx; + int num_toconnect = 0; + int *epid_mask = NULL; + int *epid_mask_isdupof = NULL; + uint64_t t_start = get_cycles(); + uint64_t t_left; + union psmi_envvar_val timeout_intval; + + PSM2_LOG_MSG("entering"); + PSMI_ERR_UNLESS_INITIALIZED(ep); + + /* + * Normally we would lock here, but instead each implemented ptl component + * does its own locking. This is mostly because the ptl components are + * ahead of the PSM2 interface in that they can disconnect their peers. + */ + if (ep == NULL || array_of_epaddr == NULL || array_of_epid == NULL || + num_of_epid < 1) { + err = psmi_handle_error(ep, PSM2_PARAM_ERR, + "Invalid psm2_ep_connect parameters"); + goto fail_nolock; + } + + PSMI_LOCK(ep->mq->progress_lock); + + /* We need two of these masks to detect duplicates */ + err = PSM2_NO_MEMORY; + epid_mask = + (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid); + if (epid_mask == NULL) + goto fail; + epid_mask_isdupof = + (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid); + if (epid_mask_isdupof == NULL) + goto fail; + err = PSM2_OK; + + /* Eventually handle timeouts across all connects. 
*/ + for (j = 0; j < num_of_epid; j++) { + if (array_of_epid_mask != NULL && !array_of_epid_mask[j]) + epid_mask[j] = 0; + else { + epid_mask[j] = 1; + array_of_errors[j] = PSM2_EPID_UNKNOWN; + array_of_epaddr[j] = NULL; + if (psmi_epid_version(array_of_epid[j]) != + PSMI_EPID_VERSION) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + " Mismatched version of EPID - %"PRIu64"\n" + "Confirm all nodes are running the same interconnect HW and PSM version\n", + psmi_epid_version(array_of_epid[j])); + } + num_toconnect++; + } + epid_mask_isdupof[j] = -1; + } + + psmi_getenv("PSM3_CONNECT_TIMEOUT", + "End-point minimum connection timeout. 0 for no time-out.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)(timeout/SEC_ULL), &timeout_intval); + + if (getenv("PSM3_CONNECT_TIMEOUT")) { + timeout = timeout_intval.e_uint * SEC_ULL; + } else if (timeout > 0) { + /* The timeout parameter provides the minimum timeout. A heuristic + * is used to scale up the timeout linearly with the number of + * endpoints, and we allow one second per 100 endpoints. */ + timeout = max(timeout, (num_toconnect * SEC_ULL) / 100); + } + + if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT) + timeout = PSMI_MIN_EP_CONNECT_TIMEOUT; + _HFI_PRDBG("Connect to %d endpoints with time-out of %.2f secs\n", + num_toconnect, (double)timeout / 1e9); + + /* Look for duplicates in input array */ + for (i = 0; i < num_of_epid; i++) { + for (j = i + 1; j < num_of_epid; j++) { + if (array_of_epid[i] == array_of_epid[j] && + epid_mask[i] && epid_mask[j]) { + epid_mask[j] = 0; /* don't connect more than once */ + epid_mask_isdupof[j] = i; + } + } + } + + for (i = 0; i < PTL_MAX_INIT; i++) { + if (ep->devid_enabled[i] == -1) + continue; + /* Set up the right connect ptrs */ + switch (ep->devid_enabled[i]) { + case PTL_DEVID_IPS: + ptlctl = &ep->ptl_ips; + ptl = ep->ptl_ips.ptl; + break; + case PTL_DEVID_AMSH: + ptlctl = &ep->ptl_amsh; + ptl = ep->ptl_amsh.ptl; + break; + case PTL_DEVID_SELF: + ptlctl = &ep->ptl_self; + ptl = ep->ptl_self.ptl; + break; + default: + ptlctl = &ep->ptl_ips; /*no-unused */ + ptl = ep->ptl_ips.ptl; /*no-unused */ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unknown/unhandled PTL id %d\n", + ep->devid_enabled[i]); + break; + } + t_left = psmi_cycles_left(t_start, timeout); + + if (_HFI_VDBG_ON) { + _HFI_VDBG_ALWAYS + ("Trying to connect with device %s\n", + psmi_getdevice(ep->devid_enabled[i])); + } + if ((err = ptlctl->ep_connect(ptl, num_of_epid, array_of_epid, + epid_mask, array_of_errors, + array_of_epaddr, + cycles_to_nanosecs(t_left)))) { + if (_HFI_PRDBG_ON) { + _HFI_PRDBG_ALWAYS + ("Connect failure in device %s err=%d\n", + psmi_getdevice(ep->devid_enabled[i]), err); + } + goto connect_fail; + } + + /* Now process what's been connected */ + for (j = 0; j < num_of_epid; j++) { + dup_idx = epid_mask_isdupof[j]; + if (!epid_mask[j] && dup_idx == -1) + continue; + + if (dup_idx != -1) { /* dup */ + array_of_epaddr[j] = array_of_epaddr[dup_idx]; + array_of_errors[j] = array_of_errors[dup_idx]; + epid_mask_isdupof[j] = -1; + } + + if (array_of_errors[j] == PSM2_OK) { + epid_mask[j] = 0; /* don't try on next ptl */ + ep->connections++; + } + } + } + + for (i = 0; i < num_of_epid; i++) { + ptl_ctl_t *c = NULL; + if (array_of_epid_mask != NULL && !array_of_epid_mask[i]) + continue; + /* If we see unreachable here, that means some PTLs were not enabled */ + if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) { + err = PSM2_EPID_UNREACHABLE; + break; + } + + 
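The connection-timeout heuristic above (a caller-supplied minimum, scaled up to one second per 100 endpoints) is easier to see with numbers. A small standalone sketch; SEC_NS stands in for SEC_ULL and the PSMI_MIN_EP_CONNECT_TIMEOUT clamp is left out:

    #include <stdio.h>
    #include <stdint.h>

    #define SEC_NS 1000000000ULL   /* stand-in for SEC_ULL (nanoseconds) */

    /* Minimum-timeout heuristic used by the connect (and disconnect) paths:
     * at least the caller's value, at least one second per 100 endpoints. */
    static uint64_t scale_timeout(uint64_t caller_timeout_ns, int num_toconnect)
    {
        uint64_t scaled = ((uint64_t)num_toconnect * SEC_NS) / 100;
        return caller_timeout_ns > scaled ? caller_timeout_ns : scaled;
    }

    int main(void)
    {
        /* A 5 s request for 2000 peers becomes 20 s; for 100 peers it stays 5 s. */
        printf("%.1f s\n", scale_timeout(5 * SEC_NS, 2000) / 1e9);  /* 20.0 */
        printf("%.1f s\n", scale_timeout(5 * SEC_NS, 100)  / 1e9);  /*  5.0 */
        return 0;
    }
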
psmi_assert_always(array_of_epaddr[i] != NULL); + c = array_of_epaddr[i]->ptlctl; + psmi_assert_always(c != NULL); + _HFI_VDBG("%-20s DEVICE %s (%p)\n", + psmi_epaddr_get_name(array_of_epid[i]), + c == &ep->ptl_ips ? "nic" : + (c == &ep->ptl_amsh ? "amsh" : "self"), + (void *)array_of_epaddr[i]->ptlctl->ptl); + } + + if (err == PSM2_OK) + for (i=0; idevid_enabled[i]) { + case PTL_DEVID_IPS: + devname = "nic"; + break; + case PTL_DEVID_AMSH: + devname = "shm"; + break; + case PTL_DEVID_SELF: + default: + devname = "self"; + break; + } + len += + snprintf(errbuf + len, + sizeof(errbuf) - len - 1, "%s,", + devname); + } + if (len < sizeof(errbuf) - 1 && devname != NULL) + /* parsed something, remove trailing comma */ + errbuf[len - 1] = ')'; + } else + len = snprintf(errbuf, sizeof(errbuf) - 1, + "%s", err == PSM2_TIMEOUT ? + "Detected connection timeout" : + psm2_error_get_string(err)); + + /* first pass, look for all nodes with the error */ + for (i = 0; i < num_of_epid && len < sizeof(errbuf) - 1; i++) { + if (array_of_epid_mask != NULL + && !array_of_epid_mask[i]) + continue; + if (array_of_errors[i] == PSM2_OK) + continue; + if (array_of_errors[i] == PSM2_EPID_UNREACHABLE && + err != PSM2_EPID_UNREACHABLE) + continue; + if (array_of_errors[i]) + array_of_epaddr[i] = NULL; + if (err == array_of_errors[i]) { + len += + snprintf(errbuf + len, + sizeof(errbuf) - len - 1, "%c %s", + j == 0 ? ':' : ',', + psmi_epaddr_get_hostname + (array_of_epid[i])); + j++; + } + } + errbuf[sizeof(errbuf) - 1] = '\0'; + err = psmi_handle_error(ep, err, "%s", errbuf); + } + +fail: + PSMI_UNLOCK(ep->mq->progress_lock); + +fail_nolock: + if (epid_mask != NULL) + psmi_free(epid_mask); + if (epid_mask_isdupof != NULL) + psmi_free(epid_mask_isdupof); + + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_ep_connect) + +psm2_error_t __psm2_ep_disconnect(psm2_ep_t ep, int num_of_epaddr, + psm2_epaddr_t *array_of_epaddr, + const int *array_of_epaddr_mask, + psm2_error_t *array_of_errors, + int64_t timeout) +{ + return psm2_ep_disconnect2(ep, num_of_epaddr, array_of_epaddr, + array_of_epaddr_mask, array_of_errors, + PSM2_EP_DISCONNECT_GRACEFUL, timeout); +} +PSMI_API_DECL(psm2_ep_disconnect) + +psm2_error_t __psm2_ep_disconnect2(psm2_ep_t ep, int num_of_epaddr, + psm2_epaddr_t *array_of_epaddr, + const int *array_of_epaddr_mask, + psm2_error_t *array_of_errors, + int mode, int64_t timeout) +{ + psm2_error_t err = PSM2_OK; + ptl_ctl_t *ptlctl; + ptl_t *ptl; + int i, j, dup_idx; + int num_todisconnect = 0; + int *epaddr_mask = NULL; + int *epaddr_mask_isdupof = NULL; + uint64_t t_start = get_cycles(); + uint64_t t_left; + union psmi_envvar_val timeout_intval; + + PSM2_LOG_MSG("entering"); + PSMI_ERR_UNLESS_INITIALIZED(ep); + + + /* + * Normally we would lock here, but instead each implemented ptl component + * does its own locking. This is mostly because the ptl components are + * ahead of the PSM2 interface in that they can disconnect their peers. 
+ */ + if (ep == NULL || array_of_epaddr == NULL || + num_of_epaddr < 1) { + err = psmi_handle_error(ep, PSM2_PARAM_ERR, + "Invalid psm2_ep_disconnect parameters"); + goto fail_nolock; + } + + PSMI_LOCK(ep->mq->progress_lock); + + /* We need two of these masks to detect duplicates */ + err = PSM2_NO_MEMORY; + epaddr_mask = + (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epaddr); + if (epaddr_mask == NULL) + goto fail; + epaddr_mask_isdupof = + (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epaddr); + if (epaddr_mask_isdupof == NULL) + goto fail; + err = PSM2_OK; + + /* Eventually handle timeouts across all connects. */ + for (j = 0; j < num_of_epaddr; j++) { + if (array_of_epaddr_mask != NULL && !array_of_epaddr_mask[j]) + epaddr_mask[j] = 0; + else { + epaddr_mask[j] = 1; + array_of_errors[j] = PSM2_EPID_UNKNOWN; + num_todisconnect++; + } + epaddr_mask_isdupof[j] = -1; + } + + psmi_getenv("PSM3_DISCONNECT_TIMEOUT", + "End-point disconnection timeout over-ride. 0 for no time-out.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)0, &timeout_intval); + + if (getenv("PSM3_DISCONNECT_TIMEOUT")) { + timeout = timeout_intval.e_uint * SEC_ULL; + } else if (timeout > 0) { + /* The timeout parameter provides the minimum timeout. A heuristic + * is used to scale up the timeout linearly with the number of + * endpoints, and we allow one second per 100 endpoints. */ + timeout = max(timeout, (num_todisconnect * SEC_ULL) / 100); + } + + if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT) + timeout = PSMI_MIN_EP_CONNECT_TIMEOUT; + _HFI_PRDBG("Disconnect %d endpoints with time-out of %.2f secs\n", + num_todisconnect, (double)timeout / 1e9); + + /* Look for duplicates in input array */ + for (i = 0; i < num_of_epaddr; i++) { + for (j = i + 1; j < num_of_epaddr; j++) { + if (array_of_epaddr[i] == array_of_epaddr[j] && + epaddr_mask[i] && epaddr_mask[j]) { + epaddr_mask[j] = 0; /* don't disconnect more than once */ + epaddr_mask_isdupof[j] = i; + } + } + } + + for (i = 0; i < PTL_MAX_INIT; i++) { + if (ep->devid_enabled[i] == -1) + continue; + /* Set up the right connect ptrs */ + switch (ep->devid_enabled[i]) { + case PTL_DEVID_IPS: + ptlctl = &ep->ptl_ips; + ptl = ep->ptl_ips.ptl; + break; + case PTL_DEVID_AMSH: + ptlctl = &ep->ptl_amsh; + ptl = ep->ptl_amsh.ptl; + break; + case PTL_DEVID_SELF: + ptlctl = &ep->ptl_self; + ptl = ep->ptl_self.ptl; + break; + default: + ptlctl = &ep->ptl_ips; /*no-unused */ + ptl = ep->ptl_ips.ptl; /*no-unused */ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unknown/unhandled PTL id %d\n", + ep->devid_enabled[i]); + break; + } + t_left = psmi_cycles_left(t_start, timeout); + + if (_HFI_VDBG_ON) { + _HFI_VDBG_ALWAYS + ("Trying to disconnect with device %s\n", + psmi_getdevice(ep->devid_enabled[i])); + } + if ((err = ptlctl->ep_disconnect(ptl, (mode == PSM2_EP_DISCONNECT_FORCE), + num_of_epaddr, array_of_epaddr, + epaddr_mask, array_of_errors, + cycles_to_nanosecs(t_left)))) { + if (_HFI_PRDBG_ON) { + _HFI_PRDBG_ALWAYS + ("Disconnect failure in device %s err=%d\n", + psmi_getdevice(ep->devid_enabled[i]), err); + } + goto disconnect_fail; + } + + /* Now process what's been disconnected */ + for (j = 0; j < num_of_epaddr; j++) { + dup_idx = epaddr_mask_isdupof[j]; + if (!epaddr_mask[j] && dup_idx == -1) + continue; + + if (dup_idx != -1) { /* dup */ + array_of_errors[j] = array_of_errors[dup_idx]; + epaddr_mask_isdupof[j] = -1; + } + + if (array_of_errors[j] == PSM2_OK) { + epaddr_mask[j] = 0; /* don't try on next 
ptl */ + array_of_epaddr[j] = NULL; + ep->connections--; + } + } + } + + for (i = 0; i < num_of_epaddr; i++) { + if (array_of_epaddr_mask != NULL && !array_of_epaddr_mask[i]) + continue; + /* If we see unreachable here, that means some PTLs were not enabled */ + if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) { + err = PSM2_EPID_UNREACHABLE; + break; + } + } + +disconnect_fail: + /* If the error is a timeout (at worse) and the client is OPA MPI, + * just return timeout to let OPA MPI handle the hostnames that + * timed out */ + if (err != PSM2_OK) { + char errbuf[PSM2_ERRSTRING_MAXLEN]; + size_t len; + int j = 0; + + if (err == PSM2_EPID_UNREACHABLE) { + char *deverr = "of an incorrect setting"; + char *eperr = ""; + char *devname = NULL; + if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { + deverr = + "there is no shared memory PSM3 device (shm)"; + eperr = " shared memory"; + } else + if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { + deverr = + "there is no OPA PSM3 device (nic)"; + eperr = " OPA"; + } + + len = snprintf(errbuf, sizeof(errbuf) - 1, + "Some%s endpoints could not be disconnected because %s " + "in the currently enabled PSM3_DEVICES (", + eperr, deverr); + for (i = 0; i < PTL_MAX_INIT && len < sizeof(errbuf) - 1; i++) { + switch (ep->devid_enabled[i]) { + case PTL_DEVID_IPS: + devname = "nic"; + break; + case PTL_DEVID_AMSH: + devname = "shm"; + break; + case PTL_DEVID_SELF: + default: + devname = "self"; + break; + } + len += + snprintf(errbuf + len, + sizeof(errbuf) - len - 1, "%s,", + devname); + } + if (len < sizeof(errbuf) - 1 && devname != NULL) + /* parsed something, remove trailing comma */ + errbuf[len - 1] = ')'; + } else + len = snprintf(errbuf, sizeof(errbuf) - 1, + "%s", err == PSM2_TIMEOUT ? + "Detected disconnect timeout" : + psm2_error_get_string(err)); + + /* first pass, look for all nodes with the error */ + for (i = 0; i < num_of_epaddr && len < sizeof(errbuf) - 1; i++) { + if (array_of_epaddr_mask != NULL + && !array_of_epaddr_mask[i]) + continue; + if (array_of_errors[i] == PSM2_OK) + continue; + if (array_of_errors[i] == PSM2_EPID_UNREACHABLE && + err != PSM2_EPID_UNREACHABLE) + continue; + if (err == array_of_errors[i]) { + len += + snprintf(errbuf + len, + sizeof(errbuf) - len - 1, "%c %s", + j == 0 ? ':' : ',', + psmi_epaddr_get_hostname + (array_of_epaddr[i]->epid)); + j++; + } + } + errbuf[sizeof(errbuf) - 1] = '\0'; + err = psmi_handle_error(ep, err, "%s", errbuf); + } + +fail: + PSMI_UNLOCK(ep->mq->progress_lock); + +fail_nolock: + if (epaddr_mask != NULL) + psmi_free(epaddr_mask); + if (epaddr_mask_isdupof != NULL) + psmi_free(epaddr_mask_isdupof); + + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_ep_disconnect2) diff --git a/prov/psm3/psm3/psm_error.c b/prov/psm3/psm3/psm_error.c new file mode 100644 index 00000000000..27da64115bb --- /dev/null +++ b/prov/psm3/psm3/psm_error.c @@ -0,0 +1,351 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" + +#define PSMI_NOLOG -1 + +struct psm2_error_token { + psm2_ep_t ep; + psm2_error_t error; + char err_string[PSM2_ERRSTRING_MAXLEN]; +}; + +static +psm2_error_t +psmi_errhandler_noop(psm2_ep_t ep, const psm2_error_t err, + const char *error_string, psm2_error_token_t token) +{ + return err; +} + +static +psm2_error_t +psmi_errhandler_psm(psm2_ep_t ep, + const psm2_error_t err, + const char *error_string, psm2_error_token_t token) +{ + /* we want the error to be seen through ssh, etc., so we flush and then + * sleep a bit. Not perfect, but not doing so means it almost never + * gets seen. */ + fprintf(stderr, "%s: %s\n", hfi_get_mylabel(), token->err_string); + fflush(stdout); + fflush(stderr); + + /* XXX Eventually, this will hook up to a connection manager, and we'll + * issue an upcall into the connection manager at shutdown time */ + sleep(3); + + /* We use this "special" ep internally to handle internal errors that are + * triggered from within code that is not expected to return to the user. + * Errors of this sort on not expected to be handled by users and always + * mean we have an internal PSM bug. 
*/ + if (err == PSM2_INTERNAL_ERR) + abort(); + else + exit(-1); +} + +psm2_ep_errhandler_t psmi_errhandler_global = psmi_errhandler_noop; + +psm2_error_t __psm2_error_defer(psm2_error_token_t token) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + rv = psmi_errhandler_psm(token->ep, token->error, token->err_string, + token); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_error_defer) + +psm2_error_t +__psm2_error_register_handler(psm2_ep_t ep, const psm2_ep_errhandler_t errhandler) +{ + psm2_ep_errhandler_t *errh; + + PSM2_LOG_MSG("entering"); + + if (ep == NULL) + errh = &psmi_errhandler_global; + else + errh = &ep->errh; + + if (errhandler == PSM2_ERRHANDLER_PSM_HANDLER) + *errh = psmi_errhandler_psm; + else if (errhandler == PSM2_ERRHANDLER_NO_HANDLER) + *errh = psmi_errhandler_noop; + else + *errh = errhandler; + + PSM2_LOG_MSG("leaving"); + + return PSM2_OK; +} +PSMI_API_DECL(psm2_error_register_handler) + +psm2_error_t +MOCKABLE (psmi_handle_error)(psm2_ep_t ep, psm2_error_t error, const char *buf, ...) +{ + va_list argptr; + int syslog_level; + int console_print = 0; + psm2_error_t newerr; + struct psm2_error_token token; + char *c, fullmsg[PSM2_ERRSTRING_MAXLEN]; + token.error = error; + snprintf(fullmsg, PSM2_ERRSTRING_MAXLEN - 1, "%s", buf); + fullmsg[PSM2_ERRSTRING_MAXLEN - 1] = '\0'; + va_start(argptr, buf); + vsnprintf(token.err_string, PSM2_ERRSTRING_MAXLEN - 1, fullmsg, argptr); + va_end(argptr); + token.err_string[PSM2_ERRSTRING_MAXLEN - 1] = '\0'; + + /* Unless the user has set PSM3_NO_VERBOSE_ERRORS, always print errors to + * console */ + c = getenv("PSM3_NO_VERBOSE_ERRORS"); + console_print = 0; + if (ep == PSMI_EP_LOGEVENT) + console_print = 1; + else if (!c || *c == '\0') { /* no desire to prevent verbose errors */ + /* Remove the console print if we're internally handling the error */ + if (ep == PSMI_EP_NORETURN) + console_print = 0; + else if (ep == NULL + && psmi_errhandler_global != psmi_errhandler_psm) + console_print = 1; + else if (ep != NULL && ep->errh != psmi_errhandler_psm) + console_print = 1; + } + + /* Before we let the user even handle the error, send to syslog */ + syslog_level = psmi_error_syslog_level(error); + if (syslog_level != PSMI_NOLOG || ep == PSMI_EP_LOGEVENT) + psmi_syslog(ep, console_print, + ep == PSMI_EP_LOGEVENT ? 
LOG_NOTICE : syslog_level, + "%s (err=%d)", token.err_string, error); + + if (ep == PSMI_EP_LOGEVENT) /* we're just logging */ + newerr = PSM2_OK; + else if (ep == PSMI_EP_NORETURN) + newerr = + psmi_errhandler_psm(NULL, error, token.err_string, &token); + else if (ep == NULL) + newerr = + psmi_errhandler_global(NULL, error, token.err_string, + &token); + else + newerr = ep->errh(ep, error, token.err_string, &token); + + return newerr; +} +MOCK_DEF_EPILOGUE(psmi_handle_error); + +/* Returns the "worst" error out of errA and errB */ +psm2_error_t psmi_error_cmp(psm2_error_t errA, psm2_error_t errB) +{ +#define _PSMI_ERR_IS(err) if (errA == (err) || errB == (err)) return (err) + + /* Bad runtime or before initialization */ + _PSMI_ERR_IS(PSM2_NO_MEMORY); + _PSMI_ERR_IS(PSM2_INTERNAL_ERR); + _PSMI_ERR_IS(PSM2_INIT_NOT_INIT); + _PSMI_ERR_IS(PSM2_INIT_BAD_API_VERSION); + + /* Before we cget an endpoint */ + _PSMI_ERR_IS(PSM2_EP_NO_DEVICE); + _PSMI_ERR_IS(PSM2_EP_UNIT_NOT_FOUND); + _PSMI_ERR_IS(PSM2_EP_DEVICE_FAILURE); + _PSMI_ERR_IS(PSM2_EP_NO_PORTS_AVAIL); + _PSMI_ERR_IS(PSM2_TOO_MANY_ENDPOINTS); + + /* As we open/close the endpoint */ + _PSMI_ERR_IS(PSM2_EP_NO_NETWORK); + _PSMI_ERR_IS(PSM2_SHMEM_SEGMENT_ERR); + _PSMI_ERR_IS(PSM2_EP_CLOSE_TIMEOUT); + _PSMI_ERR_IS(PSM2_EP_INVALID_UUID_KEY); + _PSMI_ERR_IS(PSM2_EP_NO_RESOURCES); + + /* In connect phase */ + _PSMI_ERR_IS(PSM2_EPID_NETWORK_ERROR); + _PSMI_ERR_IS(PSM2_EPID_INVALID_NODE); + _PSMI_ERR_IS(PSM2_EPID_INVALID_CONNECT); + _PSMI_ERR_IS(PSM2_EPID_INVALID_PKEY); + _PSMI_ERR_IS(PSM2_EPID_INVALID_VERSION); + _PSMI_ERR_IS(PSM2_EPID_INVALID_UUID_KEY); + _PSMI_ERR_IS(PSM2_EPID_INVALID_MTU); + _PSMI_ERR_IS(PSM2_EPID_RV_CONNECT_ERROR); + + /* Timeout if nothing else */ + _PSMI_ERR_IS(PSM2_TIMEOUT); + + _PSMI_ERR_IS(PSM2_EPID_RV_CONNECT_RECOVERING); + + /* Last resort */ + return max(errA, errB); +} + +struct psmi_error_item { + int syslog_level; + const char *error_string; +}; + +static +struct psmi_error_item psmi_error_items[] = { + {PSMI_NOLOG, "Success"}, /* PSM2_OK = 0, */ + {PSMI_NOLOG, "No events were progressed in psm_poll"}, /* PSM2_OK_NO_PROGRESS = 1 */ + {PSMI_NOLOG, "unknown 2"}, + {PSMI_NOLOG, "Error in a function parameter"}, /* PSM2_PARAM_ERR = 3 */ + {LOG_CRIT, "Ran out of memory"}, /* PSM2_NO_MEMORY = 4 */ + {PSMI_NOLOG, "PSM has not been initialized"}, /* PSM2_INIT_NOT_INIT = 5 */ + {LOG_INFO, "API version passed is incompatible"}, /* PSM2_INIT_BAD_API_VERSION = 6 */ + {PSMI_NOLOG, "PSM Could not set affinity"}, /* PSM2_NO_AFFINITY = 7 */ + {LOG_ALERT, "PSM Unresolved internal error"}, /* PSM2_INTERNAL_ERR = 8 */ + {LOG_CRIT, "PSM could not set up shared memory segment"}, /* PSM2_SHMEM_SEGMENT_ERR = 9 */ + {PSMI_NOLOG, "PSM option is a read-only option"}, /* PSM2_OPT_READONLY = 10 */ + {PSMI_NOLOG, "Operation timed out"}, /* PSM2_TIMEOUT = 11 */ + {LOG_INFO, "Exceeded supported amount of endpoints"}, + /* PSM2_TOO_MANY_ENDPOINTS = 12 */ + {PSMI_NOLOG, "PSM is in the finalized state"}, /* PSM2_IS_FINALIZED = 13 */ + {PSMI_NOLOG, "unknown 14"}, + {PSMI_NOLOG, "unknown 15"}, + {PSMI_NOLOG, "unknown 16"}, + {PSMI_NOLOG, "unknown 17"}, + {PSMI_NOLOG, "unknown 18"}, + {PSMI_NOLOG, "unknown 19"}, + {PSMI_NOLOG, "Endpoint was closed"}, /* PSM2_EP_WAS_CLOSED = 20 */ + {LOG_ALERT, "PSM Could not find an OPA Unit"}, /* PSM2_EP_NO_DEVICE = 21 */ + {PSMI_NOLOG, "User passed a bad unit number"}, /* PSM2_EP_UNIT_NOT_FOUND = 22 */ + {LOG_ALERT, "Failure in initializing endpoint"}, /* PSM2_EP_DEVICE_FAILURE = 23 */ + {PSMI_NOLOG, "Error closing 
the endpoing error"}, /* PSM2_EP_CLOSE_TIMEOUT = 24 */ + {PSMI_NOLOG, "No free contexts could be obtained"}, /* PSM2_EP_NO_PORTS_AVAIL = 25 */ + {LOG_ALERT, "Could not detect network connectivity"}, /* PSM2_EP_NO_NETWORK = 26 */ + {LOG_INFO, "Invalid Unique job-wide UUID Key"}, /* PSM2_EP_INVALID_UUID_KEY = 27 */ + {LOG_INFO, "Out of endpoint resources"}, /* PSM2_EP_NO_RESOURCES = 28 */ + {PSMI_NOLOG, "unknown 29"}, + {PSMI_NOLOG, "unknown 30"}, + {PSMI_NOLOG, "unknown 31"}, + {PSMI_NOLOG, "unknown 32"}, + {PSMI_NOLOG, "unknown 33"}, + {PSMI_NOLOG, "unknown 34"}, + {PSMI_NOLOG, "unknown 35"}, + {PSMI_NOLOG, "unknown 36"}, + {PSMI_NOLOG, "unknown 37"}, + {PSMI_NOLOG, "unknown 38"}, + {PSMI_NOLOG, "unknown 39"}, + {PSMI_NOLOG, "Unknown/unresolved connection status (other errors occurred)"}, /* PSM2_EPID_UNKNOWN = 40 */ + {PSMI_NOLOG, "Endpoint could not be reached"}, /* PSM2_EPID_UNREACHABLE = 41 */ + {PSMI_NOLOG, "unknown 42"}, + {LOG_CRIT, "Invalid node (mismatch in bit width 32/64 or byte order)"}, /* PSM2_EPID_INVALID_NODE = 43 */ + {LOG_CRIT, "Invalid MTU"}, /* PSM2_EPID_INVALID_MTU = 44 */ + {PSMI_NOLOG, "UUID key mismatch"}, /* PSM2_EPID_INVALID_UUID_KEY = 45 */ + {LOG_ERR, "Incompatible PSM version"}, /* PSM2_EPID_INVALID_VERSION = 46 */ + {LOG_CRIT, "Connect received garbled connection information"}, /* PSM2_EPID_INVALID_CONNECT = 47 */ + {PSMI_NOLOG, "Endpoint was already connected"}, /* PSM2_EPID_ALREADY_CONNECTED = 48 */ + {LOG_CRIT, "Two or more endpoints have the same network id (LID)"}, /* PSM2_EPID_NETWORK_ERROR = 49 */ + {LOG_CRIT, "Endpoint provided incompatible Partition Key"}, + {LOG_CRIT, "Unable to resolve network path. Check connectivity and routing between nodes"}, + {LOG_CRIT, "Unable to establish RV RC QP connection"}, /* PSM2_EPID_RV_CONNECT_ERROR */ + {LOG_INFO, "Recovering RV RC QP connection"}, /* PSM2_EPID_RV_CONNECT_RECOVERING */ + {PSMI_NOLOG, "unknown 54"}, + {PSMI_NOLOG, "unknown 55"}, + {PSMI_NOLOG, "unknown 56"}, + {PSMI_NOLOG, "unknown 57"}, + {PSMI_NOLOG, "unknown 58"}, + {PSMI_NOLOG, "unknown 59"}, + {PSMI_NOLOG, "MQ Non-blocking request is incomplete"}, /* PSM2_MQ_NO_COMPLETIONS = 60 */ + {PSMI_NOLOG, "MQ Message has been truncated at the receiver"}, /* PSM2_MQ_TRUNCATION = 61 */ + {PSMI_NOLOG, "unknown 62"}, + {PSMI_NOLOG, "unknown 63"}, + {PSMI_NOLOG, "unknown 64"}, + {PSMI_NOLOG, "unknown 65"}, + {PSMI_NOLOG, "unknown 66"}, + {PSMI_NOLOG, "unknown 67"}, + {PSMI_NOLOG, "unknown 68"}, + {PSMI_NOLOG, "unknown 69"}, + {PSMI_NOLOG, "Invalid AM reply"}, + {PSMI_NOLOG, "unknown 71"}, + {PSMI_NOLOG, "unknown 72"}, + {PSMI_NOLOG, "unknown 73"}, + {PSMI_NOLOG, "unknown 74"}, + {PSMI_NOLOG, "unknown 75"}, + {PSMI_NOLOG, "unknown 76"}, + {PSMI_NOLOG, "unknown 77"}, + {PSMI_NOLOG, "unknown 78"}, + {PSMI_NOLOG, "unknown 79"}, + {PSMI_NOLOG, "unknown 80"}, +}; + +const char *__psm2_error_get_string(psm2_error_t error) +{ + PSM2_LOG_MSG("entering"); + if (error >= PSM2_ERROR_LAST) { + PSM2_LOG_MSG("leaving"); + return "unknown"; + } + else { + PSM2_LOG_MSG("leaving"); + return psmi_error_items[error].error_string; + } +} +PSMI_API_DECL(psm2_error_get_string) + +int psmi_error_syslog_level(psm2_error_t error) +{ + if (error >= PSM2_ERROR_LAST) + return PSMI_NOLOG; + else + return psmi_error_items[error].syslog_level; +} diff --git a/prov/psm3/psm3/psm_error.h b/prov/psm3/psm3/psm_error.h new file mode 100644 index 00000000000..c986ea0bd72 --- /dev/null +++ b/prov/psm3/psm3/psm_error.h @@ -0,0 +1,105 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. 
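psm_error.c above exposes psm2_error_register_handler and psm2_error_get_string through PSMI_API_DECL. The fragment below shows how an application might install its own handler; the "psm2.h" include name and the choice to register globally (ep == NULL) before opening endpoints are assumptions for illustration, not something this patch prescribes:

    #include <stdio.h>
    #include "psm2.h"   /* assumed public PSM2/PSM3 API header */

    /* Custom handler; the signature mirrors psmi_errhandler_noop above. */
    static psm2_error_t
    my_errhandler(psm2_ep_t ep, const psm2_error_t err,
                  const char *error_string, psm2_error_token_t token)
    {
        (void)ep; (void)token;
        fprintf(stderr, "psm3 error %d: %s (%s)\n",
                err, psm2_error_get_string(err), error_string);
        return err;   /* returning the error leaves recovery to the caller */
    }

    /* Register it globally; a per-endpoint handler can be installed later by
     * passing an opened endpoint instead of NULL. */
    void install_handler(void)
    {
        psm2_error_register_handler(NULL, my_errhandler);
    }

PSM2_ERRHANDLER_PSM_HANDLER and PSM2_ERRHANDLER_NO_HANDLER, handled explicitly in the registration code above, select the built-in fatal handler or the no-op handler instead of a user callback.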
When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ +#include "psm2_mock_testing.h" + +#ifndef _PSMI_IN_USER_H +#error psm_error.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_ERROR_H +#define _PSMI_ERROR_H + +#define PSMI_EP_NONE (NULL) +#define PSMI_EP_NORETURN ((psm2_ep_t) -2) +#define PSMI_EP_LOGEVENT ((psm2_ep_t) -3) + +extern psm2_ep_errhandler_t psmi_errhandler_global; + +// +// psmi_handle_error has a 1st argument which controls how it behaves. +// PSMI_EP_NO_RETURN – unconditionally outputs message and exits or aborts +// process. +// other values – behavior is controlled by how psm2 error handler has been set +// via PSM API. OFI/psm3 provider disables error handler so these will +// be silent +// +// to have PSM stop immediately with a message. use +// psmi_handle_error(PSMI_EP_NO_RETURN +// all other uses of psmi_handle_error are under the control of the middleware +// or OFI provider. The OFI provider turns all of them off. 
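The distinction drawn above between PSMI_EP_NORETURN and a normal endpoint argument is easiest to see as call sites. This is a sketch only, assuming the internal psm_user.h environment; example_call_sites, rail, and reason are hypothetical names:

    /* Sketch: assumes psm_user.h has been included. */
    static psm2_error_t
    example_call_sites(psm2_ep_t ep, int rail, const char *reason)
    {
        /* Fatal path: prints and aborts/exits regardless of the registered
         * handler -- used for internal invariants. */
        if (rail < 0)
            psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
                              "invalid rail index %d", rail);

        /* Normal path: dispatched to the ep's handler (a no-op under the OFI
         * psm3 provider, so silent there), and the error code is returned so
         * the caller can propagate it. */
        return psmi_handle_error(ep, PSM2_PARAM_ERR,
                                 "Invalid parameter: %s", reason);
    }
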
+// +// To have a a message be unconditionally output for all builds, regardless of +// env variables, use _HFI_ERROR or _HFI_UNIT_ERROR +// All other logging macros are under the control of the user via env variables +// and build options can disable them +// +// Other logging calls are only enabled if _HFI_DEBUGGING is defined, +// in which case _HFI_INFO is also enabled by default (but env can disable it). +// All others controlled by env variable. +// +// Currently opa_debug.h always defines _HFI_DEBUGGING and it is included by +// opa_udebug.h, so logging is presently enabled in all builds. At some point +// may want to explore a performance optimization and disable logging macros +// for lower level debug messages in non-debug builds. + +psm2_error_t MOCKABLE(psmi_handle_error)(psm2_ep_t ep, psm2_error_t error, + const char *buf, ...) + __attribute__((format(printf, 3, 4))); +MOCK_DCL_EPILOGUE(psmi_handle_error); + +psm2_error_t psmi_error_cmp(psm2_error_t errA, psm2_error_t errB); +int psmi_error_syslog_level(psm2_error_t error); + +#endif /* _PSMI_ERROR_H */ diff --git a/prov/psm3/psm3/psm_gdrcpy.h b/prov/psm3/psm3/psm_gdrcpy.h new file mode 100644 index 00000000000..2173f0b9c6e --- /dev/null +++ b/prov/psm3/psm3/psm_gdrcpy.h @@ -0,0 +1,82 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2018 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2018 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2018 Intel Corporation. 
All rights reserved. */ +#ifndef GDR_CPY_H +#define GDR_CPY_H +#ifdef PSM_CUDA + +#include "ptl_ips/ips_proto.h" + +#define GDR_FD get_gdr_fd() + +int get_gdr_fd(); + +void hfi_gdr_open(); + +void hfi_gdr_close(); + +void * +gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf, + size_t size, int flags, + struct ips_proto* proto); + +int +gdr_unmap_gpu_host_addr(int gdr_fd, const void *buf, + size_t size, struct ips_proto* proto); + + +uint64_t +gdr_cache_evict(); +#endif +#endif diff --git a/prov/psm3/psm3/psm_hal_gen1/hfi1_deprecated_gen1.h b/prov/psm3/psm3/psm_hal_gen1/hfi1_deprecated_gen1.h new file mode 100644 index 00000000000..2e64b470885 --- /dev/null +++ b/prov/psm3/psm3/psm_hal_gen1/hfi1_deprecated_gen1.h @@ -0,0 +1,69 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* + + hfi1_deprecated_gen1.h + + Contains certain features of the hfi1 module that have been deprecated. + + These features may still need to be supported by the psm library for + reasons of backwards compatibility. 
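The GDR copy helpers declared in psm_gdrcpy.h above map a GPU buffer into host-addressable memory. A rough usage sketch under the PSM_CUDA build follows; copy_from_gpu is a hypothetical wrapper, the flags value 0 and the error handling are assumptions, and <string.h> plus the PSM3 internal headers are presumed already included:

    #ifdef PSM_CUDA
    /* Map a GPU buffer, copy from it on the CPU, then unmap it. */
    static int copy_from_gpu(struct ips_proto *proto, unsigned long gpu_buf,
                             void *dst, size_t len)
    {
        void *host = gdr_convert_gpu_to_host_addr(GDR_FD, gpu_buf, len, 0, proto);
        if (host == NULL)
            return -1;                  /* mapping failed */
        memcpy(dst, host, len);         /* GPU memory is now CPU-addressable */
        return gdr_unmap_gpu_host_addr(GDR_FD, host, len, proto);
    }
    #endif

GDR_FD expands to get_gdr_fd(), per the macro above, so the same descriptor is used for the map and unmap calls.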
+ */ + +#ifndef __HFI1_DEPRECATED_GEN1_H__ + +#define __HFI1_DEPRECATED_GEN1_H__ + + +#endif /* #ifndef __HFI1_DEPRECATED_GEN1_H__ */ diff --git a/prov/psm3/psm3/psm_hal_gen1/opa_common_gen1.h b/prov/psm3/psm3/psm_hal_gen1/opa_common_gen1.h new file mode 100644 index 00000000000..fbe8e3e6c35 --- /dev/null +++ b/prov/psm3/psm3/psm_hal_gen1/opa_common_gen1.h @@ -0,0 +1,61 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef OPA_COMMON_GEN1_H +#define OPA_COMMON_GEN1_H + +#include "hfi1_deprecated_gen1.h" + +#endif /* OPA_COMMON_GEN1_H */ diff --git a/prov/psm3/psm3/psm_hal_gen1/opa_i2cflash_gen1.c b/prov/psm3/psm3/psm_hal_gen1/opa_i2cflash_gen1.c new file mode 100644 index 00000000000..b7628ca20c9 --- /dev/null +++ b/prov/psm3/psm3/psm_hal_gen1/opa_i2cflash_gen1.c @@ -0,0 +1,64 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include +#include +#include +#include +#include +#include +#include +#include + diff --git a/prov/psm3/psm3/psm_hal_gen1/opa_proto_gen1.c b/prov/psm3/psm3/psm_hal_gen1/opa_proto_gen1.c new file mode 100644 index 00000000000..6cb1e8c15cb --- /dev/null +++ b/prov/psm3/psm3/psm_hal_gen1/opa_proto_gen1.c @@ -0,0 +1,77 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* This file contains the initialization functions used by the low + level hfi protocol code. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opa_user_gen1.h" +#include "opa_udebug.h" + +#include + +size_t arrsz[MAPSIZE_MAX] = { 0 }; + + diff --git a/prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.c b/prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.c new file mode 100644 index 00000000000..1a96860e906 --- /dev/null +++ b/prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.c @@ -0,0 +1,468 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2018 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2018 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* This file contains hfi service routine interface used by the low + level hfi protocol code. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "opa_service_gen1.h" +#include "psmi_wrappers.h" +#include "psm_netutils.h" + +#define HFI_UD_NUM_CONTEXTS 1024 +#define HFI_UD_NUM_FREE_CTXTS 1024 + + + + + + + + +#ifdef PSM2_SUPPORT_IW_CMD_API +ustatic +int _hfi_cmd_ioctl(int fd, struct hfi1_cmd *cmd, size_t count) +{ + uint64_t addrOrLiteral[2] = { (uint64_t)cmd->addr, (uint64_t)&cmd->addr }; + const static struct + { + unsigned int ioctlCmd; + unsigned int addrOrLiteralIdx; + } cmdTypeToIoctlNum[PSMI_HFI_CMD_LAST] = { + [PSMI_HFI_CMD_ASSIGN_CTXT] = {HFI1_IOCTL_ASSIGN_CTXT , 0}, + [PSMI_HFI_CMD_CTXT_INFO] = {HFI1_IOCTL_CTXT_INFO , 0}, + [PSMI_HFI_CMD_USER_INFO] = {HFI1_IOCTL_USER_INFO , 0}, + [PSMI_HFI_CMD_TID_UPDATE] = {HFI1_IOCTL_TID_UPDATE , 0}, + [PSMI_HFI_CMD_TID_FREE] = {HFI1_IOCTL_TID_FREE , 0}, + [PSMI_HFI_CMD_CREDIT_UPD] = {HFI1_IOCTL_CREDIT_UPD , 1}, + [PSMI_HFI_CMD_RECV_CTRL] = {HFI1_IOCTL_RECV_CTRL , 1}, + [PSMI_HFI_CMD_POLL_TYPE] = {HFI1_IOCTL_POLL_TYPE , 1}, + [PSMI_HFI_CMD_ACK_EVENT] = {HFI1_IOCTL_ACK_EVENT , 1}, + [PSMI_HFI_CMD_SET_PKEY] = {HFI1_IOCTL_SET_PKEY , 1}, + [PSMI_HFI_CMD_CTXT_RESET] = {HFI1_IOCTL_CTXT_RESET , 1}, + [PSMI_HFI_CMD_TID_INVAL_READ] = {HFI1_IOCTL_TID_INVAL_READ, 0}, + [PSMI_HFI_CMD_GET_VERS] = {HFI1_IOCTL_GET_VERS , 1}, +#ifdef PSM_CUDA + [PSMI_HFI_CMD_TID_UPDATE_V2] = {HFI1_IOCTL_TID_UPDATE_V2 , 0}, +#endif + }; + + if (cmd->type < PSMI_HFI_CMD_LAST) + return psmi_ioctl(fd, + cmdTypeToIoctlNum[cmd->type].ioctlCmd, + addrOrLiteral[cmdTypeToIoctlNum[cmd->type].addrOrLiteralIdx]); + else + { + errno = EINVAL; + return -1; + } +} +#endif /* #ifdef PSM2_SUPPORT_IW_CMD_API */ + +/* we use mmap64() because we compile in both 32 and 64 bit mode, + and we have to map physical addresses that are > 32 bits long. + While linux implements mmap64, it doesn't have a man page, + and isn't declared in any header file, so we declare it here ourselves. + + We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and + redirects mmap to mmap64 for us, but at least through suse10 and fc4, + it doesn't work when the address being mapped is > 32 bits. It chips + off bits 32 and above. So we stay with mmap64. */ +void *hfi_mmap64(void *addr, size_t length, int prot, int flags, int fd, + __off64_t offset) +{ + return mmap64(addr, length, prot, flags, fd, offset); +} + +/* get the number of units supported by the driver. Does not guarantee */ +/* that a working chip has been found for each possible unit #. */ +/* number of units >=0 (0 means none found). 
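   (Editor's aside, not part of the patch: a minimal caller-side sketch using
   only routines defined in this file and declared in opa_service_gen1.h --
   enumerate the units the driver reports, then skip any unit without an
   active port.  use_unit() is a placeholder.

       int nunits = hfi_get_num_units();
       int u;
       for (u = 0; u < nunits; u++) {
           if (hfi_get_unit_active(u) <= 0)
               continue;               // no active port, or error: skip unit
           use_unit(u);                // placeholder for real work
       }
   )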
*/ +/* formerly used sysfs file "num_units" */ +int hfi_get_num_units(void) +{ + int ret = 0; + + while (1) { + char pathname[PATH_MAX]; + struct stat st; + int r; + snprintf(pathname, sizeof(pathname), "/dev/infiniband/uverbs%d", ret); + r = stat(pathname, &st); + if (r) break; + + ret++; + } + return ret; +} + +/* Given a unit number, returns 1 if any port on the unit is active. + returns 0 if no port on the unit is active. + returns -1 when an error occurred. */ +int hfi_get_unit_active(int unit) +{ + int p,rv; + + for (p = HFI_MIN_PORT; p <= HFI_MAX_PORT; p++) + if ((rv=hfi_get_port_lid(unit, p)) > 0) + break; + + if (p <= HFI_MAX_PORT) + { + return 1; + } + + return rv; +} + +/* get the number of contexts from the unit id. */ +/* Returns 0 if no unit or no match. */ +int hfi_get_num_contexts(int unit_id) +{ + return HFI_UD_NUM_CONTEXTS; +} + +/* Given a unit number and port number, returns 1 if the unit and port are active. + returns 0 if the unit and port are not active. + returns -1 when an error occurred. */ +int hfi_get_port_active(int unit, int port) +{ + int ret; + char *state; + ret = hfi_sysfs_port_read(unit, port, "phys_state", &state); + if (ret == -1) { + if (errno == ENODEV) + /* this is "normal" for port != 1, on single port chips */ + _HFI_VDBG + ("Failed to get phys_state for unit %u:%u: %s\n", + unit, port, strerror(errno)); + else + _HFI_DBG + ("Failed to get phys_state for unit %u:%u: %s\n", + unit, port, strerror(errno)); + return -1; + } else { + if (strncmp(state, "5: LinkUp", 9)) { + _HFI_DBG("Link is not Up for unit %u:%u\n", unit, port); + free(state); + return 0; + } + free(state); + return 1; + } +} + +/* Given the unit number, return an error, or the corresponding LID + For now, it's used only so the MPI code can determine it's own + LID, and which other LIDs (if any) are also assigned to this node + Returns an int, so -1 indicates an error. 0 may indicate that + the unit is valid, but no LID has been assigned. + No error print because we call this for both potential + ports without knowing if both ports exist (or are connected) */ +/* This routine is used in many places, such as get_unit_active, to + * confirm the port is usable. 
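 * (Editor's aside, not part of the patch: a hedged sketch of how callers
 * usually interpret the result, matching the return values documented in
 * opa_service_gen1.h; use_port() and skip_port() are placeholders.
 *
 *     int lid = hfi_get_port_lid(unit, port);
 *     if (lid > 0)            // an OPA/IB LID, or 1 for a usable RoCE port
 *         use_port(unit, port);
 *     else if (lid == 0)      // unit valid, but no LID assigned yet
 *         skip_port(unit, port);
 *     else                    // -2: port not active, -1: error
 *         skip_port(unit, port);
 * )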
As such it includes additional checks that + * the port is active and for link_layer ethernet that it includes a RoCE + * IPv4 GID whose subnet can be identified + */ +int hfi_get_port_lid(int unit, int port) +{ + int ret = 0; + int64_t val = 0; + + if (hfi_get_port_active(unit,port) != 1) + return -2; + ret = hfi_sysfs_port_read_s64(unit, port, "lid", &val, 0); + _HFI_VDBG("ret %d, unit %d port %d lid %ld\n", ret, unit, + port, (long int)val); + + if (ret == -1) { + if (errno == ENODEV) + /* this is "normal" for port != 1, on single port chips */ + _HFI_VDBG("Failed to get LID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + else + _HFI_DBG("Failed to get LID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + } else { + char *link_lyr; + ret = hfi_sysfs_port_read(unit, port, "link_layer", &link_lyr); + if (ret == -1) { + if (errno == ENODEV) + /* this is "normal" for port != 1, on single port chips */ + _HFI_VDBG("Failed to get link_layer for unit %u:%u: %s\n", + unit, port, strerror(errno)); + else + _HFI_DBG("Failed to get link_layer for unit %u:%u: %s\n", + unit, port, strerror(errno)); + } else { + _HFI_VDBG("ret %d, unit %d port %d link_layer %s\n", + ret, unit, port, link_lyr); + + /* If this port is an Ethernet Port lid does not matter, return 1 */ + if (strncmp(link_lyr, "Ethernet", strlen("Ethernet")) == 0) { + uint64_t subnet, hi; + if (0 != hfi_get_port_subnet(unit, port, + &subnet, NULL, NULL, NULL, NULL, &hi, NULL)) { + _HFI_DBG("Failed to get subnet for unit %u:%u: %s\n", + unit, port, strerror(errno)); + ret = -1; + } else if (subnet == hi) { + _HFI_DBG("Skipping unit %u:%u: no RoCE IPv4 GID\n", + unit, port); + ret = -1; + } else + ret = 1; // for RoCE LID does not matter, return 1 + } else + ret = val; // OPA/IB LID we got + free(link_lyr); + } + } + + return ret; +} + +/* Given the unit number, return an error, or the corresponding GID + For now, it's used only so the MPI code can determine its fabric ID. + Returns an int, so -1 indicates an error. + No error print because we call this for both potential + ports without knowing if both ports exist (or are connected) */ +static int hfi_get_port_gid(int unit, int port, int idx, uint64_t *hi, uint64_t *lo) +{ + int ret; + char *gid_str = NULL; + char attr_str[64]; + + snprintf(attr_str, 64, "gids/%d", idx < 0 ? 
0 : idx); + ret = hfi_sysfs_port_read(unit, port, attr_str, &gid_str); + if (ret == -1) { + if (errno == ENODEV) + /* this is "normal" for port != 1, on single + * port chips */ + _HFI_VDBG("Failed to get GID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + else + _HFI_DBG("Failed to get GID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + } else { + uint32_t gid[8] = {0}; + if (sscanf(gid_str, "%4x:%4x:%4x:%4x:%4x:%4x:%4x:%4x", + &gid[0], &gid[1], &gid[2], &gid[3], + &gid[4], &gid[5], &gid[6], &gid[7]) != 8) { + _HFI_DBG("Failed to parse GID for unit %u:%u: %s\n", + unit, port, gid_str); + ret = -1; + } else { + *hi = (((uint64_t) gid[0]) << 48) + | (((uint64_t) gid[1]) << 32) + | (((uint64_t) gid[2]) << 16) + | (((uint64_t) gid[3]) << 0); + *lo = (((uint64_t) gid[4]) << 48) + | (((uint64_t) gid[5]) << 32) + | (((uint64_t) gid[6]) << 16) + | (((uint64_t) gid[7]) << 0); + } + free(gid_str); + } + + return ret; +} +int hfi_get_unit_cpumask(int unit, cpu_set_t *cpuset) +{ + int ret = -1; + char *cpulist; + + CPU_ZERO(cpuset); + + ret = hfi_sysfs_unit_read(unit, "device/local_cpulist", &cpulist); + if (ret == -1) { + _HFI_VDBG("Failed to get cpu list for unit %u: %s\n", + unit, strerror(errno)); + } else { + int i = 0; + char *next_comma = NULL; + char *temp = cpulist; + char *dash; + int first = -1, last = -1; + + do { + next_comma = strchr(temp, ','); + dash = strchr(temp, '-'); + + first = atoi(temp); + + if (dash == NULL || (dash > next_comma && next_comma != NULL)) { + last = first; + } else { + last = atoi(dash + 1); + } + + for (i = first; i <= last; i++) { + CPU_SET(i, cpuset); + ret++; + } + + temp = next_comma + 1; + } while (next_comma != NULL); + + free(cpulist); + } + + return (ret >= 0 ? 0 : -1); +} + +/* Given the unit number, return an error, or the corresponding subnet + For IB/OPA the subnet is the hi 64b of the 1st GID + addr is the low 64b of the gid, ip_addr and netmask are N/A (0) + For Ethernet it's the IPv4 subnet derived from the 1st RoCE IPv4 GID + subnet is the upper portion of the ip_addr (& netmask) + addr is the lower portion of the ip_addr (& ~netmask) + and ip_addr and netmask are returned + In all cases, idx, hi and lo are the actual gid + All values are in host byte order + Returns an int, so -1 indicates an error. + No error print because we call this for both potential + ports without knowing if both ports exist (or are connected) */ +int hfi_get_port_subnet(int unit, int port, uint64_t *subnet, uint64_t *addr, + uint32_t *ip_addr, uint32_t *netmask, + int *idx, uint64_t *hi, uint64_t *lo) +{ + int i; + int have_subnet = 0; + uint64_t gid_hi, gid_lo; + + for (i =0; ; i++) { + if (-1 == hfi_get_port_gid(unit, port, i, &gid_hi, &gid_lo)) + break; // stop at 1st non-existent gid (or non-existent port) + if (gid_lo == 0) // Skip over empty gid table entries. + continue; + if (! 
have_subnet) { + // save 1st valid gid, this is answer unless we find eth + if (idx) *idx = i; + if (subnet) *subnet = gid_hi; + if (addr) *addr = gid_lo; + if (ip_addr) *ip_addr = 0; + if (netmask) *netmask = 0; + if (hi) *hi = gid_hi; + if (lo) *lo = gid_lo; + have_subnet = 1; + } + // RoCEv2 Gid => ::ffff: + if (gid_hi == 0x0 && (gid_lo >> 32) == 0x0000ffff) { + uint32_t ipaddr = (uint32_t)(gid_lo & 0xffffffff); + __be32 mask = 0; + if (!psmi_get_eth_netmask(__cpu_to_be32(ipaddr), &mask)) { + // stop at 1st valid ethernet gid + uint32_t nm = __be32_to_cpu(mask); + if (idx) *idx = i; + if (subnet) *subnet = ipaddr & nm; + if (addr) *addr = ipaddr & ~nm; + if (ip_addr) *ip_addr = ipaddr; + if (netmask) *netmask = nm; + if (hi) *hi = gid_hi; + if (lo) *lo = gid_lo; + break; + } else { + return -1; // we're stuck, can't figure out netmask + } + } + } + return (have_subnet?0:-1); +} + + +/* Given the unit number, return an error, or the corresponding link rate + for the port */ +/* Returns an int, so -1 indicates an error. */ +int hfi_get_port_rate(int unit, int port) +{ + int ret; + double rate; + char *data_rate = NULL, *newptr; + + ret = hfi_sysfs_port_read(unit, port, "rate", &data_rate); + if (ret == -1) + goto get_port_rate_error; + else { + rate = strtod(data_rate, &newptr); + if ((rate == 0) && (data_rate == newptr)) + goto get_port_rate_error; + } + + free(data_rate); + return ((int)(rate * 2) >> 1); + +get_port_rate_error: + _HFI_INFO("Failed to get link rate for unit %u:%u: %s\n", + unit, port, strerror(errno)); + + return ret; +} + + + + + + + diff --git a/prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.h b/prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.h new file mode 100644 index 00000000000..c9fbb4e6fb9 --- /dev/null +++ b/prov/psm3/psm3/psm_hal_gen1/opa_service_gen1.h @@ -0,0 +1,181 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef OPA_SERVICE_GEN1_H +#define OPA_SERVICE_GEN1_H + +/* This file contains all the lowest level routines calling into sysfs */ +/* and qib driver. All other calls are based on these routines. */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE /* See feature_test_macros(7) */ +#endif +#include /* cpu_set_t and CPU_* MACROs */ +#include + +#include "opa_intf.h" +#include "opa_common_gen1.h" +#include "opa_udebug.h" +#include "opa_byteorder.h" + +/* upper and lower bounds for HFI port numbers */ +#define HFI_MIN_PORT 1 +#define HFI_MAX_PORT 1 +#ifndef HFI_NUM_PORTS_GEN1 +#define HFI_NUM_PORTS_GEN1 (HFI_MAX_PORT - HFI_MIN_PORT + 1) +#endif +/* any unit id to match. */ +#define PSM3_NIC_ANY ((long)-1) +/* any port num to match. */ +#define PSM3_NIC_PORT_ANY ((long)0) + + +/* Given a unit number and port number, returns 1 if the unit and port are active. + returns 0 if the unit and port are not active. returns -1 when an error occurred. */ +int hfi_get_port_active(int, int); + +/* Given the unit number and port, return an error, or the corresponding LID */ +/* Returns an int, so -1 indicates a general error. -2 indicates that the unit/port + are not active. 0 indicates that the unit is valid, but no LID has been assigned. */ +int hfi_get_port_lid(int, int); + +/* Given a unit number, return an error, or the corresponding cpuset. */ +/* Returns an int, so -1 indicates an error. */ +int hfi_get_unit_cpumask(int unit, cpu_set_t *cpuset); + +/* Given the unit number and port, return an error, or the corresponding */ +/* subnet, addr and gid. For ethernet uses 1st IPv4 RoCE gid. */ +/* For IB/OPA uses 1st valid gid */ +/* Returns an int, so -1 indicates an error. */ +int hfi_get_port_subnet(int unit, int port, uint64_t *subnet, uint64_t *addr, + uint32_t *ip_addr, uint32_t *netmask, + int *idx, uint64_t *hi, uint64_t *lo); + + +/* Given the unit number, return an error, or the corresponding link rate + for the port */ +/* Returns an int, so -1 indicates an error. */ +int hfi_get_port_rate(int unit, int port); + + +/* Get the number of units supported by the driver. Does not guarantee + that a working chip has been found for each possible unit #. + Returns -1 with errno set, or number of units >=0 (0 means none found). */ +int hfi_get_num_units(); + +/* Given a unit number, returns 1 if any port on the unit is active. + returns 0 if no port on the unit is active. + returns -1 when an error occurred. */ +int hfi_get_unit_active(int unit); + +/* get the number of contexts from the unit id. */ +int hfi_get_num_contexts(int unit); + + +/* We use mmap64() because we compile in both 32 and 64 bit mode, + and we have to map physical addresses that are > 32 bits long. + While linux implements mmap64, it doesn't have a man page, + and isn't declared in any header file, so we declare it here ourselves. 
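   (Editor's aside, not part of the patch: a minimal usage sketch under the
   assumption that the caller page-aligns the 64-bit offset itself; raw_off,
   len and fd are placeholders, everything else is standard or declared in
   this header.

       long pg = sysconf(_SC_PAGESIZE);
       __off64_t off = (__off64_t)(raw_off & ~((__u64)pg - 1));
       void *p = hfi_mmap64(NULL, len, PROT_READ | PROT_WRITE,
                            MAP_SHARED | MAP_LOCKED, fd, off);
       if (p == MAP_FAILED)
           ;                           // inspect errno and bail out

   The HFI_MMAP_ALIGNOFF() and HFI_MMAP_ERRCHECK() helpers in opa_user_gen1.h
   wrap exactly this mask-then-map pattern.)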
*/ + +/* We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and + redirects mmap to mmap64 for us, but at least through suse10 and fc4, + it doesn't work when the address being mapped is > 32 bits. It chips + off bits 32 and above. So we stay with mmap64. */ +extern void *mmap64(void *, size_t, int, int, int, __off64_t); +void *hfi_mmap64(void *, size_t, int, int, int, __off64_t); + +/* Statistics maintained by the driver */ +int hfi_get_stats(uint64_t *, int); +int hfi_get_stats_names(char **namep); +/* Counters maintained in the chip, globally, and per-prot */ +int hfi_get_ctrs_unit(int unitno, uint64_t *, int); +int hfi_get_ctrs_unit_names(int unitno, char **namep); +int hfi_get_ctrs_port(int unitno, int port, uint64_t *, int); +int hfi_get_ctrs_port_names(int unitno, char **namep); + +/* sysfs helper routines (only those currently used are exported; + * try to avoid using others) */ + +const char *sysfs_unit_path(int unit_id); + +/* read a string value */ +int hfi_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr, + char **datap); + +/* read a string value into buff, no more than size bytes. + returns the number of bytes read */ +size_t hfi_sysfs_unit_port_read(uint32_t unit, uint32_t port, const char *attr, + char *buff, size_t size); + +/* open attribute in unit's sysfs directory via open(2) */ +int hfi_sysfs_unit_open(uint32_t unit, const char *attr, int flags); +int hfi_sysfs_port_open(uint32_t unit, uint32_t port, const char *attr, + int flags); + +int hfi_sysfs_unit_read(uint32_t unit, const char *attr, char **datap); + +/* print to attribute in {unit,port} sysfs directory */ +int hfi_sysfs_port_printf(uint32_t unit, uint32_t port, const char *attr, + const char *fmt, ...) + __attribute__((format(printf, 4, 5))); +int hfi_sysfs_unit_printf(uint32_t unit, const char *attr, const char *fmt, ...) + __attribute__((format(printf, 3, 4))); + +/* read a signed 64-bit quantity, in some arbitrary base */ +int hfi_sysfs_unit_read_s64(uint32_t unit, const char *attr, + int64_t *valp, int base); +int hfi_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr, + int64_t *valp, int base); +int64_t hfi_sysfs_unit_read_node_s64(uint32_t unit); + +#endif /* OPA_SERVICE_GEN1_H */ diff --git a/prov/psm3/psm3/psm_hal_gen1/opa_user_gen1.h b/prov/psm3/psm3/psm_hal_gen1/opa_user_gen1.h new file mode 100644 index 00000000000..49e786f7384 --- /dev/null +++ b/prov/psm3/psm3/psm_hal_gen1/opa_user_gen1.h @@ -0,0 +1,294 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef OPA_USER_GEN1_H +#define OPA_USER_GEN1_H + +/* This file contains all of the data structures and routines that are + publicly visible and usable (to low level infrastructure code; it is + not expected that any application, or even normal application-level library, + will ever need to use any of this). + + Additional entry points and data structures that are used by these routines + may be referenced in this file, but they should not be generally available; + they are visible here only to allow use in inlined functions. Any variable, + data structure, or function that starts with a leading "_" is in this + category. +*/ + +/* Include header files we need that are unlikely to otherwise be needed by */ +/* programs. 
*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "opa_intf.h" +#include "opa_common_gen1.h" +#include "opa_byteorder.h" +#include "opa_udebug.h" +#include "opa_service_gen1.h" +#include "opa_user.h" + +#define HFI_RHF_USE_EGRBFR_MASK 0x1 +#define HFI_RHF_USE_EGRBFR_SHIFT 15 +#define HFI_RHF_EGRBFR_INDEX_MASK 0x7FF +#define HFI_RHF_EGRBFR_INDEX_SHIFT 16 + +#define HFI_RHF_SEQ_MASK 0xF +#define HFI_RHF_SEQ_SHIFT 28 +#define HFI_RHF_EGRBFR_OFFSET_MASK 0xFFF +#define HFI_RHF_EGRBFR_OFFSET_SHIFT 0 +#define HFI_RHF_HDRQ_OFFSET_MASK 0x1FF +#define HFI_RHF_HDRQ_OFFSET_SHIFT 12 +#define HFI_RHF_TIDERR 0x08000000 + +/* TidFlow related bits */ +#define HFI_TF_SEQNUM_SHIFT 0 +#define HFI_TF_SEQNUM_MASK 0x7ff + +#define HFI_TF_GENVAL_SHIFT 11 +#define HFI_TF_GENVAL_MASK 0xfffff + +#define HFI_TF_FLOWVALID_SHIFT 32 +#define HFI_TF_FLOWVALID_MASK 0x1 + +#define HFI_TF_KEEP_AFTER_SEQERR_SHIFT 34 +#define HFI_TF_KEEP_AFTER_SEQERR_MASK 0x1 +#define HFI_TF_KEEP_ON_GENERR_SHIFT 35 +#define HFI_TF_KEEP_ON_GENERR_MASK 0x1 +#define HFI_TF_KEEP_PAYLOAD_ON_GENERR_SHIFT 36 +#define HFI_TF_KEEP_PAYLOAD_ON_GENERR_MASK 0x1 +#define HFI_TF_STATUS_SEQMISMATCH_SHIFT 37 +#define HFI_TF_STATUS_SEQMISMATCH_MASK 0x1 +#define HFI_TF_STATUS_GENMISMATCH_SHIFT 38 +#define HFI_TF_STATUS_GENMISMATCH_MASK 0x1 + +/* PBC bits */ +#define HFI_PBC_STATICRCC_SHIFT 0 +#define HFI_PBC_STATICRCC_MASK 0xffff + +#define HFI_PBC_SC4_SHIFT 4 +#define HFI_PBC_SC4_MASK 0x1 + +#define HFI_PBC_INTR_SHIFT 31 +#define HFI_PBC_DCINFO_SHIFT 30 +#define HFI_PBC_TESTEBP_SHIFT 29 +#define HFI_PBC_PACKETBYPASS_SHIFT 28 +#define HFI_PBC_INSERTHCRC_SHIFT 26 +#define HFI_PBC_INSERTHCRC_MASK 0x3 +#define HFI_PBC_CREDITRETURN_SHIFT 25 +#define HFI_PBC_INSERTBYPASSICRC_SHIFT 24 +#define HFI_PBC_TESTBADICRC_SHIFT 23 +#define HFI_PBC_FECN_SHIFT 22 +#define HFI_PBC_VL_SHIFT 12 +#define HFI_PBC_VL_MASK 0xf +#define HFI_PBC_LENGTHDWS_SHIFT 0 +#define HFI_PBC_LENGTHDWS_MASK 0xfff + +/* this portion only defines what we currently use */ +struct hfi_pbc { + __u32 pbc0; + __u16 PbcStaticRateControlCnt; + __u16 fill1; +}; + +typedef enum mapsize +{ SC_CREDITS, + PIO_BUFBASE_SOP, + PIO_BUFBASE, + RCVHDR_BUFBASE, + RCVEGR_BUFBASE, + SDMA_COMP_BUFBASE, + USER_REGBASE, + RCVHDRTAIL_BASE, + EVENTS_BUFBASE, + STATUS_BUFBASE, + SUBCTXT_UREGBASE, + SUBCTXT_RCVHDRBUF, + SUBCTXT_RCVEGRBUF, + MAPSIZE_MAX +} mapsize_t; + +/* TODO: consider casting in the ALIGN() macro */ +#define ALIGN(x, a) (((x)+(a)-1)&~((a)-1)) +#define ALIGNDOWN_PTR(x, a) ((void*)(((uintptr_t)(x))&~((uintptr_t)((a)-1)))) + +/* using the same flags for all the mappings */ +#define HFI_MMAP_FLAGS (MAP_SHARED|MAP_LOCKED) +#define HFI_MMAP_PGSIZE sysconf(_SC_PAGESIZE) +/* cast to uintptr_t as opposed to intptr_t which evaluates to a signed type + * * on which one should not perform bitwise operations (undefined behavior) + * */ +#define HFI_MMAP_PGMASK (~(uintptr_t)(HFI_MMAP_PGSIZE-1)) + +/* this is only an auxiliary macro for HFI_MMAP_ERRCHECK() + * @off expected to be unsigned in order to AND with the page mask and avoid undefined behavior + */ +#define U64_TO_OFF64_PGMASK(off) ((__off64_t)((off) & HFI_MMAP_PGMASK)) + +#define HFI_MMAP_ALIGNOFF(fd, off, size, prot) hfi_mmap64(0,(size),(prot),HFI_MMAP_FLAGS,(fd),U64_TO_OFF64_PGMASK((off))) +/* complementary */ +#define HFI_MUNMAP(addr, size) munmap((addr), (size)) + +/* make sure uintmax_t can hold the result of unsigned int multiplication */ +#if UINT_MAX > (UINTMAX_MAX / UINT_MAX) +#error We cannot 
safely multiply unsigned integers on this platform +#endif + +/* @member assumed to be of type u64 and validated to be so */ +#define HFI_MMAP_ERRCHECK(fd, binfo, member, size, prot) ({ \ + typeof((binfo)->member) *__tptr = (__u64 *)NULL; \ + (void)__tptr; \ + void *__maddr = HFI_MMAP_ALIGNOFF((fd), (binfo)->member, (size), (prot)); \ + do { \ + if (unlikely(__maddr == MAP_FAILED)) { \ + uintmax_t outval = (uintmax_t)((binfo)->member); \ + _HFI_INFO("mmap of " #member " (0x%jx) size %zu failed: %s\n", \ + outval, size, strerror(errno)); \ + goto err_mmap_##member; \ + } \ + (binfo)->member = (__u64)__maddr; \ + _HFI_VDBG(#member "mmap %jx successful\n", (uintmax_t)((binfo)->member)); \ + } while(0); \ + __maddr; \ +}) + +/* assigns 0 to the member after unmapping */ +#define HFI_MUNMAP_ERRCHECK(binfo, member, size) \ + do { typeof((binfo)->member) *__tptr = (__u64 *)NULL; \ + (void)__tptr; \ + void *__addr = ALIGNDOWN_PTR((binfo)->member, HFI_MMAP_PGSIZE); \ + if (unlikely( __addr == NULL || (munmap(__addr, (size)) == -1))) { \ + _HFI_INFO("unmap of " #member " (%p) failed: %s\n", \ + __addr, strerror(errno)); \ + } \ + else { \ + _HFI_VDBG("unmap of " #member "(%p) succeeded\n", __addr); \ + (binfo)->member = 0; \ + } \ + } while(0) + +#define HFI_PCB_SIZE_IN_BYTES 8 + +/* Usable bytes in header (hdrsize - lrh - bth) */ +#define HFI_MESSAGE_HDR_SIZE_HFI (HFI_MESSAGE_HDR_SIZE-20) + +/* + * SDMA includes 8B sdma hdr, 8B PBC, and message header. + * If we are using GPU workloads, we need to set a new + * "flags" member which takes another 2 bytes in the + * sdma hdr. We let the driver know of this 2 extra bytes + * at runtime when we set the length for the iovecs. + */ +#define HFI_SDMA_HDR_SIZE (8+8+56) + +static inline __u32 hfi_hdrget_seq(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> HFI_RHF_SEQ_SHIFT) + & HFI_RHF_SEQ_MASK; +} + +static inline __u32 hfi_hdrget_hdrq_offset(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> HFI_RHF_HDRQ_OFFSET_SHIFT) + & HFI_RHF_HDRQ_OFFSET_MASK; +} + + + +/* don't inline these; it's all init code, and not inlining makes the */ +/* overall code shorter and easier to debug */ +void hfi_touch_mmap(void *, size_t) __attribute__ ((noinline)); + + +/* +* Safe version of hfi_[d/q]wordcpy that is guaranteed to only copy each byte once. +*/ +#if defined(__x86_64__) && defined(HAVE_PSM3_DWORD_FAST) +void hfi_dwordcpy_safe(volatile uint32_t *dest, const uint32_t *src, + uint32_t ndwords); +void hfi_qwordcpy_safe(volatile uint64_t *dest, const uint64_t *src, + uint32_t nqwords); +#else +#define hfi_dwordcpy_safe hfi_dwordcpy +#define hfi_qwordcpy_safe hfi_qwordcpy +#endif + + + + + + + + + + + + + + + + + + + + + + +#endif /* OPA_USER_GEN1_H */ diff --git a/prov/psm3/psm3/psm_hal_gen1/opa_utils_gen1.c b/prov/psm3/psm3/psm_hal_gen1/opa_utils_gen1.c new file mode 100644 index 00000000000..7ed8e123e4f --- /dev/null +++ b/prov/psm3/psm3/psm_hal_gen1/opa_utils_gen1.c @@ -0,0 +1,97 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* This file contains hfi service routine interface used by the low */ +/* level hfi protocol code. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opa_user_gen1.h" + +/* touch the pages, with a 32 bit read */ +void hfi_touch_mmap(void *m, size_t bytes) +{ + volatile uint32_t *b = (volatile uint32_t *)m, c; + size_t i; /* m is always page aligned, so pgcnt exact */ + int __hfi_pg_sz; + + /* First get the page size */ + __hfi_pg_sz = sysconf(_SC_PAGESIZE); + + _HFI_VDBG("Touch %lu mmap'ed pages starting at %p\n", + (unsigned long)bytes / __hfi_pg_sz, m); + bytes /= sizeof(c); + for (i = 0; i < bytes; i += __hfi_pg_sz / sizeof(c)) + c = b[i]; +} + + +// never called for UD/UDP, we use __psm2_ep_poll_type instead + + + + + + diff --git a/prov/psm3/psm3/psm_hal_gen1/psm_gdrcpy.c b/prov/psm3/psm3/psm_hal_gen1/psm_gdrcpy.c new file mode 100644 index 00000000000..25036895b27 --- /dev/null +++ b/prov/psm3/psm3/psm_hal_gen1/psm_gdrcpy.c @@ -0,0 +1,151 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2018 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2018 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ +#ifdef PSM_CUDA +#include "psm_user.h" +#include "psm2_hal.h" +#include "psm_gdrcpy.h" +#include +#include +#include +#include "ptl_ips/ips_tid.h" +#include "ptl_ips/ips_expected_proto.h" +#include "opa_user_gen1.h" + +static int gdr_fd; + +int get_gdr_fd(){ + return gdr_fd; +} + +#define GPU_PAGE_OFFSET_MASK (PSMI_GPU_PAGESIZE -1) +#define GPU_PAGE_MASK ~GPU_PAGE_OFFSET_MASK + + + + + +void * +gdr_convert_gpu_to_host_addr(int gdr_fd, unsigned long buf, + size_t size, int flags, + struct ips_proto* proto) +{ + void *host_addr_buf; + + uintptr_t pageaddr = buf & GPU_PAGE_MASK; +// TBD - is this comment correct? Callers may be calling for a whole +// app buffer, especially when RECV RDMA is disabled + /* As size is guarenteed to be in the range of 0-8kB + * there is a guarentee that buf+size-1 does not overflow + * 64 bits. + */ + uint32_t pagelen = (uint32_t) (PSMI_GPU_PAGESIZE + + ((buf + size - 1) & GPU_PAGE_MASK) - + pageaddr); + + _HFI_VDBG("buf=%p size=%zu pageaddr=%p pagelen=%u flags=0x%x proto=%p\n", + (void *)buf, size, (void *)pageaddr, pagelen, flags, proto); +#ifdef RNDV_MOD_MR + host_addr_buf = __psm2_rv_pin_and_mmap(proto->ep->verbs_ep.rv, pageaddr, pagelen); + if (! 
host_addr_buf) { + if (errno == ENOMEM || errno == EINVAL) { + /* Fatal error */ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unable to PIN GPU pages(Out of BAR1 space) (errno: %d)\n", errno); + return NULL; + } else { + /* Fatal error */ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "PIN/MMAP ioctl failed errno %d\n", + errno); + return NULL; + } + } +#else + psmi_assert_always(0); // unimplemented, should not get here + host_addr_buf = NULL; +#endif /* RNDV_MOD_MR */ + return host_addr_buf + (buf & GPU_PAGE_OFFSET_MASK); +} + +// keep this symmetrical with other functions, even though gdr_fd not used +int +gdr_unmap_gpu_host_addr(int gdr_fd, const void *buf, + size_t size, struct ips_proto* proto) +{ +#ifdef RNDV_MOD_MR + // TBD - will we need to round size up to pagelen? + if (0 != __psm2_rv_munmap_and_unpin(proto->ep->verbs_ep.rv, buf, size)) { + /* Fatal error */ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "UNMMAP/UNPIN ioctl failed errno %d\n", + errno); + return -1; + } + return 0; +#else + psmi_assert_always(0); // unimplemented, should not get here + errno = EINVAL; + return -1; +#endif +} + + +void hfi_gdr_open(){ + return; +} + +void hfi_gdr_close() +{ +} + +#endif diff --git a/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.c b/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.c new file mode 100644 index 00000000000..bd9eb23d5f5 --- /dev/null +++ b/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.c @@ -0,0 +1,121 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "psm_user.h" +#include "psm2_hal.h" + +#if PSMI_HAL_INST_CNT > 1 +#define PSMI_HAL_CAT_INL_SYM(KERNEL) hfp_gen1_ ## KERNEL +#include "psm2_hal_inline_t.h" +#include "psm_hal_inline_i.h" +#endif + +/* define the singleton that implements hal for gen1 */ +static hfp_gen1_t psm_gen1_hi = { + /* start of public psmi_hal_instance_t data */ + .phi = { + .type = PSM_HAL_INSTANCE_GEN1, + .description = "PSM3 HAL instance for GEN1" +#ifdef PSM_CUDA + " (cuda)" +#endif + , + .hfi_name = "hfi1", + .hfi_sys_class_path = "/sys/class/infiniband/hfi1", + .params = {0}, + + /* The following methods are alphabetized */ +#if PSMI_HAL_INST_CNT > 1 + + .hfp_close_context = hfp_gen1_close_context, + .hfp_context_open = hfp_gen1_context_open, + + + .hfp_finalize_ = hfp_gen1_finalize_, + + + .hfp_get_jkey = hfp_gen1_get_jkey, + + + .hfp_get_node_id = hfp_gen1_get_node_id, + + + + .hfp_get_port_lid = hfp_gen1_get_port_lid, + + + .hfp_get_port_rate = hfp_gen1_get_port_rate, + + + .hfp_spio_process_events = hfp_gen1_spio_process_events, + .hfp_spio_transfer_frame = hfp_gen1_spio_transfer_frame, + + +#endif // PSMI_HAL_INST_CNT > 1 + .hfp_get_port_subnet = hfp_gen1_get_port_subnet, + .hfp_get_default_pkey = hfp_gen1_get_default_pkey, + .hfp_get_num_contexts = hfp_gen1_get_num_contexts, + .hfp_get_num_free_contexts = hfp_gen1_get_num_free_contexts, + .hfp_get_num_units = hfp_gen1_get_num_units, + .hfp_get_num_ports = hfp_gen1_get_num_ports, + .hfp_get_port_active = hfp_gen1_get_port_active, + .hfp_get_unit_active = hfp_gen1_get_unit_active, + .hfp_initialize = hfp_gen1_initialize, + }, +}; + +/* __psmi_hal_gen1_constructor */ +static void __attribute__ ((constructor)) __psmi_hal_gen1_constructor(void) +{ + psmi_hal_register_instance((psmi_hal_instance_t*)&psm_gen1_hi); +} diff --git a/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.h b/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.h new file mode 100644 index 00000000000..3509bcddf57 --- /dev/null +++ b/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1.h @@ -0,0 +1,74 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "psm_user.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include "psm_hal_gen1_spio.h" +#include "psm_mq_internal.h" +#include "opa_user_gen1.h" + + +/* Private struct on a per-context basis. */ +typedef struct _hfp_gen1_pc_private +{ +} hfp_gen1_pc_private; + + +/* declare hfp_gen1_t struct, (combines public psmi_hal_instance_t + together with a private struct) */ +typedef struct _hfp_gen1 +{ + psmi_hal_instance_t phi; +} hfp_gen1_t; + diff --git a/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.c b/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.c new file mode 100644 index 00000000000..5f57e4ba3d8 --- /dev/null +++ b/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.c @@ -0,0 +1,288 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2017 Intel Corporation. All rights reserved. */ + +/* included header files */ +#include +#include +#include +#include +#include + +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include "psm_hal_gen1_spio.h" +#include "ips_proto_params.h" + +/* Report PIO stalls every 20 seconds at the least */ +#define SPIO_STALL_WARNING_INTERVAL (nanosecs_to_cycles(20e9)) +#define SPIO_MAX_CONSECUTIVE_SEND_FAIL (1<<20) /* 1M */ +/* RESYNC_CONSECUTIVE_SEND_FAIL has to be a multiple of MAX_CONSECUTIVE */ +#define SPIO_RESYNC_CONSECUTIVE_SEND_FAIL (1<<4) /* 16 */ + + + + + + + + + + + + + + +/* + * Check and process events + * return value: + * PSM2_OK: normal events processing; + * PSM2_OK_NO_PROGRESS: no event is processed; + */ +static PSMI_HAL_INLINE psm2_error_t +ips_spio_process_events(const struct ptl *ptl_gen) +{ + // TODD - TBD - check link status events for UD/UDP + return PSM2_OK; +} + + +// TBD we could get also get scb->cksum out of scb +// when called: +// scb->ips_lrh has fixed size PSM header including OPA LRH +// payload, length is data after header +// we don't do checksum, let verbs handle that for us +// we need to manage our own registered send buffers because +// in the control paths (connect, disconnect), the scb may be on the stack +// and we must be done with it when this returns. +// in the normal path the scb could be longer lived if we wanted it to be. +// OPA SDMA had a synchronous routine on control path (ips_dma_transfer_frame) +// which started the DMA and waits for it to complete +// in the normal path, scb_send_dma was used. This sends all the scb's on a +// pending queue. It only reaps DMA in that path if it is out of DMA resources +// a few receive paths for ack and nak also reap send DMAs. +// In general scb's just describe an IO, they do not have persistent buffers. +// So send bounce buffers avoid MR handling overheads. +// So for simplicity here we will take a lazy Send CQ reaping strategy. +// We'll reap if we need more and will do a quick reap after we post a new send +// this should keep CQ reaping out of the latency path for microbenchmarks. +// It does not seem that DMA does any reaping in other progress calls +// however the reaping in ack's may help it. +// important to note that UD Send completion just means the packet exited the +// local HFI, does not imply end to end delivery. 
PIO has +// similar semantics and we know the UDP sendto simply puts a packet on +// a UDP queue for future transmission, much like a UD QP post_send works +psm2_error_t +ips_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, + struct ips_scb *scb, uint32_t *payload, + uint32_t length, uint32_t isCtrlMsg, + uint32_t cksum_valid, uint32_t cksum +#ifdef PSM_CUDA + , uint32_t is_cuda_payload +#endif + ) +{ + psm2_error_t ret = PSM2_OK; + psm2_error_t err; + psm2_ep_t ep = proto->ep; + struct ibv_send_wr wr; + struct ibv_send_wr *bad_wr; + struct ibv_sge list; + sbuf_t sbuf; + struct ips_message_header *ips_lrh = &scb->ips_lrh; + + // these defines are bit ugly, but make code below simpler with less ifdefs + // once we decide if USE_RC is valuable we can cleanup + // for RC we continue to use UD QP for control messages + // (connect/disconnect/ack/nak/becn), this avoids issues especially during + // QP teardown in disconnect. We also use UD for ACK/NAK, this allows + // flow credits to be managed over UD +#define USE_ALLOCATOR (isCtrlMsg?&ep->verbs_ep.send_allocator:flow->ipsaddr->use_allocator) +#define USE_QP (isCtrlMsg?ep->verbs_ep.qp:flow->ipsaddr->use_qp) +#define USE_MAX_INLINE (isCtrlMsg?ep->verbs_ep.qp_cap.max_inline_data:flow->ipsaddr->use_max_inline_data) + +#ifdef PSM_FI + if_pf(PSMI_FAULTINJ_ENABLED_EP(ep)) { + PSMI_FAULTINJ_STATIC_DECL(fi_sendlost, "sendlost", + "drop " + "RC eager or any " + "UD packet before sending", + 1, IPS_FAULTINJ_SENDLOST); + if (psmi_faultinj_is_fault(fi_sendlost)) + return PSM2_OK; + } +#endif + PSMI_LOCK_ASSERT(proto->mq->progress_lock); + psmi_assert_always(! cksum_valid); // no software checksum yet + // allocate a send buffer + // if we have no buffers, we can return PSM2_EP_NO_RESOURCES and caller + // will try again later + sbuf = __psm2_ep_verbs_alloc_sbuf(USE_ALLOCATOR); + if_pf (! sbuf) { + // reap some SQ completions + ret = psm2_verbs_completion_update(proto->ep); + if_pf (ret != PSM2_OK) + return ret; + sbuf = __psm2_ep_verbs_alloc_sbuf(USE_ALLOCATOR); + } + if_pf (! sbuf) { + _HFI_VDBG("out of send buffers\n"); + return PSM2_EP_NO_RESOURCES; + } + _HFI_VDBG("got sbuf %p index %lu\n", sbuf_to_buffer(sbuf), send_buffer_index(sbuf_pool(ep, sbuf), sbuf_to_buffer(sbuf))); + // TBD - we should be able to skip sending some headers such as OPA lrh and + // perhaps bth (does PSM use bth to hold PSNs?) + // copy scb->ips_lrh to send buffer + _HFI_VDBG("copy lrh %p\n", ips_lrh); + memcpy(sbuf_to_buffer(sbuf), ips_lrh, sizeof(*ips_lrh)); + // copy payload to send buffer, length could be zero, be safe + _HFI_VDBG("copy payload %p %u\n", payload, length); +#ifdef PSM_CUDA + if (is_cuda_payload) { + PSMI_CUDA_CALL(cuMemcpyDtoH, sbuf_to_buffer(sbuf)+sizeof(*ips_lrh), + (CUdeviceptr)payload, length); + } else +#endif + { + memcpy(sbuf_to_buffer(sbuf)+sizeof(*ips_lrh), payload, length); + } + _HFI_VDBG("%s send - opcode %x\n", qp_type_str(USE_QP), + _get_proto_hfi_opcode((struct ips_message_header*)sbuf_to_buffer(sbuf))); + // we don't support software checksum + psmi_assert_always(! 
(proto->flags & IPS_PROTO_FLAG_CKSUM)); + psmi_assert_always(USE_QP); // make sure we aren't called too soon + list.addr = (uintptr_t)sbuf_to_buffer(sbuf); + list.length = sizeof(*ips_lrh)+ length ; // note no UD_ADDITION + list.lkey = sbuf_lkey(ep, sbuf); +#ifdef PSM_FI + if_pf(PSMI_FAULTINJ_ENABLED_EP(ep)) { + PSMI_FAULTINJ_STATIC_DECL(fi_sq_lkey, "sq_lkey", + "send " + "RC eager or any " + "UD packet with bad lkey", + 0, IPS_FAULTINJ_SQ_LKEY); + if (psmi_faultinj_is_fault(fi_sq_lkey)) { + printf("corrupting SQ lkey QP %u\n", USE_QP->qp_num ); + fflush(stdout); + list.lkey = 0x55; + } + } +#endif + wr.next = NULL; // just post 1 + psmi_assert(!((uintptr_t)sbuf & VERBS_SQ_WR_ID_MASK)); + wr.wr_id = (uintptr_t)sbuf | VERBS_SQ_WR_ID_SEND; // we'll get this back in completion + // we don't use the scb as wr_id since it seems they may be freed + // immediately after a succesful call to transfer + wr.sg_list = &list; + wr.num_sge = 1; // size of sg_list + wr.opcode = IBV_WR_SEND; + // we want to only get occasional send completions + // and use them to release a whole set of buffers for reuse + // For USE_RC this is imperfect, we track when to ask for a CQE + // per RC QP. However when traffic is using varied RC QPs, we may be + // left with some RC QPs with up to VERBS_SEND_CQ_COALLESCE-1 unsignalled + // WQEs and no traffic for a while, hence consuming a few send buffers per + // QP. By tracking it per RC QP we at least avoid the case of a rotating + // traffic pattern never asking for a CQE for a given QP + if_pf ( ! --(USE_ALLOCATOR->send_num_til_coallesce)) { + wr.send_flags = IBV_SEND_SIGNALED; // get a completion + USE_ALLOCATOR->send_num_til_coallesce = VERBS_SEND_CQ_COALLESCE; + } else { + wr.send_flags = 0; + } + if_pf (ips_lrh->khdr.kdeth0 & __cpu_to_le32(IPS_SEND_FLAG_INTR)) { + _HFI_VDBG("send solicted event\n"); + wr.send_flags |= IBV_SEND_SOLICITED; + } + + // for small messages, we may use IBV_SEND_INLINE for performance + if (list.length <= USE_MAX_INLINE) + wr.send_flags |= IBV_SEND_INLINE; + //wr.imm_data = 0; // only if we use IBV_WR_SEND_WITH_IMM; + // ud fields are ignored for RC send (overlay fields for RDMA) + // so reduce branches by just always filling in these few fields + //if (USE_QP->qp_type == IBV_QPT_UD) + psmi_assert_always(flow->path->ah); + wr.wr.ud.ah = flow->path->ah; + wr.wr.ud.remote_qpn = flow->ipsaddr->remote_qpn; + wr.wr.ud.remote_qkey = ep->verbs_ep.qkey; + + if (_HFI_PDBG_ON) { + _HFI_PDBG("ud_transfer_frame: len %u, remote qpn %u payload %u\n", + list.length, + (USE_QP->qp_type != IBV_QPT_UD)? flow->ipsaddr->remote_qpn : + wr.wr.ud.remote_qpn, + length); + __psm2_dump_buf((uint8_t*)list.addr, list.length); + _HFI_PDBG("post send: QP %p (%u)\n", USE_QP, USE_QP->qp_num); + } + if_pf (ibv_post_send(USE_QP, &wr, &bad_wr)) { + if (errno != EBUSY && errno != EAGAIN && errno != ENOMEM) + _HFI_ERROR("failed to post SQ: %s", strerror(errno)); + ret = PSM2_EP_NO_RESOURCES; + } + _HFI_VDBG("done ud_transfer_frame: len %u, remote qpn %u\n", + list.length, + (USE_QP->qp_type != IBV_QPT_UD)? 
flow->ipsaddr->remote_qpn : + wr.wr.ud.remote_qpn); + // reap any completions + err = psm2_verbs_completion_update(proto->ep); + if_pf (err != PSM2_OK) + return err; + return ret; +#undef USE_ALLOCATOR +#undef USE_QP +#undef USE_MAX_INLINE +} + diff --git a/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.h b/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.h new file mode 100644 index 00000000000..5fb3ac219fd --- /dev/null +++ b/prov/psm3/psm3/psm_hal_gen1/psm_hal_gen1_spio.h @@ -0,0 +1,177 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2017 Intel Corporation. All rights reserved. 
*/ + +#ifndef IPS_SPIO_H +#define IPS_SPIO_H + +#include "psm_user.h" + +#define IPS_CTXT_RESET_MAX 1000 /* max send context reset */ +struct ips_spio; +struct ptl; +struct ips_proto; +struct ips_flow; + +/* 64B move instruction support */ +#define AVX512F_BIT 16 /* level 07h, ebx */ +/* 32B move instruction support */ +#define AVX2_BIT 5 /* level 07h, ebx */ +/* 16B move instruction support */ +#define SSE2_BIT 26 /* level 01h, edx */ + +typedef +void (*ips_spio_blockcpy_fn_t)(volatile uint64_t *dest, + const uint64_t *src, uint32_t nblock); +#ifdef PSM_AVX512 +void hfi_pio_blockcpy_512(volatile uint64_t *dest, + const uint64_t *src, uint32_t nblock); +#endif +void hfi_pio_blockcpy_256(volatile uint64_t *dest, + const uint64_t *src, uint32_t nblock); +void hfi_pio_blockcpy_128(volatile uint64_t *dest, + const uint64_t *src, uint32_t nblock); +void hfi_pio_blockcpy_64(volatile uint64_t *dest, + const uint64_t *src, uint32_t nblock); + + + +static inline psm2_error_t ips_spio_transfer_frame(struct ips_proto *proto, + struct ips_flow *flow, struct ips_scb *scb, + uint32_t *payload, uint32_t length, + uint32_t isCtrlMsg, uint32_t cksum_valid, + uint32_t cksum +#ifdef PSM_CUDA + , uint32_t is_cuda_payload +#endif +); + +static psm2_error_t ips_spio_process_events(const struct ptl *ptl); + +#define SPIO_CREDITS_Counter(value) (((value) >> 0) & 0x7FF) +#define SPIO_CREDITS_Status(value) (((value) >> 11) & 0x1) +#define SPIO_CREDITS_DueToPbc(value) (((value) >> 12) & 0x1) +#define SPIO_CREDITS_DueToTheshold(value) (((value) >> 13) & 0x1) +#define SPIO_CREDITS_DueToErr(value) (((value) >> 14) & 0x1) +#define SPIO_CREDITS_DueToForce(value) (((value) >> 15) & 0x1) +struct ips_spio_credits { +/* don't use bit operation for performance reason, + * using above macro instead. 
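+ * As a hedged illustration (ctrl below is a hypothetical pointer to the
+ * struct ips_spio_ctrl defined later in this header, which embeds
+ * spio_credits):
+ *   uint16_t counter = SPIO_CREDITS_Counter(ctrl->spio_credits.value);
+ *   int      status  = SPIO_CREDITS_Status(ctrl->spio_credits.value);
+ * The bit-field layout these macros stand in for: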
+ uint16_t Counter:11; + uint16_t Status:1; + uint16_t CreditReturnDueToPbc:1; + uint16_t CreditReturnDueToThreshold:1; + uint16_t CreditReturnDueToErr:1; + uint16_t CreditReturnDueToForce:1; +*/ + union { + struct { + uint16_t value; + uint16_t pad0; + uint32_t pad1; + }; + uint64_t credit_return; + }; +}; + +struct ips_spio_ctrl { + /* credit return lock for context sharing */ + pthread_spinlock_t spio_ctrl_lock; + + /* PIO write in progress for context sharing */ + volatile uint16_t spio_write_in_progress; + /* send context reset count */ + volatile uint16_t spio_reset_count; + /* HFI frozen count, shared copy */ + volatile uint16_t spio_frozen_count; + + volatile uint16_t spio_available_blocks; + volatile uint16_t spio_block_index; + volatile uint16_t spio_fill_counter; + volatile struct ips_spio_credits spio_credits; +} __attribute__ ((aligned(64))); + +struct ips_spio { + const psmi_context_t *context; + struct ptl *ptl; + uint16_t unit_id; + uint16_t portnum; + + pthread_spinlock_t spio_lock; /* thread lock */ + volatile __le64 *spio_credits_addr __attribute__ ((aligned(64))); + volatile uint64_t *spio_bufbase_sop; + volatile uint64_t *spio_bufbase; + volatile struct ips_spio_ctrl *spio_ctrl; + + uint16_t spio_frozen_count; /* local copy */ + uint16_t spio_total_blocks; + uint16_t spio_block_index; + + uint32_t spio_consecutive_failures; + uint64_t spio_num_stall; + uint64_t spio_num_stall_total; + uint64_t spio_next_stall_warning; + uint64_t spio_last_stall_cyc; + uint64_t spio_init_cyc; + + psm2_error_t (*spio_reset_hfi)(struct ips_spio *ctrl); + psm2_error_t (*spio_credit_return_update)(struct ips_spio *ctrl); + + /* copying routines based on block size */ + ips_spio_blockcpy_fn_t spio_blockcpy_med; + ips_spio_blockcpy_fn_t spio_blockcpy_large; + +}; + +#endif /* IPS_SPIO_H */ diff --git a/prov/psm3/psm3/psm_hal_gen1/psm_hal_inline_i.h b/prov/psm3/psm3/psm_hal_gen1/psm_hal_inline_i.h new file mode 100644 index 00000000000..d89fd79bbb9 --- /dev/null +++ b/prov/psm3/psm3/psm_hal_gen1/psm_hal_inline_i.h @@ -0,0 +1,437 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "psm_hal_gen1.h" + +extern size_t arrsz[MAPSIZE_MAX]; + +static inline struct _hfp_gen1 *get_psm_gen1_hi(void) +{ + return (struct _hfp_gen1*) psmi_hal_current_hal_instance; +} + +/* hfp_gen1_initialize */ +static PSMI_HAL_INLINE int hfp_gen1_initialize(psmi_hal_instance_t *phi) +{ + return 0; +} + +/* hfp_gen1_finalize_ */ +static PSMI_HAL_INLINE int hfp_gen1_finalize_(void) +{ + return 0; +} + +/* hfp_gen1_get_num_units */ +static PSMI_HAL_INLINE int hfp_gen1_get_num_units(void) +{ + return hfi_get_num_units(); +} + +/* hfp_gen1_get_num_ports */ +static PSMI_HAL_INLINE int hfp_gen1_get_num_ports(void) +{ + return HFI_NUM_PORTS_GEN1; +} + +/* hfp_gen1_get_unit_active */ +static PSMI_HAL_INLINE int hfp_gen1_get_unit_active(int unit) +{ + return hfi_get_unit_active(unit); +} + +/* hfp_gen1_get_port_active */ +static PSMI_HAL_INLINE int hfp_gen1_get_port_active(int unit, int port) +{ + return hfi_get_port_active(unit, port); +} + +// Most of these defines are in opa_service_gen1.c, but there are no +// include files common to that file and this one +#define HFI_UD_NUM_CTXTS 1024 + +/* hfp_gen1_get_num_contexts */ +static PSMI_HAL_INLINE int hfp_gen1_get_num_contexts(int unit) +{ + return HFI_UD_NUM_CTXTS; +} + +// Most of these defines are in opa_service_gen1.c, but there are no +// include files common to that file and this one +#define HFI_UD_NUM_FREE_CTXTS 1024 + +/* hfp_gen1_get_num_free_contexts */ +static PSMI_HAL_INLINE int hfp_gen1_get_num_free_contexts(int unit) +{ + return HFI_UD_NUM_FREE_CTXTS; +} + + + +/* hfp_gen1_close_context */ +static PSMI_HAL_INLINE int hfp_gen1_close_context(psmi_hal_hw_context *ctxtp) +{ + hfp_gen1_pc_private *psm_hw_ctxt; + + if (!ctxtp || !*ctxtp) + return PSM_HAL_ERROR_OK; + + psm_hw_ctxt = (hfp_gen1_pc_private *)(*ctxtp); + psmi_free(psm_hw_ctxt); + + return PSM_HAL_ERROR_OK; +} + +/* Moved from psm_context.c */ + + + + + + +static inline char * _dump_cpu_affinity(char *buf, size_t buf_size, cpu_set_t * cpuset) { + int i; + int isfirst = 1; + char tmp[25]; //%d = 10 :: 10 + '-' + 10 + ',' + '\0' = 23 + int first = -1, last = -1; + + for (i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, cpuset)) { + if (first == -1) { + first = last = i; + } else if ((last+1) == i) { + last = i; + } + } else if (first != -1) { + if (first == last) { + snprintf(tmp, sizeof(tmp), "%d,", first); + } else { + snprintf(tmp, sizeof(tmp), "%d-%d,", first, last); + } + first = last = -1; + + if (isfirst) { + strncpy(buf, tmp, buf_size-1); + isfirst=0; + } else { + strncat(buf, tmp, buf_size-1); + } + buf[buf_size-1] = '\0'; + } + } + + if (first != -1) { + if (first == last) { + snprintf(tmp, sizeof(tmp), "%d,", first); + } else { + snprintf(tmp, sizeof(tmp), "%d-%d,", first, last); + } + if 
(isfirst) { + strncpy(buf, tmp, buf_size-1); + } else { + strncat(buf, tmp, buf_size-1); + } + buf[buf_size-1] = '\0'; + } + char *comma = strrchr(buf, ','); + if (comma) comma[0] = '\0'; + + return buf; +} // pthread_getaffinity_np + +/* hfp_gen1_context_open */ +static PSMI_HAL_INLINE int hfp_gen1_context_open(int unit, + int port, + uint64_t open_timeout, + psm2_ep_t ep, + psm2_uuid_t const job_key, + psmi_context_t *psm_ctxt, + uint32_t cap_mask, + unsigned retryCnt) +{ + psm2_error_t err = PSM2_OK; + hfp_gen1_pc_private *pc_private = psmi_malloc(ep, UNDEFINED, sizeof(hfp_gen1_pc_private)); + + if_pf (!pc_private) { + //err = -PSM_HAL_ERROR_CANNOT_OPEN_CONTEXT; + goto bail; + } + + memset(pc_private, 0, sizeof(hfp_gen1_pc_private)); + + + // open verbs 1st so psmi_context_open can get pkey, lid, etc + if ((err = __psm2_ep_open_verbs(ep, unit, port, job_key)) != PSM2_OK) { + _HFI_ERROR( "Unable to initialize verbs\n"); + err = -PSM_HAL_ERROR_CANNOT_OPEN_CONTEXT; + goto bail; + } + + pthread_t mythread = pthread_self(); + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + + int s = pthread_getaffinity_np(mythread, sizeof(cpu_set_t), &cpuset); + if (s != 0) { + psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Can't get CPU affinity: %s\n", strerror(errno)); + goto bail; + } + + if (_HFI_DBG_ON) { + char cpu_buf[128] = {0}; + _HFI_DBG( "CPU affinity Before set: %s\n", _dump_cpu_affinity(cpu_buf, 128, &cpuset)); + } + + if (getenv("PSM3_FORCE_CPUAFFINITY") || + !(getenv("PSM3_NO_CPUAFFINITY") || ep->skip_affinity)) + { + cpu_set_t mycpuset, andcpuset; + + if (hfi_get_unit_cpumask(unit, &mycpuset)) { + _HFI_ERROR( "Failed to get unit %d's cpu set\n", unit); + //err = -PSM_HAL_ERROR_GENERAL_ERROR; + goto bail; + } + + int cpu_count = CPU_COUNT(&cpuset); + int my_count = CPU_COUNT(&mycpuset); + if (cpu_count > my_count) { + andcpuset = cpuset; + } else { + CPU_AND(&andcpuset, &cpuset, &mycpuset); + } + int cpu_and_count = CPU_COUNT(&andcpuset); + + if (cpu_and_count > 0 && pthread_setaffinity_np(mythread, sizeof(andcpuset), &andcpuset)) { + _HFI_ERROR( "Failed to set unit %d's cpu set: %s\n", unit, strerror(errno)); + //err = -PSM_HAL_ERROR_GENERAL_ERROR; + goto bail; + } else if (cpu_and_count == 0 && _HFI_DBG_ON) { + char buf1[128] = {0}; + char buf2[128] = {0}; + _HFI_DBG( "CPU affinity not set, NIC selected is not on the same socket as thread (\"%s\" & \"%s\" == 0).\n", + _dump_cpu_affinity(buf1, 128, &mycpuset), _dump_cpu_affinity(buf2, 128, &cpuset)); + } + } + if (_HFI_DBG_ON) { + CPU_ZERO(&cpuset); + int s = pthread_getaffinity_np(mythread, sizeof(cpu_set_t), &cpuset); + if (s != 0) { + psmi_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Can't get CPU affinity: %s\n", strerror(errno)); + goto bail; + } + char cpu_buf[128] = {0}; + _HFI_DBG( "CPU affinity After set: %s\n", _dump_cpu_affinity(cpu_buf, 128, &cpuset)); + } + +// TBD - inside hfi_userinit_internal we would find CPU +// which HFI is closest to and set affinity. 
Need a way to do that for UD +// we would also wash jkey through driver and stash it in _hfi_ctrl +// but because we disable this we won't have an _hfi_ctrl structure + + psm_ctxt->psm_hw_ctxt = pc_private; + return PSM_HAL_ERROR_OK; + +bail: + if (pc_private) { + psmi_free(pc_private); + } + + return -PSM_HAL_ERROR_GENERAL_ERROR; +} + + + + + +static PSMI_HAL_INLINE int hfp_gen1_get_port_rate(int unit, int port) +{ + return hfi_get_port_rate(unit, port); +} + + + + + +static PSMI_HAL_INLINE int hfp_gen1_get_port_lid(int unit, int port) +{ + return hfi_get_port_lid(unit, port); +} + +static PSMI_HAL_INLINE int hfp_gen1_get_port_subnet(int unit, int port, + uint64_t *subnet, uint64_t *addr, uint32_t *ip_addr, uint32_t *netmask, + int *idx, uint64_t *hi, uint64_t *lo) +{ + return hfi_get_port_subnet(unit, port, subnet, addr, ip_addr, netmask, + idx, hi, lo); +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +static PSMI_HAL_INLINE int hfp_gen1_get_default_pkey(void) +{ + return HFI_DEFAULT_P_KEY; +} + +#include "psm_hal_gen1_spio.c" + + + +static PSMI_HAL_INLINE int hfp_gen1_spio_transfer_frame(struct ips_proto *proto, + struct ips_flow *flow, struct ips_scb *scb, + uint32_t *payload, uint32_t length, + uint32_t isCtrlMsg, uint32_t cksum_valid, + uint32_t cksum, psmi_hal_hw_context ctxt +#ifdef PSM_CUDA + , uint32_t is_cuda_payload +#endif + ) +{ + return ips_spio_transfer_frame(proto, flow, scb, + payload, length, isCtrlMsg, + cksum_valid, cksum +#ifdef PSM_CUDA + , is_cuda_payload +#endif + ); +} + +static PSMI_HAL_INLINE int hfp_gen1_spio_process_events(const struct ptl *ptl) +{ + return ips_spio_process_events(ptl); +} + +static PSMI_HAL_INLINE int hfp_gen1_get_node_id(int unit, int *nodep) +{ + int64_t node_id = hfi_sysfs_unit_read_node_s64(unit); + *nodep = (int)node_id; + if (node_id != -1) + return PSM_HAL_ERROR_OK; + else + return -PSM_HAL_ERROR_GENERAL_ERROR; +} + + + + +static PSMI_HAL_INLINE int hfp_gen1_get_jkey(psmi_hal_hw_context ctxt) +{ + return 0; // TBD - washed through driver - see HED-542 +} + + + + + + + + + + + + + + diff --git a/prov/psm3/psm3/psm_help.h b/prov/psm3/psm3/psm_help.h new file mode 100644 index 00000000000..7fc880b6467 --- /dev/null +++ b/prov/psm3/psm3/psm_help.h @@ -0,0 +1,195 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +#ifndef _PSMI_HELP_H +#define _PSMI_HELP_H +#include "psm_log.h" + +/* XXX gcc only */ +#define PSMI_INLINE(FN) \ + static inline FN + +#ifndef PACK_SUFFIX +/* XXX gcc only */ +#define PACK_SUFFIX __attribute__((packed)) +#endif + +#define PSMI_ALWAYS_INLINE(FN) \ + static __inline__ FN __attribute__((always_inline)); \ + static __inline__ FN + +#define PSMI_NEVER_INLINE(FN) \ + static FN __attribute__((noinline)); \ + static FN + +#define _PPragma(x) _Pragma(x) + +#define STRINGIFY(s) _STRINGIFY(s) +#define _STRINGIFY(s) #s +#define PSMI_CURLOC __FILE__ ":" STRINGIFY(__LINE__) +#define psmi_assert_always_loc(x, curloc) \ + do { \ + if_pf(!(x)) { \ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Assertion failure at %s: %s", curloc, \ + STRINGIFY(x)); \ + } } while (0) + +#define psmi_assert_always(x) psmi_assert_always_loc(x, PSMI_CURLOC) + +#ifdef PSM_DEBUG +# define psmi_assert(x) psmi_assert_always(x) +# define PSMI_ASSERT_INITIALIZED() psmi_assert_always(psmi_isinitialized()) +#else +# define psmi_assert(x) +# define PSMI_ASSERT_INITIALIZED() +#endif + +#define _PSMI_API_NAME(FN) __ ## FN +#define _PSMI_API_STR(FN) _STRINGIFY(__ ## FN) +#define PSMI_API_DECL(FN) \ + typeof(_PSMI_API_NAME(FN)) FN __attribute__((weak, alias(_PSMI_API_STR(FN)))); + +#define PSMI_ERR_UNLESS_INITIALIZED(ep) \ + do { \ + if (!psmi_isinitialized()) { \ + PSM2_LOG_MSG("leaving"); \ + return psmi_handle_error(ep, PSM2_INIT_NOT_INIT, \ + "PSM3 has not been initialized"); \ + } \ + } while (0) + +#define PSMI_CHECKMEM(err, mem) \ + do { \ + if ((mem) == NULL) { \ + (err) = PSM2_NO_MEMORY; \ + goto fail; \ + } \ + } while (0) + +#define PSMI_CACHEALIGN __attribute__((aligned(64))) + +/* Easy way to ignore the OK_NO_PROGRESS case */ +PSMI_ALWAYS_INLINE(psm2_error_t psmi_err_only(psm2_error_t err)) +{ + if (err > PSM2_OK_NO_PROGRESS) + return err; + else + return PSM2_OK; +} + +#ifdef min +#undef min +#endif +#define min(a, b) ((a) < (b) ? (a) : (b)) + +#ifdef max +#undef max +#endif +#define max(a, b) ((a) > (b) ? 
(a) : (b)) + +#define SEC_ULL 1000000000ULL +#define MSEC_ULL 1000000ULL +#define USEC_ULL 1000ULL +#define NSEC_ULL 1ULL + +#define PSMI_TRUE 1 +#define PSMI_FALSE 0 + +#define PSMI_CYCLES_TO_SECSF(cycles) \ + ((double) cycles_to_nanosecs(cycles) / 1.0e9) + +#define PSMI_PAGESIZE psmi_getpagesize() +#define PSMI_POWEROFTWO(P) (((P)&((P)-1)) == 0) +#define PSMI_ALIGNDOWN(p, P) (((uintptr_t)(p))&~((uintptr_t)((P)-1))) +#define PSMI_ALIGNUP(p, P) (PSMI_ALIGNDOWN((uintptr_t)(p)+((uintptr_t)((P)-1)), (P))) + +#define PSMI_MAKE_DRIVER_VERSION(major, minor) ((major)<<16 | ((minor) & 0xffff)) + +#ifdef PSM_DEBUG + +/* The intent of the following two macros is to emit an internal error if a size of a + 'member' is not as expected, violating an assumption in the code. There are some + problems with the implementation of this code: + + The first macro creates a static const variable with ABSOLUTELY NO references + to them. For example there are ABSOLUTELY NO uses of the second macro in the + PSM code. This is not completely pure. GCC version 5, for example, emits a + warning for defining a static const when it is not referenced. + + A better implementation of the intent of this code is to use static_assert() + so that at compile time the violations can be caught and corrected - not at + run time. */ + +#define PSMI_STRICT_SIZE_DECL(member, sz) static const size_t __psm2_ss_ ## member = sz +#define PSMI_STRICT_SIZE_VERIFY(member, sz) \ + do { \ + if (__psm2_ss_ ## member != (sz)) { \ + char errmsg[64]; \ + snprintf(errmsg, 32, "Internal error: %s " \ + "size doesn't match expected %d bytes", \ + STRINGIFY(member), (int) __psm2_ss_ ## member); \ + exit(-1); \ + } \ + } while (0) + +#else + +#define PSMI_STRICT_SIZE_DECL(member, sz) /* nothing */ +#define PSMI_STRICT_SIZE_VERIFY(member, sz) /* nothing */ + +#endif /* PSM_DEBUG */ + +#endif /* _PSMI_HELP_H */ diff --git a/prov/psm3/psm3/psm_lock.h b/prov/psm3/psm3/psm_lock.h new file mode 100644 index 00000000000..a7393a53033 --- /dev/null +++ b/prov/psm3/psm3/psm_lock.h @@ -0,0 +1,238 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _PSMI_IN_USER_H +#error psm_lock.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_LOCK_H +#define _PSMI_LOCK_H + +#ifndef PSMI_USE_PTHREAD_SPINLOCKS +#define PSMI_USE_PTHREAD_SPINLOCKS 0 +#endif + +#if PSMI_USE_PTHREAD_SPINLOCKS +typedef pthread_spinlock_t psmi_spinlock_t; + +#define psmi_spin_init(lock) pthread_spin_init(lock, \ + PTHREAD_PROCESS_PRIVATE) +#define psmi_spin_destroy(lock) pthread_spin_destroy(lock) +#define psmi_spin_lock(lock) pthread_spin_lock(lock) +#define psmi_spin_trylock(lock) pthread_spin_trylock(lock) +#define psmi_spin_unlock(lock) pthread_spin_unlock(lock) +#else +typedef ips_atomic_t psmi_spinlock_t; +#define PSMI_SPIN_INVALID 2 +#define PSMI_SPIN_LOCKED 1 +#define PSMI_SPIN_UNLOCKED 0 +#endif + +/* psmi_lock_t structure */ +typedef struct { + +#ifdef PSMI_LOCK_IS_SPINLOCK + psmi_spinlock_t lock; +#elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG) + pthread_mutex_t lock; + pthread_t lock_owner; +#elif defined(PSMI_LOCK_IS_MUTEXLOCK) + pthread_mutex_t lock; +#endif +} psmi_lock_t; + + +#if PSMI_USE_PTHREAD_SPINLOCKS +#else +PSMI_ALWAYS_INLINE(int psmi_spin_init(psmi_spinlock_t *lock)) +{ + ips_atomic_set(lock, PSMI_SPIN_UNLOCKED); + return 0; +} + +PSMI_ALWAYS_INLINE(int psmi_spin_trylock(psmi_spinlock_t *lock)) +{ + if (ips_atomic_cmpxchg(lock, PSMI_SPIN_UNLOCKED, PSMI_SPIN_LOCKED) + == PSMI_SPIN_UNLOCKED) { + return 0; + } + + return EBUSY; +} + +PSMI_ALWAYS_INLINE(int psmi_spin_destroy(psmi_spinlock_t *lock)) +{ + if (lock == NULL) { + return EINVAL; + } + + /* We could just do psmi_spin_trylock() here and dispense with the invalid state */ + if (ips_atomic_cmpxchg(lock, PSMI_SPIN_UNLOCKED, PSMI_SPIN_INVALID) + == PSMI_SPIN_UNLOCKED) { + return 0; + } + + return EBUSY; +} + +PSMI_ALWAYS_INLINE(int psmi_spin_lock(psmi_spinlock_t *lock)) +{ + while (psmi_spin_trylock(lock) == EBUSY) { + } + return 0; +} + +PSMI_ALWAYS_INLINE(int psmi_spin_unlock(psmi_spinlock_t *lock)) +{ + atomic_set(lock, PSMI_SPIN_UNLOCKED); + return 0; +} +#endif /* PSMI_USE_PTHREAD_SPINLOCKS */ + +PSMI_ALWAYS_INLINE(void psmi_init_lock(psmi_lock_t *lock)) +{ +#ifdef PSMI_LOCK_IS_SPINLOCK + psmi_spin_init(&(lock->lock)); +#elif defined(PSMI_LOCK_IS_MUTEXLOCK) + pthread_mutex_init(&(lock->lock), NULL); +#elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG) + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK_NP); + pthread_mutex_init(&(lock->lock), &attr); + pthread_mutexattr_destroy(&attr); + lock->lock_owner = PSMI_LOCK_NO_OWNER; +#endif +} + +PSMI_ALWAYS_INLINE(void psmi_destroy_lock(psmi_lock_t *lock)) +{ +#ifdef PSMI_LOCK_IS_SPINLOCK + int err; 
+ /* This will map to either pthread_spin_destroy() or our custom psmi_spin_destroy(). + * Both their return values can be interpreted by strerror(). + */ + if ((err = psmi_spin_destroy(&(lock->lock))) != 0) { + _HFI_VDBG("Destroying spinlock failed: %s\n", strerror(err)); + } + /* The same path for both the regular mutex and the debugging mutex */ +#elif defined(PSMI_LOCK_IS_MUTEXLOCK) || defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG) + int err; + if ((err = pthread_mutex_destroy(&(lock->lock))) != 0) { + /* strerror_r() may be a better choice here but it is tricky + * to reliably detect the XSI vs GNU version, and if hardcoded, + * may be inadvertently changed when tampering with headers/makefiles + * in the long run. + * + * This would result in incorrect operation: a segfault from + * derefencing the return value or failure to retrieve the + * error string. + * + * The C11's strerror_s may be an option here too. + */ + _HFI_VDBG("Destroying mutex failed: %s\n", strerror(err)); + } +#endif +} + +PSMI_ALWAYS_INLINE(int psmi_sem_post(sem_t *sem, const char *name)) +{ + if (sem_post(sem) == -1) { + _HFI_VDBG("Semaphore %s: post failed\n", name ? name : "NULL" ); + return -1; + } + + _HFI_VDBG("Semaphore %s: post succeeded\n", name ? name : "NULL"); + + return 0; +} + +PSMI_ALWAYS_INLINE(int psmi_sem_timedwait(sem_t *sem, const char *name)) +{ + /* Wait 5 seconds for shm read-write lock to open */ + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += 5; + + if (sem_timedwait(sem, &ts) == -1) { + _HFI_VDBG("Semaphore %s: Timedwait failed\n", name ? name : "NULL" ); + return -1; + } + + _HFI_VDBG("Semaphore %s: Timedwait succeeded\n", name ? name : "NULL"); + + return 0; +} + +PSMI_ALWAYS_INLINE(int psmi_init_semaphore(sem_t **sem, const char *name, + mode_t mode, int value)) +{ + *sem = sem_open(name, O_CREAT | O_EXCL, mode, value); + if ((*sem == SEM_FAILED) && (errno == EEXIST)) { + *sem = sem_open(name, O_CREAT, mode, value); + if (*sem == SEM_FAILED) { + _HFI_VDBG("Cannot open semaphore %s, errno=%d\n", + name, errno); + return -1; + } + } else if (*sem == SEM_FAILED) { + _HFI_VDBG("Cannot create semaphore %s, errno=%d\n", name, errno); + return -1; + } + + return 0; +} + +#endif /* _PSMI_LOCK_H */ diff --git a/prov/psm3/psm3/psm_log.h b/prov/psm3/psm3/psm_log.h new file mode 100644 index 00000000000..2e4ab814ec7 --- /dev/null +++ b/prov/psm3/psm3/psm_log.h @@ -0,0 +1,282 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef _PSMI_LOG_H +#define _PSMI_LOG_H + +/* + + A note about PSM_LOG and PSM_LOG_FAST_IO: + + By default, the PSM_LOG facility is safe and slow. Log messages + are written to a file under /tmp as they're generated. So, if the test case + has an abnormal termination such as a segmentation fault or an abort(), + the log messages will still be available. + + However, debugging timing sensitive problems, make the default PSM_LOG + facility inadequate as the timing overhead that it introduces dominates, and + the symptoms of the problem being tested may change. + + When performance is important, you can use BOTH: PSM_LOG and PSM_LOG_FAST_IO. + With PSM_LOG_FAST_IO, log messages are written to a memory buffer, and when + the program terminates, the log messages are written to a file under /tmp + + * How to use basic functionality of PSM LOG: + + - To use default PSM_LOG, build PSM2 with macro + PSM_LOG=1 + + - To use PSM_LOG when performance is critical, build PSM2 with macros + PSM_LOG=1 PSM_LOG_FAST_IO=1 + + - Insert log message in code with a . Log message follow the same + format as printf(). For example: + PSM2_LOG_MSG(" %u", 1); + + - To filter out log messages, set environment variable + PSM3_LOG_SRCH_FORMAT_STRING to and the wildcard character (*). + For example, + PSM3_LOG_SRCH_FORMAT_STRING=* + + - A more detailed explanation to use PSM LOG can be found below. + + * How to get log messages with abnormal termination while using + PSM LOG with PSM_LOG_FAST_IO: + + - Log messages are saved from a memory buffer to a file under /tmp when + psmi_log_fini() is called. psmi_log_fini() is exposed to the outside + world via the linker script file, so client test code can psmi_log_fini() + on a fatal error. + + -------------------------------------------------------------------------------- + + This file (psm_log.h) defines macros for logging messages to assist + investigations into the psm library. + + By default, these macros are not defined when building psm. When not defined, + the macros become no-ops in the PSM code. + + When enabled (by defining the PSM_LOG symbol), the macros present information + to the psmi_log_message() facility for processing. See below for more + information on the psmi_log_message() facility. + + The macros are described in the following: + + PSM2_LOG_MSG(FORMAT,...) 
Spills a printf-style message to the log. + PSM2_LOG_DECLARE_BT_BUFFER() Declares a local back trace buffer for use + with the PSM2_LOG_BT() macro. + PSM2_LOG_BT(NFRAMES,FORMAT,...) Spills the current backtrace, if it differs + from the previous backtrace spilled to the + log. + + The psmi_log_message() facility is the backend for these messages when + PSM_LOG is enabled. The psmi_log_message() facility spills messages to + unique log files based on the process id and the thread id. So every unique + process id, and thread id will spill to unique log files. The + psmi_log_message prefixes each message in the log files with a high + resolution timer message so that messages from multiple threads and log files + can be reconciled to one timeline. It is left as an exercise to the reader + to reconcile log messages from different hosts to one timeline. + + The backtrace capability in the PSM_LOG functionality needs some explanation: + often a bug happens only when the code is tickled from a specific call-chain. + The PSM2_LOG_BT() macro supports identifying the unique call-chain when a + problem occurs. The model is as follows: + + A unique declaration is made for a backtrace to spill the backtrace + information to. This declaration should be made in the same basic block as + the use of the PSM2_LOG_BT() macro. To make the declaration, use + PSM2_LOG_DECLARE_BT_BUFFER(). + + When the PSM_LOG is enabled, at the statement for the macro: + PSM2_LOG_BT(NFRAMES,FORMAT,...), the psmi_log_message() facility generates + the current backtrace, and compares the first NFRAMES of the current backtrace + against the previous backtrace stored in the backtrace buffer declared with + the declaration. If the two backtraces differ, the psmi_log_message() code + saves the current backtrace into the declared buffer, and then spills the + backtrace to the log file. + + At runtime, setting environment variables can squelch the log file from + getting too big: + + PSM3_LOG_INC_FUNCTION_NAMES is a list of function name lists (abbreviated + FNL) (see below), that will INClude the FNL's into the colleciton of functions + to spill log data for. + + PSM3_LOG_EXC_FUNCTION_NAMES is a list of FNL's (see below), that will EXClude + the FNL's from the collection of functions to spill log data for. + + An FNL is a 'Function Name List' that is defined by the following grammar: + + # A LINE1 is either a single line number of a range of line numbers: + LINE1 :: lineNumber | + lineNumber1 '-' lineNumber2 + + # LINES is a list of LINE1's separated by commas: + LINES :: LINE1 | + LINE1 ',' LINES + + # An FN is either a function name, or a function name with a list of lines: + FN :: functionName | + functionName ';' LINES + + # A FNL is a list of FN's separated by colons: + FNL :: FN | + FN ':' FNL + + # Examples: + foo:bar the two functions foo and bar + foo;1-10 lines 1 to 10 of function foo. + bar;1,3,5 lines 1, 3 and 5 of function bar + + PSM3_LOG_SRCH_FORMAT_STRING If set, overrides the PSM3_LOG_INC_FUNCTION_NAMES + and PSM3_LOG_EXC_FUNCTION_NAMES settings. Causes the psmi_log_message() + facility to only emit the log messages that match (using fnmatch()) the + message in FORMAT. 
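+
+  As a minimal, illustrative sketch of how the pieces fit together (the
+  function name do_send and the frame count 8 are hypothetical, not part
+  of the PSM code):
+
+      static void do_send(int tag)
+      {
+              PSM2_LOG_DECLARE_BT_BUFFER();
+              PSM2_LOG_MSG("sending tag %d", tag);
+              PSM2_LOG_BT(8, "send path for tag %d", tag);
+      }
+
+  and, at runtime, only the matching messages can be kept with, e.g.:
+
+      PSM3_LOG_SRCH_FORMAT_STRING='sending*'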
+ + */ + +typedef enum +{ + PSM2_LOG_TX = 0, + PSM2_LOG_RX = 1, + PSM2_LOG_PEND = 2, +} psmi_log_tx_rx_t; + +#ifdef PSM_LOG + +extern void psmi_log_initialize(void); + +/* defined in psm_utils.c */ +extern void psmi_log_message(const char *fileName, + const char *functionName, + int lineNumber, + const char *format, ...); + +#ifdef PSM_LOG_FAST_IO +extern void psmi_log_fini(void); +#else +#define psmi_log_fini() /* nothing */ +#endif + +#define PSM2_LOG_MSG(FORMAT , ...) psmi_log_message(__FILE__,__FUNCTION__,__LINE__,FORMAT, ## __VA_ARGS__) + +#define PSM2_LOG_BT_BUFFER_SIZE 100 + +#define PSM2_LOG_DECLARE_BT_BUFFER() static void * psm_log_bt_buffer[PSM2_LOG_BT_BUFFER_SIZE] + +#define PSM2_LOG_DECLARE_BT_BUFFER_SZ(SIZE) static void * psm_log_bt_buffer[SIZE] + +#define PSM2_LOG_BT_MAGIC ((const char *)-1) + +#define PSM2_LOG_BT(NFRAMES,FORMAT , ...) psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM2_LOG_BT_MAGIC,psm_log_bt_buffer,NFRAMES,FORMAT, ## __VA_ARGS__) + +#define PSM2_LOG_EPM_MAGIC ((const char *)-2) + +/* EPM is short for Emit Protocol Message to the log file. +OPCODE is an int, and corresponds to one of the OPCODES declared in ptl_ips/ips_proto_header.h +TXRX is an int, and should be one of the above two consts (PSM2_LOG_TX, or PSM2_LOG_RX). +FROMEPID and TOEPID are uint64_t's and the fromepid should be the epid (end point id) of the sender of the message + and the toepid should be the epid (end point id) of the receiver of the message + */ +#define PSM2_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) \ + psmi_log_message(__FILE__,__FUNCTION__,__LINE__, \ + PSM2_LOG_EPM_MAGIC,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT, \ + ## __VA_ARGS__) + +/* Just adds a condition to the PSM2_LOG_EPM() macro. */ +#define PSM2_LOG_EPM_COND(COND,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) \ + if (COND) \ + PSM2_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT, ## __VA_ARGS__) + +#define PSM2_LOG_DUMP_MAGIC ((const char *)-3) + +#define PSM2_LOG_MSG_DUMP(ADDR,SIZE,FORMAT , ...) \ + psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM2_LOG_DUMP_MAGIC,ADDR,SIZE, \ + FORMAT, ## __VA_ARGS__) + +#define PSM2_LOG_PKT_STRM_MAGIC ((const char *)-4) + +#define PSM2_LOG_MIN_MAGIC PSM2_LOG_BT_MAGIC + +#define PSM2_LOG_MAX_MAGIC PSM2_LOG_PKT_STRM_MAGIC + +#define PSM2_LOG_PKT_STRM(TXRX,IPS_MSG_HDRP,FORMAT, ...) \ + psmi_log_message(__FILE__,__FUNCTION__,__LINE__,PSM2_LOG_PKT_STRM_MAGIC,TXRX, \ + IPS_MSG_HDRP,FORMAT, ## __VA_ARGS__) + +#else + +#define psmi_log_initialize() /* nothing */ + +#define PSM2_LOG_MSG(FORMAT , ...) /* nothing */ + +#define psmi_log_fini() /* nothing */ + +#define PSM2_LOG_DECLARE_BT_BUFFER() /* nothing */ + +#define PSM2_LOG_DECLARE_BT_BUFFER_SZ(SIZE) /* nothing */ + +#define PSM2_LOG_BT(NFRAMES,FORMAT , ...) /* nothing */ + +#define PSM2_LOG_EPM(OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) /* nothing */ + +#define PSM2_LOG_EPM_COND(COND,OPCODE,TXRX,FROMEPID,TOEPID,FORMAT,...) /* nothing */ + +#define PSM2_LOG_MSG_DUMP(ADDR,SIZE,FORMAT , ...) /* nothing */ + +#define PSM2_LOG_PKT_STRM(TXRX,IPS_MSG_HDRP,FORMAT, ...) /* nothing */ + +#endif /* #ifdef PSM_LOG */ + +#endif /* #ifndef _PSMI_LOG_H */ diff --git a/prov/psm3/psm3/psm_memcpy.c b/prov/psm3/psm3/psm_memcpy.c new file mode 100644 index 00000000000..b7c7a89523e --- /dev/null +++ b/prov/psm3/psm3/psm_memcpy.c @@ -0,0 +1,68 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include +#include +#include +#include + +#include "psm_user.h" +#include "psm_mq_internal.h" + +void *psmi_memcpyo(void *dst, const void *src, size_t n) +{ + psmi_mq_mtucpy(dst, src, n); + return dst; +} diff --git a/prov/psm3/psm3/psm_mock.c b/prov/psm3/psm3/psm_mock.c new file mode 100644 index 00000000000..bdcfd419094 --- /dev/null +++ b/prov/psm3/psm3/psm_mock.c @@ -0,0 +1,90 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "psm_user.h" +#include "psm_mq_internal.h" +#include "psm2_mock_testing.h" + +#ifdef PSM2_MOCK_TESTING +void MOCKABLE(psmi_mockable_lock_init)(psmi_lock_t *pl) +{ + _PSMI_LOCK_INIT(*pl); +} +MOCK_DEF_EPILOGUE(psmi_mockable_lock_init); +int MOCKABLE(psmi_mockable_lock_try)(psmi_lock_t *pl) +{ + int ret = _PSMI_LOCK_TRY(*pl); + return ret; +} +MOCK_DEF_EPILOGUE(psmi_mockable_lock_try); +void MOCKABLE(psmi_mockable_lock)(psmi_lock_t *pl) +{ + _PSMI_LOCK(*pl); +} +MOCK_DEF_EPILOGUE(psmi_mockable_lock); +void MOCKABLE(psmi_mockable_unlock)(psmi_lock_t *pl) +{ + _PSMI_UNLOCK(*pl); +} +MOCK_DEF_EPILOGUE(psmi_mockable_unlock); +void MOCKABLE(psmi_mockable_lock_assert)(psmi_lock_t *pl) +{ + _PSMI_LOCK_ASSERT(*pl); +} +MOCK_DEF_EPILOGUE(psmi_mockable_lock_assert); +void MOCKABLE(psmi_mockable_unlock_assert)(psmi_lock_t *pl) +{ + _PSMI_UNLOCK_ASSERT(*pl); +} +MOCK_DEF_EPILOGUE(psmi_mockable_unlock_assert); +#endif diff --git a/prov/psm3/psm3/psm_mpool.c b/prov/psm3/psm3/psm_mpool.c new file mode 100644 index 00000000000..1f2a365d334 --- /dev/null +++ b/prov/psm3/psm3/psm_mpool.c @@ -0,0 +1,573 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" + +#define PSMI_MPOOL_ALIGNMENT 64 + +struct mpool_element { + union { + SLIST_ENTRY(mpool_element) me_next; + mpool_t me_mpool; + }; + + uint32_t me_gen_count; + uint32_t me_index; +#ifdef PSM_DEBUG + uint32_t me_isused; +#endif +} __attribute__ ((aligned(8))); + +#ifdef PSM_DEBUG +# define me_mark_used(me) ((me)->me_isused = 1) +# define me_mark_unused(me) ((me)->me_isused = 0) +#else +# define me_mark_used(me) +# define me_mark_unused(me) +#endif + +struct mpool { + int mp_type; + int mp_flags; + int mp_vector_shift; + + uint32_t mp_elm_vector_size; + uint32_t mp_elm_offset; + uint32_t mp_num_obj; + uint32_t mp_num_obj_inuse; + uint32_t mp_elm_size; + uint32_t mp_obj_size; + uint32_t mp_num_obj_per_chunk; + uint32_t mp_num_obj_max_total; + psmi_memtype_t mp_memtype; + + SLIST_HEAD(, mpool_element) mp_head; + struct mpool_element **mp_elm_vector; + struct mpool_element **mp_elm_vector_free; + non_empty_callback_fn_t mp_non_empty_cb; + void *mp_non_empty_cb_context; + +#ifdef PSM_CUDA + alloc_dealloc_callback_fn_t mp_alloc_dealloc_cb; + void *mp_alloc_dealloc_cb_context; +#endif +}; + +static int psmi_mpool_allocate_chunk(mpool_t); + +/** + * psmi_mpool_create() + * + * Create a memory pool and allocates objects of size + * . If more memory is needed to accommodate mpool_get() + * requests, the memory pool will allocate another chunk of + * objects, until it reaches the maximum number of objects + * it can allocate. + * + * size of each individual object + * number of objects to allocate per chunk (power of two) + * total number of objects that may be allocated + * at any given time. Must be a power of two greater than + * . + * + * flags to be applied on the memory pool (ie. memory + * alignment) + * + * callback to be called when the memory pool has some + * free objects available again (after running out of them). + * context pointer for the callback + * + * Return the mpool on success, NULL on failure. 
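+ *
+ * A minimal usage sketch (the sizes below are arbitrary illustrations,
+ * struct my_obj is hypothetical, UNDEFINED is used as the stats type and
+ * no non-empty callback is registered):
+ *
+ *   mpool_t mp = psmi_mpool_create(sizeof(struct my_obj), 64, 4096,
+ *                                  0, UNDEFINED, NULL, NULL);
+ *   void *obj = psmi_mpool_get(mp);  // NULL once 4096 objects are in use
+ *   ...
+ *   psmi_mpool_put(obj);             // hand the object back for re-use
+ *   psmi_mpool_destroy(mp);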
+ */ +mpool_t +psmi_mpool_create_inner(size_t obj_size, uint32_t num_obj_per_chunk, + uint32_t num_obj_max_total, int flags, + psmi_memtype_t statstype, + non_empty_callback_fn_t cb, void *context) +{ + mpool_t mp; + int s; + size_t hdr_size; + + if (!PSMI_POWEROFTWO(num_obj_per_chunk) || + !PSMI_POWEROFTWO(num_obj_max_total) || + num_obj_max_total < num_obj_per_chunk) { + return NULL; + } + + mp = psmi_calloc(PSMI_EP_NONE, statstype, 1, sizeof(struct mpool)); + if (mp == NULL) { + fprintf(stderr, + "Failed to allocate memory for memory pool: %s\n", + strerror(errno)); + return NULL; + } + + for (s = 1; s < num_obj_per_chunk; s <<= 1) + mp->mp_vector_shift++; + + mp->mp_flags = flags; + mp->mp_num_obj_per_chunk = num_obj_per_chunk; + mp->mp_num_obj_max_total = num_obj_max_total; + mp->mp_non_empty_cb = cb; + mp->mp_non_empty_cb_context = context; + + mp->mp_memtype = statstype; + + SLIST_INIT(&mp->mp_head); + mp->mp_elm_vector_size = num_obj_max_total / num_obj_per_chunk; + mp->mp_elm_vector = + psmi_calloc(PSMI_EP_NONE, statstype, mp->mp_elm_vector_size, + sizeof(struct mpool_element *)); + if (mp->mp_elm_vector == NULL) { + fprintf(stderr, + "Failed to allocate memory for memory pool vector: " + "%s\n", strerror(errno)); + psmi_free(mp); + return NULL; + } + + mp->mp_elm_vector_free = mp->mp_elm_vector; + + if (flags & PSMI_MPOOL_ALIGN) { + /* User wants its block to start on a PSMI_MPOOL_ALIGNMENT + * boundary. */ + hdr_size = PSMI_ALIGNUP(sizeof(struct mpool_element), + PSMI_MPOOL_ALIGNMENT); + mp->mp_obj_size = PSMI_ALIGNUP(obj_size, PSMI_MPOOL_ALIGNMENT); + mp->mp_elm_size = hdr_size + mp->mp_obj_size; + + mp->mp_elm_offset = hdr_size - sizeof(struct mpool_element); + } else { + hdr_size = sizeof(struct mpool_element); + mp->mp_obj_size = PSMI_ALIGNUP(obj_size, 8); + mp->mp_elm_size = hdr_size + mp->mp_obj_size; + mp->mp_elm_offset = 0; + } + + return mp; +} + +mpool_t +MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk, + uint32_t num_obj_max_total, int flags, + psmi_memtype_t statstype, non_empty_callback_fn_t cb, + void *context) +{ + mpool_t mp; + + mp = psmi_mpool_create_inner(obj_size, num_obj_per_chunk, + num_obj_max_total, flags, statstype, + cb, context); + + if (mp == NULL) + return NULL; + + if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) { + psmi_mpool_destroy(mp); + return NULL; + } + + return mp; +} +MOCK_DEF_EPILOGUE(psmi_mpool_create); + +#ifdef PSM_CUDA +mpool_t +psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk, + uint32_t num_obj_max_total, int flags, + psmi_memtype_t statstype, + non_empty_callback_fn_t cb, void *context, + alloc_dealloc_callback_fn_t ad_cb, void *ad_context) +{ + mpool_t mp; + + mp = psmi_mpool_create_inner(obj_size, num_obj_per_chunk, + num_obj_max_total, flags, statstype, + cb, context); + + if (mp == NULL) + return NULL; + + mp->mp_alloc_dealloc_cb = ad_cb; + mp->mp_alloc_dealloc_cb_context = ad_context; + + if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) { + psmi_mpool_destroy(mp); + return NULL; + } + + return mp; +} +#endif + +/** + * psmi_mpool_get() + * + * memory pool + * + * Requests an object from the memory pool. + * + * Returns NULL if the maximum number of objects has been allocated (refer to + * in psmi_mpool_create) or if running out of memory. 
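+ *
+ * The pointer handed out is the address just past a hidden bookkeeping
+ * header, so psmi_mpool_put() and the accessors below recover the
+ * element with, in effect:
+ *
+ *   struct mpool_element *me = (struct mpool_element *)
+ *       ((uintptr_t)obj - sizeof(struct mpool_element));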
+ */ +void *psmi_mpool_get(mpool_t mp) +{ + struct mpool_element *me; + void *obj; + + if (SLIST_EMPTY(&mp->mp_head)) { + if (psmi_mpool_allocate_chunk(mp) != PSM2_OK) + return NULL; + } + + me = SLIST_FIRST(&mp->mp_head); + SLIST_REMOVE_HEAD(&mp->mp_head, me_next); + + psmi_assert(!me->me_isused); + me_mark_used(me); + + /* store a backpointer to the memory pool */ + me->me_mpool = mp; + mp->mp_num_obj_inuse++; + psmi_assert(mp->mp_num_obj_inuse <= mp->mp_num_obj); + + obj = (void *)((uintptr_t) me + sizeof(struct mpool_element)); + + return obj; +} + +/** + * psmi_mpool_put() + * + * object to return to the memory pool + * + * Returns an to the memory pool subsystem. This object will be re-used + * to fulfill new psmi_mpool_get() requests. + */ +void psmi_mpool_put(void *obj) +{ + struct mpool_element *me; + int was_empty; + mpool_t mp; + + me = (struct mpool_element *) + ((uintptr_t) obj - sizeof(struct mpool_element)); + me->me_gen_count++; + + mp = me->me_mpool; + + psmi_assert(mp != NULL); + psmi_assert(mp->mp_num_obj_inuse >= 0); + psmi_assert(me->me_isused); + me_mark_unused(me); + + was_empty = mp->mp_num_obj_inuse == mp->mp_num_obj_max_total; + SLIST_INSERT_HEAD(&mp->mp_head, me, me_next); + + mp->mp_num_obj_inuse--; + + /* tell the user that memory is available */ + if (mp->mp_non_empty_cb && was_empty) + mp->mp_non_empty_cb(mp->mp_non_empty_cb_context); +} + +/** + * psmi_mpool_get_obj_index() + * + * object in the memory pool + * + * Returns the index of the in the memory pool. + */ + +int psmi_mpool_get_obj_index(void *obj) +{ + struct mpool_element *me = (struct mpool_element *) + ((uintptr_t) obj - sizeof(struct mpool_element)); + + return me->me_index; +} + +/** + * psmi_mpool_get_obj_gen_count() + * + * object in the memory pool + * + * Returns the generation count of the . + */ +uint32_t psmi_mpool_get_obj_gen_count(void *obj) +{ + struct mpool_element *me = (struct mpool_element *) + ((uintptr_t) obj - sizeof(struct mpool_element)); + + return me->me_gen_count; +} + +/** + * psmi_mpool_get_obj_index_gen_count() + * + * object in the memory pool + * + * Returns the index of the in . + * Returns the generation count of the in . + */ +int +psmi_mpool_get_obj_index_gen_count(void *obj, uint32_t *index, + uint32_t *gen_count) +{ + struct mpool_element *me = (struct mpool_element *) + ((uintptr_t) obj - sizeof(struct mpool_element)); + + *index = me->me_index; + *gen_count = me->me_gen_count; + return 0; +} + +/** + * psmi_mpool_find_obj_by_index() + * + * memory pool + * index of the object + * + * Returns the object located at in the memory pool or NULL if the + * is invalid. + */ +void *psmi_mpool_find_obj_by_index(mpool_t mp, int index) +{ + struct mpool_element *me; + + if_pf(index < 0 || index >= mp->mp_num_obj) + return NULL; + + me = (struct mpool_element *) + ((uintptr_t) mp->mp_elm_vector[index >> mp->mp_vector_shift] + + (index & (mp->mp_num_obj_per_chunk - 1)) * mp->mp_elm_size + + mp->mp_elm_offset); + + /* If this mpool doesn't require generation counts, it's illegal to find a + * freed object */ +#ifdef PSM_DEBUG + if (mp->mp_flags & PSMI_MPOOL_NOGENERATION) + psmi_assert(!me->me_isused); +#endif + + return (void *)((uintptr_t) me + sizeof(struct mpool_element)); +} + +#ifdef PSM_CUDA +/** + * psmi_mpool_chunk_dealloc() + * memory pool + * index + * Calls the dealloc function on each element in the chunk. 
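+ *
+ * Typically reached from psmi_mpool_destroy() when an alloc/dealloc
+ * callback was registered, roughly:
+ *
+ *   for (i = 0; i < mp->mp_elm_vector_size; i++)
+ *       if (mp->mp_elm_vector[i] && mp->mp_alloc_dealloc_cb)
+ *           psmi_mpool_chunk_dealloc(mp, i);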
+ */ +void psmi_mpool_chunk_dealloc(mpool_t mp, int idx) +{ + int j; + for (j = 0; j < mp->mp_num_obj_per_chunk; j++) + mp->mp_alloc_dealloc_cb(0 /* is not alloc */, + mp->mp_alloc_dealloc_cb_context, + ((void *) mp->mp_elm_vector[idx]) + + j * mp->mp_elm_size + + sizeof(struct mpool_element)); +} +#endif +/** + * psmi_mpool_destroy() + * + * memory pool + * + * Destroy a previously allocated memory pool and reclaim its associated + * memory. The behavior is undefined if some objects have not been returned + * to the memory pool with psmi_mpool_put(). + */ +void psmi_mpool_destroy(mpool_t mp) +{ + int i = 0; + size_t nbytes = mp->mp_num_obj * mp->mp_elm_size; + + for (i = 0; i < mp->mp_elm_vector_size; i++) { + if (mp->mp_elm_vector[i]) { +#ifdef PSM_CUDA + if (mp->mp_alloc_dealloc_cb) + psmi_mpool_chunk_dealloc(mp, i); +#endif + psmi_free(mp->mp_elm_vector[i]); + } + } + psmi_free(mp->mp_elm_vector); + nbytes += mp->mp_elm_vector_size * sizeof(struct mpool_element *); + psmi_free(mp); + nbytes += sizeof(struct mpool); +} + +/** + * psmi_mpool_get_max_obj() + * + * memory pool + * + * Returns the num-obj-per-chunk + * Returns the num-obj-max-total + */ +void +MOCKABLE(psmi_mpool_get_obj_info)(mpool_t mp, uint32_t *num_obj_per_chunk, + uint32_t *num_obj_max_total) +{ + *num_obj_per_chunk = mp->mp_num_obj_per_chunk; + *num_obj_max_total = mp->mp_num_obj_max_total; + return; +} +MOCK_DEF_EPILOGUE(psmi_mpool_get_obj_info); + +static int psmi_mpool_allocate_chunk(mpool_t mp) +{ + struct mpool_element *elm; + void *chunk; + uint32_t i = 0, num_to_allocate; + + num_to_allocate = + mp->mp_num_obj + mp->mp_num_obj_per_chunk > + mp->mp_num_obj_max_total ? 0 : mp->mp_num_obj_per_chunk; + + psmi_assert(mp->mp_num_obj + num_to_allocate <= + mp->mp_num_obj_max_total); + + if (num_to_allocate == 0) + return PSM2_NO_MEMORY; + +#ifdef PSM_CUDA + if (mp->mp_alloc_dealloc_cb) + chunk = psmi_calloc(PSMI_EP_NONE, mp->mp_memtype, + num_to_allocate, mp->mp_elm_size); + else + chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype, + num_to_allocate * mp->mp_elm_size); +#else + chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype, + num_to_allocate * mp->mp_elm_size); +#endif + if (chunk == NULL) { + fprintf(stderr, + "Failed to allocate memory for memory pool chunk: %s\n", + strerror(errno)); + return PSM2_NO_MEMORY; + } + + for (i = 0; i < num_to_allocate; i++) { +#ifdef PSM_CUDA + if (mp->mp_alloc_dealloc_cb) + mp->mp_alloc_dealloc_cb(1 /* is alloc */, + mp->mp_alloc_dealloc_cb_context, + chunk + i * mp->mp_elm_size + + sizeof(struct mpool_element)); +#endif + elm = (struct mpool_element *)((uintptr_t) chunk + + i * mp->mp_elm_size + + mp->mp_elm_offset); + elm->me_gen_count = 0; + elm->me_index = mp->mp_num_obj + i; +#ifdef PSM_DEBUG + elm->me_isused = 0; +#endif + SLIST_INSERT_HEAD(&mp->mp_head, elm, me_next); +#if 0 + fprintf(stderr, "chunk%ld i=%d elm=%p user=%p next=%p\n", + (long)(mp->mp_elm_vector_free - mp->mp_elm_vector), + (int)i, elm, + (void *)((uintptr_t) elm + + sizeof(struct mpool_element)), SLIST_NEXT(elm, + me_next)); +#endif + } + + psmi_assert((uintptr_t) mp->mp_elm_vector_free + < ((uintptr_t) mp->mp_elm_vector) + mp->mp_elm_vector_size + * sizeof(struct mpool_element *)); + + mp->mp_elm_vector_free[0] = chunk; + mp->mp_elm_vector_free++; + mp->mp_num_obj += num_to_allocate; + + return PSM2_OK; +} + +#if 0 +void psmi_mpool_dump(mpool_t mp) +{ + int i, j; + struct mpool_element *me; + + fprintf(stderr, "Memory pool %p has %d elements per chunk.\n", + mp, mp->mp_num_obj_per_chunk); + for (i = 0; i < 
mp->mp_elm_vector_size; i++) { + if (mp->mp_elm_vector[i] != NULL) { + fprintf(stderr, "===========================\n"); + fprintf(stderr, "mpool chunk #%d\n", i); + + for (j = 0, me = mp->mp_elm_vector[i]; + j < mp->mp_num_obj_per_chunk; + j++, me = (struct mpool_element *) + ((uintptr_t) me + mp->mp_elm_size)) { + fprintf(stderr, + "obj=%p index=%d gen_count=%d\n", + (void *)((uintptr_t) me + + sizeof(struct mpool_element)), + me->me_index, me->me_gen_count); + } + fprintf(stderr, "===========================\n"); + } + } +} +#endif diff --git a/prov/psm3/psm3/psm_mpool.h b/prov/psm3/psm3/psm_mpool.h new file mode 100644 index 00000000000..8098f60ce71 --- /dev/null +++ b/prov/psm3/psm3/psm_mpool.h @@ -0,0 +1,107 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ + +#ifndef _PSMI_IN_USER_H +#error psm_mpool.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef PSM_MPOOL_H +#define PSM_MPOOL_H + +/* mpool flags */ +#define PSMI_MPOOL_ALIGN_CACHE 0x1 +#define PSMI_MPOOL_ALIGN_PAGE 0x2 +#define PSMI_MPOOL_NOGENERATION 0x4 + +/* Backwards compatibility */ +#define PSMI_MPOOL_ALIGN PSMI_MPOOL_ALIGN_CACHE + +typedef struct mpool *mpool_t; +typedef void (*non_empty_callback_fn_t) (void *context); +typedef void (*alloc_dealloc_callback_fn_t) (int is_alloc, void *context, + void *chunk); + +mpool_t +MOCKABLE(psmi_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk, + uint32_t num_obj_max_total, int flags, + psmi_memtype_t statstype, + non_empty_callback_fn_t cb, void *context); +MOCK_DCL_EPILOGUE(psmi_mpool_create); + +mpool_t psmi_mpool_create_for_cuda(size_t obj_size, uint32_t num_obj_per_chunk, + uint32_t num_obj_max_total, int flags, + psmi_memtype_t statstype, + non_empty_callback_fn_t cb, void *context, + alloc_dealloc_callback_fn_t ad_cb, + void *ad_context); + +void psmi_mpool_destroy(mpool_t mp); + +void +MOCKABLE(psmi_mpool_get_obj_info)(mpool_t mp, uint32_t *num_obj_per_chunk, + uint32_t *num_obj_max_total); +MOCK_DCL_EPILOGUE(psmi_mpool_get_obj_info); + +void *psmi_mpool_get(mpool_t mp); +void psmi_mpool_put(void *obj); + +int psmi_mpool_get_obj_index(void *obj); +uint32_t psmi_mpool_get_obj_gen_count(void *obj); +int psmi_mpool_get_obj_index_gen_count(void *obj, + uint32_t *index, uint32_t *gen_count); + +void *psmi_mpool_find_obj_by_index(mpool_t mp, int index); + +#endif diff --git a/prov/psm3/psm3/psm_mq.c b/prov/psm3/psm3/psm_mq.c new file mode 100644 index 00000000000..c4891987806 --- /dev/null +++ b/prov/psm3/psm3/psm_mq.c @@ -0,0 +1,1654 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +#include + +#include "psm_user.h" +#include "psm2_hal.h" +#include "psm_mq_internal.h" +#include "ips_proto_params.h" + +#ifdef PSM_CUDA +#include "psm_gdrcpy.h" +#endif + +/* + * Functions to manipulate the expected queue in mq_ep. + */ + +/* + * Once the linked lists cross the size limit, this function will enable tag + * hashing and disable the non-hashing fastpath. We need to go back and insert + * reqs into the hash tables where the hashing searches will look for them. + */ +void +psmi_mq_fastpath_disable(psm2_mq_t mq) +{ + psm2_mq_req_t *curp, cur; + struct mqq *qp; + unsigned hashvals[NUM_HASH_CONFIGS]; + int t = PSM2_ANYTAG_ANYSRC; + + mq->nohash_fastpath = 0; + /* Everything in the unexpected_q needs to be duplicated into + each of the (three) unexpected hash tables. */ + qp = &mq->unexpected_q; + for (curp = &qp->first; (cur = *curp) != NULL; curp = &cur->next[t]) { + mq->unexpected_hash_len++; + hashvals[PSM2_TAG_SRC] = + hash_64(cur->req_data.tag.tag64) % NUM_HASH_BUCKETS; + hashvals[PSM2_TAG_ANYSRC] = + hash_32(cur->req_data.tag.tag[0]) % NUM_HASH_BUCKETS; + hashvals[PSM2_ANYTAG_SRC] = + hash_32(cur->req_data.tag.tag[1]) % NUM_HASH_BUCKETS; + for (t = PSM2_TAG_SRC; t < PSM2_ANYTAG_ANYSRC; t++) + mq_qq_append_which(mq->unexpected_htab, + t, hashvals[t], cur); + } + + /* Everything in the expected_q needs to be moved into the + (single) correct expected hash table. 
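/*
 * Editor's illustrative sketch -- not part of this patch. It restates the
 * sublist-selection rule applied just below and again in
 * mq_req_match_with_tagsel() and mq_add_to_expected_hashes(): a request is
 * hashable only when the corresponding tagsel words are fully selected;
 * tag[2] never participates in hashing.
 */
static int example_which_sublist(const psm2_mq_tag_t *tagsel)
{
	int full0 = (tagsel->tag[0] == 0xFFFFFFFF);
	int full1 = (tagsel->tag[1] == 0xFFFFFFFF);

	if (full0 && full1)
		return PSM2_TAG_SRC;		/* bucket = hash_64(tag.tag64)  */
	if (full0)
		return PSM2_TAG_ANYSRC;		/* bucket = hash_32(tag.tag[0]) */
	if (full1)
		return PSM2_ANYTAG_SRC;		/* bucket = hash_32(tag.tag[1]) */
	return PSM2_ANYTAG_ANYSRC;		/* linear list, no hashing */
}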
*/ + qp = &mq->expected_q; + for (curp = &qp->first; (cur = *curp) != NULL; /*curp = &cur->next*/) { + /* must read next ptr before remove */ + curp = &cur->next[PSM2_ANYTAG_ANYSRC]; + if ((cur->req_data.tagsel.tag[0] == 0xFFFFFFFF) && + (cur->req_data.tagsel.tag[1] == 0xFFFFFFFF)) { + /* hash tag0 and tag1 */ + t = PSM2_TAG_SRC; + hashvals[t] = hash_64(cur->req_data.tag.tag64) % NUM_HASH_BUCKETS; + mq_qq_append_which(mq->expected_htab, + t, hashvals[t], cur); + } else if (cur->req_data.tagsel.tag[0] == 0xFFFFFFFF) { + t = PSM2_TAG_ANYSRC; + hashvals[t] = hash_32(cur->req_data.tag.tag[0]) % NUM_HASH_BUCKETS; + mq_qq_append_which(mq->expected_htab, + t, hashvals[t], cur); + } else if (cur->req_data.tagsel.tag[1] == 0xFFFFFFFF) { + t = PSM2_ANYTAG_SRC; + hashvals[t] = hash_32(cur->req_data.tag.tag[1]) % NUM_HASH_BUCKETS; + mq_qq_append_which(mq->expected_htab, + t, hashvals[t], cur); + } else + continue; /* else, req must stay in ANY ANY */ + + mq->expected_list_len--; + mq->expected_hash_len++; + mq_qq_remove_which(cur, PSM2_ANYTAG_ANYSRC); + } +} + +/* easy threshold to re-enable: if |hash| == 0 && |list| < X + aggressive threshold: if |hash| + |list| < X + even easier: if |hash| + |list| == 0 + might be better approach to avoid constant bouncing between modes */ +void psmi_mq_fastpath_try_reenable(psm2_mq_t mq) +{ + if_pf(mq->nohash_fastpath == 0 && + mq->unexpected_hash_len == 0 && + mq->expected_hash_len == 0 && + mq->unexpected_list_len == 0 && + mq->expected_list_len == 0){ + mq->nohash_fastpath = 1; + } +} + +/* + * ! @brief PSM exposed version to allow PTLs to match + */ + +/*! @brief Try to match against the MQ using a tag and tagsel + * + * @param[in] mq Message Queue + * @param[in] src Source (sender) epaddr, may be PSM2_MQ_ANY_ADDR. + * @param[in] tag Input Tag + * @param[in] tagsel Input Tag Selector + * @param[in] remove Non-zero to remove the req from the queue + * + * @returns NULL if no match or an mq request if there is a match + */ +static +psm2_mq_req_t +mq_req_match_with_tagsel(psm2_mq_t mq, psm2_epaddr_t src, + psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, int remove) +{ + psm2_mq_req_t *curp; + psm2_mq_req_t cur; + unsigned hashval; + int i, j = 0; + struct mqq *qp; + + if_pt (mq->nohash_fastpath) { + i = j = PSM2_ANYTAG_ANYSRC; + qp = &mq->unexpected_q; + } else if ((tagsel->tag[0] == 0xFFFFFFFF) && + (tagsel->tag[1] == 0xFFFFFFFF)) { + i = PSM2_TAG_SRC; + hashval = hash_64(tag->tag64) % NUM_HASH_BUCKETS; + qp = &mq->unexpected_htab[i][hashval]; + } else if (tagsel->tag[0] == 0xFFFFFFFF) { + i = PSM2_TAG_ANYSRC; + hashval = hash_32(tag->tag[0]) % NUM_HASH_BUCKETS; + qp = &mq->unexpected_htab[i][hashval]; + } else if (tagsel->tag[1] == 0xFFFFFFFF) { + i = PSM2_ANYTAG_SRC; + hashval = hash_32(tag->tag[1]) % NUM_HASH_BUCKETS; + qp = &mq->unexpected_htab[i][hashval]; + } else { + /* unhashable tag */ + i = PSM2_ANYTAG_ANYSRC; + qp = &mq->unexpected_q; + } + + for (curp = &qp->first; (cur = *curp) != NULL; curp = &cur->next[i]) { + psmi_assert(cur->req_data.peer != PSM2_MQ_ANY_ADDR); + if ((src == PSM2_MQ_ANY_ADDR || src == cur->req_data.peer) && + !((tag->tag[0] ^ cur->req_data.tag.tag[0]) & tagsel->tag[0]) && + !((tag->tag[1] ^ cur->req_data.tag.tag[1]) & tagsel->tag[1]) && + !((tag->tag[2] ^ cur->req_data.tag.tag[2]) & tagsel->tag[2])) { + /* match! 
*/ + if (remove) { + if_pt (i == PSM2_ANYTAG_ANYSRC) + mq->unexpected_list_len--; + else + mq->unexpected_hash_len--; + for (; j < NUM_MQ_SUBLISTS; j++) + mq_qq_remove_which(cur, j); + psmi_mq_fastpath_try_reenable(mq); + } + return cur; + } + } + return NULL; +} + +static void mq_add_to_expected_hashes(psm2_mq_t mq, psm2_mq_req_t req) +{ + unsigned hashval; + int i; + + req->timestamp = mq->timestamp++; + if_pt (mq->nohash_fastpath) { + mq_qq_append(&mq->expected_q, req); + req->q[PSM2_ANYTAG_ANYSRC] = &mq->expected_q; + mq->expected_list_len++; + if_pf (mq->expected_list_len >= HASH_THRESHOLD) + psmi_mq_fastpath_disable(mq); + } else if ((req->req_data.tagsel.tag[0] == 0xFFFFFFFF) && + (req->req_data.tagsel.tag[1] == 0xFFFFFFFF)) { + i = PSM2_TAG_SRC; + hashval = hash_64(req->req_data.tag.tag64) % NUM_HASH_BUCKETS; + mq_qq_append_which(mq->expected_htab, i, hashval, req); + mq->expected_hash_len++; + } else if (req->req_data.tagsel.tag[0] == 0xFFFFFFFF) { + i = PSM2_TAG_ANYSRC; + hashval = hash_32(req->req_data.tag.tag[0]) % NUM_HASH_BUCKETS; + mq_qq_append_which(mq->expected_htab, i, hashval, req); + mq->expected_hash_len++; + } else if (req->req_data.tagsel.tag[1] == 0xFFFFFFFF) { + i = PSM2_ANYTAG_SRC; + hashval = hash_32(req->req_data.tag.tag[1]) % NUM_HASH_BUCKETS; + mq_qq_append_which(mq->expected_htab, i, hashval, req); + mq->expected_hash_len++; + } else { + mq_qq_append(&mq->expected_q, req); + req->q[PSM2_ANYTAG_ANYSRC] = &mq->expected_q; + mq->expected_list_len++; + } +} + +/*! @brief Try to remove the req in the MQ + * + * @param[in] mq Message Queue + * @param[in] req MQ request + * + * @returns 1 if successfully removed, or 0 if req cannot be found. + */ +static +int mq_req_remove_single(psm2_mq_t mq, psm2_mq_req_t req) +{ + int i; + + /* item should only exist in one expected queue at a time */ + psmi_assert((!!req->q[0] + !!req->q[1] + !!req->q[2] + !!req->q[3]) == 1); + + for (i = 0; i < NUM_MQ_SUBLISTS; i++) + if (req->q[i]) /* found */ + break; + switch (i) { + case PSM2_ANYTAG_ANYSRC: + mq->expected_list_len--; + break; + case PSM2_TAG_SRC: + case PSM2_TAG_ANYSRC: + case PSM2_ANYTAG_SRC: + mq->expected_hash_len--; + break; + default: + return 0; + } + + mq_qq_remove_which(req, i); + psmi_mq_fastpath_try_reenable(mq); + return 1; +} + +PSMI_ALWAYS_INLINE( +psm2_mq_req_t +psmi_mq_iprobe_inner(psm2_mq_t mq, psm2_epaddr_t src, + psm2_mq_tag_t *tag, + psm2_mq_tag_t *tagsel, int remove_req)) +{ + psm2_mq_req_t req; + + PSMI_LOCK(mq->progress_lock); + req = mq_req_match_with_tagsel(mq, src, tag, tagsel, remove_req); + + if (req != NULL) { + PSMI_UNLOCK(mq->progress_lock); + return req; + } + + psmi_poll_internal(mq->ep, 1); + /* try again */ + req = mq_req_match_with_tagsel(mq, src, tag, tagsel, remove_req); + + PSMI_UNLOCK(mq->progress_lock); + return req; +} + +psm2_error_t +__psm2_mq_iprobe2(psm2_mq_t mq, psm2_epaddr_t src, + psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, + psm2_mq_status2_t *status) +{ + psm2_mq_req_t req; + + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + + req = psmi_mq_iprobe_inner(mq, src, tag, tagsel, 0); + psmi_assert_req_not_internal(req); + + if (req != NULL) { + if (status != NULL) { + mq_status2_copy(req, status); + } + PSM2_LOG_MSG("leaving"); + return PSM2_OK; + } + PSM2_LOG_MSG("leaving"); + return PSM2_MQ_NO_COMPLETIONS; +} +PSMI_API_DECL(psm2_mq_iprobe2) + +psm2_error_t +__psm2_mq_iprobe(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, + psm2_mq_status_t *status) +{ + psm2_mq_tag_t rtag; + psm2_mq_tag_t rtagsel; + psm2_mq_req_t req; + + 
PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + + rtag.tag64 = tag; +#ifdef PSM_DEBUG + rtag.tag[2] = 0; +#endif + rtagsel.tag64 = tagsel; + rtagsel.tag[2] = 0; + + req = psmi_mq_iprobe_inner(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel, 0); + psmi_assert_req_not_internal(req); + + if (req != NULL) { + if (status != NULL) { + mq_status_copy(req, status); + } + PSM2_LOG_MSG("leaving"); + return PSM2_OK; + } + + PSM2_LOG_MSG("leaving"); + + return PSM2_MQ_NO_COMPLETIONS; +} +PSMI_API_DECL(psm2_mq_iprobe) + +psm2_error_t +__psm2_mq_improbe2(psm2_mq_t mq, psm2_epaddr_t src, + psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, + psm2_mq_req_t *reqo, psm2_mq_status2_t *status) +{ + psm2_mq_req_t req; + + PSM2_LOG_MSG("entering"); + + PSMI_ASSERT_INITIALIZED(); + + req = psmi_mq_iprobe_inner(mq, src, tag, tagsel, 1); + if (req != NULL) { + if (status != NULL) { + mq_status2_copy(req, status); + } + *reqo = req; + PSM2_LOG_MSG("leaving"); + return PSM2_OK; + } + + *reqo = NULL; + PSM2_LOG_MSG("leaving"); + return PSM2_MQ_NO_COMPLETIONS; +} +PSMI_API_DECL(psm2_mq_improbe2) + +psm2_error_t +__psm2_mq_improbe(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, + psm2_mq_req_t *reqo, psm2_mq_status_t *status) +{ + psm2_mq_tag_t rtag; + psm2_mq_tag_t rtagsel; + psm2_mq_req_t req; + + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + + rtag.tag64 = tag; +#ifdef PSM_DEBUG + rtag.tag[2] = 0; +#endif + rtagsel.tag64 = tagsel; + rtagsel.tag[2] = 0; + + req = psmi_mq_iprobe_inner(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel, 1); + if (req != NULL) { + if (status != NULL) { + mq_status_copy(req, status); + } + *reqo = req; + PSM2_LOG_MSG("leaving"); + return PSM2_OK; + } + + *reqo = NULL; + PSM2_LOG_MSG("leaving"); + return PSM2_MQ_NO_COMPLETIONS; +} +PSMI_API_DECL(psm2_mq_improbe) + +psm2_error_t __psm2_mq_cancel(psm2_mq_req_t *ireq) +{ + psm2_mq_req_t req = *ireq; + psm2_mq_t mq; + psm2_error_t err = PSM2_OK; + + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + + if (req == NULL) { + PSM2_LOG_MSG("leaving"); + return PSM2_MQ_NO_COMPLETIONS; + } + + /* Cancelling a send is a blocking operation, and expensive. + * We only allow cancellation of rendezvous sends, consider the eager sends + * as always unsuccessfully cancelled. + */ + mq = req->mq; + PSMI_LOCK(mq->progress_lock); + + if (MQE_TYPE_IS_RECV(req->type)) { + if (req->state == MQ_STATE_POSTED) { + int rc; + + rc = mq_req_remove_single(mq, req); + psmi_assert_always(rc); + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + err = PSM2_OK; + } else + err = PSM2_MQ_NO_COMPLETIONS; + } else { + err = psmi_handle_error(mq->ep, PSM2_PARAM_ERR, + "Cannot cancel send requests (req=%p)", + req); + } + + PSMI_UNLOCK(mq->progress_lock); + + PSM2_LOG_MSG("leaving"); + + return err; +} +PSMI_API_DECL(psm2_mq_cancel) + +/* This is the only PSM function that blocks. + * We handle it in a special manner since we don't know what the user's + * execution environment is (threads, oversubscribing processes, etc). + * + * The status argument can be an instance of either type psm2_mq_status_t or + * psm2_mq_status2_t. Depending on the type, a corresponding status copy + * routine should be passed in. 
+ */ +PSMI_ALWAYS_INLINE( +psm2_error_t +psmi_mq_wait_inner(psm2_mq_req_t *ireq, void *status, + psmi_mq_status_copy_t status_copy, + int do_lock)) +{ + psm2_error_t err = PSM2_OK; + + psm2_mq_req_t req = *ireq; + if (req == PSM2_MQ_REQINVALID) { + return PSM2_OK; + } + + if (do_lock) + PSMI_LOCK(req->mq->progress_lock); + + if (req->state != MQ_STATE_COMPLETE) { + psm2_mq_t mq = req->mq; + + /* We'll be waiting on this req, mark it as so */ + req->type |= MQE_TYPE_WAITING; + + _HFI_VDBG("req=%p, buf=%p, len=%d, waiting\n", + req, req->req_data.buf, req->req_data.buf_len); + + if (req->testwait_callback) { + err = req->testwait_callback(ireq); + if (do_lock) + PSMI_UNLOCK(req->mq->progress_lock); + if (status != NULL) { + status_copy(req, status); + } + return err; + } + + PSMI_BLOCKUNTIL(mq->ep, err, req->state == MQ_STATE_COMPLETE); + + if (err > PSM2_OK_NO_PROGRESS) + goto fail_with_lock; + else + err = PSM2_OK; + } + + if(!psmi_is_req_internal(req)) + mq_qq_remove(&req->mq->completed_q, req); + + if (status != NULL) { + status_copy(req, status); + } + + _HFI_VDBG("req=%p complete, buf=%p, len=%d, err=%d\n", + req, req->req_data.buf, req->req_data.buf_len, req->req_data.error_code); + + psmi_mq_req_free(req); + *ireq = PSM2_MQ_REQINVALID; + +fail_with_lock: + if (do_lock) + PSMI_UNLOCK(req->mq->progress_lock); + return err; +} + +psm2_error_t +__psm2_mq_wait2(psm2_mq_req_t *ireq, psm2_mq_status2_t *status) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + psmi_assert_req_not_internal(*ireq); + + rv = psmi_mq_wait_inner(ireq, status, + (psmi_mq_status_copy_t) mq_status2_copy, 1); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_wait2) + +psm2_error_t +__psm2_mq_wait(psm2_mq_req_t *ireq, psm2_mq_status_t *status) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + psmi_assert_req_not_internal(*ireq); + + rv = psmi_mq_wait_inner(ireq, status, + (psmi_mq_status_copy_t) mq_status_copy, 1); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_wait) + +psm2_error_t psmi_mq_wait_internal(psm2_mq_req_t *ireq) +{ + return psmi_mq_wait_inner(ireq, NULL, NULL, 0); +} + +/* The status argument can be an instance of either type psm2_mq_status_t or + * psm2_mq_status2_t. Depending on the type, a corresponding status copy + * routine should be passed in. 
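/*
 * Editor's illustrative sketch -- not part of this patch. It ties together
 * the matched-probe entry points defined in this file: psm2_mq_improbe2()
 * dequeues a matching unexpected request, psm2_mq_imrecv() supplies the
 * destination buffer, and the blocking psm2_mq_wait2() described above
 * completes it. Error handling is reduced to early returns.
 */
static void example_matched_probe(psm2_mq_t mq, psm2_mq_tag_t *tag,
				  psm2_mq_tag_t *tagsel, void *buf, uint32_t len)
{
	psm2_mq_req_t req;
	psm2_mq_status2_t status;

	if (psm2_mq_improbe2(mq, PSM2_MQ_ANY_ADDR, tag, tagsel, &req, &status)
	    != PSM2_OK)
		return;		/* no matching message has arrived yet */
	if (psm2_mq_imrecv(mq, 0, buf, len, NULL, &req) != PSM2_OK)
		return;
	psm2_mq_wait2(&req, &status);	/* blocks until the payload has landed */
}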
+ */ +PSMI_ALWAYS_INLINE( +psm2_error_t +psmi_mq_test_inner(psm2_mq_req_t *ireq, void *status, + psmi_mq_status_copy_t status_copy)) +{ + psm2_mq_req_t req = *ireq; + psm2_error_t err = PSM2_OK; + + PSMI_ASSERT_INITIALIZED(); + + if (req == PSM2_MQ_REQINVALID) { + return PSM2_OK; + } + + if (req->state != MQ_STATE_COMPLETE) { + if (req->testwait_callback) { + PSMI_LOCK(req->mq->progress_lock); + err = req->testwait_callback(ireq); + if (status != NULL) { + status_copy(req, status); + } + PSMI_UNLOCK(req->mq->progress_lock); + return err; + } else + return PSM2_MQ_NO_COMPLETIONS; + } + + if (status != NULL) + status_copy(req, status); + + _HFI_VDBG + ("req=%p complete, tag=%08x.%08x.%08x buf=%p, len=%d, err=%d\n", + req, req->req_data.tag.tag[0], req->req_data.tag.tag[1], + req->req_data.tag.tag[2], req->req_data.buf, + req->req_data.buf_len, req->req_data.error_code); + + PSMI_LOCK(req->mq->progress_lock); + mq_qq_remove(&req->mq->completed_q, req); + psmi_mq_req_free(req); + PSMI_UNLOCK(req->mq->progress_lock); + + *ireq = PSM2_MQ_REQINVALID; + + return err; +} + +psm2_error_t +__psm2_mq_test2(psm2_mq_req_t *ireq, psm2_mq_status2_t *status) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + rv = psmi_mq_test_inner(ireq, status, + (psmi_mq_status_copy_t) mq_status2_copy); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_test2) + +psm2_error_t +__psm2_mq_test(psm2_mq_req_t *ireq, psm2_mq_status_t *status) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + rv = psmi_mq_test_inner(ireq, status, + (psmi_mq_status_copy_t) mq_status_copy); + PSM2_LOG_MSG("leaving"); + return rv; + +} +PSMI_API_DECL(psm2_mq_test) + +psm2_error_t +__psm2_mq_isend2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, + psm2_mq_tag_t *stag, const void *buf, uint32_t len, + void *context, psm2_mq_req_t *req) +{ + psm2_error_t err; + + PSM2_LOG_MSG("entering"); + + PSMI_ASSERT_INITIALIZED(); + psmi_assert(stag != NULL); + + PSMI_LOCK(mq->progress_lock); + err = + dest->ptlctl->mq_isend(mq, dest, flags, PSMI_REQ_FLAG_NORMAL, + stag, buf, len, context, req); + PSMI_UNLOCK(mq->progress_lock); + + psmi_assert(*req != NULL); + psmi_assert_req_not_internal(*req); + + (*req)->req_data.peer = dest; + + PSM2_LOG_MSG("leaving"); + + return err; +} +PSMI_API_DECL(psm2_mq_isend2) + +psm2_error_t +__psm2_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, + const void *buf, uint32_t len, void *context, psm2_mq_req_t *req) +{ + psm2_error_t err; + psm2_mq_tag_t tag; + + PSM2_LOG_MSG("entering"); + + tag.tag64 = stag; + tag.tag[2] = 0; + + PSMI_ASSERT_INITIALIZED(); + + PSMI_LOCK(mq->progress_lock); + err = dest->ptlctl->mq_isend(mq, dest, flags, PSMI_REQ_FLAG_NORMAL, + &tag, buf, len, context, req); + PSMI_UNLOCK(mq->progress_lock); + + psmi_assert(*req != NULL); + psmi_assert_req_not_internal(*req); + + (*req)->req_data.peer = dest; + + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_mq_isend) + +psm2_error_t +__psm2_mq_send2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, + psm2_mq_tag_t *stag, const void *buf, uint32_t len) +{ + psm2_error_t err; + + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + psmi_assert(stag != NULL); + + PSMI_LOCK(mq->progress_lock); + err = dest->ptlctl->mq_send(mq, dest, flags, stag, buf, len); + PSMI_UNLOCK(mq->progress_lock); + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_mq_send2) + +psm2_error_t +__psm2_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, + const void *buf, uint32_t len) +{ + 
psm2_error_t err; + psm2_mq_tag_t tag; + + PSM2_LOG_MSG("entering stag: 0x%" PRIx64, stag); + + tag.tag64 = stag; + tag.tag[2] = 0; + + PSMI_ASSERT_INITIALIZED(); + + PSMI_LOCK(mq->progress_lock); + err = dest->ptlctl->mq_send(mq, dest, flags, &tag, buf, len); + PSMI_UNLOCK(mq->progress_lock); + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_mq_send) + +/* + * Common subroutine to psm2_mq_irecv2 and psm2_mq_imrecv. This code assumes + * that the provided request has been matched, and begins copying message data + * that has already arrived to the user's buffer. Any remaining data is copied + * by PSM polling until the message is complete. + */ +static psm2_error_t +psm2_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) +{ + uint32_t copysz; + + PSM2_LOG_MSG("entering"); + psmi_assert(MQE_TYPE_IS_RECV(req->type)); + psmi_mtucpy_fn_t psmi_mtucpy_fn = psmi_mq_mtucpy; +#if defined(PSM_CUDA) + int converted = 0; + if (!req->is_buf_gpu_mem) + psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; +#endif // PSM_CUDA + + _HFI_VDBG("(req=%p) buf=%p len=%u req.state=%u\n", req, buf, len, req->state); + + switch (req->state) { + case MQ_STATE_COMPLETE: + if (req->req_data.buf != NULL) { /* 0-byte messages don't alloc a sysbuf */ + copysz = mq_set_msglen(req, len, req->req_data.send_msglen); + void *ubuf = buf; +#ifdef PSM_CUDA + if (PSMI_USE_GDR_COPY(req, len)) { + ubuf = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)buf, + len, 1, + mq->ep->epaddr->proto); + psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; + converted = 1; + } +#endif // PSM_CUDA + psmi_mtucpy_fn(ubuf, (const void *)req->req_data.buf, copysz); +#if defined(PSM_CUDA) + if (converted) { + gdr_unmap_gpu_host_addr(GDR_FD, ubuf, len, + mq->ep->epaddr->proto); + } +#endif // PSM_CUDA + psmi_mq_sysbuf_free(mq, req->req_data.buf); + } + req->req_data.buf = buf; + req->req_data.buf_len = len; + mq_qq_append(&mq->completed_q, req); + break; + + case MQ_STATE_UNEXP: /* not done yet */ + copysz = mq_set_msglen(req, len, req->req_data.send_msglen); + /* Copy What's been received so far and make sure we don't receive + * any more than copysz. After that, swap system with user buffer + */ + req->recv_msgoff = min(req->recv_msgoff, copysz); + +#ifdef PSM_CUDA + if (PSMI_USE_GDR_COPY(req, req->req_data.send_msglen)) { + buf = gdr_convert_gpu_to_host_addr(GDR_FD, (unsigned long)req->user_gpu_buffer, + req->req_data.send_msglen, 1, + mq->ep->epaddr->proto); + psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; + converted = 1; + } +#endif // PSM_CUDA + + if (req->recv_msgoff) { + psmi_mtucpy_fn(buf, (const void *)req->req_data.buf, + req->recv_msgoff); + } +#if defined(PSM_CUDA) + if (converted) { + gdr_unmap_gpu_host_addr(GDR_FD, buf, req->req_data.send_msglen, + mq->ep->epaddr->proto); + } +#endif // PSM_CUDA + psmi_mq_sysbuf_free(mq, req->req_data.buf); + + req->state = MQ_STATE_MATCHED; + req->req_data.buf = buf; + req->req_data.buf_len = len; + break; + + case MQ_STATE_UNEXP_RV: /* rendez-vous ... */ + copysz = mq_set_msglen(req, len, req->req_data.send_msglen); + /* Copy What's been received so far and make sure we don't receive + * any more than copysz. 
After that, swap system with user buffer + */ + req->recv_msgoff = min(req->recv_msgoff, copysz); + if (req->recv_msgoff) { + psmi_mtucpy_fn(buf, (const void *)req->req_data.buf, + req->recv_msgoff); + } + if (req->send_msgoff) { + psmi_mq_sysbuf_free(mq, req->req_data.buf); + } + + req->state = MQ_STATE_MATCHED; + req->req_data.buf = buf; + req->req_data.buf_len = len; + req->rts_callback(req, 0); + break; + + default: + fprintf(stderr, "Unexpected state %d in req %p\n", req->state, + req); + fprintf(stderr, "type=%d, mq=%p, tag=%08x.%08x.%08x\n", + req->type, req->mq, req->req_data.tag.tag[0], req->req_data.tag.tag[1], + req->req_data.tag.tag[2]); + abort(); + } + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} + +psm2_error_t +__psm2_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *tag, + psm2_mq_tag_t *tagsel, uint32_t flags, void *buf, uint32_t len, + void *context, enum psm2_mq_fp_op fp_type, psm2_mq_req_t *req) +{ + psm2_error_t err = PSM2_OK; + + PSM2_LOG_MSG("entering"); + + PSMI_ASSERT_INITIALIZED(); + + PSMI_LOCK_ASSERT(mq->progress_lock); + + if (fp_type == PSM2_MQ_ISEND_FP) { + psmi_assert(tag != NULL); + err = + addr->ptlctl->mq_isend(mq, addr, flags, PSMI_REQ_FLAG_FASTPATH, + tag, buf, len, context, req); + + psmi_assert(*req != NULL); + psmi_assert_req_not_internal(*req); + + (*req)->req_data.peer = addr; + } else if (fp_type == PSM2_MQ_IRECV_FP) { + psm2_mq_req_t recv_req; + +#ifdef PSM_CUDA + int gpu_mem = 0; + void *gpu_user_buffer = NULL; + + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(buf)) { + psmi_cuda_set_attr_sync_memops(buf); + + gpu_mem = 1; + gpu_user_buffer = buf; + } +#endif + + /* First check unexpected Queue and remove req if found */ + recv_req = mq_req_match_with_tagsel(mq, addr, tag, tagsel, REMOVE_ENTRY); + + if (recv_req == NULL) { + /* prepost before arrival, add to expected q */ + recv_req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + if_pf(recv_req == NULL) { + err = PSM2_NO_MEMORY; + goto recv_ret; + } + + recv_req->req_data.peer = addr; + recv_req->req_data.tag = *tag; + recv_req->req_data.tagsel = *tagsel; + recv_req->state = MQ_STATE_POSTED; + recv_req->req_data.buf = buf; + recv_req->req_data.buf_len = len; + recv_req->req_data.recv_msglen = len; + recv_req->recv_msgoff = 0; + recv_req->req_data.context = context; + +#ifdef PSM_CUDA + recv_req->is_buf_gpu_mem = gpu_mem; + recv_req->user_gpu_buffer = gpu_user_buffer; +#endif + + mq_add_to_expected_hashes(mq, recv_req); + _HFI_VDBG("buf=%p,len=%d,tag=%08x.%08x.%08x " + " tagsel=%08x.%08x.%08x req=%p\n", + buf, len, tag->tag[0], tag->tag[1], tag->tag[2], + tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], recv_req); + } else { + _HFI_VDBG("unexpected buf=%p,len=%d,tag=%08x.%08x.%08x" + " tagsel=%08x.%08x.%08x req=%p\n", buf, len, + tag->tag[0], tag->tag[1], tag->tag[2], + tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], recv_req); + +#ifdef PSM_CUDA + recv_req->is_buf_gpu_mem = gpu_mem; + recv_req->user_gpu_buffer = gpu_user_buffer; +#endif + + recv_req->req_data.context = context; + + psm2_mq_irecv_inner(mq, recv_req, buf, len); + } +recv_ret: + psmi_assert_req_not_internal(recv_req); + *req = recv_req; + } else { + err = PSM2_PARAM_ERR; + } + + PSM2_LOG_MSG("leaving"); + + return err; +} +PSMI_API_DECL(psm2_mq_fp_msg) + +psm2_error_t +__psm2_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, + psm2_mq_tag_t *tag, psm2_mq_tag_t *tagsel, + uint32_t flags, void *buf, uint32_t len, void *context, + psm2_mq_req_t *reqo) +{ + psm2_error_t err = PSM2_OK; + psm2_mq_req_t req; + +#ifdef PSM_CUDA + 
int gpu_mem = 0; + + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(buf)) { + psmi_cuda_set_attr_sync_memops(buf); + + gpu_mem = 1; + } +#endif + + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + + PSMI_LOCK(mq->progress_lock); + + /* First check unexpected Queue and remove req if found */ + req = mq_req_match_with_tagsel(mq, src, tag, tagsel, REMOVE_ENTRY); + + if (req == NULL) { + /* prepost before arrival, add to expected q */ + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + if_pf(req == NULL) { + err = PSM2_NO_MEMORY; + goto ret; + } + + req->req_data.peer = src; + req->req_data.tag = *tag; + req->req_data.tagsel = *tagsel; + req->state = MQ_STATE_POSTED; + req->req_data.buf = buf; + req->req_data.buf_len = len; + req->req_data.recv_msglen = len; + req->recv_msgoff = 0; + req->req_data.context = context; + +#ifdef PSM_CUDA + req->is_buf_gpu_mem = gpu_mem; + if (gpu_mem) + req->user_gpu_buffer = buf; + else + req->user_gpu_buffer = NULL; +#endif + + mq_add_to_expected_hashes(mq, req); + _HFI_VDBG("buf=%p,len=%d,tag=%08x.%08x.%08x " + " tagsel=%08x.%08x.%08x req=%p\n", + buf, len, tag->tag[0], tag->tag[1], tag->tag[2], + tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], req); + } else { + _HFI_VDBG("unexpected buf=%p,len=%d,tag=%08x.%08x.%08x" + " tagsel=%08x.%08x.%08x req=%p\n", buf, len, + tag->tag[0], tag->tag[1], tag->tag[2], + tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], req); +#ifdef PSM_CUDA + req->is_buf_gpu_mem = gpu_mem; + if (gpu_mem) + req->user_gpu_buffer = buf; + else + req->user_gpu_buffer = NULL; +#endif + + req->req_data.context = context; + + psm2_mq_irecv_inner(mq, req, buf, len); + } + +ret: + PSMI_UNLOCK(mq->progress_lock); + psmi_assert_req_not_internal(req); + *reqo = req; + PSM2_LOG_MSG("leaving"); + + return err; +} +PSMI_API_DECL(psm2_mq_irecv2) + +psm2_error_t +__psm2_mq_irecv(psm2_mq_t mq, uint64_t tag, uint64_t tagsel, uint32_t flags, + void *buf, uint32_t len, void *context, psm2_mq_req_t *reqo) +{ + psm2_error_t rv; + psm2_mq_tag_t rtag; + psm2_mq_tag_t rtagsel; + + *reqo = NULL; + + PSM2_LOG_MSG("entering tag: 0x%" PRIx64, tag); + + rtag.tag64 = tag; +#ifdef PSM_DEBUG + rtag.tag[2] = 0; +#endif + rtagsel.tag64 = tagsel; + rtagsel.tag[2] = 0; + rv = __psm2_mq_irecv2(mq, PSM2_MQ_ANY_ADDR, &rtag, &rtagsel, + flags, buf, len, context, reqo); + + psmi_assert_req_not_internal(*reqo); + PSM2_LOG_MSG("leaving"); + + return rv; +} +PSMI_API_DECL(psm2_mq_irecv) + +psm2_error_t +__psm2_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len, + void *context, psm2_mq_req_t *reqo) +{ + psm2_error_t err = PSM2_OK; + psm2_mq_req_t req = *reqo; + + PSM2_LOG_MSG("entering"); + PSMI_ASSERT_INITIALIZED(); + + if (req == PSM2_MQ_REQINVALID) { + err = psmi_handle_error(mq->ep, PSM2_PARAM_ERR, + "Invalid request (req=%p)", req); + } else { + /* Message is already matched -- begin delivering message data to the + user's buffer. */ + req->req_data.context = context; + +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(buf)) { + psmi_cuda_set_attr_sync_memops(buf); + req->is_buf_gpu_mem = 1; + } else { + req->is_buf_gpu_mem = 0; + } +#endif + + PSMI_LOCK(mq->progress_lock); + psm2_mq_irecv_inner(mq, req, buf, len); + PSMI_UNLOCK(mq->progress_lock); + } + + PSM2_LOG_MSG("leaving"); + + return err; +} +PSMI_API_DECL(psm2_mq_imrecv) + +/* The status argument can be an instance of either type psm2_mq_status_t or + * psm2_mq_status2_t. Depending on the type, a corresponding status copy + * routine should be passed in. 
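/*
 * Editor's illustrative sketch -- not part of this patch. It shows the
 * lighter-weight completion path built from the entry points below:
 * psm2_mq_ipeek_dequeue() pops one completed request (making internal
 * progress when the completed queue is empty) and psm2_mq_req_free()
 * recycles it once the caller has consumed req_data. process_completion()
 * is a hypothetical consumer.
 */
static void example_drain_completions(psm2_mq_t mq)
{
	psm2_mq_req_t req;

	while (psm2_mq_ipeek_dequeue(mq, &req) == PSM2_OK) {
		process_completion(&req->req_data);	/* hypothetical */
		psm2_mq_req_free(mq, req);
	}
}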
+ */ +PSMI_ALWAYS_INLINE( +psm2_error_t +psmi_mq_ipeek_inner(psm2_mq_t mq, psm2_mq_req_t *oreq, + void *status, + psmi_mq_status_copy_t status_copy)) +{ + psm2_mq_req_t req; + + PSMI_ASSERT_INITIALIZED(); + + if ((req = mq->completed_q.first) == NULL) { + PSMI_LOCK(mq->progress_lock); + psmi_poll_internal(mq->ep, 1); + if ((req = mq->completed_q.first) == NULL) { + PSMI_UNLOCK(mq->progress_lock); + return PSM2_MQ_NO_COMPLETIONS; + } + PSMI_UNLOCK(mq->progress_lock); + } + /* something in the queue */ + *oreq = req; + if (status != NULL) + status_copy(req, status); + + return PSM2_OK; +} + +psm2_error_t +__psm2_mq_ipeek2(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status2_t *status) +{ + psm2_error_t rv; + + *oreq = NULL; + + PSM2_LOG_MSG("entering"); + rv = psmi_mq_ipeek_inner(mq, oreq, status, + (psmi_mq_status_copy_t) mq_status2_copy); + + psmi_assert_req_not_internal(*oreq); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_ipeek2) + +psm2_error_t +__psm2_mq_ipeek(psm2_mq_t mq, psm2_mq_req_t *oreq, psm2_mq_status_t *status) +{ + psm2_error_t rv; + + *oreq = NULL; + PSM2_LOG_MSG("entering"); + rv = psmi_mq_ipeek_inner(mq, oreq, status, + (psmi_mq_status_copy_t) mq_status_copy); + + psmi_assert_req_not_internal(*oreq); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_ipeek) + +psm2_error_t __psm2_mq_ipeek_dequeue_multi(psm2_mq_t mq, void *status_array, + psmi_mq_status_copy_user_t status_copy, int *count) +{ + psm2_mq_req_t req; + int read_count = *count; + int ret = 0; + + PSMI_ASSERT_INITIALIZED(); + + *count = 0; + while (*count < read_count) { + PSMI_LOCK(mq->progress_lock); + + if (mq->completed_q.first == NULL) + psmi_poll_internal(mq->ep, 1); + + if ((req = mq->completed_q.first) == NULL) { + PSMI_UNLOCK(mq->progress_lock); + return PSM2_MQ_NO_COMPLETIONS; + } + + mq_qq_remove(&mq->completed_q, req); + PSMI_UNLOCK(mq->progress_lock); + + ret = status_copy(&req->req_data, status_array, *count); + psm2_mq_req_free(mq, req); + + if (unlikely(ret < 0)) { + *count = ret; + return PSM2_INTERNAL_ERR; + } else if (ret == 0) { + continue; + } + + *count = *count + 1; + + if (ret > 1) + break; + } + return PSM2_OK; +} +PSMI_API_DECL(psm2_mq_ipeek_dequeue_multi) + +psm2_error_t __psm2_mq_ipeek_dequeue(psm2_mq_t mq, psm2_mq_req_t *oreq) +{ + psm2_mq_req_t req; + + PSMI_ASSERT_INITIALIZED(); + PSMI_LOCK(mq->progress_lock); + if (mq->completed_q.first == NULL) + psmi_poll_internal(mq->ep, 1); + if ((req = mq->completed_q.first) == NULL) { + PSMI_UNLOCK(mq->progress_lock); + return PSM2_MQ_NO_COMPLETIONS; + } + mq_qq_remove(&mq->completed_q, req); + PSMI_UNLOCK(mq->progress_lock); + *oreq = req; + return PSM2_OK; +} +PSMI_API_DECL(psm2_mq_ipeek_dequeue) + +psm2_error_t __psm2_mq_req_free(psm2_mq_t mq, psm2_mq_req_t req) +{ + PSMI_ASSERT_INITIALIZED(); + if (req == NULL) + return PSM2_OK; + PSMI_LOCK(mq->progress_lock); + psmi_mq_req_free(req); + PSMI_UNLOCK(mq->progress_lock); + + return PSM2_OK; +} +PSMI_API_DECL(psm2_mq_req_free) + +static +psm2_error_t psmi_mqopt_ctl(psm2_mq_t mq, uint32_t key, void *value, int get) +{ + psm2_error_t err = PSM2_OK; + uint32_t val32; + + switch (key) { + case PSM2_MQ_RNDV_HFI_SZ: + if (get) + *((uint32_t *) value) = mq->hfi_thresh_rv; + else { + val32 = *((uint32_t *) value); + mq->hfi_thresh_rv = val32; + } + _HFI_VDBG("RNDV_HFI_SZ = %d (%s)\n", + mq->hfi_thresh_rv, get ? 
"GET" : "SET"); + break; + + case PSM2_MQ_RNDV_SHM_SZ: + if (get) + *((uint32_t *) value) = mq->shm_thresh_rv; + else { + val32 = *((uint32_t *) value); + mq->shm_thresh_rv = val32; + } + _HFI_VDBG("RNDV_SHM_SZ = %d (%s)\n", + mq->shm_thresh_rv, get ? "GET" : "SET"); + break; + case PSM2_MQ_MAX_SYSBUF_MBYTES: + /* Deprecated: this option no longer does anything. */ + break; + + default: + err = + psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Unknown option key=%u", key); + break; + } + return err; +} + +psm2_error_t __psm2_mq_getopt(psm2_mq_t mq, int key, void *value) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + PSMI_ERR_UNLESS_INITIALIZED(mq->ep); + rv = psmi_mqopt_ctl(mq, key, value, 1); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_getopt) + +psm2_error_t __psm2_mq_setopt(psm2_mq_t mq, int key, const void *value) +{ + psm2_error_t rv; + PSM2_LOG_MSG("entering"); + PSMI_ERR_UNLESS_INITIALIZED(mq->ep); + rv = psmi_mqopt_ctl(mq, key, (void *)value, 0); + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_setopt) + +#define TAB_SIZE 16 +#define STATS \ + STAT(rx_user_bytes) \ + STAT(rx_user_num) \ + STAT(rx_sys_bytes) \ + STAT(rx_sys_num) \ + STAT(tx_num) \ + STAT(tx_eager_num) \ + STAT(tx_eager_bytes) \ + STAT(tx_rndv_num) \ + STAT(tx_rndv_bytes) \ + STAT(tx_shm_num) \ + STAT(rx_shm_num) \ + STAT(rx_sysbuf_num) \ + STAT(rx_sysbuf_bytes) \ + STAT(comm_world_rank) + +static +void +psmi_mq_print_stats(psm2_mq_t mq, FILE *perf_stats_fd) +{ + psm2_mq_stats_t stats; + char msg_buffer[MSG_BUFFER_LEN]; + + psm2_mq_get_stats(mq, &stats); + +#define STAT(x) \ + snprintf(msg_buffer, MSG_BUFFER_LEN, "%*lu",TAB_SIZE, stats.x); \ + fwrite(msg_buffer, sizeof(char), strlen(msg_buffer), perf_stats_fd); + + STATS + +#undef STAT + + fwrite("\n", sizeof(char), 1, perf_stats_fd); +} + + +static +void +*psmi_mq_print_stats_thread(void *_mq) +{ + psm2_mq_t mq = (psm2_mq_t)_mq; + char perf_file_name[MSG_BUFFER_LEN]; + char msg_buffer[MSG_BUFFER_LEN]; + int delta_t = 0; + + snprintf(perf_file_name, MSG_BUFFER_LEN, "./psm3-perf-stat-ep-0x%" PRIx64 "-pid-%d", + (uint64_t)(mq->ep->epid), + getpid()); + FILE *perf_stats_fd = fopen(perf_file_name, "w+"); + + if (!perf_stats_fd) + { + _HFI_ERROR("Failed to create fd for performance logging\n"); + goto end; + } + +#define STAT(x) \ + snprintf(msg_buffer, MSG_BUFFER_LEN, "%*s",TAB_SIZE, #x);\ + fwrite(msg_buffer, sizeof(char), strlen(msg_buffer), perf_stats_fd); + + STAT(delta_t) + STATS + +#undef STAT + + fwrite("\n", sizeof(char), 1, perf_stats_fd); + + /* Performance stats will be printed every $PSM3_MQ_PRINT_STATS seconds */ + do { + snprintf(msg_buffer, MSG_BUFFER_LEN, "%*d",TAB_SIZE, delta_t); + fwrite(msg_buffer, sizeof(char), strlen(msg_buffer), perf_stats_fd); + psmi_mq_print_stats(mq, perf_stats_fd); + fflush(perf_stats_fd); + usleep(MICRO_SEC * mq->print_stats); + delta_t += mq->print_stats; + } while (mq->mq_perf_data.perf_print_stats); + + fclose(perf_stats_fd); +end: + pthread_exit(NULL); +} + +static +void +psmi_mq_print_stats_init(psm2_mq_t mq) +{ + mq->mq_perf_data.perf_print_stats = 1; + if (pthread_create(&(mq->mq_perf_data.perf_print_thread), NULL, + psmi_mq_print_stats_thread, (void*)mq)) + { + mq->mq_perf_data.perf_print_stats = 0; + _HFI_ERROR("Failed to create logging thread\n"); + } +} + +static +void +psmi_mq_print_stats_finalize(psm2_mq_t mq) +{ + if (mq->mq_perf_data.perf_print_stats) + { + mq->mq_perf_data.perf_print_stats = 0; + pthread_join(mq->mq_perf_data.perf_print_thread, NULL); + } +} + +/* + * This 
is the API for the user. We actually allocate the MQ much earlier, but + * the user can set options after obtaining an endpoint + */ +psm2_error_t +__psm2_mq_init(psm2_ep_t ep, uint64_t ignored, + const struct psm2_optkey *opts, int numopts, psm2_mq_t *mqo) +{ + psm2_error_t err = PSM2_OK; + + if (ep == NULL) { + err = PSM2_PARAM_ERR; + goto fail; + } + + psm2_mq_t mq = ep->mq; + int i; + + PSM2_LOG_MSG("entering"); + + PSMI_ERR_UNLESS_INITIALIZED(ep); + + psmi_assert_always(mq != NULL); + psmi_assert_always(mq->ep != NULL); + + mq->stats.comm_world_rank = hfi_get_myrank(); + + /* Process options */ + for (i = 0; err == PSM2_OK && i < numopts; i++) + err = psmi_mqopt_ctl(mq, opts[i].key, opts[i].value, 0); + if (err != PSM2_OK) /* error already handled */ + goto fail; + + /* Initialize the unexpected system buffer allocator */ + psmi_mq_sysbuf_init(mq); + char buf[128]; + psmi_mq_sysbuf_getinfo(mq, buf, sizeof buf); + _HFI_VDBG("%s", buf); + + *mqo = mq; + + if (mq->print_stats > 0) + psmi_mq_print_stats_init(mq); + +fail: + PSM2_LOG_MSG("leaving"); + return err; +} +PSMI_API_DECL(psm2_mq_init) + +psm2_error_t __psm2_mq_finalize(psm2_mq_t mq) +{ + psm2_error_t rv = PSM2_OK; + + PSM2_LOG_MSG("entering"); + + PSMI_ERR_UNLESS_INITIALIZED(mq->ep); + + if (mq->print_stats == -1) + { + mq->print_stats = 1; + psmi_mq_print_stats_init(mq); + } + if (mq->print_stats != 0) + psmi_mq_print_stats_finalize(mq); + + PSM2_LOG_MSG("leaving"); + return rv; +} +PSMI_API_DECL(psm2_mq_finalize) + +void __psm2_mq_get_stats(psm2_mq_t mq, psm2_mq_stats_t *stats) +{ + PSM2_LOG_MSG("entering"); + memcpy(stats, &mq->stats, sizeof(psm2_mq_stats_t)); + PSM2_LOG_MSG("leaving"); +} +PSMI_API_DECL(psm2_mq_get_stats) + +static psm2_error_t psmi_mq_initstats(psm2_mq_t mq) +{ + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECL("COMM_WORLD_Rank", + MPSPAWN_STATS_REDUCTION_ALL, NULL, + &mq->stats.comm_world_rank), + PSMI_STATS_DECLU64("Total_count_sent", &mq->stats.tx_num), + PSMI_STATS_DECLU64("Eager_count_sent", &mq->stats.tx_eager_num), + PSMI_STATS_DECLU64("Eager_bytes_sent", &mq->stats.tx_eager_bytes), + PSMI_STATS_DECLU64("Rendezvous_count_sent", &mq->stats.tx_rndv_num), + PSMI_STATS_DECLU64("Rendezvous_bytes_sent", &mq->stats.tx_rndv_bytes), + PSMI_STATS_DECLU64("Expected_count_recv", &mq->stats.rx_user_num), + PSMI_STATS_DECLU64("Expected_bytes_recv", &mq->stats.rx_user_bytes), + PSMI_STATS_DECLU64("Unexpected_count_recv", &mq->stats.rx_sys_num), + PSMI_STATS_DECLU64("Unexpected_bytes_recv", &mq->stats.rx_sys_bytes), + PSMI_STATS_DECLU64("shm_count_sent", &mq->stats.tx_shm_num), + PSMI_STATS_DECLU64("shm_count_recv", &mq->stats.rx_shm_num), + PSMI_STATS_DECLU64("sysbuf_count", &mq->stats.rx_sysbuf_num), + PSMI_STATS_DECLU64("sysbuf_bytes", &mq->stats.rx_sysbuf_bytes), + }; + + return psmi_stats_register_type("MPI_Statistics_Summary", + PSMI_STATSTYPE_MQ, + entries, + PSMI_STATS_HOWMANY(entries), 0, mq); +} + +psm2_error_t psmi_mq_malloc(psm2_mq_t *mqo) +{ + psm2_error_t err = PSM2_OK; + + psm2_mq_t mq = + (psm2_mq_t) psmi_calloc(NULL, UNDEFINED, 1, sizeof(struct psm2_mq)); + if (mq == NULL) { + err = psmi_handle_error(NULL, PSM2_NO_MEMORY, + "Couldn't allocate memory for mq endpoint"); + goto fail; + } + + mq->ep = NULL; + /*mq->unexpected_callback = NULL; */ + mq->memmode = psmi_parse_memmode(); + + memset(mq->unexpected_htab, 0, + NUM_HASH_CONFIGS * NUM_HASH_BUCKETS * sizeof(struct mqq)); + memset(mq->expected_htab, 0, + NUM_HASH_CONFIGS * NUM_HASH_BUCKETS * sizeof(struct mqq)); + memset(&mq->expected_q, 0, 
sizeof(struct mqq)); + memset(&mq->unexpected_q, 0, sizeof(struct mqq)); + memset(&mq->completed_q, 0, sizeof(struct mqq)); + memset(&mq->outoforder_q, 0, sizeof(struct mqq)); + STAILQ_INIT(&mq->eager_q); + + + /* The values are overwritten in initialize_defaults, they're just set to + * sensible defaults until then */ + if(psmi_cpu_model == CPUID_MODEL_PHI_GEN2 || psmi_cpu_model == CPUID_MODEL_PHI_GEN2M) + { + mq->hfi_thresh_rv = MQ_HFI_THRESH_RNDV_PHI2; + mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_PHI2; + } else { + mq->hfi_thresh_rv = MQ_HFI_THRESH_RNDV_XEON; + mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_XEON; + } + if (! (psmi_parse_rdmamode() & IPS_PROTOEXP_FLAG_ENABLED)) { + // TBD - when RDMA is disabled do we want to disable rendezvous? + // even without RDMA, the receiver controlled pacing helps scalability + mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous + } + mq->hfi_thresh_tiny = MQ_HFI_THRESH_TINY; +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED) + mq->hfi_base_window_rv = MQ_HFI_WINDOW_RNDV_CUDA; +#endif + mq->shm_thresh_rv = MQ_SHM_THRESH_RNDV; + + memset(&mq->stats, 0, sizeof(psm2_mq_stats_t)); + err = psmi_mq_req_init(mq); + if (err) + goto fail; + psmi_mq_initstats(mq); + + *mqo = mq; + + return PSM2_OK; +fail: + if (mq != NULL) + psmi_free(mq); + return err; +} + +psm2_error_t psmi_mq_initialize_defaults(psm2_mq_t mq) +{ + union psmi_envvar_val env_hfitiny, env_rvwin, env_hfirv, + env_shmrv, env_stats; + + psmi_getenv("PSM3_MQ_TINY_NIC_THRESH", + "NIC tiny packet switchover (max 8, default 8)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)mq->hfi_thresh_tiny, &env_hfitiny); + mq->hfi_thresh_tiny = min(env_hfitiny.e_uint, 8); + + psmi_getenv("PSM3_MQ_RNDV_NIC_THRESH", + "NIC eager-to-rendezvous switchover", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)mq->hfi_thresh_rv, &env_hfirv); + mq->hfi_thresh_rv = env_hfirv.e_uint; + + psmi_getenv("PSM3_MQ_RNDV_NIC_WINDOW", + "NIC rendezvous window size, max 4M", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)mq->hfi_base_window_rv, &env_rvwin); + mq->hfi_base_window_rv = min(4 * 1024 * 1024, env_rvwin.e_uint); + + /* Re-evaluate this since it may have changed after initializing the shm + * device */ + mq->shm_thresh_rv = psmi_shm_mq_rv_thresh; + psmi_getenv("PSM3_MQ_RNDV_SHM_THRESH", + "shm eager-to-rendezvous switchover", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)mq->shm_thresh_rv, &env_shmrv); + mq->shm_thresh_rv = env_shmrv.e_uint; + + psmi_getenv("PSM3_MQ_PRINT_STATS", + "Prints MQ performance stats every n seconds to file " + "./psm3-perf-stat-ep-[epid]-pid-[pid] when set to -1 stats are " + "printed only once during finalization", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 0, &env_stats); + mq->print_stats = env_stats.e_uint; + + mq->nohash_fastpath = 1; + return PSM2_OK; +} + +psm2_error_t MOCKABLE(psmi_mq_free)(psm2_mq_t mq) +{ + psmi_mq_req_fini(mq); + psmi_mq_sysbuf_fini(mq); + psmi_stats_deregister_type(PSMI_STATSTYPE_MQ, mq); + psmi_free(mq); + return PSM2_OK; +} +MOCK_DEF_EPILOGUE(psmi_mq_free); diff --git a/prov/psm3/psm3/psm_mq_internal.h b/prov/psm3/psm3/psm_mq_internal.h new file mode 100644 index 00000000000..452bf7a3e92 --- /dev/null +++ b/prov/psm3/psm3/psm_mq_internal.h @@ -0,0 +1,623 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. 
+ + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +#ifndef MQ_INT_H +#define MQ_INT_H + +/* Ugh. smmintrin.h eventually includes mm_malloc.h, which calls malloc */ +#ifdef malloc +#undef malloc +#endif +#ifdef free +#undef free +#endif +#include +#include "psm_user.h" +#include "psm_sysbuf.h" + +#include "psm2_mock_testing.h" + +#if 0 +typedef psm2_error_t(*psm_mq_unexpected_callback_fn_t) + (psm2_mq_t mq, uint16_t mode, psm2_epaddr_t epaddr, + uint64_t tag, uint32_t send_msglen, const void *payload, + uint32_t paylen); +#endif + +#define MICRO_SEC 1000000 +#define MSG_BUFFER_LEN 100 + +struct psm2_mq_perf_data +{ + pthread_t perf_print_thread; + int perf_print_stats; +}; + +enum psm2_mq_tag_pattern { + PSM2_TAG_SRC = 0, + PSM2_TAG_ANYSRC, + PSM2_ANYTAG_SRC, + PSM2_ANYTAG_ANYSRC, +}; + +struct psm2_mq { + psm2_ep_t ep; /**> ep back pointer */ + mpool_t sreq_pool; + mpool_t rreq_pool; + + struct mqq unexpected_htab[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS]; + struct mqq expected_htab[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS]; + + /* in case the compiler can't figure out how to preserve the hashed values + between mq_req_match() and mq_add_to_unexpected_hashes() ... 
*/ + unsigned hashvals[NUM_HASH_CONFIGS]; + + /*psm_mq_unexpected_callback_fn_t unexpected_callback; */ + struct mqq expected_q; /**> Preposted (expected) queue */ + struct mqq unexpected_q; /**> Unexpected queue */ + struct mqq completed_q; /**> Completed queue */ + + struct mqq outoforder_q; /**> OutofOrder queue */ + STAILQ_HEAD(, psm2_mq_req) eager_q; /**> eager request queue */ + + uint32_t hfi_thresh_tiny; + uint32_t hfi_thresh_rv; + uint32_t shm_thresh_rv; + uint32_t hfi_base_window_rv; /**> this is a base rndv window size, + will be further trimmed down per-connection based + on the peer's MTU */ + int memmode; + + uint64_t timestamp; + + psm2_mq_stats_t stats; /**> MQ stats, accumulated by each PTL */ + + int print_stats; + struct psm2_mq_perf_data mq_perf_data; + + int nohash_fastpath; + unsigned unexpected_hash_len; + unsigned unexpected_list_len; + unsigned expected_hash_len; + unsigned expected_list_len; + + psmi_mem_ctrl_t handler_index[MM_NUM_OF_POOLS]; + int mem_ctrl_is_init; + uint64_t mem_ctrl_total_bytes; + + psmi_lock_t progress_lock; +}; + +#define MQE_TYPE_IS_SEND(type) ((type) & MQE_TYPE_SEND) +#define MQE_TYPE_IS_RECV(type) ((type) & MQE_TYPE_RECV) + +#define MQE_TYPE_SEND 0x1000 +#define MQE_TYPE_RECV 0x2000 +#define MQE_TYPE_FLAGMASK 0x0fff +#define MQE_TYPE_WAITING 0x0001 +#define MQE_TYPE_WAITING_PEER 0x0004 +#define MQE_TYPE_EAGER_QUEUE 0x0008 + +#define MQ_STATE_COMPLETE 0 +#define MQ_STATE_POSTED 1 +#define MQ_STATE_MATCHED 2 +#define MQ_STATE_UNEXP 3 +#define MQ_STATE_UNEXP_RV 4 +#define MQ_STATE_FREE 5 + +/* + * These must match the ips protocol message opcode. + */ +#define MQ_MSG_TINY 0xc1 +#define MQ_MSG_SHORT 0xc2 +#define MQ_MSG_EAGER 0xc3 +#define MQ_MSG_LONGRTS 0xc4 + +/* + * Descriptor allocation limits. + * The 'LIMITS' predefines fill in a psmi_rlimits_mpool structure + */ +#define MQ_SENDREQ_LIMITS { \ + .env = "PSM3_MQ_SENDREQS_MAX", \ + .descr = "Max num of isend requests in flight", \ + .env_level = PSMI_ENVVAR_LEVEL_USER, \ + .minval = 1, \ + .maxval = ~0, \ + .mode[PSMI_MEMMODE_NORMAL] = { 1024, 1048576 }, \ + .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 }, \ + .mode[PSMI_MEMMODE_LARGE] = { 8192, 16777216 } \ + } + +#define MQ_RECVREQ_LIMITS { \ + .env = "PSM3_MQ_RECVREQS_MAX", \ + .descr = "Max num of irecv requests in flight", \ + .env_level = PSMI_ENVVAR_LEVEL_USER, \ + .minval = 1, \ + .maxval = ~0, \ + .mode[PSMI_MEMMODE_NORMAL] = { 1024, 1048576 }, \ + .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 }, \ + .mode[PSMI_MEMMODE_LARGE] = { 8192, 16777216 } \ + } + +typedef psm2_error_t(*mq_rts_callback_fn_t) (psm2_mq_req_t req, int was_posted); +typedef psm2_error_t(*mq_testwait_callback_fn_t) (psm2_mq_req_t *req); + + +/* If request is marked as internal, then it will not + be exposed to the user, will not be added to the mq->completed_q. + This flag is set if request is used by e.g. MPI_SEND */ +#define PSMI_REQ_FLAG_IS_INTERNAL (1 << 0) +/* Identifies req as part of fast path. 
*/ +#define PSMI_REQ_FLAG_FASTPATH (1 << 1) +/* Identifies req as a NORMAL operation with no special cases.*/ +#define PSMI_REQ_FLAG_NORMAL 0 + +#define psmi_is_req_internal(req) ((req)->flags_internal & PSMI_REQ_FLAG_IS_INTERNAL) + +#define psmi_assert_req_not_internal(req) psmi_assert(((req) == PSM2_MQ_REQINVALID) || \ + (!psmi_is_req_internal(req))) + +/* receive mq_req, the default */ +struct psm2_mq_req { + struct psm2_mq_req_user req_data; + + struct { + psm2_mq_req_t next[NUM_MQ_SUBLISTS]; + psm2_mq_req_t prev[NUM_MQ_SUBLISTS]; + STAILQ_ENTRY(psm2_mq_req) nextq; /* used for eager only */ + }; + struct mqq *q[NUM_MQ_SUBLISTS]; + uint64_t timestamp; + uint32_t state; + uint32_t type; + psm2_mq_t mq; + + /* Some PTLs want to get notified when there's a test/wait event */ + mq_testwait_callback_fn_t testwait_callback; + + uint16_t msg_seqnum; /* msg seq num for mctxt */ + uint32_t recv_msgoff; /* Message offset into req_data.buf */ + union { + uint32_t send_msgoff; /* Bytes received so far.. can be larger than buf_len */ + uint32_t recv_msgposted; + }; + uint32_t rts_reqidx_peer; + + uint32_t flags_user; + uint32_t flags_internal; + + /* Used to keep track of unexpected rendezvous */ + mq_rts_callback_fn_t rts_callback; + psm2_epaddr_t rts_peer; + uintptr_t rts_sbuf; + + psm2_verbs_mr_t mr; // local registered memory for app buffer + +#ifdef PSM_CUDA + uint8_t* user_gpu_buffer; + STAILQ_HEAD(sendreq_spec_, ips_cuda_hostbuf) sendreq_prefetch; + uint32_t prefetch_send_msgoff; + int cuda_hostbuf_used; + CUipcMemHandle cuda_ipc_handle; + CUevent cuda_ipc_event; + uint8_t cuda_ipc_handle_attached; + uint32_t cuda_ipc_offset; + /* + * is_sendbuf_gpu_mem - Used to always select TID path on the receiver + * when send is on a device buffer + */ + uint8_t is_sendbuf_gpu_mem; +#endif + /* + * is_buf_gpu_mem - used to indicate if the send or receive is issued + * on a device/host buffer. + */ + uint8_t is_buf_gpu_mem; + + /* PTLs get to store their own per-request data. MQ manages the allocation + * by allocating psm2_mq_req so that ptl_req_data has enough space for all + * possible PTLs. + */ + union { + void *ptl_req_ptr; /* when used by ptl as pointer */ + uint8_t ptl_req_data[0]; /* when used by ptl for "inline" data */ + }; +}; + +PSMI_ALWAYS_INLINE( +unsigned +hash_64(uint64_t a)) +{ + return _mm_crc32_u64(0, a); +} +PSMI_ALWAYS_INLINE( +unsigned +hash_32(uint32_t a)) +{ + return _mm_crc32_u32(0, a); +} + +void MOCKABLE(psmi_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars); +MOCK_DCL_EPILOGUE(psmi_mq_mtucpy); +void psmi_mq_mtucpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars); + +#if defined(__x86_64__) +void psmi_mq_mtucpy_safe(void *vdest, const void *vsrc, uint32_t nchars); +#else +#define psmi_mq_mtucpy_safe psmi_mq_mtucpy +#endif + +/* + * Optimize for 0-8 byte case, but also handle others. 
+ */ +PSMI_ALWAYS_INLINE( +void +mq_copy_tiny(uint32_t *dest, uint32_t *src, uint8_t len)) +{ +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(dest) || PSMI_IS_CUDA_MEM(src))) { + PSMI_CUDA_CALL(cuMemcpy, (CUdeviceptr)dest, (CUdeviceptr)src, len); + return; + } +#endif + switch (len) { + case 8: *dest++ = *src++; + /* fall through */ + case 4: *dest++ = *src++; + /* fall through */ + case 0: + return; + case 7: + case 6: + case 5: + *dest++ = *src++; + len -= 4; + /* fall through */ + case 3: + case 2: + case 1: + break; + default: /* greater than 8 */ + psmi_mq_mtucpy(dest, src, len); + return; + } + uint8_t *dest1 = (uint8_t *) dest; + uint8_t *src1 = (uint8_t *) src; + switch (len) { + case 3: *dest1++ = *src1++; + /* fall through */ + case 2: *dest1++ = *src1++; + /* fall through */ + case 1: *dest1++ = *src1++; + } +} + +typedef void (*psmi_mtucpy_fn_t)(void *dest, const void *src, uint32_t len); +#ifdef PSM_CUDA + +PSMI_ALWAYS_INLINE( +void +mq_copy_tiny_host_mem(uint32_t *dest, uint32_t *src, uint8_t len)) +{ + switch (len) { + case 8: *dest++ = *src++; + /* fall through */ + case 4: *dest++ = *src++; + /* fall through */ + case 0: + return; + case 7: + case 6: + case 5: + *dest++ = *src++; + len -= 4; + /* fall through */ + case 3: + case 2: + case 1: + break; + default: /* greater than 8 */ + psmi_mq_mtucpy(dest, src, len); + return; + } + uint8_t *dest1 = (uint8_t *) dest; + uint8_t *src1 = (uint8_t *) src; + switch (len) { + case 3: *dest1++ = *src1++; + /* fall through */ + case 2: *dest1++ = *src1++; + /* fall through */ + case 1: *dest1++ = *src1++; + } +} +#endif + +/* Typedef describing a function to populate a psm2_mq_status(2)_t given a + * matched request. The purpose of this typedef is to avoid duplicating + * code to handle both PSM v1 and v2 status objects. Outer routines pass in + * either mq_status_copy or mq_status2_copy and the inner routine calls that + * provided routine to fill in the correct status type. + */ +typedef void (*psmi_mq_status_copy_t) (psm2_mq_req_t req, void *status); + +/* + * Given an req with buffer ubuf of length ubuf_len, + * fill in the req's status and return the amount of bytes the request + * can receive. + * + * The function sets status truncation errors. Basically what MPI_Status does. 
+ */ +PSMI_ALWAYS_INLINE( +void +mq_status_copy(psm2_mq_req_t req, psm2_mq_status_t *status)) +{ + status->msg_tag = req->req_data.tag.tag64; + status->msg_length = req->req_data.send_msglen; + status->nbytes = req->req_data.recv_msglen; + status->error_code = (psm2_error_t)req->req_data.error_code; + status->context = req->req_data.context; +} + +PSMI_ALWAYS_INLINE( +void +mq_status2_copy(psm2_mq_req_t req, psm2_mq_status2_t *status)) +{ + status->msg_peer = req->req_data.peer; + status->msg_tag = req->req_data.tag; + status->msg_length = req->req_data.send_msglen; + status->nbytes = req->req_data.recv_msglen; + status->error_code = (psm2_error_t)req->req_data.error_code; + status->context = req->req_data.context; +} + +PSMI_ALWAYS_INLINE( +uint32_t +mq_set_msglen(psm2_mq_req_t req, uint32_t recvlen, uint32_t sendlen)) +{ + req->req_data.send_msglen = sendlen; + if (recvlen < sendlen) { + req->req_data.recv_msglen = recvlen; + req->req_data.error_code = PSM2_MQ_TRUNCATION; + return recvlen; + } else { + req->req_data.recv_msglen = sendlen; + req->req_data.error_code = PSM2_OK; + return sendlen; + } +} + +PSMI_ALWAYS_INLINE( +int +min_timestamp_4(psm2_mq_req_t *match)) +{ + uint64_t oldest = -1; + int which = -1, i; + for (i = 0; i < 4; i++) { + if (match[i] && (match[i]->timestamp < oldest)) { + oldest = match[i]->timestamp; + which = i; + } + } + return which; +} + +#ifndef PSM_DEBUG +/*! Append to Queue */ +PSMI_ALWAYS_INLINE(void mq_qq_append(struct mqq *q, psm2_mq_req_t req)) +{ + req->next[PSM2_ANYTAG_ANYSRC] = NULL; + req->prev[PSM2_ANYTAG_ANYSRC] = q->last; + if (q->last) + q->last->next[PSM2_ANYTAG_ANYSRC] = req; + else + q->first = req; + q->last = req; + req->q[PSM2_ANYTAG_ANYSRC] = q; +} +#else +#define mq_qq_append(qq, req) \ + do { \ + psmi_assert_req_not_internal(req); \ + (req)->next[PSM2_ANYTAG_ANYSRC] = NULL; \ + (req)->prev[PSM2_ANYTAG_ANYSRC] = (qq)->last; \ + if ((qq)->last) \ + (qq)->last->next[PSM2_ANYTAG_ANYSRC] = (req); \ + else \ + (qq)->first = (req); \ + (qq)->last = (req); \ + (req)->q[PSM2_ANYTAG_ANYSRC] = (qq); \ + if (qq == &(req)->mq->completed_q) \ + _HFI_VDBG("Moving (req)=%p to completed queue on %s, %d\n", \ + (req), __FILE__, __LINE__); \ + } while (0) +#endif +PSMI_ALWAYS_INLINE( +void mq_qq_append_which(struct mqq q[NUM_HASH_CONFIGS][NUM_HASH_BUCKETS], + int table, int bucket, psm2_mq_req_t req)) +{ + req->next[table] = NULL; + req->prev[table] = q[table][bucket].last; + if (q[table][bucket].last) + q[table][bucket].last->next[table] = req; + else + q[table][bucket].first = req; + q[table][bucket].last = req; + req->q[table] = &q[table][bucket]; +} +PSMI_ALWAYS_INLINE(void mq_qq_remove(struct mqq *q, psm2_mq_req_t req)) +{ + if (req->next[PSM2_ANYTAG_ANYSRC] != NULL) + req->next[PSM2_ANYTAG_ANYSRC]->prev[PSM2_ANYTAG_ANYSRC] = + req->prev[PSM2_ANYTAG_ANYSRC]; + else + q->last = req->prev[PSM2_ANYTAG_ANYSRC]; + if (req->prev[PSM2_ANYTAG_ANYSRC]) + req->prev[PSM2_ANYTAG_ANYSRC]->next[PSM2_ANYTAG_ANYSRC] = + req->next[PSM2_ANYTAG_ANYSRC]; + else + q->first = req->next[PSM2_ANYTAG_ANYSRC]; +} +PSMI_ALWAYS_INLINE(void mq_qq_remove_which(psm2_mq_req_t req, int table)) +{ + struct mqq *q = req->q[table]; + + req->q[table] = NULL; + if (req->next[table] != NULL) + req->next[table]->prev[table] = req->prev[table]; + else + q->last = req->prev[table]; + if (req->prev[table]) + req->prev[table]->next[table] = req->next[table]; + else + q->first = req->next[table]; +} + +psm2_error_t psmi_mq_req_init(psm2_mq_t mq); +psm2_error_t psmi_mq_req_fini(psm2_mq_t mq); 
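/*
 * Editorial sketch, not part of the patch: the inline helper mq_set_msglen()
 * above implements the MPI-style truncation rule -- the receive length is
 * clamped to the posted buffer and PSM2_MQ_TRUNCATION is recorded when the
 * sender's message is larger than the buffer. The standalone program below
 * only illustrates that contract; demo_req and the DEMO_* constants are
 * simplified stand-ins for psm2_mq_req and the PSM2 error codes.
 */
#include <assert.h>
#include <stdint.h>

enum { DEMO_OK = 0, DEMO_TRUNCATION = 1 };

struct demo_req {
	uint32_t send_msglen;
	uint32_t recv_msglen;
	int error_code;
};

/* Same logic as mq_set_msglen(): returns the number of bytes that will
 * actually be delivered into the receive buffer. */
static uint32_t demo_set_msglen(struct demo_req *req, uint32_t recvlen,
				uint32_t sendlen)
{
	req->send_msglen = sendlen;
	if (recvlen < sendlen) {
		req->recv_msglen = recvlen;
		req->error_code = DEMO_TRUNCATION;
		return recvlen;
	}
	req->recv_msglen = sendlen;
	req->error_code = DEMO_OK;
	return sendlen;
}

int main(void)
{
	struct demo_req r;

	/* 128-byte message into a 64-byte buffer: truncated to 64 bytes. */
	assert(demo_set_msglen(&r, 64, 128) == 64 && r.error_code == DEMO_TRUNCATION);
	/* 64-byte message into a 128-byte buffer: delivered in full. */
	assert(demo_set_msglen(&r, 128, 64) == 64 && r.error_code == DEMO_OK);
	return 0;
}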
+psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type); +MOCK_DCL_EPILOGUE(psmi_mq_req_alloc); +#define psmi_mq_req_free(req) psmi_mpool_put(req) + +/* + * Main receive progress engine, for shmops and hfi, in mq.c + */ +psm2_error_t psmi_mq_malloc(psm2_mq_t *mqo); +psm2_error_t psmi_mq_initialize_defaults(psm2_mq_t mq); + +psm2_error_t MOCKABLE(psmi_mq_free)(psm2_mq_t mq); +MOCK_DCL_EPILOGUE(psmi_mq_free); + +/* Three functions that handle all MQ stuff */ +#define MQ_RET_MATCH_OK 0 +#define MQ_RET_UNEXP_OK 1 +#define MQ_RET_UNEXP_NO_RESOURCES 2 +#define MQ_RET_DATA_OK 3 +#define MQ_RET_DATA_OUT_OF_ORDER 4 + +void psmi_mq_handle_rts_complete(psm2_mq_req_t req); +int psmi_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req, + uint32_t offset, const void *payload, uint32_t paylen); +int psmi_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, + uint32_t msglen, const void *payload, uint32_t paylen, + int msgorder, mq_rts_callback_fn_t cb, + psm2_mq_req_t *req_o); +int psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, + uint32_t msglen, uint32_t offset, + const void *payload, uint32_t paylen, int msgorder, + uint32_t opcode, psm2_mq_req_t *req_o); +int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t req); + +#if 0 // unused code, specific to QLogic MPI +void psmi_mq_stats_register(psm2_mq_t mq, mpspawn_stats_add_fn add_fn); +#endif + +void psmi_mq_fastpath_disable(psm2_mq_t mq); +void psmi_mq_fastpath_try_reenable(psm2_mq_t mq); + +PSMI_ALWAYS_INLINE( +psm2_mq_req_t +mq_ooo_match(struct mqq *q, void *msgctl, uint16_t msg_seqnum)) +{ + psm2_mq_req_t *curp; + psm2_mq_req_t cur; + + for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next[PSM2_ANYTAG_ANYSRC]) { + if (cur->ptl_req_ptr == msgctl && cur->msg_seqnum == msg_seqnum) { + /* match! */ + mq_qq_remove(q, cur); + return cur; + } + } + return NULL; /* no match */ +} + +PSMI_ALWAYS_INLINE( +psm2_mq_req_t +mq_eager_match(psm2_mq_t mq, void *peer, uint16_t msg_seqnum)) +{ + psm2_mq_req_t cur; + + cur = STAILQ_FIRST(&mq->eager_q); + while (cur) { + if (cur->ptl_req_ptr == peer && cur->msg_seqnum == msg_seqnum) + return cur; + cur = STAILQ_NEXT(cur, nextq); + } + return NULL; /* no match */ +} + +#if 0 +/* Not exposed in public psm, but may extend parts of PSM 2.1 to support + * this feature before 2.3 */ +psm_mq_unexpected_callback_fn_t +psmi_mq_register_unexpected_callback(psm2_mq_t mq, + psm_mq_unexpected_callback_fn_t fn); +#endif + +PSMI_ALWAYS_INLINE(void psmi_mq_stats_rts_account(psm2_mq_req_t req)) +{ + psm2_mq_t mq = req->mq; + if (MQE_TYPE_IS_SEND(req->type)) { + mq->stats.tx_num++; + mq->stats.tx_rndv_num++; + mq->stats.tx_rndv_bytes += req->req_data.send_msglen; + } else { + mq->stats.rx_user_num++; + mq->stats.rx_user_bytes += req->req_data.recv_msglen; + } + return; +} + +#endif diff --git a/prov/psm3/psm3/psm_mq_recv.c b/prov/psm3/psm3/psm_mq_recv.c new file mode 100644 index 00000000000..131c5b5dd12 --- /dev/null +++ b/prov/psm3/psm3/psm_mq_recv.c @@ -0,0 +1,678 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "psm_mq_internal.h" +#include "ptl_ips/ips_proto_header.h" + +#ifdef PSM_CUDA +#include "psm_gdrcpy.h" +#endif + +#if 0 +/* Not exposed in public psm, but may extend parts of PSM 2.1 to support + * this feature before 2.3 */ +psm_mq_unexpected_callback_fn_t +psmi_mq_register_unexpected_callback(psm2_mq_t mq, + psm_mq_unexpected_callback_fn_t fn) +{ + psm_mq_unexpected_callback_fn_t old_fn = mq->unexpected_callback; + mq->unexpected_callback = fn; + return old_fn; +} +#endif + +// the RTS/CTS sequence using TID is now complete +// used on both sender and receiver side +// LONG_DATA on sender ends up in ips_proto_mq_eager_complete +// LONG_DATA on receiver ends up in psmi_mq_handle_data +void psmi_mq_handle_rts_complete(psm2_mq_req_t req) +{ + psm2_mq_t mq = req->mq; + + if (req->mr) { + _HFI_MMDBG("RTS complete, releasing MR: rkey: 0x%x\n", req->mr->rkey); + psm2_verbs_release_mr(req->mr); + req->mr = NULL; + ips_tid_mravail_callback(req->rts_peer->proto); + } + + /* Stats on rendez-vous messages */ + psmi_mq_stats_rts_account(req); + req->state = MQ_STATE_COMPLETE; + ips_barrier(); + if(!psmi_is_req_internal(req)) + mq_qq_append(&mq->completed_q, req); + + _HFI_VDBG("RTS complete, req=%p, recv_msglen = %d\n", + req, req->req_data.recv_msglen); + return; +} + +static void +psmi_mq_req_copy(psm2_mq_req_t req, + uint32_t offset, const void *buf, uint32_t nbytes) +{ + /* recv_msglen may be changed by unexpected receive req_data.buf. */ + uint32_t msglen_this, end; + uint8_t *msgptr = (uint8_t *) req->req_data.buf + offset; + + /* out of receiving range. 
*/ + if (offset >= req->req_data.recv_msglen) { + req->send_msgoff += nbytes; + return; + } + + end = offset + nbytes; + if (end > req->req_data.recv_msglen) { + msglen_this = req->req_data.recv_msglen - offset; + end = req->req_data.recv_msglen; + } else { + msglen_this = nbytes; + } + + psmi_mq_mtucpy(msgptr, buf, msglen_this); + + if (req->recv_msgoff < end) { + req->recv_msgoff = end; + } + + req->send_msgoff += nbytes; + return; +} + +// This handles eager and LONG_DATA payload and completion for receiver +int +psmi_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req, + uint32_t offset, const void *buf, uint32_t nbytes) +{ + psmi_assert(req != NULL); + int rc; + + if (req->state == MQ_STATE_MATCHED) + rc = MQ_RET_MATCH_OK; + else { + psmi_assert(req->state == MQ_STATE_UNEXP); + rc = MQ_RET_UNEXP_OK; + } + + psmi_mq_req_copy(req, offset, buf, nbytes); + + /* + * the reason to use >= is because send_msgoff + * may be DW pad included. + */ + if (req->send_msgoff >= req->req_data.send_msglen) { + if (req->type & MQE_TYPE_EAGER_QUEUE) { + STAILQ_REMOVE(&mq->eager_q, req, psm2_mq_req, nextq); + } + + if (req->state == MQ_STATE_MATCHED) { + psmi_assert(! req->mr); + req->state = MQ_STATE_COMPLETE; + ips_barrier(); + mq_qq_append(&mq->completed_q, req); + } else { /* MQ_STATE_UNEXP */ + req->state = MQ_STATE_COMPLETE; + } + } + + return rc; +} + +static +void mq_add_to_unexpected_hashes(psm2_mq_t mq, psm2_mq_req_t req) +{ + int table; + mq_qq_append(&mq->unexpected_q, req); + req->q[PSM2_ANYTAG_ANYSRC] = &mq->unexpected_q; + mq->unexpected_list_len++; + if_pt (mq->nohash_fastpath) { + if_pf (mq->unexpected_list_len >= HASH_THRESHOLD) + psmi_mq_fastpath_disable(mq); + return; + } + + for (table = PSM2_TAG_SRC; table < PSM2_ANYTAG_ANYSRC; table++) + mq_qq_append_which(mq->unexpected_htab, + table, mq->hashvals[table], req); + mq->unexpected_hash_len++; +} + + +psm2_mq_req_t +mq_list_scan(struct mqq *q, psm2_epaddr_t src, psm2_mq_tag_t *tag, int which, uint64_t *time_threshold) +{ + psm2_mq_req_t *curp, cur; + + for (curp = &q->first; + ((cur = *curp) != NULL) && (cur->timestamp < *time_threshold); + curp = &cur->next[which]) { + if ((cur->req_data.peer == PSM2_MQ_ANY_ADDR || src == cur->req_data.peer) && + !((tag->tag[0] ^ cur->req_data.tag.tag[0]) & cur->req_data.tagsel.tag[0]) && + !((tag->tag[1] ^ cur->req_data.tag.tag[1]) & cur->req_data.tagsel.tag[1]) && + !((tag->tag[2] ^ cur->req_data.tag.tag[2]) & cur->req_data.tagsel.tag[2])) { + *time_threshold = cur->timestamp; + return cur; + } + } + return NULL; +} + +psm2_mq_req_t +mq_req_match(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, int remove) +{ + psm2_mq_req_t match[4]; + int table; + uint64_t best_ts = -1; + + if (mq->nohash_fastpath) { + table = PSM2_ANYTAG_ANYSRC; + match[table] = + mq_list_scan(&mq->expected_q, + src, tag, PSM2_ANYTAG_ANYSRC, &best_ts); + if (match[table] && remove) { + mq->expected_list_len--; + mq_qq_remove_which(match[table], table); + } + return match[table]; + } + + mq->hashvals[PSM2_TAG_SRC] = hash_64(tag->tag64) % NUM_HASH_BUCKETS; + mq->hashvals[PSM2_TAG_ANYSRC] = hash_32(tag->tag[0]) % NUM_HASH_BUCKETS; + mq->hashvals[PSM2_ANYTAG_SRC] = hash_32(tag->tag[1]) % NUM_HASH_BUCKETS; + + for (table = PSM2_TAG_SRC; table < PSM2_ANYTAG_ANYSRC; table++) + match[table] = + mq_list_scan(&mq->expected_htab[table][mq->hashvals[table]], + src, tag, table, &best_ts); + table = PSM2_ANYTAG_ANYSRC; + match[table] = mq_list_scan(&mq->expected_q, src, tag, table, &best_ts); + + table = min_timestamp_4(match); + if (table == 
-1) + return NULL; + + if (remove) { + if_pt (table == PSM2_ANYTAG_ANYSRC) + mq->expected_list_len--; + else + mq->expected_hash_len--; + mq_qq_remove_which(match[table], table); + psmi_mq_fastpath_try_reenable(mq); + } + return match[table]; +} +/* + * This handles the rendezvous MPI envelopes, the packet might have the whole + * message payload, or zero payload. + * our return indicates if we had a match. If no match we prepare the + * req for future processing and callback when a future MPI_recv call matches + * as a performance optmization, the first time we lack a match we ask for + * a REVISIT of the message to help the case where the MPI_recv is just slightly + * after the RTS arrived + */ +int +psmi_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, + uint32_t send_msglen, const void *payload, uint32_t paylen, + int msgorder, mq_rts_callback_fn_t cb, psm2_mq_req_t *req_o) +{ + psm2_mq_req_t req; + uint32_t msglen; + int rc; + + PSMI_LOCK_ASSERT(mq->progress_lock); + + _HFI_MMDBG("rts from 0x%"PRIx64" 0x%x,0x%x,0x%x", + src->epid, tag->tag0, tag->tag1, tag->tag2); + if (msgorder && (req = mq_req_match(mq, src, tag, 1))) { + /* we have a match, no need to callback */ + msglen = mq_set_msglen(req, req->req_data.buf_len, send_msglen); + /* reset send_msglen because sender only sends this many */ + req->req_data.send_msglen = msglen; + req->state = MQ_STATE_MATCHED; + req->req_data.peer = src; + req->req_data.tag = *tag; + + if (paylen > msglen) paylen = msglen; + if (paylen) { + // payload of RTS can contain a single packet synchronous MPI msg + psmi_mq_mtucpy(req->req_data.buf, payload, paylen); + } + req->recv_msgoff = req->send_msgoff = paylen; + *req_o = req; /* yes match */ + PSM2_LOG_EPM(OPCODE_LONG_RTS,PSM2_LOG_RX,src->epid,mq->ep->epid, + "req->rts_reqidx_peer: %d",req->rts_reqidx_peer); + rc = MQ_RET_MATCH_OK; + } else if (msgorder > 1) { + /* There is NO request match, and this is the first time + * to try to process this packet, we leave the packet in + * hardware queue for retry in hope there is a request + * match next time, this is for performance + * consideration. + */ + _HFI_MMDBG("no match 1st time - revisit msgorder=%d\n", msgorder); + rc = MQ_RET_UNEXP_NO_RESOURCES; + } else { /* No match, keep track of callback */ + /* this is the 2nd attempt so we need to put it on the unexpected + * queue and move on. A future MPI_Recv call will match it + */ + // TBD - on OPA for OSU latency we tend to hit revisit queue and then + // match on 2nd call. On PSM UD we tend to hit revisit queue and + // then still not match on 2nd attempt and end up here. Unclear + // why MPI_Recv gets posted a little slower. Maybe RDMA Write acks + // occur a little slower then Native OPA's explicit TID_COMPLETION + // such that sender does not get it's MPI_Send done prior to remote + // node completing it's MPI_Recv and starting it's next MPI_Send + // may want to see if REVISIT is providing value or whether anything + // to tune to speed up RDMA Send completion (eg. and the ack which + // triggers it). + // Experiment with skipping revisit return above and always doing + // this need more analysis but limited if any impact on native OPA. 
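/*
 * Editorial sketch, not part of the patch: psmi_mq_handle_rts() resolves to
 * one of three outcomes -- a posted receive matched, "revisit later" (leave
 * the packet queued in hardware and retry once), or park the request on the
 * unexpected queue until a later MPI_Recv matches it. The helper below is a
 * hypothetical restatement of that decision only; the DEMO_RTS_* values are
 * simplified stand-ins for the MQ_RET_* codes declared in the MQ header.
 */
enum demo_rts_outcome {
	DEMO_RTS_MATCH_OK,	/* corresponds to MQ_RET_MATCH_OK */
	DEMO_RTS_REVISIT,	/* corresponds to MQ_RET_UNEXP_NO_RESOURCES */
	DEMO_RTS_UNEXP_QUEUED	/* corresponds to MQ_RET_UNEXP_OK */
};

static inline enum demo_rts_outcome
demo_classify_rts(int matched, int msgorder)
{
	if (msgorder && matched)
		return DEMO_RTS_MATCH_OK;	/* posted receive found */
	if (msgorder > 1)
		return DEMO_RTS_REVISIT;	/* first attempt: retry in hope a
						   matching receive is posted soon */
	return DEMO_RTS_UNEXP_QUEUED;		/* queue as unexpected; the RTS
						   callback fires on a later match */
}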
+ _HFI_MMDBG("no match req queue msgorder=%d\n", msgorder); + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert(req != NULL); + /* We don't know recv_msglen yet but we set it here for + * mq_iprobe */ + req->req_data.send_msglen = req->req_data.recv_msglen = send_msglen; + PSM2_LOG_EPM_COND(req->req_data.send_msglen > mq->hfi_thresh_rv, + OPCODE_LONG_RTS,PSM2_LOG_RX,src->epid,mq->ep->epid, + "req->rts_reqidx_peer: %d",req->rts_reqidx_peer); + req->state = MQ_STATE_UNEXP_RV; + req->req_data.peer = src; + req->req_data.tag = *tag; + req->rts_callback = cb; + if (paylen > send_msglen) paylen = send_msglen; + if (paylen) { + req->req_data.buf = psmi_mq_sysbuf_alloc(mq, paylen); + psmi_assert(paylen == 0 || req->req_data.buf != NULL); + mq->stats.rx_sysbuf_num++; + mq->stats.rx_sysbuf_bytes += paylen; + psmi_mq_mtucpy(req->req_data.buf, payload, paylen); + } + req->recv_msgoff = req->send_msgoff = paylen; + + if (msgorder) { + mq_add_to_unexpected_hashes(mq, req); + } + /* caller will handle out of order case */ + *req_o = req; /* no match, will callback */ + rc = MQ_RET_UNEXP_OK; + } + +#ifdef PSM_DEBUG + if (req) + _HFI_VDBG("match=%s (req=%p) src=%s mqtag=%08x.%08x.%08x recvlen=%d " + "sendlen=%d errcode=%d\n", + rc == MQ_RET_MATCH_OK ? "YES" : "NO", req, + psmi_epaddr_get_name(src->epid), + req->req_data.tag.tag[0], req->req_data.tag.tag[1], req->req_data.tag.tag[2], + req->req_data.recv_msglen, req->req_data.send_msglen, req->req_data.error_code); + else + _HFI_VDBG("match=%s (req=%p) src=%s\n", + rc == MQ_RET_MATCH_OK ? "YES" : "NO", req, + psmi_epaddr_get_name(src->epid)); +#endif /* #ifdef PSM_DEBUG */ + return rc; +} + +/* + * This handles the regular (i.e. non-rendezvous MPI envelopes) + */ +int +psmi_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t *tag, + uint32_t send_msglen, uint32_t offset, + const void *payload, uint32_t paylen, int msgorder, + uint32_t opcode, psm2_mq_req_t *req_o) +{ + psm2_mq_req_t req; + uint32_t msglen; + psmi_mtucpy_fn_t psmi_mtucpy_fn; +#if defined(PSM_CUDA) + int converted = 0; +#endif // PSM_CUDA + + if (msgorder && (req = mq_req_match(mq, src, tag, 1))) { + /* we have a match */ + void *user_buffer = req->req_data.buf; + psmi_assert(MQE_TYPE_IS_RECV(req->type)); + req->req_data.peer = src; + req->req_data.tag = *tag; + msglen = mq_set_msglen(req, req->req_data.buf_len, send_msglen); + + _HFI_VDBG("match=YES (req=%p) opcode=%x src=%s mqtag=%x.%x.%x" + " msglen=%d paylen=%d\n", req, opcode, + psmi_epaddr_get_name(src->epid), + tag->tag[0], tag->tag[1], tag->tag[2], msglen, + paylen); + + switch (opcode) { + case MQ_MSG_TINY: + /* mq_copy_tiny() can handle zero byte */ +#ifdef PSM_CUDA + if (PSMI_USE_GDR_COPY(req, msglen)) { + user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD, + (unsigned long)req->req_data.buf, + msglen, 1, src->proto); + converted = 1; + } +#endif + mq_copy_tiny((uint32_t *) user_buffer, (uint32_t *) payload, msglen); +#if defined(PSM_CUDA) + if (converted) { + gdr_unmap_gpu_host_addr(GDR_FD, user_buffer, msglen, + src->proto); + } +#endif // PSM_CUDA + + req->state = MQ_STATE_COMPLETE; + ips_barrier(); + mq_qq_append(&mq->completed_q, req); + break; + + case MQ_MSG_SHORT: /* message fits in 1 payload */ + psmi_mtucpy_fn = psmi_mq_mtucpy; +#ifdef PSM_CUDA + if (PSMI_USE_GDR_COPY(req, msglen)) { + user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD, + (unsigned long)req->req_data.buf, + msglen, 1, src->proto); + psmi_mtucpy_fn = psmi_mq_mtucpy_host_mem; + converted = 1; + } +#endif + if (msglen <= paylen) { + 
psmi_mtucpy_fn(user_buffer, payload, msglen); + } else { + psmi_assert((msglen & ~0x3) == paylen); + psmi_mtucpy_fn(user_buffer, payload, paylen); + /* + * there are nonDW bytes attached in header, + * copy after the DW payload. + */ + mq_copy_tiny((uint32_t *)((uint8_t *)user_buffer + paylen), + (uint32_t *)&offset, msglen & 0x3); + } +#if defined(PSM_CUDA) + if (converted) { + gdr_unmap_gpu_host_addr(GDR_FD, user_buffer, msglen, + src->proto); + } +#endif // PSM_CUDA + req->state = MQ_STATE_COMPLETE; + ips_barrier(); + mq_qq_append(&mq->completed_q, req); + break; + + case MQ_MSG_EAGER: + req->state = MQ_STATE_MATCHED; + req->type |= MQE_TYPE_EAGER_QUEUE; + req->send_msgoff = req->recv_msgoff = 0; + STAILQ_INSERT_TAIL(&mq->eager_q, req, nextq); + _HFI_VDBG("exp MSG_EAGER of length %d bytes pay=%d\n", + msglen, paylen); +#ifdef PSM_CUDA + if (PSMI_USE_GDR_COPY(req, req->req_data.send_msglen)) { + req->req_data.buf = gdr_convert_gpu_to_host_addr(GDR_FD, + (unsigned long)req->user_gpu_buffer, + req->req_data.send_msglen, 1, src->proto); + converted = 1; + } +#endif + if (paylen > 0) + psmi_mq_handle_data(mq, req, offset, payload, + paylen); +#if defined(PSM_CUDA) + if (converted) { + gdr_unmap_gpu_host_addr(GDR_FD, req->req_data.buf, + req->req_data.send_msglen, src->proto); + } +#endif // PSM_CUDA + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Internal error, unknown packet 0x%x", + opcode); + } + + mq->stats.rx_user_bytes += msglen; + mq->stats.rx_user_num++; + + *req_o = req; /* yes match */ + return MQ_RET_MATCH_OK; + } + + /* unexpected message or out of order message. */ + +#if 0 + /* + * Keep a callback here in case we want to fit some other high-level + * protocols over MQ (i.e. shmem). These protocols would bypass the + * normal message handling and go to higher-level message handlers. + */ + if (msgorder && mq->unexpected_callback) { + mq->unexpected_callback(mq, opcode, epaddr, req_data.tag, send_msglen, + payload, paylen); + *req_o = NULL; + return MQ_RET_UNEXP_OK; + } +#endif + + if (msgorder > 1) { + /* There is NO request match, and this is the first time + * to try to process this packet, we leave the packet in + * hardware queue for retry in hope there is a request + * match nex time, this is for performance + * consideration. 
+ */ + return MQ_RET_UNEXP_NO_RESOURCES; + } + + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert(req != NULL); + + req->req_data.peer = src; + req->req_data.tag = *tag; + req->recv_msgoff = 0; + req->req_data.recv_msglen = req->req_data.send_msglen = req->req_data.buf_len = msglen = + send_msglen; + + _HFI_VDBG("match=NO (req=%p) opcode=%x src=%s mqtag=%08x.%08x.%08x" + " send_msglen=%d\n", req, opcode, + psmi_epaddr_get_name(src->epid), + tag->tag[0], tag->tag[1], tag->tag[2], send_msglen); + + switch (opcode) { + case MQ_MSG_TINY: + if (msglen > 0) { + req->req_data.buf = psmi_mq_sysbuf_alloc(mq, msglen); + psmi_assert(msglen == 0 || req->req_data.buf != NULL); + mq->stats.rx_sysbuf_num++; + mq->stats.rx_sysbuf_bytes += paylen; + mq_copy_tiny((uint32_t *) req->req_data.buf, + (uint32_t *) payload, msglen); + } else + req->req_data.buf = NULL; + req->state = MQ_STATE_COMPLETE; + break; + + case MQ_MSG_SHORT: + req->req_data.buf = psmi_mq_sysbuf_alloc(mq, msglen); + psmi_assert(msglen == 0 || req->req_data.buf != NULL); + mq->stats.rx_sysbuf_num++; + mq->stats.rx_sysbuf_bytes += paylen; + if (msglen <= paylen) { + psmi_mq_mtucpy(req->req_data.buf, payload, msglen); + } else { + psmi_assert((msglen & ~0x3) == paylen); + psmi_mq_mtucpy(req->req_data.buf, payload, paylen); + /* + * there are nonDW bytes attached in header, + * copy after the DW payload. + */ + mq_copy_tiny((uint32_t *)(req->req_data.buf+paylen), + (uint32_t *)&offset, msglen & 0x3); + } + req->state = MQ_STATE_COMPLETE; + break; + + case MQ_MSG_EAGER: + req->send_msgoff = 0; + req->req_data.buf = psmi_mq_sysbuf_alloc(mq, msglen); + psmi_assert(msglen == 0 || req->req_data.buf != NULL); + mq->stats.rx_sysbuf_num++; + mq->stats.rx_sysbuf_bytes += paylen; + req->state = MQ_STATE_UNEXP; + req->type |= MQE_TYPE_EAGER_QUEUE; + STAILQ_INSERT_TAIL(&mq->eager_q, req, nextq); + _HFI_VDBG("unexp MSG_EAGER of length %d bytes pay=%d\n", + msglen, paylen); + if (paylen > 0) + psmi_mq_handle_data(mq, req, offset, payload, paylen); + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Internal error, unknown packet 0x%x", + opcode); + } + + mq->stats.rx_sys_bytes += msglen; + mq->stats.rx_sys_num++; + + if (msgorder) { + mq_add_to_unexpected_hashes(mq, req); + } + /* caller will handle out of order case */ + *req_o = req; /* no match, will callback */ + return MQ_RET_UNEXP_OK; +} + +int psmi_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq) +{ + psm2_mq_req_t ereq; + uint32_t msglen; + + ereq = mq_req_match(mq, ureq->req_data.peer, &ureq->req_data.tag, 1); + if (ereq == NULL) { + mq_add_to_unexpected_hashes(mq, ureq); + return 0; + } + + psmi_assert(MQE_TYPE_IS_RECV(ereq->type)); + ereq->req_data.peer = ureq->req_data.peer; + ereq->req_data.tag = ureq->req_data.tag; + msglen = mq_set_msglen(ereq, ereq->req_data.buf_len, ureq->req_data.send_msglen); + + switch (ureq->state) { + case MQ_STATE_COMPLETE: + if (ureq->req_data.buf != NULL) { /* 0-byte don't alloc a sysreq_data.buf */ + psmi_mq_mtucpy(ereq->req_data.buf, (const void *)ureq->req_data.buf, + msglen); + psmi_mq_sysbuf_free(mq, ureq->req_data.buf); + } + ereq->state = MQ_STATE_COMPLETE; + ips_barrier(); + mq_qq_append(&mq->completed_q, ereq); + break; + case MQ_STATE_UNEXP: /* not done yet */ + ereq->state = MQ_STATE_MATCHED; + ereq->msg_seqnum = ureq->msg_seqnum; + ereq->ptl_req_ptr = ureq->ptl_req_ptr; + ereq->send_msgoff = ureq->send_msgoff; + ereq->recv_msgoff = min(ureq->recv_msgoff, msglen); + if (ereq->recv_msgoff) { + 
psmi_mq_mtucpy(ereq->req_data.buf, + (const void *)ureq->req_data.buf, + ereq->recv_msgoff); + } + psmi_mq_sysbuf_free(mq, ureq->req_data.buf); + ereq->type = ureq->type; + STAILQ_INSERT_AFTER(&mq->eager_q, ureq, ereq, nextq); + STAILQ_REMOVE(&mq->eager_q, ureq, psm2_mq_req, nextq); + break; + case MQ_STATE_UNEXP_RV: /* rendez-vous ... */ + ereq->state = MQ_STATE_MATCHED; + ereq->rts_peer = ureq->rts_peer; + ereq->rts_sbuf = ureq->rts_sbuf; + ereq->send_msgoff = ureq->send_msgoff; + ereq->recv_msgoff = min(ureq->recv_msgoff, msglen); + if (ereq->recv_msgoff) { + psmi_mq_mtucpy(ereq->req_data.buf, + (const void *)ureq->req_data.buf, + ereq->recv_msgoff); + } + if (ereq->send_msgoff) { + psmi_mq_sysbuf_free(mq, ureq->req_data.buf); + } + ereq->rts_callback = ureq->rts_callback; + ereq->rts_reqidx_peer = ureq->rts_reqidx_peer; + ereq->type = ureq->type; + ereq->rts_callback(ereq, 0); + break; + default: + fprintf(stderr, "Unexpected state %d in req %p\n", ureq->state, + ureq); + fprintf(stderr, "type=%d, mq=%p, tag=%08x.%08x.%08x\n", + ureq->type, ureq->mq, ureq->req_data.tag.tag[0], + ureq->req_data.tag.tag[1], ureq->req_data.tag.tag[2]); + abort(); + } + + psmi_mq_req_free(ureq); + return 0; +} diff --git a/prov/psm3/psm3/psm_mq_utils.c b/prov/psm3/psm3/psm_mq_utils.c new file mode 100644 index 00000000000..be05a46854c --- /dev/null +++ b/prov/psm3/psm3/psm_mq_utils.c @@ -0,0 +1,266 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +/* + * + * MQ request allocator + * + */ + +psm2_mq_req_t MOCKABLE(psmi_mq_req_alloc)(psm2_mq_t mq, uint32_t type) +{ + psm2_mq_req_t req; + + psmi_assert(type == MQE_TYPE_RECV || type == MQE_TYPE_SEND); + + if (type == MQE_TYPE_SEND) + req = psmi_mpool_get(mq->sreq_pool); + else + req = psmi_mpool_get(mq->rreq_pool); + + if_pt(req != NULL) { + memset(req, 0, sizeof(struct psm2_mq_req)); + + req->type = type; + req->state = MQ_STATE_FREE; + req->mq = mq; + + return req; + } else { /* we're out of reqs */ + int issend = (type == MQE_TYPE_SEND); + uint32_t reqmax, reqchunk; + psmi_mpool_get_obj_info(issend ? mq->sreq_pool : mq->rreq_pool, + &reqchunk, &reqmax); + + psmi_handle_error(PSMI_EP_NORETURN, PSM2_PARAM_ERR, + "Exhausted %d MQ %s request descriptors, which usually indicates " + "a user program error or insufficient request descriptors (%s=%d)", + reqmax, issend ? "isend" : "irecv", + issend ? "PSM3_MQ_SENDREQS_MAX" : + "PSM3_MQ_RECVREQS_MAX", reqmax); + return NULL; + } +} +MOCK_DEF_EPILOGUE(psmi_mq_req_alloc); + +#ifdef PSM_CUDA +void psmi_cuda_recvreq_alloc_func(int is_alloc, void* context, void* obj) { + psm2_mq_req_t recvreq = (psm2_mq_req_t)obj; + if (PSMI_IS_CUDA_ENABLED) { + if (is_alloc) + PSMI_CUDA_CALL(cuEventCreate, &recvreq->cuda_ipc_event, CU_EVENT_DEFAULT); + else + PSMI_CUDA_CALL(cuEventDestroy, recvreq->cuda_ipc_event); + } + return; +} +#endif + +psm2_error_t psmi_mq_req_init(psm2_mq_t mq) +{ + psm2_mq_req_t warmup_req; + psm2_error_t err = PSM2_OK; + + _HFI_VDBG("mq element sizes are %d bytes\n", + (int)sizeof(struct psm2_mq_req)); + + /* + * Send MQ requests + */ + { + struct psmi_rlimit_mpool rlim = MQ_SENDREQ_LIMITS; + uint32_t maxsz, chunksz; + + if ((err = + psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz))) + goto fail; + + if ((mq->sreq_pool = + psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz, + maxsz, 0, DESCRIPTORS, NULL, + NULL)) == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + } + + /* + * Receive MQ requests + */ + { + struct psmi_rlimit_mpool rlim = MQ_RECVREQ_LIMITS; + uint32_t maxsz, chunksz; + + if ((err = + psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz))) + goto fail; + /* Have a callback function for receive req mpool which creates + * and destroy events. 
+ */ +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED) { + if ((mq->rreq_pool = + psmi_mpool_create_for_cuda(sizeof(struct psm2_mq_req), chunksz, + maxsz, 0, DESCRIPTORS, NULL, + NULL, psmi_cuda_recvreq_alloc_func, NULL)) == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + } + else { + if ((mq->rreq_pool = + psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz, + maxsz, 0, DESCRIPTORS, NULL, + NULL)) == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + } +#else + if ((mq->rreq_pool = + psmi_mpool_create(sizeof(struct psm2_mq_req), chunksz, + maxsz, 0, DESCRIPTORS, NULL, + NULL)) == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } +#endif + } + + /* Warm up the allocators */ + warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert_always(warmup_req != NULL); + psmi_mq_req_free(warmup_req); + + warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); + psmi_assert_always(warmup_req != NULL); + psmi_mq_req_free(warmup_req); + +fail: + return err; +} + +psm2_error_t psmi_mq_req_fini(psm2_mq_t mq) +{ + psmi_mpool_destroy(mq->rreq_pool); + psmi_mpool_destroy(mq->sreq_pool); + return PSM2_OK; +} + + +#if 0 // unused code, specific to QLogic MPI +/* + * Hooks to plug into QLogic MPI stats + */ + +static +void psmi_mq_stats_callback(struct mpspawn_stats_req_args *args) +{ + uint64_t *entry = args->stats; + psm2_mq_t mq = (psm2_mq_t) args->context; + psm2_mq_stats_t mqstats; + + psm2_mq_get_stats(mq, &mqstats); + + if (args->num < 8) + return; + + entry[0] = mqstats.tx_eager_num; + entry[1] = mqstats.tx_eager_bytes; + entry[2] = mqstats.tx_rndv_num; + entry[3] = mqstats.tx_rndv_bytes; + + entry[4] = mqstats.rx_user_num; + entry[5] = mqstats.rx_user_bytes; + entry[6] = mqstats.rx_sys_num; + entry[7] = mqstats.rx_sys_bytes; +} + +void psmi_mq_stats_register(psm2_mq_t mq, mpspawn_stats_add_fn add_fn) +{ + char *desc[8]; + uint16_t flags[8]; + int i; + struct mpspawn_stats_add_args mp_add; + /* + * Hardcode flags until we correctly move mpspawn to its own repo. + * flags[i] = MPSPAWN_REDUCTION_MAX | MPSPAWN_REDUCTION_MIN; + */ + for (i = 0; i < 8; i++) + flags[i] = MPSPAWN_STATS_REDUCTION_ALL; + + desc[0] = "Eager count sent"; + desc[1] = "Eager bytes sent"; + desc[2] = "Rendezvous count sent"; + desc[3] = "Rendezvous bytes sent"; + desc[4] = "Expected count received"; + desc[5] = "Expected bytes received"; + desc[6] = "Unexpect count received"; + desc[7] = "Unexpect bytes received"; + + mp_add.version = MPSPAWN_STATS_VERSION; + mp_add.num = 8; + mp_add.header = "MPI Statistics Summary (max,min @ rank)"; + mp_add.req_fn = psmi_mq_stats_callback; + mp_add.desc = desc; + mp_add.flags = flags; + mp_add.context = mq; + + add_fn(&mp_add); +} +#endif diff --git a/prov/psm3/psm3/psm_netutils.h b/prov/psm3/psm3/psm_netutils.h new file mode 100644 index 00000000000..316529bef8b --- /dev/null +++ b/prov/psm3/psm3/psm_netutils.h @@ -0,0 +1,87 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef _PSMI_NETUTILS_H +#define _PSMI_NETUTILS_H + +#include /* ipv4addr */ +#include +#include +#include +#include +#include + +// network function subset of psm_utils.c so that HAL can use this without +// needing psm_ep_t and psm_epid_t from psm_user.h + +/* + * network address manipulation + */ + // prefered size for psmi_sockaddr_ntop +#define PSM_ADDRSTRLEN (INET6_ADDRSTRLEN+19+7) // 16 digit sid, plus 3 " 0x" + // 4 digit pkey plues 3 " 0x" +const char *psmi_sockaddr_ntop(struct sockaddr* addr, char *dst, socklen_t size); +const char *psmi_ipv4_ntop(uint32_t ip_addr, char *dst, socklen_t size); +socklen_t psmi_sockaddr_len(struct sockaddr* addr); + +int psmi_count_high_bits(uint32_t netmask); +// This converts a bit count generated by psmi_count_high_bits back into +// a IPv4 netmask +static inline uint32_t psmi_bit_count_to_mask(int count) +{ + return (uint32_t)(0xffffffff << (32-count)); +} + +int psmi_get_eth_netmask(__be32 ip_addr, __be32 *netmask); + +#endif /* _PSMI_NETUTILS_H */ diff --git a/prov/psm3/psm3/psm_perf.c b/prov/psm3/psm3/psm_perf.c new file mode 100644 index 00000000000..aaf3fd05213 --- /dev/null +++ b/prov/psm3/psm3/psm_perf.c @@ -0,0 +1,260 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifdef RDPMC_PERF_FRAMEWORK + +#include "psm_user.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Configuration */ + +#define RDPMC_PERF_DEFAULT_TYPE (PERF_TYPE_HARDWARE) +#define RDPMC_PERF_DEFAULT_CONFIG (PERF_COUNT_HW_CPU_CYCLES) + +__thread struct rdpmc_ctx global_rdpmc_ctx; + +u64 global_rdpmc_begin[RDPMC_PERF_MAX_SLOT_NUMBER]; +u64 global_rdpmc_summ[RDPMC_PERF_MAX_SLOT_NUMBER]; +u64 global_rdpmc_number[RDPMC_PERF_MAX_SLOT_NUMBER]; + +char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SLOT_NAME]; + +__thread unsigned int global_rdpmc_type = RDPMC_PERF_DEFAULT_TYPE; +__thread unsigned int global_rdpmc_config = RDPMC_PERF_DEFAULT_CONFIG; + +struct rdpmc_ctx { + int fd; + struct perf_event_mmap_page *buf; +}; + +typedef unsigned long long u64; + +#if defined(__ICC) || defined(__INTEL_COMPILER) +#include "immintrin.h" +#endif + +/** + * DOC: Ring 3 counting for CPU performance counters + * + * This library allows accessing CPU performance counters from ring 3 + * using the perf_events subsystem. This is useful to measure specific + * parts of programs (e.g. excluding initialization code) + * + * Requires a Linux 3.3+ kernel + */ + +/** + * rdpmc_open_attr - initialize a raw ring 3 readable performance counter + * @attr: perf struct %perf_event_attr for the counter + * @ctx: Pointer to struct %rdpmc_ctx that is initialized. + * @leader_ctx: context of group leader or NULL + * + * This allows more flexible setup with a custom &perf_event_attr. + * For simple uses rdpmc_open() should be used instead. + * Must be called for each thread using the counter. + * Must be closed with rdpmc_close() + */ +PSMI_ALWAYS_INLINE(int rdpmc_open_attr(struct perf_event_attr *attr, struct rdpmc_ctx *ctx, + struct rdpmc_ctx *leader_ctx)) +{ + ctx->fd = syscall(__NR_perf_event_open, attr, 0, -1, + leader_ctx ? 
leader_ctx->fd : -1, 0); + if (ctx->fd < 0) { + perror("perf_event_open"); + return -1; + } + ctx->buf = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, ctx->fd, 0); + if (ctx->buf == MAP_FAILED) { + close(ctx->fd); + perror("mmap on perf fd"); + return -1; + } + return 0; +} + +/** + * rdpmc_open - initialize a simple ring 3 readable performance counter + * @counter: Raw event descriptor (UUEE UU unit mask EE event) + * @ctx: Pointer to struct &rdpmc_ctx that is initialized + * + * The counter will be set up to count CPU events excluding the kernel. + * Must be called for each thread using the counter. + * The caller must make sure counter is suitable for the running CPU. + * Only works in 3.3+ kernels. + * Must be closed with rdpmc_close() + */ + +PSMI_ALWAYS_INLINE(int rdpmc_open(unsigned counter, struct rdpmc_ctx *ctx)) +{ + struct perf_event_attr attr = { + .type = counter > 10 ? PERF_TYPE_RAW : PERF_TYPE_HARDWARE, + .size = PERF_ATTR_SIZE_VER0, + .config = counter, + .sample_type = PERF_SAMPLE_READ, + .exclude_kernel = 1, + }; + return rdpmc_open_attr(&attr, ctx, NULL); +} + +/** + * rdpmc_close: free a ring 3 readable performance counter + * @ctx: Pointer to &rdpmc_ctx context. + * + * Must be called by each thread for each context it initialized. + */ +PSMI_ALWAYS_INLINE(void rdpmc_close(struct rdpmc_ctx *ctx)) +{ + close(ctx->fd); + munmap(ctx->buf, sysconf(_SC_PAGESIZE)); +} + +static void psmi_rdpmc_perf_framework_init() +{ + int rdpmc_retval; + + struct rdpmc_ctx *leader = NULL; + + int env_result = 1; + char * env_type = NULL; + char * env_config = NULL; + + env_type = getenv("RDPMC_PERF_TYPE"); + + if (env_type) + { + global_rdpmc_type = (int)strtoll(env_type, NULL, 16); + } + else + { + env_result = 0; + } + + env_config = getenv("RDPMC_PERF_CONFIG"); + + if (env_config) + { + global_rdpmc_config = (int)strtoll(env_config, NULL, 16); + } + else + { + env_result = 0; + } + + if (env_result != 1) + { + global_rdpmc_type = RDPMC_PERF_DEFAULT_TYPE; + global_rdpmc_config = RDPMC_PERF_DEFAULT_CONFIG; + } + + struct perf_event_attr attr = { + .type = global_rdpmc_type, + .size = sizeof(struct perf_event_attr), + .config = global_rdpmc_config, + .sample_type = PERF_SAMPLE_READ, + }; + + rdpmc_retval = rdpmc_open_attr(&attr, &global_rdpmc_ctx, leader); + + if (rdpmc_retval < 0) + { + printf("Unable to initialize RDPMC. Error: %d\n", rdpmc_retval); + exit(-1); + } +} + +/** + * rdpmc_read: read a ring 3 readable performance counter + * @ctx: Pointer to initialized &rdpmc_ctx structure. + * + * Read the current value of a running performance counter. + */ +unsigned long long rdpmc_read(struct rdpmc_ctx *ctx) +{ + static __thread int rdpmc_perf_initialized = 0; + + if_pf(!rdpmc_perf_initialized) + { + psmi_rdpmc_perf_framework_init(); + rdpmc_perf_initialized = 1; + } + + u64 val; + unsigned seq; + u64 offset = 0; + + typeof (ctx->buf) buf = ctx->buf; + do { + seq = buf->lock; + ips_rmb(); + if (buf->index <= 0) + return buf->offset; +#if defined(__ICC) || defined(__INTEL_COMPILER) + val = _rdpmc(buf->index - 1); +#else /* GCC */ + val = __builtin_ia32_rdpmc(buf->index - 1); +#endif + offset = buf->offset; + ips_rmb(); + } while (buf->lock != seq); + return val + offset; +} + +#endif /* RDPMC_PERF_FRAMEWORK */ diff --git a/prov/psm3/psm3/psm_perf.h b/prov/psm3/psm3/psm_perf.h new file mode 100644 index 00000000000..7233ba8d398 --- /dev/null +++ b/prov/psm3/psm3/psm_perf.h @@ -0,0 +1,149 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. 
When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* When the perf framework is enabled, GENERIC_PERF_BEGIN/END can be + * used to mark sections of code to be included in a given "slot number" + * for performance statistics. + * The PMU will be used to measure instruction cycles used between the BEGIN/END + * This permits precise statistics to be gathered for how much CPU is required + * to execute all the code in a given slot number during a given run. + * At the end of the run the statistics are output. 
+ * This capability is only enabled when PSM is built with -DRDPMC_PERF_FRAMEWORK + */ + +/* slot numbers for the counters we want */ +#define PSM_TX_SPEEDPATH_CTR 0 +#define PSM_RX_SPEEDPATH_CTR 1 + +#ifdef RDPMC_PERF_FRAMEWORK + +/* Configuration */ + +#define RDPMC_PERF_MAX_SLOT_NUMBER (8) // we only use 2, RX and TX +#define RDPMC_PERF_MAX_SLOT_NAME (256) + +/* RDPMC infrastructure */ + +extern __thread struct rdpmc_ctx global_rdpmc_ctx; + +typedef unsigned long long u64; + +extern u64 global_rdpmc_begin[RDPMC_PERF_MAX_SLOT_NUMBER]; +extern u64 global_rdpmc_summ[RDPMC_PERF_MAX_SLOT_NUMBER]; +extern u64 global_rdpmc_number[RDPMC_PERF_MAX_SLOT_NUMBER]; + +extern char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SLOT_NAME]; + +extern __thread unsigned int global_rdpmc_type; +extern __thread unsigned int global_rdpmc_config; + +extern unsigned long long rdpmc_read(struct rdpmc_ctx *ctx); + +#define RDPMC_PERF_INIT() \ +{ \ + int i; \ + for (i = 0; i < RDPMC_PERF_MAX_SLOT_NUMBER; i++) \ + { \ + global_rdpmc_begin[i] = 0; \ + global_rdpmc_summ[i] = 0; \ + global_rdpmc_number[i] = 0; \ + global_rdpmc_slot_name[i][0] = '\0'; \ + } \ +} + +/* There is no slot_number max range check */ + +#define RDPMC_PERF_SET_SLOT_NAME(slot_number, name) \ +{ \ + strncpy(global_rdpmc_slot_name[(slot_number)], (name), RDPMC_PERF_MAX_SLOT_NAME - 1); \ + global_rdpmc_slot_name[(slot_number)][RDPMC_PERF_MAX_SLOT_NAME - 1] = '\0'; \ +} + +#define RDPMC_PERF_BEGIN(slot_number) \ +{ \ + global_rdpmc_begin[(slot_number)] = rdpmc_read(&global_rdpmc_ctx); \ +} + +#define RDPMC_PERF_END(slot_number) \ +{ \ + global_rdpmc_summ[(slot_number)] += (rdpmc_read(&global_rdpmc_ctx) - global_rdpmc_begin[(slot_number)]); \ + global_rdpmc_number[(slot_number)]++; \ +} + +#define RDPMC_PERF_DUMP(stream) \ +{ \ + int i; \ + for (i = 0; i < RDPMC_PERF_MAX_SLOT_NUMBER; i++) \ + { \ + if (global_rdpmc_slot_name[i][0]) \ + { \ + fprintf((stream), "RDPMC [%s] (%x, %04x) avg = %g (%llu times)\n", \ + global_rdpmc_slot_name[i], global_rdpmc_type, global_rdpmc_config, \ + (double)global_rdpmc_summ[i] / global_rdpmc_number[i], global_rdpmc_number[i]); \ + fflush((stream)); \ + } \ + } \ +} + +#define GENERIC_PERF_INIT() RDPMC_PERF_INIT() +#define GENERIC_PERF_SET_SLOT_NAME(slot_number, name) RDPMC_PERF_SET_SLOT_NAME(slot_number, name) +#define GENERIC_PERF_BEGIN(slot_number) RDPMC_PERF_BEGIN(slot_number) +#define GENERIC_PERF_END(slot_number) RDPMC_PERF_END(slot_number) +#define GENERIC_PERF_DUMP(stream) RDPMC_PERF_DUMP(stream) +#else /* RDPMC_PERF_FRAMEWORK */ +#define GENERIC_PERF_INIT() +#define GENERIC_PERF_SET_SLOT_NAME(slot_number, name) +#define GENERIC_PERF_BEGIN(slot_number) +#define GENERIC_PERF_END(slot_number) +#define GENERIC_PERF_DUMP(stream) +#endif /* RDPMC_PERF_FRAMEWORK */ diff --git a/prov/psm3/psm3/psm_rndv_mod.c b/prov/psm3/psm3/psm_rndv_mod.c new file mode 100644 index 00000000000..e4b01ced9d8 --- /dev/null +++ b/prov/psm3/psm3/psm_rndv_mod.c @@ -0,0 +1,816 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ +#ifdef RNDV_MOD_MR + +#include +#include +#include +#include +//#include +//#include +//#include +//#include /* cpu_set */ +#include /* isalpha */ +//#include +#include +//#include // for AF_IB structures +//#include +#include "psm_user.h" // get psmi_calloc and free +#include "psm_rndv_mod.h" + +#include +#include + +// Intel Columbiaville (800 series NIC) specific udata for RV reg_mr ioctl +// Mellanox and OPA ignore udata, so doesn't matter what we pass them + +/* For CVL irdma device */ +/* nd_linux-lib_cpk_rdma/src/DRIVER_CORE/src/CORE/icrdma-abi.h */ +enum irdma_memreg_type { + IW_MEMREG_TYPE_MEM = 0, + IW_MEMREG_TYPE_QP = 1, + IW_MEMREG_TYPE_CQ = 2, + IW_MEMREG_TYPE_RSVD = 3, + IW_MEMREG_TYPE_MW = 4, +}; + +struct irdma_mem_reg_req { + uint16_t reg_type; /* Memory, QP or CQ */ + uint16_t cq_pages; + uint16_t rq_pages; + uint16_t sq_pages; +}; + +// we won't have ep in kernel API and won't have this memory tracking +// so just use EP_NONE +#define my_calloc(nmemb, size) (psmi_calloc(PSMI_EP_NONE, PEER_RNDV, (nmemb), (size))) +//#define my_calloc(nmemb, size) (psmi_calloc(PSMI_EP_NONE, NETWORK_BUFFERS, (nmemb), (size))) +#define my_free(p) (psmi_free(p)) + +static int rv_map_event_ring(psm2_rv_t rv, struct rv_event_ring* ring, + int entries, int offset) +{ + ring->len = RING_ALLOC_LEN(entries); + + //printf("Calling mmap for offset: %d len:%d\n", offset, ring->len); + + ring->hdr = (struct rv_ring_header *)mmap(0, ring->len, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE | MAP_LOCKED, + rv->fd, offset); + if (!ring->hdr) { + ring->len = 0; + return -1; + } + ring->num = entries; + return 0; +} + +static void rv_unmap_event_ring(psm2_rv_t rv, struct rv_event_ring* ring) +{ + if (ring->hdr) + if(munmap(ring->hdr, ring->len)) + printf("munmap event ring failed:%s (%d)\n", strerror(errno),errno); + ring->hdr = NULL; + ring->len = 0; + ring->num = 0; +} + +// we call this once per ep (eg. NIC) so we supply the local address +// of our NIC for use in the IB CM bind, especially for ethernet +psm2_rv_t __psm2_rv_open(const char *devname, struct local_info *loc_info) +{ + psm2_rv_t rv = NULL; + struct rv_attach_params aparams; + struct rv_query_params_out qparams; + int ret; + int save_errno; + + loc_info->capability = 0; + rv = (psm2_rv_t)my_calloc(1, sizeof(struct psm2_rv)); + if (! 
rv) { + save_errno = ENOMEM; + goto fail; + } + //printf("XXXX 0x%lx %s\n", pthread_self(), __FUNCTION__); + rv->fd = open(RV_FILE_NAME, O_RDWR); + if (rv->fd == -1) { + save_errno = errno; + printf("fd open failed %s: %s\n", RV_FILE_NAME, strerror(errno)); + goto fail; + } + + if ((ret = ioctl(rv->fd, RV_IOCTL_QUERY, &qparams)) != 0) { + save_errno = errno; + printf("query ioctl failed ret:%s (%d)\n", strerror(errno), ret); + goto fail; + } + loc_info->major_rev = qparams.major_rev; + loc_info->minor_rev = qparams.minor_rev; + loc_info->capability = qparams.capability; + + memset(&aparams, 0, sizeof(aparams)); + snprintf(aparams.in.dev_name, RV_MAX_DEV_NAME_LEN, "%s", devname); + aparams.in.mr_cache_size = loc_info->mr_cache_size; + aparams.in.rdma_mode = loc_info->rdma_mode; + aparams.in.port_num = loc_info->port_num; + aparams.in.num_conn = loc_info->num_conn; + aparams.in.loc_addr = loc_info->loc_addr; + aparams.in.index_bits = loc_info->index_bits; + aparams.in.loc_gid_index = loc_info->loc_gid_index; + memcpy(&aparams.in.loc_gid, &loc_info->loc_gid, sizeof(aparams.in.loc_gid)); + + if (loc_info->job_key_len > sizeof(aparams.in.job_key)) { + save_errno = EINVAL; + printf("job_key_len too long\n"); + goto fail; + } + aparams.in.job_key_len = loc_info->job_key_len; + memcpy(&aparams.in.job_key, loc_info->job_key, loc_info->job_key_len); + // if 0 specified, kernel will pick a value for all jobs + // otherwise PSM can specify a job specific value, must be same in all + // processes in a given job + // ok if multiple PSM processes in different jobs all funnel + // through same listener service id as the job_key will differentiate them + aparams.in.service_id = loc_info->service_id; + aparams.in.context = (uint64_t)loc_info->context; + aparams.in.cq_entries = loc_info->cq_entries; + aparams.in.q_depth = loc_info->q_depth; + aparams.in.reconnect_timeout = loc_info->reconnect_timeout; + aparams.in.hb_interval = loc_info->hb_interval; + + if ((ret = ioctl(rv->fd, RV_IOCTL_ATTACH, &aparams)) != 0) { + save_errno = errno; + printf("attach ioctl failed ret:%s (%d)\n", strerror(errno), ret); + goto fail; + } + + loc_info->rv_index = aparams.out.rv_index; + loc_info->mr_cache_size = aparams.out.mr_cache_size; + loc_info->q_depth = aparams.out.q_depth; + loc_info->reconnect_timeout = aparams.out.reconnect_timeout; + + //printf("XXXX 0x%lx %s fd:%d\n", pthread_self(), __FUNCTION__, rv->fd); + if (loc_info->cq_entries) { + if (rv_map_event_ring(rv, &rv->events, loc_info->cq_entries, 0)) { + save_errno = errno; + printf("mmap event ring failed:%s (%d)\n", strerror(errno), errno); + goto fail; + } + } + + return rv; +fail: + if (rv) { + (void)__psm2_rv_close(rv); + } + errno = save_errno; + return NULL; +} + +// 0 on success +// -1 if rv invalid or not open and errno set +int __psm2_rv_close(psm2_rv_t rv) +{ + + if (! 
rv) {
+		errno = EINVAL;
+		return -1;
+	}
+	//printf("XXXX 0x%lx %s fd:%d\n", pthread_self(), __FUNCTION__, rv->fd);
+	rv_unmap_event_ring(rv, &rv->events);
+#if 0
+	if ((ret = ioctl(rv->fd, RV_IOCTL_DETACH, NULL)) != 0) {
+		perror("close failed\n");
+	}
+#endif
+	if (rv->fd != -1) {
+		close(rv->fd);
+	}
+
+	my_free(rv);
+	return 0;
+}
+
+int __psm2_rv_get_cache_stats(psm2_rv_t rv, struct psm2_rv_cache_stats *stats)
+{
+	struct rv_cache_stats_params_out sparams;
+	int ret;
+	int save_errno;
+
+	memset(&sparams, 0, sizeof(sparams));
+	if ((ret = ioctl(rv->fd, RV_IOCTL_GET_CACHE_STATS, &sparams)) != 0) {
+		save_errno = errno;
+		printf("get_cache_stats failed ret:%d: %s\n", ret, strerror(errno));
+		goto fail;
+	}
+	stats->cache_size = sparams.cache_size;
+	stats->max_cache_size = sparams.max_cache_size;
+	stats->limit_cache_size = sparams.limit_cache_size;
+	stats->count = sparams.count;
+	stats->max_count = sparams.max_count;
+	stats->inuse = sparams.inuse;
+	stats->max_inuse = sparams.max_inuse;
+	stats->max_refcount = sparams.max_refcount;
+	stats->hit = sparams.hit;
+	stats->miss = sparams.miss;
+	stats->full = sparams.full;
+	stats->failed = sparams.failed;
+	stats->remove = sparams.remove;
+	stats->evict = sparams.evict;
+	return 0;
+fail:
+	errno = save_errno;
+	return -1;
+}
+
+// we have a little dance here to hide the RV connect REQ and RSP from PSM
+// without needing a callback into PSM.
+// We do this by creating the rv_conn object with the remote addressing
+// information before any connection activity.
+// PSM has its own connection REQ/RSP which will occur.
+// By creating the rv_conn object before PSM sends its REQ or RSP and not
+// starting the rv connection process until PSM is about to send a PSM RSP
+// (or receives a PSM RSP), we ensure that both sides
+// will have a rv_conn ready by the time RV's CM REQ arrives.
+// Inbound RV CM REQs can compare the REQ against expected remote addresses
+// and match the proper one.
+
+// For RV at the kernel level, we only need connections at the node level.
+// In the kernel a single rv_conn will be created per remote NIC, these
+// rv_conn will be shared among multiple PSM processes. So they can be
+// identified by remote addr alone.
+//
+// For kernel RV, the REQ/RSP also needs to include the
+// job_key. RC QPs are not shared across jobs. Kernel RV will use the
+// job_key to select the proper set of rv and rv_conn objects. If none are
+// found the connection is rejected (or discarded? Which is better for
+// Denial of service protection?).
+
+// We implement a simple peer-peer connect model here and
+// the listener side will also create conn for inbound connect REQs,
+// thus PSM must call this function for both sides of a connection.
+// We will compare rem_addr against our local address (already
+// set in rv_open) to decide which side is passive vs active side of IB CM
+// connection establishment.
+// See description above for more info on connection model.
+psm2_rv_conn_t __psm2_rv_create_conn(psm2_rv_t rv,
+			struct ibv_ah_attr *ah_attr, // for remote node
+			uint32_t rem_addr) // for simple compare to loc_addr
+{
+	psm2_rv_conn_t conn = NULL;
+	struct rv_conn_create_params param;
+	int save_errno;
+
+	conn = (psm2_rv_conn_t)my_calloc(1, sizeof(struct psm2_rv_conn));
+	if (! conn) {
+		save_errno = ENOMEM;
+		goto fail;
+	}
+	conn->rv = rv;
+	// call kernel, kernel will save off this info, will have a single
+	// shared rv_conn for all processes talking to a given remote node
+	// NO IB CM activity here, just save info in prep for rv_connect
+	// TBD, do we need rem_addr argument? It can be figured out from
+	// ah_attr: for IB use dlid, for eth use low 32 bits of gid
+	// TBD should we specify PKey here for an additional inbound check
+	memset(&param, 0, sizeof(param));
+	memcpy(&param.in.ah, ah_attr, sizeof(param.in.ah));
+	param.in.rem_addr = rem_addr;
+	// while a user context could be supplied here that turns out to be
+	// expensive as the kernel must either search to find the right
+	// rv_user and rv_user_conn or the kernel must keep an array of 2^index_bits
+	// rv_user_conn pointers to find the right conn_context to supply in
+	// recv CQEs. Given PSM is only using conn_context as a sanity check
+	// we can have the CQE contain the rv_conn handle instead and eliminate
+	// the need for a kernel rv_user_conn altogether
+
+	if (ioctl(rv->fd, RV_IOCTL_CONN_CREATE, &param)) {
+		save_errno = errno;
+		goto fail;
+	}
+
+	/* Copy the params to conn for connection use */
+	conn->handle = param.out.handle;
+	conn->conn_handle = param.out.conn_handle;
+	return conn;
+
+fail:
+	if (conn)
+		my_free(conn);
+	errno = save_errno;
+	return NULL;
+}
+
+int __psm2_rv_connect(psm2_rv_conn_t conn, const struct ib_user_path_rec *path)
+{
+	struct rv_conn_connect_params_in param;
+	int ret;
+
+	if (!conn) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	// kernel will:
+	// compare conn->rem_addr and rv->loc_addr to pick passive and active side
+	// active side will start the IB CM connection (and return immediately)
+	// passive side will ensure listener is started on 1st create_conn for
+	// a given NIC
+	//
+	// on the listener, as inbound connections arrive their job_key directs
+	// them to the proper node level rv (shared by all local process rv_open
+	// with same job_key). (reject or ignore if no rv's match job key)
+	// The proper rv then compares the remote address and other info from
+	// CM REQ against conn->ah_attr to confirm it is coming from a node we
+	// expect to be part of the job; reject (or ignore) unmatched REQs
+	// (note ah_attr is a superset of rem_addr, so can just compare ah_attr)
+	// but note that ah_attr format is a little different for IB vs Eth
+	// Eth uses GID to hold IP address while IB will use LID
+	// TBD what we will enforce regarding SL, pkey, etc for Eth
+	// for IB/OPA they should match
+	// The loc_gid and dgid are available for use by the active side to
+	// satisfy IB CM. The passive side can ignore these and simply use
+	// ah_attr to verify incoming connections. Note on the passive side
+	// an incoming connection can arrive before this call, so it may not have
+	// the dgid available when the inbound connect request arrives.
+	//
+	// in either case, the connection process continues in background in
+	// kernel and PSM can poll for rv_connected to determine when it is done
+	//
+	// kernel will concurrently make progress on multiple connections
+	// active side may have a limit on how many it starts at once and may
+	// progress through the needed connections in "clumps"
+	// all connections are at node to node level and shared by all
+	// processes within the given job.
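+	//
+	// Typical calling sequence (illustrative sketch using only the
+	// routines in this file; error handling and caller variables omitted):
+	//   conn = __psm2_rv_create_conn(rv, &ah_attr, rem_addr);
+	//   __psm2_rv_connect(conn, &path_rec);
+	//   while (__psm2_rv_connected(conn) == 0)
+	//       ;	// poll (or make other progress) until established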
+
+	//
+	//return 0 on success, -1 w/errno on error
+	memset(&param, 0, sizeof(param));
+	param.handle = conn->handle;
+	memcpy(&param.path, path, sizeof(param.path));
+	ret = ioctl(conn->rv->fd, RV_IOCTL_CONN_CONNECT, &param);
+	if (ret)
+		conn->handle = 0;	// invalid handle, rv has freed uconn
+	return ret;
+}
+
+int __psm2_rv_connected(psm2_rv_conn_t conn)
+{
+	struct rv_conn_connected_params_in param;
+
+	if (! conn) {
+		errno = EINVAL;
+		return -1;
+	}
+	// verify if conn is now fully established
+	// 0=no
+	// 1=yes
+	// -1=error and errno set
+	memset(&param, 0, sizeof(param));
+	param.handle = conn->handle;
+	return ioctl(conn->rv->fd, RV_IOCTL_CONN_CONNECTED, &param);
+}
+
+// get connection count for specified sconn index within given conn
+// the count is incremented each time a successful (re)connection occurs
+// The advancement of the count can be used as a barrier to indicate
+// all transactions related to a previous QP prior to recovery are done
+// and drained.
+// returns -1 with EIO if connection cannot be recovered
+// return 0 with latest conn_count if connected or being recovered
+int __psm2_rv_get_conn_count(psm2_rv_t rv, psm2_rv_conn_t conn,
+			uint8_t index, uint32_t *count)
+{
+	struct rv_conn_get_conn_count_params params;
+	int ret;
+	int save_errno;
+
+	memset(&params, 0, sizeof(params));
+	if (conn)
+		params.in.handle = conn->handle;
+	params.in.index = index;
+
+	if ((ret = ioctl(rv->fd, RV_IOCTL_CONN_GET_CONN_COUNT, &params)) != 0) {
+		save_errno = errno;
+		printf("get_conn_count failed ret:%d: %s\n", ret, strerror(errno));
+		goto fail;
+	}
+	*count = params.out.count;
+	return 0;
+fail:
+	errno = save_errno;
+	return -1;
+}
+
+int __psm2_rv_get_conn_stats(psm2_rv_t rv, psm2_rv_conn_t conn,
+			uint8_t index, struct psm2_rv_conn_stats *stats)
+{
+	struct rv_conn_get_stats_params sparams;
+	int ret;
+	int save_errno;
+
+	memset(&sparams, 0, sizeof(sparams));
+	if (conn)
+		sparams.in.handle = conn->handle;
+	sparams.in.index = index;
+	if ((ret = ioctl(rv->fd, RV_IOCTL_CONN_GET_STATS, &sparams)) != 0) {
+		save_errno = errno;
+		printf("get_conn_stats failed ret:%d: %s\n", ret, strerror(errno));
+		goto fail;
+	}
+	stats->index = sparams.out.index;
+	stats->flags = sparams.out.flags;
+	stats->num_conn = sparams.out.num_conn;
+
+	stats->req_error = sparams.out.req_error;
+	stats->req_recv = sparams.out.req_recv;
+	stats->rep_error = sparams.out.rep_error;
+	stats->rep_recv = sparams.out.rep_recv;
+	stats->rtu_recv = sparams.out.rtu_recv;
+	stats->established = sparams.out.established;
+	stats->dreq_error = sparams.out.dreq_error;
+	stats->dreq_recv = sparams.out.dreq_recv;
+	stats->drep_recv = sparams.out.drep_recv;
+	stats->timewait = sparams.out.timewait;
+	stats->mra_recv = sparams.out.mra_recv;
+	stats->rej_recv = sparams.out.rej_recv;
+	stats->lap_error = sparams.out.lap_error;
+	stats->lap_recv = sparams.out.lap_recv;
+	stats->apr_recv = sparams.out.apr_recv;
+	stats->unexp_event = sparams.out.unexp_event;
+	stats->req_sent = sparams.out.req_sent;
+	stats->rep_sent = sparams.out.rep_sent;
+	stats->rtu_sent = sparams.out.rtu_sent;
+	stats->rej_sent = sparams.out.rej_sent;
+	stats->dreq_sent = sparams.out.dreq_sent;
+	stats->drep_sent = sparams.out.drep_sent;
+	stats->wait_time = sparams.out.wait_time;
+	stats->resolve_time = sparams.out.resolve_time;
+	stats->connect_time = sparams.out.connect_time;
+	stats->connected_time = sparams.out.connected_time;
+	stats->resolve = sparams.out.resolve;
+	stats->resolve_fail = sparams.out.resolve_fail;
+	stats->conn_recovery = sparams.out.conn_recovery;
+ stats->rewait_time = sparams.out.rewait_time; + stats->reresolve_time = sparams.out.reresolve_time; + stats->reconnect_time = sparams.out.reconnect_time; + stats->max_rewait_time = sparams.out.max_rewait_time; + stats->max_reresolve_time = sparams.out.max_reresolve_time; + stats->max_reconnect_time = sparams.out.max_reconnect_time; + stats->reresolve = sparams.out.reresolve; + stats->reresolve_fail = sparams.out.reresolve_fail; + + stats->post_write = sparams.out.post_write; + stats->post_write_fail = sparams.out.post_write_fail; + stats->post_write_bytes = sparams.out.post_write_bytes; + stats->outstand_send_write = sparams.out.outstand_send_write; + stats->send_write_cqe = sparams.out.send_write_cqe; + stats->send_write_cqe_fail = sparams.out.send_write_cqe_fail; + + stats->recv_write_cqe = sparams.out.recv_write_cqe; + stats->recv_write_bytes = sparams.out.recv_write_bytes; + stats->recv_cqe_fail = sparams.out.recv_cqe_fail; + + stats->post_hb = sparams.out.post_hb; + stats->post_hb_fail = sparams.out.post_hb_fail; + stats->send_hb_cqe = sparams.out.send_hb_cqe; + stats->send_hb_cqe_fail = sparams.out.send_hb_cqe_fail; + stats->recv_hb_cqe = sparams.out.recv_hb_cqe; + return 0; +fail: + errno = save_errno; + return -1; +} + +int __psm2_rv_get_event_stats(psm2_rv_t rv, struct psm2_rv_event_stats *stats) +{ + struct rv_event_stats_params_out sparams; + int ret; + int save_errno; + + memset(&sparams, 0, sizeof(sparams)); + if ((ret = ioctl(rv->fd, RV_IOCTL_GET_EVENT_STATS, &sparams)) != 0) { + save_errno = errno; + printf("get_event_stats failed ret:%d: %s\n", ret, strerror(errno)); + goto fail; + } + stats->send_write_cqe = sparams.send_write_cqe; + stats->send_write_cqe_fail = sparams.send_write_cqe_fail; + stats->send_write_bytes = sparams.send_write_bytes; + + stats->recv_write_cqe = sparams.recv_write_cqe; + stats->recv_write_cqe_fail = sparams.recv_write_cqe_fail; + stats->recv_write_bytes = sparams.recv_write_bytes; + return 0; +fail: + errno = save_errno; + return -1; +} + +int __psm2_rv_disconnect(psm2_rv_conn_t conn) +{ + if (! conn) { + errno = EINVAL; + return -1; + } + // reduce reference count on kernel connection. + // When reference count hits 0, kernel can start IB CM disconnection + // said disconnect process may continue on past when the processes exit + // TBD - if PSM should wait for disconnect to finish, especially after + // find disconnect is called. - assume NO + // start disconnection + // return 0 on success + // return -1 and errno on error + // once disconnected an event will occur with id from original conn req + return 0; +} + +void __psm2_rv_destroy_conn(psm2_rv_conn_t conn) +{ + if (! conn) { + // TBD - could have errno and return code here? + return; + } + //psm2_rv_t rv = conn->rv; + //TBD - tell kernel, it will cleanup and start disconnect if not alraedy + // started + //TBD - cleanup conn resources + + my_free(conn); +} + +psm2_rv_mr_t __psm2_rv_reg_mem(psm2_rv_t rv, int cmd_fd_int, struct ibv_pd *pd, + void *addr, uint64_t length, int access) +{ + psm2_rv_mr_t mr = NULL; + struct rv_mem_params mparams; + struct irdma_mem_reg_req req; + int save_errno; + + if (!rv || (!pd && !(access & IBV_ACCESS_KERNEL))) { + save_errno = EINVAL; + goto fail; + } + + mr = (psm2_rv_mr_t)my_calloc(1, sizeof(struct psm2_rv_mr)); + if (! 
mr) {
+		save_errno = ENOMEM;
+		goto fail;
+	}
+
+	//printf("XXXX 0x%lx %s\n", pthread_self(), __FUNCTION__);
+	memset(&mparams, 0, sizeof(mparams));
+	if (pd)
+		mparams.in.ibv_pd_handle = pd->handle;
+	mparams.in.cmd_fd_int = cmd_fd_int;
+	mparams.in.addr = (uint64_t)addr;
+	mparams.in.length = length;
+	mparams.in.access = access;
+	memset(&req, 0, sizeof(req));
+	// driver specific data type
+	req.reg_type = IW_MEMREG_TYPE_MEM;
+	mparams.in.ulen = sizeof(req);
+	mparams.in.udata = &req;
+	if (ioctl(rv->fd, RV_IOCTL_REG_MEM, &mparams)) {
+		save_errno = errno;
+		goto fail;
+	}
+	mr->addr = (uint64_t)addr;
+	mr->length = length;
+	mr->access = access;
+	mr->handle = mparams.out.mr_handle;
+	mr->iova = mparams.out.iova;
+	mr->lkey = mparams.out.lkey;
+	mr->rkey = mparams.out.rkey;
+	//printf("XXXX 0x%lx %s pdh:0x%x cmd_fd_int:%d addr:0x%p len:%ld acc:0x%x lkey:0x%x rkey:0x%x mr:%d\n",
+	//	pthread_self(), __FUNCTION__, pd->handle, cmd_fd_int, addr, length, access,
+	//	mr->lkey, mr->rkey, mr->handle);
+
+	return mr;
+fail:
+	if (mr)
+		my_free(mr);
+	errno = save_errno;
+	return NULL;
+}
+
+int __psm2_rv_dereg_mem(psm2_rv_t rv, psm2_rv_mr_t mr)
+{
+	struct rv_dereg_params_in dparams;
+	int ret;
+
+	if (! rv || ! mr) {
+		errno = EINVAL;
+		return -1;
+	}
+	//printf("XXXX 0x%lx %s mr:%d\n", pthread_self(), __FUNCTION__, mr->handle);
+	dparams.mr_handle = mr->handle;
+	dparams.addr = mr->addr;
+	dparams.length = mr->length;
+	dparams.access = mr->access;
+	if ((ret = ioctl(rv->fd, RV_IOCTL_DEREG_MEM, &dparams)) != 0)
+		return ret;
+	my_free(mr);
+	return 0;
+}
+
+#ifdef PSM_CUDA
+
+void * __psm2_rv_pin_and_mmap(psm2_rv_t rv, uintptr_t pageaddr, uint32_t pagelen)
+{
+	struct rv_gpu_mem_params params;
+	int ret;
+	int save_errno;
+
+	memset(&params, 0, sizeof(params));
+	/* XXX: Add the version field once it is restored */
+	params.in.gpu_buf_addr = pageaddr;
+	params.in.gpu_buf_size = pagelen;
+
+	if ((ret = ioctl(rv->fd, RV_IOCTL_GPU_PIN_MMAP, &params)) != 0) {
+		save_errno = errno;
+		perror("gpu_pin_mmap failed\n");
+		errno = save_errno;
+		return NULL;
+	}
+
+	// return mapped host address or NULL with errno set
+	return (void*)(uintptr_t)params.out.host_buf_addr;
+}
+
+int __psm2_rv_munmap_and_unpin(psm2_rv_t rv, const void *buf, uint32_t size)
+{
+	struct rv_gpu_mem_params params;
+	int ret;
+	int save_errno;
+
+	memset(&params, 0, sizeof(params));
+	/* XXX: Add the version field once it is restored */
+	params.in.gpu_buf_addr = (uintptr_t)buf;
+	params.in.gpu_buf_size = size;
+
+	// buf is what was returned from a previous call to __psm2_rv_pin_and_mmap
+	// size is app buffer size, not rounded up to page size (could do that in caller if needed)
+	// this should reduce reference count but continue to cache the mmap & pin
+	// pages for future use in a later pin_and_mmap call (or perhaps even a
+	// later reg_mr?). Note we can even keep the pages mmaped still as caller
+	// should not use the pointer again until after a future pin_and_mmap call
+	// return 0 on success or -1 with errno
+
+	if ((ret = ioctl(rv->fd, RV_IOCTL_GPU_MUNMAP_UNPIN, &params)) != 0) {
+		save_errno = errno;
+		perror("gpu_unpin_munmap failed\n");
+		errno = save_errno;
+		return ret;
+	}
+
+	return 0;
+}
+
+#endif /* PSM_CUDA */
+
+int __psm2_rv_post_rdma_write_immed(psm2_rv_t rv, psm2_rv_conn_t conn,
+			void *loc_buf, psm2_rv_mr_t loc_mr,
+			uint64_t rem_buf, uint32_t rkey,
+			uint64_t len, uint32_t immed, uint64_t wr_id,
+			uint8_t *sconn_index, uint32_t *conn_count)
+{
+	struct rv_post_write_params pparams;
+	int ret;
+
+	if (! rv || ! 
conn || ! loc_buf || ! loc_mr || ! rem_buf) { + errno = EINVAL; + return -1; + } + //printf("XXXX 0x%lx %s\n", pthread_self(), __FUNCTION__); + memset(&pparams, 0, sizeof(pparams)); + pparams.in.handle = conn->handle; + pparams.in.loc_addr = (uint64_t)loc_buf; + pparams.in.loc_mr_handle = loc_mr->handle; + pparams.in.loc_mr_addr = loc_mr->addr; + pparams.in.loc_mr_length = loc_mr->length; + pparams.in.loc_mr_access = loc_mr->access; + pparams.in.rem_addr = rem_buf; + pparams.in.rkey = rkey; + pparams.in.length = len; + pparams.in.immed = immed; + pparams.in.wr_id = wr_id; + ret = ioctl(rv->fd, RV_IOCTL_POST_RDMA_WR_IMMED, &pparams); + *sconn_index = pparams.out.sconn_index; + *conn_count = pparams.out.conn_count; + return ret; +} + +// Safely poll an event and consume it. +// returns 0 if CQ empty, 1 if consumed an entry and -1 if error +// given PSM locking model, we don't need to get a lock here, caller will +// already hold progress_lock if needed +int __psm2_rv_poll_cq(psm2_rv_t rv, struct rv_event *ev) +{ + uint32_t next; + // TBD - may want to skip error checks for datapath perf + if (! rv || ! rv->events.hdr) { + errno = EINVAL; + return -1; + } + next = rv->events.hdr->head; + if (next == rv->events.hdr->tail) + return 0; // empty + // make sure read of tail completes before fetch event + { asm volatile("lfence":::"memory"); } + *ev = rv->events.hdr->entries[next++]; + // make sure event fully fetched before advance head + { asm volatile("sfence":::"memory"); } + if (next == rv->events.num) + next = 0; + rv->events.hdr->head = next; + return 1; +} + +// Safely scan CQ for an event without consuming anything. +// returns 1 if matching successful CQ event found +// returns 0 if CQE empty or no matching successful event found +// Only messages on CQ immediately prior to call are scanned, new CQ events +// arriving during or after this function are not scanned +// given PSM locking model, we don't need to get a lock here, caller will +// already hold progress_lock if needed +int __psm2_rv_scan_cq(psm2_rv_t rv, uint8_t event_type, + uint32_t imm_mask, uint32_t imm_value) +{ + uint32_t next; + uint32_t tail; + struct rv_event *ev; + + // TBD - may want to skip error checks for datapath perf + if (! rv || ! rv->events.hdr) { + errno = EINVAL; + return -1; + } + next = rv->events.hdr->head; + tail = rv->events.hdr->tail; + // make sure read of tail completes before read events + { asm volatile("lfence":::"memory"); } + while (next != tail) { + ev = &rv->events.hdr->entries[next++]; + if (ev->event_type == event_type + && ev->wc.status == 0 + && (ev->wc.imm_data & imm_mask) == imm_value) + return 1; // found + if (next == rv->events.num) + next = 0; + } + return 0; // not found +} +#endif // RNDV_MOD_MR diff --git a/prov/psm3/psm3/psm_rndv_mod.h b/prov/psm3/psm3/psm_rndv_mod.h new file mode 100644 index 00000000000..95726bc9e82 --- /dev/null +++ b/prov/psm3/psm3/psm_rndv_mod.h @@ -0,0 +1,198 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ +#if defined(RNDV_MOD_MR) + +#ifndef _PSMI_RNDV_MOD_MR_H +#define _PSMI_RNDV_MOD_MR_H + +#include +//#include +//#include +#include + +struct local_info { + uint32_t mr_cache_size; // in MBs + uint8_t rdma_mode; // RV_RDMA_MODE_* + + // additional information for RV_RDMA_MODE_KERNEL + uint8_t port_num; + uint8_t num_conn; // # QPs between each pair of nodes + uint32_t loc_addr; // our local address. 
(cpu byte order) + // for OPA/IB a 16 bit LID + // for ethernet a 32 bit IPv4 address + uint8_t index_bits; // num high bits of immed data with rv index + uint16_t loc_gid_index; // index for loc_gid + union ibv_gid loc_gid; // our local GID for use in IB CM connections + uint16_t qos_class_sl; // TBD if will use + // indicated in ah_attr when create_conn + uint16_t job_key_len; + uint8_t *job_key; + uint64_t service_id; // optional override to rv kernel param + void *context; + uint32_t cq_entries; // rv event queue for PSM polling + uint32_t q_depth; // depth of QP and CQ per QP + uint32_t reconnect_timeout; // in seconds + uint32_t hb_interval; // in milliseconds + // output from RNDV driver + uint16_t major_rev; // driver ABI rev + uint16_t minor_rev; // driver ABI rev + uint64_t capability; + uint32_t rv_index; // unique within job on given NIC +}; + +struct rv_event_ring { + struct rv_ring_header *hdr; + int len; + uint32_t num; +}; + +struct psm2_rv { + int fd; /* file handle used to issue ioctls to rv driver */ + struct rv_event_ring events; +}; +typedef struct psm2_rv *psm2_rv_t; + +struct psm2_rv_conn { + psm2_rv_t rv; // our parent + uint64_t handle; // rv_user_conn + uint64_t conn_handle; // rv_conn + // ah, path and context are saved only in kernel +}; +typedef struct psm2_rv_conn *psm2_rv_conn_t; + +// for simple sanity check +static inline uint64_t psm2_rv_conn_get_conn_handle(psm2_rv_conn_t conn) +{ + return conn->conn_handle; +} + +struct psm2_rv_mr { + uint64_t addr; + uint64_t length; + int access; + uint64_t handle; + uint64_t iova; + uint32_t lkey; + uint32_t rkey; +}; +typedef struct psm2_rv_mr *psm2_rv_mr_t; + +#define psm2_rv_cache_stats rv_cache_stats_params_out + +#define psm2_rv_conn_stats rv_conn_get_stats_params_out + +#define psm2_rv_event_stats rv_event_stats_params_out + +static inline uint16_t psm2_rv_get_user_major_bldtime_version(void) +{ + return RV_ABI_VER_MAJOR; +} + +static inline uint16_t psm2_rv_get_user_minor_bldtime_version(void) +{ + return RV_ABI_VER_MINOR; +} + +extern psm2_rv_t __psm2_rv_open(const char *devname, struct local_info *loc_info); + +extern int __psm2_rv_close(psm2_rv_t rv); + +extern int __psm2_rv_get_cache_stats(psm2_rv_t rv, + struct psm2_rv_cache_stats *stats); + +extern psm2_rv_conn_t __psm2_rv_create_conn(psm2_rv_t rv, + struct ibv_ah_attr *ah_attr, // for remote node + uint32_t rem_addr); // for simple compare to loc_addr + +extern int __psm2_rv_connect(psm2_rv_conn_t conn, + const struct ib_user_path_rec *path); + +extern int __psm2_rv_connected(psm2_rv_conn_t conn); + +extern int __psm2_rv_get_conn_count(psm2_rv_t rv, psm2_rv_conn_t conn, + uint8_t index, uint32_t *count); + +extern int __psm2_rv_get_conn_stats(psm2_rv_t rv, psm2_rv_conn_t conn, + uint8_t index, struct psm2_rv_conn_stats *stats); + +extern int __psm2_rv_get_event_stats(psm2_rv_t rv, + struct psm2_rv_event_stats *stats); + +extern int __psm2_rv_disconnect(psm2_rv_conn_t conn); + +extern void __psm2_rv_destroy_conn(psm2_rv_conn_t conn); + +extern psm2_rv_mr_t __psm2_rv_reg_mem(psm2_rv_t rv, int cmd_fd, struct ibv_pd *pd, void *addr, + uint64_t length, int access); + +extern int __psm2_rv_dereg_mem(psm2_rv_t rv, psm2_rv_mr_t mr); + +extern void * __psm2_rv_pin_and_mmap(psm2_rv_t rv, uintptr_t pageaddr, uint32_t pagelen); + +extern int __psm2_rv_munmap_and_unpin(psm2_rv_t rv, const void *buf, uint32_t size); + +extern int __psm2_rv_post_rdma_write_immed(psm2_rv_t rv, psm2_rv_conn_t conn, + void *loc_buf, psm2_rv_mr_t loc_mr, + uint64_t rem_buf, uint32_t rkey, 
+ size_t len, uint32_t immed, uint64_t wr_id, + uint8_t *sconn_index, uint32_t *conn_count); + +extern int __psm2_rv_poll_cq(psm2_rv_t rv, struct rv_event *ev); + +extern int __psm2_rv_scan_cq(psm2_rv_t rv, uint8_t event_type, + uint32_t imm_mask, uint32_t imm_value); + +#endif // _PSMI_RNDV_MOD_MR_H +#endif // defined(RNDV_MOD_MR) diff --git a/prov/psm3/psm3/psm_stats.c b/prov/psm3/psm3/psm_stats.c new file mode 100644 index 00000000000..f9883f60522 --- /dev/null +++ b/prov/psm3/psm3/psm_stats.c @@ -0,0 +1,771 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +struct psmi_stats_type { + STAILQ_ENTRY(psmi_stats_type) next; + struct psmi_stats_entry *entries; + + int num_entries; + const char *heading; + uint32_t statstype; + uint64_t id; // identifier to include in output, typically epid + void *context; +}; + +static STAILQ_HEAD(, psmi_stats_type) psmi_stats = +STAILQ_HEAD_INITIALIZER(psmi_stats); + +pthread_spinlock_t psmi_stats_lock; // protects psmi_stats list +// stats output +static int print_statsmask; +static time_t stats_start; +static char perf_file_name[PATH_MAX]; +static FILE *perf_stats_fd; +// stats thread +static int print_stats_freq; +static int print_stats_running; +static pthread_t perf_print_thread; + +// we attempt open only once and only output error once +// this prevents multiple failures and also prevents reopen during finalize +static void psmi_open_stats_fd() +{ + static int attempted_open; + + if (! attempted_open && ! perf_stats_fd) { + perf_stats_fd = fopen(perf_file_name, "w+"); + if (!perf_stats_fd) + _HFI_ERROR("Failed to create fd for performance logging\n"); + attempted_open = 1; + } +} + +// caller must get psmi_stats_lock +static psm2_error_t +psmi_stats_deregister_type_internal(uint32_t statstype, + void *context) +{ + struct psmi_stats_type *type; + + STAILQ_FOREACH(type, &psmi_stats, next) { + if (type->statstype == statstype && type->context == context) { + STAILQ_REMOVE(&psmi_stats, type, psmi_stats_type, next); + psmi_free(type->entries); + psmi_free(type); + return PSM2_OK; + } + } + return PSM2_INTERNAL_ERR; // not found +} + +static psm2_error_t +psmi_stats_register_type_internal(const char *heading, + uint32_t statstype, + const struct psmi_stats_entry *entries_i, + int num_entries, uint64_t id, void *context, + bool rereg) +{ + struct psmi_stats_entry *entries; + struct psmi_stats_type *type; + int i; + psm2_error_t err = PSM2_OK; + + if (! heading || ! context || ! statstype || ! num_entries || ! 
entries_i)
+		return PSM2_PARAM_ERR;
+
+	entries =
+		psmi_calloc(PSMI_EP_NONE, STATS, num_entries,
+			sizeof(struct psmi_stats_entry));
+	type =
+		psmi_calloc(PSMI_EP_NONE, STATS, 1, sizeof(struct psmi_stats_type));
+	PSMI_CHECKMEM(err, entries);
+	PSMI_CHECKMEM(err, type);
+
+	type->entries = entries;
+	type->num_entries = num_entries;
+	type->statstype = statstype;
+	type->id = id;
+	type->context = context;
+	type->heading = heading;
+
+	for (i = 0; i < num_entries; i++) {
+		type->entries[i].desc = entries_i[i].desc;
+		type->entries[i].flags = entries_i[i].flags;
+		type->entries[i].getfn = entries_i[i].getfn;
+		type->entries[i].u.val = entries_i[i].u.val;
+	}
+
+	pthread_spin_lock(&psmi_stats_lock);
+	if (rereg)
+		(void) psmi_stats_deregister_type_internal(statstype, context);
+	STAILQ_INSERT_TAIL(&psmi_stats, type, next);
+	pthread_spin_unlock(&psmi_stats_lock);
+	return err;
+
+fail:
+	if (entries)
+		psmi_free(entries);
+	if (type)
+		psmi_free(type);
+	return err;
+}
+
+psm2_error_t
+psmi_stats_register_type(const char *heading,
+			uint32_t statstype,
+			const struct psmi_stats_entry *entries_i,
+			int num_entries, uint64_t id, void *context)
+{
+	return psmi_stats_register_type_internal(heading, statstype, entries_i,
+			num_entries, id, context, 0);
+}
+
+psm2_error_t
+psmi_stats_reregister_type(const char *heading,
+			uint32_t statstype,
+			const struct psmi_stats_entry *entries_i,
+			int num_entries, uint64_t id, void *context)
+{
+	return psmi_stats_register_type_internal(heading, statstype, entries_i,
+			num_entries, id, context, 1);
+}
+
+void psmi_stats_show(uint32_t statsmask)
+{
+	struct psmi_stats_type *type;
+	time_t now;
+	char buf[100];
+	int first=1;
+
+	pthread_spin_lock(&psmi_stats_lock);
+	psmi_open_stats_fd();
+	if (! perf_stats_fd)
+		goto unlock;
+
+	now = time(NULL);
+
+	if (print_stats_freq > 0)
+		fprintf(perf_stats_fd, "Time Delta %u seconds %s\n",
+			(unsigned)(now - stats_start), ctime_r(&now, buf));
+
+	STAILQ_FOREACH(type, &psmi_stats, next) {
+		int i;
+		struct psmi_stats_entry *entry;
+
+		if (! (type->statstype & statsmask))
+			continue;
+		if (print_stats_freq <= 0 && first) {
+			fprintf(perf_stats_fd, "Time Delta %u seconds %s\n",
+				(unsigned)(now - stats_start), ctime_r(&now, buf));
+			first = 0;
+		}
+		if (type->id)
+			fprintf(perf_stats_fd, " %s id 0x%"PRIx64"\n", type->heading, type->id);
+		else
+			fprintf(perf_stats_fd, " %s\n", type->heading);
+		for (i=0, entry=&type->entries[0]; i < type->num_entries; i++, entry++) {
+			uint64_t value;
+			value = (entry->getfn != NULL)? entry->getfn(type->context)
+				: *entry->u.val;
+			if (value || ! 
(entry->flags & MPSPAWN_STATS_SKIP_IF_ZERO) + || (statsmask & _PSMI_STATSTYPE_SHOWZERO)) + fprintf(perf_stats_fd, " %s %"PRIu64" (%"PRId64")\n", entry->desc, + value, (int64_t)value - (int64_t)entry->old_value); + entry->old_value = value; + } + } + fflush(perf_stats_fd); +unlock: + pthread_spin_unlock(&psmi_stats_lock); +} + +psm2_error_t psmi_stats_deregister_type(uint32_t statstype, void *context) +{ + psm2_error_t err; + + pthread_spin_lock(&psmi_stats_lock); + err = psmi_stats_deregister_type_internal(statstype, context); + pthread_spin_unlock(&psmi_stats_lock); + return err; +} + +psm2_error_t psmi_stats_deregister_all(void) +{ + struct psmi_stats_type *type; + + /* Currently our mpi still reads stats after finalize so this isn't safe + * yet */ + pthread_spin_lock(&psmi_stats_lock); + while ((type = STAILQ_FIRST(&psmi_stats)) != NULL) { + STAILQ_REMOVE_HEAD(&psmi_stats, next); + psmi_free(type->entries); + psmi_free(type); + } + pthread_spin_unlock(&psmi_stats_lock); + + return PSM2_OK; +} + +static +void +*psmi_print_stats_thread(void *unused) +{ + if (print_stats_freq <= 0) + goto end; + + psmi_open_stats_fd(); + if (!perf_stats_fd) + goto end; + + /* Performance stats will be printed every $PSM3_PRINT_STATS seconds */ + do { + psmi_stats_show(print_statsmask); + usleep(MICRO_SEC * print_stats_freq); + } while (print_stats_running); + +end: + pthread_exit(NULL); +} + +static void +psmi_print_stats_init_thread(void) +{ + print_stats_running = 1; + if (pthread_create(&perf_print_thread, NULL, + psmi_print_stats_thread, (void*)NULL)) + { + print_stats_running = 0; + _HFI_ERROR("Failed to create logging thread\n"); + } +} + +psm2_error_t +psmi_stats_initialize(void) +{ + union psmi_envvar_val env_stats; + + psmi_getenv("PSM3_PRINT_STATS", + "Prints performance stats every n seconds to file " + "./psm3-perf-stat-[hostname]-pid-[pid] when set to -1 stats are " + "printed only once on 1st ep close", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 0, &env_stats); + print_stats_freq = env_stats.e_uint; + + psmi_getenv("PSM3_PRINT_STATSMASK", + "Mask of statistic types to print: " + "MQ=1, RCVTHREAD=0x100, IPS=0x200" + ", RDMA=0x400, MRCache=0x800" +#ifdef PSM_DEBUG + ", MEMORY=0x1000" +#endif +#ifdef RNDV_MOD_MR + ", RVEvents=0x2000, RVRDMA=0x4000" +#endif +#ifdef PSM_FI + ", FaultInj=0x8000" +#endif + ". 0x100000 causes zero values to also be shown", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) PSMI_STATSTYPE_ALL, &env_stats); + print_statsmask = env_stats.e_uint; + + pthread_spin_init(&psmi_stats_lock, PTHREAD_PROCESS_PRIVATE); + stats_start = time(NULL); + + snprintf(perf_file_name, sizeof(perf_file_name), + "./psm3-perf-stat-%s-pid-%d", + psmi_gethostname(), getpid()); + + if (print_stats_freq > 0) + psmi_print_stats_init_thread(); + return PSM2_OK; +} + +void +psmi_stats_finalize(void) +{ + if (print_stats_freq == -1) { + psmi_stats_show(print_statsmask); + } else if (print_stats_running) { + print_stats_running = 0; + pthread_join(perf_print_thread, NULL); + } + if (perf_stats_fd) { + fclose(perf_stats_fd); + perf_stats_fd = NULL; + } + psmi_stats_deregister_all(); +} + +// called at start of ep_close so we can output 1 shot as needed while +// most of the interesting stats are available +// we only output if we have done no previous outputs, so +// if there are multiple EPs this only outputs on 1st EP close +void +psmi_stats_ep_close(void) +{ + if (print_stats_freq == -1 && ! 
perf_stats_fd) + psmi_stats_show(print_statsmask); +} + +#if 0 // unused code, specific to QLogic MPI +static uint32_t typestring_to_type(const char *typestr) +{ + if (strncasecmp(typestr, "all", 4) == 0) + return PSMI_STATSTYPE_ALL; + else if (strncasecmp(typestr, "p2p", 4) == 0) + return PSMI_STATSTYPE_P2P; + else if (strncasecmp(typestr, "hfi", 6) == 0) + return PSMI_STATSTYPE_HFI; + else if (strncasecmp(typestr, "ips", 4) == 0) + return PSMI_STATSTYPE_IPSPROTO; + else if ((strncasecmp(typestr, "intr", 5) == 0) || + (strncasecmp(typestr, "thread", 7) == 0) || + (strncasecmp(typestr, "rcvthread", 10) == 0)) + return PSMI_STATSTYPE_RCVTHREAD; + else if ((strncasecmp(typestr, "mq", 3) == 0) || + (strncasecmp(typestr, "mpi", 4) == 0)) + return PSMI_STATSTYPE_MQ; + else if ((strncasecmp(typestr, "tid", 4) == 0) || + (strncasecmp(typestr, "tids", 5) == 0)) + return PSMI_STATSTYPE_TIDS; + else if ((strncasecmp(typestr, "memory", 7) == 0) || + (strncasecmp(typestr, "alloc", 6) == 0) || + (strncasecmp(typestr, "malloc", 7) == 0)) + return PSMI_STATSTYPE_MEMORY; + else + return 0; +} + +static uint32_t stats_parse_enabled_mask(const char *stats_string) +{ + char *b = (char *)stats_string; + char *e = b; + char buf[128]; + + uint32_t stats_enabled_mask = 0; + + while (*e) { + b = e; + while (*e && *e != ',' && *e != '+' && *e != '.' && + *e != '|' && *e != ':') + e++; + if (e > b) { /* something new to parse */ + int len = ((e - b) > (sizeof(buf) - 1)) ? + (sizeof(buf) - 1) : (e - b); + strncpy(buf, b, len); + buf[len] = '\0'; + stats_enabled_mask |= typestring_to_type(buf); + } + if (*e) + e++; /* skip delimiter */ + } + return stats_enabled_mask; +} + +static +void psmi_stats_mpspawn_callback(struct mpspawn_stats_req_args *args) +{ + const struct psmi_stats_entry *entry; + struct psmi_stats_type *type = (struct psmi_stats_type *)args->context; + int i, num = args->num; + uint64_t *stats = args->stats; + uint64_t *c = NULL; + uint64_t *s = NULL; + + psmi_assert(num == type->num_entries); + + if (type->statstype == PSMI_STATSTYPE_MEMORY) { + for (i = 0; i < num; i++) { + entry = &type->entries[i]; + stats[i] = + *(uint64_t *) ((uintptr_t) &psmi_stats_memory + + (uintptr_t) entry->u.off); + } + } else { + for (i = 0; i < num; i++) { + entry = &type->entries[i]; + if (entry->getfn != NULL) + stats[i] = entry->getfn(type->context); + else + stats[i] = *entry->u.val; + } + } + + if (c != NULL) + psmi_free(c); + if (s != NULL) + psmi_free(s); +} + +static +void +stats_register_mpspawn_single(mpspawn_stats_add_fn add_fn, + char *heading, + int num_entries, + struct psmi_stats_entry *entries, + mpspawn_stats_req_fn req_fn, void *context) +{ + int i; + struct mpspawn_stats_add_args mp_add; + + mp_add.version = MPSPAWN_STATS_VERSION; + mp_add.num = num_entries; + mp_add.header = heading; + mp_add.req_fn = req_fn; + mp_add.context = context; + + mp_add.desc = (char **)alloca(sizeof(char *) * num_entries); + + mp_add.flags = (uint16_t *) alloca(sizeof(uint16_t *) * num_entries); + + for (i = 0; i < num_entries; i++) { + mp_add.desc[i] = (char *)entries[i].desc; + mp_add.flags[i] = entries[i].flags; + } + + /* Ignore return code, doesn't matter to *us* if register failed */ + add_fn(&mp_add); + + return; +} + +static void stats_register_mem_stats(psm2_ep_t ep); +static psm2_error_t psmi_stats_epaddr_register(struct mpspawn_stats_init_args + *args); + +/* + * Downcall from QLogic MPI into PSM, so we can register stats + */ +void *psmi_stats_register(struct mpspawn_stats_init_args *args) +{ + struct 
psmi_stats_type *type; + uint32_t statsmask; + + /* + * Args has a version string in it, but we can ignore it since mpspawn + * will decide if it supports *our* version + */ + + /* + * Eventually, parse the stats_types to add various "flavours" of stats + */ + if (args->stats_types == NULL) + return NULL; + + statsmask = stats_parse_enabled_mask(args->stats_types); + + /* MQ (MPI-level) statistics */ + if (statsmask & PSMI_STATSTYPE_MQ) + psmi_mq_stats_register(args->mq, args->add_fn); + + + if (statsmask & PSMI_STATSTYPE_MEMORY) + stats_register_mem_stats(args->mq->ep); + + /* + * At this point all PSM and hfi-level components have registered stats + * with the PSM stats interface. We register with the mpspawn stats + * interface with an upcall in add_fn + */ + STAILQ_FOREACH(type, &psmi_stats, next) { + if (type->statstype & statsmask) + stats_register_mpspawn_single(args->add_fn, + type->heading, + type->num_entries, + type->entries, + psmi_stats_mpspawn_callback, + type); + } + + /* + * Special handling for per-endpoint statistics + * Only MPI knows what the endpoint-addresses are in the running program, + * PSM has no sense of MPI worlds. In stats register, MPI tells PSM how + * many endpoints it anticipates having and PSM simply reserves that amount + * of stats entries X the amount of per-endpoint stats. + */ + if (statsmask & PSMI_STATSTYPE_P2P) + psmi_stats_epaddr_register(args); + + return NULL; +} + +struct stats_epaddr { + psm2_ep_t ep; + mpspawn_map_epaddr_fn epaddr_map_fn; + int num_ep; + int num_ep_stats; +}; + +static +void psmi_stats_epaddr_callback(struct mpspawn_stats_req_args *args) +{ + int i, num, off; + uint64_t *statsp; + struct stats_epaddr *stats_ctx = (struct stats_epaddr *)args->context; + psm2_ep_t ep = stats_ctx->ep; + psm2_epaddr_t epaddr; + + num = stats_ctx->num_ep * stats_ctx->num_ep_stats; + + /* First always NAN the entire stats request */ + for (i = 0; i < num; i++) { + if (args->flags[i] & MPSPAWN_STATS_TYPE_DOUBLE) + args->stats[i] = MPSPAWN_NAN; + else + args->stats[i] = MPSPAWN_NAN_U64; + } + + for (i = 0; i < stats_ctx->num_ep; i++) { + statsp = args->stats + i * stats_ctx->num_ep_stats; + off = 0; + epaddr = stats_ctx->epaddr_map_fn(i); + if (epaddr == NULL) + continue; + + /* Self */ + if (&ep->ptl_self == epaddr->ptlctl) { + if (ep->ptl_self.epaddr_stats_get != NULL) + off += + ep->ptl_self.epaddr_stats_get(epaddr, + statsp + off); + } else { + if (ep->ptl_self.epaddr_stats_num != NULL) + off += ep->ptl_self.epaddr_stats_num(); + } + + /* Shm */ + if (&ep->ptl_amsh == epaddr->ptlctl) { + if (ep->ptl_amsh.epaddr_stats_get != NULL) + off += + ep->ptl_amsh.epaddr_stats_get(epaddr, + statsp + off); + } else { + if (ep->ptl_amsh.epaddr_stats_num != NULL) + off += ep->ptl_amsh.epaddr_stats_num(); + } + + /* ips */ + if (&ep->ptl_ips == epaddr->ptlctl) { + if (ep->ptl_ips.epaddr_stats_get != NULL) + off += + ep->ptl_ips.epaddr_stats_get(epaddr, + statsp + off); + } else { + if (ep->ptl_ips.epaddr_stats_num != NULL) + off += ep->ptl_ips.epaddr_stats_num(); + } + } + return; +} + +static +psm2_error_t +psmi_stats_epaddr_register(struct mpspawn_stats_init_args *args) +{ + int i = 0, j; + int num_ep = args->num_epaddr; + int num_ep_stats = 0; + int nz; + char **desc, **desc_i; + uint16_t *flags, *flags_i; + char *p; + char buf[128]; + psm2_ep_t ep; + struct mpspawn_stats_add_args mp_add; + struct stats_epaddr *stats_ctx; + psm2_error_t err = PSM2_OK; + + if (args->mq == NULL) + return PSM2_OK; + ep = args->mq->ep; + + /* Figure out how many stats there are 
in an endpoint from all devices */ + if (ep->ptl_self.epaddr_stats_num != NULL) + num_ep_stats += ep->ptl_self.epaddr_stats_num(); + if (ep->ptl_amsh.epaddr_stats_num != NULL) + num_ep_stats += ep->ptl_amsh.epaddr_stats_num(); + if (ep->ptl_ips.epaddr_stats_num != NULL) + num_ep_stats += ep->ptl_ips.epaddr_stats_num(); + + /* Allocate desc and flags and let each device initialize their + * descriptions and flags */ + desc = + psmi_malloc(ep, STATS, + sizeof(char *) * num_ep_stats * (num_ep + 1)); + if (desc == NULL) + return PSM2_NO_MEMORY; + flags = + psmi_malloc(ep, STATS, + sizeof(uint16_t) * num_ep_stats * (num_ep + 1)); + if (flags == NULL) { + psmi_free(desc); + return PSM2_NO_MEMORY; + } + + /* Get the descriptions/flags from each device */ + i = 0; + i += ep->ptl_self.epaddr_stats_num != NULL ? + ep->ptl_self.epaddr_stats_init(desc + i, flags + i) : 0; + i += ep->ptl_amsh.epaddr_stats_num != NULL ? + ep->ptl_amsh.epaddr_stats_init(desc + i, flags + i) : 0; + i += ep->ptl_ips.epaddr_stats_num != NULL ? + ep->ptl_ips.epaddr_stats_init(desc + i, flags + i) : 0; + psmi_assert_always(i == num_ep_stats); + + /* + * Clone the descriptions for each endpoint but append "rank %d" to it + * beforehand. + */ + nz = (num_ep < 10 ? 1 : (num_ep < 100 ? 2 : /* cheap log */ + (num_ep < 1000 ? 3 : (num_ep < 1000 ? 4 : + (num_ep < + 10000 ? 5 : 6))))); + + desc_i = desc + num_ep_stats; + flags_i = flags + num_ep_stats; + memset(desc_i, 0, sizeof(char *) * num_ep * num_ep_stats); + + for (i = 0; i < num_ep; i++) { + for (j = 0; j < num_ep_stats; j++) { + snprintf(buf, sizeof(buf) - 1, "<%*d> %s", nz, i, + desc[j]); + buf[sizeof(buf) - 1] = '\0'; + p = psmi_strdup(ep, buf); + if (p == NULL) { + err = PSM2_NO_MEMORY; + goto clean; + } + desc_i[i * num_ep_stats + j] = p; + flags_i[i * num_ep_stats + j] = flags[j]; + } + } + + mp_add.version = MPSPAWN_STATS_VERSION; + mp_add.num = num_ep_stats * num_ep; + mp_add.header = "Endpoint-to-Endpoint Stats (by )"; + mp_add.req_fn = psmi_stats_epaddr_callback; + mp_add.desc = desc_i; + mp_add.flags = flags_i; + stats_ctx = psmi_malloc(ep, STATS, sizeof(struct stats_epaddr)); + if (stats_ctx == NULL) { + err = PSM2_NO_MEMORY; + goto clean; + } + stats_ctx->ep = ep; + stats_ctx->epaddr_map_fn = args->epaddr_map_fn; + stats_ctx->num_ep = num_ep; + stats_ctx->num_ep_stats = num_ep_stats; + mp_add.context = stats_ctx; + + args->add_fn(&mp_add); + +clean: + /* Now we can free all the descriptions */ + for (i = 0; i < num_ep; i++) { + for (j = 0; j < num_ep_stats; j++) + if (desc_i[i * num_ep_stats + j]) + psmi_free(desc_i[i * num_ep_stats + j]); + } + + psmi_free(desc); + psmi_free(flags); + + return err; +} + + + +#undef _SDECL +#define _SDECL(_desc, _param) { \ + .desc = _desc, \ + .flags = MPSPAWN_STATS_REDUCTION_ALL \ + | MPSPAWN_STATS_SKIP_IF_ZERO, \ + .getfn = NULL, \ + .u.off = offsetof(struct psmi_stats_malloc, _param) \ + } + +static +void stats_register_mem_stats(psm2_ep_t ep) +{ + struct psmi_stats_entry entries[] = { + _SDECL("Total_(current)", m_all_total), + _SDECL("Total_(max)", m_all_max), + _SDECL("All_Peers_(current)", m_perpeer_total), + _SDECL("All_Peers_(max)", m_perpeer_max), + _SDECL("Network_Buffers_(current)", m_netbufs_total), + _SDECL("Network_Buffers_(max)", m_netbufs_max), + _SDECL("PSM_desctors_(current)", m_descriptors_total), + _SDECL("PSM_desctors_(max)", m_descriptors_max), + _SDECL("Unexp._buffers_(current)", m_unexpbufs_total), + _SDECL("Unexp._Buffers_(max)", m_unexpbufs_max), +#ifdef RNDV_MOD_MR + _SDECL("Peer_Rndv_(current)", 
m_peerrndv_total), + _SDECL("Peer_Rndv_(max)", m_peerrndv_max), +#endif + _SDECL("Other_(current)", m_undefined_total), + _SDECL("Other_(max)", m_undefined_max), + }; + + psmi_stats_register_type("PSM_memory_allocation_statistics", + PSMI_STATSTYPE_MEMORY, + entries, PSMI_STATS_HOWMANY(entries), ep); +} +#endif // 0 // unused code, specific to QLogic MPI diff --git a/prov/psm3/psm3/psm_stats.h b/prov/psm3/psm3/psm_stats.h new file mode 100644 index 00000000000..516a7111c0d --- /dev/null +++ b/prov/psm3/psm3/psm_stats.h @@ -0,0 +1,158 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _PSMI_IN_USER_H +#error psm_stats.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSM_STATS_H +#define _PSM_STATS_H + +#include "mpspawn_stats.h" + +#define PSMI_STATSTYPE_MQ 0x00001 +#define PSMI_STATSTYPE_RCVTHREAD 0x00100 /* num_wakups, ratio, etc. 
*/ +#define PSMI_STATSTYPE_IPSPROTO 0x00200 /* acks,naks,err_chks */ +#define PSMI_STATSTYPE_TIDS 0x00400 +#if 0 // unused code, specific to QLogic MPI +#define PSMI_STATSTYPE_P2P 0x00800 /* ep-to-ep details */ +#endif +#define PSMI_STATSTYPE_MR_CACHE 0x00800 +#define PSMI_STATSTYPE_MEMORY 0x01000 +#ifdef RNDV_MOD_MR +#define PSMI_STATSTYPE_RV_EVENT 0x02000 /* RV user event */ +#define PSMI_STATSTYPE_RV_RDMA 0x04000 /* RV shared conn RDMA */ +#endif +#define PSMI_STATSTYPE_FAULTINJ 0x08000 /* fault injection - PSM_FI */ +#define PSMI_STATSTYPE_ALL 0xfffff +#define _PSMI_STATSTYPE_SHOWZERO 0x100000 + +#if 0 // unused code, specific to QLogic MPI +#define PSMI_STATSTYPE_HFI (PSMI_STATSTYPE_RCVTHREAD| \ + PSMI_STATSTYPE_IPSPROTO | \ + PSMI_STATSTYPE_MEMORY | \ + PSMI_STATSTYPE_TIDS) +#endif + +/* Used to determine how many stats in static array decl. */ +#define PSMI_STATS_HOWMANY(entries) \ + (sizeof(entries)/sizeof(entries[0])) + +#define PSMI_STATS_DECL(_desc, _flags, _getfn, _val) \ + { .desc = _desc, \ + .flags = _flags, \ + .getfn = _getfn, \ + .u.val = _val, \ + } + +#define PSMI_STATS_DECLU64(_desc, _val) \ + PSMI_STATS_DECL(_desc, \ + MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, \ + NULL, \ + _val) + +#define PSMI_STATS_DECL_FUNC(_desc, _getfn) \ + PSMI_STATS_DECL(_desc, \ + MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, \ + _getfn, \ + NULL) + +struct psmi_stats_entry { + const char *desc; + uint16_t flags; + uint64_t(*getfn) (void *context); /* optional fn ptr to get value */ + union { + uint64_t *val; /* where value is stored if getfn is NULL */ + //uint64_t off; /* or offset if that makes more sense */ + } u; + uint64_t old_value; /* value fetched from previous report */ +}; + +static inline void +psmi_stats_init_u64(struct psmi_stats_entry *e, const char *desc, uint64_t *val) +{ + e->desc = desc; + e->flags = MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO; + e->getfn = NULL; + e->u.val = val; + e->old_value = 0; +} + +/* + * Copy the array of entries and keep track of the context + * statstype and context form a unique key to identify the stats for deregister + */ +psm2_error_t +psmi_stats_register_type(const char *heading, + uint32_t statstype, + const struct psmi_stats_entry *entries, + int num_entries, uint64_t id, void *context); + +/* deregister old copy and register a new one in it's place */ +psm2_error_t +psmi_stats_reregister_type(const char *heading, + uint32_t statstype, + const struct psmi_stats_entry *entries, + int num_entries, uint64_t id, void *context); + +psm2_error_t psmi_stats_deregister_type(uint32_t statstype, void *context); + +psm2_error_t psmi_stats_initialize(void); + +void psmi_stats_finalize(void); + +void psmi_stats_ep_close(void); // let stats react to 1st ep close if desired + +#endif /* PSM_STATS_H */ diff --git a/prov/psm3/psm3/psm_sysbuf.c b/prov/psm3/psm3/psm_sysbuf.c new file mode 100644 index 00000000000..48fc06e503c --- /dev/null +++ b/prov/psm3/psm3/psm_sysbuf.c @@ -0,0 +1,222 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +/* + * + * System buffer (unexpected message) allocator + * + */ + +#define MM_FLAG_NONE 0 +#define MM_FLAG_TRANSIENT 0x1 + +struct psmi_mem_block_ctrl { + union { + psmi_mem_ctrl_t *mem_handler; + struct psmi_mem_block_ctrl *next; + }; +}; + + +/* Per MQ allocators */ +void psmi_mq_sysbuf_init(psm2_mq_t mq) +{ + int i; + uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1}; + uint32_t replenishing_rate[] = {128, 64, 32, 16, 8, 4, 0}; + + if (mq->mem_ctrl_is_init) + return; + mq->mem_ctrl_is_init = 1; + + for (i=0; i < MM_NUM_OF_POOLS; i++) { + mq->handler_index[i].block_size = block_sizes[i]; + mq->handler_index[i].current_available = 0; + mq->handler_index[i].free_list = NULL; + mq->handler_index[i].total_alloc = 0; + mq->handler_index[i].replenishing_rate = replenishing_rate[i]; + + if (block_sizes[i] == -1) { + psmi_assert_always(replenishing_rate[i] == 0); + mq->handler_index[i].flags = MM_FLAG_TRANSIENT; + } + else { + psmi_assert_always(replenishing_rate[i] > 0); + mq->handler_index[i].flags = MM_FLAG_NONE; + } + } + + /* Hit once on each block size so we have a pool that's allocated */ + for (i=0; i < MM_NUM_OF_POOLS; i++) { + void *ptr; + if (block_sizes[i] == -1) + continue; + ptr = psmi_mq_sysbuf_alloc(mq, block_sizes[i]); + psmi_mq_sysbuf_free(mq, ptr); + } +} + +void psmi_mq_sysbuf_fini(psm2_mq_t mq) // free all buffers that is currently not used +{ + struct psmi_mem_block_ctrl *block; + int i; + + if (mq->mem_ctrl_is_init == 0) + return; + + for (i=0; i < MM_NUM_OF_POOLS; i++) { + while ((block = mq->handler_index[i].free_list) != NULL) { + mq->handler_index[i].free_list = block->next; + psmi_free(block); + } + } + mq->mem_ctrl_is_init = 0; +} + +void psmi_mq_sysbuf_getinfo(psm2_mq_t mq, char *buf, size_t len) +{ + snprintf(buf, len-1, "Sysbuf consumption: %"PRIu64" bytes\n", + mq->mem_ctrl_total_bytes); + buf[len-1] = '\0'; + return; +} + +void *psmi_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size) +{ + psmi_mem_ctrl_t *mm_handler = mq->handler_index; + struct psmi_mem_block_ctrl *new_block; + int replenishing; + + /* There is a timing race with ips initialization, fix later. 
+ * * XXX */ + if (!mq->mem_ctrl_is_init) + psmi_mq_sysbuf_init(mq); + + mq->stats.rx_sysbuf_num++; + mq->stats.rx_sysbuf_bytes += alloc_size; + + while (mm_handler->block_size < alloc_size) + mm_handler++; + + replenishing = mm_handler->replenishing_rate; + + if (mm_handler->current_available == 0) { // allocate more buffers + if (mm_handler->flags & MM_FLAG_TRANSIENT) { + uint32_t newsz = alloc_size + sizeof(struct psmi_mem_block_ctrl); + new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); + + if (new_block) { + new_block->mem_handler = mm_handler; + new_block++; + mm_handler->total_alloc++; + mq->mem_ctrl_total_bytes += newsz; + } + return new_block; + } + + do { + uint32_t newsz = mm_handler->block_size + sizeof(struct psmi_mem_block_ctrl); + + new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); + mq->mem_ctrl_total_bytes += newsz; + + if (new_block) { + mm_handler->current_available++; + mm_handler->total_alloc++; + + new_block->next = mm_handler->free_list; + mm_handler->free_list = new_block; + } + + } while (--replenishing && new_block); + } + + if (mm_handler->current_available) { + mm_handler->current_available--; + + new_block = mm_handler->free_list; + mm_handler->free_list = new_block->next; + + new_block->mem_handler = mm_handler; + new_block++; + + return new_block; + } + return NULL; +} + +void psmi_mq_sysbuf_free(psm2_mq_t mq, void * mem_to_free) +{ + struct psmi_mem_block_ctrl * block_to_free; + psmi_mem_ctrl_t *mm_handler; + + psmi_assert_always(mq->mem_ctrl_is_init); + + block_to_free = (struct psmi_mem_block_ctrl *)mem_to_free - 1; + mm_handler = block_to_free->mem_handler; + + if (mm_handler->flags & MM_FLAG_TRANSIENT) { + psmi_free(block_to_free); + } else { + block_to_free->next = mm_handler->free_list; + mm_handler->free_list = block_to_free; + mm_handler->current_available++; + } + + return; +} diff --git a/prov/psm3/psm3/psm_sysbuf.h b/prov/psm3/psm3/psm_sysbuf.h new file mode 100644 index 00000000000..07ab5939104 --- /dev/null +++ b/prov/psm3/psm3/psm_sysbuf.h @@ -0,0 +1,81 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef SYSBUF_INT_H +#define SYSBUF_INT_H + +#include "psm_user.h" + +#define MM_NUM_OF_POOLS 7 + +typedef struct psmi_mem_ctrl { + struct psmi_mem_block_ctrl *free_list; + uint32_t total_alloc; + uint32_t current_available; + uint32_t block_size; + uint32_t flags; + uint32_t replenishing_rate; +} psmi_mem_ctrl_t; + +/* + * MQ unexpected buffer management + */ +void psmi_mq_sysbuf_init(psm2_mq_t mq); +void psmi_mq_sysbuf_fini(psm2_mq_t mq); +void* psmi_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t nbytes); +void psmi_mq_sysbuf_free(psm2_mq_t mq, void *); +void psmi_mq_sysbuf_getinfo(psm2_mq_t mq, char *buf, size_t len); + +#endif /* SYSBUF_INT_H */ diff --git a/prov/psm3/psm3/psm_timer.c b/prov/psm3/psm3/psm_timer.c new file mode 100644 index 00000000000..9a8dddd2889 --- /dev/null +++ b/prov/psm3/psm3/psm_timer.c @@ -0,0 +1,198 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" + +#if PSMI_TIMER_STATS +# define PSMI_TIMER_STATS_ADD_INSERTION(ctrl) ((ctrl)->num_insertions++) +# define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl) ((ctrl)->num_traversals++) +#else +# define PSMI_TIMER_STATS_ADD_INSERTION(ctrl) +# define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl) +#endif + +psm2_error_t psmi_timer_init(struct psmi_timer_ctrl *ctrl) +{ + ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE; + +#if PSMI_TIMER_STATS + ctrl->num_insertions = 0; + ctrl->num_traversals = 0; +#endif + + TAILQ_INIT(&ctrl->timerq); + return PSM2_OK; +} + +psm2_error_t psmi_timer_fini(struct psmi_timer_ctrl *ctrl) +{ +#if PSMI_TIMER_STATS + if (ctrl->num_insertions > 0) { + _HFI_INFO("avg elem traversals/insertion = %3.2f %%\n", + 100.0 * (double)ctrl->num_traversals / + ctrl->num_insertions); + } +#endif + return PSM2_OK; +} + +void +psmi_timer_request_always(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_insert, uint64_t t_cyc_expire) +{ + struct psmi_timer *t_cursor; + + psmi_assert(!(t_insert->flags & PSMI_TIMER_FLAG_PENDING)); + + t_insert->t_timeout = t_cyc_expire; + t_insert->flags |= PSMI_TIMER_FLAG_PENDING; + + /* + * We keep the list from oldest (head) to newest (tail), with the + * assumption that insert and remove occur much more often than search + * (when the timer expires). Newly added timers are more likely to expire + * later rather than sooner, which is why the head is older. 
+ */ + PSMI_TIMER_STATS_ADD_INSERTION(ctrl); + + if (TAILQ_EMPTY(&ctrl->timerq)) { /* Common case */ + TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer); + ctrl->t_cyc_next_expire = t_cyc_expire; + PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl); + return; + } else if (t_cyc_expire > PSMI_TIMER_PRIO_LAST) { + TAILQ_FOREACH(t_cursor, &ctrl->timerq, timer) { + if (t_cursor->t_timeout <= t_cyc_expire) { + TAILQ_INSERT_BEFORE(t_cursor, t_insert, timer); + return; + } + PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl); + } + /* Got to the end of the list -- We're the next to expire */ + ctrl->t_cyc_next_expire = t_cyc_expire; + TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer); + return; + } else { + TAILQ_FOREACH_REVERSE(t_cursor, &ctrl->timerq, timerq, timer) { + if (t_cursor->t_timeout >= t_cyc_expire) { + TAILQ_INSERT_AFTER(&ctrl->timerq, t_cursor, + t_insert, timer); + ctrl->t_cyc_next_expire = + min(t_cyc_expire, ctrl->t_cyc_next_expire); + return; + } + PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl); + } + TAILQ_INSERT_HEAD(&ctrl->timerq, t_insert, timer); + /* No need to check if we inserted last, given first branch case */ + /* if (TAILQ_LAST(&ctrl->timerq, timerq) == t_insert) */ + /* ctrl->t_cyc_next_expire = t_cyc_expire; */ + return; + } + + return; +} + +psm2_error_t +psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl, uint64_t t_cyc_expire) +{ + psm2_error_t err = PSM2_OK_NO_PROGRESS; + struct psmi_timer *t_cursor = TAILQ_LAST(&ctrl->timerq, timerq); + + PSM2_LOG_MSG("entering"); + + while (t_cursor) { + if (t_cursor->t_timeout > t_cyc_expire) + break; + + err = PSM2_OK; + psmi_assert(t_cursor->flags & PSMI_TIMER_FLAG_PENDING); + t_cursor->flags &= ~PSMI_TIMER_FLAG_PENDING; + TAILQ_REMOVE(&ctrl->timerq, t_cursor, timer); + t_cursor->expire_callback(t_cursor, t_cyc_expire); + t_cursor = TAILQ_PREV(t_cursor, timerq, timer); + } + + if (TAILQ_EMPTY(&ctrl->timerq)) + ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE; + else + ctrl->t_cyc_next_expire = + TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout; + + PSM2_LOG_MSG("leaving"); + return err; +} + +void +psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_remove) +{ + + psmi_assert(t_remove->flags & PSMI_TIMER_FLAG_PENDING); + + t_remove->flags &= ~PSMI_TIMER_FLAG_PENDING; + TAILQ_REMOVE(&ctrl->timerq, t_remove, timer); + + /* + * If we're removing the last entry, we need to reset the + * expiration cycle time. + */ + if (TAILQ_EMPTY(&ctrl->timerq)) + ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE; + else + ctrl->t_cyc_next_expire = + TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout; + return; +} diff --git a/prov/psm3/psm3/psm_timer.h b/prov/psm3/psm3/psm_timer.h new file mode 100644 index 00000000000..8c03d187295 --- /dev/null +++ b/prov/psm3/psm3/psm_timer.h @@ -0,0 +1,160 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _PSMI_IN_USER_H +#error psm_timer.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_TIMER_H +#define _PSMI_TIMER_H + + +typedef struct psmi_timer psmi_timer; +typedef psm2_error_t(*psmi_timer_expire_callback_t) (struct psmi_timer *, + uint64_t); + +struct psmi_timer { + TAILQ_ENTRY(psmi_timer) timer; /* opaque */ + uint64_t t_timeout; /* opaque */ + uint8_t flags; /* opaque */ + + psmi_timer_expire_callback_t expire_callback; /* user -- callback fn */ + void *context; /* user -- callback param */ +}; + +struct psmi_timer_ctrl { + uint64_t t_cyc_next_expire; + TAILQ_HEAD(timerq, psmi_timer) timerq; + +#if PSMI_TIMER_STATS + uint64_t num_insertions; + uint64_t num_traversals; +#endif +}; + +/* + * Some events need to be unconditionally enqueued at the beginning of the + * timerq -- they are not timers meant to expire but merely operations that + * need to be delayed. For delayed operations, there are 5 levels of + * priority. + */ +#define PSMI_TIMER_PRIO_0 0ULL +#define PSMI_TIMER_PRIO_1 1ULL +#define PSMI_TIMER_PRIO_2 2ULL +#define PSMI_TIMER_PRIO_3 3ULL +#define PSMI_TIMER_PRIO_4 4ULL +#define PSMI_TIMER_PRIO_LAST PSMI_TIMER_PRIO_4 + +#define PSMI_TIMER_INFINITE 0xFFFFFFFFFFFFFFFFULL +#define PSMI_TIMER_FLAG_PENDING 0x01 + +/* + * Timer control initialization and finalization + */ +psm2_error_t psmi_timer_init(struct psmi_timer_ctrl *ctrl); +psm2_error_t psmi_timer_fini(struct psmi_timer_ctrl *ctrl); + +/* + * Timer entry initialization (a timer must be initialized before it can be + * added to the timer request queue). 
+ */ + +PSMI_ALWAYS_INLINE( +void +psmi_timer_entry_init(struct psmi_timer *t_init, + psmi_timer_expire_callback_t expire_fn, + void *context)) +{ + t_init->flags = 0; + t_init->expire_callback = expire_fn; + t_init->context = context; + return; +} + +/* + * Timer requests, conditional (macro) or unconditional + */ +#define psmi_timer_request(ctrl, t_insert, t_cyc) \ + if (!((t_insert)->flags & PSMI_TIMER_FLAG_PENDING)) \ + psmi_timer_request_always((ctrl), (t_insert), (t_cyc)) + +void psmi_timer_request_always(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_insert, + uint64_t t_cyc_expire); + +/* + * Timer cancelations, conditional (macro) only (cancel_inner is internal) + */ +#define psmi_timer_cancel(ctrl, t_remove) \ + if ((t_remove)->flags & PSMI_TIMER_FLAG_PENDING) \ + psmi_timer_cancel_inner(ctrl, t_remove) +void psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_remove); + +/* + * Timer processing, conditional or unconditional. + */ +#define psmi_timer_process_if_expired(ctrl, t_cyc_expire) \ + (((ctrl)->t_cyc_next_expire <= (t_cyc_expire)) ? \ + psmi_timer_process_expired(ctrl, t_cyc_expire) : \ + PSM2_OK_NO_PROGRESS) + +#define psmi_timer_is_expired(ctrl, t_cyc_expire) \ + ((ctrl)->t_cyc_next_expire <= (t_cyc_expire)) + +psm2_error_t psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl, + uint64_t t_cyc_expire); + +#endif /* _PSMI_TIMER_H */ diff --git a/prov/psm3/psm3/psm_udp_ep.c b/prov/psm3/psm3/psm_udp_ep.c new file mode 100644 index 00000000000..bd99a7260ff --- /dev/null +++ b/prov/psm3/psm3/psm_udp_ep.c @@ -0,0 +1,54 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ diff --git a/prov/psm3/psm3/psm_udp_ep.h b/prov/psm3/psm3/psm_udp_ep.h new file mode 100644 index 00000000000..7a02ee9e6e4 --- /dev/null +++ b/prov/psm3/psm3/psm_udp_ep.h @@ -0,0 +1,56 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + + diff --git a/prov/psm3/psm3/psm_user.h b/prov/psm3/psm3/psm_user.h new file mode 100644 index 00000000000..a0d9e5a8e49 --- /dev/null +++ b/prov/psm3/psm3/psm_user.h @@ -0,0 +1,551 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +#ifndef _PSMI_USER_H +#define _PSMI_USER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "psm_config.h" +#include +#include + +#include +#include +#include +#include +#include + +#include "psm2.h" +#include "psm2_mq.h" + +#include "ptl.h" + +#include "opa_user.h" +#include "opa_queue.h" + +#include "psm_log.h" +#include "psm_perf.h" + +#define PSMI_LOCK_NO_OWNER ((pthread_t)(-1)) + +#define _PSMI_IN_USER_H + +/* Opaque hw context pointer used in HAL, + and defined by each HAL instance. 
*/ +typedef void *psmi_hal_hw_context; + +#include "psm_help.h" +#include "psm_error.h" +#include "psm_context.h" +#include "psm_utils.h" +#include "psm_timer.h" +#include "psm_mpool.h" +#include "psm_ep.h" +#include "psm_lock.h" +#include "psm_stats.h" +#include "psm2_mock_testing.h" + +#undef _PSMI_IN_USER_H + +#define PSMI_VERNO_MAKE(major, minor) ((((major)&0xff)<<8)|((minor)&0xff)) +#define PSMI_VERNO PSMI_VERNO_MAKE(PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR) +#define PSMI_VERNO_GET_MAJOR(verno) (((verno)>>8) & 0xff) +#define PSMI_VERNO_GET_MINOR(verno) (((verno)>>0) & 0xff) + +int psmi_verno_client(); +int psmi_verno_isinteroperable(uint16_t verno); +int MOCKABLE(psmi_isinitialized)(); +MOCK_DCL_EPILOGUE(psmi_isinitialized); + +psm2_error_t psmi_poll_internal(psm2_ep_t ep, int poll_amsh); +psm2_error_t psmi_mq_wait_internal(psm2_mq_req_t *ireq); + +int psmi_get_current_proc_location(); + +extern int psmi_epid_ver; +extern int psmi_allow_routers; +extern uint32_t non_dw_mul_sdma; +extern psmi_lock_t psmi_creation_lock; +extern psm2_ep_t psmi_opened_endpoint; + +extern int psmi_affinity_shared_file_opened; +extern uint64_t *shared_affinity_ptr; +extern char *affinity_shm_name; + +extern sem_t *sem_affinity_shm_rw; +extern int psmi_affinity_semaphore_open; +extern char *sem_affinity_shm_rw_name; + +PSMI_ALWAYS_INLINE( +int +_psmi_get_epid_version()) { + return psmi_epid_ver; +} + +#define PSMI_EPID_VERSION_SHM 0 +#define PSMI_EPID_SHM_ONLY 1 +#define PSMI_EPID_IPS_SHM 0 +#define PSMI_EPID_VERSION _psmi_get_epid_version() +#define PSMI_MAX_EPID_VERNO_SUPPORTED 4 +#define PSMI_MIN_EPID_VERNO_SUPPORTED 3 +#define PSMI_EPID_VERNO_DEFAULT 3 // allows 3 or 4 based on NIC +#define PSMI_EPID_V3 3 // IB UD +#define PSMI_EPID_V4 4 // Eth UD + +#define PSMI_EPID_GET_LID(epid) ((PSMI_EPID_GET_EPID_VERSION(epid) == PSMI_EPID_V3) ? \ + (int)PSMI_EPID_GET_LID_V3(epid) \ + : (int)PSMI_EPID_GET_LID_V4(epid)) +// for V3 we use low 16 and next 16 should be zero +// for V4 we have network in low 32 bits +#define PSMI_GET_SUBNET_ID(gid_hi) (gid_hi & 0xffffffff) + + +/* + * Following is the definition of various lock implementations. 
The choice is + * made by defining specific lock type in relevant section of psm_config.h + */ +#ifdef PSMI_LOCK_IS_SPINLOCK +#define _PSMI_LOCK_INIT(pl) psmi_spin_init(&((pl).lock)) +#define _PSMI_LOCK_TRY(pl) psmi_spin_trylock(&((pl).lock)) +#define _PSMI_LOCK(pl) psmi_spin_lock(&((pl).lock)) +#define _PSMI_UNLOCK(pl) psmi_spin_unlock(&((pl).lock)) +#define _PSMI_LOCK_ASSERT(pl) +#define _PSMI_UNLOCK_ASSERT(pl) +#define PSMI_LOCK_DISABLED 0 + +#elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG) + +PSMI_ALWAYS_INLINE( +int +_psmi_mutex_trylock_inner(pthread_mutex_t *mutex, + const char *curloc, pthread_t *lock_owner)) +{ + psmi_assert_always_loc(*lock_owner != pthread_self(), + curloc); + int ret = pthread_mutex_trylock(mutex); + if (ret == 0) + *lock_owner = pthread_self(); + return ret; +} + +PSMI_ALWAYS_INLINE( +int +_psmi_mutex_lock_inner(pthread_mutex_t *mutex, + const char *curloc, pthread_t *lock_owner)) +{ + psmi_assert_always_loc(*lock_owner != pthread_self(), + curloc); + int ret = pthread_mutex_lock(mutex); + psmi_assert_always_loc(ret != EDEADLK, curloc); + *lock_owner = pthread_self(); + return ret; +} + +PSMI_ALWAYS_INLINE( +void +_psmi_mutex_unlock_inner(pthread_mutex_t *mutex, + const char *curloc, pthread_t *lock_owner)) +{ + psmi_assert_always_loc(*lock_owner == pthread_self(), + curloc); + *lock_owner = PSMI_LOCK_NO_OWNER; + psmi_assert_always_loc(pthread_mutex_unlock(mutex) != + EPERM, curloc); + return; +} + +#define _PSMI_LOCK_INIT(pl) /* static initialization */ +#define _PSMI_LOCK_TRY(pl) \ + _psmi_mutex_trylock_inner(&((pl).lock), PSMI_CURLOC, \ + &((pl).lock_owner)) +#define _PSMI_LOCK(pl) \ + _psmi_mutex_lock_inner(&((pl).lock), PSMI_CURLOC, \ + &((pl).lock_owner)) +#define _PSMI_UNLOCK(pl) \ + _psmi_mutex_unlock_inner(&((pl).lock), PSMI_CURLOC, \ + &((pl).lock_owner)) +#define _PSMI_LOCK_ASSERT(pl) \ + psmi_assert_always((pl).lock_owner == pthread_self()); +#define _PSMI_UNLOCK_ASSERT(pl) \ + psmi_assert_always((pl).lock_owner != pthread_self()); +#define PSMI_LOCK_DISABLED 0 + +#elif defined(PSMI_LOCK_IS_MUTEXLOCK) +#define _PSMI_LOCK_INIT(pl) /* static initialization */ +#define _PSMI_LOCK_TRY(pl) pthread_mutex_trylock(&((pl).lock)) +#define _PSMI_LOCK(pl) pthread_mutex_lock(&((pl).lock)) +#define _PSMI_UNLOCK(pl) pthread_mutex_unlock(&((pl).lock)) +#define PSMI_LOCK_DISABLED 0 +#define _PSMI_LOCK_ASSERT(pl) +#define _PSMI_UNLOCK_ASSERT(pl) + +#elif defined(PSMI_PLOCK_IS_NOLOCK) +#define _PSMI_LOCK_TRY(pl) 0 /* 0 *only* so progress thread never succeeds */ +#define _PSMI_LOCK(pl) +#define _PSMI_UNLOCK(pl) +#define PSMI_LOCK_DISABLED 1 +#define _PSMI_LOCK_ASSERT(pl) +#define _PSMI_UNLOCK_ASSERT(pl) +#else +#error No LOCK lock type declared +#endif + +#define PSMI_YIELD(pl) \ + do { _PSMI_UNLOCK((pl)); sched_yield(); _PSMI_LOCK((pl)); } while (0) + +#ifdef PSM2_MOCK_TESTING +/* If this is a mocking tests build, all the operations on the locks + * are routed through functions which may be mocked, if necessary. 
*/ +void MOCKABLE(psmi_mockable_lock_init)(psmi_lock_t *pl); +MOCK_DCL_EPILOGUE(psmi_mockable_lock_init); + +int MOCKABLE(psmi_mockable_lock_try)(psmi_lock_t *pl); +MOCK_DCL_EPILOGUE(psmi_mockable_lock_try); + +void MOCKABLE(psmi_mockable_lock)(psmi_lock_t *pl); +MOCK_DCL_EPILOGUE(psmi_mockable_lock); + +void MOCKABLE(psmi_mockable_unlock)(psmi_lock_t *pl); +MOCK_DCL_EPILOGUE(psmi_mockable_unlock); + +void MOCKABLE(psmi_mockable_lock_assert)(psmi_lock_t *pl); +MOCK_DCL_EPILOGUE(psmi_mockable_lock_assert); + +void MOCKABLE(psmi_mockable_unlock_assert)(psmi_lock_t *pl); +MOCK_DCL_EPILOGUE(psmi_mockable_unlock_assert); + +#define PSMI_LOCK_INIT(pl) psmi_mockable_lock_init(&(pl)) +#define PSMI_LOCK_TRY(pl) psmi_mockable_lock_try(&(pl)) +#define PSMI_LOCK(pl) psmi_mockable_lock(&(pl)) +#define PSMI_UNLOCK(pl) psmi_mockable_unlock(&(pl)) +#define PSMI_LOCK_ASSERT(pl) psmi_mockable_lock_assert(&(pl)) +#define PSMI_UNLOCK_ASSERT(pl) psmi_mockable_unlock_assert(&(pl)) +#else +#define PSMI_LOCK_INIT(pl) _PSMI_LOCK_INIT(pl) +#define PSMI_LOCK_TRY(pl) _PSMI_LOCK_TRY(pl) +#define PSMI_LOCK(pl) _PSMI_LOCK(pl) +#define PSMI_UNLOCK(pl) _PSMI_UNLOCK(pl) +#define PSMI_LOCK_ASSERT(pl) _PSMI_LOCK_ASSERT(pl) +#define PSMI_UNLOCK_ASSERT(pl) _PSMI_UNLOCK_ASSERT(pl) +#endif + +#ifdef PSM_PROFILE +void psmi_profile_block() __attribute__ ((weak)); +void psmi_profile_unblock() __attribute__ ((weak)); +void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak)); + +#define PSMI_PROFILE_BLOCK() psmi_profile_block() +#define PSMI_PROFILE_UNBLOCK() psmi_profile_unblock() +#define PSMI_PROFILE_REBLOCK(noprog) psmi_profile_reblock(noprog) +#else +#define PSMI_PROFILE_BLOCK() +#define PSMI_PROFILE_UNBLOCK() +#define PSMI_PROFILE_REBLOCK(noprog) +#endif + +#ifdef PSM_CUDA + +#ifndef PSM_CUDA_MOCK +#include +#include + +#if CUDA_VERSION < 7000 +#error Please update CUDA driver, required minimum version is 7.0 +#endif +#else +// included in stand-alone unit test that does not use real CUDA functions +#include "psmi_cuda_mock.h" +#endif /* PSM_CUDA_MOCK */ + +extern int is_cuda_enabled; +extern int is_gdr_copy_enabled; +extern int device_support_gpudirect; +extern int gpu_p2p_supported; +extern int my_gpu_device; +extern int cuda_lib_version; + +extern CUcontext ctxt; +extern void *psmi_cuda_lib; + +extern CUresult (*psmi_cuInit)(unsigned int Flags ); +extern CUresult (*psmi_cuCtxDetach)(CUcontext c); +extern CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c); +extern CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); +extern CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); +extern CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); +extern CUresult (*psmi_cuDeviceCanAccessPeer)(int *canAccessPeer, CUdevice dev, CUdevice peerDev); +extern CUresult (*psmi_cuDeviceGet)(CUdevice* device, int ordinal); +extern CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); +extern CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); +extern CUresult (*psmi_cuDeviceGetCount)(int* count); +extern CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); +extern CUresult (*psmi_cuStreamDestroy)(CUstream phStream); +extern CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); +extern CUresult (*psmi_cuEventDestroy)(CUevent hEvent); +extern CUresult (*psmi_cuEventQuery)(CUevent hEvent); +extern CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); +extern CUresult 
(*psmi_cuEventSynchronize)(CUevent hEvent); +extern CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); +extern CUresult (*psmi_cuMemFreeHost)(void* p); +extern CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); +extern CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); +extern CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); +extern CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); +extern CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); +extern CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); +extern CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr); +extern CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); +extern CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr); +extern CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); +extern CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active); +extern CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); +extern CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); +extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); + +#define PSMI_CUDA_CALL(func, args...) do { \ + CUresult cudaerr; \ + cudaerr = psmi_##func(args); \ + if (cudaerr != CUDA_SUCCESS) { \ + if (ctxt == NULL) \ + _HFI_ERROR( \ + "Check if CUDA is initialized" \ + "before psm2_ep_open call \n"); \ + _HFI_ERROR( \ + "CUDA failure: %s() (at %s:%d)" \ + "returned %d\n", \ + #func, __FILE__, __LINE__, cudaerr); \ + psmi_handle_error( \ + PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from CUDA function.\n");\ + } \ + } while (0) + +/** + * Similar to PSMI_CUDA_CALL() except does not error out + * if func(args) returns CUDA_SUCCESS or except_err + * + * Invoker must provide 'CUresult cudaerr' in invoked scope + * so invoker can inspect whether cudaerr == CUDA_SUCCESS or + * cudaerr == except_err after expanded code is executed. + * + * As except_err is an allowed value, message is printed at + * DBG level. + */ +#define PSMI_CUDA_CALL_EXCEPT(except_err, func, args...) 
do { \ + cudaerr = psmi_##func(args); \ + if (cudaerr != CUDA_SUCCESS && cudaerr != except_err) { \ + if (ctxt == NULL) \ + _HFI_ERROR( \ + "Check if CUDA is initialized" \ + "before psm2_ep_open call \n"); \ + _HFI_ERROR( \ + "CUDA failure: %s() (at %s:%d)" \ + "returned %d\n", \ + #func, __FILE__, __LINE__, cudaerr); \ + psmi_handle_error( \ + PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from CUDA function.\n");\ + } else if (cudaerr == except_err) { \ + _HFI_DBG( \ + "CUDA non-zero return value: %s() (at %s:%d)" \ + "returned %d\n", \ + #func, __FILE__, __LINE__, cudaerr); \ + } \ + } while (0) + +#define PSMI_CUDA_CHECK_EVENT(event, cudaerr) do { \ + cudaerr = psmi_cuEventQuery(event); \ + if ((cudaerr != CUDA_SUCCESS) && \ + (cudaerr != CUDA_ERROR_NOT_READY)) { \ + _HFI_ERROR( \ + "CUDA failure: %s() returned %d\n", \ + "cuEventQuery", cudaerr); \ + psmi_handle_error( \ + PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from CUDA function.\n");\ + } \ + } while (0) + +#define PSMI_CUDA_DLSYM(psmi_cuda_lib,func) do { \ + psmi_##func = dlsym(psmi_cuda_lib, STRINGIFY(func)); \ + if (!psmi_##func) { \ + psmi_handle_error(PSMI_EP_NORETURN, \ + PSM2_INTERNAL_ERR, \ + " Unable to resolve %s symbol" \ + " in CUDA libraries.\n",STRINGIFY(func));\ + } \ +} while (0) + +PSMI_ALWAYS_INLINE( +int +_psmi_is_cuda_mem(const void *ptr)) +{ + CUresult cres; + CUmemorytype mt; + unsigned uvm = 0; + cres = psmi_cuPointerGetAttribute( + &mt, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr) ptr); + if ((cres == CUDA_SUCCESS) && (mt == CU_MEMORYTYPE_DEVICE)) { + cres = psmi_cuPointerGetAttribute( + &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr); + if ((cres == CUDA_SUCCESS) && (uvm == 0)) + return 1; + else + return 0; + } else + return 0; +} + +#define PSMI_IS_CUDA_ENABLED likely(is_cuda_enabled) +#define PSMI_IS_CUDA_DISABLED unlikely(!is_cuda_enabled) + +PSMI_ALWAYS_INLINE( +int +_psmi_is_gdr_copy_enabled()) +{ + return is_gdr_copy_enabled; +} + +#define PSMI_IS_GDR_COPY_ENABLED _psmi_is_gdr_copy_enabled() + +#define PSMI_IS_CUDA_MEM(p) _psmi_is_cuda_mem(p) + +struct ips_cuda_hostbuf { + STAILQ_ENTRY(ips_cuda_hostbuf) req_next; + STAILQ_ENTRY(ips_cuda_hostbuf) next; + uint32_t size, offset, bytes_read; + /* This flag indicates whether a chb is + * pulled from a mpool or dynamically + * allocated using calloc. */ + uint8_t is_tempbuf; + CUevent copy_status; + psm2_mq_req_t req; + void *host_buf; + CUdeviceptr gpu_buf; +}; + +struct ips_cuda_hostbuf_mpool_cb_context { + unsigned bufsz; +}; +void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj); + +#define CUDA_HOSTBUFFER_LIMITS { \ + .env = "PSM3_CUDA_BOUNCEBUFFERS_MAX", \ + .descr = "Max CUDA bounce buffers (in MB)", \ + .env_level = PSMI_ENVVAR_LEVEL_HIDDEN, \ + .minval = 1, \ + .maxval = 1<<30, \ + .mode[PSMI_MEMMODE_NORMAL] = { 16, 256 }, \ + .mode[PSMI_MEMMODE_MINIMAL] = { 1, 1 }, \ + .mode[PSMI_MEMMODE_LARGE] = { 32, 512 } \ + } + +extern uint32_t gpudirect_send_threshold; +extern uint32_t gpudirect_recv_threshold; +extern uint32_t cuda_thresh_rndv; +/* This threshold dictates when the sender turns off + * GDR Copy. The threshold needs to be less than + * CUDA RNDV threshold. + */ +extern uint32_t gdr_copy_threshold_send; +/* This threshold dictates when the reciever turns off + * GDR Copy. The threshold needs to be less than + * CUDA RNDV threshold. 
+ */ +extern uint32_t gdr_copy_threshold_recv; + +#define PSMI_USE_GDR_COPY(req, len) req->is_buf_gpu_mem && \ + PSMI_IS_GDR_COPY_ENABLED && \ + len >=1 && len <= gdr_copy_threshold_recv + +enum psm2_chb_match_type { + /* Complete data found in a single chb */ + PSMI_CUDA_FULL_MATCH_FOUND = 0, + /* Data is spread across two chb's */ + PSMI_CUDA_SPLIT_MATCH_FOUND = 1, + /* Data is only partially prefetched */ + PSMI_CUDA_PARTIAL_MATCH_FOUND = 2, + PSMI_CUDA_CONTINUE = 3 +}; +typedef enum psm2_chb_match_type psm2_chb_match_type_t; + +/* + * CUDA documentation dictates the use of SYNC_MEMOPS attribute + * when the buffer pointer received into PSM has been allocated + * by the application. This guarantees that all memory operations + * to this region of memory (used by multiple layers of the stack) + * always synchronize. + */ +static inline +void psmi_cuda_set_attr_sync_memops(const void *ubuf) +{ + int true_flag = 1; + + PSMI_CUDA_CALL(cuPointerSetAttribute, &true_flag, + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr) ubuf); +} + +#endif /* PSM_CUDA */ + +#define COMPILE_TIME_ASSERT(NAME,COND) extern char NAME[1/ COND] + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* _PSMI_USER_H */ diff --git a/prov/psm3/psm3/psm_utils.c b/prov/psm3/psm3/psm_utils.c new file mode 100644 index 00000000000..4ccb737f9a4 --- /dev/null +++ b/prov/psm3/psm3/psm_utils.c @@ -0,0 +1,3094 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <netdb.h>		/* gethostbyname */
+#include <malloc.h>		/* malloc_usable_size */
+#include "psm_user.h"
+#include "psm2_hal.h"
+#include "psm_am_internal.h"
+#include "psm_mq_internal.h"
+#include "ips_proto_params.h"
+#include <sys/socket.h>		// for sockaddr
+#include <rdma/rdma_cma.h>	// for AF_IB structures
+#include <arpa/inet.h>
+
+
+int psmi_ep_device_is_enabled(const psm2_ep_t ep, int devid);
+
+struct psmi_epid_table psmi_epid_table;
+
+/* Iterator to access the epid table.
+ * 'ep' can be NULL if remote endpoints from all endpoint handles are requested
+ */
+void psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm2_ep_t ep)
+{
+	itor->i = 0;
+	itor->ep = ep;
+	pthread_mutex_lock(&psmi_epid_table.tablock);
+}
+
+void *psmi_epid_itor_next(struct psmi_eptab_iterator *itor)
+{
+	int i;
+	struct psmi_epid_tabentry *e;
+
+	if (itor->i >= psmi_epid_table.tabsize)
+		return NULL;
+	for (i = itor->i; i < psmi_epid_table.tabsize; i++) {
+		e = &psmi_epid_table.table[i];
+		if (!e->entry || e->entry == EPADDR_DELETED)
+			continue;
+		if (itor->ep && e->ep != itor->ep)
+			continue;
+		itor->i = i + 1;
+		return e->entry;
+	}
+	itor->i = psmi_epid_table.tabsize;	/* put at end of table */
+	return NULL;
+}
+
+void psmi_epid_itor_fini(struct psmi_eptab_iterator *itor)
+{
+	pthread_mutex_unlock(&psmi_epid_table.tablock);
+	itor->i = 0;
+}
+
+#define mix64(a, b, c) \
+{ \
+	a -= b; a -= c; a ^= (c>>43); \
+	b -= c; b -= a; b ^= (a<<9); \
+	c -= a; c -= b; c ^= (b>>8); \
+	a -= b; a -= c; a ^= (c>>38); \
+	b -= c; b -= a; b ^= (a<<23); \
+	c -= a; c -= b; c ^= (b>>5); \
+	a -= b; a -= c; a ^= (c>>35); \
+	b -= c; b -= a; b ^= (a<<49); \
+	c -= a; c -= b; c ^= (b>>11); \
+	a -= b; a -= c; a ^= (c>>12); \
+	b -= c; b -= a; b ^= (a<<18); \
+	c -= a; c -= b; c ^= (b>>22); \
+}
+
+psm2_error_t psmi_epid_init()
+{
+	pthread_mutexattr_t attr;
+	psmi_epid_table.table = NULL, psmi_epid_table.tabsize = 0;
+	psmi_epid_table.tabsize_used = 0;
+	pthread_mutexattr_init(&attr);
+	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+	pthread_mutex_init(&psmi_epid_table.tablock, &attr);
+	pthread_mutexattr_destroy(&attr);
+	return PSM2_OK;
+};
+
+psm2_error_t psmi_epid_fini()
+{
+	if (psmi_epid_table.table != NULL) {
+		psmi_free(psmi_epid_table.table);
+		psmi_epid_table.table = NULL;
+	}
+	psmi_epid_table.tabsize = 0;
+	psmi_epid_table.tabsize_used = 0;
+	return PSM2_OK;
+}
+
+PSMI_ALWAYS_INLINE(
+uint64_t
+hash_this(const psm2_ep_t ep, const psm2_epid_t epid))
+{
+	uint64_t ep_i = (uint64_t) (uintptr_t) ep;
+	uint64_t epid_i = (uint64_t) epid;
+	uint64_t hash = 0x9e3779b97f4a7c13LL;
+	mix64(ep_i, epid_i, hash);
+	return hash;
+}
+
+PSMI_ALWAYS_INLINE(
+void *
+psmi_epid_lookup_inner(psm2_ep_t ep, psm2_epid_t epid, int remove))
+{
+	uint64_t key = hash_this(ep, epid);
+	struct psmi_epid_tabentry *e;
+	void *entry = NULL;
+	int idx;
+
+	pthread_mutex_lock(&psmi_epid_table.tablock);
+	if (!psmi_epid_table.table)
+		goto ret;
+	idx = (int)(key % psmi_epid_table.tabsize);
+	while (psmi_epid_table.table[idx].entry != NULL) {
+		/* An epid can be added
twice if there's more than one opened endpoint, + * but really we match on epid *and* on endpoint */ + e = &psmi_epid_table.table[idx]; + if (e->entry != EPADDR_DELETED && e->key == key) { + entry = e->entry; + if (remove) + psmi_epid_table.table[idx].entry = + EPADDR_DELETED; + goto ret; + } + if (++idx == psmi_epid_table.tabsize) + idx = 0; + } +ret: + pthread_mutex_unlock(&psmi_epid_table.tablock); + return entry; +} + +void *psmi_epid_lookup(psm2_ep_t ep, psm2_epid_t epid) +{ + void *entry = psmi_epid_lookup_inner(ep, epid, 0); + if (PSMI_EP_HOSTNAME != ep) + _HFI_VDBG("lookup of (%p,%" PRIx64 ") returns %p\n", ep, epid, + entry); + return entry; +} + +void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid) +{ + if (PSMI_EP_HOSTNAME != ep) + _HFI_VDBG("remove of (%p,%" PRIx64 ")\n", ep, epid); + return psmi_epid_lookup_inner(ep, epid, 1); +} + +void psmi_epid_remove_all(psm2_ep_t ep) +{ + size_t i; + struct psmi_epid_tabentry *e; + + pthread_mutex_lock(&psmi_epid_table.tablock); + + for (i = 0; i < psmi_epid_table.tabsize; i++) { + e = &psmi_epid_table.table[i]; + + if (e->entry == NULL || e->entry == EPADDR_DELETED) + continue; + + if (e->ep == ep) { + /* unspecified fields implicitly zeroed */ + *e = (struct psmi_epid_tabentry) { + .entry = EPADDR_DELETED + }; + } + } + + pthread_mutex_unlock(&psmi_epid_table.tablock); +} + +psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry) +{ + uint64_t key; + int idx, i, newsz; + struct psmi_epid_tabentry *e; + psm2_error_t err = PSM2_OK; + + if (PSMI_EP_HOSTNAME != ep) + _HFI_VDBG("add of (%p,%" PRIx64 ") with entry %p\n", ep, epid, + entry); + pthread_mutex_lock(&psmi_epid_table.tablock); + /* Leave this here, mostly for sanity and for the fact that the epid + * table is currently not used in the critical path */ + if (++psmi_epid_table.tabsize_used > + (int)(psmi_epid_table.tabsize * PSMI_EPID_TABLOAD_FACTOR)) { + struct psmi_epid_tabentry *newtab; + newsz = psmi_epid_table.tabsize + PSMI_EPID_TABSIZE_CHUNK; + newtab = (struct psmi_epid_tabentry *) + psmi_calloc(ep, PER_PEER_ENDPOINT, + newsz, sizeof(struct psmi_epid_tabentry)); + if (newtab == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + if (psmi_epid_table.table) { /* rehash the table */ + for (i = 0; i < psmi_epid_table.tabsize; i++) { + e = &psmi_epid_table.table[i]; + if (e->entry == NULL) + continue; + /* When rehashing, mark deleted as free again */ + if (e->entry == EPADDR_DELETED) { + psmi_epid_table.tabsize_used--; + continue; + } + idx = (int)(e->key % newsz); + while (newtab[idx].entry != NULL) + if (++idx == newsz) + idx = 0; + newtab[idx].entry = e->entry; + newtab[idx].key = e->key; + newtab[idx].ep = e->ep; + newtab[idx].epid = e->epid; + } + psmi_free(psmi_epid_table.table); + } + psmi_epid_table.table = newtab; + psmi_epid_table.tabsize = newsz; + } + key = hash_this(ep, epid); + idx = (int)(key % psmi_epid_table.tabsize); + e = &psmi_epid_table.table[idx]; + while (e->entry && e->entry != EPADDR_DELETED) { + if (++idx == psmi_epid_table.tabsize) + idx = 0; + e = &psmi_epid_table.table[idx]; + } + e->entry = entry; + e->key = key; + e->epid = epid; + e->ep = ep; + +fail: + pthread_mutex_unlock(&psmi_epid_table.tablock); + return err; +} + +static psmi_lock_t psmi_gethostname_lock; + +static void __attribute__ ((constructor)) __psmi_gethostname_lock_constructor(void) +{ + psmi_init_lock(&psmi_gethostname_lock); +} + +char *psmi_gethostname(void) +{ + static char hostname[80] = { '\0' }; + char *c; + + if (hostname[0] == '\0') { + 
PSMI_LOCK(psmi_gethostname_lock); + /* CRITICAL SECTION START */ + if (hostname[0] == '\0') { + gethostname(hostname, sizeof(hostname)); + hostname[sizeof(hostname) - 1] = '\0'; /* no guarantee of nul termination */ + if ((c = strchr(hostname, '.'))) + *c = '\0'; + } + PSMI_UNLOCK(psmi_gethostname_lock); + /* CRITICAL SECTION END */ + } + + return hostname; +} + +/* + * Hostname stuff. We really only register the network portion of the epid + * since all epids from the same nid are assumed to have the same hostname. + */ +psm2_error_t +psmi_epid_set_hostname(uint64_t nid, const char *hostname, int overwrite) +{ + size_t hlen; + char *h; + psm2_error_t err = PSM2_OK; + + if (hostname == NULL) + return PSM2_OK; + /* First see if a hostname already exists */ + if ((h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid)) != NULL) { + if (!overwrite) + return PSM2_OK; + + h = psmi_epid_remove(PSMI_EP_HOSTNAME, nid); + if (h != NULL) /* free the previous hostname if so exists */ + psmi_free(h); + } + + hlen = min(PSMI_EP_HOSTNAME_LEN, strlen(hostname) + 1); + h = (char *)psmi_malloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, hlen); + if (h == NULL) + return PSM2_NO_MEMORY; + snprintf(h, hlen, "%s", hostname); + h[hlen - 1] = '\0'; + err = psmi_epid_add(PSMI_EP_HOSTNAME, nid, h); + return err; +} + +/* XXX These three functions are not thread safe, we'll use a rotating buffer + * trick to make them thread safe because we really only have a few thread + * (assuming multi_ep has < 8 threads of it's own) */ +/* this returns just the addressing */ +const char *psmi_epaddr_fmt_addr(psm2_epid_t epid) +{ + static char hostnamebufs[16][PSMI_EP_HOSTNAME_LEN]; + static int bufno; + char *hostname; + + hostname = hostnamebufs[bufno]; + bufno = (bufno + 1) % 16; + + char buf[INET_ADDRSTRLEN]; + if (PSMI_EPID_GET_EPID_VERSION(epid) == PSMI_EPID_V4) + snprintf(hostname, PSMI_EP_HOSTNAME_LEN - 1, "IP=%s QP=%d", + psmi_ipv4_ntop((uint32_t)PSMI_EPID_GET_LID(epid), buf, sizeof(buf)), + (int)PSMI_EPID_GET_CONTEXT(epid)); + else + snprintf(hostname, PSMI_EP_HOSTNAME_LEN - 1, "LID=%d QP=%d", + (int)PSMI_EPID_GET_LID(epid), + (int)PSMI_EPID_GET_CONTEXT(epid)); + hostname[PSMI_EP_HOSTNAME_LEN - 1] = '\0'; + return hostname; +} + +/* this returns the simple name, if not known gives addressing */ +const char *psmi_epaddr_get_hostname(psm2_epid_t epid) +{ + uint64_t nid = psm2_epid_nid(epid); + char *h; + + /* First, if we have registered a host for this epid, just return that, or + * else try to return something with lid and context */ + h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid); + if (h != NULL) + return h; + else { + return psmi_epaddr_fmt_addr(epid); + } +} + +/* this returns the name and addressing */ +/* if not known just gives addressing */ +const char *psmi_epaddr_get_name(psm2_epid_t epid) +{ + static char hostnamebufs[4][PSMI_EP_HOSTNAME_LEN]; + static int bufno; + char *h, *hostname; + hostname = hostnamebufs[bufno]; + bufno = (bufno + 1) % 4; + + h = psmi_epid_lookup(PSMI_EP_HOSTNAME, psm2_epid_nid(epid)); + if (h == NULL) + return psmi_epaddr_get_hostname(epid); + else { + snprintf(hostname, PSMI_EP_HOSTNAME_LEN - 1, "%s (%s)", h, + psmi_epaddr_fmt_addr(epid)); + hostname[PSMI_EP_HOSTNAME_LEN - 1] = '\0'; + } + return hostname; +} + + +// superset of inet_ntop. For AF_INET and AF_INET6 outputs address and port +// for AF_IB outputs address sid and pkey +const char *psmi_sockaddr_ntop(struct sockaddr* addr, char *dst, socklen_t size) +{ + if (! 
dst || size < PSM_ADDRSTRLEN) { + // be strict, keeps it simple + errno = ENOSPC; + return "ENOSPC"; // callers just use in a printf + } + *dst = '\0'; // be safe + if (! addr) { + snprintf(dst, size, "(nil)"); + return dst; + } + // show network address and port (or sid) + switch (addr->sa_family) { + case AF_INET: + { + struct sockaddr_in* in_addr = ((struct sockaddr_in*)addr); + // we show the IPv4 address and port + inet_ntop(AF_INET, &in_addr->sin_addr, dst, size); + snprintf(dst+strlen(dst), size-strlen(dst), " %u", be16toh(in_addr->sin_port)); + return dst; + } + case AF_INET6: + { + struct sockaddr_in6* in_addr = ((struct sockaddr_in6*)addr); + // we show just the IPv6 address and port. + // could also show scope_id and flowinfo + inet_ntop(AF_INET6, &in_addr->sin6_addr, dst, size); + snprintf(dst+strlen(dst), size-strlen(dst), " %u", be16toh(in_addr->sin6_port)); + return dst; + } + case AF_IB: + { + struct sockaddr_ib* ib_addr = ((struct sockaddr_ib*)addr); + // we show the GID sid and pkey. + // Could also output sid_mask and sib_scope_id + inet_ntop(AF_INET6, &ib_addr->sib_addr, dst, size); + snprintf(dst+strlen(dst), size-strlen(dst), " 0x%016"PRIx64" 0x%04"PRIx16, be64toh(ib_addr->sib_sid), be16toh(ib_addr->sib_pkey)); + return dst; + } + default: + snprintf(dst, size, "Unsupported"); + return dst; + } +} + +// subset of inet_ntop. +// formats address or netmask (in host byte order) +// into buf which has >= buf_size bytes available. +// returns a \0 terminated string suitable for use in printf such as: +// { char buf[INET_ADDRSTRLEN]; +// printf("IP=%s\n", psmi_ipv4_ntop(ip_addr, buf, sizeof(buf));} +// on success pointer returned will be to buf. For various errors a +// constant string outside of buf may be returned such that caller can safely +// call printf (or similar functions) without checking return value. +// on errors, errno is also set. +// Note IPv4 addresses worse case length is INET_ADDRSTRLEN. +const char *psmi_ipv4_ntop(uint32_t ip_addr, char *dst, socklen_t size) +{ + struct in_addr in_addr; + if (! dst || size < INET_ADDRSTRLEN) { + // be strict, keeps it simple + errno = ENOSPC; + return "ENOSPC"; // callers just use in a printf + } + *dst = '\0'; // be safe + in_addr.s_addr = __cpu_to_be32(ip_addr); + // we show the IPv4 address and port + inet_ntop(AF_INET, &in_addr, dst, size); + return dst; +} + +socklen_t psmi_sockaddr_len(struct sockaddr* addr) +{ + switch (addr->sa_family) { + case AF_INET: + return (sizeof(struct sockaddr_in)); + case AF_INET6: + return (sizeof(struct sockaddr_in6)); + case AF_IB: + return (sizeof(struct sockaddr_ib)); + default: + // unknown + return 0; // be conservative + } +} + +// used for IPv4 netmask processing. A valid netmask has a sequence of 1s +// and then all other bits are 0. +// This counts how many 1s are in the high end of the netmask and confirms +// the remaining low bits are 0. 
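+// For example, 0xffffff00 (255.255.255.0) returns 24, while a non-contiguous
+// mask such as 0xff00ff00 returns -1.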
+int psmi_count_high_bits(uint32_t netmask) +{ + int i=0; + uint32_t mask = 0x80000000; + while (mask & netmask) { + i++; mask >>= 1; + } + // confirm all low bits of netmask are 0 + if (netmask != psmi_bit_count_to_mask(i)) + return -1; + return i; +} + +// given an IPv4 address, figure out which ifconfig entry matches and +// return the netmask +int psmi_get_eth_netmask(__be32 ip_addr, __be32 *netmask) +{ + struct ifaddrs *ifap, *ifa; + + if (getifaddrs(&ifap) == 0) { + for (ifa = ifap; ifa != NULL; ifa = ifa->ifa_next) { + struct sockaddr_in *addr = (struct sockaddr_in *)ifa->ifa_addr; + struct sockaddr_in *nmask = (struct sockaddr_in *)ifa->ifa_netmask; + __be32 nm; + char buf[INET_ADDRSTRLEN]; + char buf2[INET_ADDRSTRLEN]; + + if (!nmask) continue; + if (addr->sin_family != AF_INET) continue; + if (addr->sin_addr.s_addr != ip_addr) continue; + + nm = (__be32)nmask->sin_addr.s_addr; + + if (_HFI_DBG_ON) { + _HFI_DBG("Related ifaddr[%s]: %s netmask %s\n", + ifa->ifa_name, + psmi_ipv4_ntop(__be32_to_cpu(ip_addr), buf, sizeof(buf)), + psmi_ipv4_ntop(__be32_to_cpu(nm), buf2, sizeof(buf2))); + } + *netmask = nm; + break; + } + (void)freeifaddrs(ifap); + } else { + return -1; + } + return 0; +} + +/* Wrapper, in case we port to OS xyz that doesn't have sysconf */ +uintptr_t psmi_getpagesize(void) +{ + static uintptr_t pagesz = (uintptr_t) -1; + long sz; + if (pagesz != (uintptr_t) -1) + return pagesz; + sz = sysconf(_SC_PAGESIZE); + if (sz == -1) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Can't query system page size"); + } + + pagesz = (uintptr_t) sz; + return pagesz; +} + +/* _CONSUMED_ALL() is a macro which indicates if strtol() consumed all + of the input passed to it. */ +#define _CONSUMED_ALL(CHAR_PTR) (((CHAR_PTR) != NULL) && (*(CHAR_PTR) == 0)) + +/* parse env of the form 'val' or 'val:' or 'val:pattern' + * for PSM3_VERBOSE_ENV and PSM3_IDENITFY + * if nothing provided or doesn't match current process, def is returned + * if syntax error, def_syntax is returned + */ +static int psmi_parse_val_pattern(const char *env, int def, int def_syntax) +{ + int ret = def; + + if (env && *env) { + char *e = psmi_strdup(NULL, env); + char *ep; + char *p; + + psmi_assert_always(e != NULL); + if (e == NULL) // for klocwork + goto done; + p = strchr(e, ':'); + if (p) + *p = '\0'; + int val = (int)strtol(e, &ep, 0); + if (! _CONSUMED_ALL(ep)) + ret = def_syntax; + else + ret = val; + if (val && p) { + if (! *(p+1)) { // val: -> val:*:rank0 + if (hfi_get_myrank() != 0) + ret = def; + } else if (0 != fnmatch(p+1, hfi_get_mylabel(), 0 +#ifdef FNM_EXTMATCH + | FNM_EXTMATCH +#endif + )) + ret = def; + } + psmi_free(e); + } +done: + return ret; +} + +/* If PSM3_VERBOSE_ENV is set in the environment, we determine + * what its verbose level is and print the environment at "INFO" + * level if the environment's level matches the desired printlevel. 
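+ * For example, PSM3_VERBOSE_ENV=1 reports only settings that differ from
+ * their defaults, PSM3_VERBOSE_ENV=2 also includes the help text, and a
+ * suffix such as "2:" or "2:pattern" (pattern is an illustrative fnmatch
+ * glob) restricts the output to rank 0 or to processes whose label
+ * matches the pattern.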
+ */ +static int psmi_getenv_verblevel = -1; +static int psmi_getenv_is_verblevel(int printlevel) +{ + if (psmi_getenv_verblevel == -1) { + char *env = getenv("PSM3_VERBOSE_ENV"); + int nlevel = PSMI_ENVVAR_LEVEL_USER; + psmi_getenv_verblevel = psmi_parse_val_pattern(env, 0, 2); + if (psmi_getenv_verblevel < 0 || psmi_getenv_verblevel > 3) + psmi_getenv_verblevel = 2; + if (psmi_getenv_verblevel > 0) + nlevel = 0; /* output at INFO level */ + if (psmi_getenv_verblevel == 1) + _HFI_ENVDBG(0, " %-25s => '%s' (default was '%s')\n", + "PSM3_VERBOSE_ENV", env?env:"", "0"); + else if (env && *env) + _HFI_ENVDBG(nlevel, " %-25s %-40s => '%s' (default was '%s')\n", + "PSM3_VERBOSE_ENV", + "Enable verbose output of environment variables. " + "(0 - none, 1 - changed w/o help, 2 - user help, " + "#: - limit output to rank 0, #:pattern - limit output " + "to processes whose label matches " +#ifdef FNM_EXTMATCH + "extended " +#endif + "glob pattern)", +// don't document that 3 and 3: and 3:pattern can output hidden params + env, "0"); + else /* defaulted */ + _HFI_ENVDBG(nlevel, + " %-25s %-40s => '%s'\n", + "PSM3_VERBOSE_ENV", + "Enable verbose output of environment variables. " + "(0 - none, 1 - changed w/o help, 2 - user help, " + "#: - limit output to rank 0, #:pattern - limit output " + "to processes whose label matches " +#ifdef FNM_EXTMATCH + "extended " +#endif + "glob pattern)", +// don't document that 3 and 3: and 3:pattern can output hidden params + "0"); + } + return ((printlevel <= psmi_getenv_verblevel + && psmi_getenv_verblevel == 1) + || printlevel <= psmi_getenv_verblevel-1); +} + +#define GETENV_PRINTF(_level, _fmt, ...) \ + do { \ + if ((_level & PSMI_ENVVAR_LEVEL_NEVER_PRINT) == 0) \ + { \ + int nlevel = _level; \ + if (psmi_getenv_is_verblevel(nlevel)) \ + nlevel = 0; /* output at INFO level */ \ + _HFI_ENVDBG(nlevel, _fmt, ##__VA_ARGS__); \ + } \ + } while (0) + +int +MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level, + int type, union psmi_envvar_val defval, + union psmi_envvar_val *newval) +{ + int used_default = 0; + union psmi_envvar_val tval; + char *env = getenv(name); +#if _HFI_DEBUGGING + int ishex = (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS || + type == PSMI_ENVVAR_TYPE_UINT_FLAGS); +#endif + + /* for verblevel 1 we only output non-default values with no help + * for verblevel>1 we promote to info (verblevel=2 promotes USER, + * verblevel=3 promotes HIDDEN) and show help. + * for verblevel< 1 we don't promote anything and show help + */ +#define _GETENV_PRINT(used_default, fmt, val, defval) \ + do { \ + (void)psmi_getenv_is_verblevel(level); \ + if (used_default && psmi_getenv_verblevel != 1) \ + GETENV_PRINTF(level, "%s%-25s %-40s =>%s" fmt \ + "\n", level > 1 ? "*" : " ", name, \ + descr, ishex ? "0x" : " ", val); \ + else if (! used_default && psmi_getenv_verblevel == 1) \ + GETENV_PRINTF(1, "%s%-25s =>%s" \ + fmt " (default was%s" fmt ")\n", \ + level > 1 ? "*" : " ", name, \ + ishex ? " 0x" : " ", val, \ + ishex ? " 0x" : " ", defval); \ + else if (! used_default && psmi_getenv_verblevel != 1) \ + GETENV_PRINTF(1, "%s%-25s %-40s =>%s" \ + fmt " (default was%s" fmt ")\n", \ + level > 1 ? "*" : " ", name, descr, \ + ishex ? " 0x" : " ", val, \ + ishex ? " 0x" : " ", defval); \ + } while (0) + +#define _CONVERT_TO_NUM(DEST,TYPE,STRTOL) \ + do { \ + char *ep; \ + /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ \ + DEST = (TYPE)STRTOL(env, &ep, 10); \ + if (! _CONSUMED_ALL(ep)) { \ + DEST = (TYPE)STRTOL(env, &ep, 16); \ + if (! 
_CONSUMED_ALL(ep)) { \ + used_default = 1; \ + tval = defval; \ + } \ + } \ + } while (0) + + switch (type) { + case PSMI_ENVVAR_TYPE_YESNO: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } else if (env[0] == 'Y' || env[0] == 'y') + tval.e_int = 1; + else if (env[0] == 'N' || env[0] == 'n') + tval.e_int = 0; + else { + char *ep; + tval.e_ulong = strtoul(env, &ep, 0); + if (ep == env) { + used_default = 1; + tval = defval; + } else if (tval.e_ulong != 0) + tval.e_ulong = 1; + } + _GETENV_PRINT(used_default, "%s", tval.e_long ? "YES" : "NO", + defval.e_int ? "YES" : "NO"); + break; + + case PSMI_ENVVAR_TYPE_STR: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } else + tval.e_str = env; + _GETENV_PRINT(used_default, "'%s'", tval.e_str, defval.e_str); + break; + + case PSMI_ENVVAR_TYPE_INT: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } else { + _CONVERT_TO_NUM(tval.e_int,int,strtol); + } + _GETENV_PRINT(used_default, "%d", tval.e_int, defval.e_int); + break; + + case PSMI_ENVVAR_TYPE_UINT: + case PSMI_ENVVAR_TYPE_UINT_FLAGS: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } else { + _CONVERT_TO_NUM(tval.e_int,unsigned int,strtoul); + } + if (type == PSMI_ENVVAR_TYPE_UINT_FLAGS) + _GETENV_PRINT(used_default, "%x", tval.e_uint, + defval.e_uint); + else + _GETENV_PRINT(used_default, "%u", tval.e_uint, + defval.e_uint); + break; + + case PSMI_ENVVAR_TYPE_LONG: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } else { + _CONVERT_TO_NUM(tval.e_long,long,strtol); + } + _GETENV_PRINT(used_default, "%ld", tval.e_long, defval.e_long); + break; + case PSMI_ENVVAR_TYPE_ULONG_ULONG: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } else { + _CONVERT_TO_NUM(tval.e_ulonglong,unsigned long long,strtoull); + } + _GETENV_PRINT(used_default, "%llu", + tval.e_ulonglong, defval.e_ulonglong); + break; + case PSMI_ENVVAR_TYPE_ULONG: + case PSMI_ENVVAR_TYPE_ULONG_FLAGS: + default: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } else { + _CONVERT_TO_NUM(tval.e_ulong,unsigned long,strtoul); + } + if (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS) + _GETENV_PRINT(used_default, "%lx", tval.e_ulong, + defval.e_ulong); + else + _GETENV_PRINT(used_default, "%lu", tval.e_ulong, + defval.e_ulong); + break; + } +#undef _GETENV_PRINT + *newval = tval; + + return used_default; +} +MOCK_DEF_EPILOGUE(psmi_getenv); + +/* + * Parsing long parameters + * -1 -> parse error + */ +long psmi_parse_str_long(const char *string) +{ + char *ep; \ + long ret; + + if (! string || ! *string) + return -1; + /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ + ret = strtol(string, &ep, 10); + if (! _CONSUMED_ALL(ep)) { + ret = strtol(string, &ep, 16); + if (! _CONSUMED_ALL(ep)) + return -1; + } + return ret; +} + +/* + * Parsing int parameters set in string tuples. + * Output array int *vals should be able to store 'ntup' elements. + * Values are only overwritten if they are parsed. 
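+ * For example, with ntup == 3 the string "1::3" sets vals[0] = 1 and
+ * vals[2] = 3 while vals[1] keeps its previous value (return value 2).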
+ * Tuples are always separated by colons ':' + */ +int psmi_parse_str_tuples(const char *string, int ntup, int *vals) +{ + char *b = (char *)string; + char *e = b; + int tup_i = 0; + int n_parsed = 0; + char *buf = psmi_strdup(NULL, string); + psmi_assert_always(buf != NULL); + + while (*e && tup_i < ntup) { + b = e; + while (*e && *e != ':') + e++; + if (e > b) { /* something to parse */ + char *ep; + int len = e - b; + long int l; + strncpy(buf, b, len); + buf[len] = '\0'; + l = strtol(buf, &ep, 0); + if (ep != buf) { /* successful conversion */ + vals[tup_i] = (int)l; + n_parsed++; + } + } + if (*e == ':') + e++; /* skip delimiter */ + tup_i++; + } + psmi_free(buf); + return n_parsed; +} + +/* + * Memory footprint/usage mode. + * + * This can be used for debug or for separating large installations from + * small/medium ones. The default is to assume a medium installation. Large + * is not that much larger in memory footprint, but we make a conscious effort + * an consuming only the amount of memory we need. + */ +int psmi_parse_memmode(void) +{ + union psmi_envvar_val env_mmode; + int used_default = + psmi_getenv("PSM3_MEMORY", "Memory usage mode (min, normal or large)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"normal", &env_mmode); + if (used_default || !strcasecmp(env_mmode.e_str, "normal")) + return PSMI_MEMMODE_NORMAL; + else if (!strcasecmp(env_mmode.e_str, "min")) + return PSMI_MEMMODE_MINIMAL; + else if (!strcasecmp(env_mmode.e_str, "large") || + !strcasecmp(env_mmode.e_str, "big")) + return PSMI_MEMMODE_LARGE; + else { + _HFI_PRDBG("PSM3_MEMORY env value %s unrecognized, " + "using 'normal' memory mode instead\n", + env_mmode.e_str); + return PSMI_MEMMODE_NORMAL; + } +} + +/* RDMA mode */ +// we need this early when setting defaults for RV thresholds in psmi_mq_malloc +// and also want this available when creating the verbs_ep since it may affect +// sizing of CQs and buffers. 
But during mq_malloc we don't have an ep or proto +// to save this into +// The value returned is a bitmask of IPS_PROTOEXP_FLAG_* selections +unsigned psmi_parse_rdmamode(void) +{ + union psmi_envvar_val env_rdma; + static unsigned saved_rdmamode = 0xffffffff; + + // only parse once so doesn't appear in PSM3_VERBOSE_ENV multiple times + if (saved_rdmamode != 0xffffffff) + return saved_rdmamode; + + psmi_getenv("PSM3_RDMA", + "RDMA proto control (0-no RDMA," +#ifdef RNDV_MOD_MR + " 1-kernel RDMA," +#endif + " 2-user RDMA, 3-user RC send/RDMA) " + //" additional flags: 8-interleave, 0x10-serialize" + // IPS_PROTOEXP_FLAG_TID_DEBUG (0x4) N/A + , + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)IPS_PROTOEXP_FLAGS_DEFAULT, + &env_rdma); +#ifndef RNDV_MOD_MR + if (IPS_PROTOEXP_FLAG_KERNEL_QP(env_rdma.e_uint)) + env_rdma.e_uint = 0; +#endif + saved_rdmamode = env_rdma.e_uint; + return saved_rdmamode; +} + +/* PSM3_IDENTIFY */ +// we need in multiple places +int psmi_parse_identify(void) +{ + union psmi_envvar_val myenv; + static int saved_identify = -1; + + // only parse once so doesn't appear in PSM3_VERBOSE_ENV multiple times + if (saved_identify >= 0) + return saved_identify; + + psmi_getenv("PSM3_IDENTIFY", "Identify PSM version being run " + "(0 - disable, 1 - enable, 1: - limit output to rank 0, " + "1:pattern - limit output " + "to processes whose label matches " +#ifdef FNM_EXTMATCH + "extended " +#endif + "glob pattern)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"0", &myenv); + saved_identify = psmi_parse_val_pattern(myenv.e_str, 0, 0); + + return saved_identify; +} + +static +const char *psmi_memmode_string(int mode) +{ + psmi_assert(mode >= PSMI_MEMMODE_NORMAL && mode < PSMI_MEMMODE_NUM); + switch (mode) { + case PSMI_MEMMODE_NORMAL: + return "normal"; + case PSMI_MEMMODE_MINIMAL: + return "minimal"; + case PSMI_MEMMODE_LARGE: + return "large"; + default: + return "unknown"; + } +} + +psm2_error_t +psmi_parse_mpool_env(const psm2_mq_t mq, int level, + const struct psmi_rlimit_mpool *rlim, + uint32_t *valo, uint32_t *chunkszo) +{ + uint32_t val; + const char *env = rlim->env; + int mode = mq->memmode; + psm2_error_t err = PSM2_OK; + union psmi_envvar_val env_val; + + psmi_assert_always(mode >= PSMI_MEMMODE_NORMAL + && mode < PSMI_MEMMODE_NUM); + + psmi_getenv(rlim->env, rlim->descr, rlim->env_level, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)rlim->mode[mode].obj_max, &env_val); + + val = env_val.e_uint; + if (val < rlim->minval || val > rlim->maxval) { + err = psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Env. 
var %s=%u is invalid (valid settings in mode PSM3_MEMORY=%s" + " are inclusively between %u and %u)", + env, val, psmi_memmode_string(mode), + rlim->minval, rlim->maxval); + goto fail; + } + + _HFI_VDBG("%s max=%u,chunk=%u (mode=%s(%u),min=%u,max=%u)\n", + env, val, rlim->mode[mode].obj_chunk, + psmi_memmode_string(mode), mode, rlim->minval, rlim->maxval); + + *valo = val; + *chunkszo = rlim->mode[mode].obj_chunk; + +fail: + return err; +} + +uint64_t psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns) +{ + if (timeout_ns < 0) + return 0ULL; + else if (timeout_ns == 0ULL || timeout_ns == ~0ULL) + return ~0ULL; + else { + uint64_t t_end = nanosecs_to_cycles(timeout_ns); + uint64_t t_now = get_cycles() - start_cycles; + + if (t_now >= t_end) + return 0ULL; + else + return (t_end - t_now); + } +} + +uint32_t psmi_get_ipv4addr() +{ + struct hostent *he; + uint32_t addr = 0; + + he = gethostbyname(psmi_gethostname()); + if (he != NULL && he->h_addrtype == AF_INET && he->h_addr != NULL) { + memcpy(&addr, he->h_addr, sizeof(uint32_t)); + return addr; + } else + return 0; +} + +#define PSMI_EP_IS_PTR(ptr) ((ptr) != NULL && (ptr) < PSMI_EP_LOGEVENT) + +void +psmi_syslog(psm2_ep_t ep, int to_console, int level, const char *format, ...) +{ + va_list ap; + + /* If we've never syslogged anything from this ep at the PSM level, make + * sure we log context information */ + if (PSMI_EP_IS_PTR(ep) && !ep->did_syslog) { + char uuid_str[64]; + ep->did_syslog = 1; + + memset(&uuid_str, 0, sizeof(uuid_str)); + psmi_uuid_unparse(ep->uuid, uuid_str); + hfi_syslog("PSM", 0, LOG_WARNING, + "uuid_key=%s,unit=%d" + , + uuid_str, + ep->unit_id + ); + } + + va_start(ap, format); + hfi_vsyslog("PSM", to_console, level, format, ap); + va_end(ap); +} + +/* Table of CRCs of all 8-bit messages. */ +static uint32_t crc_table[256]; + +/* Flag: has the table been computed? Initially false. */ +static int crc_table_computed; + +/* Make the table for a fast CRC. */ +static void make_crc_table(void) +{ + uint32_t c; + int n, k; + + for (n = 0; n < 256; n++) { + c = (uint32_t) n; + for (k = 0; k < 8; k++) { + if (c & 1) + c = 0xedb88320 ^ (c >> 1); + else + c = c >> 1; + } + crc_table[n] = c; + } + crc_table_computed = 1; +} + +/* Update a running CRC with the bytes buf[0..len-1]--the CRC + * should be initialized to all 1's, and the transmitted value + * is the 1's complement of the final running CRC (see the + * crc() routine below)). + */ + +static uint32_t update_crc(uint32_t crc, unsigned char *buf, int len) +{ + uint32_t c = crc; + int n; + + if_pf(!crc_table_computed) + make_crc_table(); + for (n = 0; n < len; n++) { + c = crc_table[(c ^ buf[n]) & 0xff] ^ (c >> 8); + } + return c; +} + +/* Return the CRC of the bytes buf[0..len-1]. 
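+ * This is the standard reflected CRC-32 (polynomial 0xedb88320): the running
+ * CRC is seeded with all 1's and the final value is complemented.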
*/ +uint32_t psmi_crc(unsigned char *buf, int len) +{ + return update_crc(0xffffffff, buf, len) ^ 0xffffffff; +} + +int psmi_multi_ep_enabled = 0; +void psmi_multi_ep_init() +{ + union psmi_envvar_val env_fi; + + psmi_getenv("PSM3_MULTI_EP", "PSM3 Multiple Endpoints (yes/no)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO, + PSMI_ENVVAR_VAL_YES, &env_fi); + + psmi_multi_ep_enabled = env_fi.e_uint; +} + +#ifdef PSM_FI + +struct psmi_faultinj_spec { + STAILQ_ENTRY(psmi_faultinj_spec) next; + char spec_name[PSMI_FAULTINJ_SPEC_NAMELEN]; + + uint64_t num_faults; + uint64_t num_calls; + + struct drand48_data drand48_data; + int num; + int denom; + long int initial_seed; +}; + +int psmi_faultinj_enabled = 0; +int psmi_faultinj_verbose = 0; +char *psmi_faultinj_outfile = NULL; +int psmi_faultinj_sec_rail = 0; + +static struct psmi_faultinj_spec psmi_faultinj_dummy; +static STAILQ_HEAD(, psmi_faultinj_spec) psmi_faultinj_head = + STAILQ_HEAD_INITIALIZER(psmi_faultinj_head); +int psmi_faultinj_num_entries; + +void psmi_faultinj_init() +{ + union psmi_envvar_val env_fi; + + psmi_getenv("PSM3_FI", "PSM Fault Injection " + "(0 - disable, 1 - enable, 1: - limit to rank 0, " + "1:pattern - limit " + "to processes whose label matches " +#ifdef FNM_EXTMATCH + "extended " +#endif + "glob pattern)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"0", &env_fi); + psmi_faultinj_enabled = psmi_parse_val_pattern(env_fi.e_str, 0, 0); + + if (psmi_faultinj_enabled) { + char *def = NULL; + if (!psmi_getenv + ("PSM3_FI_TRACEFILE", "PSM Fault Injection output file", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)def, &env_fi)) { + psmi_faultinj_outfile = psmi_strdup(NULL, env_fi.e_str); + } + if (!psmi_getenv + ("PSM3_FI_VERBOSE", "PSM Fault verbose output", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, &env_fi)) { + psmi_faultinj_verbose = env_fi.e_int; + } + if (!psmi_getenv + ("PSM3_FI_RAIL", "PSM Fault Injection rail (0=all, 1=secondary only)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, &env_fi)) { + psmi_faultinj_sec_rail = env_fi.e_int; + } + } + + return; +} + +/* we only grow new entries, so if we fail to allocate, just ignore request */ +static void psmi_faultinj_reregister_stats() +{ + struct psmi_stats_entry *entries; + struct psmi_stats_entry *e; + int num_entries = 0; + struct psmi_faultinj_spec *fi; + + entries = psmi_calloc(PSMI_EP_NONE, STATS, psmi_faultinj_num_entries, + sizeof(struct psmi_stats_entry)); + if (! 
entries) + return; + e = entries; + STAILQ_FOREACH(fi, &psmi_faultinj_head, next) { + psmi_stats_init_u64(e, fi->spec_name, &fi->num_faults); + e++; num_entries++; + } + + psmi_stats_reregister_type("Fault_Injection", PSMI_STATSTYPE_FAULTINJ, + entries, num_entries, 0, &psmi_faultinj_head); + psmi_free(entries); +} + +void psmi_faultinj_fini() +{ + struct psmi_faultinj_spec *fi; + FILE *fp; + int do_fclose = 0; + + if (!psmi_faultinj_enabled) + return; + psmi_stats_deregister_type(PSMI_STATSTYPE_FAULTINJ, &psmi_faultinj_head); + + if (psmi_faultinj_outfile == NULL) + return; + if (strncmp(psmi_faultinj_outfile, "stdout", 7) == 0) + fp = stdout; + else if (strncmp(psmi_faultinj_outfile, "stderr", 7) == 0) + fp = stderr; + else { + char *c = psmi_faultinj_outfile; + char buf[192]; + int append = 0; + if (*c == '+') { + append = 1; + ++c; + } + do_fclose = 1; + snprintf(buf, sizeof(buf) - 1, "%s.%s", c, hfi_get_mylabel()); + buf[sizeof(buf) - 1] = '\0'; + fp = fopen(buf, append ? "a" : "w"); + } + + if (fp != NULL) { + STAILQ_FOREACH(fi, &psmi_faultinj_head, next) { + fprintf(fp, "%s:%s PSM3_FI_%-13s %2.3f%% => " + "%2.3f%% %10"PRIu64" faults/%10"PRIu64" events seed %10ld\n", + __progname, hfi_get_mylabel(), fi->spec_name, + (double)fi->num * 100.0 / fi->denom, + (fi->num_calls ? + (double)fi->num_faults * 100.0 / fi->num_calls + :(double)0.0), + fi->num_faults, fi->num_calls, + fi->initial_seed); + } + fflush(fp); + if (do_fclose) + fclose(fp); + } + + psmi_free(psmi_faultinj_outfile); + return; +} + +/* + * Intended to be used only once, not in the critical path + */ +struct psmi_faultinj_spec *psmi_faultinj_getspec(const char *spec_name, + const char *help, int num, + int denom) +{ + struct psmi_faultinj_spec *fi; + + if (!psmi_faultinj_enabled) + return &psmi_faultinj_dummy; + + STAILQ_FOREACH(fi, &psmi_faultinj_head, next) { + if (strcmp(fi->spec_name, spec_name) == 0) + return fi; + } + + /* We got here, so no spec -- allocate one */ + fi = psmi_malloc(PSMI_EP_NONE, UNDEFINED, + sizeof(struct psmi_faultinj_spec)); + psmi_assert_always(fi != NULL); + strncpy(fi->spec_name, spec_name, PSMI_FAULTINJ_SPEC_NAMELEN - 1); + fi->spec_name[PSMI_FAULTINJ_SPEC_NAMELEN - 1] = '\0'; + fi->num = num; + fi->denom = denom; + fi->initial_seed = getpid(); + fi->num_faults = 0; + fi->num_calls = 0; + + /* + * See if we get a hint from the environment. + * Format is + * + * + * By default, we chose the initial seed to be the 'pid'. If users need + * repeatability, they should set initial_seed to be the 'pid' when the + * error was observed or force the initial_seed to be a constant number in + * each running process. Using 'pid' is useful because core dumps store + * pids and our backtrace format does as well so if a crash is observed for + * a specific seed, programs can reuse the 'pid' to regenerate the same + * error condition. 
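+ * For example (using a hypothetical spec name), PSM3_FI_sendlost=3:1000:12345
+ * requests roughly 3 faults per 1000 events for the "sendlost" injection
+ * point and seeds the generator with 12345.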
+ */ + { + int fvals[3] = { num, denom, (int)getpid() }; + union psmi_envvar_val env_fi; + char fvals_str[128]; + char fname[128]; + char fdesc[300]; + + snprintf(fvals_str, sizeof(fvals_str) - 1, "%d:%d:1", num, + denom); + fvals_str[sizeof(fvals_str) - 1] = '\0'; + snprintf(fname, sizeof(fname) - 1, "PSM3_FI_%s", spec_name); + fname[sizeof(fname) - 1] = '\0'; + snprintf(fdesc, sizeof(fdesc) - 1, "Fault Injection - %s <%s>", + help, fvals_str); + + if (!psmi_getenv(fname, fdesc, PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)fvals_str, &env_fi)) { + /* not using default values */ + int n_parsed = + psmi_parse_str_tuples(env_fi.e_str, 3, fvals); + if (n_parsed >= 1) + fi->num = fvals[0]; + if (n_parsed >= 2) + fi->denom = fvals[1]; + if (n_parsed >= 3) + fi->initial_seed = (long int)fvals[2]; + } + } + srand48_r(fi->initial_seed, &fi->drand48_data); + + psmi_faultinj_num_entries++; + STAILQ_INSERT_TAIL(&psmi_faultinj_head, fi, next); + psmi_faultinj_reregister_stats(); + return fi; +} + +int psmi_faultinj_is_fault(struct psmi_faultinj_spec *fi) +{ + if (!psmi_faultinj_enabled) /* never fault if disabled */ + return 0; + if (fi->num == 0) + return 0; + + fi->num_calls++; + long int rnum; + lrand48_r(&fi->drand48_data, &rnum); + if (((int) (rnum % INT_MAX)) % fi->denom <= fi->num) { + fi->num_faults++; + if (psmi_faultinj_verbose) { + printf("%s: injecting fault: %s\n", hfi_get_mylabel(), fi->spec_name); + fflush(stdout); + } + return 1; + } else + return 0; +} + +#endif /* #ifdef PSM_FI */ + +/* For memory allocation, we kind of break the PSM error handling rules. + * If the caller gets NULL, it has to assume that the error has been handled + * and should always return PSM2_NO_MEMORY */ + +/* + * Log memory increments or decrements of type memstats_t. 
+ */ +struct psmi_memtype_hdr { + struct { + uint64_t size:48; + uint64_t magic:8; + uint64_t type:8; + }; + void *original_allocation; +}; + +// Memory stats will only be collected under debug builds + +#ifdef PSM_DEBUG +#define psmi_stats_mask PSMI_STATSTYPE_MEMORY +#else +#define psmi_stats_mask 0 +#endif + +struct psmi_stats_malloc psmi_stats_memory; + +void psmi_mem_stats_register(void) +{ + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECLU64("Total_(current)", + (uint64_t*)&psmi_stats_memory.m_all_total), + PSMI_STATS_DECLU64("Total_(max)", + (uint64_t*)&psmi_stats_memory.m_all_max), + PSMI_STATS_DECLU64("All_Peers_(current)", + (uint64_t*)&psmi_stats_memory.m_perpeer_total), + PSMI_STATS_DECLU64("All_Peers_(max)", + (uint64_t*)&psmi_stats_memory.m_perpeer_max), + PSMI_STATS_DECLU64("Network_Buffers_(current)", + (uint64_t*)&psmi_stats_memory.m_netbufs_total), + PSMI_STATS_DECLU64("Network_Buffers_(max)", + (uint64_t*)&psmi_stats_memory.m_netbufs_max), + PSMI_STATS_DECLU64("PSM_desctors_(current)", + (uint64_t*)&psmi_stats_memory.m_descriptors_total), + PSMI_STATS_DECLU64("PSM_desctors_(max)", + (uint64_t*)&psmi_stats_memory.m_descriptors_max), + PSMI_STATS_DECLU64("Unexp._buffers_(current)", + (uint64_t*)&psmi_stats_memory.m_unexpbufs_total), + PSMI_STATS_DECLU64("Unexp._Buffers_(max)", + (uint64_t*)&psmi_stats_memory.m_unexpbufs_max), +#ifdef RNDV_MOD_MR + PSMI_STATS_DECLU64("Peer_Rndv_(current)", + (uint64_t*)&psmi_stats_memory.m_peerrndv_total), + PSMI_STATS_DECLU64("Peer_Rndv_(max)", + (uint64_t*)&psmi_stats_memory.m_peerrndv_max), +#endif + PSMI_STATS_DECLU64("statistics_(current)", + (uint64_t*)&psmi_stats_memory.m_stats_total), + PSMI_STATS_DECLU64("statistics_(max)", + (uint64_t*)&psmi_stats_memory.m_stats_max), + PSMI_STATS_DECLU64("Other_(current)", + (uint64_t*)&psmi_stats_memory.m_undefined_total), + PSMI_STATS_DECLU64("Other_(max)", + (uint64_t*)&psmi_stats_memory.m_undefined_max), + }; + + if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) { + psmi_stats_register_type("PSM_memory_allocation_statistics", + PSMI_STATSTYPE_MEMORY, + entries, + PSMI_STATS_HOWMANY(entries), 0, &psmi_stats_memory); + } +} + + +void psmi_log_memstats(psmi_memtype_t type, int64_t nbytes) +{ +#define _add_max_total(type, nbytes) \ + psmi_stats_memory.m_ ## type ## _total += (nbytes); \ + psmi_stats_memory.m_ ## type ## _max = max( \ + psmi_stats_memory.m_ ## type ## _total, \ + psmi_stats_memory.m_ ## type ## _max); + + switch (type) { + case PER_PEER_ENDPOINT: + _add_max_total(perpeer, nbytes); + break; + case NETWORK_BUFFERS: + _add_max_total(netbufs, nbytes); + break; + case DESCRIPTORS: + _add_max_total(descriptors, nbytes); + break; + case UNEXPECTED_BUFFERS: + _add_max_total(unexpbufs, nbytes); + break; + case STATS: + _add_max_total(stats, nbytes); + break; +#ifdef RNDV_MOD_MR + case PEER_RNDV: + _add_max_total(peerrndv, nbytes); + break; +#endif + case UNDEFINED: + _add_max_total(undefined, nbytes); + break; + default: + psmi_assert_always(type == TOTAL); + break; + } + _add_max_total(all, nbytes); + psmi_stats_memory.m_all_max++; +#undef _add_max_total + + return; +} + +#ifdef malloc +#undef malloc +#endif + +#ifdef PSM_HEAP_DEBUG + +/* PSM HEAP DEBUG documentation: + + In the following code, the acronym: 'HD' is short for "Heap Debug". + + Each actual heap allocation will have a header and a trailer surrounding it, + and the header itself may have some vacant space preceding it due to alignment + needs: + + 0. 
This area is the actual return value of posix_memalign and is due to + alignment requirements. (This area does not exist for heap allocations + from malloc()). + 1. HD HEADER + 2. Actual allocation + 3. HD TRAILER + + malloc() / posix_memalign returns area 0 through 3 to the Heap Debug (HD) code, + then the HD code writes areas 1 and 3, and then returns a pointer to area 2 to + the caller. Thereafter, the HD code will inspect areas 1 and 3 of all heap + allocations to make sure they have retained their integrity. + + Surrounding the actual allocation like this enables: + + 1. Checking for heap overrun / underrun of all allocations. + 2. Checking for double frees. + 3. Use of an area that has been freed. + 4. Identifying orphaned heap allocations. + +Constant no-mans-land written to areas that no-one should be writing to: + + */ + +#define HD_NO_MANS_LAND -15 + +/* The following is the declaration of the HD header. */ + +/* Heap debug header magic number type: */ +typedef char HD_Hdr_Magic_Type[8]; + +typedef struct HD_Header_Struct +{ + HD_Hdr_Magic_Type magic1; /* Magic number to ensure this + allocation has integrity. + (guards against heap + overrun from above). */ + const char *allocLoc; /* Source file name/line + number where this heap + allocation was made. */ + const char *freeLoc; /* Source filename/line number + where this heap allocation + was freed. */ + struct HD_Header_Struct *nextHD_header; /* Creates a singly-linked + list of all heap + allocations. */ + uint64_t sizeOfAlloc; /* size of this heap + allocation. */ + void *systemAlloc; /* The actual return value + from malloc()/posix_memaligh(). */ + uint64_t systemAllocSize;/* The size that is actually allocated + by malloc()/posix_memalign(). */ + HD_Hdr_Magic_Type magic2; /* Second magic number to + ensure this allocation + has integrity. + (guards against heap + underrun from the actual + allocation that follows). */ +} __attribute__ ((packed)) HD_Header_Type; + +typedef struct HD_free_list_struct +{ + HD_Header_Type *freedStuct; + struct HD_free_list_struct *next_free_struct; +} HD_Free_Struct_Type; + +static HD_Free_Struct_Type *HD_free_list_root = NULL; +static HD_Free_Struct_Type **HD_free_list_bottom = &HD_free_list_root; + +typedef char HD_Trlr_Magic_Type[16]; + +static const HD_Hdr_Magic_Type HD_HDR_MGC_1 = "Eric"; +static const HD_Hdr_Magic_Type HD_HDR_MGC_2 = "Emily"; +static const HD_Trlr_Magic_Type HD_TRLR_MGC = "Erin&Elaine"; + +/* Convert a pointer of an actual allocation to a pointer to its HD header: */ +static inline HD_Header_Type *HD_AA_TO_HD_HDR(void *aa) +{ + char *p = (char*)aa; + return (HD_Header_Type*)(p - sizeof(HD_Header_Type)); +} + +/* Convert a pointer to an HD header to the actual allocation: */ +static inline void *HD_HDR_TO_AA(HD_Header_Type *phdHdr) +{ + char *p = (char*)phdHdr; + return p + sizeof(HD_Header_Type); +} + +/* Get the address of the trailer that follows the actual allocation: */ +static inline void *HD_GET_HD_TRLR(HD_Header_Type *phdr) +{ + char *p = (char*)HD_HDR_TO_AA(phdr); + return p + phdr->sizeOfAlloc; +} + +static HD_Header_Type * HD_root_of_list = NULL; /* Root of singly linked list + of all heap allocations */ +static HD_Header_Type **HD_end_of_list = &HD_root_of_list; /* Pointer to the + last pointer of the singly linked list of all heap allocations. */ + +/* Number of allocations in the list. Maintained to assert the integrity + of the singly linked list of heap allocations. 
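+   Every hd_malloc()/hd_free() call walks this list to re-validate all live
+   allocations, so heap-debug builds trade speed for integrity checking.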
*/ +static int n_allocations = 0; + +/* HD_check_one_struct() checks one heap allocation for integrity. */ +static inline void HD_check_one_struct(HD_Header_Type *p, int checkAA,const char *curloc) +{ + int s=0; + + /* First check the magic values in the header and trailer: */ + s |= memcmp(p->magic1,HD_HDR_MGC_1,sizeof(HD_HDR_MGC_1)) ? 1 : 0; + s |= memcmp(p->magic2,HD_HDR_MGC_2,sizeof(HD_HDR_MGC_2)) ? 2 : 0; + s |= memcmp(HD_GET_HD_TRLR(p),HD_TRLR_MGC,sizeof(HD_TRLR_MGC)) ? 4 : 0; + + if (s != 0) + { + fprintf(stderr,"header/trailer error: checking location: %s, s: %d, p: %p, " + "p->allocLoc: %s\n",curloc,s,p,p->allocLoc); + fprintf(stderr,"actual allocation starts at: %p, length: %" PRIu64 "\n", (char*)HD_HDR_TO_AA(p),p->sizeOfAlloc); + fflush(0); + abort(); + } + + /* Next, check the area between systemAlloc and the start of the header */ + signed char *pchr = (signed char *)p->systemAlloc; + while (pchr < (signed char*)p) + { + psmi_assert_always(*pchr == (signed char) HD_NO_MANS_LAND); + pchr++; + } + + /* Lastly, check the actual allocation area if directed to do so: */ + if (checkAA) + { + uint64_t i; + signed char *pchr = HD_HDR_TO_AA(p); + for (i=0;i < p->sizeOfAlloc;i++) + if (pchr[i] != (signed char) HD_NO_MANS_LAND) + { + fprintf(stderr, + "use after free; ptr: %p,\n" + " allocated from: %s,\n" + " validated from: %s\n" + " freed from: %s\n", + pchr+i,p->allocLoc,curloc,p->freeLoc); + fflush(0); + psmi_assert_always(0); + } + } +} + +/* _psmi_heapdebug_val_heapallocs() walks the singly linked list and inspects all + * heap allocations to ensure all of them have integrity still. */ +void _psmi_heapdebug_val_heapallocs(const char *curloc) +{ + /* first check current allocation list: */ + HD_Header_Type *p = HD_root_of_list; + int cnt = 0; + + while (p) + { + HD_check_one_struct(p,0,curloc); + p = p->nextHD_header; + cnt++; + } + psmi_assert_always(cnt == n_allocations); + /* Next check free list */ + HD_Free_Struct_Type *pfreestruct = HD_free_list_root; + while (pfreestruct) + { + HD_check_one_struct(pfreestruct->freedStuct,1,curloc); + pfreestruct = pfreestruct->next_free_struct; + } +} + +/* psmi_heapdebug_finalize() validates the heap and then emits all of the allocations to stdout. + to help debug heap memory leaks. */ +void psmi_heapdebug_finalize(void) +{ + /* First validate the existing heap allocations: */ + + psmi_heapdebug_val_heapallocs(); + + printf("orphaned heap allocations: %d\n", n_allocations); + + if (n_allocations > 0) + { + /* Now, emit all of the alloations to stdout. */ + + HD_Header_Type *p = HD_root_of_list; + + while (p) + { + printf("orphaned heap allocation: %p allocated at: %s, size: %lu\n", + p, p->allocLoc, p->sizeOfAlloc); + + p = p->nextHD_header; + } + fflush(0); + /* Abort if any allocations still exist: */ + abort(); + } +} + +/* hd_est_hdr_trlr() establishes the new allocation to the singly linked list, and adds + * the header and trailer to the allocation. Lastly, it validates the existing singly-linked + * list for integrity. 
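+ * systemAlloc/systemSize describe the raw block returned by malloc() or
+ * posix_memalign(), while actualSize is the caller-visible size stored in
+ * the header.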
*/ +static void hd_est_hdr_trlr(HD_Header_Type *hd_alloc, + void *systemAlloc, + uint64_t systemSize, + uint64_t actualSize, + const char *curloc) +{ + /* First, write HD_NO_MANS_LAND to the entire allocation: */ + memset(systemAlloc,HD_NO_MANS_LAND,systemSize); + + /* Write the HD header info: */ + memcpy(hd_alloc->magic1,HD_HDR_MGC_1,sizeof(HD_HDR_MGC_1)); + hd_alloc->allocLoc = curloc; + hd_alloc->freeLoc = NULL; + hd_alloc->nextHD_header = NULL; + hd_alloc->sizeOfAlloc = actualSize; + hd_alloc->systemAlloc = systemAlloc; + hd_alloc->systemAllocSize = systemSize; + memcpy(hd_alloc->magic2,HD_HDR_MGC_2,sizeof(HD_HDR_MGC_2)); + memcpy(HD_GET_HD_TRLR(hd_alloc),HD_TRLR_MGC,sizeof(HD_TRLR_MGC)); + *HD_end_of_list = hd_alloc; + HD_end_of_list = &hd_alloc->nextHD_header; + n_allocations++; + psmi_heapdebug_val_heapallocs(); +} + +/* hd_malloc() is the heap debug version of malloc that will create the header and trailer + * and link the allocation into the singly linked list. */ +static inline void *hd_malloc(size_t sz, const char *curloc) +{ + const uint64_t wholeSize = sizeof(HD_Header_Type) + sz + sizeof(HD_TRLR_MGC); + HD_Header_Type *hd_alloc = (HD_Header_Type*)malloc(wholeSize); + + hd_est_hdr_trlr(hd_alloc,hd_alloc,wholeSize,sz,curloc); + return HD_HDR_TO_AA(hd_alloc); +} + +/* hd_memalign() is the heap debug version of posix_memalign(). */ +static inline int hd_memalign(void **ptr,uint64_t alignment, size_t sz, const char *curloc) +{ + void *systemAlloc = NULL; + const uint64_t alignMask = alignment - 1; + uint64_t systemSize = sizeof(HD_Header_Type) + alignMask + sz + sizeof(HD_TRLR_MGC); + int rv = posix_memalign(&systemAlloc,alignment,systemSize); + char *actualAlloc = NULL; + const char *endOfSystemAlloc = ((char*)systemAlloc) + systemSize; + + if (rv) + return rv; + + uint64_t actualAllocu64 = (uint64_t) systemAlloc; + actualAllocu64 += sizeof(HD_Header_Type) + alignMask; + actualAllocu64 &= ~ alignMask; + actualAlloc = (char*)actualAllocu64; + psmi_assert_always((actualAllocu64 & alignMask) == 0); + psmi_assert_always((actualAlloc+sz+sizeof(HD_TRLR_MGC)) <= endOfSystemAlloc); + psmi_assert_always((actualAlloc - (char*)systemAlloc) >= sizeof(HD_Header_Type)); + + hd_est_hdr_trlr(HD_AA_TO_HD_HDR(actualAlloc),systemAlloc,systemSize,sz,curloc); + *ptr = actualAlloc; + return rv; +} + +/* hd_free() is the heap debug version of free(). First, hd_free() ensures that the ptr to be + * freed in fact is known by the HD code. Next, hd_free() removes the ptr from the list. Then, + * hd_free scribbles to the ptr's area and actually frees the heap space. */ +static inline void hd_free(void *ptr,const char *curloc) +{ + HD_Header_Type *hd_alloc = HD_AA_TO_HD_HDR(ptr); + HD_Header_Type *p = HD_root_of_list, *q = NULL; + + psmi_heapdebug_val_heapallocs(); + while (p) + { + if (p == hd_alloc) + { + /* first, fix the next pointers: */ + if (q) + { + q->nextHD_header = p->nextHD_header; + } + else + { + psmi_assert_always(p == HD_root_of_list); + HD_root_of_list = p->nextHD_header; + } + /* Now, handle the case of removing the last entry in the list. */ + if (&p->nextHD_header == HD_end_of_list) + { + if (q) + { + q->nextHD_header = NULL; + HD_end_of_list = &q->nextHD_header; + } + else + { + HD_root_of_list = NULL; + HD_end_of_list = &HD_root_of_list; + } + } + /* Scribble to the actual allocation to make further access to the heap + area unusable. 
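+			   Freed blocks also stay on the free list and are re-checked
+			   later so that writes to freed memory can be detected.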
*/ + n_allocations--; + memset(HD_HDR_TO_AA(hd_alloc),HD_NO_MANS_LAND,hd_alloc->sizeOfAlloc); + hd_alloc->freeLoc = curloc; + /* Add this allocation to the free list. */ + HD_Free_Struct_Type *pfreestruct = (HD_Free_Struct_Type*)malloc(sizeof(HD_Free_Struct_Type)); + *HD_free_list_bottom = pfreestruct; + HD_free_list_bottom = &pfreestruct->next_free_struct; + pfreestruct->freedStuct = hd_alloc; + pfreestruct->next_free_struct = NULL; + psmi_heapdebug_val_heapallocs(); + return; + } + q = p; + p = p->nextHD_header; + } + /* trying to free a heap allocation that we did not allocate. */ + psmi_assert_always(0); +} + +size_t hd_malloc_usable_size(void *ptr,const char *curloc) +{ + HD_Header_Type *hd_alloc = HD_AA_TO_HD_HDR(ptr); + return hd_alloc->systemAllocSize; +} + +#endif + +#ifdef PSM_HEAP_DEBUG + +/* For HD code, we retarget the malloc, memaligh and free calls to the hd versions + * of the code. */ + +#define my_malloc(SZ,CURLOC) hd_malloc(SZ,CURLOC) +#define my_memalign(PTR,ALIGN,SZ,CURLOC) hd_memalign(PTR,ALIGN,SZ,CURLOC) +#define my_free(PTR,CURLOC) hd_free(PTR,CURLOC) +#define my_malloc_usable_size(PTR,CURLOC) hd_malloc_usable_size(PTR,CURLOC) + +#else + +/* For non-HD code, we target the code to the usual functions: */ +#define my_malloc(SZ,CURLOC) malloc(SZ) +#define my_memalign(PTR,ALIGN,SZ,CURLOC) posix_memalign(PTR,ALIGN,SZ) +#define my_free(PTR,CURLOC) free(PTR) +#define my_malloc_usable_size(PTR,CURLOC) malloc_usable_size(PTR) + +#endif + +void *psmi_malloc_internal(psm2_ep_t ep, psmi_memtype_t type, + size_t sz, const char *curloc) +{ + size_t newsz = sz; + void *newa; + + if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) + newsz += sizeof(struct psmi_memtype_hdr); + + newa = my_malloc(newsz,curloc); + if (newa == NULL) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, + "Out of memory for malloc at %s", curloc); + return NULL; + } + + if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) { + struct psmi_memtype_hdr *hdr = (struct psmi_memtype_hdr *)newa; + hdr->size = newsz; + hdr->type = type; + hdr->magic = 0x8c; + hdr->original_allocation = newa; + psmi_log_memstats(type, newsz); + newa = (void *)(hdr + 1); + /* _HFI_INFO("alloc is %p\n", newa); */ + } + return newa; +} + +void *psmi_realloc_internal(psm2_ep_t ep, psmi_memtype_t type, + void *ptr, size_t nsz, const char *curloc) +{ + if (ptr) + { + size_t existingSize = psmi_malloc_usable_size_internal(ptr,curloc); + if (nsz > existingSize) + { + void *newPtr = psmi_malloc_internal(ep,type,nsz,curloc); + + memcpy(newPtr,ptr,existingSize); + psmi_free_internal(ptr,curloc); + return newPtr; + } + else + /* We will not support shrinking virtual space + for performance reasons. 
*/ + return ptr; + } + else + return psmi_malloc_internal(ep,type,nsz,curloc); +} + +#ifdef memalign +#undef memalign +#endif +void *psmi_memalign_internal(psm2_ep_t ep, psmi_memtype_t type, + size_t alignment, size_t sz, const char *curloc) +{ + size_t newsz = sz; + void *newa; + int ret, preambleSize = 0; + + if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) + { + if (sizeof(struct psmi_memtype_hdr) > alignment) + { + int n = sizeof(struct psmi_memtype_hdr) / alignment; + int r = sizeof(struct psmi_memtype_hdr) % alignment; + if (r) + n++; + preambleSize = n * alignment; + } + else + preambleSize = alignment; + newsz += preambleSize; + } + + ret = my_memalign(&newa, alignment, newsz, curloc); + if (ret) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, + "Out of memory for malloc at %s", curloc); + return NULL; + } + + if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) { + void *rv = (void *)((uint8_t *)newa + preambleSize); + struct psmi_memtype_hdr *hdr = (struct psmi_memtype_hdr *)((uint8_t *)rv - sizeof(struct psmi_memtype_hdr)); + hdr->size = newsz; + hdr->type = type; + hdr->magic = 0x8c; + hdr->original_allocation = newa; + psmi_log_memstats(type, newsz); + newa = rv; + /* _HFI_INFO("alloc is %p\n", newa); */ + } + return newa; +} + +#ifdef calloc +#undef calloc +#endif + +void *psmi_calloc_internal(psm2_ep_t ep, psmi_memtype_t type, size_t nelem, + size_t elemsz, const char *curloc) +{ + void *newa = psmi_malloc_internal(ep, type, nelem * elemsz, curloc); + if (newa == NULL) /* error handled above */ + return NULL; + memset(newa, 0, nelem * elemsz); + return newa; +} + +#ifdef strdup +#undef strdup +#endif + +void *psmi_strdup_internal(psm2_ep_t ep, const char *string, const char *curloc) +{ + size_t len = strlen(string) + 1; + void *newa = psmi_malloc_internal(ep, UNDEFINED, len, curloc); + if (newa == NULL) + return NULL; + memcpy(newa, string, len); /* copy with \0 */ + return newa; +} + +#ifdef free +#undef free +#endif + +void MOCKABLE(psmi_free_internal)(void *ptr,const char *curloc) +{ + if_pf(psmi_stats_mask & PSMI_STATSTYPE_MEMORY) { + struct psmi_memtype_hdr *hdr = + (struct psmi_memtype_hdr *)ptr - 1; + /* _HFI_INFO("hdr is %p, ptr is %p\n", hdr, ptr); */ + psmi_memtype_t type = hdr->type; + int64_t size = hdr->size; + int magic = (int)hdr->magic; + psmi_log_memstats(type, -size); + psmi_assert_always(magic == 0x8c); + ptr = hdr->original_allocation; + } + my_free(ptr,curloc); +} +MOCK_DEF_EPILOGUE(psmi_free_internal); + +#ifdef malloc_usable_size +#undef malloc_usable_size +#endif + +size_t psmi_malloc_usable_size_internal(void *ptr, const char *curLoc) +{ + return my_malloc_usable_size(ptr,curLoc); +} + +PSMI_ALWAYS_INLINE( +psm2_error_t +psmi_coreopt_ctl(const void *core_obj, int optname, + void *optval, uint64_t *optlen, int get)) +{ + psm2_error_t err = PSM2_OK; + + switch (optname) { + case PSM2_CORE_OPT_DEBUG: + /* Sanity check length */ + if (*optlen < sizeof(unsigned)) { + err = psmi_handle_error(NULL, + PSM2_PARAM_ERR, + "Option value length error"); + *optlen = sizeof(unsigned); + return err; + } + + if (get) { + *((unsigned *)optval) = hfi_debug; + } else + hfi_debug = *(unsigned *)optval; + break; + case PSM2_CORE_OPT_EP_CTXT: + { + /* core object is epaddr */ + psm2_epaddr_t epaddr = (psm2_epaddr_t) core_obj; + + /* Sanity check epaddr */ + if (!epaddr) { + return psmi_handle_error(NULL, + PSM2_PARAM_ERR, + "Invalid endpoint address"); + } + + /* Sanity check length */ + if (*optlen < sizeof(unsigned long)) { + err = psmi_handle_error(NULL, + 
PSM2_PARAM_ERR, + "Option value length error"); + *optlen = sizeof(void *); + return err; + } + + if (get) { + *((unsigned long *)optval) = + (unsigned long)epaddr->usr_ep_ctxt; + } else + epaddr->usr_ep_ctxt = optval; + } + break; + default: + /* Unknown/unrecognized option */ + err = psmi_handle_error(NULL, + PSM2_PARAM_ERR, + "Unknown PSM3_CORE option %u.", + optname); + break; + } + return err; +} + +psm2_error_t psmi_core_setopt(const void *core_obj, int optname, + const void *optval, uint64_t optlen) +{ + return psmi_coreopt_ctl(core_obj, optname, (void *)optval, &optlen, 0); +} + +psm2_error_t psmi_core_getopt(const void *core_obj, int optname, + void *optval, uint64_t *optlen) +{ + return psmi_coreopt_ctl(core_obj, optname, optval, optlen, 1); +} + +/* PSM AM component option handling */ +PSMI_ALWAYS_INLINE( +psm2_error_t +psmi_amopt_ctl(const void *am_obj, int optname, + void *optval, uint64_t *optlen, int get)) +{ + psm2_error_t err = PSM2_OK; + + /* AM object is a psm2_epaddr (or NULL for global minimum sz) */ + /* psm2_epaddr_t epaddr = (psm2_epaddr_t) am_obj; */ + + /* All AM options are read-only. */ + if (!get) { + return err = + psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_OPT_READONLY, + "Attempted to set read-only option value"); + } + + /* Sanity check length -- all AM options are uint32_t. */ + if (*optlen < sizeof(uint32_t)) { + *optlen = sizeof(uint32_t); + return err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_PARAM_ERR, + "Option value length error"); + } + + switch (optname) { + case PSM2_AM_OPT_FRAG_SZ: + *((uint32_t *) optval) = psmi_am_parameters.max_request_short; + break; + case PSM2_AM_OPT_NARGS: + *((uint32_t *) optval) = psmi_am_parameters.max_nargs; + break; + case PSM2_AM_OPT_HANDLERS: + *((uint32_t *) optval) = psmi_am_parameters.max_handlers; + break; + default: + err = + psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Unknown PSM3_AM option %u.", optname); + } + + return err; +} + +psm2_error_t psmi_am_setopt(const void *am_obj, int optname, + const void *optval, uint64_t optlen) +{ + return psmi_amopt_ctl(am_obj, optname, (void *)optval, &optlen, 0); +} + +psm2_error_t psmi_am_getopt(const void *am_obj, int optname, + void *optval, uint64_t *optlen) +{ + return psmi_amopt_ctl(am_obj, optname, optval, optlen, 1); +} + +#ifdef PSM_LOG + +#include +#include +#include +#include +#include +#include "ptl_ips/ips_proto_header.h" + +/* A treeNode is used to store the list of Function Name Lists that + are passed to the PSM_LOG facility via environment variables. + See psm_log.h for more information. + + Note that treeNode is a node in a binary tree data structure. */ +typedef struct _treeNode +{ + const char *name; + int line1,line2; + struct _treeNode *left,*right; +} treeNode; + +/* An epmTreeNode is used to track the number of protocol packets + that are send/recevied, for a given opcode, and source epid + to another epid. */ +typedef struct _epmTreeNode +{ + int opcode,count,txrx; + uint64_t fromepid,toepid; + struct _epmTreeNode *left,*right; +} epmTreeNode; + + +/* given a line range: [*line1 .. *line2], and another line, line + 'join' the line range to the new line if the line immediately abuts + the line range. The new line does not abut the existing range, + return 0. Else, return 1. + + For example, take the line range [ 20 .. 30 ] and the line: 19. + Since 19 comes immediately before 20, the line range can be joined + resulting in the line rage: [ 19 .. 30 ]. The function returns 1 for this + case. 
+ + The following other examples gives the new line range given the new line and + range [ 20 .. 30 ], and gives the return value: + + 31 [ 20 .. 31 ] 1 + 18 [ 20 .. 30 ] 0 + 32 [ 20 .. 30 ] 0 + 25 [ 20 .. 30 ] 1 */ +static int joinOverlap(int *line1,int *line2,int line) +{ + long long ll_line = line; + + if (ll_line+1 >= *line1 && ll_line-1 <= *line2) + { + *line1 = min(*line1,line); + *line2 = max(*line2,line); + return 1; + } + return 0; +} + +/* given two line ranges, determine the range that encompasses both line ranges + if an overlap has occurred. Returns 0 if the two ranges do not overlap and + do not abutt. + + Some examples, if line1=20 and line2=30 + + [20 30] [20 30] 2 + [19 30] [19 30] 2 + [19 20] [19 30] 2 + [10 15] [20 30] 0 + [40 50] [20 30] 0 */ +static int joinOverlapRange(int *line1,int *line2,int l1,int l2) +{ + return joinOverlap(line1,line2,l1) + joinOverlap(line1,line2,l2); +} + +/* inserts a new treeNode into the FNL tree, or, merges the lines that are already + present in the tree. */ +static void insertNodeInTree(treeNode **root,const char *name,int line1,int line2) +{ + if (*root) + { + int c = strcmp(name,(*root)->name); + if (c < 0) + insertNodeInTree(&((*root)->left),name,line1,line2); + else if (c > 0) + insertNodeInTree(&((*root)->right),name,line1,line2); + else + { + if (joinOverlapRange(&(*root)->line1,&(*root)->line2,line1,line2)) + return; + else if (line1 < (*root)->line1) + insertNodeInTree(&((*root)->left),name,line1,line2); + else if (line2 > (*root)->line2) + insertNodeInTree(&((*root)->right),name,line1,line2); + else psmi_assert_always(0); /* should never happen. */ + } + } + else + { + *root = malloc(sizeof(treeNode)); + (*root)->name = strdup(name); + (*root)->line1 = line1; + (*root)->line2 = line2; + (*root)->left = (*root)->right = NULL; + } +} + +/* Returns -1 if the data in the node is less than the data supplied as parameter, else + Returns 1 if the data in the node is greater than the data supplied as parameter, else + Returns 0. + */ +static int compareEpmNode(epmTreeNode *node,int opcode,int txrx,uint64_t fromepid,uint64_t toepid) +{ +#define COMPARE_ONE(X) if (node->X != X) return node->X < X ? -1 : 1 + COMPARE_ONE(opcode); + COMPARE_ONE(txrx); + COMPARE_ONE(fromepid); + COMPARE_ONE(toepid); + return 0; +} + +/* Inserts a new node in the tree corresponding to the parameters, or, retrieves the node in the tree. + In either case, this code returns a pointer to the count in the node. */ +static int *insertNodeInEpmTree(epmTreeNode **root,int opcode,int txrx,uint64_t fromepid,uint64_t toepid) +{ + if (*root) + { + int a = compareEpmNode((*root),opcode,txrx,fromepid,toepid); + if (a < 0) + return insertNodeInEpmTree(&((*root)->left),opcode,txrx,fromepid,toepid); + else if (a > 0) + return insertNodeInEpmTree(&((*root)->right),opcode,txrx,fromepid,toepid); + else + return &((*root)->count); + } + else + { + *root = malloc(sizeof(epmTreeNode)); + (*root)->opcode = opcode; + (*root)->txrx = txrx; + (*root)->count = 0; + (*root)->fromepid = fromepid; + (*root)->toepid = toepid; + (*root)->left = (*root)->right = NULL; + return &((*root)->count); + } +} + +/* returns 0, if the node is present, non-zero if it is absent. 
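+   A node matches when the function name compares equal and the line falls
+   within the node's [line1 .. line2] range.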
*/ +static int lookupNodeInTree(const treeNode *root,const char *name,int line) +{ + if (root) + { + int c = strcmp(name,root->name); + if (c < 0) + return lookupNodeInTree(root->left,name,line); + else if (c > 0) + return lookupNodeInTree(root->right,name,line); + else + { + if (line < root->line1) + return lookupNodeInTree(root->left,name,line); + else if (line > root->line2) + return lookupNodeInTree(root->right,name,line); + else /* line must be >= root->line1 and line must be <= root->line2. */ + return 0; + } + } + else + { + return 1; + } +} + +/* Declare a prototype for a parserFunc - referenced in the following code: */ +typedef void parserFunc(char *,int,int,void *); + +/* breaks down a string into 'c'-delimited substrings, and calls the parser func for each substring. */ +static void parseString(char *ps,char c,parserFunc pf,void *ctx) +{ + int idx,n=0; + char *p; + + /* first, count the number of instances of c in ps, for use by the parser function: */ + for (idx=0;ps[idx];idx++) + if (ps[idx] == c) + n++; + /* next, break down ps into 'c'-delimited substrings, and call parser function, pf for each substring: */ + for (idx=0,p=ps;p && *p;idx++) + { + char *t = strchr(p,c); + if (!t) + { + break; + } + else + { + *t = 0; + pf(p,idx,n,ctx); + p = t+1; + } + } + /* finally, call pf on the final substring. */ + pf(p,idx,n,ctx); +} + +/* fncNameCtx is the context used while parsing FNL's (see psm_log.h for more info) from the environment: */ +typedef struct +{ + const char *currentFuncName; + int firstLineNumber; + treeNode **root; +} funcNameCtx; + +/* This is the start of the parser code for parsing FNL's. Here is the grammar: + + An FNL is a 'Function Name List' that is defined by the following grammar: + + # A LINE1 is either a single line number of a range of line numbers: +(1) LINE1 :: lineNumber | +(2) lineNumber1 '-' lineNumber2 + + # LINES is a list of LINE1's separated by commas: +(3) LINES :: LINE1 | +(4) LINE1 ',' LINES + + # An FN is either a function name, or a function name with a list of lines: +(5) FN :: functionName | +(6) functionName ';' LINES + + # A FNL is a list of FN's separated by colons: +(7) FNL :: FN | +(8) FN ':' FNL + + # Examples: + foo:bar the two functions foo and bar + foo;1-10 lines 1 to 10 of function foo. + bar;1,3,5 lines 1, 3 and 5 of function bar + +*/ + +/* p4() inserts a (function name and line number) pair into the FNL tree or a (function name and line number range) in the FNL tree. +*/ +static void p4(char *s,int idx,int n,void *ctx) +{ + funcNameCtx *pfnc = (funcNameCtx *)ctx; + + if (n == 0) /* production (1) */ + { + pfnc->firstLineNumber = atoi(s); + insertNodeInTree(pfnc->root,pfnc->currentFuncName,pfnc->firstLineNumber,pfnc->firstLineNumber); + } + else if (n == 1) /* production (2) */ + { + if (idx == 0) /* lhs of production (2) */ + pfnc->firstLineNumber = atoi(s); + else /* rhs of production (2). 
*/ + insertNodeInTree(pfnc->root,pfnc->currentFuncName,pfnc->firstLineNumber,atoi(s)); + } +} + +/* p3 puts an entry into the FNL tree for all of the lines of a given functionname, or, it parses the list of line number ranges and + uses p4 to spill each individual range (or just one line number) into the tree */ +static void p3(char *s,int idx,int n,void *ctx) +{ + funcNameCtx *pfnc = (funcNameCtx *)ctx; + + if (n == 0 && *s == 0) /* production (5)/(7) */ + { + insertNodeInTree(pfnc->root,pfnc->currentFuncName,0,INT_MAX); + } + else if (*s) /* production (2) */ + { + /* breakdown the string into hyphen-delimited substrings, and further parses each substring with p4: */ + parseString(s,'-',p4,ctx); + } +} + +/* p2 parses the function name, and caches it into the context, and thereafter uses p3 to parse the line number range list. */ +static void p2(char *s,int idx,int n,void *ctx) +{ + funcNameCtx *pfnc = (funcNameCtx *)ctx; + + if (n) + { + if (idx == 0) + pfnc->currentFuncName = s; + else + { + /* production (4) */ + /* breakdown the string into comma-delimited substrings, and further parses each substring with p3: */ + parseString(s,',',p3,ctx); + } + } + else + { + /* production (7)/(5). */ + insertNodeInTree(pfnc->root,pfnc->currentFuncName=s,0,INT_MAX); + } +} + +/* p1 parses each function name and line range list. */ +static void p1(char *s,int idx,int n,void *ctx) +{ + /* production (5)/(6)) */ + /* breakdown the string into semi-colon-delimited substrings, and further parses each substring with p2: */ + parseString(s,';',p2,ctx); +} + +static void parseAndInsertInTree(const char *buf,treeNode **root) +{ + funcNameCtx t; + t.root = root; + char *p = alloca(strlen(buf)+1); + strcpy(p,buf); + /* productions (7)/(8) */ + /* separates string into colon-separated strings, and then parses each substring in p1: */ + parseString(p,':',p1,(void*)&t); +} + +/* initialization code for the psmi log mechanism. */ +static inline void psmi_initialize(const char **plmf_fileName_kernel, + const char **plmf_search_format_string, + treeNode **includeFunctionNamesTreeRoot, + treeNode **excludeFunctionNamesTreeRoot) +{ + static volatile int plmf_initialized = 0; + + if (!plmf_initialized) + { + static pthread_mutex_t plmf_init_mutex = PTHREAD_MUTEX_INITIALIZER; + + if (pthread_mutex_lock(&plmf_init_mutex)) + { + perror("cannot lock mutex for psmi_log_message facility"); + return; + } + /* CRITICAL SECTION BEGIN */ + if (!plmf_initialized) + { + /* initializing psmi log message facility here. */ + const char *env = getenv("PSM3_LOG_FILENAME"); + if (env) + *plmf_fileName_kernel = env; + env = getenv("PSM3_LOG_SRCH_FORMAT_STRING"); + if (env) + { + *plmf_search_format_string = env; + } + else + { + env = getenv("PSM3_LOG_INC_FUNCTION_NAMES"); + if (env) + { + parseAndInsertInTree(env,includeFunctionNamesTreeRoot); + } + env = getenv("PSM3_LOG_EXC_FUNCTION_NAMES"); + if (env) + { + parseAndInsertInTree(env,excludeFunctionNamesTreeRoot); + } + } + /* initialization of psmi log message facility is completed. */ + plmf_initialized = 1; + } + /* CRITICAL SECTION END */ + if (pthread_mutex_unlock(&plmf_init_mutex)) + { + perror("cannot unlock mutex for psmi_log_message facility"); + return; + } + } +} + +/* Utility function to map the integer txrx value to the given strings for emitting to the log file. 
*/ +static const char * const TxRxString(int txrx) +{ + switch(txrx) + { + case PSM2_LOG_TX: return "Sent"; + case PSM2_LOG_RX: return "Received"; + case PSM2_LOG_PEND: return "Pending"; + default: return "Unknown"; + } +} + +/* Utility function to map an integer opcode value to the given strings for emitting to the log file. */ +static const char * const OpcodeString(int opcode) +{ + switch(opcode) + { + case OPCODE_LONG_RTS: return "RTS"; + case OPCODE_LONG_CTS: return "CTS"; + case OPCODE_LONG_DATA: return "DATA"; + case OPCODE_ERR_CHK_RDMA: return "ERR_CHK_RDMA"; + case OPCODE_ERR_CHK_RDMA_RESP: return "ERR_CHK_RDMA_RESP"; + default: return "UNKNOWN"; + } +} + +static const char *plmf_fileName_kernel = "/tmp/psm2_log"; +static const char *plmf_search_format_string = NULL; +static treeNode *includeFunctionNamesTreeRoot = NULL; +static treeNode *excludeFunctionNamesTreeRoot = NULL; + +void psmi_log_initialize(void) +{ + /* If not initialized, then, initialize in a single thread of execution. */ + psmi_initialize(&plmf_fileName_kernel, + &plmf_search_format_string, + &includeFunctionNamesTreeRoot, + &excludeFunctionNamesTreeRoot); +} + +#ifdef PSM_LOG_FAST_IO + +struct psmi_log_io_thread_info +{ + pthread_t thread_id; + char *buff; + unsigned long max_buff_length, curr_buff_length; + pthread_mutex_t flags_mutex; + volatile int flags; +#define PSMI_LOG_IO_FLAG_IO_IN_PROGRESS 1 /* io is currently in progress */ +#define PSMI_LOG_IO_FLAG_IO_SHUTDOWN 2 /* we are shutting down logging. */ +}; + +/* Please note that psmi_log_io_info is in thread local storage. */ +static __thread struct psmi_log_io_thread_info psmi_log_io_info = +{ + .thread_id = 0, + .buff = NULL, + .max_buff_length = 0, + .curr_buff_length = 0, + .flags_mutex = PTHREAD_MUTEX_INITIALIZER, + .flags = 0 +}; + +static struct +{ + unsigned int nTableEntries,maxTableEntries; + pthread_mutex_t table_mutex; + struct psmi_log_io_thread_info **table; +} psmi_log_io_table = +{ + .nTableEntries = 0, + .maxTableEntries = 0, + .table_mutex = PTHREAD_MUTEX_INITIALIZER, + .table = NULL +}; + +void psmi_log_fini() +{ + if (pthread_mutex_lock(&psmi_log_io_table.table_mutex)) + { + perror("Cannot lock mutex for psmi_log_io_table"); + return; + } + /* Start critical section. */ + + unsigned int i; + for (i=0;i < psmi_log_io_table.nTableEntries;i++) + { + if (psmi_log_io_table.table[i]) + { + struct psmi_log_io_thread_info *pti = psmi_log_io_table.table[i]; + int flags; + + if (pthread_mutex_lock(&pti->flags_mutex)) + { + perror("can't lock the flags mutex."); + continue; + } + /* critical section */ + flags = (pti->flags |= PSMI_LOG_IO_FLAG_IO_SHUTDOWN); + /* end critical section */ + pthread_mutex_unlock(&pti->flags_mutex); + /* if io is currenctly in progress, allow it to complete. 
*/ + while (flags & PSMI_LOG_IO_FLAG_IO_IN_PROGRESS) + { + sleep(1); + if (pthread_mutex_lock(&pti->flags_mutex)) + { + perror("can't lock the flags mutex."); + continue; + } + flags = pti->flags; + pthread_mutex_unlock(&pti->flags_mutex); + } + if (pti->buff) + { + char logFileName[256]; + FILE *fout; + + snprintf(logFileName,sizeof(logFileName),"%s.%d.%ld", + plmf_fileName_kernel,getpid(),pti->thread_id); + fout = fopen(logFileName,"w"); + if (!fout) + { + perror(logFileName); + continue; + } + fwrite(pti->buff,pti->curr_buff_length,1,fout); + fclose(fout); + } + } + psmi_log_io_table.table[i] = NULL; + } + psmi_log_io_table.nTableEntries = 0; + psmi_free(psmi_log_io_table.table); + psmi_log_io_table.table = NULL; + psmi_log_io_table.maxTableEntries = 0; + /* End critical section. */ + pthread_mutex_unlock(&psmi_log_io_table.table_mutex); +} + +static int psmi_log_register_tls(void) +{ + if (psmi_log_io_info.thread_id != pthread_self()) + { + psmi_log_io_info.thread_id = pthread_self(); + if (pthread_mutex_lock(&psmi_log_io_table.table_mutex)) + { + perror("cannot lock table mutex"); + return -1; + } + /* critical section start. */ + if (psmi_log_io_table.maxTableEntries < psmi_log_io_table.nTableEntries+1) + { + if (psmi_log_io_table.maxTableEntries == 0) + { + psmi_log_io_table.maxTableEntries = 2; + psmi_log_io_table.table = psmi_malloc(PSMI_EP_NONE, + PER_PEER_ENDPOINT, + psmi_log_io_table.maxTableEntries * + sizeof(struct psmi_log_io_thread_info *)); + } + else + { + psmi_log_io_table.maxTableEntries *= 2; + psmi_log_io_table.table = psmi_realloc(PSMI_EP_NONE, + PER_PEER_ENDPOINT, + psmi_log_io_table.table, + psmi_log_io_table.maxTableEntries * + sizeof(struct psmi_log_io_thread_info *)); + } + } + psmi_log_io_table.table[psmi_log_io_table.nTableEntries] = &psmi_log_io_info; + psmi_log_io_table.nTableEntries++; + /* critical section end. */ + pthread_mutex_unlock(&psmi_log_io_table.table_mutex); + } + if (pthread_mutex_lock(&psmi_log_io_info.flags_mutex)) + { + perror("cannot lock table mutex"); + return -1; + } + /* critical section start. */ + int old_flags = psmi_log_io_info.flags; + int new_flags = old_flags; + if (0 == (old_flags & PSMI_LOG_IO_FLAG_IO_SHUTDOWN)) + new_flags |= PSMI_LOG_IO_FLAG_IO_IN_PROGRESS; + psmi_log_io_info.flags = new_flags; + /* critical section end. */ + pthread_mutex_unlock(&psmi_log_io_info.flags_mutex); + if (new_flags & PSMI_LOG_IO_FLAG_IO_IN_PROGRESS) + return 0; + return -1; +} + +static void psmi_buff_fclose(int port) +{ + if (pthread_mutex_lock(&psmi_log_io_info.flags_mutex)) + { + perror("cannot lock table mutex"); + return; + } + /* critical section start. */ + psmi_log_io_info.flags &= ~PSMI_LOG_IO_FLAG_IO_IN_PROGRESS; + /* critical section end. 
*/ + pthread_mutex_unlock(&psmi_log_io_info.flags_mutex); +} + +static void growBuff(size_t minExcess) +{ + while (psmi_log_io_info.curr_buff_length+minExcess > psmi_log_io_info.max_buff_length) + { + if (!psmi_log_io_info.buff) + psmi_log_io_info.buff = (char *)psmi_malloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, + psmi_log_io_info.max_buff_length = 1 << 20); + else + { + psmi_log_io_info.max_buff_length *= 2; + psmi_log_io_info.buff = (char *)psmi_realloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, + psmi_log_io_info.buff, + psmi_log_io_info.max_buff_length); + } + } +} + +static int psmi_buff_vfprintf(int port, const char *format, va_list ap) +{ + int done = 0; + size_t excess = 1024; + int length; + + while (!done) + { + growBuff(excess); + + length = vsnprintf(psmi_log_io_info.buff + psmi_log_io_info.curr_buff_length, + excess, format, ap); + if (length >= excess) + excess *= 2; + else + done = 1; + } + psmi_log_io_info.curr_buff_length += length; + return length; +} + +static int psmi_buff_fprintf(int port,const char *format, ...) +{ + int length; + va_list ap; + + va_start(ap, format); + + length = psmi_buff_vfprintf(port,format,ap); + + va_end(ap); + return length; +} + +static int psmi_buff_fputc(int c, int port) +{ + growBuff(1024); + psmi_log_io_info.buff[psmi_log_io_info.curr_buff_length] = c; + psmi_log_io_info.curr_buff_length++; + return 1; +} +#endif + + +#define IS_PSMI_LOG_MAGIC(S) ((((uint64_t)(S)) <= ((uint64_t)PSM2_LOG_MIN_MAGIC)) && \ + (((uint64_t)(S)) >= ((uint64_t)PSM2_LOG_MAX_MAGIC))) + +/* plmf is short for 'psm log message facility. All of the PSM_LOG macros defined in psm_log.h + are serviced from this back end. */ +void psmi_log_message(const char *fileName, + const char *functionName, + int lineNumber, + const char *format, ...) +{ + va_list ap; + + va_start(ap, format); + + /* Next, determine if this log message is signal or noise. */ + if (plmf_search_format_string) + { + if (!IS_PSMI_LOG_MAGIC(format)) + { + if (fnmatch(plmf_search_format_string, format, 0)) + { + va_end(ap); + /* tis noise, return. */ + return; + } + } + } + else + { + if (includeFunctionNamesTreeRoot) + { + if (lookupNodeInTree(includeFunctionNamesTreeRoot,functionName,lineNumber)) + { + va_end(ap); + /* tis noise, return. */ + return; + } + } + + if (excludeFunctionNamesTreeRoot) + { + if (!lookupNodeInTree(excludeFunctionNamesTreeRoot,functionName,lineNumber)) + { + va_end(ap); + /* tis noise, return. */ + return; + } + } + } + + /* At this point, we think that this may be a message that we want to emit to the log. + But, there is one more test, to apply to the cases where the format is one of the + special formats for backtrack, and packet stream for example. 
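+
+	   (The "special formats" are the sentinel pointers tested by
+	   IS_PSMI_LOG_MAGIC() below: PSM2_LOG_BT_MAGIC, PSM2_LOG_EPM_MAGIC,
+	   PSM2_LOG_DUMP_MAGIC and PSM2_LOG_PKT_STRM_MAGIC.  For those, the
+	   PSM_LOG macros in psm_log.h pass extra leading varargs which are
+	   popped below, and the real printf-style format follows as newFormat.)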
*/ + { + void **voidarray = NULL; + int nframes = 0; + const char *newFormat = format; + int opcode = 0; + psmi_log_tx_rx_t txrx = 0; + uint64_t fromepid = 0; + uint64_t toepid = 0; + void *dumpAddr[2] = {0}; + size_t dumpSize[2] = {0}; + +#ifdef PSM_LOG_FAST_IO +#define IO_PORT 0 +#define MY_FPRINTF psmi_buff_fprintf +#define MY_VFPRINTF psmi_buff_vfprintf +#define MY_FPUTC psmi_buff_fputc +#define MY_FCLOSE psmi_buff_fclose +#else + char logFileName[256]; + FILE *fout; +#define IO_PORT fout +#define MY_FPRINTF fprintf +#define MY_VFPRINTF vfprintf +#define MY_FPUTC fputc +#define MY_FCLOSE fclose +#endif + struct timespec tp; + + /* Pop arguments for the alternative forms of PSM_LOG functionality: */ + if (format == PSM2_LOG_BT_MAGIC) + { + voidarray = va_arg(ap,void **); + nframes = va_arg(ap,int); + newFormat = va_arg(ap,const char *); + } + else if (format == PSM2_LOG_EPM_MAGIC) + { + opcode = va_arg(ap,int); + txrx = va_arg(ap,psmi_log_tx_rx_t); + fromepid = va_arg(ap,uint64_t); + toepid = va_arg(ap,uint64_t); + newFormat = va_arg(ap,const char *); + } + else if (format == PSM2_LOG_DUMP_MAGIC) + { + dumpAddr[0] = va_arg(ap,void*); + dumpSize[0] = va_arg(ap,size_t); + newFormat = va_arg(ap,const char *); + } + else if (format == PSM2_LOG_PKT_STRM_MAGIC) + { + txrx = va_arg(ap,psmi_log_tx_rx_t); + dumpAddr[0] = va_arg(ap,struct ips_message_header *); + if (txrx == PSM2_LOG_RX) + { + dumpAddr[1] = va_arg(ap,uint32_t *); + dumpSize[1] = sizeof(uint64_t); + } + newFormat = va_arg(ap,const char *); + dumpSize[0] = sizeof(struct ips_message_header); + } + + /* One last test to make sure that this message is signal: */ + if (plmf_search_format_string && newFormat) + { + if (fnmatch(plmf_search_format_string, newFormat, 0)) + { + va_end(ap); + /* tis noise, return. */ + return; + } + } + +#ifdef PSM_LOG_FAST_IO + if (psmi_log_register_tls() != 0) + { + va_end(ap); + return; + } +#else + /* At this point we know that the message is not noise, and it is going to be emitted to the log. 
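+	   Each thread appends to its own file named
+	   "<PSM3_LOG_FILENAME>.<pid>.<pthread id>"; e.g. with the default
+	   prefix this might be /tmp/psm2_log.1234.140193 (illustrative pid
+	   and thread id values).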
*/ + snprintf(logFileName,sizeof(logFileName),"%s.%d.%ld", + plmf_fileName_kernel,getpid(), + pthread_self()); + fout = fopen(logFileName,"a"); + if (!fout) + { + va_end(ap); + return; + } +#endif + +#define M1() clock_gettime(CLOCK_REALTIME, &tp); \ + MY_FPRINTF(IO_PORT,"%f %s %s:%d: ", \ + (double)tp.tv_sec + ((double)tp.tv_nsec/1000000000.0), \ + functionName,fileName,lineNumber) + + M1(); + + if (!IS_PSMI_LOG_MAGIC(format)) + { + MY_VFPRINTF(IO_PORT,format,ap); + MY_FPUTC('\n',IO_PORT); + } + else if (format == PSM2_LOG_BT_MAGIC) + { + void *newframes[nframes]; + int newframecnt = backtrace(newframes,nframes); + int pframes = min(newframecnt,nframes); + + MY_VFPRINTF(IO_PORT,newFormat,ap); + MY_FPUTC('\n',IO_PORT); + + if (memcmp(voidarray,newframes,pframes * sizeof(void*))) + { + int i; + char **strings; + + memcpy(voidarray,newframes,sizeof(newframes)); + M1(); + MY_FPRINTF(IO_PORT, + "backtrace() returned %d addresses\n", + newframecnt); + strings = backtrace_symbols(voidarray, pframes); + if (strings == NULL) + { + perror("backtrace_symbols"); + exit(EXIT_FAILURE); + } + for (i = 0; i < pframes; i++) + { + M1(); + MY_FPRINTF(IO_PORT,"%s\n", strings[i]); + } +#undef free + free(strings); + } + } + else if (format == PSM2_LOG_EPM_MAGIC) + { + static epmTreeNode *root = 0; + static pthread_mutex_t plmf_epm_mutex = + PTHREAD_MUTEX_INITIALIZER; + int *pcount = 0; + if (pthread_mutex_lock(&plmf_epm_mutex)) + { + perror("cannot lock mutex for " + "psmi_log_message facility"); + va_end(ap); + return; + } + /* START OF CRITICAL SECTION */ + pcount = insertNodeInEpmTree(&root,opcode,txrx, + fromepid,toepid); + /* END OF CRITICAL SECTION */ + if (pthread_mutex_unlock(&plmf_epm_mutex)) + { + perror("cannot unlock mutex for " + "psmi_log_message facility"); + va_end(ap); + return; + } + (*pcount)++; + MY_FPRINTF(IO_PORT,"%s %s from: %" PRIx64 + ", to: %" PRIx64 ", count: %d, ", + TxRxString(txrx),OpcodeString(opcode), + fromepid,toepid,*pcount); + MY_VFPRINTF(IO_PORT,newFormat,ap); + MY_FPUTC('\n',IO_PORT); + } + else if (format == PSM2_LOG_PKT_STRM_MAGIC) + { + MY_FPRINTF(IO_PORT,"PKT_STRM: %s: imh: %p%s ", TxRxString(txrx), + dumpAddr[0], (txrx == PSM2_LOG_RX) ? "," : ""); + if (txrx == PSM2_LOG_RX) + MY_FPRINTF(IO_PORT,"rhf: %p ", dumpAddr[1]); + goto dumpit; + } + else if (format == PSM2_LOG_DUMP_MAGIC) + { + MY_VFPRINTF(IO_PORT,newFormat,ap); + MY_FPUTC('\n',IO_PORT); + dumpit: + M1(); + + uint8_t *pu8 = (uint8_t *)dumpAddr[0]; + size_t i,cnt=0; + for (i=0;i < dumpSize[0];i++) + { + if ((i != 0) && ((i % 8) == 0)) + { + MY_FPRINTF(IO_PORT," (%d)\n",(int)(i-8)); + M1(); + cnt = 0; + } + else if (cnt) + MY_FPUTC(',',IO_PORT); + MY_FPRINTF(IO_PORT,"0x%02x", pu8[i]); + cnt++; + } + if (cnt) + MY_FPRINTF(IO_PORT," (%d)\n",(int)(i-8)); + if (dumpSize[1]) + { + dumpSize[0] = dumpSize[1]; + dumpAddr[0] = dumpAddr[1]; + dumpSize[1] = 0; + goto dumpit; + } + } + MY_FCLOSE(IO_PORT); + } + + va_end(ap); +} +#endif /* #ifdef PSM_LOG */ diff --git a/prov/psm3/psm3/psm_utils.h b/prov/psm3/psm3/psm_utils.h new file mode 100644 index 00000000000..0551650cdcb --- /dev/null +++ b/prov/psm3/psm3/psm_utils.h @@ -0,0 +1,432 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef _PSMI_IN_USER_H +#error psm_utils.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_UTILS_H +#define _PSMI_UTILS_H + +#include /* ipv4addr */ +#include /* malloc/free */ +#include + +/* + * Endpoint 'id' hash table, with iterator interface + */ +struct psmi_epid_table { + struct psmi_epid_tabentry *table; + int tabsize; + int tabsize_used; + pthread_mutex_t tablock; +}; +/* + * Endpoint address hash table + */ +struct psmi_epid_tabentry { + void *entry; + uint64_t key; + psm2_ep_t ep; + psm2_epid_t epid; +}; + +extern struct psmi_epid_table psmi_epid_table; +#define EPADDR_DELETED ((void *)-1) /* tag used to mark deleted entries */ + +psm2_error_t psmi_epid_init(); +psm2_error_t psmi_epid_fini(); +void *psmi_epid_lookup(psm2_ep_t ep, psm2_epid_t epid); +void *psmi_epid_remove(psm2_ep_t ep, psm2_epid_t epid); +void psmi_epid_remove_all(psm2_ep_t ep); +psm2_error_t psmi_epid_add(psm2_ep_t ep, psm2_epid_t epid, void *entry); +#define PSMI_EP_HOSTNAME ((psm2_ep_t) -1) /* Special endpoint handle we use + * to register hostnames */ +#define PSMI_EP_CROSSTALK ((psm2_ep_t) -2) /* Second special endpoint handle + * to log which nodes we've seen + * crosstalk from */ +struct psmi_eptab_iterator { + int i; /* last index looked up */ + psm2_ep_t ep; +}; +void psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm2_ep_t ep); +void *psmi_epid_itor_next(struct psmi_eptab_iterator *itor); +void psmi_epid_itor_fini(struct psmi_eptab_iterator *itor); + +uint64_t psmi_epid_version(psm2_epid_t epid); + +/* + * Hostname manipulation + */ +char *psmi_gethostname(void); +const char *psmi_epaddr_fmt_addr(psm2_epid_t epid); +const char *psmi_epaddr_get_hostname(psm2_epid_t epid); +const char *psmi_epaddr_get_name(psm2_epid_t epid); +psm2_error_t psmi_epid_set_hostname(uint64_t nid, const char *hostname, + int overwrite); + +/* + * Memory allocation, use macros only. + * + * In all calls, ep can be a specific endpoint (valid psm2_ep_t) or PSMI_EP_NONE + * if no endpoint is available. + * + * psmi_malloc_usable_size(void *ptr) + * psmi_malloc(ep, memtype, size) + * psmi_realloc(ep, memtype, ptr, newsize) + * psmi_memalign(ep, memtype, alignment, size) + * psmi_calloc(ep, memtype, elemsz, numelems) + * psmi_strdup(ep, memtype, ptr) + * psmi_free(ptr) + * + */ +typedef enum psmi_memtype { + TOTAL = 0, /* Logged automatically by malloc/calloc */ + UNDEFINED, /* For tracking "other types" of allocations */ + PER_PEER_ENDPOINT, /* For tracking "per peer" allocations */ + NETWORK_BUFFERS, /* For tracking network buffers */ + DESCRIPTORS, /* For tracking send/recv descriptors */ + UNEXPECTED_BUFFERS, /* For tracking unexpected recv buffers */ + STATS, /* For tracking stats-related allocs */ +#ifdef RNDV_MOD_MR + // TBD, should we just tabulate this into PER_PEER_ENDPOINT + // maybe once debugged we should consolidate? + PEER_RNDV, /* for tracking Rendezvous per RC QP resources */ +#endif +} psmi_memtype_t; + +/* + * We track allocation stats. 
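+ *
+ * For each psmi_memtype_t above, the struct below keeps a running byte total
+ * and a high-water mark; e.g. a psmi_malloc(ep, NETWORK_BUFFERS, sz) is
+ * accounted (presumably via psmi_log_memstats(), declared further below)
+ * against m_netbufs_total/m_netbufs_max as well as the automatically-logged
+ * TOTAL counters m_all_total/m_all_max.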
+ */ +struct psmi_stats_malloc { + int64_t m_all_total; + int64_t m_all_max; + int64_t m_perpeer_total; + int64_t m_perpeer_max; + int64_t m_netbufs_total; + int64_t m_netbufs_max; + int64_t m_descriptors_total; + int64_t m_descriptors_max; + int64_t m_unexpbufs_total; + int64_t m_unexpbufs_max; + int64_t m_undefined_total; + int64_t m_undefined_max; + int64_t m_stats_total; + int64_t m_stats_max; +#ifdef RNDV_MOD_MR + int64_t m_peerrndv_total; + int64_t m_peerrndv_max; +#endif +}; + +extern struct psmi_stats_malloc psmi_stats_memory; + +void psmi_mem_stats_register(void); + +void *psmi_malloc_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t sz, + const char *curloc); +void *psmi_realloc_internal(psm2_ep_t ep, psmi_memtype_t mt, void *ptr, + size_t newSz, const char *curloc); +void *psmi_memalign_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t alignment, + size_t sz, const char *curloc); +void *psmi_calloc_internal(psm2_ep_t ep, psmi_memtype_t mt, size_t num, + size_t sz, const char *curloc); +void *psmi_strdup_internal(psm2_ep_t ep, const char *string, const char *curloc); + +void MOCKABLE(psmi_free_internal)(void *ptr, const char *curLoc); +MOCK_DCL_EPILOGUE(psmi_free_internal); + +size_t psmi_malloc_usable_size_internal(void *ptr, const char *curLoc); + +#ifdef PSM_HEAP_DEBUG +/* During heap debug code, we can sprinkle function calls: + psmi_heapdebug_val_heapallocs(), that will examine all of the heap allocations + to ensure integrity. */ +void _psmi_heapdebug_val_heapallocs(const char *curloc); + +#define psmi_heapdebug_val_heapallocs() _psmi_heapdebug_val_heapallocs(PSMI_CURLOC) + +/* Finialize the heapdebug functionality after tear down of the psm + session when you are certain that all heap allocations have been + freed. psmi_heapdebug_finalize() will emit all of the extant + heap allocations and abort if there are any. This is to aid + in debug of heap leaks. */ +void psmi_heapdebug_finalize(void); + +#else + +#define psmi_heapdebug_val_heapallocs() /* nothing */ +#define psmi_heapdebug_finalize() /* nothing */ + +#endif + +#define psmi_strdup(ep, string) psmi_strdup_internal(ep, string, PSMI_CURLOC) +#define psmi_calloc(ep, mt, nelem, elemsz) \ + psmi_calloc_internal(ep, mt, nelem, elemsz, PSMI_CURLOC) +#define psmi_malloc(ep, mt, sz) psmi_malloc_internal(ep, mt, sz, PSMI_CURLOC) +#define psmi_realloc(ep, mt, ptr, nsz) psmi_realloc_internal(ep, mt, ptr, nsz, PSMI_CURLOC) +#define psmi_memalign(ep, mt, al, sz) \ + psmi_memalign_internal(ep, mt, al, sz, PSMI_CURLOC) +#define psmi_free(ptr) psmi_free_internal(ptr, PSMI_CURLOC) +#define psmi_malloc_usable_size(ptr) psmi_malloc_usable_size_internal(ptr, PSMI_CURLOC) +#ifndef PSM_IS_TEST +#define malloc(sz) _use_psmi_malloc_instead_of_plain_malloc +#define realloc(ptr,nsz) _use_psmi_realloc_instead_of_plain_realloc +#define memalign(algn,sz) _use_psmi_memalign_instead_of_plain_memalign +#define calloc(sz, nelm) _use_psmi_calloc_instead_of_plain_calloc +#ifdef strdup +#undef strdup +#endif +#define strdup(ptr) _use_psmi_strdup_instead_of_plain_strdup +#define free(ptr) _use_psmi_free_instead_of_plain_free +#define malloc_usable_size(ptr) _use_psmi_malloc_usable_size_instead_of_plain_malloc_usable_size +#endif /* PSM_IS_TEST */ + +void psmi_log_memstats(psmi_memtype_t type, int64_t nbytes); + +/* + * Parse int parameters + * -1 -> parse error + */ +long psmi_parse_str_long(const char *str); + +/* + * Parsing int parameters set in string tuples. 
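+ *
+ * Illustration (assuming the ':' separator used by the PSM3_FI_* controls
+ * documented later in this header): a value such as "1:1000:1" parsed with
+ * ntup == 3 would fill vals[0..2] with 1, 1000 and 1.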
+ */ +int psmi_parse_str_tuples(const char *str, int ntup, int *vals); + +/* + * Resource Limiting based on PSM memory mode. + */ +#define PSMI_MEMMODE_NORMAL 0 +#define PSMI_MEMMODE_MINIMAL 1 +#define PSMI_MEMMODE_LARGE 2 +#define PSMI_MEMMODE_NUM 3 + +struct psmi_rlimit_mpool { + const char *env; + const char *descr; + int env_level; + uint32_t minval; + uint32_t maxval; + struct { + uint32_t obj_chunk; + uint32_t obj_max; + } mode[PSMI_MEMMODE_NUM]; +}; +psm2_error_t psmi_parse_mpool_env(const psm2_mq_t mq, int level, + const struct psmi_rlimit_mpool *rlim, + uint32_t *valo, uint32_t *chunkszo); +int psmi_parse_memmode(void); +int psmi_parse_identify(void); +unsigned psmi_parse_rdmamode(void); + +/* + * Parsing environment variables + */ + +union psmi_envvar_val { + void *e_void; + char *e_str; + int e_int; + unsigned int e_uint; + long e_long; + unsigned long e_ulong; + unsigned long long e_ulonglong; +}; + +#define PSMI_ENVVAR_LEVEL_USER 1 +#define PSMI_ENVVAR_LEVEL_HIDDEN 2 +#define PSMI_ENVVAR_LEVEL_NEVER_PRINT 4 + +#define PSMI_ENVVAR_TYPE_YESNO 0 +#define PSMI_ENVVAR_TYPE_STR 1 +#define PSMI_ENVVAR_TYPE_INT 2 +#define PSMI_ENVVAR_TYPE_UINT 3 +#define PSMI_ENVVAR_TYPE_UINT_FLAGS 4 +#define PSMI_ENVVAR_TYPE_LONG 5 +#define PSMI_ENVVAR_TYPE_ULONG 6 +#define PSMI_ENVVAR_TYPE_ULONG_FLAGS 7 +#define PSMI_ENVVAR_TYPE_ULONG_ULONG 8 + +#define PSMI_ENVVAR_VAL_YES ((union psmi_envvar_val) 1) +#define PSMI_ENVVAR_VAL_NO ((union psmi_envvar_val) 0) + +int +MOCKABLE(psmi_getenv)(const char *name, const char *descr, int level, + int type, union psmi_envvar_val defval, + union psmi_envvar_val *newval); +MOCK_DCL_EPILOGUE(psmi_getenv); +/* + * Misc functionality + */ +uintptr_t psmi_getpagesize(void); +uint64_t psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns); +uint32_t psmi_get_ipv4addr(); +void psmi_syslog(psm2_ep_t ep, int to_console, int level, + const char *format, ...); +void psmi_uuid_unparse(const psm2_uuid_t uuid, char *out); +int psmi_uuid_compare(const psm2_uuid_t uuA, const psm2_uuid_t uuB); +void *psmi_memcpyo(void *dst, const void *src, size_t n); +uint32_t psmi_crc(unsigned char *buf, int len); + +/* + * Internal CPUID detection + */ +#define CPUID_FAMILY_MASK 0x00000f00 +#define CPUID_MODEL_MASK 0x000000f0 +#define CPUID_EXMODEL_MASK 0x000f0000 + +/* + * CPUID return values + */ +#define CPUID_FAMILY_XEON 0x00000600 +#define CPUID_MODEL_PHI_GEN2 87 +#define CPUID_MODEL_PHI_GEN2M 133 +/* + * cpuid function 0, returns "GeniuneIntel" in EBX,ECX,EDX + * due to Little Endian and Hex it is not so obvious + */ +#define CPUID_GENUINE_INTEL_EBX 0x756e6547 /* "uneG" - Little Endian "Genu" */ +#define CPUID_GENUINE_INTEL_ECX 0x6c65746e /* "Ieni" - Little Endian "ineI" */ +#define CPUID_GENUINE_INTEL_EDX 0x49656e69 /* "letn" - Little Endian "ntel" */ + +/* + * These values are internal only, not real register values + */ +#define CPUID_GENUINE_INTEL 0xf0000000 +#define CPUID_MODEL_UNDEFINED -1 + +/* + * Global model so we can tune defaults better for specific cpu's + */ +extern uint32_t psmi_cpu_model; + +/* + * Diagnostics, all in psm_diags.c + */ +int psmi_diags(void); + +/* + * Multiple Endpoints + */ +extern int psmi_multi_ep_enabled; +void psmi_multi_ep_init(); + +#ifdef PSM_FI +/* + * Fault injection + * Controlled by: + * PSM3_FI=0/1 - enable + * PSM3_FI_TRACEFILE - where to put summary stats at end of run + * "stdout", "stderr", of prefix for per process filename + * PSM3_FI_VERBOSE - output to std when generate fault + * PSM3_FI_RAIL - only generate for secondary 
EPs/Rails/QPs + * PSM3_FI_X - for each fault type: num:denom:seed + * fault num/denom of events, seed random for reproducing + * recvlost - discard packet on receive before processing + * rq_lkey - RQ WQE with bad lkey + * rc_rdma_lkey - User RC SQ WQE with bad lkey + * rc_rdma_rkey - User RC SQ WQE with bad rkey + * rv_rdma_len - RV SQ WQE with bad len + * rv_rdma_rkey - RV SQ WQE with bad rkey + * sq_lkey - SQ WQE with bad lkey + * sendlost - discard packet on send before sending + * reg_mr - register MR failure (ENOMEM) + * nonpri_reg_mr - non-priority register MR failure (ENOMEM) + * pri_reg_mr - priority register MR failure (ENOMEM) + */ +struct psmi_faultinj_spec; +int psmi_faultinj_enabled; /* use macro to test */ +int psmi_faultinj_sec_rail; /* faults only on secondary rails or EPs */ +#if 1 /* possible to disable at compile time */ +#define PSMI_FAULTINJ_ENABLED() (!!psmi_faultinj_enabled) +#define PSMI_FAULTINJ_ENABLED_EP(ep) (PSMI_FAULTINJ_ENABLED() \ + && (!psmi_faultinj_sec_rail || \ + (psmi_opened_endpoint && (ep) != psmi_opened_endpoint))) +#else +#define PSMI_FAULTINJ_ENABLED() 0 +#define PSMI_FAULTINJ_ENABLED_EP(ep) 0 +#endif + +void psmi_faultinj_init(); +void psmi_faultinj_fini(); +struct psmi_faultinj_spec *psmi_faultinj_getspec(const char *spec_name, + const char *help, + int num, int denom); +#define PSMI_FAULTINJ_STATIC_DECL(var, spec_name, help, num, denom) \ + static struct psmi_faultinj_spec *var; \ + if (PSMI_FAULTINJ_ENABLED() && (var) == NULL) \ + (var) = psmi_faultinj_getspec((spec_name), (help), (num), (denom)); +int psmi_faultinj_is_fault(struct psmi_faultinj_spec *spec); + +#endif /* #ifdef PSM_FI */ +/* + * PSM core component set/get options + */ +psm2_error_t psmi_core_setopt(const void *core_obj, int optname, + const void *optval, uint64_t optlen); + +psm2_error_t psmi_core_getopt(const void *core_obj, int optname, + void *optval, uint64_t *optlen); + +/* + * PSM AM component set/get options + */ +psm2_error_t psmi_am_setopt(const void *am_obj, int optname, + const void *optval, uint64_t optlen); + +psm2_error_t psmi_am_getopt(const void *am_obj, int optname, + void *optval, uint64_t *optlen); + +#endif /* _PSMI_UTILS_H */ diff --git a/prov/psm3/psm3/psm_verbs_ep.c b/prov/psm3/psm3/psm_verbs_ep.c new file mode 100644 index 00000000000..d4f5150fe0a --- /dev/null +++ b/prov/psm3/psm3/psm_verbs_ep.c @@ -0,0 +1,2131 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +#include +#include +#include +#include /* cpu_set */ +#include /* isalpha */ +#include +#include // for AF_IB +#include +#include +#include +#include + +#include "psm_user.h" +#include "psm_mq_internal.h" +#include "psm_am_internal.h" +#ifdef RNDV_MOD_MR +#include "psm_rndv_mod.h" +#endif +#include "opa_byteorder.h" +#include "ips_proto_params.h" +#include "psm2_hal.h" +#ifdef PSM_FI +#include "ips_config.h" +#endif + + +#ifdef min +#undef min +#endif +#define min(a, b) ((a) < (b) ? (a) : (b)) + +#ifdef max +#undef max +#endif +#define max(a, b) ((a) > (b) ? 
(a) : (b)) + +// macros taken fron IbAccess imath.h +/* round up value to align, align must be a power of 2 */ +#ifndef ROUNDUPP2 +#define ROUNDUPP2(val, align) \ + (((uint32_t)(val) + (uint32_t)(align) - 1) & (~((uint32_t)(align)-1))) +#endif +/* force to use 64 bits in 32bit box */ +#ifndef ROUNDUP64P2 +#define ROUNDUP64P2(val, align) \ + (((uint64_t)(val) + (uint64_t)(align) - 1) & (~((uint64_t)(align)-1))) +#endif + +/* round up value to align, align can be any value, less efficient than ROUNDUPP2 */ +#ifndef ROUNDUP +#define ROUNDUP(val, align) \ + ((( ((uint32_t)(val)) + (uint32_t)(align) -1) / (align) ) * (align)) +#endif + +/* round down value to align, align must be a power of 2 */ +#ifndef ROUNDDOWNP2 +#define ROUNDDOWNP2(val, align) \ + (((uint32_t)(val)) & (~((uint32_t)(align)-1))) +#endif + +/* round down value to align, align can be any value, less efficient than ROUNDDOWNP2 */ +#ifndef ROUNDDOWN +#define ROUNDDOWN(val, align) \ + ((( ((uint32_t)(val))) / (align) ) * (align)) +#endif + + + +// convert MTU enums to bytes +// TBD - is there a way to specify MTU > 4K, such as 9000 byte jumbo +#define MTU_FIX (7) // mtu_ind of 1 (256) => 2^(7+1) +#define MTU_SIZE(mtu_ind) (((uint64_t)1 << (MTU_FIX + mtu_ind))) + +static psm2_error_t verbs_open_dev(psm2_ep_t ep, int unit, int port, psm2_uuid_t const job_key); +static psm2_error_t +check_port_state(psm2_ep_t ep); +static struct ibv_qp* ud_qp_create(psm2_ep_t ep); +static psm2_error_t modify_ud_qp_to_init(psm2_ep_t ep, struct ibv_qp *qp); +static psm2_error_t modify_ud_qp_to_rtr(psm2_ep_t ep, struct ibv_qp *qp); +static psm2_error_t modify_ud_qp_to_rts(psm2_ep_t ep, struct ibv_qp *qp); +static const char *link_layer_str(int8_t link_layer); +static enum psm_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed); + +void __psm2_ep_free_verbs(psm2_ep_t ep); +#ifdef RNDV_MOD_MR +static void deregister_rv_conn_stats(psm2_ep_t ep); +static void deregister_rv_event_stats(psm2_ep_t ep); +#endif + +// initialize the ep->verbs_ep portion of the ep +psm2_error_t +__psm2_ep_open_verbs(psm2_ep_t ep, int unit, int port, psm2_uuid_t const job_key) +{ + int flags; + + // make sure all fields are empty. + memset(&ep->verbs_ep,0,sizeof(ep->verbs_ep)); + + ep->verbs_ep.qkey = *(uint32_t*)job_key; // use 1st 32 bits of job_key + + if (_HFI_UDDBG_ON) { + char uuid_str[64]; + memset(&uuid_str, 0, sizeof(uuid_str)); + psmi_uuid_unparse(job_key, uuid_str); + _HFI_UDDBG("job key %s qkey=0x%x\n", uuid_str, ep->verbs_ep.qkey); + } + + if (PSM2_OK != verbs_open_dev(ep, unit, port, job_key)) { + // verbs_open_dev already posted error. + goto fail; + } + + // compute an appropriate PSM payload size based on the UD MTU + // and save result into ep->mtu + if (PSM2_OK != check_port_state(ep)) { + goto fail; + } + + // we'll poll, so no need to allocate an event channel + // eg. ibv_create_comp_channel + + ep->verbs_ep.pd = ibv_alloc_pd(ep->verbs_ep.context); + if (! 
ep->verbs_ep.pd) { + _HFI_ERROR( "Unable to alloc PD on %s: %s\n", + ep->verbs_ep.ib_devname, strerror(errno)); + goto fail; + } + + // planned QP sizes, also influences CQ sizes + // PSM3_NUM_SEND_WQES, PSM3_NUM_RECV_WQES + + // we use ep as the cq_context (would be in callbacks if any) + // we don't setup a completion channel nor completion vector since we will + // poll + // we will never have more than hfi_num_send_wqes + hfi_num_send_rdma + // so CQ only needs a little headroom to be safe (1000) + ep->verbs_ep.send_cq = ibv_create_cq(ep->verbs_ep.context, ep->hfi_num_send_wqes+ep->hfi_num_send_rdma + 1000, (void*)ep, NULL, 0); + if (! ep->verbs_ep.send_cq) { + _HFI_ERROR( "Unable to create send CQ of size %u on %s: %s\n", + ep->hfi_num_send_wqes+1000, ep->verbs_ep.ib_devname, + strerror(errno)); + goto fail; + } + + ep->verbs_ep.recv_comp_channel = ibv_create_comp_channel(ep->verbs_ep.context); + if (! ep->verbs_ep.recv_comp_channel) { + _HFI_ERROR( "Unable to create recv CQ completion channel on %s: %s\n", + ep->verbs_ep.ib_devname, strerror(errno)); + goto fail; + } + // change completion channel to non-blocking + flags = fcntl( ep->verbs_ep.recv_comp_channel->fd, F_GETFL); + if (0 > fcntl( ep->verbs_ep.recv_comp_channel->fd, F_SETFL, flags | O_NONBLOCK)) { + _HFI_ERROR( "Unable to change file descriptor of completion event channel for %s: %s\n", + ep->verbs_ep.ib_devname, strerror(errno)); + goto fail; + } + // this gets done by __psm2_ep_poll_type + //if (ibv_req_notify_cq(ep->verbs_ep.recv_cq, 0)) { + // _HFI_ERROR("Can't request RQ events from %s: %s\n", + // ep->verbs_ep.ib_devname, strerror(errno)); + // goto fail; + //} + + // TBD - should we pick an EQ number + // we use ep as the cq_context (would be in callbacks if any) + // we will never have more than hfi_num_recv_wqes+HFI_TF_NFLOWS + // inflight WQEs + // so CQ only needs a little headroom to be safe (1000) + // HFI_TF_NFLOWS (32) limits receiver side concurrent tidflows (aka inbound + // RDMA w/immed). + // For USER RC Eager we can have num_recv_wqes/FRACTION per QP + // in which case theoretical need could be huge. We add 4000 as a + // swag to cover most cases and user can always tune higher as needed + if (! ep->hfi_num_recv_cqes) { + ep->hfi_num_recv_cqes = ep->hfi_num_recv_wqes+HFI_TF_NFLOWS+1000; + if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) + ep->hfi_num_recv_cqes += 4000; + } + ep->verbs_ep.recv_cq = ibv_create_cq(ep->verbs_ep.context, + ep->hfi_num_recv_cqes, + (void*)ep, ep->verbs_ep.recv_comp_channel, 0); + if (! ep->verbs_ep.recv_cq) { + _HFI_ERROR( "Unable to create recv CQ of size %u on %s: %s\n", + ep->hfi_num_recv_cqes, ep->verbs_ep.ib_devname, + strerror(errno)); + goto fail; + } + + ep->verbs_ep.qp = ud_qp_create(ep); + if (! 
ep->verbs_ep.qp) { + _HFI_ERROR( "Unable to create QP\n"); + goto fail; + } + + // rest of resources initialized by __psm2_ep_initialize_queues after we + // have processed PSM3_MTU configuration + return PSM2_OK; + +fail: + __psm2_ep_free_verbs(ep); + return PSM2_INTERNAL_ERR; +} + +// ep->mtu is now max PSM payload, not including headers and perhaps decreased +// via PSM3_MTU +// initialize the buffer pools and move the UD QP to RTS +psm2_error_t +__psm2_ep_initialize_queues(psm2_ep_t ep) +{ + + if (PSM2_OK != psm_verbs_alloc_send_pool(ep, ep->verbs_ep.pd, &ep->verbs_ep.send_pool, + // save 1 send WQE just to be paranoid (should be unnecessary) + min(ep->hfi_num_send_wqes, ep->verbs_ep.qp_cap.max_send_wr-1), + // want to end up with multiple of cache line (64) + // ep->mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU + // be conservative (+BUFFER_HEADROOM) + ep->mtu + MAX_PSM_HEADER + BUFFER_HEADROOM + )) { + _HFI_ERROR( "Unable to allocate UD send buffer pool\n"); + goto fail; + } + if (PSM2_OK != psm_verbs_init_send_allocator(&ep->verbs_ep.send_allocator, + &ep->verbs_ep.send_pool)) { + _HFI_ERROR( "Unable to init UD send buffer allocator\n"); + goto fail; + } + + ep->verbs_ep.send_reap_thresh = min(ep->hfi_send_reap_thresh, ep->verbs_ep.send_pool.send_total/2); + _HFI_UDDBG("reaping when %u posted.\n", ep->verbs_ep.send_reap_thresh); + + if (PSM2_OK != psm_verbs_alloc_recv_pool(ep, ep->verbs_ep.qp, &ep->verbs_ep.recv_pool, + min(ep->hfi_num_recv_wqes, ep->verbs_ep.qp_cap.max_recv_wr), + // want to end up with multiple of cache line (64) + // ep->mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU + // be conservative (+BUFFER_HEADROOM) + ep->mtu + MAX_PSM_HEADER + BUFFER_HEADROOM + )) { + _HFI_ERROR( "Unable to allocate UD recv buffer pool\n"); + goto fail; + } + + if (PSM2_OK != modify_ud_qp_to_init(ep, ep->verbs_ep.qp)) { + goto fail; + } + + if (PSM2_OK != __psm2_ep_verbs_prepost_recv(&ep->verbs_ep.recv_pool)) { + _HFI_ERROR( "Unable to prepost recv buffers on QP\n"); + goto fail; + } + + // move QP to RTR and RTS + if(PSM2_OK != modify_ud_qp_to_rtr(ep, ep->verbs_ep.qp)) { + goto fail; + } + if(PSM2_OK != modify_ud_qp_to_rts(ep, ep->verbs_ep.qp)) { + goto fail; + } + _HFI_UDDBG("created QP %p (%u)\n", ep->verbs_ep.qp, ep->verbs_ep.qp->qp_num); + return PSM2_OK; + +fail: + psm_verbs_free_send_pool(&ep->verbs_ep.send_pool); + psm_verbs_free_recv_pool(&ep->verbs_ep.recv_pool); + return PSM2_INTERNAL_ERR; +} + +int __psm2_ep_poll_type(int poll_type, psm2_ep_t ep) +{ + //if (poll_type == PSMI_HAL_POLL_TYPE_URGENT) { + if (poll_type) { + // set for event on solicted recv + _HFI_UDDBG("enable solicited event\n"); + if (0 != ibv_req_notify_cq(ep->verbs_ep.recv_cq, 1)) { + _HFI_ERROR("Can't request solicitied RQ events on %s: %s\n", + ep->verbs_ep.ib_devname, strerror(errno)); + return -1; + } +#if 0 + } else if (poll_type = PSMI_HAL_POLL_TYPE_ANYRCV) { + // set for event on all recv completions + psmi_assert_always(0); // not used by PSM + if (0 != ibv_req_notify_cq(ep->verbs_ep.recv_cq, 0)) { + _HFI_ERROR("Can't request all RQ events on %s: %s\n", + ep->verbs_ep.ib_devname, strerror(errno)); + return -1; + } +#endif + } else { + // no events for solicted and unsolictited recv + _HFI_UDDBG("disable solicited event - noop\n"); + // this is only done once during PSM shutdown of rcvthread. + // Verbs events are one-shots. No way to disable. 
However once + // PSM stops rcvthread shortly after this call, no one will be + // polling for these events so worst case only 1 additional event + // occurs and does not get reenabled. + } + return 0; +} + +// free reources in ep->verbs_ep portion of the ep +void __psm2_ep_free_verbs(psm2_ep_t ep) +{ + psm_verbs_free_send_pool(&ep->verbs_ep.send_pool); + psm_verbs_free_recv_pool(&ep->verbs_ep.recv_pool); + if (ep->verbs_ep.qp) { + ibv_destroy_qp(ep->verbs_ep.qp); + ep->verbs_ep.qp = NULL; + } + if (ep->verbs_ep.recv_cq) { + ibv_destroy_cq(ep->verbs_ep.recv_cq); + ep->verbs_ep.recv_cq = NULL; + } + if (ep->verbs_ep.recv_comp_channel) { + ibv_destroy_comp_channel(ep->verbs_ep.recv_comp_channel); + ep->verbs_ep.recv_comp_channel = NULL; + } + + if (ep->verbs_ep.send_cq) { + ibv_destroy_cq(ep->verbs_ep.send_cq); + ep->verbs_ep.send_cq = NULL; + } + if (ep->verbs_ep.pd) { + ibv_dealloc_pd(ep->verbs_ep.pd); + ep->verbs_ep.pd = NULL; + } +#ifdef RNDV_MOD_MR + if (ep->verbs_ep.rv) { + if (IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)) { + deregister_rv_conn_stats(ep); + deregister_rv_event_stats(ep); + } + __psm2_rv_close(ep->verbs_ep.rv); + ep->verbs_ep.rv = NULL; + } +#endif + if (ep->verbs_ep.context) { + ibv_close_device(ep->verbs_ep.context); + ep->verbs_ep.context = NULL; + } + if (ep->verbs_ep.ib_devname) { + psmi_free(ep->verbs_ep.ib_devname); + ep->verbs_ep.ib_devname = NULL; + } +} + +// ep argument is only for calloc to associate memory statistics with ep +// do NOT use ep->verbs_ep.*_pool in this function, instead of pool +// to access buffering fields. This function will be called for other pools +// which are tracked in other structures but still part of the ep's memory stats +psm2_error_t psm_verbs_alloc_send_pool(psm2_ep_t ep, struct ibv_pd *pd, + psm2_verbs_send_pool_t pool, + uint32_t send_total, uint32_t send_buffer_size) +{ + memset(pool,0,sizeof(*pool)); + + // use what we got, make sure it's a multiple of coallesce + // don't grow beyond requested, otherwise we could exceed CQ sizes + pool->send_total = ROUNDDOWN(send_total, VERBS_SEND_CQ_COALLESCE); + + if (send_total && send_buffer_size) { + // allocate send buffers + int i; + pool->send_buffer_size = send_buffer_size; + pool->send_num_free = pool->send_total; + pool->send_buffers = (uint8_t *)psmi_memalign(ep, NETWORK_BUFFERS, CPU_PAGE_ALIGN, + pool->send_total*pool->send_buffer_size); + if (! pool->send_buffers) { + _HFI_ERROR( "can't alloc send buffers"); + goto fail; + } + + _HFI_UDDBG("send pool: buffers: %p size %u\n", pool->send_buffers, pool->send_buffer_size); + pool->send_bufs = (struct verbs_sbuf *)psmi_calloc(ep, NETWORK_BUFFERS, + pool->send_total*sizeof(struct verbs_sbuf), 1); + if (! 
pool->send_bufs) { + _HFI_ERROR("can't alloc send buffers ctrl"); + goto fail; + } + // prepare free list, put lower numbered buffers at head of free list + for (i=pool->send_total-1; i >= 0; i--) { + pool->send_bufs[i].buffer = &(pool->send_buffers[send_buffer_start(pool, i)]); + pool->send_bufs[i].next = pool->send_free; + pool->send_free = &(pool->send_bufs[i]); + } + _HFI_UDDBG("%u Send Buffers of %u bytes each allocated at %p.\n", pool->send_total, pool->send_buffer_size, + pool->send_buffers); + + // UD doesn't support RDMA, so we just need local NIC to be able to + // access our buffers with kernel bypass (IBV_ACCESS_LOCAL_WRITE) + // technically we probably don't need LOCAL_WRITE for send buffers + pool->send_buffer_mr = ibv_reg_mr( + pd, pool->send_buffers, + pool->send_total*pool->send_buffer_size, + IBV_ACCESS_LOCAL_WRITE); + if (! pool->send_buffer_mr) { + _HFI_ERROR( "Unable to alloc send buffer MR on %s: %s\n", + ep->verbs_ep.ib_devname, strerror(errno)); + goto fail; + } + } + return PSM2_OK; + +fail: + psm_verbs_free_send_pool(pool); + return PSM2_INTERNAL_ERR; +} + +extern psm2_error_t psm_verbs_init_send_allocator( + psm2_verbs_send_allocator_t allocator, + psm2_verbs_send_pool_t pool) +{ + + memset(allocator,0,sizeof(*allocator)); + allocator->pool = pool; + allocator->send_num_til_coallesce = VERBS_SEND_CQ_COALLESCE; + return PSM2_OK; +} + + +// ep argument is only for calloc to associate memory statistics with ep +// do NOT use ep->verbs_ep.*_pool in this function, instead of pool +// to access buffering fields. This function will be called for other pools +// which are tracked in other structures but still part of the ep's memory stats +// For RC QPs receiving only RDMA Write with immediate, no buffer space is +// needed. Caller will specify recv_buffer_size==0 with a recv_total. +psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, + psm2_verbs_recv_pool_t pool, + uint32_t recv_total, uint32_t recv_buffer_size) +{ + memset(pool,0,sizeof(*pool)); + + pool->qp = qp; // save a reference +#ifdef PSM_FI + pool->ep = ep; +#endif + pool->recv_total = recv_total; + + if (recv_total ) { + int i; + if (recv_buffer_size) { + // allocate recv buffers + pool->recv_buffer_size = recv_buffer_size; + // beginning of UD QP Recv Buf always consumed with space for IB GRH + if (qp->qp_type == IBV_QPT_UD) { + // round up UD_ADDITION (40) to multiple of 64 for better + // cache alignment of buffers + pool->recv_buffer_size += ROUNDUP(UD_ADDITION, 64); + pool->addition = UD_ADDITION; + } + pool->recv_buffers = (uint8_t *)psmi_calloc(ep, NETWORK_BUFFERS, + pool->recv_total*pool->recv_buffer_size, 1); + if (! pool->recv_buffers) { + _HFI_ERROR( "can't alloc recv buffers"); + goto fail; + } + //printf("recv pool: buffers: %p size %u\n", pool->recv_buffers, pool->recv_buffer_size); + pool->recv_bufs = (struct verbs_rbuf *)psmi_calloc(ep, NETWORK_BUFFERS, + pool->recv_total*sizeof(struct verbs_rbuf), 1); + if (! 
pool->recv_bufs) { + _HFI_ERROR("can't alloc recv buffers ctrl"); + goto fail; + } + // prepare rbuf handles for use as wr_id + for (i=0; irecv_total; i++) { + pool->recv_bufs[i].buffer = &(pool->recv_buffers[recv_buffer_start(pool, i)]); + pool->recv_bufs[i].pool = pool; + } + _HFI_UDDBG("%u Recv Buffers of %u bytes each allocated at %p.\n", pool->recv_total, pool->recv_buffer_size, + pool->recv_buffers); + + // UD doesn't support RDMA, so we just need local NIC to be able to + // access our buffers with kernel bypass (IBV_ACCESS_LOCAL_WRITE) + pool->recv_buffer_mr = ibv_reg_mr( + qp->pd, pool->recv_buffers, + pool->recv_total*pool->recv_buffer_size, + IBV_ACCESS_LOCAL_WRITE); + if (! pool->recv_buffer_mr) { + _HFI_ERROR( "Unable to alloc recv buffer MR on %s: %s\n", + ep->verbs_ep.ib_devname, strerror(errno)); + goto fail; + } + } else { + // we want a pool for RDMA Write w/immediate recv. No buffers + psmi_assert(qp->qp_type != IBV_QPT_UD); + // we use exactly 1 rbuf so wr_id can lead us to pool and qp + pool->recv_bufs = (struct verbs_rbuf *)psmi_calloc(ep, NETWORK_BUFFERS, + sizeof(struct verbs_rbuf), 1); + if (! pool->recv_bufs) { + _HFI_ERROR("can't alloc recv buffers ctrl"); + goto fail; + } + // prepare rbuf handle for use as wr_id + pool->recv_bufs->pool = pool; + _HFI_UDDBG("%u Recv Buffers of %u bytes each allocated.\n", pool->recv_total, pool->recv_buffer_size); + } +#if VERBS_RECV_QP_COALLESCE > 1 + // prebuild as much as we can + for (i=0; i < VERBS_RECV_QP_COALLESCE; i++ ) { + struct ibv_recv_wr *wr = &(pool->recv_wr_list[i]); + wr->next = &(pool->recv_wr_list[i+1]); + if (recv_buffer_size) { + struct ibv_sge *list = &(pool->recv_sge_list[i]); + wr->sg_list = list; + list->length = pool->recv_buffer_size; + list->lkey = pool->recv_buffer_mr->lkey; + wr->num_sge = 1; // size of sg_list + } else { + wr->sg_list = NULL; + wr->num_sge = 0; // size of sg_list + } + } + // fixup end of list + pool->recv_wr_list[VERBS_RECV_QP_COALLESCE-1].next = NULL; + pool->next_recv_wqe = 0; +#endif + } + return PSM2_OK; + +fail: + psm_verbs_free_recv_pool(pool); + return PSM2_INTERNAL_ERR; +} + +void psm_verbs_free_send_pool(psm2_verbs_send_pool_t pool) +{ + if (pool->send_buffer_mr) { + ibv_dereg_mr(pool->send_buffer_mr); + pool->send_buffer_mr = NULL; + } + if (pool->send_bufs) { + psmi_free(pool->send_bufs); + pool->send_bufs = NULL; + } + if (pool->send_buffers) { + psmi_free(pool->send_buffers); + pool->send_buffers = NULL; + } + memset(pool,0,sizeof(*pool)); // in case anyone looks at other integers +} + +// this is not allowed to access pool->qp, it may already be destroyed +void psm_verbs_free_recv_pool(psm2_verbs_recv_pool_t pool) +{ + if (pool->recv_buffer_mr) { + ibv_dereg_mr(pool->recv_buffer_mr); + pool->recv_buffer_mr = NULL; + } + if (pool->recv_bufs) { + psmi_free(pool->recv_bufs); + pool->recv_bufs = NULL; + } + if (pool->recv_buffers) { + psmi_free(pool->recv_buffers); + pool->recv_buffers = NULL; + } + memset(pool,0,sizeof(*pool)); // in case anyone looks at other integers +} + +// the allocator tries to reallocate recently freed send buffers +// so we can tend to allocate a small set of buffers +// to improve CPU, MMU and NIC MMU hit rates +sbuf_t __psm2_ep_verbs_alloc_sbuf(psm2_verbs_send_allocator_t allocator) +{ + psm2_verbs_send_pool_t pool = allocator->pool; + sbuf_t sbuf = pool->send_free; + if_pt (sbuf) { + // take off head of free list + pool->send_free = sbuf->next; + pool->send_num_free--; + sbuf->next = NULL; + // keep a list of allocated buffers in order at 
alloc_head + // and put this one at the alloc_end of the list + if_pf (! allocator->send_alloc_head) // unlikely when more than 1 posted + allocator->send_alloc_head = sbuf; + if_pt (allocator->send_alloc_end) // likely when more than 1 posted + allocator->send_alloc_end->next = sbuf; + allocator->send_alloc_end = sbuf; + sbuf->allocator = allocator; + } + return sbuf; +} + +// buffers must be freed in order, the fact the SQ reports completions in +// same order as send WQEs ensures this +// this will free count buffers with buf being the last freed +void __psm2_ep_verbs_free_sbuf( + sbuf_t buf, uint32_t count) +{ + psm2_verbs_send_allocator_t allocator = buf->allocator; + psm2_verbs_send_pool_t pool = allocator->pool; + sbuf_t b; + do { + // take 1st off allocated list + b = allocator->send_alloc_head; + allocator->send_alloc_head = b->next; + if_pf (allocator->send_alloc_end == b) // unlikely last outstanding + allocator->send_alloc_end = NULL; + // put at head of free list + b->next = pool->send_free; + pool->send_free = b; + pool->send_num_free++; +#ifdef UD_DEBUG + printf("freed: %u num free: %u\n", + (uint32_t)send_buffer_index(pool, b->buffer), + pool->send_num_free); +#endif + } while (--count && b != buf); + // normally we will find buf just as we exhaust count (coallesce amount). + // however when send error CQEs occur (such as flush) we may find less + // than count inflight ahead of buf + //psmi_assert_always(b == buf && count == 0); + psmi_assert_always(b == buf); +} + +psm2_error_t __psm2_ep_verbs_post_recv( + rbuf_t buf) +{ + psm2_verbs_recv_pool_t pool = buf->pool; +#if VERBS_RECV_QP_COALLESCE > 1 + struct ibv_recv_wr *wr; +#else + struct ibv_recv_wr wr; + struct ibv_sge list; +#endif + struct ibv_recv_wr *bad_wr; + + // only RC QPs doing just RDMA Write can have a zero buffer size + if (pool->recv_buffer_size) { + uint32_t index = recv_buffer_index(pool, rbuf_to_buffer(buf)); + // make sure its a buffer in our pool + psmi_assert_always(index < pool->recv_total); + // assert on index covers these 2 asserts + //psmi_assert_always(rbuf_to_buffer(buf) >= pool->recv_buffers); + //psmi_assert_always(rbuf_to_buffer(buf) <= pool->recv_buffers + + // pool->recv_total)*pool->recv_buffer_size); + // make sure buf is exactly at the start of a buffer in our pool + psmi_assert_always(rbuf_to_buffer(buf) == &(pool->recv_buffers[recv_buffer_start(pool, index)])); + +#if VERBS_RECV_QP_COALLESCE > 1 + // put buf in wr at end of list + wr = &(pool->recv_wr_list[pool->next_recv_wqe]); + psmi_assert(wr->sg_list == &(pool->recv_sge_list[pool->next_recv_wqe])); + wr->sg_list->addr = (uintptr_t)rbuf_to_buffer(buf); + wr->wr_id = (uintptr_t)buf; // we'll get this back in completion +#ifdef PSM_FI + if_pf(PSMI_FAULTINJ_ENABLED_EP(pool->ep)) { + PSMI_FAULTINJ_STATIC_DECL(fi_rq_lkey, "rq_lkey", + "post UD " + "or RC " + "RQ WQE with bad lkey", + 0, IPS_FAULTINJ_RQ_LKEY); + if (psmi_faultinj_is_fault(fi_rq_lkey)) + wr->sg_list->lkey = 55; + } else + wr->sg_list->lkey = pool->recv_buffer_mr->lkey; +#endif + if_pf (++pool->next_recv_wqe >= VERBS_RECV_QP_COALLESCE) { + // we have a batch ready to post + if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) { + _HFI_ERROR("failed to post RQ: %s", strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ, including buffer %u\n", index); + pool->next_recv_wqe = 0; + } else { + //_HFI_VDBG("preped RQE, buffer %u\n", index); + } +#else + list.addr = (uintptr_t)rbuf_to_buffer(buf); + list.length = pool->recv_buffer_size; + list.lkey = 
pool->recv_buffer_mr->lkey; +#ifdef PSM_FI + if_pf(PSMI_FAULTINJ_ENABLED_EP(pool->ep)) { + PSMI_FAULTINJ_STATIC_DECL(fi_rq_lkey, "rq_lkey", + "post UD " + "or RC " + "RQ WQE with bad lkey", + 0, IPS_FAULTINJ_RQ_LKEY); + if (psmi_faultinj_is_fault(fi_rq_lkey)) + list.lkey = 55; + } +#endif + wr.next = NULL; // just post 1 + wr.wr_id = (uintptr_t)buf; // we'll get this back in completion + wr.sg_list = &list; + wr.num_sge = 1; // size of sg_list + + if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) { + _HFI_ERROR("failed to post RQ: %s", strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ, buffer %u\n", index); +#endif + } else { +#if VERBS_RECV_QP_COALLESCE > 1 + // put buf in wr at end of list + wr = &(pool->recv_wr_list[pool->next_recv_wqe]); + psmi_assert(wr->sg_list == NULL); + wr->wr_id = (uintptr_t)buf; // we'll get this back in completion + if_pf (++pool->next_recv_wqe >= VERBS_RECV_QP_COALLESCE) { + // we have a batch ready to post + if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) { + _HFI_ERROR("failed to post RQ: %s", strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ\n"); + pool->next_recv_wqe = 0; + } else { + //_HFI_VDBG("preped RQE\n"); + } +#else + wr.next = NULL; // just post 1 + wr.wr_id = (uintptr_t)buf; // we'll get this back in completion + wr.sg_list = NULL; + wr.num_sge = 0; // size of sg_list + + if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) { + _HFI_ERROR("failed to post RQ: %s", strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ\n"); +#endif + } + return PSM2_OK; +} + +psm2_error_t __psm2_ep_verbs_prepost_recv( + psm2_verbs_recv_pool_t pool) +{ + int i; + + if (! pool->recv_total) + return PSM2_INTERNAL_ERR; + // prepare RQ + for (i=0; i< pool->recv_total; i++) { + rbuf_t buf = &(pool->recv_bufs[i]); + if (pool->recv_buffer_size) + buf = &(pool->recv_bufs[i]); + else + buf = pool->recv_bufs; // only 1, just to find pool and qp + if (PSM2_OK != __psm2_ep_verbs_post_recv( + buf)) { + _HFI_ERROR( "Unable to post RQ\n"); + return PSM2_INTERNAL_ERR; + } + } + return PSM2_OK; +} + +// only used when PSM3_RDMA enabled +psm2_error_t psm2_verbs_post_rdma_write_immed(psm2_ep_t ep, struct ibv_qp *qp, + void *loc_buf, struct psm2_verbs_mr *loc_mr, + uint64_t rem_buf, uint32_t rkey, + size_t len, uint32_t immed, uint64_t wr_id) +{ + struct ibv_send_wr wr; + struct ibv_send_wr *bad_wr; + struct ibv_sge list; + psm2_error_t ret = PSM2_OK; + + //printf("XXXX %s 0x%p %ld 0x%x\n", __FUNCTION__, loc_buf, len, loc_mr->lkey); + psmi_assert(IPS_PROTOEXP_FLAG_USER_RC_QP(ep->rdmamode)); + + list.addr = (uintptr_t)loc_buf; + list.length = len; + list.lkey = loc_mr->lkey; +#ifdef PSM_FI + if_pf(PSMI_FAULTINJ_ENABLED_EP(ep)) { + PSMI_FAULTINJ_STATIC_DECL(fi_rc_rdma_lkey, "rc_rdma_lkey", + "post RC RDMA Write WQE with bad lkey", + 0, IPS_FAULTINJ_RC_RDMA_LKEY); + if (psmi_faultinj_is_fault(fi_rc_rdma_lkey)) { + printf("corrupting RC RDMA lkey QP %u\n", qp->qp_num); + fflush(stdout); + list.lkey = 55; + } + } +#endif + wr.next = NULL; // just post 1 + psmi_assert(! 
(wr_id & VERBS_SQ_WR_ID_MASK)); + wr.wr_id = wr_id | VERBS_SQ_WR_ID_RDMA_WRITE; + wr.sg_list = &list; + wr.num_sge = 1; // size of sg_list + wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + wr.imm_data = immed; + wr.wr.rdma.remote_addr = rem_buf; + wr.wr.rdma.rkey = rkey; +#ifdef PSM_FI + if_pf(PSMI_FAULTINJ_ENABLED_EP(ep)) { + PSMI_FAULTINJ_STATIC_DECL(fi_rc_rdma_rkey, "rc_rdma_rkey", + "post RC RDMA Write WQE with bad rkey", + 0, IPS_FAULTINJ_RC_RDMA_RKEY); + if (psmi_faultinj_is_fault(fi_rc_rdma_rkey)) { + printf("corrupting RC RDMA rkey QP %u\n", qp->qp_num); + fflush(stdout); + wr.wr.rdma.rkey = 55; + } + } +#endif + // RDMA Writes will tend to be larger and we want the completion + // to reflect the RDMA for a given CTS is completed + wr.send_flags = IBV_SEND_SIGNALED; // get a completion + // no need for wr.send_flags |= IBV_SEND_SOLICITED + // these will be bigger sends, no need for inline + ep->verbs_ep.send_rdma_outstanding++; + if_pf (ibv_post_send(qp, &wr, &bad_wr)) { + if (errno != EBUSY && errno != EAGAIN && errno != ENOMEM) + _HFI_ERROR("failed to post RC SQ on %s: %s", + ep->verbs_ep.ib_devname, strerror(errno)); + // caller will try again later when next send buffer freed + // or timer expires + ret = PSM2_TIMEOUT; + ep->verbs_ep.send_rdma_outstanding--; + goto done; + } + _HFI_VDBG("posted RDMA Write: from 0x%"PRIx64" to 0x%"PRIx64" len %u rkey 0x%x\n", + list.addr, wr.wr.rdma.remote_addr, list.length, wr.wr.rdma.rkey /* TBD rem QPN */ ); +#if 0 + // we will not have many in flight at a time so + // normal progress calls should be sufficient + // no need to reap completions here + err = psm2_verbs_completion_update(ep); + if_pf (err != PSM2_OK) + return err; +#endif +done: + //printf("XXXX %s ret:%d\n", __FUNCTION__, ret); + return ret; +} + +#ifdef RNDV_MOD_MR +psm2_error_t psm2_verbs_post_rv_rdma_write_immed(psm2_ep_t ep, + psm2_rv_conn_t conn, + void *loc_buf, struct psm2_verbs_mr *loc_mr, + uint64_t rem_buf, uint32_t rkey, + size_t len, uint32_t immed, uint64_t wr_id, + uint8_t *sconn_index, uint32_t *conn_count) +{ + psm2_error_t ret = PSM2_OK; + + //printf("XXXX %s 0x%p %ld 0x%x\n", __FUNCTION__, loc_buf, len, loc_mr->lkey); + psmi_assert(IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)); + + ep->verbs_ep.send_rdma_outstanding++; +#ifdef PSM_FI + if_pf(PSMI_FAULTINJ_ENABLED_EP(ep)) { + PSMI_FAULTINJ_STATIC_DECL(fi_rv_rdma_len, "rv_rdma_len", + "post RV RDMA Write with bad len (may want RV build with RNDV_LOCAL_ERR_TEST)", + 0, IPS_FAULTINJ_RV_RDMA_LEN); + if (psmi_faultinj_is_fault(fi_rv_rdma_len)) + len += 1000000000; + } + if_pf(PSMI_FAULTINJ_ENABLED_EP(ep)) { + PSMI_FAULTINJ_STATIC_DECL(fi_rv_rdma_rkey, "rv_rdma_rkey", + "post RV RDMA Write with bad rkey", + 1, IPS_FAULTINJ_RV_RDMA_RKEY); + if (psmi_faultinj_is_fault(fi_rv_rdma_rkey)) + rkey = 55; + } +#endif + if (__psm2_rv_post_rdma_write_immed(ep->verbs_ep.rv, conn, + loc_buf, loc_mr->mr.rv_mr, + rem_buf, rkey, + len, immed, wr_id, sconn_index, conn_count)) { + switch (errno) { + case EIO: + // lost or failed connection + ret = PSM2_EPID_RV_CONNECT_ERROR; + break; + case EAGAIN: + // lost connection and are recoverying it + ret = PSM2_EPID_RV_CONNECT_RECOVERING; + break; + case ENOMEM: + case EBUSY: + // caller will try again later when next send buffer freed + // or timer expires + ret = PSM2_TIMEOUT; + break; + default: + ret = PSM2_INTERNAL_ERR; + break; + } + if (errno != EBUSY && errno != EAGAIN && errno != ENOMEM) + _HFI_ERROR("failed to post RV RC SQ on %s: %s", + ep->verbs_ep.ib_devname, strerror(errno)); + 
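// Descriptive note on the error path above: EIO maps to PSM2_EPID_RV_CONNECT_ERROR
// (lost/failed connection), EAGAIN to PSM2_EPID_RV_CONNECT_RECOVERING (reconnect in
// progress), and EBUSY/ENOMEM to PSM2_TIMEOUT (the caller retries when a send buffer
// frees or its timer expires); only EBUSY/EAGAIN/ENOMEM skip the _HFI_ERROR log, and
// the outstanding RDMA count is rolled back below before returning.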
ep->verbs_ep.send_rdma_outstanding--; + goto done; + } + _HFI_VDBG("posted RV RDMA Write: from 0x%"PRIx64" to 0x%"PRIx64" len %u rkey 0x%x\n", + (uint64_t)loc_buf, rem_buf, (unsigned)len, rkey /* TBD rem QPN */ ); +done: + //printf("XXXX %s ret:%d\n", __FUNCTION__, ret); + return ret; +} +#endif // RNDV_MOD_MR + +extern int ips_protoexp_rdma_write_completion( uint64_t wr_id); + +// we structure this similar to ips_proto_dma_completion_update +// this is non-blocking. We reap what's available and then return +psm2_error_t +psm2_verbs_completion_update(psm2_ep_t ep) +{ + #define CQE_BATCH 10 // reap a few at a time, hopefully faster this way + //#define CQE_BATCH 8 or 18 // reap a few at a time, hopefully faster this way + // 18*COALLESE > default reap threshold so we + // should get away with one poll_q + // not sure if doing the exact math here would + // add clocks and hurt a bit more than approx math + // int batch = (send_reap_thresh/COALLESCE) + 2 + // alloca(sizeof(ibv_wc) & batch) + struct ibv_wc wc[CQE_BATCH]; + int ne; + + PSMI_LOCK_ASSERT(ep->mq->progress_lock); + // TBD - when coallescing completions we'll tend to fall through to poll_cq + // this only called when out of buffers or immediately after posting a send + // reduce the frequency of poll_cq by only checking once we have at least + // send_reap_thresh sends in flight + // for USE_RC this is imperfect, we can have a handful of unsignaled + // send WQEs on multiple RC QPs, in which case we may exceed the + // reap_thresh but not find any CQEs until we post more sends and + // hit the coalsce threshold. + if_pt ((! ep->verbs_ep.send_rdma_outstanding + || IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)) + && ep->verbs_ep.send_pool.send_num_free > ep->verbs_ep.send_pool.send_total - ep->verbs_ep.send_reap_thresh ) + return PSM2_OK; // not ready to reap, return quickly + + //if ( 0 != (ne = ibv_poll_cq(ep->verbs_ep.send_cq, CQE_BATCH, wc))) + while ( 0 != (ne = ibv_poll_cq(ep->verbs_ep.send_cq, CQE_BATCH, wc))) + { + unsigned i; + for (i=0; iverbs_ep.ib_devname, + ibv_wc_status_str(wc[i].status), (int)wc[i].status, + wc[i].qp_num); + // For user space RC QP, the QP is now in QPS_ERROR and we + // need to reset (or replace) and reconnect it. + // Upcoming async event will cause us to stop. + // User's wanting reliability for RDMA should use RV. 
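// Even for failed/flushed CQEs the send buffer must still be recovered:
// wc.opcode is undefined in error CQEs (see psm_verbs_ep.h), so the low bit
// of wr_id distinguishes Send from RDMA Write here, and VERBS_SQ_WR_ID_MASK
// strips that bit to get the sbuf pointer back before returning up to
// VERBS_SEND_CQ_COALLESCE buffers to the pool.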
+ if (VERBS_SQ_WR_OP(wc[i].wr_id) == VERBS_SQ_WR_ID_SEND) + __psm2_ep_verbs_free_sbuf( + (sbuf_t)(wc[i].wr_id & ~VERBS_SQ_WR_ID_MASK), + VERBS_SEND_CQ_COALLESCE); + continue; + } + switch (wc[i].opcode) { + case IBV_WC_SEND: + // UD sends just mean it got onto the wire and can reuse our buf + // no guarantees it made it to the remote side + // buffer address is in wc.wr_id + _HFI_VDBG("send done (%u bytes) sbuf index %lu\n", wc[i].byte_len, + send_buffer_index(&ep->verbs_ep.send_pool, sbuf_to_buffer((sbuf_t)(wc[i].wr_id)))); + __psm2_ep_verbs_free_sbuf( + (sbuf_t)(wc[i].wr_id & ~VERBS_SQ_WR_ID_MASK), + VERBS_SEND_CQ_COALLESCE); + break; + case IBV_WC_RDMA_WRITE: + ep->verbs_ep.send_rdma_outstanding--; + ips_protoexp_rdma_write_completion( + wc[i].wr_id & ~VERBS_SQ_WR_ID_MASK); + break; + default: + _HFI_ERROR("unexpected send completion on %s opcode %d QP %u\n", + ep->verbs_ep.ib_devname, + wc[i].opcode, wc[i].qp_num); + break; + } + } +#if 0 + // this is optional, especially if use "if" above instead of while + if (ne verbs_ep.context); + if (0 != ibv_query_pkey(ep->verbs_ep.context, port, index, &pkey)) { + _HFI_ERROR( "Can't query pkey index %d on %s: %s\n", index, + ep->verbs_ep.ib_devname, strerror(errno)); + return -1; + } + _HFI_UDDBG("got pkey 0x%x on %s\n", __be16_to_cpu(pkey), + ep->verbs_ep.ib_devname); + return __be16_to_cpu(pkey); +} + +#ifdef RNDV_MOD_MR +// accessor functions for cm statistics +#define EP_STAT_FUNC(func, stat) \ + static uint64_t func(void *context) \ + { \ + psm2_ep_t ep = (psm2_ep_t)context; \ + return ep->stat; \ + } + +EP_STAT_FUNC(rv_q_depth, rv_q_depth) +EP_STAT_FUNC(rv_reconnect_timeout, rv_reconnect_timeout) +EP_STAT_FUNC(rv_hb_interval, rv_hb_interval) +#undef EP_STAT_FUNC + +static uint64_t rv_index(void *context) +{ + struct psm2_verbs_ep *vep = &((psm2_ep_t)context)->verbs_ep; + return vep->rv_index; +} + +static uint64_t rv_conn_flags(void *context) +{ + struct psm2_verbs_ep *vep = &((psm2_ep_t)context)->verbs_ep; + if (vep->rv) { + // this is a little sly, we know the stats processing routines will + // call the accessors in the order from the entries list + // so we use the 1st of the rv statistics accessors to get + // the statistics from rv into the cache structure so other accessors + // can simply return the relevant value + // we get aggregated values instead of per conn + (void)__psm2_rv_get_conn_stats(vep->rv, NULL, 0, &vep->rv_conn_stats); + } + return vep->rv_conn_stats.flags; +} + +#define RV_CM_STAT_FUNC(func, stat) \ + static uint64_t func(void *context) \ + { \ + struct psm2_verbs_ep *vep = &((psm2_ep_t)context)->verbs_ep; \ + return vep->rv_conn_stats.stat; \ + } + +RV_CM_STAT_FUNC(rv_conn_num_conn, num_conn) +RV_CM_STAT_FUNC(rv_conn_req_error, req_error) +RV_CM_STAT_FUNC(rv_conn_req_recv, req_recv) +RV_CM_STAT_FUNC(rv_conn_rep_error, rep_error) +RV_CM_STAT_FUNC(rv_conn_rep_recv, rep_recv) +RV_CM_STAT_FUNC(rv_conn_rtu_recv, rtu_recv) +RV_CM_STAT_FUNC(rv_conn_established, established) +RV_CM_STAT_FUNC(rv_conn_dreq_error, dreq_error) +RV_CM_STAT_FUNC(rv_conn_dreq_recv, dreq_recv) +RV_CM_STAT_FUNC(rv_conn_drep_recv, drep_recv) +RV_CM_STAT_FUNC(rv_conn_timewait, timewait) +RV_CM_STAT_FUNC(rv_conn_mra_recv, mra_recv) +RV_CM_STAT_FUNC(rv_conn_rej_recv, rej_recv) +RV_CM_STAT_FUNC(rv_conn_lap_error, lap_error) +RV_CM_STAT_FUNC(rv_conn_lap_recv, lap_recv) +RV_CM_STAT_FUNC(rv_conn_apr_recv, apr_recv) +RV_CM_STAT_FUNC(rv_conn_unexp_event, unexp_event) +RV_CM_STAT_FUNC(rv_conn_req_sent, req_sent) +RV_CM_STAT_FUNC(rv_conn_rep_sent, rep_sent) 
+RV_CM_STAT_FUNC(rv_conn_rtu_sent, rtu_sent) +RV_CM_STAT_FUNC(rv_conn_rej_sent, rej_sent) +RV_CM_STAT_FUNC(rv_conn_dreq_sent, dreq_sent) +RV_CM_STAT_FUNC(rv_conn_drep_sent, drep_sent) +//RV_CM_STAT_FUNC(rv_conn_wait_time, wait_time) +//RV_CM_STAT_FUNC(rv_conn_resolve_time, resolve_time) +//RV_CM_STAT_FUNC(rv_conn_connect_time, connect_time) +//RV_CM_STAT_FUNC(rv_conn_connected_time, connected_time) +RV_CM_STAT_FUNC(rv_conn_resolve, resolve) +RV_CM_STAT_FUNC(rv_conn_resolve_fail, resolve_fail) +RV_CM_STAT_FUNC(rv_conn_conn_recovery, conn_recovery) +//RV_CM_STAT_FUNC(rv_conn_rewait_time, rewait_time) +//RV_CM_STAT_FUNC(rv_conn_reresolve_time, reresolve_time) +//RV_CM_STAT_FUNC(rv_conn_reconnect_time, reconnect_time) +//RV_CM_STAT_FUNC(rv_conn_max_rewait_time, max_rewait_time) +//RV_CM_STAT_FUNC(rv_conn_max_reresolve_time, max_reresolve_time) +//RV_CM_STAT_FUNC(rv_conn_max_reconnect_time, max_reconnect_time) +RV_CM_STAT_FUNC(rv_conn_reresolve, reresolve) +RV_CM_STAT_FUNC(rv_conn_reresolve_fail, reresolve_fail) +//RV_CM_STAT_FUNC(rv_conn_post_write, post_write) +//RV_CM_STAT_FUNC(rv_conn_post_write_fail, post_write_fail) +//RV_CM_STAT_FUNC(rv_conn_post_write_bytes, post_write_bytes) +RV_CM_STAT_FUNC(rv_conn_outstand_send_write, outstand_send_write) +//RV_CM_STAT_FUNC(rv_conn_send_write_cqe, send_write_cqe) +//RV_CM_STAT_FUNC(rv_conn_send_write_cqe_fail, send_write_cqe_fail) +//RV_CM_STAT_FUNC(rv_conn_recv_write_cqe, recv_write_cqe) +//RV_CM_STAT_FUNC(rv_conn_recv_write_bytes, recv_write_bytes) +//RV_CM_STAT_FUNC(rv_conn_recv_cqe_fail, recv_cqe_fail) +//RV_CM_STAT_FUNC(rv_conn_post_hb, post_hb) +//RV_CM_STAT_FUNC(rv_conn_post_hb_fail, post_hb_fail) +//RV_CM_STAT_FUNC(rv_conn_send_hb_cqe, send_hb_cqe) +//RV_CM_STAT_FUNC(rv_conn_send_hb_cqe_fail, send_hb_cqe_fail) +//RV_CM_STAT_FUNC(rv_conn_recv_hb_cqe, recv_hb_cqe) +#undef RV_CM_STAT_FUNC + +static void register_rv_conn_stats(psm2_ep_t ep) +{ + struct psm2_rv_conn_stats *ep_rv_conn_stats = &ep->verbs_ep.rv_conn_stats; + + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECL("rv_q_depth", MPSPAWN_STATS_REDUCTION_ALL, + rv_q_depth, NULL), + PSMI_STATS_DECL("rv_reconnect_timeout", MPSPAWN_STATS_REDUCTION_ALL, + rv_reconnect_timeout, NULL), + PSMI_STATS_DECL("rv_hb_interval", MPSPAWN_STATS_REDUCTION_ALL, + rv_hb_interval, NULL), + PSMI_STATS_DECL("rv_index", MPSPAWN_STATS_REDUCTION_ALL, + rv_index, NULL), + + PSMI_STATS_DECL("rv_conn_flags", MPSPAWN_STATS_REDUCTION_ALL, + rv_conn_flags, NULL), + + PSMI_STATS_DECL_FUNC("num_conn", rv_conn_num_conn), + PSMI_STATS_DECL_FUNC("req_error", rv_conn_req_error), + PSMI_STATS_DECL_FUNC("req_recv", rv_conn_req_recv), + PSMI_STATS_DECL_FUNC("rep_error", rv_conn_rep_error), + PSMI_STATS_DECL_FUNC("rep_recv", rv_conn_rep_recv), + PSMI_STATS_DECL_FUNC("rtu_recv", rv_conn_rtu_recv), + PSMI_STATS_DECL_FUNC("established", rv_conn_established), + PSMI_STATS_DECL_FUNC("dreq_error", rv_conn_dreq_error), + PSMI_STATS_DECL_FUNC("dreq_recv", rv_conn_dreq_recv), + PSMI_STATS_DECL_FUNC("drep_recv", rv_conn_drep_recv), + PSMI_STATS_DECL_FUNC("timewait", rv_conn_timewait), + PSMI_STATS_DECL_FUNC("mra_recv", rv_conn_mra_recv), + PSMI_STATS_DECL_FUNC("rej_recv", rv_conn_rej_recv), + PSMI_STATS_DECL_FUNC("lap_error", rv_conn_lap_error), + PSMI_STATS_DECL_FUNC("lap_recv", rv_conn_lap_recv), + PSMI_STATS_DECL_FUNC("apr_recv", rv_conn_apr_recv), + PSMI_STATS_DECL_FUNC("unexp_event", rv_conn_unexp_event), + PSMI_STATS_DECL_FUNC("req_sent", rv_conn_req_sent), + PSMI_STATS_DECL_FUNC("rep_sent", rv_conn_rep_sent), + 
PSMI_STATS_DECL_FUNC("rtu_sent", rv_conn_rtu_sent), + PSMI_STATS_DECL_FUNC("rej_sent", rv_conn_rej_sent), + PSMI_STATS_DECL_FUNC("dreq_sent", rv_conn_dreq_sent), + PSMI_STATS_DECL_FUNC("drep_sent", rv_conn_drep_sent), + PSMI_STATS_DECLU64("wait_time", (uint64_t*)&ep_rv_conn_stats->wait_time), + PSMI_STATS_DECLU64("resolve_time", (uint64_t*)&ep_rv_conn_stats->resolve_time), + PSMI_STATS_DECLU64("connect_time", (uint64_t*)&ep_rv_conn_stats->connect_time), + PSMI_STATS_DECLU64("connected_time", (uint64_t*)&ep_rv_conn_stats->connected_time), + PSMI_STATS_DECL_FUNC("resolve", rv_conn_resolve), + PSMI_STATS_DECL_FUNC("resolve_fail", rv_conn_resolve_fail), + PSMI_STATS_DECL_FUNC("conn_recovery", rv_conn_conn_recovery), + PSMI_STATS_DECLU64("rewait_time", (uint64_t*)&ep_rv_conn_stats->rewait_time), + PSMI_STATS_DECLU64("reresolve_time", (uint64_t*)&ep_rv_conn_stats->reresolve_time), + PSMI_STATS_DECLU64("reconnect_time", (uint64_t*)&ep_rv_conn_stats->reconnect_time), + PSMI_STATS_DECLU64("max_rewait_time", (uint64_t*)&ep_rv_conn_stats->max_rewait_time), + PSMI_STATS_DECLU64("max_reresolve_time", (uint64_t*)&ep_rv_conn_stats->max_reresolve_time), + PSMI_STATS_DECLU64("max_reconnect_time", (uint64_t*)&ep_rv_conn_stats->max_reconnect_time), + PSMI_STATS_DECL_FUNC("reresolve", rv_conn_reresolve), + PSMI_STATS_DECL_FUNC("reresolve_fail", rv_conn_reresolve_fail), + PSMI_STATS_DECLU64("post_write", (uint64_t*)&ep_rv_conn_stats->post_write), + PSMI_STATS_DECLU64("post_write_fail", (uint64_t*)&ep_rv_conn_stats->post_write_fail), + PSMI_STATS_DECLU64("post_write_bytes", (uint64_t*)&ep_rv_conn_stats->post_write_bytes), + PSMI_STATS_DECL_FUNC("send_write_out", rv_conn_outstand_send_write), + PSMI_STATS_DECLU64("send_write_cqe", (uint64_t*)&ep_rv_conn_stats->send_write_cqe), + PSMI_STATS_DECLU64("send_write_cqe_fail", (uint64_t*)&ep_rv_conn_stats->send_write_cqe_fail), + + PSMI_STATS_DECLU64("recv_write_cqe", (uint64_t*)&ep_rv_conn_stats->recv_write_cqe), + PSMI_STATS_DECLU64("recv_write_bytes", (uint64_t*)&ep_rv_conn_stats->recv_write_bytes), + PSMI_STATS_DECLU64("recv_cqe_fail", (uint64_t*)&ep_rv_conn_stats->recv_cqe_fail), + + PSMI_STATS_DECLU64("post_hb", (uint64_t*)&ep_rv_conn_stats->post_hb), + PSMI_STATS_DECLU64("post_hb_fail", (uint64_t*)&ep_rv_conn_stats->post_hb_fail), + PSMI_STATS_DECLU64("send_hb_cqe", (uint64_t*)&ep_rv_conn_stats->send_hb_cqe), + PSMI_STATS_DECLU64("send_hb_cqe_fail", (uint64_t*)&ep_rv_conn_stats->send_hb_cqe_fail), + PSMI_STATS_DECLU64("recv_hb_cqe", (uint64_t*)&ep_rv_conn_stats->recv_hb_cqe), + }; + + psmi_stats_register_type("RV_Shared_Conn_RDMA_Statistics", + PSMI_STATSTYPE_RV_RDMA, + entries, + PSMI_STATS_HOWMANY(entries), + ep->epid, ep); +} + +static void deregister_rv_conn_stats(psm2_ep_t ep) +{ + psmi_stats_deregister_type(PSMI_STATSTYPE_RV_RDMA, ep); +} + +// accessor functions for event statistics +static uint64_t rv_send_write_cqe(void *context) +{ + struct psm2_verbs_ep *vep = &((psm2_ep_t)context)->verbs_ep; + if (vep->rv) { + // this is a little sly, we know the stats processing routines will + // call the accessors in the order from the entries list + // so we use the 1st of the rv statistics accessors to get + // the statistics from rv into the cache structure so other accessors + // can simply return the relevant value + // we get aggregated values instead of per conn + (void)__psm2_rv_get_event_stats(vep->rv, &vep->rv_event_stats); + } + return vep->rv_event_stats.send_write_cqe; +} + +#define RV_EVENT_STAT_FUNC(func, stat) \ + static uint64_t func(void 
*context) \ + { \ + struct psm2_verbs_ep *vep = &((psm2_ep_t)context)->verbs_ep; \ + return vep->rv_event_stats.stat; \ + } + +//RV_EVENT_STAT_FUNC(rv_send_write_cqe_fail, send_write_cqe_fail) +//RV_EVENT_STAT_FUNC(rv_send_write_bytes, send_write_bytes) + +//RV_EVENT_STAT_FUNC(rv_recv_write_cqe, recv_write_cqe) +//RV_EVENT_STAT_FUNC(rv_recv_write_cqe_fail, recv_write_cqe_fail) +//RV_EVENT_STAT_FUNC(rv_recv_write_bytes, recv_write_bytes) +#undef RV_EVENT_STAT_FUNC + +static void register_rv_event_stats(psm2_ep_t ep) +{ + struct psm2_rv_event_stats *ep_rv_event_stats = &ep->verbs_ep.rv_event_stats; + + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECL_FUNC("send_write_cqe", rv_send_write_cqe), + PSMI_STATS_DECLU64("send_write_cqe_fail", (uint64_t*)&ep_rv_event_stats->send_write_cqe_fail), + PSMI_STATS_DECLU64("send_write_bytes", (uint64_t*)&ep_rv_event_stats->send_write_bytes), + + PSMI_STATS_DECLU64("recv_write_cqe", (uint64_t*)&ep_rv_event_stats->recv_write_cqe), + PSMI_STATS_DECLU64("recv_write_cqe_fail", (uint64_t*)&ep_rv_event_stats->recv_write_cqe_fail), + PSMI_STATS_DECLU64("recv_write_bytes", (uint64_t*)&ep_rv_event_stats->recv_write_bytes), + }; + + psmi_stats_register_type("RV_User_Event_Statistics", + PSMI_STATSTYPE_RV_EVENT, + entries, + PSMI_STATS_HOWMANY(entries), + ep->epid, ep); +} + +static void deregister_rv_event_stats(psm2_ep_t ep) +{ + psmi_stats_deregister_type(PSMI_STATSTYPE_RV_EVENT, ep); +} + +static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) +{ + struct local_info loc_info = { 0 }; + + loc_info.mr_cache_size = ep->rv_mr_cache_size; + loc_info.rdma_mode = IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)? + RV_RDMA_MODE_KERNEL: RV_RDMA_MODE_USER; + + // the rest of loc_info is really only needed for RV_RDMA_MODE_KERNEL + loc_info.port_num = ep->portnum; + loc_info.num_conn = ep->rv_num_conn; + // caller computes our local EPID, but loc_addr must == PSMI_EPID_GET_LID + // for what will be established as our local epid by psmi_context_open + // later rem_addr will be compared to this and is based on PSMI_EPID_GET_LID + // for a remote epid + if (ep->verbs_ep.port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { + // use IPv4 addr in lgid as local address + loc_info.loc_addr = ep->verbs_ep.ip_addr; + } else { + loc_info.loc_addr = ep->verbs_ep.port_attr.lid; + } + loc_info.index_bits = RV_INDEX_BITS; + loc_info.loc_gid_index = ep->verbs_ep.lgid_index; + loc_info.loc_gid = ep->verbs_ep.lgid; + // TBD qos_class_sl + loc_info.job_key_len = min(RV_MAX_JOB_KEY_LEN, sizeof(psm2_uuid_t)); + loc_info.job_key = (uint8_t*)job_key; + loc_info.service_id = ep->service_id; + loc_info.context = ep; + if (IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)) { + // HFI_TF_NFLOWS (32) limits recv side concurrent tidflows (aka inbound + // for send we never have more than hfi_num_send_rdma RDMA outstanding + loc_info.cq_entries = ep->hfi_num_send_rdma + HFI_TF_NFLOWS + 32; + } + loc_info.q_depth = ep->rv_q_depth; + loc_info.reconnect_timeout = ep->rv_reconnect_timeout; + loc_info.hb_interval = ep->rv_hb_interval; + + ep->verbs_ep.rv =__psm2_rv_open(ep->verbs_ep.ib_devname, &loc_info); + if (! 
ep->verbs_ep.rv) { + return PSM2_INTERNAL_ERR; + } + if (psmi_parse_identify()) { + printf("%s %s run-time rv interface v%d.%d\n", + hfi_get_mylabel(), hfi_ident_tag, + loc_info.major_rev, + loc_info.minor_rev); + } + // parallel psm_hal_gen1/psm_hal_inline_i.h handling HFI1_CAP_GPUDIRECT_OT + // psm_context.c will detect a CUDA driver w/non-CUDA PSM as fatal error +#ifndef RV_CAP_GPU_DIRECT +#ifdef PSM_CUDA +#error "Inconsistent build. RV_CAP_GPU_DIRECT must be defined for CUDA builds." +#else +// lifted from rv_user_ioctls.h +#define RV_CAP_GPU_DIRECT (1UL << 63) +#endif +#endif + if (loc_info.capability & RV_CAP_GPU_DIRECT) + psmi_hal_add_cap(PSM_HAL_CAP_GPUDIRECT_OT); + ep->verbs_ep.rv_index = loc_info.rv_index; + ep->rv_mr_cache_size = loc_info.mr_cache_size; + ep->rv_q_depth = loc_info.q_depth; + ep->rv_reconnect_timeout = loc_info.reconnect_timeout; + + if (IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)) { + register_rv_conn_stats(ep); + register_rv_event_stats(ep); + } + + return PSM2_OK; +} +#endif // RNDV_MOD_MR + +static psm2_error_t verbs_open_dev(psm2_ep_t ep, int unit, int port, psm2_uuid_t const job_key) +{ + // similar to code in ifs-all/Topology, enumerates devices and picks one + int i, num_of_devices; + struct ibv_device **dev_list = NULL; + struct ibv_device *ib_dev = NULL; + int err = PSM2_OK; + const char *unitpath = sysfs_unit_path(unit); + uint64_t hi, lo; + int flags; + + // callers tend not to set port, 0 means any + if (PSM3_NIC_PORT_ANY == port) + port = VERBS_PORT; + ep->portnum = port; + if (! unitpath) { + _HFI_ERROR( "NULL sysfs unitpath for unit %d\n", unit); + return PSM2_INTERNAL_ERR; + } + + char *dev_name = strrchr(unitpath, '/'); + if (dev_name == NULL) { + _HFI_ERROR( "invalid sysfs unitpath for unit %d\n", unit); + return PSM2_INTERNAL_ERR; + } + dev_name++; // Inc past last '/' + + ep->verbs_ep.ib_devname = psmi_strdup(ep, dev_name); + if (! ep->verbs_ep.ib_devname) { + _HFI_ERROR( "can't alloc devname"); + return PSM2_INTERNAL_ERR; + } + + dev_list = ibv_get_device_list(&num_of_devices); + if (num_of_devices <= 0) { + _HFI_ERROR(" Did not detect any RDMA devices \n"); + _HFI_ERROR(" If device exists, check if driver is up\n"); + err = PSM2_INTERNAL_ERR; + goto fail; + } + if (!dev_list) { + _HFI_ERROR(" Internal error, exiting.\n"); + err = PSM2_INTERNAL_ERR; + goto fail; + } + + for (i = 0; i < num_of_devices; i++) { + if (!strcmp(ibv_get_device_name(dev_list[i]), ep->verbs_ep.ib_devname)) + break; + } + if (i >= num_of_devices) { + _HFI_ERROR("Unit Id [%d] name %s not found, number of devices is %d\n", + unit, ep->verbs_ep.ib_devname, num_of_devices); + err = PSM2_INTERNAL_ERR; + goto fail; + } + ep->unit_id = unit; + _HFI_UDDBG("Using unit_id[%d] %s.\n", ep->unit_id, ep->verbs_ep.ib_devname); + + ib_dev = dev_list[i]; // device list order may differ from unit order + ep->verbs_ep.context = ibv_open_device(ib_dev); + if (! 
ep->verbs_ep.context) { + _HFI_ERROR( "Unable to open %s: %s\n", ep->verbs_ep.ib_devname, + strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } else { + _HFI_UDDBG("Opened %s.\n",ep->verbs_ep.ib_devname); + } + // change async events to non-blocking + flags = fcntl( ep->verbs_ep.context->async_fd, F_GETFL); + if (0 > fcntl( ep->verbs_ep.context->async_fd, F_SETFL, flags | O_NONBLOCK)) { + _HFI_ERROR( "Unable to change file descriptor of async events for %s: %s\n", + ep->verbs_ep.ib_devname, strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } + + if (ibv_query_port(ep->verbs_ep.context, ep->portnum, &ep->verbs_ep.port_attr)) { + _HFI_ERROR( "Unable to query port %u of %s: %s\n", ep->portnum, + ep->verbs_ep.ib_devname, strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } else { + _HFI_UDDBG("Queried %s.\n",ep->verbs_ep.ib_devname); + } + + if (0 != psmi_hal_get_port_subnet(ep->unit_id, ep->portnum, + &ep->gid_hi, &ep->gid_lo, // effective subnet and addr in subnet + &ep->verbs_ep.ip_addr, &ep->verbs_ep.ip_netmask, // if eth + &ep->verbs_ep.lgid_index, &hi, &lo)) { + _HFI_ERROR( "Unable to get subnet for port %u of %s: %s\n", ep->portnum, + ep->verbs_ep.ib_devname, strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } else { + ep->verbs_ep.lgid.global.subnet_prefix = __cpu_to_be64(hi); + ep->verbs_ep.lgid.global.interface_id = __cpu_to_be64(lo); + _HFI_UDDBG("Subnet for port %u of %s: 0x%"PRIx64" addr 0x%"PRIx64" gid 0x%"PRIx64":0x%"PRIx64"\n", + ep->portnum, ep->verbs_ep.ib_devname, + ep->gid_hi, ep->gid_lo, hi, lo); + } + +#ifdef RNDV_MOD_MR + if (IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode) + || ep->mr_cache_mode == MR_CACHE_MODE_KERNEL ) { + // cache mode is only set when rdmamode is enabled (eg. kernel or user) + psmi_assert(ep->rdmamode & IPS_PROTOEXP_FLAG_ENABLED); + // open rendezvous module for the same port as our verbs device + err = open_rv(ep, job_key); + if (err != PSM2_OK) { + _HFI_ERROR( "Unable to open rendezvous module for port %u of %s.\n", + ep->portnum, ep->verbs_ep.ib_devname); + // TBD - could ignore error and proceed with UD mode + //err = PSM2_OK; + err = PSM2_INTERNAL_ERR; + goto fail; + } + } +#endif + +done: + if (dev_list) + ibv_free_device_list(dev_list); + return err; + +fail: + if (ep->verbs_ep.context) { + ibv_close_device(ep->verbs_ep.context); + ep->verbs_ep.context = NULL; + } + if (ep->verbs_ep.ib_devname) { + psmi_free(ep->verbs_ep.ib_devname); + ep->verbs_ep.ib_devname = NULL; + } + goto done; +} + +static psm2_error_t +check_port_state(psm2_ep_t ep) +{ + uint32_t active_mtu; + + active_mtu = MTU_SIZE(ep->verbs_ep.port_attr.active_mtu); + if (ep->verbs_ep.port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) { + _HFI_UDDBG("running on ethernet at %d MTU\n", active_mtu); + } else { + _HFI_UDDBG( "running on %s at %d MTU\n", link_layer_str(ep->verbs_ep.port_attr.link_layer), active_mtu); + } + if (strcmp("Unknown", link_layer_str(ep->verbs_ep.port_attr.link_layer)) == 0) { + _HFI_ERROR( "Link layer on port %d of %s is Unknown\n", ep->portnum, + ep->verbs_ep.ib_devname); + return PSM2_INTERNAL_ERR; + } + ep->verbs_ep.link_layer = ep->verbs_ep.port_attr.link_layer; + + if (ep->verbs_ep.port_attr.state != IBV_PORT_ACTIVE) { + _HFI_ERROR( " Port state is not active for %s port %d: %d\n", + ep->verbs_ep.ib_devname, ep->portnum, + ep->verbs_ep.port_attr.state); + //_HFI_ERROR( " Port number %d on %s state is %s\n", + //params->ib_port, ep->verbs_ep.ib_devname, + //portStates[ep->verbs_ep.port_attr.state]); + return PSM2_INTERNAL_ERR; 
+ } + + // compute MTU. + // ep->mtu is the PSM payload size. For OPA native mode, this did not + // include headers as OPA allowed up to an additional 128 bytes of headers. + // However all UD QP payloads (including PSM headers) are + // counted toward MTU in UD verbs. So need to discount by PSM header size + ep->mtu = active_mtu - MAX_PSM_HEADER; + _HFI_UDDBG("Max PSM payload (aka MTU): %u\n", ep->mtu); + // TBD - *act_mtu = defined constant, we can use an eager RC message size + // for PSM which is larger than packet MTU + ep->verbs_ep.active_rate = verbs_get_rate( + ep->verbs_ep.port_attr.active_width, + ep->verbs_ep.port_attr.active_speed); + return PSM2_OK; +} + +static struct ibv_qp* ud_qp_create(psm2_ep_t ep) +{ + struct ibv_qp* qp = NULL; + + struct ibv_qp_init_attr attr = { 0 }; + + attr.qp_context = ep; // our own pointer + attr.send_cq = ep->verbs_ep.send_cq; + attr.recv_cq = ep->verbs_ep.recv_cq; + // one extra WQE to be safe in case verbs needs a spare WQE + attr.cap.max_send_wr = ep->hfi_num_send_wqes+1; + attr.cap.max_send_sge = 1; + attr.cap.max_inline_data = ep->hfi_imm_size; + + attr.srq = NULL; + attr.cap.max_recv_wr = ep->hfi_num_recv_wqes; + attr.cap.max_recv_sge = 1; + + attr.qp_type = IBV_QPT_UD; + + qp = ibv_create_qp(ep->verbs_ep.pd, &attr); + if (qp == NULL && errno == ENOMEM) { + _HFI_ERROR( "Unable to create UD QP on %s: %s\n", + ep->verbs_ep.ib_devname, strerror(errno)); + _HFI_ERROR( "Requested QP size might be too big. Try reducing TX depth and/or inline size.\n"); + _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", + ep->hfi_num_send_wqes+1, ep->hfi_num_recv_wqes); + } + + // attr reports what we got, double check and react in case + ep->verbs_ep.qp_cap = attr.cap; + + // QP adjusted values due to HW limits + if (ep->hfi_imm_size > attr.cap.max_inline_data) { + _HFI_UDDBG( "Limited to inline size of %d, requested %u\n", + attr.cap.max_inline_data, ep->hfi_imm_size); + } else { + _HFI_UDDBG("Inline Size: %u\n", attr.cap.max_inline_data); + } + if (ep->hfi_num_send_wqes+1 > attr.cap.max_send_wr) { + _HFI_UDDBG( "Limited to %d SQ WQEs, requested %u\n", + attr.cap.max_send_wr, ep->hfi_num_send_wqes+1); + } else { + _HFI_UDDBG("SQ WQEs: %u\n", attr.cap.max_send_wr); + } + if (1 > attr.cap.max_send_sge) { + _HFI_UDDBG( "Limited to %d SQ SGEs\n", + attr.cap.max_send_sge); + } + if (ep->hfi_num_recv_wqes > attr.cap.max_recv_wr) { + _HFI_UDDBG( "Limited to %d RQ WQEs, requested %u\n", + attr.cap.max_recv_wr, ep->hfi_num_recv_wqes); + } else { + _HFI_UDDBG("RQ WQEs: %u\n", attr.cap.max_recv_wr); + } + if (1 > attr.cap.max_recv_sge) { + _HFI_UDDBG( "Limited to %d RQ SGEs\n", + attr.cap.max_recv_sge); + } + + return qp; +} + +static psm2_error_t modify_ud_qp_to_init(psm2_ep_t ep, struct ibv_qp *qp) +{ + struct ibv_qp_attr attr = { 0 }; + int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY; + + attr.qp_state = IBV_QPS_INIT; + attr.pkey_index = ep->network_pkey_index; + attr.port_num = ep->portnum; + attr.qkey = ep->verbs_ep.qkey; + //attr.qp_access_flags N/A for UD + //flags |= IBV_QP_ACCESS_FLAGS; + + if (ibv_modify_qp(qp, &attr,flags)) { + _HFI_ERROR( "Failed to modify UD QP to INIT on %s: %s\n", + ep->verbs_ep.ib_devname,strerror(errno)); + return PSM2_INTERNAL_ERR; + } + return PSM2_OK; +} + +static psm2_error_t modify_ud_qp_to_rtr(psm2_ep_t ep,struct ibv_qp *qp) +{ + struct ibv_qp_attr attr = { 0 }; + int flags = IBV_QP_STATE; + + attr.qp_state = IBV_QPS_RTR; + + if (ibv_modify_qp(qp, &attr, flags)) { + _HFI_ERROR( "Failed to 
modify UD QP to RTR on %s: %s\n", + ep->verbs_ep.ib_devname,strerror(errno)); + return PSM2_INTERNAL_ERR; + } + return PSM2_OK; +} + +static psm2_error_t modify_ud_qp_to_rts(psm2_ep_t ep, struct ibv_qp *qp) +{ + struct ibv_qp_attr attr = { 0 }; + int flags = IBV_QP_STATE | IBV_QP_SQ_PSN; + + attr.qp_state = IBV_QPS_RTS; + attr.sq_psn = 0x1234; // doesn't really matter for UD + + if (ibv_modify_qp(qp, &attr, flags)) { + _HFI_ERROR( "Failed to modify UD QP to RTS on %s: %s\n", + ep->verbs_ep.ib_devname,strerror(errno)); + return PSM2_INTERNAL_ERR; + } + return PSM2_OK; +} + +struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) +{ + struct ibv_qp* qp = NULL; + + struct ibv_qp_init_attr attr; + memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); + + attr.qp_context = context; + attr.send_cq = ep->verbs_ep.send_cq; + attr.recv_cq = ep->verbs_ep.recv_cq; + attr.srq = NULL; + // one extra WQE to be safe in case verbs needs a spare WQE + if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + // need to be prepared in case all sends posted to same RC QP, so + // match the number of send buffers we plan to allocate + attr.cap.max_send_wr = ep->hfi_num_send_wqes+ep->hfi_num_send_rdma+1; + attr.cap.max_send_sge = 1; + // inline data helps latency and message rate for small sends + // Later we may explore use of + // send SGEs pointing to application buffers, somewhat like WFR send DMA + attr.cap.max_inline_data = ep->hfi_imm_size; + attr.cap.max_recv_wr = ep->hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION;// TBD + attr.cap.max_recv_sge = 1; + } else { + // only RDMA Write w/immediate + attr.cap.max_send_wr = ep->hfi_num_send_rdma+1; + attr.cap.max_send_sge = 1; + attr.cap.max_inline_data = 0; + // incoming Write w/immediate consumes a RQ WQE but no buffer needed + attr.cap.max_recv_wr = HFI_TF_NFLOWS+1; + attr.cap.max_recv_sge = 0; + } + + attr.qp_type = IBV_QPT_RC; + + qp = ibv_create_qp(ep->verbs_ep.pd, &attr); + if (qp == NULL) { + _HFI_ERROR( "Unable to create RC QP on %s: %s\n", + ep->verbs_ep.ib_devname, strerror(errno)); + _HFI_ERROR( "Requested QP size might be too big. 
Try reducing TX depth and/or inline size.\n"); + _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", + ep->hfi_num_send_wqes+1, + ep->hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION); + return NULL; + } + +// TBD - getting too small resources should be fatal or adjust limits to be smaller + if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + // QP adjusted values due to HW limits + if (ep->hfi_imm_size > attr.cap.max_inline_data) { + _HFI_UDDBG( "Limited to inline size of %d, requested %u\n", + attr.cap.max_inline_data, ep->hfi_imm_size); + } else { + _HFI_UDDBG("Inline Size: %u\n", attr.cap.max_inline_data); + } + if (ep->hfi_num_send_wqes+ep->hfi_num_send_rdma+1 > attr.cap.max_send_wr) { + _HFI_UDDBG( "Limited to %d SQ WQEs, requested %u\n", + attr.cap.max_send_wr, ep->hfi_num_send_wqes+ep->hfi_num_send_rdma+1); + } else { + _HFI_UDDBG("SQ WQEs: %u\n", attr.cap.max_send_wr); + } + if (1 > attr.cap.max_send_sge) { + _HFI_UDDBG( "Limited to %d SQ SGEs\n", + attr.cap.max_send_sge); + } + if (ep->hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION > attr.cap.max_recv_wr) { + _HFI_UDDBG( "Limited to %d RQ WQEs, requested %u\n", + attr.cap.max_recv_wr, ep->hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION); + } else { + _HFI_UDDBG("RQ WQEs: %u\n", attr.cap.max_recv_wr); + } + if (1 > attr.cap.max_recv_sge) { + _HFI_UDDBG( "Limited to %d RQ SGEs\n", + attr.cap.max_recv_sge); + } + } else { + // QP adjusted values due to HW limits + if (ep->hfi_num_send_rdma+1 > attr.cap.max_send_wr) { + _HFI_UDDBG( "Limited to %d SQ WQEs, requested %u\n", + attr.cap.max_send_wr, ep->hfi_num_send_rdma+1); + } else { + _HFI_UDDBG("SQ WQEs: %u\n", attr.cap.max_send_wr); + } + if (1 > attr.cap.max_send_sge) { + _HFI_UDDBG( "Limited to %d SQ SGEs\n", + attr.cap.max_send_sge); + } + if (HFI_TF_NFLOWS+1 > attr.cap.max_recv_wr) { + _HFI_UDDBG( "Limited to %d RQ WQEs, requested %u\n", + attr.cap.max_recv_wr, HFI_TF_NFLOWS+1); + } else { + _HFI_UDDBG("RQ WQEs: %u\n", attr.cap.max_recv_wr); + } + } + + if (cap) + *cap = attr.cap; + _HFI_MMDBG("created RC QP %d\n", qp->qp_num); + return qp; +} + +void rc_qp_destroy(struct ibv_qp* qp) +{ + ibv_destroy_qp(qp); +} + +psm2_error_t modify_rc_qp_to_init(psm2_ep_t ep, struct ibv_qp *qp) +{ + struct ibv_qp_attr attr = { 0 }; + int flags = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT; + + attr.qp_state = IBV_QPS_INIT; + attr.pkey_index = ep->network_pkey_index; + attr.port_num = ep->portnum; + + //attr.qkey = ep->verbs_ep.qkey; + //flags |= IBV_QP_QKEY; // only allowed for UD + attr.qp_access_flags = 0; + attr.qp_access_flags |= IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE; + //attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC; + flags |= IBV_QP_ACCESS_FLAGS; + + if (ibv_modify_qp(qp, &attr, flags)) { + _HFI_ERROR( "Failed to modify RC QP to INIT on %s: %s\n", + ep->verbs_ep.ib_devname, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + _HFI_MMDBG("moved %d to INIT\n", qp->qp_num); + return PSM2_OK; +} + +// initpsn is from packet we received +// req_attr is from REQ or REP from other side +psm2_error_t modify_rc_qp_to_rtr(psm2_ep_t ep, struct ibv_qp *qp, + const struct psm_rc_qp_attr *req_attr, + const ips_path_rec_t *path_rec, uint32_t initpsn) +{ + int flags = IBV_QP_STATE; + struct ibv_qp_attr attr = { 0 }; + + attr.qp_state = IBV_QPS_RTR; + + ips_path_rec_to_ah_attr(ep, path_rec, &attr.ah_attr); + flags |= IBV_QP_AV; + + // TBD - we already factored in req vs pr to update pr no need + // for modify_cq_qp_to_rtr to repeat it + // pr_mtu is max PSM 
paylod in bytes and req_attr_mtu is IB enum + attr.path_mtu = MIN(opa_mtu_int_to_enum(path_rec->pr_mtu), req_attr->mtu); + attr.dest_qp_num = req_attr->qpn; + attr.rq_psn = initpsn; + flags |= (IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN); + + _HFI_UDDBG("set max_dest_rd_atomic to %u\n", attr.max_dest_rd_atomic); + attr.min_rnr_timer = 12; // TBD well known + flags |= (IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC); + + if (ibv_modify_qp(qp, &attr, flags)) { + _HFI_ERROR( "Failed to modify RC QP to RTR on %s: %s\n", + ep->verbs_ep.ib_devname, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + _HFI_MMDBG("moved %d to RTR\n", qp->qp_num); + + return PSM2_OK; +} + +// initpsn is value we sent in our req and rep +// req_attr is from REP we received from other side +psm2_error_t modify_rc_qp_to_rts(psm2_ep_t ep, struct ibv_qp *qp, + const struct psm_rc_qp_attr *req_attr, uint32_t initpsn) +{ + int flags = IBV_QP_STATE; + struct ibv_qp_attr attr = { 0 }; + + attr.qp_state = IBV_QPS_RTS; + + attr.sq_psn = initpsn; // value we told other side + flags |= IBV_QP_SQ_PSN; + + _HFI_UDDBG("set max_rd_atomic to %u\n", attr.max_rd_atomic); + flags |= IBV_QP_MAX_QP_RD_ATOMIC; + + attr.retry_cnt = ep->hfi_qp_retry; + attr.rnr_retry = ep->hfi_qp_retry; // only for eager RC QP rdmamode + attr.timeout = ep->hfi_qp_timeout; + flags |= IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_TIMEOUT; + + _HFI_MMDBG("moving %d to RTS\n", qp->qp_num); + if (ibv_modify_qp(qp, &attr, flags)) { + _HFI_ERROR( "Failed to modify RC QP to RTS on %s: %s\n", + ep->verbs_ep.ib_devname, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //__psm2_dump_verbs_qp(qp); + return PSM2_OK; +} + +/****************************************************************************** + * * Try to map verbs' link layer types to a descriptive string or "Unknown" + * ******************************************************************************/ +static const char *link_layer_str(int8_t link_layer) +{ + switch (link_layer) { + + case IBV_LINK_LAYER_UNSPECIFIED: + case IBV_LINK_LAYER_INFINIBAND: + return "IB"; + case IBV_LINK_LAYER_ETHERNET: + return "Ethernet"; + default: + return "Unknown"; + } +} + +int __psm2_nonzero_gid(const union ibv_gid *gid) +{ + static union ibv_gid zero_gid = { { 0 } }; + + return memcmp(gid, &zero_gid, sizeof(*gid)) != 0; +} + +char * +__psm2_dump_gid(union ibv_gid *gid, char *buf, size_t bufsize) +{ + snprintf(buf, bufsize, "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:" + "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x", + gid->raw[0], gid->raw[1], gid->raw[2], gid->raw[3], + gid->raw[4], gid->raw[5], gid->raw[6], gid->raw[7], + gid->raw[8], gid->raw[9], gid->raw[10], gid->raw[11], + gid->raw[12], gid->raw[13], gid->raw[14], gid->raw[15]); + + return buf; +} + +void +__psm2_dump_verbs_ep(psm2_ep_t ep, unsigned igid) +{ + struct psm2_verbs_ep *vep = &(ep->verbs_ep); + union ibv_gid gid; + + printf("ib_devname = %s\n", vep->ib_devname); + printf("qp_num = %u\n", vep->qp->qp_num); + printf("GID = "); + if (0 == ibv_query_gid(vep->context, ep->portnum, igid, &gid)) { + char buf[80]; + printf("%s\n", __psm2_dump_gid(&gid, buf, sizeof(buf))); + } else { + printf("unavailable.\n"); + } +} + +void +__psm2_dump_verbs_qp(struct ibv_qp *qp) +{ + struct ibv_qp_attr attr; + struct ibv_qp_init_attr init_attr; + int mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_CAP + /*| IBV_QP_RATE_LIMIT*/ ; + if (qp->qp_type == IBV_QPT_RC) { + mask |= IBV_QP_ACCESS_FLAGS | IBV_QP_AV | IBV_QP_PATH_MTU + | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | 
IBV_QP_RNR_RETRY + | IBV_QP_RQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC + // | IBV_QP_ALT_PATH + | IBV_QP_MIN_RNR_TIMER | IBV_QP_SQ_PSN + | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_PATH_MIG_STATE + | IBV_QP_DEST_QPN; + } else { + mask |= IBV_QP_QKEY; + } + if (ibv_query_qp(qp, &attr, mask, &init_attr)) { + printf("unable to query QP\n"); + return; + } + // rate_limit field not available in some versions of verbs.h + //printf("QP %p (%u), type %u state %u PkeyIndx %u Port %u rate %u draining %u\n", + // qp, qp->qp_num, qp->qp_type, attr.qp_state, attr.pkey_index, + // attr.port_num, attr.rate_limit, attr.sq_draining); + printf("QP %p (%u), type %u state %u PkeyIndx %u Port %u draining %u\n", + qp, qp->qp_num, qp->qp_type, attr.qp_state, attr.pkey_index, + attr.port_num, attr.sq_draining); + printf(" send: wr %u sge %u inline %u recv: wr %u sqe %u\n", + attr.cap.max_send_wr, attr.cap.max_send_sge, attr.cap.max_inline_data, + attr.cap.max_recv_wr, attr.cap.max_recv_sge); + printf(" context %p send_cq %p recv_cq %p srq %p sg_sig_all %u\n", + init_attr.qp_context, init_attr.send_cq, init_attr.recv_cq, + init_attr.srq, init_attr.sq_sig_all); + if (qp->qp_type == IBV_QPT_RC) { + char buf[80]; + printf(" mtu %u mig %u rq_psn %u sq_psn %u dest_qp %u access %u\n", + attr.path_mtu, attr.path_mig_state, attr.rq_psn, attr.sq_psn, + attr.dest_qp_num, attr.qp_access_flags); + printf(" max_rd_atomic %u max_dest_rd_atomic %u\n", + attr.max_rd_atomic, attr.max_dest_rd_atomic); + printf(" min_rnr_timer %u timeout %u retry_cnt %u rnr_retry %u\n", + attr.min_rnr_timer, attr.timeout, attr.retry_cnt, attr.rnr_retry); + printf(" ah_attr: port %u dlid %u sl %u src_path_bits %u rate %u global %u\n", + attr.ah_attr.port_num, attr.ah_attr.dlid, + attr.ah_attr.sl, + attr.ah_attr.src_path_bits, attr.ah_attr.static_rate, + attr.ah_attr.is_global); + if (attr.ah_attr.is_global) { + printf(" dgid: %s\n", + __psm2_dump_gid(&attr.ah_attr.grh.dgid, buf, sizeof(buf))); + printf(" flow %u sgid_idx %u hop %u tc %u\n", + attr.ah_attr.grh.flow_label, attr.ah_attr.grh.sgid_index, + attr.ah_attr.grh.hop_limit, attr.ah_attr.grh.traffic_class); + } + printf(" alt_ah_attr: port %u dlid %u sl %u src_path_bits %u rate %u global %u\n", + attr.alt_ah_attr.port_num, attr.alt_ah_attr.dlid, + attr.alt_ah_attr.sl, + attr.alt_ah_attr.src_path_bits, attr.alt_ah_attr.static_rate, + attr.alt_ah_attr.is_global); + if (attr.alt_ah_attr.is_global) { + printf(" dgid: %s\n", + __psm2_dump_gid(&attr.alt_ah_attr.grh.dgid, buf, sizeof(buf))); + printf(" flow %u sgid_idx %u hop %u tc %u\n", + attr.alt_ah_attr.grh.flow_label, attr.alt_ah_attr.grh.sgid_index, + attr.alt_ah_attr.grh.hop_limit, attr.alt_ah_attr.grh.traffic_class); + } + printf(" alt pkey idx %u alt port %u alt timeout %u\n", + attr.alt_pkey_index, attr.alt_port_num, attr.alt_timeout); + } else { + printf("qkey: 0x%x\n", attr.qkey); + } + return; +} + +static enum psm_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) +{ + switch (width) { + case 1: /* 1x */ + switch (speed) { + case 1: return PSM_IBV_RATE_2_5_GBPS; + case 2: return PSM_IBV_RATE_5_GBPS; + case 4: /* fall through */ + case 8: return PSM_IBV_RATE_10_GBPS; + case 16: return PSM_IBV_RATE_14_GBPS; + case 32: return PSM_IBV_RATE_25_GBPS; + case 64: return PSM_IBV_RATE_50_GBPS; + default: + _HFI_ERROR( "unknown link speed 0x%x\n", speed); + return PSM_IBV_RATE_100_GBPS; + } + case 2: /* 4x */ + switch (speed) { + case 1: return PSM_IBV_RATE_10_GBPS; + case 2: return PSM_IBV_RATE_20_GBPS; + case 4: /* fall through */ + case 8: return 
PSM_IBV_RATE_40_GBPS; + case 16: return PSM_IBV_RATE_56_GBPS; + case 32: return PSM_IBV_RATE_100_GBPS; + case 64: return PSM_IBV_RATE_200_GBPS; + default: + _HFI_ERROR( "unknown link speed 0x%x\n", speed); + return PSM_IBV_RATE_100_GBPS; + } + case 4: /* 8x */ + switch (speed) { + case 1: return PSM_IBV_RATE_20_GBPS; + case 2: return PSM_IBV_RATE_40_GBPS; + case 4: /* fall through */ + case 8: return PSM_IBV_RATE_80_GBPS; + case 16: return PSM_IBV_RATE_112_GBPS; + case 32: return PSM_IBV_RATE_200_GBPS; + case 64: return PSM_IBV_RATE_400_GBPS; + default: + _HFI_ERROR( "unknown link speed 0x%x\n", speed); + return PSM_IBV_RATE_100_GBPS; + } + case 8: /* 12x */ + switch (speed) { + case 1: return PSM_IBV_RATE_30_GBPS; + case 2: return PSM_IBV_RATE_60_GBPS; + case 4: /* fall through */ + case 8: return PSM_IBV_RATE_120_GBPS; + case 16: return PSM_IBV_RATE_168_GBPS; + case 32: return PSM_IBV_RATE_300_GBPS; + case 64: return PSM_IBV_RATE_600_GBPS; + default: + _HFI_ERROR( "unknown link speed 0x%x\n", speed); + return PSM_IBV_RATE_100_GBPS; + } + default: + _HFI_ERROR( "unknown link width 0x%x\n", width); + return PSM_IBV_RATE_100_GBPS; + } +} + +// unfortunately ibv_rate_to_mult and mult_to_ibv_rate have a bug as they +// omit 100g rate and some others, so we create our own +static int my_ibv_rate_to_mult(enum psm_ibv_rate rate) +{ + switch (rate) { + case PSM_IBV_RATE_2_5_GBPS: return 1; + case PSM_IBV_RATE_5_GBPS: return 2; + case PSM_IBV_RATE_10_GBPS: return 4; + case PSM_IBV_RATE_20_GBPS: return 8; + case PSM_IBV_RATE_30_GBPS: return 12; + case PSM_IBV_RATE_40_GBPS: return 16; + case PSM_IBV_RATE_60_GBPS: return 24; + case PSM_IBV_RATE_80_GBPS: return 32; + case PSM_IBV_RATE_120_GBPS: return 48; + case PSM_IBV_RATE_14_GBPS: return 5; + case PSM_IBV_RATE_56_GBPS: return 22; + case PSM_IBV_RATE_112_GBPS: return 44; + case PSM_IBV_RATE_168_GBPS: return 67; + case PSM_IBV_RATE_25_GBPS: return 10; + case PSM_IBV_RATE_100_GBPS: return 40; + case PSM_IBV_RATE_200_GBPS: return 80; + case PSM_IBV_RATE_300_GBPS: return 120; + case PSM_IBV_RATE_28_GBPS: return 11; + case PSM_IBV_RATE_50_GBPS: return 20; + case PSM_IBV_RATE_400_GBPS: return 160; + case PSM_IBV_RATE_600_GBPS: return 240; + default: return 40; + } +} + +static enum psm_ibv_rate my_mult_to_ibv_rate(int mult) +{ + switch (mult) { + case 1: return PSM_IBV_RATE_2_5_GBPS; + case 2: return PSM_IBV_RATE_5_GBPS; + case 4: return PSM_IBV_RATE_10_GBPS; + case 8: return PSM_IBV_RATE_20_GBPS; + case 12: return PSM_IBV_RATE_30_GBPS; + case 16: return PSM_IBV_RATE_40_GBPS; + case 24: return PSM_IBV_RATE_60_GBPS; + case 32: return PSM_IBV_RATE_80_GBPS; + case 48: return PSM_IBV_RATE_120_GBPS; + case 5: return PSM_IBV_RATE_14_GBPS; + case 22: return PSM_IBV_RATE_56_GBPS; + case 44: return PSM_IBV_RATE_112_GBPS; + case 67: return PSM_IBV_RATE_168_GBPS; + case 10: return PSM_IBV_RATE_25_GBPS; + case 40: return PSM_IBV_RATE_100_GBPS; + case 80: return PSM_IBV_RATE_200_GBPS; + case 120: return PSM_IBV_RATE_300_GBPS; + case 11: return PSM_IBV_RATE_28_GBPS; + case 20: return PSM_IBV_RATE_50_GBPS; + case 160: return PSM_IBV_RATE_400_GBPS; + case 240: return PSM_IBV_RATE_600_GBPS; + default: return PSM_IBV_RATE_100_GBPS; + } +} + + +enum psm_ibv_rate min_rate(enum psm_ibv_rate a, enum psm_ibv_rate b) +{ + // unfortunately the ibv_rate enum is not sorted by link rate + // so we must convert to "mult" to compare then convert back + return my_mult_to_ibv_rate(min(my_ibv_rate_to_mult(a), + my_ibv_rate_to_mult(b))); +} + diff --git a/prov/psm3/psm3/psm_verbs_ep.h 
b/prov/psm3/psm3/psm_verbs_ep.h new file mode 100644 index 00000000000..f8719d633cb --- /dev/null +++ b/prov/psm3/psm3/psm_verbs_ep.h @@ -0,0 +1,395 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + + +#ifndef _PSMI_IN_USER_H +#error psm_verbs_ep.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_VERBS_EP_H +#define _PSMI_VERBS_EP_H + +#include +#ifdef RNDV_MOD_MR +#include +#endif +#include "ptl_ips/ips_path_rec.h" + +#define MAX_PSM_HEADER 64 // sizeof(ips_lrh) == 56, round up to 64 + +// defaults, these are reconfigurable with: +// PSM3_SEND_IMMEDIATE_SIZE +// PSM3_NUM_SEND_WQES +// PSM3_NUM_RECV_WQES +// PSM3_QP_TIMEOUT +// PSM3_QP_RETRY +#define VERBS_SEND_MAX_INLINE 64 // 56 is PSM header size +#define VERBS_SEND_QP_ENTRIES 4080 // will round down to multiple of COALLESCE +#define VERBS_NUM_SEND_RDMA 128 // max conurrent RDMA send WQEs per NIC +#define VERBS_RECV_QP_ENTRIES 4095 // avoid CQ overflow, CVL may be limited to 4095? 
+#define VERBS_QP_TIMEOUT 536870 // in microseconds (17) +#define VERBS_QP_RETRY 7 // limit on RC QP retries for rnr or timeout +#define VERBS_QP_MAX_RETRY 7 // max allowed by verbs for QP_RETRY + +// hardcoded for now +#define VERBS_RECV_QP_FRACTION 4 // size RC QPs as 1/FRACTION of the + // final UD RECV QP size + // only ask for a completion this often. + // If 1, ask for completion on every send. +#define VERBS_SEND_CQ_COALLESCE 8 + // For USE_RC, keep this modest as we + // could have up to this many -1 unsignaled + // WQEs per QP, which may consume send bufs + // for quite some time if the QP is only + // occasionally used + // if we have ~100 QPs and 1000s of send + // buffers, this should be ok +#define VERBS_RECV_QP_COALLESCE 16 // gather and build this many recv WQEs + // before post on recv Q. + // Reduces verbs calls + // if 1, post as we recv them +#define VERBS_SEND_CQ_REAP 256 // check for completions when this many unreaped +#define VERBS_PORT 1 // default port if not specified +#define VERBS_RECV_CQE_BATCH 1 // how many CQEs to ask for at a time +#define UD_ADDITION (40) // extra bytes at start of UD recv buffer + // defined in verbs API to accomidate IB GRH +#define BUFFER_HEADROOM 0 // how much extra to allocate in buffers + // as a paranoid headroom for use of more than + // intended. Was 64, but seems we can do + // without it and hence make buffers better + // page aligned + // value here should be a multiple of CPU + // cache size +#define CPU_PAGE_ALIGN PSMI_PAGESIZE // boundary to align buffer pools for +#include "psm_verbs_mr.h" + +// some older distros lack some of the rates, so define our own list here +enum psm_ibv_rate { + //PSM_IBV_RATE_MAX = 0, + PSM_IBV_RATE_2_5_GBPS = 2, + PSM_IBV_RATE_5_GBPS = 5, + PSM_IBV_RATE_10_GBPS = 3, + PSM_IBV_RATE_20_GBPS = 6, + PSM_IBV_RATE_30_GBPS = 4, + PSM_IBV_RATE_40_GBPS = 7, + PSM_IBV_RATE_60_GBPS = 8, + PSM_IBV_RATE_80_GBPS = 9, + PSM_IBV_RATE_120_GBPS = 10, + PSM_IBV_RATE_14_GBPS = 11, + PSM_IBV_RATE_56_GBPS = 12, + PSM_IBV_RATE_112_GBPS = 13, + PSM_IBV_RATE_168_GBPS = 14, + PSM_IBV_RATE_25_GBPS = 15, + PSM_IBV_RATE_100_GBPS = 16, + PSM_IBV_RATE_200_GBPS = 17, + PSM_IBV_RATE_300_GBPS = 18, + PSM_IBV_RATE_28_GBPS = 19, + PSM_IBV_RATE_50_GBPS = 20, + PSM_IBV_RATE_400_GBPS = 21, + PSM_IBV_RATE_600_GBPS = 22, +}; + +// Per IBTA the wc.opcode is undefined in error CQEs +// so we need to save that information in the wr_id. +// Fortunately our wr_id's are well aligned pointers so +// we can stash the flag in the low bits of wr_id +#define VERBS_SQ_WR_ID_SEND 0x0 +#define VERBS_SQ_WR_ID_RDMA_WRITE 0x1 +#define VERBS_SQ_WR_ID_MASK 0x1 +#define VERBS_SQ_WR_OP(wr_id) ((wr_id)&VERBS_SQ_WR_ID_MASK) +#define VERBS_SQ_WR_OP_STR(wr_id) (VERBS_SQ_WR_OP(wr_id)?"RDMA Write":"Send") + +struct verbs_sbuf { + struct verbs_sbuf *next; + uint8_t *buffer; + struct psm2_verbs_send_allocator *allocator; +}; +typedef struct verbs_sbuf *sbuf_t; +#define sbuf_to_buffer(buf) ((buf)->buffer) +#define sbuf_pool(ep, buf) ((buf)->allocator->pool) +#define sbuf_lkey(ep, buf) (sbuf_pool(ep, buf)->send_buffer_mr->lkey) + + +// when we get a CQE we need to find the pool and the QP it came from +// (pool has a reference to the qp). +// unfortunately, the CQE has a qp_num but not a ibv_qp pointer. So we need +// to keep this information here and use this structure as the wr_id for our +// RQ WQE. 
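The SQ side uses the same trick the comment above describes for the RQ: the low bit of the 64-bit wr_id carries the operation type while the rest is a pointer. A minimal round-trip sketch of the VERBS_SQ_WR_ID_* scheme, using the types and macros defined above; the helper names are illustrative only and are not part of this header:

/* Sketch only: tag_send_wr_id()/untag_send_wr_id() are illustrative helper
 * names.  The allocator hands out verbs_sbuf structs, whose pointers are
 * aligned, so bit 0 is free to carry the operation type. */
static inline uint64_t tag_send_wr_id(sbuf_t sbuf, uint64_t op)
{
	uint64_t wr_id = (uint64_t)(uintptr_t)sbuf;

	/* same invariant the RDMA Write post path asserts */
	psmi_assert(!(wr_id & VERBS_SQ_WR_ID_MASK));
	return wr_id | op;	/* VERBS_SQ_WR_ID_SEND or VERBS_SQ_WR_ID_RDMA_WRITE */
}

static inline sbuf_t untag_send_wr_id(uint64_t wr_id)
{
	/* completion handling strips the op bit to recover the sbuf pointer */
	return (sbuf_t)(uintptr_t)(wr_id & ~(uint64_t)VERBS_SQ_WR_ID_MASK);
}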
+struct verbs_rbuf { + uint8_t *buffer; + struct psm2_verbs_recv_pool *pool;; +}; +typedef struct verbs_rbuf *rbuf_t; +#define rbuf_to_buffer(buf) ((buf)->buffer) +#define rbuf_addition(buf) ((buf)->pool->addition) +#define rbuf_qp(ep, buf) ((buf)->pool->qp) + +static inline const char*qp_type_str(struct ibv_qp *qp) { + return (qp->qp_type == IBV_QPT_UD)?"UD":"RC"; +} + +// subset of RC QP attr which we need to exchange in PSM req/rep +// when ! defined(USE_RC), it is zeroed to keep req/rep size consistent +// this structure is also used in REQ/REP packet format and size can't change +// list of fields comes from IB CM for RC QP connection +// These fields are purely information about sender: +// qpn, srq, target_ack_delay +// These fields are negotiated. +// mtu, responder_resources, initiator_depth +// Each side sends their best possible value and the receiver picks +// the min of it's own best and the REQ/REP received +struct psm_rc_qp_attr { + uint32_t qpn:24; + uint32_t mtu:4; // HW MTU for RC QP + uint32_t srq:1; // using SRQ + uint32_t resv:3; + uint8_t target_ack_delay:5; // 5 bits for computing timeout - TBD if need + uint8_t resv2:3; + // these control how many concurrent RDMA reads/atomics are allowed per QP + // the initiator of the RDMA reads must issue no more than target can handle + // can be 0 if we don't plan to use RDMA read + // behavior here is based on PSM CM approach, which differs from IB CM + // IB CM REP would have the result of the negotiated value + // for PSM CM, sender puts same values in REQ and REP + // receiver will use the min of its preferred value and the received value + // sent REQ/REP indicates what we desire to use from sender perspective + // responder_resources <= local CA max_qp_rd_atom + // initiator_depth <= local CA max_qp_init_rd_atom + // REQ/REP recipient sets values as follows: + // QP max_rd_atomic = MIN(our requested initiator_depth, + // received responder_resouces) + // QP max_dest_rd_atomic = MIN(our requested responder_resources, + // received initiator_depth) + // ibv_device_attr: + // CA max_qp_rd_atom - max incoming RDMA Reads (responder) + // CA max_qp_init_rd_atom -max outstanding outgoing RDMA Reads (initiator) + // ibv_qp_attr: + // QP max_dest_rd_atomic - max incoming RDMA Reads (responder) + // QP max_rd_atomic - max outstanding outgoing RDMA Reads (initiator) + uint8_t responder_resources; + uint8_t initiator_depth; + // QKey well known + // starting PSN - use initpsn in req/rep + // retry_cnt,rnr_retry_cnt - well known + // pkey - already known + // LID, GID, SL, etc - already known, same as UD QP + uint8_t resv3[17]; // future expansion, keeping struct mult of 64b +} PACK_SUFFIX; + +// pool of send buffers +// When USE_RC we allow multiple QPs to be to share the same send buffer pool. +struct psm2_verbs_send_pool { + // our preregistered send buffers + uint32_t send_buffer_size; + uint32_t send_total; + uint32_t send_num_free; + uint8_t *send_buffers; // aligned buffers for use + struct verbs_sbuf *send_bufs; + struct verbs_sbuf *send_free; // head of free list + struct ibv_mr *send_buffer_mr; +}; +typedef struct psm2_verbs_send_pool *psm2_verbs_send_pool_t; + +// track the list of allocated (aka inflight) send buffers so we +// can coalesce completions and still find all the completed buffers +// For USE_RC, we need an inflight list per QP to account for the unpredictable +// order of send CQEs from different QPs. 
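// In practice the coalescing works as follows: only every
// VERBS_SEND_CQ_COALLESCE-th send WQE asks for a completion, so a single send
// CQE lets __psm2_ep_verbs_free_sbuf() pop up to that many buffers off
// send_alloc_head, relying on the SQ completing WQEs in posted order within a
// QP (error/flush CQEs may return fewer than the full count).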
+struct psm2_verbs_send_allocator { + psm2_verbs_send_pool_t pool; // pool we allocate from + struct verbs_sbuf *send_alloc_head; // head of allocated list + struct verbs_sbuf *send_alloc_end; // end of allocated list + uint32_t send_num_til_coallesce; +}; +typedef struct psm2_verbs_send_allocator *psm2_verbs_send_allocator_t; + +// receive buffer pool +// we use the same basic mechanisms for UD and RC QP buffer pools +// but sizes may differ +// when USE_RC, we need a separate recv pool per QP so we can prepost bufs. +struct psm2_verbs_recv_pool { + struct ibv_qp *qp; // secondary reference to QP these buffers are for +#ifdef PSM_FI + psm2_ep_t ep; +#endif + // our preregistered recv buffers + uint32_t recv_buffer_size; + uint32_t recv_total; + uint8_t *recv_buffers; + struct ibv_mr *recv_buffer_mr; + uint32_t addition; // UD_ADDITION for UD QP, 0 for RC QP +#if VERBS_RECV_QP_COALLESCE > 1 + // list of ready to post WQEs and SGEs + struct ibv_recv_wr recv_wr_list[VERBS_RECV_QP_COALLESCE]; + struct ibv_sge recv_sge_list[VERBS_RECV_QP_COALLESCE]; + uint32_t next_recv_wqe; // next index in rsc_wr_list/sge_list to use +#endif + struct verbs_rbuf *recv_bufs; +}; +typedef struct psm2_verbs_recv_pool *psm2_verbs_recv_pool_t; + +// this structure can be part of psm2_ep +// one instance of this per local end point (NIC) +// we will create a single PD and UD QP with related resources to +// permit an eager data movement mechanism +// conceptually similar to a psmi_context_t which refers to an HFI context +// TODO - later could optimize cache hit rates by putting some of the less +// frequently used fields in a different part of psm2_ep struct +struct psm2_verbs_ep { + char *ib_devname; + //struct ibv_device *ib_dev; + struct ibv_context *context; + struct ibv_port_attr port_attr; + struct ibv_pd *pd; + struct ibv_comp_channel *recv_comp_channel; + union ibv_gid lgid; // The GID to use when sending. 
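	// For Ethernet (RoCE) ports this GID carries the IPv4 address that is
	// also cached in ip_addr below; open_rv() uses that address (or the IB
	// LID for IB ports) as loc_addr when the rendezvous module is enabled.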
+ unsigned lgid_index; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_qp *qp; + struct ibv_qp_cap qp_cap; // capabilities of QP we got + uint32_t qkey; + uint8_t link_layer; // IBV_LINK_LAYER_ETHERNET or other + uint8_t active_rate; + uint32_t ip_addr; // ip_addr (valid for link_layer == Eth) + uint32_t ip_netmask; // netmask (valid for link_layer == Eth) + struct psm2_verbs_send_pool send_pool; + struct psm2_verbs_send_allocator send_allocator; + uint32_t send_rdma_outstanding; // number of outstanding RDMAs + uint32_t send_reap_thresh; // TBD if should be here or in pool + struct psm2_verbs_recv_pool recv_pool; +#if VERBS_RECV_CQE_BATCH > 1 + struct ibv_wc recv_wc_list[VERBS_RECV_CQE_BATCH]; + int recv_wc_count; // number left in recv_wc_list + int recv_wc_next; // next index +#else + // if asked to revisit a packet we save it here + rbuf_t revisit_buf; + uint32_t revisit_payload_size; +#endif +#ifdef RNDV_MOD_MR + psm2_rv_t rv; // rendezvous module open handle + uint32_t rv_index; + struct psm2_rv_conn_stats rv_conn_stats; + struct psm2_rv_event_stats rv_event_stats; +#endif +}; + +// given index, return buffer start +#define send_buffer_start(pool, i) ((pool)->send_buffer_size *(i)) +// given buffer start, return index +#define send_buffer_index(pool, buf) (((buf)-(pool)->send_buffers)/(pool)->send_buffer_size) + +// given index, return buffer start +#define recv_buffer_start(pool, i) ((pool)->recv_buffer_size *(i)) +// given buffer start, return index +#define recv_buffer_index(pool, buf) (((buf)-(pool)->recv_buffers)/(pool)->recv_buffer_size) + +extern psm2_error_t __psm2_ep_open_verbs(psm2_ep_t ep, int unit, int port, psm2_uuid_t const job_key); +extern void __psm2_ep_free_verbs(psm2_ep_t ep); +extern psm2_error_t __psm2_ep_initialize_queues(psm2_ep_t ep); +extern struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, + struct ibv_qp_cap *cap); +extern void rc_qp_destroy(struct ibv_qp *qp); +extern psm2_error_t modify_rc_qp_to_init(psm2_ep_t ep, struct ibv_qp *qp); +extern psm2_error_t modify_rc_qp_to_rtr(psm2_ep_t ep, struct ibv_qp *qp, + const struct psm_rc_qp_attr *req_attr, + const ips_path_rec_t *path_rec, uint32_t initpsn); +extern psm2_error_t modify_rc_qp_to_rts(psm2_ep_t ep, struct ibv_qp *qp, + const struct psm_rc_qp_attr *req_attr, uint32_t initpsn); +extern int __psm2_ep_poll_type(int poll_type, psm2_ep_t ep); +extern psm2_error_t psm_verbs_alloc_send_pool(psm2_ep_t ep, struct ibv_pd *pd, + psm2_verbs_send_pool_t pool, + uint32_t send_total, uint32_t send_buffer_size); +extern psm2_error_t psm_verbs_init_send_allocator( + psm2_verbs_send_allocator_t allocator, + psm2_verbs_send_pool_t pool); +extern psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, + psm2_verbs_recv_pool_t pool, + uint32_t recv_total, uint32_t recv_buffer_size); +extern void psm_verbs_free_send_pool(psm2_verbs_send_pool_t pool); +extern void psm_verbs_free_recv_pool(psm2_verbs_recv_pool_t pool); +extern sbuf_t __psm2_ep_verbs_alloc_sbuf(psm2_verbs_send_allocator_t allocator); +extern void __psm2_ep_verbs_free_sbuf( + sbuf_t buf, uint32_t count); +extern psm2_error_t __psm2_ep_verbs_post_recv( + rbuf_t buf); +extern psm2_error_t __psm2_ep_verbs_prepost_recv(psm2_verbs_recv_pool_t pool); + +extern psm2_error_t psm2_verbs_post_rdma_write_immed(psm2_ep_t ep, + struct ibv_qp *qp, + void *loc_buf, struct psm2_verbs_mr *loc_mr, + uint64_t rem_buf, uint32_t rkey, + size_t len, uint32_t immed, uint64_t wr_id); + +#ifdef RNDV_MOD_MR +extern psm2_error_t 
psm2_verbs_post_rv_rdma_write_immed(psm2_ep_t ep, + psm2_rv_conn_t conn, + void *loc_buf, struct psm2_verbs_mr *loc_mr, + uint64_t rem_buf, uint32_t rkey, + size_t len, uint32_t immed, uint64_t wr_id, + uint8_t *sconn_index, uint32_t *conn_count); +#endif + +extern psm2_error_t psm2_verbs_completion_update(psm2_ep_t ep); + +extern void __psm2_dump_buf(uint8_t *buf, uint32_t len); +extern int __psm2_nonzero_gid(const union ibv_gid *gid); +extern char *__psm2_dump_gid(union ibv_gid *gid, char *buf, size_t bufsize); +extern void __psm2_dump_verbs_qp(struct ibv_qp *qp); +extern enum psm_ibv_rate min_rate(enum psm_ibv_rate a, enum psm_ibv_rate b); +#ifndef UD_SAMPLE +extern int verbs_get_port_index2pkey(psm2_ep_t ep, int port, int index); +#endif +#endif // _PSMI_VERBS_EP_H diff --git a/prov/psm3/psm3/psm_verbs_mr.c b/prov/psm3/psm3/psm_verbs_mr.c new file mode 100644 index 00000000000..e54d24deeec --- /dev/null +++ b/prov/psm3/psm3/psm_verbs_mr.c @@ -0,0 +1,752 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ +// This performs memory registration for RDMA Rendezvous +// It also tracks MRs in use and allows existing MRs to be shared. + +// in cache_mode MR_CACHE_MODE_USER, as a PoC we keep the cache overly simple. 
+// This approach is only viable for +// some microbenchmarks and simple apps. For more complex apps the lack of +// invalidate hooks into memory free may lead to memory corruption. +// However such hooks are not reliably possible until the 4.17+ kernels. +// The kernel RV module hooks into mmu_notfiers for invalidate. These are also +// used by hypervisors and hence are complete and reliable. + +#include +#include "psm_user.h" // pulls in psm_verbs_ep.h and psm_verbs_mr.h +#ifdef RNDV_MOD_MR +#include "psm_rndv_mod.h" +#endif +#ifdef PSM_FI +#include "ips_config.h" +#endif + +//#undef _HFI_MMDBG +//#define _HFI_MMDBG printf + +#ifdef min +#undef min +#endif +#define min(a, b) ((a) < (b) ? (a) : (b)) + +#ifdef max +#undef max +#endif +#define max(a, b) ((a) > (b) ? (a) : (b)) + +#define MEGABYTE (1024*1024) + +#ifndef container_of +/* + * container_of - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + */ +#define container_of(ptr, type, member) \ + ((type *) ((uint8_t *)(ptr) - offsetof(type, member))) +#endif + + +// Since rbtree.h and rbtree.c are designed to be included, and declare +// some hardcoded type names (cl_map_item_t and cl_qmap_t), we must limit +// our data type declarations which use those types to this .c file + +// this will be the payload of a cl_qmap_t +struct psm2_mr_cache_map_pl { + uint32_t nelems; // number of elements in cache +}; + +// rbtree.h uses these 2 well known defines to create the payload for +// cl_map_item_t and cl_qmap_t structures +#define RBTREE_MI_PL struct psm2_verbs_mr +#define RBTREE_MAP_PL struct psm2_mr_cache_map_pl +#include "rbtree.h" + +struct psm2_mr_cache { + uint32_t max_entries; + // limits to allow headroom for priority registrations + uint32_t limit_inuse; + uint64_t limit_inuse_bytes; +#ifdef RNDV_MOD_MR + psm2_rv_t rv; + int cmd_fd; +#endif +#ifdef PSM_FI + psm2_ep_t ep; +#endif + uint8_t cache_mode; // MR_CACHE_MODE_* + cl_qmap_t map; + cl_map_item_t root; + cl_map_item_t nil_item; + // Below is for queue of cache entries available for reuse (refcount==0) + // only used when cache_mode==MR_CACHE_MODE_USER. + // Available entries are added at end of list and reused from start. + // Hence having aging of cached entries. + // Aging helps reduce some of the corruption risk, + // but is not a full solution. 
Good enough for the PoC + TAILQ_HEAD(avail_list, psm2_verbs_mr) avail_list; + mpool_t mr_pool; // pool of MRs + // some statistics for user space + uint64_t hit; + uint64_t miss; + uint64_t rejected; // rejected non-priority registration + uint64_t full; // failed registration (tends to be priority) + uint64_t failed; // other failures, should be none + uint32_t inuse; // entry count in use + uint32_t max_inuse; + uint64_t inuse_bytes; + uint64_t max_inuse_bytes; + uint32_t max_nelems; + uint32_t max_refcount; +#ifdef RNDV_MOD_MR + struct psm2_rv_cache_stats rv_stats; // statistics from rv module + // will remain 0 if rv not open +#endif +}; + +static int mr_cache_key_cmp(const struct psm2_verbs_mr *a, + const struct psm2_verbs_mr *b) +{ + // to match addr, length and access must match + // we require exact match to avoid the issue of a release of the larger + // MR while smaller overlapping MR still in use, just in case an + // allocator frees the extra memory not in the smaller MR + // this may be paranoid, TBD if should treat a smaller MR as a match + // of a larger subset MR. + if (a->access < b->access) + return -1; + else if (a->access > b->access) + return 1; + if (a->addr < b->addr) + return -1; + else if (a->addr > b->addr) + return 1; + if (a->length < b->length) + return -1; + else if (a->length > b->length) + return 1; + return 0; +} + +// rbtree.c uses these defines to establish some of it's code and +// then provides all the rbtree manipulation functions +// we want to control the compare funciton so we define RBTREE_CMP and thus +// must define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR to avoid compiler errors +#define RBTREE_CMP(a,b) mr_cache_key_cmp((a), (b)) +#define RBTREE_ASSERT psmi_assert +#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems) +#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR +#include "rbtree.c" + +// TBD - move to a utility macro header +// taken fron IbAccess imath.h and imath.c +static uint32_t +ones64(uint64_t x) +{ + x -= ((x >> 1) & 0x5555555555555555ULL); + x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL)); + x = (((x >> 4) + x) & 0x0f0f0f0f0f0f0f0fULL); + x += (x >> 8); + x += (x >> 16); + x += (x >> 32); + return(x & 0x0000003f); +} + +/* log2(x) truncated */ +uint32_t +FloorLog2(uint64_t x) +{ + x |= (x >> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); + x |= (x >> 32); + return(ones64(x >> 1)); +} + +/* log2(x) rounded up if x is not a power of 2 */ +uint32_t CeilLog2(uint64_t val) +{ + uint32_t floor2 = FloorLog2(val); + if ((1ULL << floor2) == val) + return (floor2); + else + return (floor2+1); +} + +static inline uint32_t NextPower2(uint64_t x) +{ + return (1 << CeilLog2(x)); +} + +// accessor functions for statistics +#define CACHE_STAT_FUNC(func, stat) \ + static uint64_t func(void *context) \ + { \ + psm2_mr_cache_t cache = (psm2_mr_cache_t)context; \ + return cache->stat; \ + } + + +CACHE_STAT_FUNC(mr_cache_mode, cache_mode) +CACHE_STAT_FUNC(mr_cache_max_entries, max_entries) +CACHE_STAT_FUNC(mr_cache_nelems, map.payload.nelems) +CACHE_STAT_FUNC(mr_cache_max_nelems, max_nelems) +CACHE_STAT_FUNC(mr_cache_limit_inuse, limit_inuse) +CACHE_STAT_FUNC(mr_cache_inuse, inuse) +CACHE_STAT_FUNC(mr_cache_max_inuse, max_inuse) +CACHE_STAT_FUNC(mr_cache_max_refcount, max_refcount) +#undef CACHE_STAT_FUNC + +static uint64_t mr_cache_hit_rate(void *context) +{ + psm2_mr_cache_t cache = (psm2_mr_cache_t)context; + if (cache->miss) // all entries start with a miss, then get hits + 
return((cache->hit*100)/(cache->miss+cache->hit)); + else + return 0; +} + +static uint64_t mr_cache_miss_rate(void *context) +{ + psm2_mr_cache_t cache = (psm2_mr_cache_t)context; + if (cache->miss) // all entries start with a miss, then get hits + return((cache->miss*100)/(cache->miss+cache->hit)); + else + return 0; +} + +#ifdef RNDV_MOD_MR +static uint64_t mr_cache_rv_size(void *context) +{ + psm2_mr_cache_t cache = (psm2_mr_cache_t)context; + if (cache->rv) { + // this is a little sly, we know the stats processing routines will + // call the accessors in the order from the entries list + // so we use the 1st of the rv statistics accessors to get + // the statistics from rv into the cache structure so other accessors + // can simply return the relevant value + (void)__psm2_rv_get_cache_stats(cache->rv, &cache->rv_stats); + } + return cache->rv_stats.cache_size/MEGABYTE; +} + +#define CACHE_RV_STAT_FUNC(func, stat) \ + static uint64_t func(void *context) \ + { \ + psm2_mr_cache_t cache = (psm2_mr_cache_t)context; \ + return cache->rv_stats.stat; \ + } + +CACHE_RV_STAT_FUNC(mr_cache_rv_max_size, max_cache_size/MEGABYTE) +CACHE_RV_STAT_FUNC(mr_cache_rv_limit_size, limit_cache_size) +CACHE_RV_STAT_FUNC(mr_cache_rv_nelems, count) +CACHE_RV_STAT_FUNC(mr_cache_rv_max_nelems, max_count) +CACHE_RV_STAT_FUNC(mr_cache_rv_inuse, inuse) +CACHE_RV_STAT_FUNC(mr_cache_rv_max_inuse, max_inuse) +CACHE_RV_STAT_FUNC(mr_cache_rv_max_refcount, max_refcount) +#undef CACHE_RV_STAT_FUNC + +static uint64_t mr_cache_rv_hit_rate(void *context) +{ + psm2_mr_cache_t cache = (psm2_mr_cache_t)context; + if (cache->rv_stats.miss) // all entries start with a miss, then get hits + return((cache->rv_stats.hit*100)/(cache->rv_stats.miss+cache->rv_stats.hit)); + else + return 0; +} + +static uint64_t mr_cache_rv_miss_rate(void *context) +{ + psm2_mr_cache_t cache = (psm2_mr_cache_t)context; + if (cache->rv_stats.miss) // all entries start with a miss, then get hits + return((cache->rv_stats.miss*100)/(cache->rv_stats.miss+cache->rv_stats.hit)); + else + return 0; +} +#endif // RNDV_MOD_MR + +#define INC_STAT(cache, stat, max_stat) \ + do { \ + if (++((cache)->stat) > (cache)->max_stat) \ + (cache)->max_stat = (cache)->stat; \ + } while(0) + +#define ADD_STAT(cache, adder, stat, max_stat) \ + do { \ + if (((cache)->stat += (adder)) > (cache)->max_stat) \ + (cache)->max_stat = (cache)->stat; \ + } while(0) + + +// ep is used for RNDV_MOD_MR, memory tracking and stats +psm2_mr_cache_t psm2_verbs_alloc_mr_cache(psm2_ep_t ep, + uint32_t max_entries, uint8_t cache_mode, + uint32_t pri_entries, uint64_t pri_size) +{ + struct psm2_mr_cache *cache; + + cache = (struct psm2_mr_cache *)psmi_calloc(ep, DESCRIPTORS, + sizeof(*cache), 1); + if (! 
cache) + return NULL; + // max_entries for a pool must be power of 2 + max_entries = max(max_entries, pri_entries); + max_entries = NextPower2(max_entries); + cache->max_entries = max_entries; + cache->cache_mode = cache_mode; + // we leave headroom for priority registrations + cache->limit_inuse = max_entries - pri_entries; +#ifdef PSM_FI + cache->ep = ep; +#endif +#ifdef RNDV_MOD_MR + if (cache->cache_mode == MR_CACHE_MODE_KERNEL + || cache->cache_mode == MR_CACHE_MODE_RV) { + if (ep->rv_mr_cache_size*MEGABYTE < pri_size) { + _HFI_ERROR("PSM3_RV_MR_CACHE_SIZE=%u too small, require >= %"PRIu64"\n", + ep->rv_mr_cache_size, (pri_size + MEGABYTE-1)/MEGABYTE); + return NULL; + } + cache->limit_inuse_bytes = ep->rv_mr_cache_size*MEGABYTE - pri_size; + } else +#endif // RNDV_MOD_MR + cache->limit_inuse_bytes = UINT64_MAX; // no limit, just count inuse +#ifdef RNDV_MOD_MR + cache->rv = ep->verbs_ep.rv; + cache->cmd_fd = ep->verbs_ep.context->cmd_fd; +#endif // RNDV_MOD_MR + _HFI_MMDBG("cache alloc: max_entries=%u limit_inuse=%u limit_inuse_bytes=%"PRIu64", pri_entries=%u pri_size=%"PRIu64"\n", + cache->max_entries, cache->limit_inuse, + cache->limit_inuse_bytes, pri_entries, pri_size); + // max_entries must be power of 2>= obj per chunk which is also a power of 2 + cache->mr_pool = psmi_mpool_create(sizeof(cl_map_item_t), + min(128, max_entries), max_entries, 0, + DESCRIPTORS, NULL, NULL); + if (! cache->mr_pool) { + psmi_free(cache); + return NULL; + } + //nil_item already zeroed by calloc + //memset(&cache->nil_item.payload, 0, sizeof(cache->nil_item.payload)); + ips_cl_qmap_init(&cache->map, &cache->root, &cache->nil_item); + TAILQ_INIT(&cache->avail_list); + + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECL("cache_mode", MPSPAWN_STATS_REDUCTION_ALL, + mr_cache_mode, NULL), + PSMI_STATS_DECL_FUNC("limit_entries", mr_cache_max_entries), + PSMI_STATS_DECL_FUNC("nelems", mr_cache_nelems), + PSMI_STATS_DECL_FUNC("max_nelems", mr_cache_max_nelems), + PSMI_STATS_DECL("limit_inuse", + MPSPAWN_STATS_REDUCTION_ALL, + mr_cache_limit_inuse, NULL), + PSMI_STATS_DECL_FUNC("inuse", mr_cache_inuse), + PSMI_STATS_DECL_FUNC("max_inuse", mr_cache_max_inuse), + PSMI_STATS_DECL("limit_inuse_bytes", + MPSPAWN_STATS_REDUCTION_ALL, + NULL, &cache->limit_inuse_bytes), + PSMI_STATS_DECLU64("inuse_bytes", &cache->inuse_bytes), + PSMI_STATS_DECLU64("max_inuse_bytes", &cache->max_inuse_bytes), + PSMI_STATS_DECL_FUNC("max_refcount", mr_cache_max_refcount), + PSMI_STATS_DECLU64("hit", &cache->hit), + PSMI_STATS_DECL("hit_%",MPSPAWN_STATS_REDUCTION_ALL, + mr_cache_hit_rate, NULL), + PSMI_STATS_DECLU64("miss", &cache->miss), + PSMI_STATS_DECL("miss_%", MPSPAWN_STATS_REDUCTION_ALL, + mr_cache_miss_rate, NULL), + PSMI_STATS_DECLU64("rejected", &cache->rejected), + PSMI_STATS_DECLU64("full", &cache->full), + PSMI_STATS_DECLU64("failed", &cache->failed), +#ifdef RNDV_MOD_MR + PSMI_STATS_DECL_FUNC("rv_size", mr_cache_rv_size), + PSMI_STATS_DECL_FUNC("rv_max_size", mr_cache_rv_max_size), + PSMI_STATS_DECL_FUNC("rv_limit", mr_cache_rv_limit_size), + PSMI_STATS_DECL_FUNC("rv_nelems", mr_cache_rv_nelems), + PSMI_STATS_DECL_FUNC("rv_max_nelems", mr_cache_rv_max_nelems), + PSMI_STATS_DECL_FUNC("rv_inuse", mr_cache_rv_inuse), + PSMI_STATS_DECL_FUNC("rv_max_inuse", mr_cache_rv_max_inuse), + PSMI_STATS_DECLU64("rv_inuse_bytes", (uint64_t*)&cache->rv_stats.inuse_bytes), + PSMI_STATS_DECLU64("rv_max_inuse_bytes", (uint64_t*)&cache->rv_stats.max_inuse_bytes), + PSMI_STATS_DECL_FUNC("rv_max_refcount", mr_cache_rv_max_refcount), + 
PSMI_STATS_DECLU64("rv_hit", (uint64_t*)&cache->rv_stats.hit), + PSMI_STATS_DECL("rv_hit %", MPSPAWN_STATS_REDUCTION_ALL, + mr_cache_rv_hit_rate, NULL), + PSMI_STATS_DECLU64("rv_miss", (uint64_t*)&cache->rv_stats.miss), + PSMI_STATS_DECL("rv_miss %", MPSPAWN_STATS_REDUCTION_ALL, + mr_cache_rv_miss_rate, NULL), + PSMI_STATS_DECLU64("rv_full", (uint64_t*)&cache->rv_stats.full), + PSMI_STATS_DECLU64("rv_failed", (uint64_t*)&cache->rv_stats.failed), + PSMI_STATS_DECLU64("rv_remove", (uint64_t*)&cache->rv_stats.remove), + PSMI_STATS_DECLU64("rv_evict", (uint64_t*)&cache->rv_stats.evict), +#endif // RNDV_MOD_MR + }; + psmi_stats_register_type("MR_Cache_Statistics", + PSMI_STATSTYPE_MR_CACHE, + entries, + PSMI_STATS_HOWMANY(entries), + ep->epid, cache); + + return cache; +} + +// checks for space for a non-priority registration +static inline int have_space(psm2_mr_cache_t cache, uint32_t length) +{ + return (cache->inuse < cache->limit_inuse + && cache->inuse_bytes + length < cache->limit_inuse_bytes); +} + +// each attempt will increment exactly one of: hit, miss, rejected, full, failed +struct psm2_verbs_mr * psm2_verbs_reg_mr(psm2_mr_cache_t cache, + bool priority, struct ibv_pd *pd, + void *addr, uint64_t length, int access) +{ + psm2_verbs_mr_t mrc; + +#ifdef PSM_FI + if_pf(PSMI_FAULTINJ_ENABLED_EP(cache->ep)) { + PSMI_FAULTINJ_STATIC_DECL(fi_reg_mr, "reg_mr", + "MR cache full, any request type", + 1, IPS_FAULTINJ_REG_MR); + if (psmi_faultinj_is_fault(fi_reg_mr)) { + cache->failed++; + errno = ENOMEM; + return NULL; + } + } + if_pf(!priority && PSMI_FAULTINJ_ENABLED_EP(cache->ep)) { + PSMI_FAULTINJ_STATIC_DECL(fi_nonpri_reg_mr, "nonpri_reg_mr", + "MR cache full, non-priority request", + 1, IPS_FAULTINJ_NONPRI_REG_MR); + if (psmi_faultinj_is_fault(fi_nonpri_reg_mr)) { + cache->failed++; + errno = ENOMEM; + return NULL; + } + } + if_pf(priority && PSMI_FAULTINJ_ENABLED_EP(cache->ep)) { + PSMI_FAULTINJ_STATIC_DECL(fi_pri_reg_mr, "pri_reg_mr", + "MR cache full, priority request", + 1, IPS_FAULTINJ_PRI_REG_MR); + if (psmi_faultinj_is_fault(fi_pri_reg_mr)) { + cache->failed++; + errno = ENOMEM; + return NULL; + } + } +#endif + access |= IBV_ACCESS_LOCAL_WRITE; // manditory flag +#ifndef RNDV_MOD_MR + if (access & IBV_ACCESS_IS_GPU_ADDR) { + _HFI_ERROR("unsupported GPU memory registration\n"); + cache->failed++; + errno = EINVAL; + return NULL; + } +#endif + struct psm2_verbs_mr key = { // our search key + .addr = addr, + .length = length, + // only 8 bits in mrc for access + .access = (access & ~(IBV_ACCESS_IS_GPU_ADDR +#ifdef RNDV_MOD_MR + |IBV_ACCESS_KERNEL +#endif + )) + }; + cl_map_item_t *p_item = ips_cl_qmap_searchv(&cache->map, &key); + if (p_item->payload.mr.mr_ptr) { + psmi_assert(p_item != cache->map.nil_item); + mrc = &p_item->payload; + if (! mrc->refcount) { + if (! priority && ! 
have_space(cache, (unsigned)length)) { + _HFI_MMDBG("cache has no headroom for non-priority hit addr %p len %u access 0x%x ptr %p\n", + addr, (unsigned)length, access, mrc); + cache->rejected++; + errno = ENOMEM; + return NULL; + } + // it was an entry on avail_list, take off list + TAILQ_REMOVE(&cache->avail_list, mrc, next); + INC_STAT(cache, inuse, max_inuse); + ADD_STAT(cache, (unsigned)length, inuse_bytes, max_inuse_bytes); + } + cache->hit++; + _HFI_MMDBG("cache hit MR addr %p len %u access 0x%x ptr %p\n", + addr, (unsigned)length, mrc->access, mrc); + mrc->refcount++; + cache->max_refcount = max(cache->max_refcount, mrc->refcount); + return mrc; + } + psmi_assert(p_item == cache->map.nil_item); + if (! priority && ! have_space(cache, (unsigned)length)) { + _HFI_MMDBG("cache has no headroom for non-priority miss addr %p len %u access 0x%x\n", + addr, (unsigned)length, access); + cache->rejected++; + errno = ENOMEM; + return NULL; + } + // we only reuse entries from avail_list once cache is full + // this helps improve cache hit rate. + // we only have items on avail_list when cache_mode==MR_CACHE_MODE_USER + if (cache->map.payload.nelems >= cache->max_entries) { + int ret; + mrc = TAILQ_FIRST(&cache->avail_list); + if (! mrc) { + _HFI_MMDBG("user space MR cache full\n"); + cache->full++; + errno = ENOMEM; + return NULL; + } + p_item = container_of(mrc, cl_map_item_t, payload); + psmi_assert(mrc->mr.mr_ptr); + psmi_assert(! mrc->refcount); + _HFI_MMDBG("reuse avail MR addr %p len %u access 0x%x ptr %p\n", + addr, (unsigned)length, mrc->access, mrc); + ips_cl_qmap_remove_item(&mrc->cache->map, p_item); + TAILQ_REMOVE(&cache->avail_list, mrc, next); +#ifdef RNDV_MOD_MR + if (cache->cache_mode == MR_CACHE_MODE_KERNEL + || cache->cache_mode == MR_CACHE_MODE_RV) // should not happen + ret = __psm2_rv_dereg_mem(cache->rv, mrc->mr.rv_mr); + else +#endif + ret = ibv_dereg_mr(mrc->mr.ibv_mr); + if (ret) { + _HFI_ERROR("unexpected dreg_mr failure: %s", strerror(errno)); + cache->failed++; + errno = EIO; + // MR is fouled up, we leak the MR and free the cache entry + // caller will try again later + mrc->mr.mr_ptr = NULL; + psmi_mpool_put(p_item); + return NULL; + } + mrc->mr.mr_ptr = NULL; + } else { + // allocate a new item + p_item = (cl_map_item_t *)psmi_mpool_get(cache->mr_pool); + if (! p_item) { // keep KW happy, should not happen, we check max above + _HFI_ERROR("unexpected cache pool allocate failure\n"); + cache->failed++; + return NULL; + } + mrc = &p_item->payload; + // we initialize mrc below + cache->max_nelems = max(cache->max_nelems, cache->map.payload.nelems+1); + } +#ifdef RNDV_MOD_MR + /* need cmd_fd for access to ucontext when converting user pd into kernel pd */ + if (cache->cache_mode == MR_CACHE_MODE_KERNEL) { + mrc->mr.rv_mr = __psm2_rv_reg_mem(cache->rv, cache->cmd_fd, pd, addr, length, access); + if (! mrc->mr.rv_mr) { + int save_errno = errno; + if (errno == ENOMEM) { + cache->full++; + } else { + _HFI_ERROR("reg_mr failed; %s", strerror(errno)); + cache->failed++; + } + psmi_mpool_put(p_item); + errno = save_errno; + return NULL; + } + mrc->iova = mrc->mr.rv_mr->iova; + mrc->lkey = mrc->mr.rv_mr->lkey; + mrc->rkey = mrc->mr.rv_mr->rkey; + } else if (cache->cache_mode == MR_CACHE_MODE_RV) { + mrc->mr.rv_mr = __psm2_rv_reg_mem(cache->rv, cache->cmd_fd, NULL, addr, length, access|IBV_ACCESS_KERNEL); + if (! 
mrc->mr.rv_mr) { + int save_errno = errno; + if (errno == ENOMEM) { + cache->full++; + } else { + _HFI_ERROR("reg_mr failed; %s", strerror(errno)); + cache->failed++; + } + psmi_mpool_put(p_item); + errno = save_errno; + return NULL; + } + mrc->iova = mrc->mr.rv_mr->iova; + mrc->lkey = mrc->mr.rv_mr->lkey; + mrc->rkey = mrc->mr.rv_mr->rkey; + } else +#endif + { + mrc->mr.ibv_mr = ibv_reg_mr(pd, addr, length, access); + if (! mrc->mr.ibv_mr) { + int save_errno = errno; + if (errno == ENOMEM) { + cache->full++; + } else { + _HFI_ERROR("reg_mr failed; %s", strerror(errno)); + cache->failed++; + } + psmi_mpool_put(p_item); + errno = save_errno; + return NULL; + } + mrc->iova = (uintptr_t)addr; + mrc->lkey = mrc->mr.ibv_mr->lkey; + mrc->rkey = mrc->mr.ibv_mr->rkey; + } + cache->miss++; + mrc->cache = cache; + mrc->refcount = 1; + mrc->addr = addr; + mrc->length = length; + mrc->access = access; + ips_cl_qmap_insert_item(&cache->map, p_item); + INC_STAT(cache, inuse, max_inuse); + ADD_STAT(cache, (unsigned)length, inuse_bytes, max_inuse_bytes); + _HFI_MMDBG("registered new MR pri %d addr %p len %u access 0x%x ptr %p nelems %u\n", + priority, addr, (unsigned)length, mrc->access, mrc, + cache->map.payload.nelems); + return mrc; +} + +int psm2_verbs_release_mr(struct psm2_verbs_mr *mrc) +{ + int ret = 0; + if (! mrc) { + errno = EINVAL; + return -1; + } + if (! mrc->refcount) { + errno = ENXIO; + return -1; + } + _HFI_MMDBG("releasing MR addr %p len %u access 0x%x ref %u ptr %p\n", + mrc->addr, (unsigned)mrc->length, mrc->access, + mrc->refcount, mrc); + if (mrc->cache->cache_mode == MR_CACHE_MODE_USER) { + // if refcount now zero, put on avail_list to be reclaimed if needed + if (! --(mrc->refcount)) { + mrc->cache->inuse--; + mrc->cache->inuse_bytes -= (unsigned)mrc->length; + TAILQ_INSERT_TAIL(&mrc->cache->avail_list, mrc, next); + } + } else { + if (! 
--(mrc->refcount)) { + _HFI_MMDBG("freeing MR addr %p len %u access 0x%x ref %u ptr %p nelems %u\n", + mrc->addr, (unsigned)mrc->length, mrc->access, + mrc->refcount, mrc, mrc->cache->map.payload.nelems); + mrc->cache->inuse--; + mrc->cache->inuse_bytes -= (unsigned)mrc->length; + cl_map_item_t *p_item = container_of(mrc, cl_map_item_t, payload); + ips_cl_qmap_remove_item(&mrc->cache->map, p_item); +#ifdef RNDV_MOD_MR + if (mrc->cache->cache_mode == MR_CACHE_MODE_KERNEL + || mrc->cache->cache_mode == MR_CACHE_MODE_RV) + ret = __psm2_rv_dereg_mem(mrc->cache->rv, mrc->mr.rv_mr); + else +#endif + ret = ibv_dereg_mr(mrc->mr.ibv_mr); + if (ret) { + // nasty choice, do we leak the MR or leak the cache entry + // we chose to leak the MR and free the cache entry + _HFI_ERROR("unexpected dreg_mr failure: %s", strerror(errno)); + errno = EIO; + ret = -1; + } + mrc->mr.mr_ptr = NULL; + psmi_mpool_put(p_item); + } + } + return ret; +} + +void psm2_verbs_free_mr_cache(psm2_mr_cache_t cache) +{ + psmi_stats_deregister_type(PSMI_STATSTYPE_MR_CACHE, cache); + while (cache->map.payload.nelems) { + cl_map_item_t *p_item = __cl_map_root(&cache->map); + psmi_assert(p_item != cache->map.nil_item); + psm2_verbs_mr_t mrc = &p_item->payload; + psmi_assert(mrc->mr.mr_ptr); + if (mrc->mr.mr_ptr) { + int ret; + _HFI_MMDBG("free MR addr %p len %u access 0x%x ref %u ptr %p\n", + mrc->addr, (unsigned)mrc->length, mrc->access, + mrc->refcount, mrc); + if (mrc->refcount) + _HFI_ERROR("unreleased MR in psm2_verbs_free_mr_cache addr %p len %u access 0x%x\n", mrc->addr, (unsigned)mrc->length, mrc->access); + mrc->refcount = 0; + cl_map_item_t *p_item = container_of(mrc, cl_map_item_t, payload); + ips_cl_qmap_remove_item(&cache->map, p_item); + TAILQ_REMOVE(&cache->avail_list, mrc, next); +#ifdef RNDV_MOD_MR + if (cache->cache_mode == MR_CACHE_MODE_KERNEL + || cache->cache_mode == MR_CACHE_MODE_RV) + ret = __psm2_rv_dereg_mem(cache->rv, mrc->mr.rv_mr); + else +#endif + ret = ibv_dereg_mr(mrc->mr.ibv_mr); + if (ret) + _HFI_ERROR("unexpected dreg_mr failure: %s", strerror(errno)); + mrc->mr.mr_ptr = NULL; + psmi_mpool_put(p_item); + } + } + psmi_assert(TAILQ_EMPTY(&cache->avail_list)); + psmi_assert(! cache->map.payload.nelems); + + psmi_mpool_destroy(cache->mr_pool); + psmi_free(cache); +} diff --git a/prov/psm3/psm3/psm_verbs_mr.h b/prov/psm3/psm3/psm_verbs_mr.h new file mode 100644 index 00000000000..f89c0d76381 --- /dev/null +++ b/prov/psm3/psm3/psm_verbs_mr.h @@ -0,0 +1,148 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + + +#ifndef _PSMI_IN_USER_H +#error psm_verbs_mr.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_VERBS_MR_H +#define _PSMI_VERBS_MR_H + +#include +#include + +#define MR_CACHE_MODE_NONE 0 // user space MRs, but no caching +#define MR_CACHE_MODE_KERNEL 1 // kernel MR cache in rendezvous module +#define MR_CACHE_MODE_USER 2 // user space MR cache (demo quality only) +#define MR_CACHE_MODE_RV 3 // kernel MRs for kernel rendezvous module QPs +#define MR_CACHE_MODE_VALID(mode) ((unsigned)(mode) <= 3) + +// This performs memory registration for RDMA Rendezvous when PSM3_RDMA enabled +// Priority registration calls ere those immediately before the data transfer +// hence delaying their registration directly delays IOs. +// Non-priority calls are those registering the whole IO +// prior to sending/receiving the CTS. Delays in non-priority +// calls have less direct impacts on IO delays. +// Numbers and size limits for priority registrations are able to be directly +// estimated since limits on outstanding RDMAs and their size constrains them. +// Non-priority registrations and sizes are a function of application design +// and how many conurrent MPI_ISend/IRecv are outstanding and of what size. +// Given defaults, priority registrations limits will be 160 entries of +// 128K each for a total of < 20MB of the kernel cache. +// All non-priority registrations eventually get used for IOs and become +// a priority use case, so attempting to track whether each entry in the cache +// is a priority or non-priority entry is tricky, especially since there +// can be both priority and non-priority references. So instead of attempting +// to track the amount of priority and non-priority entries, we simply +// track current inuse entries and only allow non-priority registrations when +// we have a reasonable amount of headroom. This way most priority +// registrations will succeed. 
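
(Editorial sketch, not part of the patch.) As a usage illustration of the cache API declared in this header: a caller allocates one cache per endpoint, takes a priority registration immediately before the data transfer, and releases it once the transfer completes. The endpoint, PD, buffer, and sizing constants below are placeholders, and error handling is abbreviated.

static int example_rndv_recv_side(psm2_ep_t ep, struct ibv_pd *pd,
				  void *landing_buf, uint64_t len)
{
	// hypothetical sizing: 1024 total entries, 160 of them (~20MB) kept as
	// headroom for priority registrations, as discussed above
	psm2_mr_cache_t cache = psm2_verbs_alloc_mr_cache(ep, 1024,
			MR_CACHE_MODE_NONE, 160, 160 * 128 * 1024ull);
	if (!cache)
		return -1;

	// priority registration: the peer will RDMA Write into landing_buf,
	// so it needs remote write access and our rkey
	psm2_verbs_mr_t mr = psm2_verbs_reg_mr(cache, true, pd, landing_buf,
			len, IBV_ACCESS_REMOTE_WRITE);
	if (!mr) {
		psm2_verbs_free_mr_cache(cache);
		return -1;
	}
	// ... advertise mr->rkey and the buffer address in the CTS and wait
	// for the RDMA Write with immediate to complete ...
	(void)psm2_verbs_release_mr(mr);
	psm2_verbs_free_mr_cache(cache);
	return 0;
}
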
+ +// the pointer to psm2_verbs_mr itself is the handle for subsequenent release +struct psm2_verbs_mr { + // fields for use by caller + // TBD - review use, we have rkey/lkey here and in mr itself, don't need both + uint64_t iova; // used by caller + uint32_t lkey; // used by caller + uint32_t rkey; // used by caller + // private fields below are not for use by caller + // for kernel rendezvous this might just be a kernel handle and this + // information may be private in the kernel + union { + void *mr_ptr; // for simple test of != NULL or clearing to NULL +#ifdef RNDV_MOD_MR + // when cache_mode = MR_CACHE_MODE_KERNEL + psm2_rv_mr_t rv_mr; // internally we can get addr, length and pd from here +#endif + // when cache_mode = MR_CACHE_MODE_NONE or MR_CACHE_MODE_USER + struct ibv_mr *ibv_mr; // internally we can get addr, length and pd from here + } mr; + struct psm2_mr_cache *cache; // TBD could have caller pass to release + uint32_t refcount; + // this structure will be used as a search key too, so must include + // addr and length directly since search key object won't have an mr ptr + // also addr is used in callers to translate remote addr returned in CTS + void *addr; + uint64_t length; + uint8_t access; + // below is for queue of cache entries available for reuse (refcount==0) + // only used when cache_mode==1 + TAILQ_ENTRY(psm2_verbs_mr) next; +}; +typedef struct psm2_verbs_mr *psm2_verbs_mr_t; + +// cache is kept opaque since it has some rbtree fields in it +struct psm2_mr_cache; +typedef struct psm2_mr_cache *psm2_mr_cache_t; + +extern psm2_mr_cache_t psm2_verbs_alloc_mr_cache(psm2_ep_t ep, + uint32_t num_entries, uint8_t cache_mode, + uint32_t pri_entries, uint64_t pri_size); +// pick a flag value unused by verbs.h +#define IBV_ACCESS_IS_GPU_ADDR 0x10000000 +// pd can be the verbs_ep.pd or NULL to use the RV module's kernel pd +extern psm2_verbs_mr_t psm2_verbs_reg_mr(psm2_mr_cache_t cache, + bool priority, struct ibv_pd *pd, + void *addr, uint64_t length, int access); +static inline psm2_verbs_mr_t psm2_verbs_ref_mr(psm2_verbs_mr_t mr) { + mr->refcount++; + return mr; +} +extern int psm2_verbs_release_mr(psm2_verbs_mr_t mrc); +extern void psm2_verbs_free_mr_cache(psm2_mr_cache_t cache); +void ips_tid_mravail_callback(struct ips_proto *proto); + +#endif // _PSMI_VERBS_MR_H diff --git a/prov/psm3/psm3/psmi_wrappers.c b/prov/psm3/psm3/psmi_wrappers.c new file mode 100644 index 00000000000..ba2b0a6224e --- /dev/null +++ b/prov/psm3/psm3/psmi_wrappers.c @@ -0,0 +1,94 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include +#include +#include "psmi_wrappers.h" +#include + +/* The following indirection wrappers for external functions + * are only created if this is a mocking tests build + */ +#ifdef PSM2_MOCK_TESTING + +void MOCKABLE(psmi_exit)(int status) +{ + exit(status); +} +MOCK_DEF_EPILOGUE(psmi_exit); + +ssize_t MOCKABLE(psmi_write)(int fd, const void *buf, size_t count) +{ + return write(fd, buf, count); +} +MOCK_DEF_EPILOGUE(psmi_write); + +int MOCKABLE(psmi_ioctl)(int fd, unsigned int cmd, unsigned long arg) +{ + return ioctl(fd, cmd, arg); +} +MOCK_DEF_EPILOGUE(psmi_ioctl); + +int MOCKABLE(psmi_sigaction)(int signum, const struct sigaction *act, struct sigaction *oldact) +{ + return sigaction(signum, act, oldact); +} +MOCK_DEF_EPILOGUE(psmi_sigaction); + +void MOCKABLE(psmi_rmb)(void) +{ + return ips_rmb(); +} +MOCK_DEF_EPILOGUE(psmi_rmb); + +#endif /* def PSM2_MOCK_TESTING */ diff --git a/prov/psm3/psm3/psmi_wrappers.h b/prov/psm3/psm3/psmi_wrappers.h new file mode 100644 index 00000000000..68f11c8109a --- /dev/null +++ b/prov/psm3/psm3/psmi_wrappers.h @@ -0,0 +1,98 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef _PSMI_WRAPPERS_H +#define _PSMI_WRAPPERS_H + +#include +#include "psm2_mock_testing.h" +#include "opa_intf.h" + +#if defined( IB_IOCTL_MAGIC ) +#include +#endif + +/* If this is a mocking tests build, we introduce "incision points" + * through which we can easily mock external dependencies. + * For non-mocking-tests build, we bypass those indirections + * for performance reasons. + */ + +#ifdef PSM2_MOCK_TESTING +void MOCKABLE(psmi_exit)(int status); +MOCK_DCL_EPILOGUE(psmi_exit); + +ssize_t MOCKABLE(psmi_write)(int fd, const void *buf, size_t count); +MOCK_DCL_EPILOGUE(psmi_write); + +int MOCKABLE(psmi_ioctl)(int fd, unsigned int cmd, unsigned long arg); +MOCK_DCL_EPILOGUE(psmi_ioctl); + +int MOCKABLE(psmi_sigaction)(int signum, const struct sigaction *act, struct sigaction *oldact); +MOCK_DCL_EPILOGUE(psmi_sigaction); + +void MOCKABLE(psmi_rmb)(void); +MOCK_DCL_EPILOGUE(psmi_rmb); + +#else /* def PSM2_MOCK_TESTING */ + +#define psmi_exit exit +#define psmi_write write +#define psmi_ioctl ioctl +#define psmi_sigaction sigaction +#define psmi_rmb ips_rmb + +#endif /* def PSM2_MOCK_TESTING */ + +#endif // _PSMI_WRAPPERS_H + diff --git a/prov/psm3/psm3/ptl.h b/prov/psm3/psm3/ptl.h new file mode 100644 index 00000000000..23dca3a0ebd --- /dev/null +++ b/prov/psm3/psm3/ptl.h @@ -0,0 +1,225 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +/* Interface implemented by Packet Transport layers such as + * ips and active messages. + * + * This interface can be volatile, it is never seen by PSM clients, and it will + * probably change as the AM ptl is developed. + */ + +#ifndef PSM_PTL_H +#define PSM_PTL_H +#include +#include +#include +#include +#include + +/* We currently have 3 PTLs, 0 is reserved. */ +#define PTL_DEVID_IPS 1 +#define PTL_DEVID_AMSH 2 +#define PTL_DEVID_SELF 3 + +/* We can currently initialize up to 3 PTLs */ +#define PTL_MAX_INIT 3 + +/* struct ptl is an incomplete type, and it serves as a generic or opaque + container. It should remain an incomplete type in the entire psm + source base. concrete ptl types need to have a suffix such as ptl_self, + ptl_ips. 
*/ +struct ptl; +typedef struct ptl ptl_t; + +struct ptl_ctl; +typedef struct ptl_ctl ptl_ctl_t; + +struct ptl_mq_req; +typedef struct ptl_mq_req ptl_mq_req_t; + +struct ips_proto; +typedef struct ips_proto ips_proto_t; + +/* To be filled in statically by all PTLs */ +struct ptl_ctl_init { + size_t(*sizeof_ptl) (void); + + psm2_error_t(*init) (const psm2_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl); + + psm2_error_t(*fini) (ptl_t *ptl, int force, uint64_t timeout_ns); + + psm2_error_t + (*setopt) (const void *component_obj, int optname, + const void *optval, uint64_t optlen); + + psm2_error_t + (*getopt) (const void *component_obj, int optname, + void *optval, uint64_t *optlen); +}; + +struct ptl_ctl_rcvthread { + uint32_t(*is_enabled) (const ptl_t *ptl); + void(*transfer_ownership) (ptl_t *from_ptl, ptl_t *to_ptl); +}; + +typedef +struct ptl_arg { + union { + struct { + uint16_t u16w3; + uint16_t u16w2; + uint16_t u16w1; + uint16_t u16w0; + } PACK_SUFFIX; + struct { + uint32_t u32w1; + uint32_t u32w0; + } PACK_SUFFIX; + uint64_t u64w0; + uint64_t u64; + void *uptr; + }; +} PACK_SUFFIX ptl_arg_t; + +#include "ptl_self/ptl_fwd.h" +#include "ptl_ips/ptl_fwd.h" +#include "ptl_am/ptl_fwd.h" + +/* To be filled in as part of ptl_init */ +struct ptl_ctl { + ptl_t *ptl; /* pointer to ptl */ + psm2_ep_t ep; /* pointer to ep */ + + /* EP-specific stuff */ + psm2_error_t(*ep_poll) (ptl_t *ptl, int replyonly); + + /* PTL-level connect + * + * This PTL-level is slightly different from the top-level PSM connect. + * + * pre 1: Caller has masked off epids in epid array that are already + * connected at the PSM level. + * + * post 0: PTL has allocate all epaddrs and whatever internal ptladdr + * that ptl needs. + * post 1: PTL marks error[i] as UNREACHABLE if PTL can't get to epid[i] + * post 2: PTL marks error[i] as UNKNOWN for all epid[i] that couldn't + * be connected before a timeout occurred. + * post 3: PTL returns OK if all epids are either OK or UNREACHABLE + * post 4: PTL defines content or epaddr[i] only if epaddr[i] is OK. 
+ */ + psm2_error_t(*ep_connect) (ptl_t *ptl, + int num_ep, + const psm2_epid_t input_array_of_epid[], + const int array_of_epid_mask[], + psm2_error_t output_array_of_errors[], + psm2_epaddr_t output_array_of_epddr[], + uint64_t timeout_ns); + + psm2_error_t (*ep_disconnect)(ptl_t *ptl, + int force, + int num_ep, + psm2_epaddr_t input_array_of_epaddr[], + const int array_of_epaddr_mask[], + psm2_error_t output_array_of_errors[], + uint64_t timeout_ns); + + /* MQ stuff */ + psm2_error_t(*mq_send) (psm2_mq_t mq, psm2_epaddr_t dest, + uint32_t flags, psm2_mq_tag_t *stag, + const void *buf, uint32_t len); + psm2_error_t(*mq_isend) (psm2_mq_t mq, psm2_epaddr_t dest, + uint32_t flags_user, uint32_t flags_internal, + psm2_mq_tag_t *stag, const void *buf, + uint32_t len, void *ctxt, psm2_mq_req_t *req); + +#if 0 // unused code, specific to QLogic MPI + int (*epaddr_stats_num) (void); + int (*epaddr_stats_init) (char *desc[], uint16_t *flags); + int (*epaddr_stats_get) (psm2_epaddr_t epaddr, uint64_t *stats); +#endif + + /* AM stuff */ + psm2_error_t(*am_get_parameters) (psm2_ep_t ep, + struct psm2_am_parameters * + parameters); + psm2_error_t(*am_short_request) (psm2_epaddr_t epaddr, + psm2_handler_t handler, + psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, + void *completion_ctxt); + psm2_error_t(*am_short_reply) (psm2_am_token_t token, + psm2_handler_t handler, + psm2_amarg_t *args, int nargs, void *src, + size_t len, int flags, + psm2_am_completion_fn_t completion_fn, + void *completion_ctxt); + /* Long messages currently unsupported */ +#if 0 + psm2_error_t(*am_long_request) (psm2_epaddr_t epaddr, + psm2_handler_t handler, + psm2_amarg_t *args, int nargs, + void *src, size_t len, void *dest, + int flags); + psm2_error_t(*am_long_reply) (psm2_am_token_t token, + psm2_handler_t handler, psm2_amarg_t *args, + int nargs, void *src, size_t len, + void *dest, int flags); +#endif + psm2_error_t (*msg_size_thresh_query) (enum psm2_info_query_thresh_et, + uint32_t *out, psm2_mq_t mq, psm2_epaddr_t); +}; +#endif diff --git a/prov/psm3/psm3/ptl_am/am_config.h b/prov/psm3/psm3/ptl_am/am_config.h new file mode 100644 index 00000000000..d887118273e --- /dev/null +++ b/prov/psm3/psm3/ptl_am/am_config.h @@ -0,0 +1,82 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2018 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2018 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef PTL_AM_AM_CONFIG_H +#define PTL_AM_AM_CONFIG_H + +#include "psm_config.h" + +/* + * Can change the rendezvous threshold based on usage of cma (or not) + */ +#define PSMI_MQ_RV_THRESH_CMA 16000 + +/* If no kernel assisted copy is available this is the rendezvous threshold */ +#define PSMI_MQ_RV_THRESH_NO_KASSIST 16000 + +#define AMSH_HAVE_CMA 0x1 +#define AMSH_HAVE_KASSIST 0x1 + +/* Each block reserves some space at the beginning to store auxiliary data */ +#define AMSH_BLOCK_HEADER_SIZE 4096 + +/* AMLONG_SZ is the total size in memory of a bulk packet, including an + * am_pkt_bulk_t header struct. + * AMLONG_MTU is the number of bytes available in a bulk packet for payload. */ +#define AMLONG_SZ 8192 +#define AMLONG_MTU (AMLONG_SZ-sizeof(am_pkt_bulk_t)) + +#define PSMI_KASSIST_MODE_DEFAULT PSMI_KASSIST_CMA_GET +#define PSMI_KASSIST_MODE_DEFAULT_STRING "cma-get" + +#endif /* PTL_AM_AM_CONFIG_H */ diff --git a/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c b/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c new file mode 100644 index 00000000000..a3801eaf499 --- /dev/null +++ b/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c @@ -0,0 +1,492 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifdef PSM_CUDA + +#include "psm_user.h" +#include "am_cuda_memhandle_cache.h" + +/* + * rbtree cruft + */ +struct _cl_map_item; + +typedef struct +{ + unsigned long start; /* start virtual address */ + CUipcMemHandle cuda_ipc_handle; /* cuda ipc mem handle */ + CUdeviceptr cuda_ipc_dev_ptr;/* Cuda device pointer */ + uint16_t length; /* length*/ + psm2_epid_t epid; + struct _cl_map_item* i_prev; /* idle queue previous */ + struct _cl_map_item* i_next; /* idle queue next */ +}__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t; + +typedef struct { + uint32_t nelems; /* number of elements in the cache */ +} rbtree_cuda_memhandle_cache_map_pl_t; + +static psm2_error_t am_cuda_memhandle_mpool_init(uint32_t memcache_size); + +/* + * Custom comparator + */ +typedef rbtree_cuda_memhandle_cache_mapitem_pl_t cuda_cache_item; + +static int cuda_cache_key_cmp(const cuda_cache_item *a, const cuda_cache_item *b) +{ + // When multi-ep is disabled, cache can assume + // 1 epid == 1 remote process == 1 CUDA address space + // But when multi-ep is enabled, one process can have many epids, so in this case + // cannot use epid as part of cache key. 
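+	// In the overlap check below, a lookup key covering [0x7000, len 0x2000]
+	// and a cached entry at [0x7800, len 0x1000] compare equal, so the lookup
+	// lands on that entry; the stored CUipcMemHandle is validated afterwards
+	// before the hit is trusted.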
+ if (!psmi_multi_ep_enabled) { + if (a->epid < b->epid) + return -1; + if (a->epid > b->epid) + return 1; + } + + unsigned long a_end, b_end; + // normalize into inclusive upper bounds to handle + // 0-length entries + a_end = (a->start + a->length); + b_end = (b->start + b->length); + if (a->length > 0) + a_end--; + + if (b->length > 0) + b_end--; + + if (a_end < b->start) + return -1; + if (b_end < a->start) + return 1; + + return 0; +} + + +/* + * Necessary rbtree cruft + */ +#define RBTREE_MI_PL rbtree_cuda_memhandle_cache_mapitem_pl_t +#define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t +#define RBTREE_CMP(a,b) cuda_cache_key_cmp((a), (b)) +#define RBTREE_ASSERT psmi_assert +#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems) +#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR + +#include "rbtree.h" +#include "rbtree.c" + +/* + * Convenience rbtree cruft + */ +#define NELEMS cuda_memhandle_cachemap.payload.nelems + +#define IHEAD cuda_memhandle_cachemap.root +#define LAST IHEAD->payload.i_prev +#define FIRST IHEAD->payload.i_next +#define INEXT(x) x->payload.i_next +#define IPREV(x) x->payload.i_prev + +/* + * Actual module data + */ +static cl_qmap_t cuda_memhandle_cachemap; /* Global cache */ +static uint8_t cuda_memhandle_cache_enabled; +static mpool_t cuda_memhandle_mpool; +static uint32_t cuda_memhandle_cache_size; + +static uint64_t cache_hit_counter; +static uint64_t cache_miss_counter; +static uint64_t cache_evict_counter; +static uint64_t cache_collide_counter; +static uint64_t cache_clear_counter; + +static void print_cuda_memhandle_cache_stats(void) +{ + _HFI_DBG("enabled=%u,size=%u,hit=%lu,miss=%lu,evict=%lu,collide=%lu,clear=%lu\n", + cuda_memhandle_cache_enabled, cuda_memhandle_cache_size, + cache_hit_counter, cache_miss_counter, + cache_evict_counter, cache_collide_counter, cache_clear_counter); +} + +/* + * This is the callback function when mempool are resized or destroyed. + * Upon calling cache fini mpool is detroyed which in turn calls this callback + * which helps in closing all memhandles. + */ +static void +psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) +{ + cl_map_item_t* memcache_item = (cl_map_item_t*)obj; + if (!is_alloc) { + if(memcache_item->payload.start) + PSMI_CUDA_CALL(cuIpcCloseMemHandle, + memcache_item->payload.cuda_ipc_dev_ptr); + } +} + +/* + * Creating mempool for cuda memhandle cache nodes. + */ +static psm2_error_t +am_cuda_memhandle_mpool_init(uint32_t memcache_size) +{ + psm2_error_t err; + if (memcache_size < 1) + return PSM2_PARAM_ERR; + + cuda_memhandle_cache_size = memcache_size; + /* Creating a memory pool of size PSM3_CUDA_MEMCACHE_SIZE + * which includes the Root and NIL items + */ + cuda_memhandle_mpool = psmi_mpool_create_for_cuda(sizeof(cl_map_item_t), + cuda_memhandle_cache_size, + cuda_memhandle_cache_size, 0, + UNDEFINED, NULL, NULL, + psmi_cuda_memhandle_cache_alloc_func, + NULL); + if (cuda_memhandle_mpool == NULL) { + err = psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, + "Couldn't allocate CUDA host receive buffer pool"); + return err; + } + return PSM2_OK; +} + +/* + * Initialize rbtree. 
+ */ +psm2_error_t am_cuda_memhandle_cache_init(uint32_t memcache_size) +{ + psm2_error_t err = am_cuda_memhandle_mpool_init(memcache_size); + if (err != PSM2_OK) + return err; + + cl_map_item_t *root, *nil_item; + root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); + if (root == NULL) + return PSM2_NO_MEMORY; + nil_item = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); + if (nil_item == NULL) { + psmi_free(root); + return PSM2_NO_MEMORY; + } + + nil_item->payload.start = 0; + nil_item->payload.epid = 0; + nil_item->payload.length = 0; + cuda_memhandle_cache_enabled = 1; + ips_cl_qmap_init(&cuda_memhandle_cachemap,root,nil_item); + NELEMS = 0; + + cache_hit_counter = 0; + cache_miss_counter = 0; + cache_evict_counter = 0; + cache_collide_counter = 0; + cache_clear_counter = 0; + + return PSM2_OK; +} + +void am_cuda_memhandle_cache_map_fini() +{ + print_cuda_memhandle_cache_stats(); + + if (cuda_memhandle_cachemap.nil_item) { + psmi_free(cuda_memhandle_cachemap.nil_item); + cuda_memhandle_cachemap.nil_item = NULL; + } + + if (cuda_memhandle_cachemap.root) { + psmi_free(cuda_memhandle_cachemap.root); + cuda_memhandle_cachemap.root = NULL; + } + + if (cuda_memhandle_cache_enabled) { + psmi_mpool_destroy(cuda_memhandle_mpool); + cuda_memhandle_cache_enabled = 0; + } + + cuda_memhandle_cache_size = 0; +} + +/* + * Insert at the head of Idleq. + */ +static void +am_cuda_idleq_insert(cl_map_item_t* memcache_item) +{ + if (FIRST == NULL) { + FIRST = memcache_item; + LAST = memcache_item; + return; + } + INEXT(FIRST) = memcache_item; + IPREV(memcache_item) = FIRST; + FIRST = memcache_item; + INEXT(FIRST) = NULL; + return; +} + +/* + * Remove least recent used element. + */ +static void +am_cuda_idleq_remove_last(cl_map_item_t* memcache_item) +{ + if (!INEXT(memcache_item)) { + LAST = NULL; + FIRST = NULL; + } else { + LAST = INEXT(memcache_item); + IPREV(LAST) = NULL; + } + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; +} + +static void +am_cuda_idleq_remove(cl_map_item_t* memcache_item) +{ + if (LAST == memcache_item) { + am_cuda_idleq_remove_last(memcache_item); + } else if (FIRST == memcache_item) { + FIRST = IPREV(memcache_item); + INEXT(FIRST) = NULL; + } else { + INEXT(IPREV(memcache_item)) = INEXT(memcache_item); + IPREV(INEXT(memcache_item)) = IPREV(memcache_item); + } + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; +} + +static void +am_cuda_idleq_reorder(cl_map_item_t* memcache_item) +{ + if (FIRST == memcache_item && LAST == memcache_item ) { + return; + } + am_cuda_idleq_remove(memcache_item); + am_cuda_idleq_insert(memcache_item); + return; +} + +/* + * After a successful cache hit, item is validated by doing a + * memcmp on the handle stored and the handle we recieve from the + * sender. If the validation fails the item is removed from the idleq, + * the rbtree, is put back into the mpool and IpcCloseMemHandle function + * is called. 
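+ * A match returns PSM2_OK; a collision returns PSM2_OK_NO_PROGRESS after the
+ * stale entry has been torn down, so the caller keeps searching.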
+ */ +static psm2_error_t +am_cuda_memhandle_cache_validate(cl_map_item_t* memcache_item, + uintptr_t sbuf, CUipcMemHandle* handle, + uint32_t length, psm2_epid_t epid) +{ + if ((0 == memcmp(handle, &memcache_item->payload.cuda_ipc_handle, + sizeof(CUipcMemHandle))) + && sbuf == memcache_item->payload.start + && epid == memcache_item->payload.epid) { + return PSM2_OK; + } + _HFI_DBG("cache collision: new entry start=%lu,length=%u\n", sbuf, length); + + cache_collide_counter++; + ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, memcache_item); + PSMI_CUDA_CALL(cuIpcCloseMemHandle, + memcache_item->payload.cuda_ipc_dev_ptr); + am_cuda_idleq_remove(memcache_item); + memset(memcache_item, 0, sizeof(*memcache_item)); + psmi_mpool_put(memcache_item); + return PSM2_OK_NO_PROGRESS; +} + +/* + * Current eviction policy: Least Recently Used. + */ +static void +am_cuda_memhandle_cache_evict(void) +{ + cache_evict_counter++; + cl_map_item_t *p_item = LAST; + _HFI_VDBG("Removing (epid=%lu,start=%lu,length=%u,dev_ptr=0x%llX,it=%p) from cuda_memhandle_cachemap.\n", + p_item->payload.epid, p_item->payload.start, p_item->payload.length, + p_item->payload.cuda_ipc_dev_ptr, p_item); + ips_cl_qmap_remove_item(&cuda_memhandle_cachemap, p_item); + PSMI_CUDA_CALL(cuIpcCloseMemHandle, p_item->payload.cuda_ipc_dev_ptr); + am_cuda_idleq_remove_last(p_item); + memset(p_item, 0, sizeof(*p_item)); + psmi_mpool_put(p_item); +} + +static psm2_error_t +am_cuda_memhandle_cache_register(uintptr_t sbuf, CUipcMemHandle* handle, + uint32_t length, psm2_epid_t epid, + CUdeviceptr cuda_ipc_dev_ptr) +{ + if (NELEMS == cuda_memhandle_cache_size) + am_cuda_memhandle_cache_evict(); + + cl_map_item_t* memcache_item = psmi_mpool_get(cuda_memhandle_mpool); + /* memcache_item cannot be NULL as we evict + * before the call to mpool_get. Check has + * been fixed to help with klockwork analysis. + */ + if (memcache_item == NULL) + return PSM2_NO_MEMORY; + memcache_item->payload.start = sbuf; + memcache_item->payload.cuda_ipc_handle = *handle; + memcache_item->payload.cuda_ipc_dev_ptr = cuda_ipc_dev_ptr; + memcache_item->payload.length = length; + memcache_item->payload.epid = epid; + ips_cl_qmap_insert_item(&cuda_memhandle_cachemap, memcache_item); + am_cuda_idleq_insert(memcache_item); + return PSM2_OK; +} + +static void am_cuda_memhandle_cache_clear(void) +{ + _HFI_DBG("Closing all handles, clearing cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS); + while (NELEMS) { + am_cuda_memhandle_cache_evict(); + } + _HFI_DBG("Closed all handles, cleared cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS); +} + +/* + * The key used to search the cache is the senders buf address pointer. + * Upon a succesful hit in the cache, additional validation is required + * as multiple senders could potentially send the same buf address value. 
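+ * On a miss the handle is opened with cuIpcOpenMemHandle; if that fails with
+ * CUDA_ERROR_ALREADY_MAPPED the cache is cleared and the open is retried,
+ * and the resulting mapping is then registered in the cache.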
+ */ +CUdeviceptr +am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle, + uint32_t length, psm2_epid_t epid) +{ + _HFI_VDBG("sbuf=%lu,handle=%p,length=%u,epid=%lu\n", + sbuf, handle, length, epid); + + CUdeviceptr cuda_ipc_dev_ptr; + if(!cuda_memhandle_cache_enabled) { + PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, + *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + return cuda_ipc_dev_ptr; + } + + cuda_cache_item key = { + .start = (unsigned long) sbuf, + .length= length, + .epid = epid + }; + + /* + * preconditions: + * 1) newrange [start,end) may or may not be in cachemap already + * 2) there are no overlapping address ranges in cachemap + * postconditions: + * 1) newrange is in cachemap + * 2) there are no overlapping address ranges in cachemap + * + * The key used to search the cache is the senders buf address pointer. + * Upon a succesful hit in the cache, additional validation is required + * as multiple senders could potentially send the same buf address value. + */ + cl_map_item_t *p_item = ips_cl_qmap_searchv(&cuda_memhandle_cachemap, &key); + while (p_item->payload.start) { + // Since a precondition is that there are no overlapping ranges in cachemap, + // an exact match implies no need to check further + if (am_cuda_memhandle_cache_validate(p_item, sbuf, handle, length, epid) == PSM2_OK) { + cache_hit_counter++; + am_cuda_idleq_reorder(p_item); + return p_item->payload.cuda_ipc_dev_ptr; + } + + // newrange is not in the cache and overlaps at least one existing range. + // am_cuda_memhandle_cache_validate() closed and removed existing range. + // Continue searching for more overlapping ranges + p_item = ips_cl_qmap_searchv(&cuda_memhandle_cachemap, &key); + } + cache_miss_counter++; + + CUresult cudaerr; + PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_ALREADY_MAPPED, cuIpcOpenMemHandle, + &cuda_ipc_dev_ptr, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + + if (cudaerr == CUDA_ERROR_ALREADY_MAPPED) { + // remote memory already mapped. Close all handles, clear cache, + // and try again + am_cuda_memhandle_cache_clear(); + cache_clear_counter++; + PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, *handle, + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + } + + am_cuda_memhandle_cache_register(sbuf, handle, + length, epid, cuda_ipc_dev_ptr); + return cuda_ipc_dev_ptr; +} + +void +am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr) +{ + if(!cuda_memhandle_cache_enabled) + PSMI_CUDA_CALL(cuIpcCloseMemHandle, cuda_ipc_dev_ptr); + return; +} + +#endif diff --git a/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h b/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h new file mode 100644 index 00000000000..2b1dbc05a42 --- /dev/null +++ b/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h @@ -0,0 +1,84 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifdef PSM_CUDA + +#ifndef _AM_CUDA_MEMHANDLE_CACHE_H +#define _AM_CUDA_MEMHANDLE_CACHE_H + +#include "psm_user.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define CUDA_MEMHANDLE_CACHE_SIZE 64 + +psm2_error_t am_cuda_memhandle_cache_init(uint32_t memcache_size); + +CUdeviceptr +am_cuda_memhandle_acquire(uintptr_t sbuf, CUipcMemHandle* handle, + uint32_t length, psm2_epid_t epid); +void +am_cuda_memhandle_release(CUdeviceptr cuda_ipc_dev_ptr); + +void am_cuda_memhandle_cache_map_fini(); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* _AM_CUDA_MEMHANDLE_CACHE_H */ + +#endif /* PSM_CUDA */ diff --git a/prov/psm3/psm3/ptl_am/am_reqrep.c b/prov/psm3/psm3/ptl_am/am_reqrep.c new file mode 100644 index 00000000000..5f90ec7267e --- /dev/null +++ b/prov/psm3/psm3/ptl_am/am_reqrep.c @@ -0,0 +1,118 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_am.h" +#include "psm_mq_internal.h" +#include "psm_am_internal.h" + +psm2_error_t +psmi_amsh_am_short_request(psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm2_amarg_t req_args[NSHORT_ARGS + NBULK_ARGS]; + + /* All sends are synchronous. Ignore PSM2_AM_FLAG_ASYNC. + * Treat PSM2_AM_FLAG_NOREPLY as "advisory". This was mainly + * used to optimize the IPS path though we could put a stricter interpretation + * on it to disallow any replies. + */ + + /* For now less than NSHORT_ARGS+NBULK_ARGS-1. We use the first arg to carry + * the handler index. + */ + psmi_assert(nargs <= (NSHORT_ARGS + NBULK_ARGS - 1)); + psmi_assert(epaddr->ptlctl->ptl != NULL); + + req_args[0].u32w0 = (uint32_t) handler; + psmi_mq_mtucpy((void *)&req_args[1], (const void *)args, + (nargs * sizeof(psm2_amarg_t))); + psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr, am_handler_hidx, + req_args, nargs + 1, src, len, 0); + + if (completion_fn) + completion_fn(completion_ctxt); + + return PSM2_OK; +} + +psm2_error_t +psmi_amsh_am_short_reply(psm2_am_token_t tok, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm2_amarg_t rep_args[NSHORT_ARGS + NBULK_ARGS]; + + /* For now less than NSHORT_ARGS+NBULK_ARGS-1. We use the first arg to carry + * the handler index. + */ + psmi_assert(nargs <= (NSHORT_ARGS + NBULK_ARGS - 1)); + rep_args[0].u32w0 = (uint32_t) handler; + psmi_mq_mtucpy((void *)&rep_args[1], (const void *)args, + (nargs * sizeof(psm2_amarg_t))); + + psmi_amsh_short_reply((amsh_am_token_t *) tok, am_handler_hidx, + rep_args, nargs + 1, src, len, 0); + + if (completion_fn) + completion_fn(completion_ctxt); + + return PSM2_OK; +} diff --git a/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c new file mode 100644 index 00000000000..2f135a83b83 --- /dev/null +++ b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c @@ -0,0 +1,2716 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. 
When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ + +#include /* shm_open and signal handling */ +#include +#include +#include +#include + +#include "psm_user.h" +#include "psm_mq_internal.h" +#include "psm_am_internal.h" +#include "cmarw.h" +#include "psmi_wrappers.h" + +#ifdef PSM_CUDA +#include "am_cuda_memhandle_cache.h" +#endif + +int psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST; + +static const amsh_qinfo_t amsh_qcounts = { + .qreqFifoShort = 1024, + .qreqFifoLong = 256, + .qrepFifoShort = 1024, + .qrepFifoLong = 256 +}; + +static const amsh_qinfo_t amsh_qelemsz = { + .qreqFifoShort = sizeof(am_pkt_short_t), + .qreqFifoLong = AMLONG_SZ, + .qrepFifoShort = sizeof(am_pkt_short_t), + .qrepFifoLong = AMLONG_SZ +}; + +ustatic struct { + void *addr; + size_t len; + struct sigaction SIGSEGV_old_act; + struct sigaction SIGBUS_old_act; +} action_stash; + +static psm2_error_t amsh_poll(ptl_t *ptl, int replyonly); +static void process_packet(ptl_t *ptl, am_pkt_short_t *pkt, int isreq); +static void amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, + void *buf, size_t len); + +/* Kassist helper functions */ +#if _HFI_DEBUGGING +static const char *psmi_kassist_getmode(int mode); +#endif +static int psmi_get_kassist_mode(); +int psmi_epaddr_pid(psm2_epaddr_t epaddr); + +static inline void +am_ctl_qhdr_init(volatile am_ctl_qhdr_t *q, int elem_cnt, int elem_sz) +{ + pthread_spin_init(&q->lock, PTHREAD_PROCESS_SHARED); + q->head = 0; + q->tail = 0; + q->elem_cnt = elem_cnt; + q->elem_sz = elem_sz; +} + +static void +am_ctl_bulkpkt_init(am_pkt_bulk_t *base_ptr, size_t elemsz, int nelems) +{ + int i; + am_pkt_bulk_t *bulkpkt; + uintptr_t bulkptr = (uintptr_t) base_ptr; + + for (i = 0; i < nelems; i++, bulkptr += elemsz) { + bulkpkt = (am_pkt_bulk_t *) bulkptr; + bulkpkt->idx = i; + } +} + +#define _PA(type) PSMI_ALIGNUP(amsh_qcounts.q ## type * amsh_qelemsz.q ## type, \ + PSMI_PAGESIZE) +static inline uintptr_t am_ctl_sizeof_block() +{ + return PSMI_ALIGNUP( + PSMI_ALIGNUP(AMSH_BLOCK_HEADER_SIZE, PSMI_PAGESIZE) + + /* reqctrl block */ + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) + + _PA(reqFifoShort) + _PA(reqFifoLong) + + /*reqctrl block */ + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) + + /* align to page size */ + _PA(repFifoShort) + _PA(repFifoLong), PSMI_PAGESIZE); +} + +#undef _PA + +static uint32_t create_extra_ep_data() +{ + uint32_t ret = getpid(); + +#ifdef PSM_CUDA + /* PID is at maximum 22 bits */ + ret |= my_gpu_device << 22; +#endif + + return ret; +} + +static void read_extra_ep_data(uint32_t data, uint32_t *pid, uint32_t *gpu) +{ + uint32_t pid_mask = (1 << 22) - 1; + + *pid = data & pid_mask; + *gpu = (data & ~pid_mask) >> 22; +} + +static void am_update_directory(struct am_ctl_nodeinfo *); + +static +void amsh_atexit() +{ + static ips_atomic_t atexit_once = { 0 }; + psm2_ep_t ep; + struct ptl_am *ptl; + + /* bail out if previous value is non-zero */ + if (ips_atomic_cmpxchg(&atexit_once, 0, 1) != 0) + return; + + ep = psmi_opened_endpoint; + while (ep) { + ptl = (struct ptl_am *)(ep->ptl_amsh.ptl); + if (ptl->self_nodeinfo && + ptl->amsh_keyname != NULL) { + _HFI_VDBG("unlinking shm file %s\n", + ptl->amsh_keyname); + shm_unlink(ptl->amsh_keyname); + } + ep = ep->user_ep_next; + } + + return; +} + +ustatic +void amsh_mmap_fault(int signo, siginfo_t *siginfo, void *context) +{ + if ((unsigned long int) siginfo->si_addr >= (unsigned long int) action_stash.addr && + (unsigned long int) siginfo->si_addr < (unsigned long int) action_stash.addr + (unsigned long int) action_stash.len) { + + static 
char shm_errmsg[256]; + + snprintf(shm_errmsg, sizeof(shm_errmsg), + "%s: Unable to allocate shared memory for intra-node messaging.\n" + "%s: Delete stale shared memory files in /dev/shm.\n", + psmi_gethostname(), psmi_gethostname()); + amsh_atexit(); + if (psmi_write(2, shm_errmsg, strlen(shm_errmsg) + 1) == -1) + psmi_exit(2); + else + psmi_exit(1); /* XXX revisit this... there's probably a better way to exit */ + } else { + if (signo == SIGSEGV) { + if (action_stash.SIGSEGV_old_act.sa_sigaction == (void*) SIG_DFL) { + psmi_sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL); + raise(SIGSEGV); + struct sigaction act; + act.sa_sigaction = amsh_mmap_fault; + act.sa_flags = SA_SIGINFO; + psmi_sigaction(SIGSEGV, &act, NULL); + } else if (action_stash.SIGSEGV_old_act.sa_sigaction == (void*) SIG_IGN) { + return; + } else { + action_stash.SIGSEGV_old_act.sa_sigaction(signo, siginfo, context); + } + } else if (signo == SIGBUS) { + if (action_stash.SIGBUS_old_act.sa_sigaction == (void*) SIG_DFL) { + psmi_sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL); + raise(SIGBUS); + struct sigaction act; + act.sa_sigaction = amsh_mmap_fault; + act.sa_flags = SA_SIGINFO; + psmi_sigaction(SIGBUS, &act, NULL); + } else if (action_stash.SIGBUS_old_act.sa_sigaction == (void*) SIG_IGN) { + return; + } else { + action_stash.SIGBUS_old_act.sa_sigaction(signo, siginfo, context); + } + } else { + psmi_exit(signo); + } + } +} + +/** + * Create endpoint shared-memory object, containing ep's info + * and message queues. + */ +psm2_error_t psmi_shm_create(ptl_t *ptl_gen) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + psm2_ep_t ep = ptl->ep; + char shmbuf[256]; + void *mapptr; + size_t segsz; + psm2_error_t err = PSM2_OK; + int shmfd = -1; + char *amsh_keyname = NULL; + int iterator; + /* Get which kassist mode to use. 
*/ + ptl->psmi_kassist_mode = psmi_get_kassist_mode(); + + if (_HFI_PRDBG_ON) { + _HFI_PRDBG_ALWAYS + ("kassist_mode %d %s use_kassist %d\n", + ptl->psmi_kassist_mode, + psmi_kassist_getmode(ptl->psmi_kassist_mode), + (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF)); + } + + segsz = am_ctl_sizeof_block(); + for (iterator = 0; iterator <= INT_MAX; iterator++) { + snprintf(shmbuf, + sizeof(shmbuf), + "/psm3_shm.%ld%016lx%d", + (long int) getuid(), + ep->epid, + iterator); + amsh_keyname = psmi_strdup(NULL, shmbuf); + if (amsh_keyname == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + shmfd = + shm_open(amsh_keyname, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (shmfd < 0) { + psmi_free(amsh_keyname); + amsh_keyname = NULL; + if (errno == EACCES && iterator < INT_MAX) + continue; + else { + err = psmi_handle_error(NULL, + PSM2_SHMEM_SEGMENT_ERR, + "Error creating shared " + "memory object %s in " + "shm_open: %s", + amsh_keyname, strerror(errno)); + goto fail; + } + } else { + struct stat st; + if (fstat(shmfd, &st) == -1) { + err = psmi_handle_error(NULL, + PSM2_SHMEM_SEGMENT_ERR, + "Error validating " + "shared memory object %s " + "with fstat: %s", + amsh_keyname, strerror(errno)); + goto fail; + } + if (getuid() == st.st_uid) { + err = PSM2_OK; + break; + } else { + err = PSM2_SHMEM_SEGMENT_ERR; + close(shmfd); + } + } + } + if (err) { + if (amsh_keyname) psmi_free(amsh_keyname); + err = psmi_handle_error(NULL, + PSM2_SHMEM_SEGMENT_ERR, + "Error creating shared memory object " + "in shm_open: namespace exhausted."); + goto fail; + } + + /* Now register the atexit handler for cleanup, whether master or slave */ + atexit(amsh_atexit); + + _HFI_PRDBG("Opened shmfile %s\n", amsh_keyname); + + if (ftruncate(shmfd, segsz) != 0) { + err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, + "Error setting size of shared memory object to %u bytes in " + "ftruncate: %s\n", + (uint32_t) segsz, + strerror(errno)); + goto fail; + } + + mapptr = mmap(NULL, segsz, + PROT_READ | PROT_WRITE, MAP_SHARED, shmfd, 0); + if (mapptr == MAP_FAILED) { + err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, + "Error mmapping shared memory: %s", + strerror(errno)); + psmi_free(amsh_keyname); + goto fail; + } + + memset((void *) mapptr, 0, segsz); /* touch all of my pages */ + + /* Our own ep's info for ptl_am resides at the start of the + shm object. Other processes need some of this info to + understand the rest of the queue structure and other details. 
*/ + ptl->self_nodeinfo = (struct am_ctl_nodeinfo *) mapptr; + ptl->amsh_keyname = amsh_keyname; + ptl->self_nodeinfo->amsh_shmbase = (uintptr_t) mapptr; + +fail: + if (shmfd >= 0) close(shmfd); + return err; +} + +psm2_error_t psmi_epdir_extend(ptl_t *ptl_gen) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + struct am_ctl_nodeinfo *new = NULL; + + new = (struct am_ctl_nodeinfo *) + psmi_memalign(ptl->ep, PER_PEER_ENDPOINT, 64, + (ptl->am_ep_size + AMSH_DIRBLOCK_SIZE) * + sizeof(struct am_ctl_nodeinfo)); + if (new == NULL) + return PSM2_NO_MEMORY; + + memcpy(new, ptl->am_ep, + ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo)); + memset(new + ptl->am_ep_size, 0, + AMSH_DIRBLOCK_SIZE * sizeof(struct am_ctl_nodeinfo)); + + psmi_free(ptl->am_ep); + ptl->am_ep = new; + ptl->am_ep_size += AMSH_DIRBLOCK_SIZE; + + return PSM2_OK; +} + +/** + * Unmap shm regions upon proper disconnect with other processes + */ +psm2_error_t psmi_do_unmap(uintptr_t shmbase) +{ + psm2_error_t err = PSM2_OK; + if (munmap((void *)shmbase, am_ctl_sizeof_block())) { + err = + psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, + "Error with munmap of shared segment: %s", + strerror(errno)); + } + return err; +} + +/** + * Map a remote process' shared memory object. + * + * If the remote process has a shared memory object available, add it to our own + * directory and return the shmidx. If the shared memory object does not exist, + * return -1, and the connect poll function will try to map again later. + * + * If force_remap is true, then clear the entry that matches the epid. + */ +psm2_error_t psmi_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shmidx_o, int force_remap) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + int i; + int use_kassist; + uint16_t shmidx; + char shmbuf[256]; + void *dest_mapptr; + size_t segsz; + psm2_error_t err = PSM2_OK; + int dest_shmfd; + struct am_ctl_nodeinfo *dest_nodeinfo; + int iterator; + + shmidx = *shmidx_o = -1; + + for (i = 0; i <= ptl->max_ep_idx; i++) { + if (ptl->am_ep[i].epid == epid) { + if (force_remap) { + ptl->am_ep[i].epaddr = NULL; + ptl->am_ep[i].epid = 0; + break; + } + *shmidx_o = shmidx = i; + return err; + } + } + + + use_kassist = (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF); + + segsz = am_ctl_sizeof_block(); + for (iterator = 0; iterator <= INT_MAX; iterator++) { + snprintf(shmbuf, + sizeof(shmbuf), + "/psm3_shm.%ld%016lx%d", + (long int) getuid(), + epid, + iterator); + dest_shmfd = shm_open(shmbuf, O_RDWR, S_IRWXU); + if (dest_shmfd < 0) { + if (errno == EACCES && iterator < INT_MAX) + continue; + else { + err = psmi_handle_error(NULL, + PSM2_SHMEM_SEGMENT_ERR, + "Error opening remote " + "shared memory object %s " + "in shm_open: %s", + shmbuf, strerror(errno)); + goto fail; + } + } else { + struct stat st; + if (fstat(dest_shmfd, &st) == -1) { + err = psmi_handle_error(NULL, + PSM2_SHMEM_SEGMENT_ERR, + "Error validating " + "shared memory object %s " + "with fstat: %s", + shmbuf, strerror(errno)); + close(dest_shmfd); + goto fail; + } + if (getuid() == st.st_uid) { + err = PSM2_OK; + break; + } else { + err = PSM2_SHMEM_SEGMENT_ERR; + close(dest_shmfd); + } + } + } + if (err) { + err = psmi_handle_error(NULL, + PSM2_SHMEM_SEGMENT_ERR, + "Error opening remote shared " + "memory object in shm_open: " + "namespace exhausted."); + goto fail; + } + + dest_mapptr = mmap(NULL, segsz, + PROT_READ | PROT_WRITE, MAP_SHARED, dest_shmfd, 0); + if (dest_mapptr == MAP_FAILED) { + err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, + "Error mmapping 
remote shared memory: %s", + strerror(errno)); + close(dest_shmfd); + goto fail; + } + close(dest_shmfd); + dest_nodeinfo = (struct am_ctl_nodeinfo *)dest_mapptr; + + /* We core dump right after here if we don't check the mmap */ + action_stash.addr = dest_mapptr; + action_stash.len = segsz; + + struct sigaction act = { .sa_sigaction = amsh_mmap_fault, .sa_flags = SA_SIGINFO }; + + sigaction(SIGSEGV, &act, &action_stash.SIGSEGV_old_act); + sigaction(SIGBUS, &act, &action_stash.SIGBUS_old_act); + + { + volatile uint16_t *is_init = &dest_nodeinfo->is_init; + while (*is_init == 0) + usleep(1); + ips_sync_reads(); + _HFI_PRDBG("Got a published remote dirpage page at " + "%p, size=%dn", dest_mapptr, (int)segsz); + } + + shmidx = -1; + if ((ptl->max_ep_idx + 1) == ptl->am_ep_size) { + err = psmi_epdir_extend(ptl_gen); + if (err) + goto fail; + + for (i = 0; i <= ptl->max_ep_idx; i++) { + if (ptl->am_ep[i].epid != 0) + am_update_directory(&ptl->am_ep[i]); + } + } + for (i = 0; i < ptl->am_ep_size; i++) { + psmi_assert(ptl->am_ep[i].epid != epid); + if (ptl->am_ep[i].epid == 0) { + ptl->am_ep[i].epid = epid; + ptl->am_ep[i].psm_verno = dest_nodeinfo->psm_verno; + ptl->am_ep[i].pid = dest_nodeinfo->pid; + if (use_kassist) { + /* If we are able to use CMA assume everyone + * else on the node can also use it. + * Advertise that CMA is active via the + * feature flag. + */ + + if (cma_available()) { + ptl->am_ep[i].amsh_features |= + AMSH_HAVE_CMA; + psmi_shm_mq_rv_thresh = + PSMI_MQ_RV_THRESH_CMA; + } else { + ptl->psmi_kassist_mode = + PSMI_KASSIST_OFF; + use_kassist = 0; + psmi_shm_mq_rv_thresh = + PSMI_MQ_RV_THRESH_NO_KASSIST; + } + } else + psmi_shm_mq_rv_thresh = + PSMI_MQ_RV_THRESH_NO_KASSIST; + _HFI_PRDBG("KASSIST MODE: %s\n", + psmi_kassist_getmode(ptl->psmi_kassist_mode)); + shmidx = *shmidx_o = i; + _HFI_PRDBG("Mapped epid %lx into shmidx %d\n", epid, shmidx); + ptl->am_ep[i].amsh_shmbase = (uintptr_t) dest_mapptr; + ptl->am_ep[i].amsh_qsizes = dest_nodeinfo->amsh_qsizes; + if (i > ptl->max_ep_idx) + ptl->max_ep_idx = i; + break; + } + } + + /* install the old sighandler back */ + sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL); + sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL); + + if (shmidx == (uint16_t)-1) + err = psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, + "Could not connect to local endpoint"); +fail: + return err; +} + +/** + * Initialize pointer structure and locks for endpoint shared-memory AM. 
+ */ + +#define AMSH_QSIZE(type) \ + PSMI_ALIGNUP(amsh_qelemsz.q ## type * amsh_qcounts.q ## type, \ + PSMI_PAGESIZE) + +static psm2_error_t amsh_init_segment(ptl_t *ptl_gen) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + psm2_error_t err = PSM2_OK; + + /* Preconditions */ + psmi_assert_always(ptl != NULL); + psmi_assert_always(ptl->ep != NULL); + psmi_assert_always(ptl->epaddr != NULL); + psmi_assert_always(ptl->ep->epid != 0); + + if ((err = psmi_shm_create(ptl_gen))) + goto fail; + + ptl->self_nodeinfo->amsh_qsizes.qreqFifoShort = AMSH_QSIZE(reqFifoShort); + ptl->self_nodeinfo->amsh_qsizes.qreqFifoLong = AMSH_QSIZE(reqFifoLong); + ptl->self_nodeinfo->amsh_qsizes.qrepFifoShort = AMSH_QSIZE(repFifoShort); + ptl->self_nodeinfo->amsh_qsizes.qrepFifoLong = AMSH_QSIZE(repFifoLong); + + /* We core dump right after here if we don't check the mmap */ + + struct sigaction act = { + .sa_sigaction = amsh_mmap_fault, + .sa_flags = SA_SIGINFO + }; + + sigaction(SIGSEGV, &act, &action_stash.SIGSEGV_old_act); + sigaction(SIGBUS, &act, &action_stash.SIGBUS_old_act); + + /* + * Now that we know our epid, update it in the shmidx array + */ + ptl->reqH.base = ptl->reqH.head = ptl->reqH.end = NULL; + ptl->repH.base = ptl->repH.head = ptl->repH.end = NULL; + + am_update_directory(ptl->self_nodeinfo); + + ptl->reqH.head = ptl->reqH.base = (am_pkt_short_t *) + (((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort)); + ptl->reqH.end = (am_pkt_short_t *) + (((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort) + + amsh_qcounts.qreqFifoShort * amsh_qelemsz.qreqFifoShort); + + ptl->repH.head = ptl->repH.base = (am_pkt_short_t *) + (((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort)); + ptl->repH.end = (am_pkt_short_t *) + (((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort) + + amsh_qcounts.qrepFifoShort * amsh_qelemsz.qrepFifoShort); + + am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->shortq, + amsh_qcounts.qreqFifoShort, + amsh_qelemsz.qreqFifoShort); + am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->longbulkq, + amsh_qcounts.qreqFifoLong, amsh_qelemsz.qreqFifoLong); + am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->shortq, + amsh_qcounts.qrepFifoShort, + amsh_qelemsz.qrepFifoShort); + am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->longbulkq, + amsh_qcounts.qrepFifoLong, amsh_qelemsz.qrepFifoLong); + + /* Set bulkidx in every bulk packet */ + am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qreqFifoLong, + amsh_qelemsz.qreqFifoLong, + amsh_qcounts.qreqFifoLong); + am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qrepFifoLong, + amsh_qelemsz.qrepFifoLong, + amsh_qcounts.qrepFifoLong); + + /* install the old sighandler back */ + sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL); + sigaction(SIGBUS, &action_stash.SIGBUS_old_act, NULL); + +fail: + return err; +} + +psm2_error_t psmi_shm_detach(ptl_t *ptl_gen) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + psm2_error_t err = PSM2_OK; + uintptr_t shmbase; + + if (ptl->self_nodeinfo == NULL) + return err; + + _HFI_VDBG("unlinking shm file %s\n", ptl->amsh_keyname + 1); + shmbase = ptl->self_nodeinfo->amsh_shmbase; + shm_unlink(ptl->amsh_keyname); + psmi_free(ptl->amsh_keyname); + + if (munmap((void *)shmbase, am_ctl_sizeof_block())) { + err = + psmi_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, + "Error with munmap of shared segment: %s", + strerror(errno)); + goto fail; + } + ptl->self_nodeinfo = NULL; + return PSM2_OK; + +fail: + return err; +} + +/** + * Update locally shared-pointer directory. 
The directory must be + * updated when a new epaddr is connected to or on every epaddr already + * connected to whenever the shared memory segment is relocated via mremap. + * + * @param epaddr Endpoint address for which to update local directory. + */ + +static +void am_update_directory(struct am_ctl_nodeinfo *nodeinfo) +{ + uintptr_t base_this; + + base_this = nodeinfo->amsh_shmbase + + AMSH_BLOCK_HEADER_SIZE; + + /* Request queues */ + nodeinfo->qdir.qreqH = (am_ctl_blockhdr_t *) base_this; + nodeinfo->qdir.qreqFifoShort = (am_pkt_short_t *) + ((uintptr_t) nodeinfo->qdir.qreqH + + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE)); + + nodeinfo->qdir.qreqFifoLong = (am_pkt_bulk_t *) + ((uintptr_t) nodeinfo->qdir.qreqFifoShort + + nodeinfo->amsh_qsizes.qreqFifoShort); + + /* Reply queues */ + nodeinfo->qdir.qrepH = (am_ctl_blockhdr_t *) + ((uintptr_t) nodeinfo->qdir.qreqFifoLong + + nodeinfo->amsh_qsizes.qreqFifoLong); + + nodeinfo->qdir.qrepFifoShort = (am_pkt_short_t *) + ((uintptr_t) nodeinfo->qdir.qrepH + + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE)); + nodeinfo->qdir.qrepFifoLong = (am_pkt_bulk_t *) + ((uintptr_t) nodeinfo->qdir.qrepFifoShort + + nodeinfo->amsh_qsizes.qrepFifoShort); + + _HFI_VDBG("epaddr=%p Request Hdr=%p,Pkt=%p,Long=%p\n", + nodeinfo->epaddr, + nodeinfo->qdir.qreqH, + nodeinfo->qdir.qreqFifoShort, + nodeinfo->qdir.qreqFifoLong); + _HFI_VDBG("epaddr=%p Reply Hdr=%p,Pkt=%p,Long=%p\n", + nodeinfo->epaddr, + nodeinfo->qdir.qrepH, + nodeinfo->qdir.qrepFifoShort, + nodeinfo->qdir.qrepFifoLong); + + /* Sanity check */ + uintptr_t base_next = + (uintptr_t) nodeinfo->qdir.qrepFifoLong + + nodeinfo->amsh_qsizes.qrepFifoLong; + + psmi_assert_always(base_next - base_this <= am_ctl_sizeof_block()); +} + + +/* ep_epid_share_memory wrapper */ +static +int amsh_epid_reachable(ptl_t *ptl_gen, psm2_epid_t epid) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + int result; + psm2_error_t err; + err = psm2_ep_epid_share_memory(ptl->ep, epid, &result); + psmi_assert_always(err == PSM2_OK); + return result; +} + +static +psm2_error_t +amsh_epaddr_add(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t shmidx, psm2_epaddr_t *epaddr_o) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + psm2_epaddr_t epaddr; + am_epaddr_t *amaddr; + psm2_error_t err = PSM2_OK; + + psmi_assert(psmi_epid_lookup(ptl->ep, epid) == NULL); + + /* The self PTL handles loopback communication. 
*/ + psmi_assert(epid != ptl->epid); + + /* note the size of the memory is am_epaddr_t */ + epaddr = (psm2_epaddr_t) psmi_calloc(ptl->ep, + PER_PEER_ENDPOINT, 1, + sizeof(am_epaddr_t)); + if (epaddr == NULL) { + return PSM2_NO_MEMORY; + } + psmi_assert_always(ptl->am_ep[shmidx].epaddr == NULL); + + if ((err = psmi_epid_set_hostname(psm2_epid_nid(epid), + psmi_gethostname(), 0))) + goto fail; + + epaddr->ptlctl = ptl->ctl; + epaddr->epid = epid; + + /* convert to am_epaddr_t */ + amaddr = (am_epaddr_t *) epaddr; + /* tell the other endpoint their location in our directory */ + amaddr->shmidx = shmidx; + /* we haven't connected yet, so we can't give them the same hint */ + amaddr->return_shmidx = -1; + amaddr->cstate_outgoing = AMSH_CSTATE_OUTGOING_NONE; + amaddr->cstate_incoming = AMSH_CSTATE_INCOMING_NONE; + + /* other setup */ + ptl->am_ep[shmidx].epaddr = epaddr; + am_update_directory(&ptl->am_ep[shmidx]); + /* Finally, add to table */ + if ((err = psmi_epid_add(ptl->ep, epid, epaddr))) + goto fail; + _HFI_VDBG("epaddr=%s added to ptl=%p\n", + psmi_epaddr_get_name(epid), ptl); + *epaddr_o = epaddr; + return PSM2_OK; +fail: + if (epaddr != ptl->epaddr) + psmi_free(epaddr); + return err; +} + +static +void +amsh_epaddr_update(ptl_t *ptl_gen, psm2_epaddr_t epaddr) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + am_epaddr_t *amaddr; + uint16_t shmidx; + struct am_ctl_nodeinfo *nodeinfo; + + amaddr = (am_epaddr_t *) epaddr; + shmidx = amaddr->shmidx; + nodeinfo = (struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase; + + /* restart the connection process */ + amaddr->return_shmidx = -1; + amaddr->cstate_outgoing = AMSH_CSTATE_OUTGOING_NONE; + + /* wait for the other process to init again */ + { + volatile uint16_t *is_init = &nodeinfo->is_init; + while (*is_init == 0) + usleep(1); + ips_sync_reads(); + } + + /* get the updated values from the new nodeinfo page */ + ptl->am_ep[shmidx].psm_verno = nodeinfo->psm_verno; + ptl->am_ep[shmidx].pid = nodeinfo->pid; + ptl->am_ep[shmidx].amsh_qsizes = nodeinfo->amsh_qsizes; + am_update_directory(&ptl->am_ep[shmidx]); + return; +} + +struct ptl_connection_req { + int isdone; + int op; /* connect or disconnect */ + int numep; + int numep_left; + int phase; + + int *epid_mask; + const psm2_epid_t *epids; /* input epid list */ + psm2_epaddr_t *epaddr; + psm2_error_t *errors; /* inout errors */ + + /* Used for connect/disconnect */ + psm2_amarg_t args[4]; +}; + +static +void amsh_free_epaddr(psm2_epaddr_t epaddr) +{ + psmi_epid_remove(epaddr->ptlctl->ep, epaddr->epid); + psmi_free(epaddr); + return; +} + +#define PTL_OP_CONNECT 0 +#define PTL_OP_DISCONNECT 1 +#define PTL_OP_ABORT 2 + +static +psm2_error_t +amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ + int numep, const psm2_epid_t *array_of_epid, /* non-NULL on connect */ + const int array_of_epid_mask[], + psm2_error_t *array_of_errors, + psm2_epaddr_t *array_of_epaddr, + struct ptl_connection_req **req_o) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + int i, cstate; + psm2_epaddr_t epaddr; + psm2_epid_t epid; + struct ptl_connection_req *req = NULL; + + req = (struct ptl_connection_req *) + psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, 1, + sizeof(struct ptl_connection_req)); + if (req == NULL) + return PSM2_NO_MEMORY; + req->isdone = 0; + req->op = op; + req->numep = numep; + req->numep_left = 0; + req->phase = ptl->connect_phase; + req->epid_mask = (int *) + psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, numep, sizeof(int)); + if (req->epid_mask == NULL) { + 
psmi_free(req); + return PSM2_NO_MEMORY; + } + req->epaddr = array_of_epaddr; + req->epids = array_of_epid; + req->errors = array_of_errors; + + /* First check if there's really something to connect/disconnect + * for this PTL */ + for (i = 0; i < numep; i++) { + req->epid_mask[i] = AMSH_CMASK_NONE; /* no connect by default */ + if (!array_of_epid_mask[i]) + continue; + if (op == PTL_OP_CONNECT) { + epid = array_of_epid[i]; + + /* Connect only to other processes reachable by shared memory. + The self PTL handles loopback communication, so explicitly + refuse to connect to self. */ + if (!amsh_epid_reachable(ptl_gen, epid) + || epid == ptl->epid) { + array_of_errors[i] = PSM2_EPID_UNREACHABLE; + array_of_epaddr[i] = NULL; + continue; + } + + _HFI_VDBG("looking at epid %llx\n", + (unsigned long long)epid); + epaddr = psmi_epid_lookup(ptl->ep, epid); + if (epaddr != NULL) { + if (epaddr->ptlctl->ptl != ptl_gen) { + array_of_errors[i] = + PSM2_EPID_UNREACHABLE; + array_of_epaddr[i] = NULL; + continue; + } + cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; + if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) { + array_of_epaddr[i] = epaddr; + array_of_errors[i] = PSM2_OK; + } else { + psmi_assert(cstate == + AMSH_CSTATE_OUTGOING_NONE); + array_of_errors[i] = PSM2_TIMEOUT; + array_of_epaddr[i] = epaddr; + req->epid_mask[i] = AMSH_CMASK_PREREQ; + } + } else { + req->epid_mask[i] = AMSH_CMASK_PREREQ; + array_of_epaddr[i] = NULL; + } + } else { /* disc or abort */ + epaddr = array_of_epaddr[i]; + if (epaddr->ptlctl->ptl != ptl_gen) + continue; + + psmi_assert(epaddr != NULL); + cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; + if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) { + req->epid_mask[i] = AMSH_CMASK_PREREQ; + _HFI_VDBG + ("Just set index %d to AMSH_CMASK_PREREQ\n", + i); + } + /* XXX undef ? */ + } + if (req->epid_mask[i] != AMSH_CMASK_NONE) + req->numep_left++; + } + + if (req->numep_left == 0) { /* nothing to do */ + psmi_free(req->epid_mask); + psmi_free(req); + _HFI_VDBG("Nothing to connect, bump up phase\n"); + ptl->connect_phase++; + *req_o = NULL; + return PSM2_OK; + } else { + *req_o = req; + return PSM2_OK_NO_PROGRESS; + } +} + +static +psm2_error_t +amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + int i, j, cstate; + uint16_t shmidx = (uint16_t)-1; + psm2_error_t err = PSM2_OK; + psm2_epid_t epid; + psm2_epaddr_t epaddr; + + if (req == NULL || req->isdone) + return PSM2_OK; + + psmi_assert_always(ptl->connect_phase == req->phase); + + if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) { + for (i = 0; i < req->numep; i++) { + if (req->epid_mask[i] == AMSH_CMASK_NONE || + req->epid_mask[i] == AMSH_CMASK_DONE) + continue; + + epaddr = req->epaddr[i]; + psmi_assert(epaddr != NULL); + if (req->epid_mask[i] == AMSH_CMASK_PREREQ) { + shmidx = ((am_epaddr_t *) epaddr)->shmidx; + /* Make sure the target of the disconnect is still there */ + if (ptl->am_ep[shmidx]. 
+ epid != epaddr->epid) { + req->numep_left--; + req->epid_mask[i] = AMSH_CMASK_DONE; + ((am_epaddr_t *) epaddr)->cstate_outgoing = + AMSH_CSTATE_OUTGOING_NONE; + } + } + + if (req->epid_mask[i] == AMSH_CMASK_PREREQ) { + req->args[0].u16w0 = PSMI_AM_DISC_REQ; + req->args[0].u16w1 = shmidx; + req->args[0].u32w1 = ptl->connect_phase; + req->args[1].u64w0 = (uint64_t) ptl->epid; + psmi_assert(shmidx != (uint16_t)-1); + req->args[2].u32w0 = create_extra_ep_data(); + req->args[2].u32w1 = PSM2_OK; + req->args[3].u64w0 = + (uint64_t) (uintptr_t) &req->errors[i]; + psmi_amsh_short_request(ptl_gen, epaddr, + amsh_conn_handler_hidx, + req->args, 4, NULL, 0, + 0); + ((am_epaddr_t *) epaddr)->cstate_outgoing = + AMSH_CSTATE_OUTGOING_DISC_REQUESTED; + /** + * Only munmap if we have nothing more to + * communicate with the other node, i.e. we + * already recieved a disconnect req from the + * other node. + */ + if (((am_epaddr_t *) epaddr)->cstate_incoming == + AMSH_CSTATE_INCOMING_DISC_REQUESTED) + err = psmi_do_unmap(ptl->am_ep[shmidx].amsh_shmbase); + req->epid_mask[i] = AMSH_CMASK_POSTREQ; + } else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) { + cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; + if (cstate == AMSH_CSTATE_OUTGOING_DISC_REPLIED) { + req->numep_left--; + req->epid_mask[i] = AMSH_CMASK_DONE; + ((am_epaddr_t *) epaddr)->cstate_outgoing = + AMSH_CSTATE_OUTGOING_NONE; + } + } + } + } else { + /* First see if we've made progress on any postreqs */ + int n_prereq = 0; + for (i = 0; i < req->numep; i++) { + int cstate; + if (req->epid_mask[i] != AMSH_CMASK_POSTREQ) { + if (req->epid_mask[i] == AMSH_CMASK_PREREQ) + n_prereq++; + continue; + } + epaddr = req->epaddr[i]; + psmi_assert(epaddr != NULL); + + /* detect if a race has occurred on due to re-using an + * old shm file - if so, restart the connection */ + shmidx = ((am_epaddr_t *) epaddr)->shmidx; + if (ptl->am_ep[shmidx].pid != + ((struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase)->pid) { + req->epid_mask[i] = AMSH_CMASK_PREREQ; + ((am_epaddr_t *) epaddr)->cstate_outgoing = + AMSH_CSTATE_OUTGOING_NONE; + n_prereq++; + amsh_epaddr_update(ptl_gen, epaddr); + continue; + } + + cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; + if (cstate == AMSH_CSTATE_OUTGOING_REPLIED) { + req->numep_left--; + ((am_epaddr_t *) epaddr)->cstate_outgoing = + AMSH_CSTATE_OUTGOING_ESTABLISHED; + req->epid_mask[i] = AMSH_CMASK_DONE; + continue; + } + } + if (n_prereq > 0) { + psmi_assert(req->numep_left > 0); + /* Go through the list of peers we need to connect to and find out + * if they each shared ep is mapped into shm */ + for (i = 0; i < req->numep; i++) { + if (req->epid_mask[i] != AMSH_CMASK_PREREQ) + continue; + epid = req->epids[i]; + epaddr = req->epaddr[i]; + /* Go through mapped epids and find the epid we're looking for */ + for (shmidx = -1, j = 0; + j <= ptl->max_ep_idx; j++) { + /* epid is connected and ready to go */ + if (ptl->am_ep[j]. + epid == epid) { + shmidx = j; + break; + } + } + if (shmidx == (uint16_t)-1) { + /* Couldn't find peer's epid in dirpage. + Check shmdir to see if epid is up now. */ + if ((err = psmi_shm_map_remote(ptl_gen, epid, &shmidx, 0))) { + return err; + } + continue; + } + /* Before we even send the request out, check to see if + * versions are interoperable */ + if (!psmi_verno_isinteroperable + (ptl->am_ep[shmidx]. + psm_verno)) { + char buf[32]; + uint16_t their_verno = + ptl->am_ep[shmidx]. 
+ psm_verno; + snprintf(buf, sizeof(buf), "%d.%d", + PSMI_VERNO_GET_MAJOR + (their_verno), + PSMI_VERNO_GET_MINOR + (their_verno)); + + _HFI_INFO("Local endpoint id %" PRIx64 + " has version %s " + "which is not supported by library version %d.%d", + epid, buf, PSM2_VERNO_MAJOR, + PSM2_VERNO_MINOR); + req->errors[i] = + PSM2_EPID_INVALID_VERSION; + req->numep_left--; + req->epid_mask[i] = AMSH_CMASK_DONE; + continue; + } + if (epaddr != NULL) { + psmi_assert(((am_epaddr_t *) epaddr)-> + shmidx == shmidx); + } else + if ((epaddr = + psmi_epid_lookup(ptl->ep, + epid)) == NULL) { + if ((err = + amsh_epaddr_add(ptl_gen, epid, shmidx, + &epaddr))) { + return err; + } + /* Remote pid is unknown at the moment */ + ((am_epaddr_t *) epaddr)->pid = + AMSH_PID_UNKNOWN; + } + req->epaddr[i] = epaddr; + req->args[0].u16w0 = PSMI_AM_CONN_REQ; + /* tell the other process its shmidx here */ + req->args[0].u16w1 = shmidx; + req->args[0].u32w1 = ptl->connect_phase; + req->args[1].u64w0 = (uint64_t) ptl->epid; + req->args[2].u32w0 = create_extra_ep_data(); + req->args[2].u32w1 = PSM2_OK; + req->args[3].u64w0 = + (uint64_t) (uintptr_t) &req->errors[i]; + req->epid_mask[i] = AMSH_CMASK_POSTREQ; + psmi_amsh_short_request(ptl_gen, epaddr, + amsh_conn_handler_hidx, + req->args, 4, NULL, 0, + 0); + _HFI_PRDBG("epaddr=%p, epid=%" PRIx64 + " at shmidx=%d\n", epaddr, epid, + shmidx); + } + } + } + + if (req->numep_left == 0) { /* we're all done */ + req->isdone = 1; + return PSM2_OK; + } else { + sched_yield(); + return PSM2_OK_NO_PROGRESS; + } +} + +static +psm2_error_t +amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + psm2_error_t err = PSM2_OK; + int i; + + /* Wherever we are at in our connect process, we've been instructed to + * finish the connection process */ + if (req == NULL) + return PSM2_OK; + + /* This prevents future connect replies from referencing data structures + * that disappeared */ + ptl->connect_phase++; + + /* First process any leftovers in postreq or prereq */ + for (i = 0; i < req->numep; i++) { + if (req->epid_mask[i] == AMSH_CMASK_NONE) + continue; + else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) { + int cstate; + req->epid_mask[i] = AMSH_CMASK_DONE; + cstate = ((am_epaddr_t *) req->epaddr[i])->cstate_outgoing; + if (cstate == AMSH_CSTATE_OUTGOING_REPLIED) { + req->numep_left--; + ((am_epaddr_t *) req->epaddr[i])->cstate_outgoing = + AMSH_CSTATE_OUTGOING_ESTABLISHED; + } else { /* never actually got reply */ + req->errors[i] = PSM2_TIMEOUT; + } + } + /* If we couldn't go from prereq to postreq, that means we couldn't + * find the shmidx for an epid in time. This can only be a case of + * time out */ + else if (req->epid_mask[i] == AMSH_CMASK_PREREQ) { + req->errors[i] = PSM2_TIMEOUT; + req->numep_left--; + req->epid_mask[i] = AMSH_CMASK_DONE; + } + } + + /* Whatever is left can only be in DONE or NONE state */ + for (i = 0; i < req->numep; i++) { + if (req->epid_mask[i] == AMSH_CMASK_NONE) + continue; + psmi_assert(req->epid_mask[i] == AMSH_CMASK_DONE); + + err = psmi_error_cmp(err, req->errors[i]); + /* XXX TODO: Report errors in connection. 
*/ + /* Only free epaddr if they have disconnected from us */ + int cstate = ((am_epaddr_t *) req->epaddr[i])->cstate_incoming; + if (cstate == AMSH_CSTATE_INCOMING_DISC_REQUESTED) { + if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) { + psmi_assert(req->epaddr[i] != NULL); + amsh_free_epaddr(req->epaddr[i]); + req->epaddr[i] = NULL; + } + } + } + + psmi_free(req->epid_mask); + psmi_free(req); + + return err; +} + +/* Wrapper for 2.0's use of connect/disconnect. The plan is to move the + * init/poll/fini interface up to the PTL level for 2.2 */ +#define CONNREQ_ZERO_POLLS_BEFORE_YIELD 20 +static +psm2_error_t +amsh_ep_connreq_wrap(ptl_t *ptl_gen, int op, + int numep, + const psm2_epid_t *array_of_epid, + const int array_of_epid_mask[], + psm2_error_t *array_of_errors, + psm2_epaddr_t *array_of_epaddr, uint64_t timeout_ns) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + psm2_error_t err; + uint64_t t_start; + struct ptl_connection_req *req; + int num_polls_noprogress = 0; + static int shm_polite_attach = -1; + + if (shm_polite_attach == -1) { + char *p = getenv("PSM3_SHM_POLITE_ATTACH"); + if (p && *p && atoi(p) != 0) { + fprintf(stderr, "%s: Using Polite SHM segment attach\n", + psmi_gethostname()); + shm_polite_attach = 1; + } + shm_polite_attach = 0; + } + + /* Initialize */ + err = amsh_ep_connreq_init(ptl_gen, op, numep, + array_of_epid, array_of_epid_mask, + array_of_errors, array_of_epaddr, &req); + if (err != PSM2_OK_NO_PROGRESS) /* Either we're all done with connect or + * there was an error */ + return err; + + /* Poll until either + * 1. We time out + * 2. We are done with connecting + */ + t_start = get_cycles(); + do { + psmi_poll_internal(ptl->ep, 1); + err = amsh_ep_connreq_poll(ptl_gen, req); + if (err == PSM2_OK) + break; /* Finished before timeout */ + else if (err != PSM2_OK_NO_PROGRESS) { + psmi_free(req->epid_mask); + psmi_free(req); + goto fail; + } else if (shm_polite_attach && + ++num_polls_noprogress == + CONNREQ_ZERO_POLLS_BEFORE_YIELD) { + num_polls_noprogress = 0; + PSMI_YIELD(ptl->ep->mq->progress_lock); + } + } + while (psmi_cycles_left(t_start, timeout_ns)); + + err = amsh_ep_connreq_fini(ptl_gen, req); + +fail: + return err; +} + +static +psm2_error_t +amsh_ep_connect(ptl_t *ptl, + int numep, + const psm2_epid_t *array_of_epid, + const int array_of_epid_mask[], + psm2_error_t *array_of_errors, + psm2_epaddr_t *array_of_epaddr, uint64_t timeout_ns) +{ + return amsh_ep_connreq_wrap(ptl, PTL_OP_CONNECT, numep, array_of_epid, + array_of_epid_mask, array_of_errors, + array_of_epaddr, timeout_ns); +} + +static +psm2_error_t +amsh_ep_disconnect(ptl_t *ptl, int force, int numep, + psm2_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm2_error_t array_of_errors[], uint64_t timeout_ns) +{ + return amsh_ep_connreq_wrap(ptl, + force ? 
PTL_OP_ABORT : PTL_OP_DISCONNECT, + numep, NULL, array_of_epaddr_mask, + array_of_errors, + array_of_epaddr, + timeout_ns); +} + +#undef CSWAP +PSMI_ALWAYS_INLINE( +int32_t +cswap(volatile int32_t *p, int32_t old_value, int32_t new_value)) +{ + asm volatile ("lock cmpxchg %2, %0" : + "+m" (*p), "+a"(old_value) : "r"(new_value) : "memory"); + return old_value; +} + +PSMI_ALWAYS_INLINE( +am_pkt_short_t * +am_ctl_getslot_pkt_inner(volatile am_ctl_qhdr_t *shq, am_pkt_short_t *pkt0)) +{ + am_pkt_short_t *pkt; + uint32_t idx; +#ifndef CSWAP + pthread_spin_lock(&shq->lock); + idx = shq->tail; + pkt = (am_pkt_short_t *) ((uintptr_t) pkt0 + idx * shq->elem_sz); + if (pkt->flag == QFREE) { + ips_sync_reads(); + pkt->flag = QUSED; + shq->tail += 1; + if (shq->tail == shq->elem_cnt) + shq->tail = 0; + } else { + pkt = 0; + } + pthread_spin_unlock(&shq->lock); +#else + uint32_t idx_next; + do { + idx = shq->tail; + idx_next = (idx + 1 == shq->elem_cnt) ? 0 : idx + 1; + } while (cswap(&shq->tail, idx, idx_next) != idx); + + pkt = (am_pkt_short_t *) ((uintptr_t) pkt0 + idx * shq->elem_sz); + while (cswap(&pkt->flag, QFREE, QUSED) != QFREE); +#endif + return pkt; +} + +/* This is safe because 'flag' is at the same offset on both pkt and bulkpkt */ +#define am_ctl_getslot_bulkpkt_inner(shq, pkt0) ((am_pkt_bulk_t *) \ + am_ctl_getslot_pkt_inner(shq, (am_pkt_short_t *)(pkt0))) + +PSMI_ALWAYS_INLINE( +am_pkt_short_t * +am_ctl_getslot_pkt(ptl_t *ptl_gen, uint16_t shmidx, int is_reply)) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + volatile am_ctl_qhdr_t *shq; + am_pkt_short_t *pkt0; + if (!is_reply) { + shq = &(ptl->am_ep[shmidx].qdir.qreqH->shortq); + pkt0 = ptl->am_ep[shmidx].qdir.qreqFifoShort; + } else { + shq = &(ptl->am_ep[shmidx].qdir.qrepH->shortq); + pkt0 = ptl->am_ep[shmidx].qdir.qrepFifoShort; + } + return am_ctl_getslot_pkt_inner(shq, pkt0); +} + +PSMI_ALWAYS_INLINE( +am_pkt_bulk_t * +am_ctl_getslot_long(ptl_t *ptl_gen, uint16_t shmidx, int is_reply)) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + volatile am_ctl_qhdr_t *shq; + am_pkt_bulk_t *pkt0; + if (!is_reply) { + shq = &(ptl->am_ep[shmidx].qdir.qreqH->longbulkq); + pkt0 = ptl->am_ep[shmidx].qdir.qreqFifoLong; + } else { + shq = &(ptl->am_ep[shmidx].qdir.qrepH->longbulkq); + pkt0 = ptl->am_ep[shmidx].qdir.qrepFifoLong; + } + return am_ctl_getslot_bulkpkt_inner(shq, pkt0); +} + +psmi_handlertab_t psmi_allhandlers[] = { + {0} + , + {amsh_conn_handler} + , + {psmi_am_mq_handler} + , + {psmi_am_mq_handler_data} + , + {psmi_am_mq_handler_rtsmatch} + , + {psmi_am_mq_handler_rtsdone} + , + {psmi_am_handler} +}; + +PSMI_ALWAYS_INLINE(void advance_head(volatile am_ctl_qshort_cache_t *hdr)) +{ + QMARKFREE(hdr->head); + hdr->head++; + if (hdr->head == hdr->end) + hdr->head = hdr->base; +} + +#define AMSH_ZERO_POLLS_BEFORE_YIELD 64 +#define AMSH_POLLS_BEFORE_PSM_POLL 16 + +/* XXX this can be made faster. Instead of checking the flag of the head, keep + * a cached copy of the integer value of the tail and compare it against the + * previous one we saw. 
+ */ +PSMI_ALWAYS_INLINE( +psm2_error_t +amsh_poll_internal_inner(ptl_t *ptl_gen, int replyonly, + int is_internal)) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + psm2_error_t err = PSM2_OK_NO_PROGRESS; + /* poll replies */ + if (!QISEMPTY(ptl->repH.head->flag)) { + do { + ips_sync_reads(); + process_packet(ptl_gen, (am_pkt_short_t *) ptl->repH.head, + 0); + advance_head(&ptl->repH); + err = PSM2_OK; + } while (!QISEMPTY(ptl->repH.head->flag)); + } + + if (!replyonly) { + /* Request queue not enable for 2.0, will be re-enabled to support long + * replies */ + if (!is_internal && ptl->psmi_am_reqq_fifo.first != NULL) { + psmi_am_reqq_drain(ptl_gen); + err = PSM2_OK; + } + if (!QISEMPTY(ptl->reqH.head->flag)) { + do { + ips_sync_reads(); + process_packet(ptl_gen, + (am_pkt_short_t *) ptl->reqH. + head, 1); + advance_head(&ptl->reqH); + err = PSM2_OK; + } while (!QISEMPTY(ptl->reqH.head->flag)); + } + } + + if (is_internal) { + if (err == PSM2_OK) /* some progress, no yields */ + ptl->zero_polls = 0; + else if (++ptl->zero_polls == AMSH_ZERO_POLLS_BEFORE_YIELD) { + /* no progress for AMSH_ZERO_POLLS_BEFORE_YIELD */ + sched_yield(); + ptl->zero_polls = 0; + } + + if (++ptl->amsh_only_polls == AMSH_POLLS_BEFORE_PSM_POLL) { + psmi_poll_internal(ptl->ep, 0); + ptl->amsh_only_polls = 0; + } + } + return err; /* if we actually did something */ +} + +/* non-inlined version */ +static +psm2_error_t +amsh_poll_internal(ptl_t *ptl, int replyonly) +{ + return amsh_poll_internal_inner(ptl, replyonly, 1); +} + +#ifdef PSM_PROFILE +#define AMSH_POLL_UNTIL(ptl, isreply, cond) \ + do { \ + PSMI_PROFILE_BLOCK(); \ + while (!(cond)) { \ + PSMI_PROFILE_REBLOCK( \ + amsh_poll_internal(ptl, isreply) == \ + PSM2_OK_NO_PROGRESS); \ + } \ + PSMI_PROFILE_UNBLOCK(); \ + } while (0) +#else +#define AMSH_POLL_UNTIL(ptl, isreply, cond) \ + do { \ + while (!(cond)) { \ + amsh_poll_internal(ptl, isreply); \ + } \ + } while (0) +#endif + +static psm2_error_t amsh_poll(ptl_t *ptl, int replyonly) +{ + return amsh_poll_internal_inner(ptl, replyonly, 0); +} + +PSMI_ALWAYS_INLINE( +void +am_send_pkt_short(ptl_t *ptl, uint32_t destidx, uint32_t returnidx, + uint32_t bulkidx, uint16_t fmt, uint16_t nargs, + uint16_t handleridx, psm2_amarg_t *args, + const void *src, uint32_t len, int isreply)) +{ + int i; + volatile am_pkt_short_t *pkt; + int copy_nargs; + + AMSH_POLL_UNTIL(ptl, isreply, + (pkt = + am_ctl_getslot_pkt(ptl, destidx, isreply)) != NULL); + + /* got a free pkt... fill it in */ + pkt->bulkidx = bulkidx; + pkt->shmidx = returnidx; + pkt->type = fmt; + pkt->nargs = nargs; + pkt->handleridx = handleridx; + + /* Limit the number of args copied here to NSHORT_ARGS. Additional args + are carried in the bulkpkt. */ + copy_nargs = nargs; + if (copy_nargs > NSHORT_ARGS) { + copy_nargs = NSHORT_ARGS; + } + + for (i = 0; i < copy_nargs; i++) + pkt->args[i] = args[i]; + + if (fmt == AMFMT_SHORT_INLINE) + mq_copy_tiny((uint32_t *) &pkt->args[nargs], (uint32_t *) src, + len); + + _HFI_VDBG("pkt=%p fmt=%d bulkidx=%d,flag=%d,nargs=%d," + "buf=%p,len=%d,hidx=%d,value=%d\n", pkt, (int)fmt, bulkidx, + pkt->flag, pkt->nargs, src, (int)len, (int)handleridx, + src != NULL ? 
*((uint32_t *) src) : 0); + QMARKREADY(pkt); +} + +#define amsh_shm_copy_short psmi_mq_mtucpy +#define amsh_shm_copy_long psmi_mq_mtucpy + +PSMI_ALWAYS_INLINE( +int +psmi_amsh_generic_inner(uint32_t amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, void *dst, int flags)) +{ +#ifdef PSM_DEBUG + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; +#endif + uint16_t type; + uint32_t bulkidx; + uint16_t hidx = (uint16_t) handler; + int destidx = ((am_epaddr_t *) epaddr)->shmidx; + int returnidx = ((am_epaddr_t *) epaddr)->return_shmidx; + int is_reply = AM_IS_REPLY(amtype); + volatile am_pkt_bulk_t *bulkpkt; + + _HFI_VDBG("%s epaddr=%s, shmidx=%d, type=%d\n", + is_reply ? "reply" : "request", + psmi_epaddr_get_name(epaddr->epid), + ((am_epaddr_t *) epaddr)->shmidx, amtype); + psmi_assert(epaddr != ptl->epaddr); + + switch (amtype) { + case AMREQUEST_SHORT: + case AMREPLY_SHORT: + if (len + (nargs << 3) <= (NSHORT_ARGS << 3)) { + /* Payload fits in args packet */ + type = AMFMT_SHORT_INLINE; + bulkidx = len; + } else { + int i; + + psmi_assert(len < amsh_qelemsz.qreqFifoLong); + psmi_assert(src != NULL || nargs > NSHORT_ARGS); + type = AMFMT_SHORT; + + AMSH_POLL_UNTIL(ptl_gen, is_reply, + (bulkpkt = + am_ctl_getslot_long(ptl_gen, destidx, + is_reply)) != + NULL); + + bulkidx = bulkpkt->idx; + bulkpkt->len = len; + _HFI_VDBG("bulkpkt %p flag is %d from idx %d\n", + bulkpkt, bulkpkt->flag, destidx); + + for (i = 0; i < nargs - NSHORT_ARGS; i++) { + bulkpkt->args[i] = args[i + NSHORT_ARGS]; + } + + amsh_shm_copy_short((void *)bulkpkt->payload, src, + (uint32_t) len); + QMARKREADY(bulkpkt); + } + am_send_pkt_short(ptl_gen, destidx, returnidx, bulkidx, type, + nargs, hidx, args, src, len, is_reply); + break; + + case AMREQUEST_LONG: + case AMREPLY_LONG: + { + uint32_t bytes_left = len; + uint8_t *src_this = (uint8_t *) src; + uint8_t *dst_this = (uint8_t *) dst; + uint32_t bytes_this; + + type = AMFMT_LONG; + + _HFI_VDBG("[long][%s] src=%p,dest=%p,len=%d,hidx=%d\n", + is_reply ? 
"rep" : "req", src, dst, + (uint32_t) len, hidx); + while (bytes_left) { + bytes_this = min(bytes_left, AMLONG_MTU); + AMSH_POLL_UNTIL(ptl_gen, is_reply, + (bulkpkt = + am_ctl_getslot_long(ptl_gen, + destidx, + is_reply)) + != NULL); + bytes_left -= bytes_this; + if (bytes_left == 0) + type = AMFMT_LONG_END; + bulkidx = bulkpkt->idx; + amsh_shm_copy_long((void *)bulkpkt->payload, + src_this, bytes_this); + + bulkpkt->dest = (uintptr_t) dst; + bulkpkt->dest_off = + (uint32_t) ((uintptr_t) dst_this - + (uintptr_t) dst); + bulkpkt->len = bytes_this; + QMARKREADY(bulkpkt); + am_send_pkt_short(ptl_gen, destidx, returnidx, + bulkidx, type, nargs, hidx, + args, NULL, 0, is_reply); + src_this += bytes_this; + dst_this += bytes_this; + } + break; + } + default: + break; + } + return 1; +} + +/* A generic version that's not inlined */ +int +psmi_amsh_generic(uint32_t amtype, ptl_t *ptl, psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, void *dst, int flags) +{ + return psmi_amsh_generic_inner(amtype, ptl, epaddr, handler, args, + nargs, src, len, dst, flags); +} + +int +psmi_amsh_short_request(ptl_t *ptl, psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, int flags) +{ + return psmi_amsh_generic_inner(AMREQUEST_SHORT, ptl, epaddr, handler, + args, nargs, src, len, NULL, flags); +} + +int +psmi_amsh_long_request(ptl_t *ptl, psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, void *dest, int flags) +{ + return psmi_amsh_generic_inner(AMREQUEST_LONG, ptl, epaddr, handler, + args, nargs, src, len, dest, flags); +} + +void +psmi_amsh_short_reply(amsh_am_token_t *tok, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, int flags) +{ + psmi_amsh_generic_inner(AMREPLY_SHORT, tok->ptl, tok->tok.epaddr_incoming, + handler, args, nargs, src, len, NULL, flags); + return; +} + +void +psmi_amsh_long_reply(amsh_am_token_t *tok, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, void *dest, int flags) +{ + psmi_amsh_generic_inner(AMREPLY_LONG, tok->ptl, tok->tok.epaddr_incoming, + handler, args, nargs, src, len, dest, flags); + return; +} + +void psmi_am_reqq_init(ptl_t *ptl_gen) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + ptl->psmi_am_reqq_fifo.first = NULL; + ptl->psmi_am_reqq_fifo.lastp = &ptl->psmi_am_reqq_fifo.first; +} + +psm2_error_t psmi_am_reqq_drain(ptl_t *ptl_gen) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + am_reqq_t *reqn = ptl->psmi_am_reqq_fifo.first; + am_reqq_t *req; + psm2_error_t err = PSM2_OK_NO_PROGRESS; + + /* We're going to process the entire list, and running the generic handler + * below can cause other requests to be enqueued in the queue that we're + * processing. 
*/ + ptl->psmi_am_reqq_fifo.first = NULL; + ptl->psmi_am_reqq_fifo.lastp = &ptl->psmi_am_reqq_fifo.first; + + while ((req = reqn) != NULL) { + err = PSM2_OK; + reqn = req->next; + _HFI_VDBG + ("push of reqq=%p epaddr=%s localreq=%p remotereq=%p\n", + req, psmi_epaddr_get_hostname(req->epaddr->epid), + (void *)(uintptr_t) req->args[1].u64w0, + (void *)(uintptr_t) req->args[0].u64w0); + psmi_amsh_generic(req->amtype, req->ptl, req->epaddr, + req->handler, req->args, req->nargs, req->src, + req->len, req->dest, req->amflags); + if (req->flags & AM_FLAG_SRC_TEMP) + psmi_free(req->src); + psmi_free(req); + } + return err; +} + +void +psmi_am_reqq_add(int amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, void *dest, int amflags) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + int i; + int flags = 0; + am_reqq_t *nreq = + (am_reqq_t *) psmi_malloc(ptl->ep, UNDEFINED, sizeof(am_reqq_t)); + psmi_assert_always(nreq != NULL); + _HFI_VDBG("alloc of reqq=%p, to epaddr=%s, ptr=%p, len=%d, " + "localreq=%p, remotereq=%p\n", nreq, + psmi_epaddr_get_hostname(epaddr->epid), dest, + (int)len, (void *)(uintptr_t) args[1].u64w0, + (void *)(uintptr_t) args[0].u64w0); + + psmi_assert(nargs <= 8); + nreq->next = NULL; + nreq->amtype = amtype; + nreq->ptl = ptl_gen; + nreq->epaddr = epaddr; + nreq->handler = handler; + for (i = 0; i < nargs; i++) + nreq->args[i] = args[i]; + nreq->nargs = nargs; + if (AM_IS_LONG(amtype) && src != NULL && + len > 0 && !(amflags & AM_FLAG_SRC_ASYNC)) { + abort(); + flags |= AM_FLAG_SRC_TEMP; + nreq->src = psmi_malloc(ptl->ep, UNDEFINED, len); + psmi_assert_always(nreq->src != NULL); /* XXX mem */ + amsh_shm_copy_short(nreq->src, src, len); + } else + nreq->src = src; + nreq->len = len; + nreq->dest = dest; + nreq->amflags = amflags; + nreq->flags = flags; + + nreq->next = NULL; + *(ptl->psmi_am_reqq_fifo.lastp) = nreq; + ptl->psmi_am_reqq_fifo.lastp = &nreq->next; +} + +static +void process_packet(ptl_t *ptl_gen, am_pkt_short_t *pkt, int isreq) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + amsh_am_token_t tok; + psmi_handler_fn_t fn; + psm2_amarg_t *args = pkt->args; + uint16_t shmidx = pkt->shmidx; + int nargs = pkt->nargs; + + tok.tok.epaddr_incoming = ((shmidx != (uint16_t)-1) ? ptl->am_ep[shmidx].epaddr : 0); + tok.ptl = ptl_gen; + tok.mq = ptl->ep->mq; + tok.shmidx = shmidx; + + uint16_t hidx = (uint16_t) pkt->handleridx; + uint32_t bulkidx = pkt->bulkidx; + uintptr_t bulkptr; + am_pkt_bulk_t *bulkpkt; + + fn = (psmi_handler_fn_t) psmi_allhandlers[hidx].fn; + psmi_assert(fn != NULL); + psmi_assert((uintptr_t) pkt > ptl->self_nodeinfo->amsh_shmbase); + + if (pkt->type == AMFMT_SHORT_INLINE) { + _HFI_VDBG + ("%s inline flag=%d nargs=%d from_idx=%d pkt=%p hidx=%d\n", + isreq ? "request" : "reply", pkt->flag, nargs, shmidx, pkt, + hidx); + + fn(&tok, args, nargs, pkt->length > 0 ? + (void *)&args[nargs] : NULL, pkt->length); + } else { + int isend = 0; + switch (pkt->type) { + case AMFMT_LONG_END: + isend = 1; + /* fall through */ + case AMFMT_LONG: + case AMFMT_SHORT: + if (isreq) { + bulkptr = + (uintptr_t) ptl->self_nodeinfo->qdir. + qreqFifoLong; + bulkptr += bulkidx * amsh_qelemsz.qreqFifoLong; + } else { + bulkptr = + (uintptr_t) ptl->self_nodeinfo->qdir. 
+ qrepFifoLong; + bulkptr += bulkidx * amsh_qelemsz.qrepFifoLong; + } + break; + default: + bulkptr = 0; + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unknown/unhandled packet type 0x%x", + pkt->type); + return; + } + + bulkpkt = (am_pkt_bulk_t *) bulkptr; + _HFI_VDBG("ep=%p mq=%p type=%d bulkidx=%d flag=%d/%d nargs=%d " + "from_idx=%d pkt=%p/%p hidx=%d\n", + ptl->ep, ptl->ep->mq, pkt->type, bulkidx, pkt->flag, + bulkpkt->flag, nargs, shmidx, pkt, bulkpkt, hidx); + psmi_assert(bulkpkt->flag == QREADY); + + if (nargs > NSHORT_ARGS || isend == 1) { + /* Either there are more args in the bulkpkt, or this is the last + packet of a long payload. In either case, copy the args. */ + int i; + args = + alloca((NSHORT_ARGS + + NBULK_ARGS) * sizeof(psm2_amarg_t)); + + for (i = 0; i < NSHORT_ARGS; i++) { + args[i] = pkt->args[i]; + } + + for (; i < nargs; i++) { + args[i] = bulkpkt->args[i - NSHORT_ARGS]; + } + } + + if (pkt->type == AMFMT_SHORT) { + fn(&tok, args, nargs, + (void *)bulkpkt->payload, bulkpkt->len); + QMARKFREE(bulkpkt); + } else { + amsh_shm_copy_long((void *)(bulkpkt->dest + + bulkpkt->dest_off), + bulkpkt->payload, bulkpkt->len); + + /* If this is the last packet, copy args before running the + * handler */ + if (isend) { + void *dest = (void *)bulkpkt->dest; + size_t len = + (size_t) (bulkpkt->dest_off + bulkpkt->len); + QMARKFREE(bulkpkt); + fn(&tok, args, nargs, dest, len); + } else + QMARKFREE(bulkpkt); + } + } + return; +} + +static +psm2_error_t +amsh_mq_rndv(ptl_t *ptl, psm2_mq_t mq, psm2_mq_req_t req, + psm2_epaddr_t epaddr, psm2_mq_tag_t *tag, const void *buf, + uint32_t len) +{ + psm2_amarg_t args[5]; + psm2_error_t err = PSM2_OK; + + args[0].u32w0 = MQ_MSG_LONGRTS; + args[0].u32w1 = len; + args[1].u32w1 = tag->tag[0]; + args[1].u32w0 = tag->tag[1]; + args[2].u32w1 = tag->tag[2]; + args[3].u64w0 = (uint64_t) (uintptr_t) req; + args[4].u64w0 = (uint64_t) (uintptr_t) buf; + + psmi_assert(req != NULL); + req->type = MQE_TYPE_SEND; + req->req_data.buf = (void *)buf; + req->req_data.buf_len = len; + req->req_data.send_msglen = len; + req->send_msgoff = 0; + +#ifdef PSM_CUDA + /* If the send buffer is on gpu, we create a cuda IPC + * handle and send it as payload in the RTS */ + if (req->is_buf_gpu_mem) { + CUdeviceptr buf_base_ptr; + PSMI_CUDA_CALL(cuMemGetAddressRange, &buf_base_ptr, NULL, (CUdeviceptr)buf); + + /* Offset in GPU buffer from which we copy data, we have to + * send it separetly because this offset is lost + * when cuIpcGetMemHandle is called */ + req->cuda_ipc_offset = buf - (void*)buf_base_ptr; + args[2].u32w0 = (uint32_t)req->cuda_ipc_offset; + + PSMI_CUDA_CALL(cuIpcGetMemHandle, + &req->cuda_ipc_handle, + (CUdeviceptr) buf); + if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { + psmi_am_reqq_add(AMREQUEST_SHORT, ptl, + epaddr, mq_handler_hidx, + args, 5, (void*)&req->cuda_ipc_handle, + sizeof(CUipcMemHandle), NULL, 0); + } else { + psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx, + args, 5, (void*)&req->cuda_ipc_handle, + sizeof(CUipcMemHandle), 0); + } + req->cuda_ipc_handle_attached = 1; + } else +#endif + if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { + psmi_am_reqq_add(AMREQUEST_SHORT, ptl, epaddr, mq_handler_hidx, + args, 5, NULL, 0, NULL, 0); + } else { + psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx, + args, 5, NULL, 0, 0); + } + + mq->stats.tx_num++; + mq->stats.tx_shm_num++; + mq->stats.tx_rndv_num++; + mq->stats.tx_rndv_bytes += len; + + return err; +} + +PSMI_ALWAYS_INLINE( +psm2_error_t 
+amsh_mq_send_inner_eager(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, + psm2_amarg_t *args, uint32_t flags_user, uint32_t flags_internal, + psm2_mq_tag_t *tag, const void *ubuf, uint32_t len)) +{ + uint32_t bytes_left = len; + uint32_t bytes_this = 0; + + psm2_handler_t handler = mq_handler_hidx; + + args[1].u32w1 = tag->tag[0]; + args[1].u32w0 = tag->tag[1]; + args[2].u32w1 = tag->tag[2]; + args[2].u32w0 = 0; + + if (!flags_user && len <= AMLONG_MTU) { + if (len <= 32) + args[0].u32w0 = MQ_MSG_TINY; + else + args[0].u32w0 = MQ_MSG_SHORT; + } else { + args[0].u32w0 = MQ_MSG_EAGER; + args[0].u32w1 = len; + } + + do { + args[2].u32w0 += bytes_this; + bytes_this = min(bytes_left, AMLONG_MTU); + + /* Assume that shared-memory active messages are delivered in order */ + if (flags_internal & PSMI_REQ_FLAG_FASTPATH) { + psmi_am_reqq_add(AMREQUEST_SHORT, epaddr->ptlctl->ptl, + epaddr, handler, args, 3, (void *)ubuf, + bytes_this, NULL, 0); + } else { + psmi_amsh_short_request(epaddr->ptlctl->ptl, epaddr, + handler, args, 3, ubuf, bytes_this, 0); + } + + ubuf = (void *)((uint8_t *)ubuf + bytes_this); + bytes_left -= bytes_this; + handler = mq_handler_data_hidx; + } while(bytes_left); + + /* All eager async sends are always "all done" */ + if (req != NULL) { + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + } + + mq->stats.tx_num++; + mq->stats.tx_shm_num++; + mq->stats.tx_eager_num++; + mq->stats.tx_eager_bytes += len; + + return PSM2_OK; +} + +/* + * All shared am mq sends, req can be NULL + */ +PSMI_ALWAYS_INLINE( +psm2_error_t +amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, + uint32_t flags_user, uint32_t flags_internal, psm2_mq_tag_t *tag, + const void *ubuf, uint32_t len)) +{ + psm2_amarg_t args[3]; + psm2_error_t err = PSM2_OK; + int is_blocking = (req == NULL); + +#ifdef PSM_CUDA + int gpu_mem = 0; + int ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & gpu_p2p_supported; + + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)) { + gpu_mem = 1; + + /* All sends from a gpu buffer use the rendezvous protocol if p2p is supported */ + if (ep_supports_p2p) { + goto do_rendezvous; + } + + /* + * Use eager messages if P2P is unsupported between endpoints. + * Potentially use rendezvous with blocking requests only. + */ + if (!is_blocking) + goto do_eager; + } +#endif + if (flags_user & PSM2_MQ_FLAG_SENDSYNC) + goto do_rendezvous; + + if (len <= mq->shm_thresh_rv) +#ifdef PSM_CUDA +do_eager: +#endif + return amsh_mq_send_inner_eager(mq, req, epaddr, args, flags_user, + flags_internal, tag, ubuf, len); +do_rendezvous: + if (is_blocking) { + req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); + if_pf(req == NULL) + return PSM2_NO_MEMORY; + req->req_data.send_msglen = len; + req->req_data.tag = *tag; + + /* Since SEND command is blocking, this request is + * entirely internal and we will not be exposed to user. + * Setting as internal so it will not be added to + * mq->completed_q */ + req->flags_internal |= (flags_internal | PSMI_REQ_FLAG_IS_INTERNAL); + } +#ifdef PSM_CUDA + void *host_buf = NULL; + + req->is_buf_gpu_mem = gpu_mem; + if (req->is_buf_gpu_mem) { + psmi_cuda_set_attr_sync_memops(ubuf); + + /* Use host buffer for blocking requests if GPU P2P is + * unsupported between endpoints. + * This will be only used with blocking requests. 
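+ * The bounce buffer is freed below once the blocking wait completes.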
*/ + if (!ep_supports_p2p) { + host_buf = psmi_malloc(epaddr->ptlctl->ep, UNDEFINED, len); + PSMI_CUDA_CALL(cuMemcpyDtoH, host_buf, (CUdeviceptr)ubuf, len); + + /* Reset is_buf_gpu_mem since host buffer is being used + * instead of one from GPU. */ + ubuf = host_buf; + req->is_buf_gpu_mem = 0; + } + } +#endif + + err = amsh_mq_rndv(epaddr->ptlctl->ptl, mq, req, epaddr, tag, ubuf, len); + + if (err == PSM2_OK && is_blocking) { /* wait... */ + err = psmi_mq_wait_internal(&req); + } + +#ifdef PSM_CUDA + if (err == PSM2_OK && host_buf) + psmi_free(host_buf); +#endif + + return err; +} + +static +psm2_error_t +amsh_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags_user, + uint32_t flags_internal, psm2_mq_tag_t *tag, const void *ubuf, + uint32_t len, void *context, psm2_mq_req_t *req_o) +{ + psm2_mq_req_t req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); + if_pf(req == NULL) + return PSM2_NO_MEMORY; + + req->req_data.send_msglen = len; + req->req_data.tag = *tag; + req->req_data.context = context; + req->flags_user = flags_user; + req->flags_internal = flags_internal; + _HFI_VDBG("[ishrt][%s->%s][n=0][b=%p][l=%d][t=%08x.%08x.%08x]\n", + psmi_epaddr_get_name(epaddr->ptlctl->ep->epid), + psmi_epaddr_get_name(epaddr->epid), ubuf, len, + tag->tag[0], tag->tag[1], tag->tag[2]); + + amsh_mq_send_inner(mq, req, epaddr, flags_user, flags_internal, tag, ubuf, len); + + *req_o = req; + return PSM2_OK; +} + +static +psm2_error_t +amsh_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags, + psm2_mq_tag_t *tag, const void *ubuf, uint32_t len) +{ + _HFI_VDBG("[shrt][%s->%s][n=0][b=%p][l=%d][t=%08x.%08x.%08x]\n", + psmi_epaddr_get_name(epaddr->ptlctl->ep->epid), + psmi_epaddr_get_name(epaddr->epid), ubuf, len, + tag->tag[0], tag->tag[1], tag->tag[2]); + + amsh_mq_send_inner(mq, NULL, epaddr, flags, PSMI_REQ_FLAG_NORMAL, tag, ubuf, len); + + return PSM2_OK; +} + +/* kassist-related handling */ +int psmi_epaddr_pid(psm2_epaddr_t epaddr) +{ + uint16_t shmidx = ((am_epaddr_t *) epaddr)->shmidx; + return ((struct ptl_am *)(epaddr->ptlctl->ptl))->am_ep[shmidx].pid; +} +#if _HFI_DEBUGGING +static +const char *psmi_kassist_getmode(int mode) +{ + switch (mode) { + case PSMI_KASSIST_OFF: + return "kassist off"; + case PSMI_KASSIST_CMA_GET: + return "cma get"; + case PSMI_KASSIST_CMA_PUT: + return "cma put"; + default: + return "unknown"; + } +} +#endif + +static +int psmi_get_kassist_mode() +{ + /* Cuda PSM2 supports only KASSIST_CMA_GET */ + int mode = PSMI_KASSIST_CMA_GET; +#ifndef PSM_CUDA + union psmi_envvar_val env_kassist; + + if (!psmi_getenv("PSM3_KASSIST_MODE", + "PSM Shared memory kernel assist mode " + "(cma-put, cma-get, none)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) + PSMI_KASSIST_MODE_DEFAULT_STRING, &env_kassist)) { + char *s = env_kassist.e_str; + if (strcasecmp(s, "cma-put") == 0) + mode = PSMI_KASSIST_CMA_PUT; + else if (strcasecmp(s, "cma-get") == 0) + mode = PSMI_KASSIST_CMA_GET; + else + mode = PSMI_KASSIST_OFF; + } +#endif + return mode; +} + +/* Connection handling for shared memory AM. + * + * arg0 => conn_op, result (PSM error type) + * arg1 => epid (always) + * arg2 => pid, version. + * arg3 => pointer to error for replies. 
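+ * arg0.u16w1 carries the peer's slot in the sender's shared-memory directory; the peer saves it as return_shmidx and echoes it in pkt->shmidx so the sender can map incoming packets back to an epaddr.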
+ */ +static +void +amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, + size_t len) +{ + int op = args[0].u16w0; + int phase = args[0].u32w1; + psm2_epid_t epid = args[1].u64w0; + int16_t return_shmidx = args[0].u16w1; + psm2_error_t err = (psm2_error_t) args[2].u32w1; + psm2_error_t *perr = (psm2_error_t *) (uintptr_t) args[3].u64w0; + unsigned int pid; + unsigned int gpuid; + int force_remap = 0; + + psm2_epaddr_t epaddr; + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + uint16_t shmidx = tok->shmidx; + int is_valid; + struct ptl_am *ptl = (struct ptl_am *)(tok->ptl); + ptl_t *ptl_gen = tok->ptl; + int cstate; + + /* We do this because it's an assumption below */ + psmi_assert_always(buf == NULL && len == 0); + read_extra_ep_data(args[2].u32w0, &pid, &gpuid); + + _HFI_VDBG("Conn op=%d, phase=%d, epid=%llx, err=%d\n", + op, phase, (unsigned long long)epid, err); + + switch (op) { + case PSMI_AM_CONN_REQ: + _HFI_VDBG("Connect from %d:%d\n", + (int)psm2_epid_nid(epid), (int)psm2_epid_context(epid)); + epaddr = psmi_epid_lookup(ptl->ep, epid); + if (epaddr && ((am_epaddr_t *) epaddr)->pid != pid) { + /* If old pid is unknown consider new pid the correct one */ + if (((am_epaddr_t *) epaddr)->pid == AMSH_PID_UNKNOWN) { + ((am_epaddr_t *) epaddr)->pid = pid; + ((am_epaddr_t *) epaddr)->gpuid = gpuid; + } else { + psmi_epid_remove(ptl->ep, epid); + epaddr = NULL; + force_remap = 1; + } + } + + if (shmidx == (uint16_t)-1) { + /* incoming packet will never be from our shmidx slot 0 + thus the other process doesn't know our return info. + attach_to will lookup or create the proper shmidx */ + if ((err = psmi_shm_map_remote(ptl_gen, epid, &shmidx, force_remap))) { + psmi_handle_error(PSMI_EP_NORETURN, err, + "Fatal error in " + "connecting to shm segment"); + } + am_update_directory(&ptl->am_ep[shmidx]); + tok->shmidx = shmidx; + } + + if (epaddr == NULL) { + uintptr_t args_segoff = + (uintptr_t) args - ptl->self_nodeinfo->amsh_shmbase; + if ((err = amsh_epaddr_add(ptl_gen, epid, shmidx, &epaddr))) + /* Unfortunately, no way out of here yet */ + psmi_handle_error(PSMI_EP_NORETURN, err, + "Fatal error " + "in connecting to shm segment"); + args = + (psm2_amarg_t *) (ptl->self_nodeinfo->amsh_shmbase + + args_segoff); + + ((am_epaddr_t *) epaddr)->pid = pid; + ((am_epaddr_t *) epaddr)->gpuid = gpuid; + } + + /* Rewrite args */ + ptl->connect_incoming++; + args[0].u16w0 = PSMI_AM_CONN_REP; + /* and return our shmidx for the connecting process */ + args[0].u16w1 = shmidx; + args[1].u64w0 = (psm2_epid_t) ptl->epid; + args[2].u32w0 = create_extra_ep_data(); + args[2].u32w1 = PSM2_OK; + ((am_epaddr_t *) epaddr)->cstate_incoming = + AMSH_CSTATE_INCOMING_ESTABLISHED; + ((am_epaddr_t *) epaddr)->return_shmidx = return_shmidx; + tok->tok.epaddr_incoming = epaddr; /* adjust token */ + psmi_amsh_short_reply(tok, amsh_conn_handler_hidx, + args, narg, NULL, 0, 0); + break; + + case PSMI_AM_CONN_REP: + if (ptl->connect_phase != phase) { + _HFI_VDBG("Out of phase connect reply\n"); + return; + } + epaddr = ptl->am_ep[shmidx].epaddr; + /* check if a race has occurred on shm-file reuse. + * if so, don't transition to the next state. + * the next call to connreq_poll() will restart the + * connection. 
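+ * (The check compares the pid recorded in our directory entry against the pid stored in the peer's mapped nodeinfo block.)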
+ */ + if (ptl->am_ep[shmidx].pid != + ((struct am_ctl_nodeinfo *) ptl->am_ep[shmidx].amsh_shmbase)->pid) + break; + + *perr = err; + ((am_epaddr_t *) epaddr)->cstate_outgoing + = AMSH_CSTATE_OUTGOING_REPLIED; + ((am_epaddr_t *) epaddr)->return_shmidx = return_shmidx; + ptl->connect_outgoing++; + _HFI_VDBG("CCC epaddr=%s connected to ptl=%p\n", + psmi_epaddr_get_name(epaddr->epid), ptl); + break; + + case PSMI_AM_DISC_REQ: + epaddr = psmi_epid_lookup(ptl->ep, epid); + if (!epaddr) { + _HFI_VDBG("Dropping disconnect request from an epid that we are not connected to\n"); + return; + } + args[0].u16w0 = PSMI_AM_DISC_REP; + args[2].u32w1 = PSM2_OK; + ((am_epaddr_t *) epaddr)->cstate_incoming = + AMSH_CSTATE_INCOMING_DISC_REQUESTED; + ptl->connect_incoming--; + /* Before sending the reply, make sure the process + * is still connected */ + + if (ptl->am_ep[shmidx].epid != epaddr->epid) + is_valid = 0; + else + is_valid = 1; + + if (is_valid) { + psmi_amsh_short_reply(tok, amsh_conn_handler_hidx, + args, narg, NULL, 0, 0); + /** + * Only munmap if we have nothing more to + * communicate with the other node, i.e. we are + * already disconnected with the other node + * or have sent a disconnect request. + */ + cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; + if (cstate == AMSH_CSTATE_OUTGOING_DISC_REQUESTED) { + err = psmi_do_unmap(ptl->am_ep[shmidx].amsh_shmbase); + psmi_epid_remove(epaddr->ptlctl->ep, epaddr->epid); + } + } + break; + + case PSMI_AM_DISC_REP: + if (ptl->connect_phase != phase) { + _HFI_VDBG("Out of phase disconnect reply\n"); + return; + } + *perr = err; + epaddr = tok->tok.epaddr_incoming; + ((am_epaddr_t *) epaddr)->cstate_outgoing = + AMSH_CSTATE_OUTGOING_DISC_REPLIED; + ptl->connect_outgoing--; + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unknown/unhandled connect handler op=%d", + op); + break; + } + return; +} + +static +size_t amsh_sizeof(void) +{ + return sizeof(struct ptl_am); +} + +/* Fill in AM capabilities parameters */ +psm2_error_t +psmi_amsh_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters) +{ + if (parameters == NULL) { + return PSM2_PARAM_ERR; + } + + parameters->max_handlers = PSMI_AM_NUM_HANDLERS; + parameters->max_nargs = PSMI_AM_MAX_ARGS; + parameters->max_request_short = AMLONG_MTU; + parameters->max_reply_short = AMLONG_MTU; + + return PSM2_OK; +} + +/** + * @param ep PSM Endpoint, guaranteed to have initialized epaddr and epid. 
+ * @param ptl Pointer to caller-allocated space for PTL (fill in) + * @param ctl Pointer to caller-allocated space for PTL-control + * structure (fill in) + */ +static +psm2_error_t +amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + psm2_error_t err = PSM2_OK; + + /* Preconditions */ + psmi_assert_always(ep != NULL); + psmi_assert_always(ep->epaddr != NULL); + psmi_assert_always(ep->epid != 0); + + ptl->ep = ep; /* back pointer */ + ptl->epid = ep->epid; /* cache epid */ + ptl->epaddr = ep->epaddr; /* cache a copy */ + ptl->ctl = ctl; + ptl->zero_polls = 0; + + ptl->connect_phase = 0; + ptl->connect_incoming = 0; + ptl->connect_outgoing = 0; + + memset(&ptl->amsh_empty_shortpkt, 0, sizeof(ptl->amsh_empty_shortpkt)); + memset(&ptl->psmi_am_reqq_fifo, 0, sizeof(ptl->psmi_am_reqq_fifo)); + + ptl->max_ep_idx = -1; + ptl->am_ep_size = AMSH_DIRBLOCK_SIZE; + + ptl->am_ep = (struct am_ctl_nodeinfo *) + psmi_memalign(ptl->ep, PER_PEER_ENDPOINT, 64, + ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo)); + + if (ptl->am_ep == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + memset(ptl->am_ep, 0, ptl->am_ep_size * sizeof(struct am_ctl_nodeinfo)); + + if ((err = amsh_init_segment(ptl_gen))) + goto fail; + + ptl->self_nodeinfo->psm_verno = PSMI_VERNO; + if (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF) { + if (cma_available()) { + ptl->self_nodeinfo->amsh_features |= + AMSH_HAVE_CMA; + psmi_shm_mq_rv_thresh = + PSMI_MQ_RV_THRESH_CMA; + } else { + ptl->psmi_kassist_mode = + PSMI_KASSIST_OFF; + psmi_shm_mq_rv_thresh = + PSMI_MQ_RV_THRESH_NO_KASSIST; + } + } else { + psmi_shm_mq_rv_thresh = + PSMI_MQ_RV_THRESH_NO_KASSIST; + } + ptl->self_nodeinfo->pid = getpid(); + ptl->self_nodeinfo->epid = ep->epid; + ptl->self_nodeinfo->epaddr = ep->epaddr; + + ips_mb(); + ptl->self_nodeinfo->is_init = 1; + + psmi_am_reqq_init(ptl_gen); + memset(ctl, 0, sizeof(*ctl)); + + /* Fill in the control structure */ + ctl->ep = ep; + ctl->ptl = ptl_gen; + ctl->ep_poll = amsh_poll; + ctl->ep_connect = amsh_ep_connect; + ctl->ep_disconnect = amsh_ep_disconnect; + + ctl->mq_send = amsh_mq_send; + ctl->mq_isend = amsh_mq_isend; + + ctl->am_get_parameters = psmi_amsh_am_get_parameters; + ctl->am_short_request = psmi_amsh_am_short_request; + ctl->am_short_reply = psmi_amsh_am_short_reply; + +#if 0 // unused code, specific to QLogic MPI + /* No stats in shm (for now...) 
*/ + ctl->epaddr_stats_num = NULL; + ctl->epaddr_stats_init = NULL; + ctl->epaddr_stats_get = NULL; +#endif +#ifdef PSM_CUDA + union psmi_envvar_val env_memcache_enabled; + psmi_getenv("PSM3_CUDA_MEMCACHE_ENABLED", + "PSM cuda ipc memhandle cache enabled (default is enabled)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) + 1, &env_memcache_enabled); + if (PSMI_IS_CUDA_ENABLED && env_memcache_enabled.e_uint) { + union psmi_envvar_val env_memcache_size; + psmi_getenv("PSM3_CUDA_MEMCACHE_SIZE", + "Size of the cuda ipc memhandle cache ", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) + CUDA_MEMHANDLE_CACHE_SIZE, &env_memcache_size); + if ((err = am_cuda_memhandle_cache_init(env_memcache_size.e_uint) != PSM2_OK)) + goto fail; + } +#endif +fail: + return err; +} + +static psm2_error_t amsh_fini(ptl_t *ptl_gen, int force, uint64_t timeout_ns) +{ + struct ptl_am *ptl = (struct ptl_am *)ptl_gen; + struct psmi_eptab_iterator itor; + psm2_epaddr_t epaddr; + psm2_error_t err = PSM2_OK; + psm2_error_t err_seg; + uint64_t t_start = get_cycles(); + int i = 0; + + /* Close whatever has been left open -- this will be factored out for 2.1 */ + if (ptl->connect_outgoing > 0) { + int num_disc = 0; + int *mask; + psm2_error_t *errs; + psm2_epaddr_t *epaddr_array; + + psmi_epid_itor_init(&itor, ptl->ep); + while ((epaddr = psmi_epid_itor_next(&itor))) { + if (epaddr->ptlctl->ptl != ptl_gen) + continue; + if (((am_epaddr_t *) epaddr)->cstate_outgoing == + AMSH_CSTATE_OUTGOING_ESTABLISHED) + num_disc++; + } + psmi_epid_itor_fini(&itor); + + mask = + (int *)psmi_calloc(ptl->ep, UNDEFINED, num_disc, + sizeof(int)); + errs = (psm2_error_t *) + psmi_calloc(ptl->ep, UNDEFINED, num_disc, + sizeof(psm2_error_t)); + epaddr_array = (psm2_epaddr_t *) + psmi_calloc(ptl->ep, UNDEFINED, num_disc, + sizeof(psm2_epaddr_t)); + + if (errs == NULL || epaddr_array == NULL || mask == NULL) { + if (epaddr_array) + psmi_free(epaddr_array); + if (errs) + psmi_free(errs); + if (mask) + psmi_free(mask); + err = PSM2_NO_MEMORY; + goto fail; + } + psmi_epid_itor_init(&itor, ptl->ep); + while ((epaddr = psmi_epid_itor_next(&itor))) { + if (epaddr->ptlctl->ptl == ptl_gen) { + if (((am_epaddr_t *) epaddr)->cstate_outgoing == + AMSH_CSTATE_OUTGOING_ESTABLISHED) { + mask[i] = 1; + epaddr_array[i] = epaddr; + i++; + } + } + } + psmi_epid_itor_fini(&itor); + psmi_assert(i == num_disc && num_disc > 0); + err = amsh_ep_disconnect(ptl_gen, force, num_disc, epaddr_array, + mask, errs, timeout_ns); + psmi_free(mask); + psmi_free(errs); + psmi_free(epaddr_array); + } + + if (ptl->connect_incoming > 0 || ptl->connect_outgoing > 0) { + while (ptl->connect_incoming > 0 || ptl->connect_outgoing > 0) { + if (!psmi_cycles_left(t_start, timeout_ns)) { + err = PSM2_TIMEOUT; + _HFI_VDBG("CCC timed out with from=%d,to=%d\n", + ptl->connect_incoming, ptl->connect_outgoing); + break; + } + psmi_poll_internal(ptl->ep, 1); + } + } else + _HFI_VDBG("CCC complete disconnect from=%d,to=%d\n", + ptl->connect_incoming, ptl->connect_outgoing); + + if ((err_seg = psmi_shm_detach(ptl_gen))) { + err = err_seg; + goto fail; + } + + /* This prevents poll calls between now and the point where the endpoint is + * deallocated to reference memory that disappeared */ + ptl->repH.head = &ptl->amsh_empty_shortpkt; + ptl->reqH.head = &ptl->amsh_empty_shortpkt; + + if (ptl->am_ep) + psmi_free(ptl->am_ep); + +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED) + am_cuda_memhandle_cache_map_fini(); +#endif + return PSM2_OK; +fail: + 
return err; + +} + +static +psm2_error_t +amsh_setopt(const void *component_obj, int optname, + const void *optval, uint64_t optlen) +{ + /* No options for AM PTL at the moment */ + return psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Unknown AM ptl option %u.", optname); +} + +static +psm2_error_t +amsh_getopt(const void *component_obj, int optname, + void *optval, uint64_t *optlen) +{ + /* No options for AM PTL at the moment */ + return psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Unknown AM ptl option %u.", optname); +} + +/* Only symbol we expose out of here */ +struct ptl_ctl_init +psmi_ptl_amsh = { + amsh_sizeof, amsh_init, amsh_fini, amsh_setopt, amsh_getopt +}; diff --git a/prov/psm3/psm3/ptl_am/cmarw.h b/prov/psm3/psm3/ptl_am/cmarw.h new file mode 100644 index 00000000000..0317ed422b1 --- /dev/null +++ b/prov/psm3/psm3/ptl_am/cmarw.h @@ -0,0 +1,73 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include +#include + +/* + * read from remote process pid + */ +int64_t cma_get(pid_t pid, const void *src, void *dst, int64_t n); + +/* + * write to remote process pid + */ +int64_t cma_put(const void *src, pid_t pid, void *dst, int64_t n); + +/* + * Test if CMA is available by trying a no-op call. + * Returns 1 if CMA is present, 0 if not. 
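+ * (The probe is a zero-length process_vm_readv() on the calling pid: it returns 0 when the syscall exists and fails with ENOSYS otherwise.)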
+ */ +int cma_available(void); diff --git a/prov/psm3/psm3/ptl_am/cmarwu.c b/prov/psm3/psm3/ptl_am/cmarwu.c new file mode 100644 index 00000000000..9c859da699e --- /dev/null +++ b/prov/psm3/psm3/ptl_am/cmarwu.c @@ -0,0 +1,207 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include +#include +#include +#include +#include +#include + +#include "psm_user.h" +#include "cmarw.h" + +/* An iovec looks like this: + * struct iovec { + * void *iov_base; // Starting address + * size_t iov_len; // Number of bytes to transfer + * }; + */ + +#if 0 +#define __NR_process_vm_readv 310 +#define __NR_process_vm_writev 311 + +#define process_vm_readv(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \ + syscall(__NR_process_vm_readv, \ + pid, local_iov, liovcnt, remote_iov, riovcnt, flags) + +#define process_vm_writev(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \ + syscall(__NR_process_vm_writev, \ + pid, local_iov, liovcnt, remote_iov, riovcnt, flags) +#endif + +/*CMA syscall wrappers were added in glibc 2.15. For anything older than that, + we need to define our own wrappers. Apparently older (and maybe newer?) + (2.12 from RHEL6.3 definitely has this bug) glibcs only pass up to 5 + arguments via the generic syscall() function. These CMA functions, however, + have 6 arguments. 
So for now, we hack our way around it by generating ASM + code for doing a syscall directly. +*/ + +#if defined(__GLIBC__) && ((__GLIBC__ == 2) && (__GLIBC_MINOR__ < 15)) + +#ifdef __x86_64__ + +#define __NR_process_vm_readv 310 +#define __NR_process_vm_writev 311 + +static inline ssize_t __x86_64_syscall6(int syscall, + pid_t pid, + const struct iovec *local_iov, + unsigned long liovcnt, + const struct iovec *remote_iov, + unsigned long riovcnt, + unsigned long flags) +{ + /*GCC inline ASM is annoying -- can't specify all the x86_64 registers + directly, so declare register-specific variables and use them. */ + register int64_t rax asm("rax") = syscall; + register int64_t rdi asm("rdi") = pid; + register int64_t rsi asm("rsi") = (intptr_t) local_iov; + register int64_t rdx asm("rdx") = liovcnt; + register int64_t r10 asm("r10") = (intptr_t) remote_iov; + register int64_t r8 asm("r8") = riovcnt; + register int64_t r9 asm("r9") = flags; + + asm volatile ("syscall\n" : "=a" (rax) + : "r"(rax), "r"(rdi), "r"(rsi), "r"(rdx), "r"(r10), + "r"(r8), "r"(r9) + : "%rcx", "%r11", "cc", "memory"); + return rax; +} + +#define process_vm_readv(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \ + __x86_64_syscall6(__NR_process_vm_readv, \ + pid, local_iov, liovcnt, remote_iov, riovcnt, flags) + +#define process_vm_writev(pid, local_iov, liovcnt, remote_iov, riovcnt, flags) \ + __x86_64_syscall6(__NR_process_vm_writev, \ + pid, local_iov, liovcnt, remote_iov, riovcnt, flags) + +#else /* ndef __x86_64__ */ +#error "Can't compile CMA support for this architecture." +#endif /* __x86_64__ */ +#endif /* __GLIBC__ < 2.15 */ + +int64_t cma_get(pid_t pid, const void *src, void *dst, int64_t n) +{ + int64_t nr, sum; + struct iovec local = { + .iov_base = dst, + .iov_len = n + }; + struct iovec remote = { + .iov_base = (void *)src, + .iov_len = n + }; + nr = sum = 0; + while (sum != n) { + nr = process_vm_readv(pid, &local, 1, &remote, 1, 0); + if (nr == -1) { + return -1; + } + sum += nr; + local.iov_base = (void *)((uint8_t *)local.iov_base + nr); + local.iov_len -= nr; + remote.iov_base = (void *)((uint8_t *)remote.iov_base + nr); + remote.iov_len -= nr; + } + return sum; +} + +int64_t cma_put(const void *src, pid_t pid, void *dst, int64_t n) +{ + int64_t nr, sum; + struct iovec local = { + .iov_base = (void *)src, + .iov_len = n + }; + struct iovec remote = { + .iov_base = dst, + .iov_len = n + }; + + nr = sum = 0; + while (sum != n) { + nr = process_vm_writev(pid, &local, 1, &remote, 1, 0); + if (nr == -1) { + return -1; + } + sum += nr; + local.iov_base = (void *)((uint8_t *)local.iov_base + nr); + local.iov_len -= nr; + remote.iov_base = (void *)((uint8_t *)remote.iov_base + nr); + remote.iov_len -= nr; + } + return sum; +} + +/* Test if CMA is available by trying a no-op call. */ +int cma_available(void) +{ + + /* Make a no-op CMA syscall. If CMA is present, 0 (bytes transferred) + * should be returned. If not present, expect -ENOSYS. */ + + int ret = process_vm_readv(getpid(), NULL, 0, NULL, 0, 0); + + if (ret == 0) { + /* CMA is available! */ + return 1; + } + + return 0; +} diff --git a/prov/psm3/psm3/ptl_am/psm_am_internal.h b/prov/psm3/psm3/ptl_am/psm_am_internal.h new file mode 100644 index 00000000000..c4c08a5f007 --- /dev/null +++ b/prov/psm3/psm3/ptl_am/psm_am_internal.h @@ -0,0 +1,448 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + +#ifndef PSMI_AM_H +#define PSMI_AM_H + +#include "am_config.h" +#include "../psm_am_internal.h" + +#define AMSH_DIRBLOCK_SIZE 128 + +typedef +struct am_epaddr { + /* + * epaddr must be the first field to have the same address as this + * structure + */ + struct psm2_epaddr epaddr; + + uint16_t shmidx; + uint16_t return_shmidx; + + uint32_t cstate_outgoing:3; + uint32_t cstate_incoming:3; + uint32_t pid:22; + /* + * Device number of GPU used by given EP, only used when CUDA is + * enabled. There is no gain from #ifdefing it out, since it does not + * use any extra space. + */ + uint32_t gpuid:4; +} am_epaddr_t; + +/* Up to NSHORT_ARGS are supported via am_pkt_short_t; the remaining + arguments are passed using space in am_pkt_bulk_t. One additional argument + is added for passing the internal ptl_am handler index. */ +#define NSHORT_ARGS 6 +#define NBULK_ARGS (PSMI_AM_MAX_ARGS - NSHORT_ARGS + 1) + +typedef +struct amsh_am_token { + struct psmi_am_token tok; + + ptl_t *ptl; /**> What PTL was it received on */ + psm2_mq_t mq; /**> What matched queue is this for ? 
*/ + uint16_t shmidx; /**> what shmidx sent this */ +} amsh_am_token_t; + +typedef void (*psmi_handler_fn_t) (void *token, psm2_amarg_t *args, int nargs, + void *src, size_t len); + +typedef struct psmi_handlertab { + psmi_handler_fn_t fn; +} psmi_handlertab_t; + +#define PSMI_AM_CONN_REQ 1 +#define PSMI_AM_CONN_REP 2 +#define PSMI_AM_DISC_REQ 3 +#define PSMI_AM_DISC_REP 4 + +#define PSMI_KASSIST_OFF 0x0 +#define PSMI_KASSIST_CMA_GET 0x1 +#define PSMI_KASSIST_CMA_PUT 0x2 + +#define PSMI_KASSIST_CMA 0x3 +#define PSMI_KASSIST_GET 0x1 +#define PSMI_KASSIST_PUT 0x2 +#define PSMI_KASSIST_MASK 0x3 + +int psmi_epaddr_pid(psm2_epaddr_t epaddr); + +/* + * Eventually, we will allow users to register handlers as "don't reply", which + * may save on some of the buffering requirements + */ +#define PSMI_HANDLER_NEEDS_REPLY(handler) 1 +#define PSMI_VALIDATE_REPLY(handler) assert(PSMI_HANDLER_NEEDS_REPLY(handler)) + +int psmi_amsh_poll(ptl_t *ptl, int replyonly); + +/* Shared memory AM, forward decls */ +int +psmi_amsh_short_request(ptl_t *ptl, psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, int flags); + +void +psmi_amsh_short_reply(amsh_am_token_t *tok, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, int flags); + +int +psmi_amsh_long_request(ptl_t *ptl, psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, void *dest, int flags); + +void +psmi_amsh_long_reply(amsh_am_token_t *tok, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + const void *src, size_t len, void *dest, int flags); + +void psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, + size_t len); + +void psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, + size_t len); +void psmi_am_mq_handler_data(void *toki, psm2_amarg_t *args, int narg, + void *buf, size_t len); +void psmi_am_mq_handler_complete(void *toki, psm2_amarg_t *args, int narg, + void *buf, size_t len); +void psmi_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg, + void *buf, size_t len); +void psmi_am_mq_handler_rtsdone(void *toki, psm2_amarg_t *args, int narg, + void *buf, size_t len); +void psmi_am_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, + size_t len); + +/* AM over shared memory (forward decls) */ +psm2_error_t +psmi_amsh_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters); + +psm2_error_t +psmi_amsh_am_short_request(psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, + void *completion_ctxt); + +psm2_error_t +psmi_amsh_am_short_reply(psm2_am_token_t tok, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, + void *completion_ctxt); + +#define amsh_conn_handler_hidx 1 +#define mq_handler_hidx 2 +#define mq_handler_data_hidx 3 +#define mq_handler_rtsmatch_hidx 4 +#define mq_handler_rtsdone_hidx 5 +#define am_handler_hidx 6 + +#define AMREQUEST_SHORT 0 +#define AMREQUEST_LONG 1 +#define AMREPLY_SHORT 2 +#define AMREPLY_LONG 3 +#define AM_IS_REPLY(x) ((x)&0x2) +#define AM_IS_REQUEST(x) (!AM_IS_REPLY(x)) +#define AM_IS_LONG(x) ((x)&0x1) +#define AM_IS_SHORT(x) (!AM_IS_LONG(x)) + +#define AM_FLAG_SRC_ASYNC 0x1 +#define AM_FLAG_SRC_TEMP 0x2 + +/* + * Request Fifo. 
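+ * Requests queued by psmi_am_reqq_add() are held here and sent later by psmi_am_reqq_drain() from the polling path.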
+ */ +typedef +struct am_reqq { + struct am_reqq *next; + + ptl_t *ptl; + psm2_epaddr_t epaddr; + int amtype; + psm2_handler_t handler; + psm2_amarg_t args[8]; + int nargs; + uint32_t len; + void *src; + void *dest; + int amflags; + int flags; +} am_reqq_t; + +struct am_reqq_fifo_t { + am_reqq_t *first; + am_reqq_t **lastp; +}; + +psm2_error_t psmi_am_reqq_drain(ptl_t *ptl); +void psmi_am_reqq_add(int amtype, ptl_t *ptl, psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, void *dest, int flags); + +/* + * Shared memory Active Messages, implementation derived from + * Lumetta, Mainwaring, Culler. Multi-Protocol Active Messages on a Cluster of + * SMP's. Supercomputing 1997. + * + * We support multiple endpoints in shared memory, but we only support one + * shared memory context with up to AMSH_MAX_LOCAL_PROCS local endpoints. Some + * structures are endpoint specific (as denoted * with amsh_ep_) and others are + * specific to the single shared memory context * (amsh_ global variables). + * + * Each endpoint maintains a shared request block and a shared reply block. + * Each block is composed of queues for small, medium and large messages. + */ + +#define QFREE 0 +#define QUSED 1 +#define QREADY 2 +#define QREADYMED 3 +#define QREADYLONG 4 + +#define QISEMPTY(flag) (flag < QREADY) +#if defined(__x86_64__) || defined(__i386__) +# define _QMARK_FLAG_FENCE() asm volatile("" : : : "memory") /* compilerfence */ +#else +# error No _QMARK_FLAG_FENCE() defined for this platform +#endif + +#define _QMARK_FLAG(pkt_ptr, _flag) \ + do { \ + _QMARK_FLAG_FENCE(); \ + (pkt_ptr)->flag = (_flag); \ + } while (0) + +#define QMARKFREE(pkt_ptr) _QMARK_FLAG(pkt_ptr, QFREE) +#define QMARKREADY(pkt_ptr) _QMARK_FLAG(pkt_ptr, QREADY) +#define QMARKUSED(pkt_ptr) _QMARK_FLAG(pkt_ptr, QUSED) + +#define AMFMT_SYSTEM 1 +#define AMFMT_SHORT_INLINE 2 +#define AMFMT_SHORT 3 +#define AMFMT_LONG 4 +#define AMFMT_LONG_END 5 + +#define AMSH_CMASK_NONE 0 +#define AMSH_CMASK_PREREQ 1 +#define AMSH_CMASK_POSTREQ 2 +#define AMSH_CMASK_DONE 3 + +#define AMSH_CSTATE_OUTGOING_NONE 1 +#define AMSH_CSTATE_OUTGOING_REPLIED 2 +#define AMSH_CSTATE_OUTGOING_ESTABLISHED 3 +#define AMSH_CSTATE_OUTGOING_DISC_REPLIED 4 +#define AMSH_CSTATE_OUTGOING_DISC_REQUESTED 5 + +#define AMSH_CSTATE_INCOMING_NONE 1 +#define AMSH_CSTATE_INCOMING_DISC_REQUESTED 4 +#define AMSH_CSTATE_INCOMING_ESTABLISHED 5 + +#define AMSH_PID_UNKNOWN 0 + +/********************************** + * Shared memory packet formats + **********************************/ +typedef +struct am_pkt_short { + uint32_t flag; /**> Packet state */ + union { + uint32_t bulkidx; /**> index in bulk packet queue */ + uint32_t length; /**> length when no bulkidx used */ + }; + uint16_t shmidx; /**> index in shared segment */ + uint16_t type; + uint16_t nargs; + uint16_t handleridx; + + psm2_amarg_t args[NSHORT_ARGS]; /* AM arguments */ + + /* We eventually will expose up to 8 arguments, but this isn't implemented + * For now. 
>6 args will probably require a medium instead of a short */ +} __attribute__ ((aligned(64))) +am_pkt_short_t; +PSMI_STRICT_SIZE_DECL(am_pkt_short_t, 64); + +typedef struct am_pkt_bulk { + uint32_t flag; + uint32_t idx; + uintptr_t dest; /* Destination pointer in "longs" */ + uint32_t dest_off; /* Destination pointer offset */ + uint32_t len; /* Destination length within offset */ + psm2_amarg_t args[NBULK_ARGS]; /* Additional "spillover" for >6 args */ + uint8_t payload[0]; +} am_pkt_bulk_t; +/* No strict size decl, used for mediums and longs */ + +/**************************************************** + * Shared memory header and block control structures + ***************************************************/ + +/* Each pkt queue has the same header format, although the queue + * consumers don't use the 'head' index in the same manner. */ +typedef struct am_ctl_qhdr { + uint32_t head; /* Touched only by 1 consumer */ + uint8_t _pad0[64 - 4]; + + pthread_spinlock_t lock; + uint32_t tail; /* XXX candidate for fetch-and-incr */ + uint32_t elem_cnt; + uint32_t elem_sz; + uint8_t _pad1[64 - 3 * 4 - sizeof(pthread_spinlock_t)]; +} am_ctl_qhdr_t; +PSMI_STRICT_SIZE_DECL(am_ctl_qhdr_t, 128); + +/* Each process has a reply qhdr and a request qhdr */ +typedef struct am_ctl_blockhdr { + volatile am_ctl_qhdr_t shortq; + volatile am_ctl_qhdr_t longbulkq; +} am_ctl_blockhdr_t; +PSMI_STRICT_SIZE_DECL(am_ctl_blockhdr_t, 128 * 2); + +/* We cache the "shorts" because that's what we poll on in the critical path. + * We take care to always update these pointers whenever the segment is remapped. + */ +typedef struct am_ctl_qshort_cache { + volatile am_pkt_short_t *base; + volatile am_pkt_short_t *head; + volatile am_pkt_short_t *end; +} am_ctl_qshort_cache_t; + +/****************************************** + * Shared segment local directory (global) + ****************************************** + * + * Each process keeps a directory for where request and reply structures are + * located at its peers. This directory must be re-initialized every time the + * shared segment moves in the VM, and the segment moves every time we remap() + * for additional memory. + */ +struct amsh_qdirectory { + am_ctl_blockhdr_t *qreqH; + am_pkt_short_t *qreqFifoShort; + am_pkt_bulk_t *qreqFifoLong; + + am_ctl_blockhdr_t *qrepH; + am_pkt_short_t *qrepFifoShort; + am_pkt_bulk_t *qrepFifoLong; +} __attribute__ ((aligned(64))); + +/****************************************** + * Shared fifo element counts and sizes + ****************************************** + * These values are context-wide, they can only be set early on and can't be * + * modified at runtime. All endpoints are expected to use the same values. + */ +typedef +struct amsh_qinfo { + int qreqFifoShort; + int qreqFifoLong; + + int qrepFifoShort; + int qrepFifoLong; +} amsh_qinfo_t; + +/****************************************** + * Per-endpoint structures (ep-local) + ****************************************** + * Each endpoint keeps its own information as to where it resides in the + * directory, and maintains its own cached copies of where the short header + * resides in shared memory. + * + * This structure is carefully arranged to optimize cache locality and + * performance. Do not modify without careful and thorough analysis. 
+ */ +struct am_ctl_nodeinfo { + uint16_t psm_verno; + volatile uint16_t is_init; + volatile pid_t pid; + psm2_epid_t epid; + psm2_epaddr_t epaddr; + uintptr_t amsh_shmbase; + amsh_qinfo_t amsh_qsizes; + uint32_t amsh_features; + struct amsh_qdirectory qdir; +} __attribute__((aligned(64))); + +struct ptl_am { + psm2_ep_t ep; + psm2_epid_t epid; + psm2_epaddr_t epaddr; + ptl_ctl_t *ctl; + + int connect_phase; + int connect_outgoing; + int connect_incoming; + + int zero_polls; + int amsh_only_polls; + int max_ep_idx, am_ep_size; + int psmi_kassist_mode; + char *amsh_keyname; + + /* These three items carefully picked to fit in one cache line. */ + am_ctl_qshort_cache_t reqH; + am_ctl_qshort_cache_t repH; + struct am_reqq_fifo_t psmi_am_reqq_fifo; + + am_pkt_short_t amsh_empty_shortpkt; + + struct am_ctl_nodeinfo *self_nodeinfo; + struct am_ctl_nodeinfo *am_ep; +} __attribute__((aligned(64))); + +#endif diff --git a/prov/psm3/psm3/ptl_am/ptl.c b/prov/psm3/psm3/ptl_am/ptl.c new file mode 100644 index 00000000000..2e42c1b4363 --- /dev/null +++ b/prov/psm3/psm3/ptl_am/ptl.c @@ -0,0 +1,378 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ + +#include "psm_user.h" +#include "psm_mq_internal.h" +#include "psm_am_internal.h" +#include "cmarw.h" + +#ifdef PSM_CUDA +#include "am_cuda_memhandle_cache.h" +#endif + +/** + * Callback function when a receive request is matched with the + * tag obtained from the RTS packet. + */ +static +psm2_error_t +ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, + amsh_am_token_t *tok) +{ + psm2_amarg_t args[5]; + psm2_epaddr_t epaddr = req->rts_peer; + struct ptl_am *ptl = (struct ptl_am *)(epaddr->ptlctl->ptl); + int cma_succeed = 0; + int pid = 0, cuda_ipc_send_completion = 0; + + PSM2_LOG_MSG("entering."); + psmi_assert((tok != NULL && was_posted) + || (tok == NULL && !was_posted)); + + _HFI_VDBG("[shm][rndv][recv] req=%p dest=%p len=%d tok=%p\n", + req, req->req_data.buf, req->req_data.recv_msglen, tok); +#ifdef PSM_CUDA + if (req->cuda_ipc_handle_attached) { + + CUdeviceptr cuda_ipc_dev_ptr = am_cuda_memhandle_acquire(req->rts_sbuf - req->cuda_ipc_offset, + (CUipcMemHandle*)&req->cuda_ipc_handle, + req->req_data.recv_msglen, + req->rts_peer->epid); + cuda_ipc_dev_ptr = cuda_ipc_dev_ptr + req->cuda_ipc_offset; + /* cuMemcpy into the receive side buffer + * based on its location */ + if (req->is_buf_gpu_mem) { + PSMI_CUDA_CALL(cuMemcpyDtoD, (CUdeviceptr)req->req_data.buf, cuda_ipc_dev_ptr, + req->req_data.recv_msglen); + PSMI_CUDA_CALL(cuEventRecord, req->cuda_ipc_event, 0); + PSMI_CUDA_CALL(cuEventSynchronize, req->cuda_ipc_event); + } else + PSMI_CUDA_CALL(cuMemcpyDtoH, req->req_data.buf, cuda_ipc_dev_ptr, + req->req_data.recv_msglen); + cuda_ipc_send_completion = 1; + am_cuda_memhandle_release(cuda_ipc_dev_ptr - req->cuda_ipc_offset); + req->cuda_ipc_handle_attached = 0; + goto send_cts; + } +#endif + + if ((ptl->psmi_kassist_mode & PSMI_KASSIST_GET) + && req->req_data.recv_msglen > 0 + && (pid = psmi_epaddr_pid(epaddr))) { +#ifdef PSM_CUDA + /* If the buffer on the send side is on the host, + * we alloc a bounce buffer, use kassist and then + * do a cuMemcpy if the buffer on the recv side + * resides on the GPU + */ + if (req->is_buf_gpu_mem) { + void* cuda_ipc_bounce_buf = psmi_malloc(PSMI_EP_NONE, UNDEFINED, req->req_data.recv_msglen); + size_t nbytes = cma_get(pid, (void *)req->rts_sbuf, + cuda_ipc_bounce_buf, req->req_data.recv_msglen); + psmi_assert_always(nbytes == req->req_data.recv_msglen); + PSMI_CUDA_CALL(cuMemcpyHtoD, (CUdeviceptr)req->req_data.buf, cuda_ipc_bounce_buf, + req->req_data.recv_msglen); + /* Cuda library has recent optimizations where they do + * not guarantee synchronus nature for Host to Device + * copies for msg sizes less than 64k. The event record + * and synchronize calls are to guarentee completion. + */ + PSMI_CUDA_CALL(cuEventRecord, req->cuda_ipc_event, 0); + PSMI_CUDA_CALL(cuEventSynchronize, req->cuda_ipc_event); + psmi_free(cuda_ipc_bounce_buf); + } else { + /* cma can be done in handler context or not. */ + size_t nbytes = cma_get(pid, (void *)req->rts_sbuf, + req->req_data.buf, req->req_data.recv_msglen); + psmi_assert_always(nbytes == req->req_data.recv_msglen); + } +#else + /* cma can be done in handler context or not. */ + size_t nbytes = cma_get(pid, (void *)req->rts_sbuf, + req->req_data.buf, req->req_data.recv_msglen); + if (nbytes == -1) { + ptl->psmi_kassist_mode = PSMI_KASSIST_OFF; + _HFI_ERROR("Reading from remote process' memory failed. 
Disabling CMA support\n"); + } + else { + psmi_assert_always(nbytes == req->req_data.recv_msglen); + cma_succeed = 1; + } + psmi_assert_always(nbytes == req->req_data.recv_msglen); +#endif + } + +#ifdef PSM_CUDA +send_cts: +#endif + args[0].u64w0 = (uint64_t) (uintptr_t) req->ptl_req_ptr; + args[1].u64w0 = (uint64_t) (uintptr_t) req; + args[2].u64w0 = (uint64_t) (uintptr_t) req->req_data.buf; + args[3].u32w0 = req->req_data.recv_msglen; + args[3].u32w1 = tok != NULL ? 1 : 0; + args[4].u32w0 = ptl->psmi_kassist_mode; // pass current kassist mode to the peer process + + if (tok != NULL) { + psmi_am_reqq_add(AMREQUEST_SHORT, tok->ptl, + tok->tok.epaddr_incoming, mq_handler_rtsmatch_hidx, + args, 5, NULL, 0, NULL, 0); + } else + psmi_amsh_short_request((struct ptl *)ptl, epaddr, mq_handler_rtsmatch_hidx, + args, 5, NULL, 0, 0); + + /* 0-byte completion or we used kassist */ + if (pid || cma_succeed || + req->req_data.recv_msglen == 0 || cuda_ipc_send_completion == 1) { + psmi_mq_handle_rts_complete(req); + } + PSM2_LOG_MSG("leaving."); + return PSM2_OK; +} + +static +psm2_error_t +ptl_handle_rtsmatch(psm2_mq_req_t req, int was_posted) +{ + /* was_posted == 0 allows us to assume that we're not running this callback + * within am handler context (i.e. we can poll) */ + psmi_assert(was_posted == 0); + return ptl_handle_rtsmatch_request(req, 0, NULL); +} + +void +psmi_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, + size_t len) +{ + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + psm2_mq_req_t req; + psm2_mq_tag_t tag; + int rc; + uint32_t opcode = args[0].u32w0; + uint32_t msglen = opcode <= MQ_MSG_SHORT ? len : args[0].u32w1; + + tag.tag[0] = args[1].u32w1; + tag.tag[1] = args[1].u32w0; + tag.tag[2] = args[2].u32w1; + psmi_assert(toki != NULL); + _HFI_VDBG("mq=%p opcode=%d, len=%d, msglen=%d\n", + tok->mq, opcode, (int)len, msglen); + + switch (opcode) { + case MQ_MSG_TINY: + case MQ_MSG_SHORT: + case MQ_MSG_EAGER: + rc = psmi_mq_handle_envelope(tok->mq, tok->tok.epaddr_incoming, + &tag, msglen, 0, buf, + (uint32_t) len, 1, opcode, &req); + + /* for eager matching */ + req->ptl_req_ptr = (void *)tok->tok.epaddr_incoming; + req->msg_seqnum = 0; /* using seqnum 0 */ + break; + default:{ + void *sreq = (void *)(uintptr_t) args[3].u64w0; + uintptr_t sbuf = (uintptr_t) args[4].u64w0; + psmi_assert(narg == 5); + psmi_assert_always(opcode == MQ_MSG_LONGRTS); + rc = psmi_mq_handle_rts(tok->mq, tok->tok.epaddr_incoming, + &tag, msglen, NULL, 0, 1, + ptl_handle_rtsmatch, &req); + + req->rts_peer = tok->tok.epaddr_incoming; + req->ptl_req_ptr = sreq; + req->rts_sbuf = sbuf; +#ifdef PSM_CUDA + /* Payload in RTS would mean an IPC handle has been + * sent. 
This would also mean the sender has to + * send from a GPU buffer + */ + if (buf && len > 0) { + req->cuda_ipc_handle = *((CUipcMemHandle*)buf); + req->cuda_ipc_handle_attached = 1; + req->cuda_ipc_offset = args[2].u32w0; + } +#endif + + if (rc == MQ_RET_MATCH_OK) /* we are in handler context, issue a reply */ + ptl_handle_rtsmatch_request(req, 1, tok); + /* else will be called later */ + break; + } + } + return; +} + +void +psmi_am_mq_handler_data(void *toki, psm2_amarg_t *args, int narg, void *buf, + size_t len) +{ + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + + psmi_assert(toki != NULL); + + psm2_epaddr_t epaddr = (psm2_epaddr_t) tok->tok.epaddr_incoming; + psm2_mq_req_t req = mq_eager_match(tok->mq, epaddr, 0); /* using seqnum 0 */ + psmi_assert_always(req != NULL); + psmi_mq_handle_data(tok->mq, req, args[2].u32w0, buf, len); + + return; +} + +/** + * Function to handle CTS on the sender. + */ +void +psmi_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg, void *buf, + size_t len) +{ + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + + psmi_assert(toki != NULL); + + ptl_t *ptl = tok->ptl; + psm2_mq_req_t sreq = (psm2_mq_req_t) (uintptr_t) args[0].u64w0; +#ifdef PSM_CUDA + /* If send side req has a cuda ipc handle attached then we can + * assume the data has been copied as soon as we get a CTS + */ + if (sreq->cuda_ipc_handle_attached) { + sreq->cuda_ipc_handle_attached = 0; + psmi_mq_handle_rts_complete(sreq); + return; + } +#endif + void *dest = (void *)(uintptr_t) args[2].u64w0; + uint32_t msglen = args[3].u32w0; + psm2_amarg_t rarg[1]; + + _HFI_VDBG("[rndv][send] req=%p dest_req=%p src=%p dest=%p len=%d\n", + sreq, (void *)(uintptr_t) args[1].u64w0, sreq->req_data.buf, dest, + msglen); + + if (msglen > 0) { + rarg[0].u64w0 = args[1].u64w0; /* rreq */ + int kassist_mode = ((struct ptl_am *)ptl)->psmi_kassist_mode; + int kassist_mode_peer = args[4].u32w0; + // In general, peer process(es) shall have the same kassist mode set, + // but due to dynamic CMA failure detection, we must align local and remote state, + // and make protocol to adopt to that potential change. + if (kassist_mode_peer == PSMI_KASSIST_OFF && (kassist_mode & PSMI_KASSIST_MASK)) { + ((struct ptl_am *)ptl)->psmi_kassist_mode = PSMI_KASSIST_OFF; + goto no_kassist; + } + + if (kassist_mode & PSMI_KASSIST_PUT) { + int pid = psmi_epaddr_pid(tok->tok.epaddr_incoming); + size_t nbytes = cma_put(sreq->req_data.buf, pid, dest, msglen); + if (nbytes == -1) { + _HFI_ERROR("Writing to remote process' memory failed. Disabling CMA support\n"); + ((struct ptl_am *)ptl)->psmi_kassist_mode = PSMI_KASSIST_OFF; + goto no_kassist; + } + + psmi_assert_always(nbytes == msglen); + + /* Send response that PUT is complete */ + psmi_amsh_short_reply(tok, mq_handler_rtsdone_hidx, + rarg, 1, NULL, 0, 0); + } else if (!(kassist_mode & PSMI_KASSIST_MASK)) { + /* Only transfer if kassist is off, i.e. neither GET nor PUT. 
*/ +no_kassist: + psmi_amsh_long_reply(tok, mq_handler_rtsdone_hidx, rarg, + 1, sreq->req_data.buf, msglen, dest, 0); + } + } + psmi_mq_handle_rts_complete(sreq); +} + +void +psmi_am_mq_handler_rtsdone(void *toki, psm2_amarg_t *args, int narg, void *buf, + size_t len) +{ + psm2_mq_req_t rreq = (psm2_mq_req_t) (uintptr_t) args[0].u64w0; + psmi_assert(narg == 1); + _HFI_VDBG("[rndv][recv] req=%p dest=%p len=%d\n", rreq, rreq->req_data.buf, + rreq->req_data.recv_msglen); + psmi_mq_handle_rts_complete(rreq); +} + +void +psmi_am_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, size_t len) +{ + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + struct psm2_ep_am_handle_entry *hentry; + + psmi_assert(toki != NULL); + + hentry = psm_am_get_handler_function(tok->mq->ep, + (psm2_handler_t) args[0].u32w0); + + /* Note a guard here for hentry != NULL is not needed because at + * initialization, a psmi_assert_always() assure the entry will be + * non-NULL. */ + + /* Invoke handler function. For AM we do not support break functionality */ + if (likely(hentry->version == PSM2_AM_HANDLER_V2)) { + psm2_am_handler_2_fn_t hfn2 = + (psm2_am_handler_2_fn_t)hentry->hfn; + hfn2(toki, args + 1, narg - 1, buf, len, hentry->hctx); + } else { + psm2_am_handler_fn_t hfn1 = + (psm2_am_handler_fn_t)hentry->hfn; + hfn1(toki, args + 1, narg - 1, buf, len); + } + + return; +} diff --git a/prov/psm3/psm3/ptl_am/ptl_fwd.h b/prov/psm3/psm3/ptl_am/ptl_fwd.h new file mode 100644 index 00000000000..1d0fec4073a --- /dev/null +++ b/prov/psm3/psm3/ptl_am/ptl_fwd.h @@ -0,0 +1,64 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + +#ifndef _PTL_FWD_AMSH_H +#define _PTL_FWD_AMSH_H + +/* Symbol in am ptl */ +extern struct ptl_ctl_init psmi_ptl_amsh; + +extern int psmi_shm_mq_rv_thresh; + +#endif diff --git a/prov/psm3/psm3/ptl_ips/ips_config.h b/prov/psm3/psm3/ptl_ips/ips_config.h new file mode 100644 index 00000000000..06b36451c20 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_config.h @@ -0,0 +1,124 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2018 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2018 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef PTL_IPS_IPS_CONFIG_H +#define PTL_IPS_IPS_CONFIG_H + +#include "psm_config.h" + +/* Allocate new epaddrs in chunks of 128 */ +#define PTL_EPADDR_ALLOC_CHUNK 128 + +/* Generate an expected header every 16 packets */ +#define PSM_DEFAULT_EXPECTED_HEADER 16 + +#define DF_OPP_LIBRARY "libopasadb.so.1.0.0" +#define DATA_VFABRIC_OFFSET 8 + +/* Send retransmission */ +#define IPS_PROTO_SPIO_RETRY_US_DEFAULT 2 /* in uS */ + +#define IPS_PROTO_ERRCHK_MS_MIN_DEFAULT 160 /* in millisecs */ +#define IPS_PROTO_ERRCHK_MS_MAX_DEFAULT 640 /* in millisecs */ +#define IPS_PROTO_ERRCHK_FACTOR_DEFAULT 2 +#define PSM_TID_TIMEOUT_DEFAULT "160:640:2" /* update from above params */ + +#ifdef PSM_FI + +/* Fault injection, becomes parameters to psmi_faultinj_getspec so + * a comma-delimited list of + * "spec_name", num, denom + * Where num/denom means fault num out of every denom. + * The defines set 'denum' and assume that num is set to 1 + * + * These values are all defaults, each is overridable via + * PSM3_FI_ in the environment (and yes, spec_name is in lowercase + * *in the environment* just to minimize it appearing in the wild). The format + * there is so the same thing except that one can set + * a specific seed to the random number generator. + */ +#define IPS_FAULTINJ_RECVLOST 5000 /* 1 every X pkts dropped at recv */ +#define IPS_FAULTINJ_SENDLOST 5000 /* 1 every X pkts dropped at send */ +#define IPS_FAULTINJ_RQ_LKEY 5000 /* 0 every X RQ WQE bad lkey */ +#define IPS_FAULTINJ_SQ_LKEY 5000 /* 0 every X SQ WQE bad lkey */ +#define IPS_FAULTINJ_RC_RDMA_LKEY 5000 /* 0 every X RC SQ RDMA bad lkey */ +#define IPS_FAULTINJ_RC_RDMA_RKEY 5000 /* 0 every X RC SQ RDMA bad rkey */ +#define IPS_FAULTINJ_RV_RDMA_LEN 5000 /* 0 every X RV SQ RDMA bad len */ +#define IPS_FAULTINJ_RV_RDMA_RKEY 5000 /* 1 every X RV SQ RDMA bad rkey */ +#define IPS_FAULTINJ_REG_MR 100 /* 1 every X reg_mr ENOMEM */ +#define IPS_FAULTINJ_NONPRI_REG_MR 50 /* 1 every X non-pri reg_mr ENOMEM */ +#define IPS_FAULTINJ_PRI_REG_MR 1000 /* 1 every X pri reg_mr ENOMEM */ + +#endif /* #ifdef PSM_FI */ + + + +/* rcv thread */ +/* All in milliseconds */ +#define RCVTHREAD_TO_MIN_FREQ 10 /* min of 10 polls per sec */ +#define RCVTHREAD_TO_MAX_FREQ 100 /* max of 100 polls per sec */ +#define RCVTHREAD_TO_SHIFT 1 + +/* ptl.c */ +#define PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS 250 + +/* ips_proto_recv.c */ +#define PSM_STRAY_WARN_INTERVAL_DEFAULT_SECS 30 + +/* + * Easy switch to (say) _HFI_INFO if debugging in the expected protocol is + * needed + */ +#define _HFI_EXP _HFI_VDBG + +#endif /* PTL_IPS_IPS_CONFIG_H */ diff --git a/prov/psm3/psm3/ptl_ips/ips_crc32.c b/prov/psm3/psm3/ptl_ips/ips_crc32.c new file mode 100644 index 00000000000..589f3278d3b --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_crc32.c @@ -0,0 +1,93 @@ +/* The code in this file was derived from crc32.c in zlib 1.2.3, and + modified from its original form to suit our requirements. The zlib + license and crc32.c copyright and credits are preserved below. */ + +/* zlib.h -- interface of the 'zlib' general purpose compression library + version 1.2.3, July 18th, 2005 + + Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. 
+ + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu + + The data format used by the zlib library is described by RFCs (Request for + Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt + (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format). +*/ + +/* crc32.c -- compute the CRC-32 of a data stream + * Copyright (C) 1995-2005 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + * + * Thanks to Rodney Brown for his contribution of faster + * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing + * tables for updating the shift register in one step with three exclusive-ors + * instead of four steps with four exclusive-ors. This results in about a + * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3. + */ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" + +/* Table of CRCs of all 8-bit messages. */ +static uint32_t crc_table[256]; + +/* Flag: has the table been computed? Initially false. */ +static int crc_table_computed; + +/* Make the table for a fast CRC. */ +static void make_crc_table(void) +{ + uint32_t c; + int n, k; + + for (n = 0; n < 256; n++) { + c = (uint32_t) n; + for (k = 0; k < 8; k++) { + if (c & 1) + c = 0xedb88320 ^ (c >> 1); + else + c = c >> 1; + } + crc_table[n] = c; + } + crc_table_computed = 1; +} + +/* Update a running CRC with the bytes buf[0..len-1]--the CRC + * should be initialized to all 1's, and the transmitted value + * is the 1's complement of the final running CRC (see the + * crc() routine below)). + */ + +uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc) +{ + uint32_t c = crc; + uint32_t n; + + if (!crc_table_computed) { + make_crc_table(); + } + for (n = 0; n < len; n++) { + c = crc_table[(c ^ data[n]) & 0xff] ^ (c >> 8); + } + return c; +} diff --git a/prov/psm3/psm3/ptl_ips/ips_epstate.c b/prov/psm3/psm3/ptl_ips/ips_epstate.c new file mode 100644 index 00000000000..12b80cfe9ec --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_epstate.c @@ -0,0 +1,174 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include "ips_epstate.h" + +/* The indexes are used to map a particular endpoint to a structure at the + * receiver. Although we take extra care to validate the identity of endpoints + * when packets are received, the communication index is at an offset selected + * by the endpoint that allocates the index. This narrows the window of two + * jobs communicated with the same set of indexes from getting crosstalk. + */ + +psm2_error_t +ips_epstate_init(struct ips_epstate *eps, const psmi_context_t *context) +{ + memset(eps, 0, sizeof(*eps)); + eps->context = context; + eps->eps_base_idx = ((ips_epstate_idx)get_cycles()) & + (IPS_EPSTATE_CONNIDX_MAX-1); + return PSM2_OK; +} + +psm2_error_t ips_epstate_fini(struct ips_epstate *eps) +{ + if (eps->eps_tab) + psmi_free(eps->eps_tab); + memset(eps, 0, sizeof(*eps)); + return PSM2_OK; +} + +/* + * Use this to debug issues involving the epstate table. + */ +void ips_epstate_dump(struct ips_epstate *eps) +{ + if (_HFI_DBG_ON) { + int i=0; + _HFI_DBG_ALWAYS("eps_base_idx = 0x%x, eps_tabsize = %d, " + "eps_tabsizeused = %d, eps_tab_nextidx = %d\n", + eps->eps_base_idx, eps->eps_tabsize, + eps->eps_tabsizeused, eps->eps_tab_nextidx); + for (i=0; i < eps->eps_tabsize; i++) { + _HFI_DBG_ALWAYS("%03d: ipsaddr = %p, cstate-o: %u, cstate-i: %u\n", i, + eps->eps_tab[i].ipsaddr, + eps->eps_tab[i].ipsaddr->cstate_outgoing, + eps->eps_tab[i].ipsaddr->cstate_incoming); + } + } +} + +/* + * Add ipsaddr with epid to the epstate table, return new index to caller in + * 'connidx'.
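The index arithmetic described above is easier to follow written out. The two helpers below are an illustrative sketch (hypothetical names, not part of the patch) of the mapping that ips_epstate_add()/ips_epstate_del() below and ips_epstate_lookup() in ips_epstate.h perform:

/* The per-process random eps_base_idx offsets the on-the-wire connidx, which
 * narrows the window for crosstalk between two jobs that happen to reuse the
 * same table slots. */
static inline ips_epstate_idx slot_to_connidx(const struct ips_epstate *eps,
					      ips_epstate_idx slot)
{
	return (slot - eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX - 1);
}

static inline ips_epstate_idx connidx_to_slot(const struct ips_epstate *eps,
					      ips_epstate_idx connidx)
{
	return (connidx + eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX - 1);
}

/* Round trip: connidx_to_slot(eps, slot_to_connidx(eps, j)) == j for any
 * slot j < IPS_EPSTATE_CONNIDX_MAX, since the mask size is a power of two. */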
+ */ +psm2_error_t +ips_epstate_add(struct ips_epstate *eps, struct ips_epaddr *ipsaddr, + ips_epstate_idx *connidx_o) +{ + int i, j; + ips_epstate_idx connidx; + + if (++eps->eps_tabsizeused > eps->eps_tabsize) { /* realloc */ + struct ips_epstate_entry *newtab; + eps->eps_tabsize += PTL_EPADDR_ALLOC_CHUNK; + newtab = (struct ips_epstate_entry *) + psmi_calloc(eps->context->ep, PER_PEER_ENDPOINT, + eps->eps_tabsize, + sizeof(struct ips_epstate_entry)); + if (newtab == NULL) + return PSM2_NO_MEMORY; + else if (eps->eps_tab) { /* NOT first alloc */ + for (i = 0; + i < eps->eps_tabsize - PTL_EPADDR_ALLOC_CHUNK; i++) + newtab[i] = eps->eps_tab[i]; /* deep copy */ + psmi_free(eps->eps_tab); + } + eps->eps_tab = newtab; + } + /* Find the next free hole. We can afford to do this since connect is not + * in the critical path */ + for (i = 0, j = eps->eps_tab_nextidx; i < eps->eps_tabsize; i++, j++) { + if (j == eps->eps_tabsize) + j = 0; + if (eps->eps_tab[j].ipsaddr == NULL) { + eps->eps_tab_nextidx = j + 1; + if (eps->eps_tab_nextidx == eps->eps_tabsize) + eps->eps_tab_nextidx = 0; + break; + } + } + psmi_assert_always(i != eps->eps_tabsize); + connidx = (j - eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1); + _HFI_VDBG("node %s gets connidx=%d (table idx %d)\n", + psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), connidx, + j); + eps->eps_tab[j].ipsaddr = ipsaddr; + if (j >= IPS_EPSTATE_CONNIDX_MAX) { + return psmi_handle_error(eps->context->ep, + PSM2_TOO_MANY_ENDPOINTS, + "Can't connect to more than %d non-local endpoints", + IPS_EPSTATE_CONNIDX_MAX); + } + *connidx_o = connidx; + return PSM2_OK; +} + +psm2_error_t ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx connidx) +{ + ips_epstate_idx idx; + /* actual table index */ + idx = (connidx + eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1); + psmi_assert_always(idx < eps->eps_tabsize); + _HFI_VDBG("connidx=%d, table_idx=%d\n", connidx, idx); + eps->eps_tab[idx].ipsaddr = NULL; + /* We may eventually want to release memory, but probably not */ + eps->eps_tabsizeused--; + return PSM2_OK; +} diff --git a/prov/psm3/psm3/ptl_ips/ips_epstate.h b/prov/psm3/psm3/ptl_ips/ips_epstate.h new file mode 100644 index 00000000000..b63c2ce9f4c --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_epstate.h @@ -0,0 +1,103 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_EPSTATE_H +#define _IPS_EPSTATE_H + +#include "psm_user.h" + +typedef uint32_t ips_epstate_idx; +#define IPS_EPSTATE_CONNIDX_MAX (1<<26) + +struct ips_epaddr; + +struct ips_epstate_entry { + struct ips_epaddr *ipsaddr; +}; + +struct ips_epstate { + const psmi_context_t *context; + ips_epstate_idx eps_base_idx; + int eps_tabsize; + int eps_tabsizeused; + int eps_tab_nextidx; + + struct ips_epstate_entry *eps_tab; +}; + +psm2_error_t ips_epstate_init(struct ips_epstate *eps, + const psmi_context_t *contextj); +psm2_error_t ips_epstate_fini(struct ips_epstate *eps); + +psm2_error_t ips_epstate_add(struct ips_epstate *eps, + struct ips_epaddr *ipsaddr, + ips_epstate_idx *connidx); +psm2_error_t ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx connidx); + +/* Use this to debug EP issues. */ +void ips_epstate_dump(struct ips_epstate *eps); + +PSMI_INLINE( +struct ips_epstate_entry * +ips_epstate_lookup(const struct ips_epstate *eps, ips_epstate_idx idx)) +{ + idx = (idx + eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1); + if (idx < (ips_epstate_idx)eps->eps_tabsize) + return &eps->eps_tab[idx]; + else + return NULL; +} + +#endif /* _IPS_EPSTATE_H */ diff --git a/prov/psm3/psm3/ptl_ips/ips_expected_proto.h b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h new file mode 100644 index 00000000000..e7044110b5f --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h @@ -0,0 +1,379 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +/* + * Control and state structure for one instance of the expected protocol. The + * protocol depends on some upcalls from internal portions of the receive + * protocol (such as opcodes dedicated for expected protocol handling) + */ + +/* + * Expected tid operations are carried out over "sessions". One session is a + * collection of N tids where N is determined by the expected message window + * size (-W option or PSM3_MQ_RNDV_NIC_WINDOW). Since naks can cause + * retransmissions, each session has an session index (_desc_idx) and a + * generation count (_desc_genc) to be able to identify if retransmitted + * packets reference the correct session. + * + * index and generation count are each 4 bytes encoded in one ptl_arg. They + * could be compressed further but we have the header space, so we don't + * bother. + */ + +#ifndef __IPS_EXPECTED_PROTO_H__ + +#define __IPS_EXPECTED_PROTO_H__ 1 + +#define _desc_idx u32w0 +#define _desc_genc u32w1 + +/* + * For debug and/or other reasons, we can log the state of each tid and + * optionally associate it to a particular receive descriptor + */ + +#define TIDSTATE_FREE 0 +#define TIDSTATE_USED 1 + +struct ips_tidinfo { + uint32_t tid; + uint32_t state; + struct ips_tid_recv_desc *tidrecvc; +}; + +struct ips_protoexp { + const struct ptl *ptl; + struct ips_proto *proto; + struct psmi_timer_ctrl *timerq; + struct ips_tf tfc; + + psm_transfer_type_t ctrl_xfer_type; + struct ips_scbctrl tid_scbc_rv; // pool of SCBs for TID sends + // for OPA this includes: TIDEXP, CTS, + // EXPTID_COMPLETION + // For UD: CTS, ERR_CHK_RDMA, + // ERR_CHK_RDMA_RESP + mpool_t tid_desc_send_pool; + mpool_t tid_getreq_pool; + mpool_t tid_sreq_pool; /* backptr into proto->ep->mq */ + mpool_t tid_rreq_pool; /* backptr into proto->ep->mq */ + uint32_t tid_flags; + + STAILQ_HEAD(ips_tid_send_pend, /* pending exp. 
sends */ + ips_tid_send_desc) pend_sendq; + struct psmi_timer timer_send; + + STAILQ_HEAD(ips_tid_get_pend, ips_tid_get_request) pend_getreqsq; /* pending tid reqs */ +#ifdef RNDV_MOD_MR + STAILQ_HEAD(ips_tid_err_resp_pend, ips_epaddr) pend_err_resp; /* pending ERR CHK RDMA RESP */ +#endif + /* services pend_getreqsq and pend_err_chk_rdma_resp */ + struct psmi_timer timer_getreqs; + +#ifdef PSM_CUDA + STAILQ_HEAD(ips_tid_get_cudapend, /* pending cuda transfers */ + ips_tid_get_request) cudapend_getreqsq; + struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_recv_cfg; + struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_small_recv_cfg; + mpool_t cuda_hostbuf_pool_recv; + mpool_t cuda_hostbuf_pool_small_recv; + CUstream cudastream_recv; +#endif +}; + +/* + * TID member list format used in communication. + * Since the compiler does not make sure the bit fields order, + * we use mask and shift defined below. +typedef struct { + uint32_t length:11; // in page unit, max 1024 pages + uint32_t reserved:9; // for future usage + uint32_t tidctrl:2; // hardware defined tidctrl value + uint32_t tid:10; // hardware only support 10bits +} +ips_tid_session_member; + */ +#define IPS_TIDINFO_LENGTH_SHIFT 0 +#define IPS_TIDINFO_LENGTH_MASK 0x7ff +#define IPS_TIDINFO_TIDCTRL_SHIFT 20 +#define IPS_TIDINFO_TIDCTRL_MASK 0x3 +#define IPS_TIDINFO_TID_SHIFT 22 +#define IPS_TIDINFO_TID_MASK 0x3ff + +#define IPS_TIDINFO_GET_LENGTH(tidinfo) \ + (((tidinfo)>>IPS_TIDINFO_LENGTH_SHIFT)&IPS_TIDINFO_LENGTH_MASK) +#define IPS_TIDINFO_GET_TIDCTRL(tidinfo) \ + (((tidinfo)>>IPS_TIDINFO_TIDCTRL_SHIFT)&IPS_TIDINFO_TIDCTRL_MASK) +#define IPS_TIDINFO_GET_TID(tidinfo) \ + (((tidinfo)>>IPS_TIDINFO_TID_SHIFT)&IPS_TIDINFO_TID_MASK) + +// This structure is used as CTS payload to describe TID receive +// for UD it describes the destination for an RDMA Write +// N/A for UDP +typedef struct ips_tid_session_list_tag { + // TBD on how we will handle unaligned start/end at receiver + uint32_t tsess_srcoff; /* source offset from beginning */ + uint32_t tsess_length; /* session length, including start/end */ + uint64_t tsess_raddr; /* RDMA virt addr this part of receiver's buffer */ + /* already adjusted for srcoff */ + uint32_t tsess_rkey; /* rkey for receiver's buffer */ +} PACK_SUFFIX ips_tid_session_list; + +/* + * Send-side expected send descriptors. + * + * Descriptors are allocated when tid grant requests are received (the 'target' + * side of an RDMA get request). Descriptors are added to a pending queue of + * expected sends and processed one at a time (scb's are requested and messages + * sent until all fragments of the descriptor's length are put on the wire). 
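For reference, the TID member-list encoding defined earlier in this header can be packed as shown in the sketch below (illustrative helper, not part of the patch; the IPS_TIDINFO_GET_* macros above perform the inverse):

/* Pack length (11 bits, in page units), tidctrl (2 bits) and tid (10 bits)
 * into one 32-bit word using the IPS_TIDINFO_* shifts/masks defined above. */
static inline uint32_t ips_tidinfo_pack_sketch(uint32_t length, uint32_t tidctrl,
					       uint32_t tid)
{
	return ((length & IPS_TIDINFO_LENGTH_MASK) << IPS_TIDINFO_LENGTH_SHIFT) |
	       ((tidctrl & IPS_TIDINFO_TIDCTRL_MASK) << IPS_TIDINFO_TIDCTRL_SHIFT) |
	       ((tid & IPS_TIDINFO_TID_MASK) << IPS_TIDINFO_TID_SHIFT);
}

/* Example: ips_tidinfo_pack_sketch(16, 1, 5) yields a word for which
 * IPS_TIDINFO_GET_LENGTH() == 16, IPS_TIDINFO_GET_TIDCTRL() == 1 and
 * IPS_TIDINFO_GET_TID() == 5. */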
+ * + */ +#define TIDSENDC_SDMA_VEC_DEFAULT 260 + +struct ips_tid_send_desc { + struct ips_protoexp *protoexp; + STAILQ_ENTRY(ips_tid_send_desc) next; + + /* Filled in at allocation time */ + ptl_arg_t sdescid; /* sender descid */ + ptl_arg_t rdescid; /* reciever descid */ + ips_epaddr_t *ipsaddr; + psm2_mq_req_t mqreq; + + psm2_verbs_mr_t mr; + + /* Iterated during send progress */ + void *userbuf; /* user privided buffer */ + void *buffer; + uint32_t length; /* total length, includint start/end */ + + + uint8_t is_complete:1; // all packets for send queued, waiting CQE/response +#ifdef RNDV_MOD_MR + uint8_t rv_need_err_chk_rdma:1; // need to determine if a retry is required + uint8_t reserved:6; + uint8_t rv_sconn_index; // sconn in rv we issued RDMA write on + uint32_t rv_conn_count;// Count of sconn completed conn establishments +#else + uint8_t reserved:7; +#endif + +#ifdef PSM_CUDA + /* As size of cuda_hostbuf is less than equal to window size, + * there is a guarantee that the maximum number of host bufs we + * would need to attach to a tidsendc would be 2 + */ + struct ips_cuda_hostbuf *cuda_hostbuf[2]; + /* Number of hostbufs attached */ + uint8_t cuda_num_buf; +#endif + // ips_tid_session_list is fixed sized for UD + // N/A to UDP + ips_tid_session_list tid_list; +}; + +#define TIDRECVC_STATE_FREE 0 +#define TIDRECVC_STATE_BUSY 1 + +struct ips_expected_recv_stats { + uint32_t nSeqErr; + uint32_t nGenErr; + uint32_t nReXmit; + uint32_t nErrChkReceived; +}; + +struct ips_tid_recv_desc { + const psmi_context_t *context; + struct ips_protoexp *protoexp; + + ptl_arg_t rdescid; /* reciever descid */ + ips_epaddr_t *ipsaddr; + struct ips_tid_get_request *getreq; + + /* scb to send tid grant CTS */ + ips_scb_t *grantscb; + psm2_verbs_mr_t mr; // MR for this message window/chunk + + /* TF protocol state (recv) */ + uint32_t state; + // TBD - these next 3 fields are probably not needed for PSM_UD USE_RC + uint32_t tidflow_active_gen; + uint32_t tidflow_nswap_gen; + psmi_seqnum_t tidflow_genseq; + +#ifdef PSM_CUDA + struct ips_cuda_hostbuf *cuda_hostbuf; + uint8_t is_ptr_gpu_backed; +#endif + + void *buffer; + uint32_t recv_msglen; + + struct ips_expected_recv_stats stats; + + /* bitmap of queued control messages for */ + uint16_t ctrl_msg_queued; + // ips_tid_session_list is fixed sized for UD + // N/A to UDP + ips_tid_session_list tid_list; +}; + +/* + * Get requests, issued by MQ when there's a match on a large message. Unlike + * an RDMA get, the initiator identifies the location of the data at the target + * using a 'send token' instead of a virtual address. This, of course, assumes + * that the target has already registered the token and communicated it to the + * initiator beforehand (it actually sends the token as part of the initial + * MQ message that contains the MQ tag). + * + * The operation is semantically a two-sided RDMA get. + */ +typedef void (*ips_tid_completion_callback_t) (psm2_mq_req_t); + +struct ips_tid_get_request { + STAILQ_ENTRY(ips_tid_get_request) tidgr_next; + struct ips_protoexp *tidgr_protoexp; + psm2_epaddr_t tidgr_epaddr; + + void *tidgr_lbuf; + uint32_t tidgr_length; + uint32_t tidgr_rndv_winsz; + uint32_t tidgr_sendtoken; + ips_tid_completion_callback_t tidgr_callback; + psm2_mq_req_t tidgr_req; + + uint32_t tidgr_offset; /* offset in bytes */ + uint32_t tidgr_bytesdone; + uint32_t tidgr_flags; + +#ifdef PSM_CUDA + int cuda_hostbuf_used; + uint32_t tidgr_cuda_bytesdone; + STAILQ_HEAD(ips_tid_getreq_cuda_hostbuf_pend, /* pending exp. 
sends */ + ips_cuda_hostbuf) pend_cudabuf; +#endif +}; + +/* + * Descriptor limits, structure contents of struct psmi_rlimit_mpool for + * normal, min and large configurations. + */ +#define TID_SENDSESSIONS_LIMITS { \ + .env = "PSM3_RDMA_SENDSESSIONS_MAX", \ + .descr = "RDMA max send session descriptors", \ + .env_level = PSMI_ENVVAR_LEVEL_USER, \ + .minval = 1, \ + .maxval = 1<<30, \ + .mode[PSMI_MEMMODE_NORMAL] = { 256, 8192 }, \ + .mode[PSMI_MEMMODE_MINIMAL] = { 1, 1 }, \ + .mode[PSMI_MEMMODE_LARGE] = { 512, 16384 } \ + } + +/* + * Expected send support + */ +/* + * The expsend token is currently always a pointer to a MQ request. It is + * echoed on the wire throughout various phases of the expected send protocol + * to identify a particular send. + */ +psm2_error_t +MOCKABLE(ips_protoexp_init)(const psmi_context_t *context, + const struct ips_proto *proto, + uint32_t protoexp_flags, int num_of_send_bufs, + int num_of_send_desc, + struct ips_protoexp **protoexp_o); +MOCK_DCL_EPILOGUE(ips_protoexp_init); + +psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp); + +int ips_protoexp_handle_immed_data(struct ips_proto *proto, uint64_t conn_ref, + int conn_type, uint32_t immed, uint32_t len); +int ips_protoexp_rdma_write_completion( uint64_t wr_id); +#ifdef RNDV_MOD_MR +int ips_protoexp_rdma_write_completion_error(psm2_ep_t ep, uint64_t wr_id, + enum ibv_wc_status wc_status); +int ips_protoexp_process_err_chk_rdma(struct ips_recvhdrq_event *rcv_ev); +int ips_protoexp_process_err_chk_rdma_resp(struct ips_recvhdrq_event *rcv_ev); +#endif + + +PSMI_ALWAYS_INLINE( +void ips_protoexp_unaligned_copy(uint8_t *dst, uint8_t *src, uint16_t len)) +{ + while (len) { + dst[len-1] = src[len-1]; + len--; + } +} + +/* + * Peer is waiting (blocked) for this request + */ +#define IPS_PROTOEXP_TIDGET_WAIT 0x1 +#define IPS_PROTOEXP_TIDGET_PEERWAIT 0x2 +psm2_error_t ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, + void *buf, uint32_t length, + psm2_epaddr_t epaddr, + uint32_t remote_tok, uint32_t flags, + ips_tid_completion_callback_t + callback, psm2_mq_req_t req); +psm2_error_t +ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, + ips_epaddr_t *ipsaddr, psm2_mq_req_t req, + ptl_arg_t rdescid, uint32_t tidflow_genseq, + ips_tid_session_list *tid_list, + uint32_t tid_list_size); +#endif /* #ifndef __IPS_EXPECTED_PROTO_H__ */ diff --git a/prov/psm3/psm3/ptl_ips/ips_opp_path_rec.c b/prov/psm3/psm3/ptl_ips/ips_opp_path_rec.c new file mode 100644 index 00000000000..395326d05e0 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_opp_path_rec.c @@ -0,0 +1,582 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_proto.h" +#include <dlfcn.h> + +/* SLID and DLID are in network byte order */ +static psm2_error_t +ips_opp_get_path_rec(ips_path_type_t type, struct ips_proto *proto, + uint16_t slid, uint16_t dlid, + ips_path_rec_t **ppath_rec) +{ + psm2_error_t err = PSM2_OK; + ibta_path_rec_t query, opp_response; +#ifdef _HFI_DEBUGGING + int opp_response_set = 0; +#endif + ips_path_rec_t *path_rec; + int opp_err; + ENTRY elid, *epath = NULL; + char eplid[128]; + uint64_t timeout_ack_ms; + + /* Query path record query cache first */ + bzero(&query, sizeof(query)); + bzero(eplid, sizeof(eplid)); + + /* Bulk service ID is control service id + 1 */ + switch (type) { + case IPS_PATH_LOW_PRIORITY: + query.service_id = + __cpu_to_be64(proto->ep->service_id + DATA_VFABRIC_OFFSET); + break; + case IPS_PATH_NORMAL_PRIORITY: + case IPS_PATH_HIGH_PRIORITY: + default: + query.service_id = __cpu_to_be64(proto->ep->service_id); + } + + query.slid = slid; + query.dlid = dlid; + + snprintf(eplid, sizeof(eplid), "%s_%x_%x", + (type == IPS_PATH_LOW_PRIORITY) ?
"LOW" : "HIGH", + query.slid, query.dlid); + elid.key = eplid; + hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash); + + if (!epath) { /* Unable to find path record in cache */ + elid.key = + psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1); + path_rec = (ips_path_rec_t *) + psmi_calloc(proto->ep, UNDEFINED, 1, + sizeof(ips_path_rec_t)); + if (!elid.key || !path_rec) { + if (elid.key) + psmi_free(elid.key); + if (path_rec) + psmi_free(path_rec); + err = PSM2_NO_MEMORY; + goto fail; + } + + /* Get path record between local LID and remote */ + opp_err = + proto->opp_fn.op_path_get_path_by_rec(proto->opp_ctxt, + &query, + &opp_response); + if (opp_err) { + psmi_free(path_rec); + psmi_free(elid.key); + err = PSM2_EPID_PATH_RESOLUTION; + goto fail; + } +#ifdef _HFI_DEBUGGING + opp_response_set = 1; +#endif + // this should not happen since we are using a LID to LID query + // but at some point we need to figure out how to deal with + // virtualized IB environments where a GRH may be needed + // HOP Limit >1 indicates a global route with a GRH + if ((__be32_to_cpu(opp_response.hop_flow_raw) & 0xFF) > 1) { + _HFI_ERROR + ("Global Routed Path Record not supported SLID 0x%d DLID 0x%x\n", + __be16_to_cpu(slid), __be16_to_cpu(dlid)); + err = PSM2_EPID_PATH_RESOLUTION; + goto fail; + } + /* Create path record */ + path_rec->pr_slid = opp_response.slid; + path_rec->pr_dlid = opp_response.dlid; + path_rec->pr_mtu = + min(opa_mtu_enum_to_int(opp_response.mtu & 0x3f) + - MAX_PSM_HEADER + , proto->epinfo.ep_mtu); + path_rec->pr_pkey = ntohs(opp_response.pkey); + path_rec->pr_sl = ntohs(opp_response.qos_class_sl); + path_rec->pr_static_rate = opp_response.rate & 0x3f; + + /* Setup CCA parameters for path */ + if (path_rec->pr_sl > PSMI_SL_MAX) { + psmi_free(path_rec); + psmi_free(elid.key); + err = PSM2_INTERNAL_ERR; + goto fail; + } + + /* Compute max timeout based on pkt life time for path */ + timeout_ack_ms = + ((4096UL * (1UL << (opp_response.pkt_life & 0x3f))) / + 1000000UL); + timeout_ack_ms = + ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT + + timeout_ack_ms); + if (proto->epinfo.ep_timeout_ack_max < timeout_ack_ms) + proto->epinfo.ep_timeout_ack_max = timeout_ack_ms; + err = ips_make_ah(proto->ep, path_rec); + if (err != PSM2_OK) { + psmi_free(elid.key); + psmi_free(path_rec); + return err; + } + + /* Add path record into cache */ + strcpy(elid.key, eplid); + elid.data = (void *)path_rec; + hsearch_r(elid, ENTER, &epath, &proto->ips_path_rec_hash); + } else /* Path record found in cache */ + path_rec = (ips_path_rec_t *) epath->data; + +#ifdef _HFI_DEBUGGING + /* Dump path record stats */ + _HFI_PRDBG("Path Record ServiceID: %" PRIx64 " %x -----> %x\n", + (uint64_t) __be64_to_cpu(query.service_id), + __be16_to_cpu(slid), __be16_to_cpu(dlid)); + if (opp_response_set) + { + _HFI_PRDBG("MTU: %x, %x\n", (opp_response.mtu & 0x3f), + path_rec->pr_mtu); + _HFI_PRDBG("PKEY: 0x%04x\n", ntohs(opp_response.pkey)); + _HFI_PRDBG("SL: 0x%04x\n", ntohs(opp_response.qos_class_sl)); + _HFI_PRDBG("Rate: %x\n", (opp_response.rate & 0x3f)); + } + _HFI_PRDBG("Timeout Init.: 0x%" PRIx64 " Max: 0x%" PRIx64 "\n", + proto->epinfo.ep_timeout_ack, + proto->epinfo.ep_timeout_ack_max); +#endif + /* Return the IPS path record */ + *ppath_rec = path_rec; + +fail: + return err; +} + +static psm2_error_t +ips_opp_path_rec(struct ips_proto *proto, + uint16_t slid, uint16_t dlid, + uint16_t ip_hi, // unused here, but must match API signature + unsigned long timeout, ips_path_grp_t **ppathgrp) +{ + psm2_error_t err = 
PSM2_OK; + uint16_t pidx, cpath, num_path = (1 << proto->epinfo.ep_lmc); + ips_path_type_t path_type = IPS_PATH_NORMAL_PRIORITY; + ips_path_rec_t *path; + ips_path_grp_t *pathgrp; + uint16_t path_slid, path_dlid; + ENTRY elid, *epath = NULL; + char eplid[128]; + + /* + * High Priority Path + * ------------------ + * + * Uses the "base" Service ID. For now there exists only 1 high priority + * path between nodes even for non zero LMC fabrics. + * + * Normal/Low Priority Paths + * ------------------------- + * + * Currently these paths are the same i.e. they are queried for the same + * Service ID/vFabric which is the Base Service ID for High Priority + 1. + * + * Use case Scenarios + * ------------------ + * + * Since with vFabrics we have the capability to define different QoS + * parameters per vFabric it is envisioned that the IPS_PATH_HIGH_PRIORITY is + * setup in a separate vFabric for high priority traffic. The NORMAL paths + * are setup in a separate vFabric optimized for high bandwidth. This allows + * us to potentially have control traffic (RTS, CTS etc.) not be bottlenecked + * by bulk transfer data. All control messages (ACKs,NAKs, TID_GRANT etc.) + * also use the high priority control vFabric. + * + * NOTE: In order to distinguish between the different vFabrics the user + * specifies the service ID to use via mpirun (or environment variable). + * This is the service ID for the high priority control traffic. The bulk + * data vFabric is identified by service ID + 1. So for each MPI application + * one should specify two service IDs for the high priority and bulk data. + * Both these service IDs can be placed in the same vFabric which can be + * configured for high priority or bandwidth traffic giving us the default + * behavior upto Infinhfi 2.5 release. + * + * NOTE: All of the above would have really helped if the S20 silicon could + * correctly support IBTA QoS features. Due to S20 design we can only have + * high priority VLarb table (low priority VLarb table results in round + * robin arbitration ignoring the weights!). But if this is fixed in a + * subsequent chip respin then this may potentially help our scalability + * on large fabrics. + * + * Mesh/Torus and DOR routed networks + * ---------------------------------- + * + * In a mesh/torus fabric we always have a non zero LMC (at least 1 can be + * more). We would like to take advantage of dispersive routing on these + * fabrics as well to obtain better "worst case/congested" bandwidth. For + * these networks currently the base LIDs are used for UPDN routing which + * is suboptimal on these networks. Higher order LIDs (+1 .. +N) use DOR + * routing (Dimension Ordered Routing) to avoid deadlocks and provide + * higher performance. If a fabric is disrupted then only the base UPDN + * routing is available. PSM should continue to operate in this environment + * albeit with degraded performance. In disrupted fabric the OPP path + * record queries may fail for some DOR routed LIDs i.e. no path exists + * PSM should hence ignore path record failures as they indicate a disrupted + * fabric and only use valid paths that are returned from the replica. This + * will degenerate to only using the UPDN paths on disrupted fabrics and DOR + * routes only for fully configured fabrics. Note: For a clean fabric the + * base LIDs that are configured for UPDN route will not exist in the replica + * as DOR routes are preferred. Hence we will only dispersively route across + * the DOR routes only using the UPDN route for disrupted fabrics. 
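A small sketch (not part of the patch; helper name hypothetical) of the LID arithmetic the loops below use for dispersive routing on a fabric with non-zero LMC:

/* slid/dlid are the base LIDs in network byte order; pidx ranges over
 * [0, 1 << ep_lmc), each value selecting one LMC-derived LID pair. */
static inline void lid_pair_for_path_sketch(uint16_t slid, uint16_t dlid,
					    uint16_t pidx,
					    uint16_t *path_slid, uint16_t *path_dlid)
{
	*path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
	*path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);
}

/* On a disrupted DOR-routed fabric some of these LID pairs may have no path
 * record; as the comment above explains, those lookups are skipped and
 * communication proceeds as long as at least one path per priority resolves. */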
+ * + * AS LONG AS ONE PATH EXISTS (for each of the priorities) COMMUNICATION CAN + * TAKE PLACE. + */ + + /* Check if this path grp is already in hash table */ + snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid); + elid.key = eplid; + hsearch_r(elid, FIND, &epath, &proto->ips_path_grp_hash); + + if (epath) { /* Find path group in cache */ + *ppathgrp = (ips_path_grp_t *) epath->data; + return err; + } + + /* If base lids are only used then reset num_path to 1 */ + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE) + num_path = 1; + + /* Allocate a new pathgroup */ + elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1); + pathgrp = (ips_path_grp_t *) + psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_grp_t) + + num_path * IPS_PATH_MAX_PRIORITY * + sizeof(ips_path_rec_t *)); + if (!elid.key || !pathgrp) { + if (elid.key) + psmi_free(elid.key); + if (pathgrp) + psmi_free(pathgrp); + err = PSM2_NO_MEMORY; + goto fail; + } + + /* + * dlid is the peer base lid. + * slid is the base lid for the local end point. + * Store here in network byte order. + */ + pathgrp->pg_base_dlid = dlid; + pathgrp->pg_base_slid = slid; + + pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] = + pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] = + pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = 0; + + /* For now there is always only one high priority path between nodes. */ + for (pidx = 0, cpath = 0; pidx < num_path && cpath == 0; pidx++) { + path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx); + path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx); + + err = ips_opp_get_path_rec(IPS_PATH_HIGH_PRIORITY, proto, + path_slid, path_dlid, + &path); + + if (err == PSM2_OK) { /* Valid high priority path found */ + /* Resolved high priority path successfully */ + pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY]++; + pathgrp->pg_path[cpath][IPS_PATH_HIGH_PRIORITY] = path; + + /* Increment current path index */ + cpath++; + } + + PSM2_LOG_MSG("path %p slid %hu dlid %hu\n", + path, + __be16_to_cpu(path->pr_slid), + __be16_to_cpu(path->pr_dlid)); + } + + /* Make sure we have atleast 1 high priority path */ + if (pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] == 0) { + psmi_free(elid.key); + psmi_free(pathgrp); + err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION, + "OFED Plus path lookup failed. Unable to resolve high priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %" + PRIx64 " defined?", ntohs(slid), + ntohs(dlid), + (uint64_t) proto->ep->service_id); + goto fail; + } + + + + /* Next setup the bulk paths. If the subnet administrator has misconfigured + * or rather not configured two separate service IDs we place the bulk + * paths in the same vFabric as the control paths. + */ + + path_type = IPS_PATH_NORMAL_PRIORITY; + for (pidx = 0, cpath = 0; pidx < num_path; pidx++) { + path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx); + path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx); + +retry_normal_path_res: + err = ips_opp_get_path_rec(path_type, proto, + path_slid, path_dlid, + &path); + if (err != PSM2_OK) { + if (path_type == IPS_PATH_NORMAL_PRIORITY) { + /* Subnet may only be configured for one service ID/vFabric. Default + * to using the control vFabric/service ID for bulk data as well. + */ + path_type = IPS_PATH_HIGH_PRIORITY; + goto retry_normal_path_res; + } + + /* Unable to resolve path for . This is possible + * for disrupted fabrics using DOR routing so continue to acquire paths + */ + err = PSM2_OK; + continue; + } + + /* Valid path. 
*/ + pathgrp->pg_path[cpath][IPS_PATH_NORMAL_PRIORITY] = path; + pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY]++; + cpath++; + } + + /* Make sure we have at least have a single bulk data transfer path */ + if (pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] == 0) { + psmi_free(elid.key); + psmi_free(pathgrp); + err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION, + "OFED Plus path lookup failed. Unable to resolve normal priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %" + PRIx64 " defined?", ntohs(slid), + ntohs(dlid), + (uint64_t) proto->ep->service_id); + goto fail; + } + + path_type = IPS_PATH_LOW_PRIORITY; + for (pidx = 0, cpath = 0; pidx < num_path; pidx++) { + path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx); + path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx); + +retry_low_path_res: + err = ips_opp_get_path_rec(path_type, proto, + path_slid, path_dlid, + &path); + if (err != PSM2_OK) { + if (path_type == IPS_PATH_LOW_PRIORITY) { + /* Subnet may only be configured for one service ID/vFabric. Default + * to using the control vFabric/service ID for bulk data as well. + */ + path_type = IPS_PATH_HIGH_PRIORITY; + goto retry_low_path_res; + } + + /* Unable to resolve path for . This is possible + * for disrupted fabrics using DOR routing so continue to acquire paths + */ + err = PSM2_OK; + continue; + } + + /* Valid path. */ + pathgrp->pg_path[cpath][IPS_PATH_LOW_PRIORITY] = path; + pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY]++; + cpath++; + } + + /* Make sure we have at least have a single bulk data transfer path */ + if (pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] == 0) { + psmi_free(elid.key); + psmi_free(pathgrp); + err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION, + "OFED Plus path lookup failed. Unable to resolve low priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %" + PRIx64 " defined?", ntohs(slid), + ntohs(dlid), + (uint64_t) proto->ep->service_id); + goto fail; + } + + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) { + pathgrp->pg_next_path[IPS_PATH_NORMAL_PRIORITY] = + proto->epinfo.EP_HASH % + pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY]; + pathgrp->pg_next_path[IPS_PATH_LOW_PRIORITY] = + proto->epinfo.EP_HASH % + pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY]; + } + + /* Add path group into cache */ + strcpy(elid.key, eplid); + elid.data = (void *)pathgrp; + hsearch_r(elid, ENTER, &epath, &proto->ips_path_grp_hash); + + *ppathgrp = pathgrp; + +fail: + if (err != PSM2_OK) + _HFI_PRDBG + ("Unable to get path record for LID 0x%x <---> DLID 0x%x.\n", + slid, dlid); + return err; +} + +static psm2_error_t ips_opp_fini(struct ips_proto *proto) +{ + psm2_error_t err = PSM2_OK; + + if (proto->opp_lib) + dlclose(proto->opp_lib); + + return err; +} + +psm2_error_t ips_opp_init(struct ips_proto *proto) +{ + psm2_error_t err = PSM2_OK; + char hfiName[32]; + + proto->opp_lib = dlopen(DF_OPP_LIBRARY, RTLD_NOW); + if (!proto->opp_lib) { + char *err = dlerror(); + _HFI_ERROR + ("Unable to open OFED Plus Plus library %s. Error: %s\n", + DF_OPP_LIBRARY, err ? 
err : "no dlerror()"); + goto fail; + } + + /* Resolve symbols that we require within opp library */ + proto->opp_fn.op_path_find_hca = + dlsym(proto->opp_lib, "op_path_find_hfi"); + proto->opp_fn.op_path_open = dlsym(proto->opp_lib, "op_path_open"); + proto->opp_fn.op_path_close = dlsym(proto->opp_lib, "op_path_close"); + proto->opp_fn.op_path_get_path_by_rec = + dlsym(proto->opp_lib, "op_path_get_path_by_rec"); + + /* If we can't resovle any symbol then fail to load opp module */ + if (!proto->opp_fn.op_path_find_hca || !proto->opp_fn.op_path_open || + !proto->opp_fn.op_path_close + || !proto->opp_fn.op_path_get_path_by_rec) { + _HFI_ERROR + ("Unable to resolve symbols in OPP library. Unloading.\n"); + goto fail; + } + + /* If PSM3_IDENTIFY is set display the OPP library location being used. */ + if (psmi_parse_identify()) { + Dl_info info_opp; + printf + ("PSM3 path record queries using OFED Plus Plus (%s) from %s\n", + DF_OPP_LIBRARY, dladdr(proto->opp_fn.op_path_open, + &info_opp) ? info_opp. + dli_fname : + "Unknown/unsupported version of OPP library found!"); + } + + /* Obtain handle to hfi (requires verbs on node) */ + snprintf(hfiName, sizeof(hfiName), "%s_%d", + psmi_hal_get_hfi_name(), + proto->ep->unit_id); + proto->hndl = proto->opp_fn.op_path_find_hca(hfiName, &proto->device); + if (!proto->hndl) { + _HFI_ERROR + ("OPP: Unable to find NIC %s. Disabling OPP interface for path record queries.\n", + hfiName); + goto fail; + } + + /* Get OPP context */ + proto->opp_ctxt = proto->opp_fn.op_path_open(proto->device, 1); + if (!proto->opp_ctxt) { + _HFI_ERROR + ("OPP: Unable to obtain OPP context. Disabling OPP interface for path record queries.\n"); + goto fail; + } + + /* Setup default errorcheck timeout. OPP may change it later. */ + proto->epinfo.ep_timeout_ack = + ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT); + proto->epinfo.ep_timeout_ack_max = + ms_2_cycles(IPS_PROTO_ERRCHK_MS_MIN_DEFAULT); + proto->epinfo.ep_timeout_ack_factor = IPS_PROTO_ERRCHK_FACTOR_DEFAULT; + + /* OPP initialized successfully */ + proto->ibta.get_path_rec = ips_opp_path_rec; + proto->ibta.fini = ips_opp_fini; + proto->flags |= IPS_PROTO_FLAG_QUERY_PATH_REC; + + return err; + +fail: + _HFI_ERROR("Make sure SM is running...\n"); + _HFI_ERROR("Make sure service ibacm is running...\n"); + _HFI_ERROR("to start ibacm: service ibacm start\n"); + _HFI_ERROR("or enable it at boot time: iefsconfig -E ibacm\n\n"); + + err = psmi_handle_error(NULL, PSM2_EPID_PATH_RESOLUTION, + "Unable to initialize OFED Plus library successfully.\n"); + + if (proto->opp_lib) + dlclose(proto->opp_lib); + + return err; +} diff --git a/prov/psm3/psm3/ptl_ips/ips_path_rec.c b/prov/psm3/psm3/ptl_ips/ips_path_rec.c new file mode 100644 index 00000000000..7918ba319b8 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_path_rec.c @@ -0,0 +1,554 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include +#include +#include + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_proto.h" + +/* + * These are the default values used in parsing the environment + * variable PSM3_PATH_NO_LMC_RANGE, which can be used to exclude + * a range of message sizes from the LMC LID assignments used to + * implement dispersive routing. + * + * This value is 2^32 - 1. 
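 * Illustration (hypothetical values): setting PSM3_PATH_NO_LMC_RANGE=65536:1048576
 * would keep messages between 64 KiB and 1 MiB on the base LID pair while other
 * sizes still fan out across the LMC-derived paths; with the default below
 * (4294967295:4294967295) no realistic message size falls inside the range, so
 * dispersive routing is never suppressed.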
+ */ +#define DEF_LIMITS_STRING "4294967295:4294967295" +#define DEF_LIMITS_VALUE 4294967295 + + + + + +static psm2_error_t +ips_none_get_path_rec(struct ips_proto *proto, + uint16_t slid, uint16_t dlid, + uint16_t ip_hi, + unsigned long timeout, ips_path_rec_t **ppath_rec) +{ + psm2_error_t err = PSM2_OK; + ips_path_rec_t *path_rec; + ENTRY elid, *epath = NULL; + char eplid[128]; + + /* Query the path record cache */ + // TBD - slid same until have dispersive LMC-like, could just use dest + snprintf(eplid, sizeof(eplid), "%x_%x%04x", slid, ip_hi, dlid); + elid.key = eplid; + hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash); + + if (!epath) { + elid.key = + psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1); + path_rec = (ips_path_rec_t *) + psmi_calloc(proto->ep, UNDEFINED, 1, + sizeof(ips_path_rec_t)); + if (!elid.key || !path_rec) { + if (elid.key) + psmi_free(elid.key); + if (path_rec) + psmi_free(path_rec); + return PSM2_NO_MEMORY; + } + + /* Create path record */ + path_rec->pr_slid = slid; + path_rec->pr_dlid = dlid; + path_rec->pr_mtu = proto->epinfo.ep_mtu; + path_rec->pr_pkey = proto->epinfo.ep_pkey; + path_rec->pr_sl = proto->epinfo.ep_sl; + path_rec->pr_ip_hi = ip_hi; + path_rec->pr_static_rate = proto->epinfo.ep_link_rate; + + + /* Setup CCA parameters for path */ + if (path_rec->pr_sl > PSMI_SL_MAX) { + psmi_free(elid.key); + psmi_free(path_rec); + return PSM2_INTERNAL_ERR; + } + err = ips_make_ah(proto->ep, path_rec); + if (err != PSM2_OK) { + psmi_free(elid.key); + psmi_free(path_rec); + return err; + } + + /* Add path record into cache */ + strcpy(elid.key, eplid); + elid.data = (void *)path_rec; + hsearch_r(elid, ENTER, &epath, &proto->ips_path_rec_hash); + } else + path_rec = (ips_path_rec_t *) epath->data; + + /* Return IPS path record */ + *ppath_rec = path_rec; + + return err; +} + +// This works for UD address vectors as well as the ah_attr in an RC QP attrs +psm2_error_t ips_path_rec_to_ah_attr(psm2_ep_t ep, + const ips_path_rec_t *path_rec, struct ibv_ah_attr *ah_attr) +{ + memset(ah_attr, 0, sizeof *ah_attr); + + // we keep PR in network byte order + // ah_attr is in CPU byte order except for GIDs which are always + // in network byte order + ah_attr->sl = path_rec->pr_sl; + ah_attr->port_num = ep->portnum; + ah_attr->static_rate = path_rec->pr_static_rate; + // for OPA/IB we use dlid and is_global=0, for eth use dgid and is_global=1 + if (ep->verbs_ep.link_layer != IBV_LINK_LAYER_ETHERNET) { + // OPA or IB + // NIC/HCA/HFI will only look at low "LMC" worth of bits + ah_attr->src_path_bits = __be16_to_cpu(path_rec->pr_slid); + ah_attr->dlid = __be16_to_cpu(path_rec->pr_dlid); + ah_attr->is_global = 0; + _HFI_UDDBG("creating AH with DLID %u\n", ah_attr->dlid); + } else { + ah_attr->src_path_bits = 0; + ah_attr->dlid = 1; // not used on ethernet, make non-zero + ah_attr->is_global = 1; + ah_attr->grh.dgid = ep->verbs_ep.lgid; + ah_attr->grh.dgid.raw[12] = (uint8_t)(__be16_to_cpu(path_rec->pr_ip_hi)>>8); + ah_attr->grh.dgid.raw[13] = (uint8_t)(__be16_to_cpu(path_rec->pr_ip_hi)); + ah_attr->grh.dgid.raw[14] = (uint8_t)(__be16_to_cpu(path_rec->pr_dlid)>>8); + ah_attr->grh.dgid.raw[15] = (uint8_t)(__be16_to_cpu(path_rec->pr_dlid)); + ah_attr->grh.sgid_index = ep->verbs_ep.lgid_index; + ah_attr->grh.hop_limit = 0xFF; + ah_attr->grh.traffic_class = 0; + if (_HFI_UDDBG_ON) { + char buf[80]; + _HFI_UDDBG("creating AH with DGID: %s\n", + __psm2_dump_gid(&ah_attr->grh.dgid, buf, sizeof(buf))); + } + } + return PSM2_OK; +} + +psm2_error_t ips_make_ah(psm2_ep_t ep, 
ips_path_rec_t *path_rec) +{ + struct ibv_ah_attr ah_attr; + + if (path_rec->ah) { + _HFI_UDDBG("make_ah called second time on given path_rec, skipping\n"); + return PSM2_OK; + } + if (PSM2_OK != ips_path_rec_to_ah_attr(ep, path_rec, &ah_attr)) { + _HFI_ERROR( "Unable to convert path_rec to AH\n"); + return PSM2_INTERNAL_ERR; + } + path_rec->ah = ibv_create_ah(ep->verbs_ep.pd, &ah_attr); + if (! path_rec->ah) { + int save_errno = errno; + _HFI_ERROR( "Unable to create AH: %s (%d)\n", strerror(save_errno), save_errno); + if (save_errno == ETIMEDOUT) + return PSM2_EPID_PATH_RESOLUTION; + else + return PSM2_INTERNAL_ERR; + } + _HFI_UDDBG("created AH %p\n", path_rec->ah); + // PSM doesn't free path_rec structures on shutdown, so this will + // simply leak and be cleaned up by the kernel close when we shutdown + return PSM2_OK; +} + +#ifdef RNDV_MOD_MR +void ips_path_rec_to_ib_user_path_rec(psm2_ep_t ep, + const ips_path_rec_t *path_rec, union ibv_gid *dgid, + struct ib_user_path_rec *path) +{ + memset(path, 0, sizeof(*path)); + memcpy(&path->sgid, &ep->verbs_ep.lgid, sizeof(path->sgid)); + memcpy(&path->dgid, dgid, sizeof(path->dgid)); + path->slid = path_rec->pr_slid; /* __be16 */ + if (ep->verbs_ep.link_layer != IBV_LINK_LAYER_ETHERNET) + path->dlid = path_rec->pr_dlid; /* __be16 */ + else + path->dlid = __cpu_to_be16(1); + //path->raw_traffic + //path->flow_label + path->reversible = 1; + path->mtu = opa_mtu_int_to_enum(path_rec->pr_mtu); + path->pkey = __cpu_to_be16(path_rec->pr_pkey); /* __be16 */ + path->hop_limit = (ep->verbs_ep.link_layer == IBV_LINK_LAYER_ETHERNET) + ?0xFF:0; // indicates if need GRH + //path->traffic_class + path->numb_path = 1; + path->sl = path_rec->pr_sl; + path->mtu_selector = 2; /* Exactly the given MTU */ + path->rate_selector = 2; /* Exactly the given rate */ + // ips_path_rec.pr_static_rate is negotiated in PSM REQ/REP + // then also use negotiated rate in user RC QP, ah_attr above and here + path->rate = path_rec->pr_static_rate; + path->packet_life_time_selector = 2; /* Exactly the given LT */ + // the value supplied here will be increased by the CM based on ack_delay + // typically ack_delay will be small compared to packet_life_time + // in which case the CM wil end up using packet_life_time+1 as the timeout + // so we pass timeout-1 here so final timeout is usually what was requested + path->packet_life_time = ep->hfi_qp_timeout - 1; + //path->preferences +} +#endif // RNDV_MOD_MR + +static psm2_error_t +ips_none_path_rec(struct ips_proto *proto, + uint16_t slid, uint16_t dlid, + uint16_t ip_hi, + unsigned long timeout, ips_path_grp_t **ppathgrp) +{ + psm2_error_t err = PSM2_OK; + uint16_t pidx, num_path = (1 << proto->epinfo.ep_lmc); + uint16_t path_slid, path_dlid; + ips_path_rec_t *path; + ips_path_grp_t *pathgrp; + ENTRY elid, *epath = NULL; + char eplid[128]; + + num_path = 1; // don't yet have multi-path dispersive routing + // maybe we use env to derrive multiple sequential IP + // addresses, sort of like an LMC concept + // or use ECMP or other mechanism + + /* For the "none" path record resolution all paths are assumed to be + * of equal priority however since we want to isolate all control + * traffic (acks, naks) to a separate path for non zero LMC subnets + * the "first path" between a pair of endpoints is always the "higher" + * priority paths. The rest of the paths are the normal (and low + * priority) paths. 
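 * Worked example: num_path is currently forced to 1 above, so the single path is
 * reused for HIGH, NORMAL and LOW priority. If num_path were, say, 4 (LMC 2), the
 * loop below would file path 0 under IPS_PATH_HIGH_PRIORITY and paths 1-3 under
 * both NORMAL and LOW priority, matching the pg_num_paths settings of 1,
 * num_path - 1 and num_path - 1.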
+ */ + + /* Query the path record cache */ + // TBD - slid same until have dispersive LMC-like, could just use dest + snprintf(eplid, sizeof(eplid), "%x_%x%04x", slid, ip_hi, dlid); + elid.key = eplid; + hsearch_r(elid, FIND, &epath, &proto->ips_path_grp_hash); + + if (epath) { /* Find path group in cache */ + *ppathgrp = (ips_path_grp_t *) epath->data; + return err; + } + + /* If base lids are only used then reset num_path to 1 */ + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE) + num_path = 1; + + /* Allocate a new pathgroup */ + elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1); + pathgrp = (ips_path_grp_t *) + psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_grp_t) + + num_path * IPS_PATH_MAX_PRIORITY * + sizeof(ips_path_rec_t *)); + if (!elid.key || !pathgrp) { + if (elid.key) + psmi_free(elid.key); + if (pathgrp) + psmi_free(pathgrp); + err = PSM2_NO_MEMORY; + goto fail; + } + + /* + * dlid is the peer base lid. + * slid is the base lid for the local end point. + * Store in network byte order. + */ + pathgrp->pg_base_dlid = dlid; + pathgrp->pg_base_slid = slid; + + if (num_path > 1) { + /* One control path and (num_path - 1) norm and low priority paths */ + pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] = 1; + pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] = num_path - 1; + pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = num_path - 1; + } else { + /* LMC of 0. Use the same path for all priorities */ + pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY] = 1; + pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY] = 1; + pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY] = 1; + } + + /* For "none" path record we just setup 2^lmc paths. To get better load + * balance + */ + for (pidx = 0; pidx < num_path; pidx++) { + path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx); + path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx); + + err = + ips_none_get_path_rec(proto, path_slid, path_dlid, + ip_hi, + timeout, &path); + if (err != PSM2_OK) { + psmi_free(elid.key); + psmi_free(pathgrp); + goto fail; + } + + if (num_path > 1) { + if (pidx == 0) { + /* First path is always the high priority path */ + pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY] = + path; + } else { + pathgrp->pg_path[pidx - + 1][IPS_PATH_NORMAL_PRIORITY] = + path; + pathgrp->pg_path[pidx - + 1][IPS_PATH_LOW_PRIORITY] = + path; + } + } else { + pathgrp->pg_path[0][IPS_PATH_HIGH_PRIORITY] = path; + pathgrp->pg_path[0][IPS_PATH_NORMAL_PRIORITY] = path; + pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY] = path; + } + PSM2_LOG_MSG("path %p slid %hu dlid %hu ip_hi %hu\n", + path, + __be16_to_cpu(path->pr_slid), + __be16_to_cpu(path->pr_dlid), + __be16_to_cpu(path->pr_ip_hi)); + + } + + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) { + pathgrp->pg_next_path[IPS_PATH_NORMAL_PRIORITY] = + proto->epinfo.EP_HASH % + pathgrp->pg_num_paths[IPS_PATH_NORMAL_PRIORITY]; + pathgrp->pg_next_path[IPS_PATH_LOW_PRIORITY] = + proto->epinfo.EP_HASH % + pathgrp->pg_num_paths[IPS_PATH_LOW_PRIORITY]; + } + + /* Add path record into cache */ + strcpy(elid.key, eplid); + elid.data = (void *)pathgrp; + hsearch_r(elid, ENTER, &epath, &proto->ips_path_grp_hash); + + *ppathgrp = pathgrp; + +fail: + if (err != PSM2_OK) + _HFI_PRDBG + ("Unable to get path record for LID %x <---> DLID %x.\n", + slid, dlid); + return err; +} + +static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto) +{ + psm2_error_t err = PSM2_OK; + + /* Obtain the SL and PKEY to use from the environment (PSM3_NIC_SL & PSM_KEY) */ + proto->epinfo.ep_sl = proto->ep->out_sl; 
+ proto->epinfo.ep_pkey = (uint16_t) proto->ep->network_pkey; + + /* + * Parse the err_chk settings from the environment. + * min_timeout:max_timeout:timeout_factor + */ + { + union psmi_envvar_val env_to; + char *errchk_to = PSM_TID_TIMEOUT_DEFAULT; + int tvals[3] = { + IPS_PROTO_ERRCHK_MS_MIN_DEFAULT, + IPS_PROTO_ERRCHK_MS_MAX_DEFAULT, + IPS_PROTO_ERRCHK_FACTOR_DEFAULT + }; + + if (!psmi_getenv("PSM3_ERRCHK_TIMEOUT", + "Errchk timeouts in mS ", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)errchk_to, &env_to)) { + /* Not using default values, parse what we can */ + errchk_to = env_to.e_str; + psmi_parse_str_tuples(errchk_to, 3, tvals); + /* Adjust for max smaller than min, things would break */ + if (tvals[1] < tvals[0]) + tvals[1] = tvals[0]; + } + + proto->epinfo.ep_timeout_ack = ms_2_cycles(tvals[0]); + proto->epinfo.ep_timeout_ack_max = ms_2_cycles(tvals[1]); + proto->epinfo.ep_timeout_ack_factor = tvals[2]; + } + + proto->ibta.get_path_rec = ips_none_path_rec; + proto->ibta.fini = NULL; + + + return err; +} + + +/* On link up/down we need to update some state */ +psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto) +{ + psm2_error_t err = PSM2_OK; + + /* Get base lid, lmc and rate as these may have changed if the link bounced */ + proto->epinfo.ep_base_lid = + __cpu_to_be16((uint16_t) psm2_epid_nid(proto->ep->context.epid)); + + proto->epinfo.ep_lmc = 0; // No LMC for UD + proto->epinfo.ep_link_rate = proto->ep->verbs_ep.active_rate; + return err; +} + +psm2_error_t +MOCKABLE(ips_ibta_init)(struct ips_proto *proto) +{ + psm2_error_t err = PSM2_OK; + union psmi_envvar_val path_disable_lmc_interval; + + proto->flags |= IPS_PROTO_FLAG_PPOLICY_ADAPTIVE; + + /* Initialize path record/group hash table */ + + { + uint32_t lmc_disable_low, lmc_disable_high; + int sscanf_ret; + + /* The default disable_low and disable_high values + * are 2^32 - 1, the maximum allowable message size. + * So by default all messages should be smaller than the + * lower limit, and so will not have LMC dispersive + * routing disabled. + * + * Add to this, these limits are applied only to SDMA + * and PIO messages, NOT TID messages. So this size is + * bigger than any PIO size. + */ + psmi_getenv("PSM3_PATH_NO_LMC_RANGE", + "Disable LMC route dispersion within this range, " + "low_value:high_value\n", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)DEF_LIMITS_STRING, + &path_disable_lmc_interval); + + sscanf_ret = sscanf(path_disable_lmc_interval.e_str, "%u:%u", + &lmc_disable_low, &lmc_disable_high); + + /* + * It's "invalid" for the low end of the range to be + * larger than the high end of the range, so revert + * to the "maximum message size" (2^32 - 1). + */ + if ((sscanf_ret != 2) || (lmc_disable_low > lmc_disable_high)) { + lmc_disable_low = lmc_disable_high = DEF_LIMITS_VALUE; + } + + PSM2_LOG_MSG("PSM3_PATH_NO_LMC_RANGE: " + "lmc_disable_low %u lmc_disable_high %u\n", + lmc_disable_low, lmc_disable_high); + + /* + * These specify the range of message sizes in bytes for + * which LMC dynamic LID assignment is disabled. + */ + proto->ips_lmc_disable_low = lmc_disable_low; + proto->ips_lmc_disable_high = lmc_disable_high; + } + + hcreate_r(DF_PATH_REC_HASH_SIZE, &proto->ips_path_rec_hash); + hcreate_r(DF_PATH_GRP_HASH_SIZE, &proto->ips_path_grp_hash); + + /* On startup treat it as a link up/down event to set up state.
*/ + if ((err = ips_ibta_link_updown_event(proto)) != PSM2_OK) + goto fail; + + /* Setup the appropriate query interface for the endpoint */ + switch (proto->ep->path_res_type) { + case PSM2_PATH_RES_OPP: + err = ips_opp_init(proto); + if (err != PSM2_OK) + _HFI_ERROR + ("Unable to use OFED Plus Plus for path record queries.\n"); + break; + case PSM2_PATH_RES_UMAD: + _HFI_ERROR + ("Path record queries using UMAD is not supported in PSM version %d.%dx\n", + PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR); + err = PSM2_EPID_PATH_RESOLUTION; + break; + case PSM2_PATH_RES_NONE: + default: + err = ips_none_path_rec_init(proto); + } + +fail: + return err; +} +MOCK_DEF_EPILOGUE(ips_ibta_init); + +psm2_error_t ips_ibta_fini(struct ips_proto *proto) +{ + psm2_error_t err = PSM2_OK; + + if (proto->ibta.fini) + err = proto->ibta.fini(proto); + + /* Destroy the path record/group hash */ + hdestroy_r(&proto->ips_path_rec_hash); + hdestroy_r(&proto->ips_path_grp_hash); + + return err; +} diff --git a/prov/psm3/psm3/ptl_ips/ips_path_rec.h b/prov/psm3/psm3/ptl_ips/ips_path_rec.h new file mode 100644 index 00000000000..aa0ff5f1c93 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_path_rec.h @@ -0,0 +1,201 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2009-2014 Intel Corporation. All rights reserved. 
*/ + + +#ifndef _IPS_PATH_REC_H_ +#define _IPS_PATH_REC_H_ + +#include + +/* Default size of path record hash table */ +#define DF_PATH_REC_HASH_SIZE 2047 + +/* Default size of path group hash table */ +#define DF_PATH_GRP_HASH_SIZE 255 + +/* Default size of CCT table. Must be multiple of 64 */ +#define DF_CCT_TABLE_SIZE 128 + +/* CCT max IPD delay. */ +#define DF_CCT_MAX_IPD_DELAY_US 21 + +/* CCA divisor shift */ +#define CCA_DIVISOR_SHIFT 14 + +/* CCA ipd mask */ +#define CCA_IPD_MASK 0x3FFF + +/* A lot of these are IBTA specific defines that are available in other header + * files. To minimize dependencies with PSM build process they are listed + * here. Most of this is used to implement IBTA compliance features with PSM + * like path record query etc. + */ + +enum opa_mtu { + IBTA_MTU_256 = 1, + IBTA_MTU_512 = 2, + IBTA_MTU_1024 = 3, + IBTA_MTU_2048 = 4, + IBTA_MTU_4096 = 5, + OPA_MTU_8192 = 6, + OPA_MTU_10240 = 7, + IBTA_MTU_MIN = IBTA_MTU_256, + OPA_MTU_MIN = IBTA_MTU_256, + OPA_MTU_MAX = IBTA_MTU_4096, +}; + +typedef enum psm_ibv_rate opa_rate; + +static inline int opa_mtu_enum_to_int(enum opa_mtu mtu) +{ + switch (mtu) { + case IBTA_MTU_256: + return 256; + case IBTA_MTU_512: + return 512; + case IBTA_MTU_1024: + return 1024; + case IBTA_MTU_2048: + return 2048; + case IBTA_MTU_4096: + return 4096; + case OPA_MTU_8192: + return 8192; + case OPA_MTU_10240: + return 10240; + default: + return -1; + } +} + +static inline enum opa_mtu opa_mtu_int_to_enum(int mtu) +{ + // the PSM mtu may be slightly less than wire MTU to allow for + // PSM headers, so round up to nearest MTU enum + if (mtu <= 256) + return IBTA_MTU_256; + else if (mtu <= 512) + return IBTA_MTU_512; + else if (mtu <= 1024) + return IBTA_MTU_1024; + else if (mtu <= 2048) + return IBTA_MTU_2048; + else if (mtu <= 4096) + return IBTA_MTU_4096; +// TBD if we should allow these values on standard verbs + else if (mtu <= 8192) + return OPA_MTU_8192; + else + return OPA_MTU_10240; +} + +/* This is same as ob_path_rec from ib_types.h. Listed here to be self + * contained to minimize dependencies during build etc. + */ +typedef struct _ibta_path_rec { + uint64_t service_id; /* net order */ + uint8_t dgid[16]; + uint8_t sgid[16]; + uint16_t dlid; /* net order */ + uint16_t slid; /* net order */ + uint32_t hop_flow_raw; /* net order */ + uint8_t tclass; + uint8_t num_path; + uint16_t pkey; /* net order */ + uint16_t qos_class_sl; /* net order */ + uint8_t mtu; /* IBTA encoded */ + uint8_t rate; /* IBTA encoded */ + uint8_t pkt_life; /* IBTA encoded */ + uint8_t preference; + uint8_t resv2[6]; +} ibta_path_rec_t; + +/* + * PSM IPS path record components for endpoint. + * + * For Torus/non-zero LMC fabrics, pr_slid and pr_dlid may be different from + * the "base lid" values for this connection. 
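 * Illustration (hypothetical address): for an Ethernet peer at IPv4 192.168.1.7
 * the upper 16 bits of the address (0xc0a8) travel in pr_ip_hi and the lower 16
 * bits (0x0107) in pr_dlid; ips_path_rec_to_ah_attr() then splices those four
 * bytes into bytes 12-15 of the destination GID.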
+ */ +struct ips_proto; + +typedef struct ips_path_rec { + uint16_t pr_slid; + uint16_t pr_dlid; + uint16_t pr_mtu; /* PSM payload in bytes, < Path's MTU */ + uint16_t pr_pkey; + uint8_t pr_sl; + uint8_t pr_static_rate; // psm_ibv_rate enum + uint16_t pr_ip_hi; // high 16 bits of IP address for ethernet + // and low 16 are in pr_dlid + + // address handle for UD comms + struct ibv_ah *ah; +#ifdef RNDV_MOD_MR + psm2_rv_conn_t rv_conn; + uint8_t connecting; +#endif +} ips_path_rec_t; + +psm2_error_t ips_opp_init(struct ips_proto *proto); +psm2_error_t ips_make_ah(psm2_ep_t ep, ips_path_rec_t *path_rec); +psm2_error_t ips_path_rec_to_ah_attr(psm2_ep_t ep, + const ips_path_rec_t *path_rec, struct ibv_ah_attr *ah_attr); +#ifdef RNDV_MOD_MR +void ips_path_rec_to_ib_user_path_rec(psm2_ep_t ep, + const ips_path_rec_t *path_rec, union ibv_gid *dgid, + struct ib_user_path_rec *path); +#endif + +#endif diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.c b/prov/psm3/psm3/ptl_ips/ips_proto.c new file mode 100644 index 00000000000..9083570645b --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_proto.c @@ -0,0 +1,1600 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +/* + * IPS - Interconnect Protocol Stack. 
+ */ + +#include +#include /* writev */ +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include "ips_proto_help.h" +#include "psmi_wrappers.h" +#include "psm_mq_internal.h" + +#ifdef PSM_CUDA +#include "psm_gdrcpy.h" +#endif + +/* + * Control message types have their own flag to determine whether a message of + * that type is queued or not. These flags are kept in a state bitfield. + */ +#define CTRL_MSG_ACK_QUEUED 0x0001 +#define CTRL_MSG_NAK_QUEUED 0x0002 +#define CTRL_MSG_BECN_QUEUED 0x0004 +#define CTRL_MSG_ERR_CHK_QUEUED 0x0008 +// reserved 0x0010 +#define CTRL_MSG_CONNECT_REQUEST_QUEUED 0x0020 +#define CTRL_MSG_CONNECT_REPLY_QUEUED 0x0040 +#define CTRL_MSG_DISCONNECT_REQUEST_QUEUED 0x0080 +#define CTRL_MSG_DISCONNECT_REPLY_QUEUED 0x0100 + +#ifdef PSM_CUDA +uint32_t gpudirect_send_threshold; +uint32_t gpudirect_recv_threshold; +#endif + +static void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto); + +#ifdef PSM_CUDA +void psmi_cuda_hostbuf_alloc_func(int is_alloc, void *context, void *obj) +{ + struct ips_cuda_hostbuf *icb; + struct ips_cuda_hostbuf_mpool_cb_context *ctxt = + (struct ips_cuda_hostbuf_mpool_cb_context *) context; + + icb = (struct ips_cuda_hostbuf *)obj; + if (is_alloc) { + PSMI_CUDA_CALL(cuMemHostAlloc, + (void **) &icb->host_buf, + ctxt->bufsz, + CU_MEMHOSTALLOC_PORTABLE); + PSMI_CUDA_CALL(cuEventCreate, &icb->copy_status, CU_EVENT_DEFAULT); + } else { + if (icb->host_buf) { + PSMI_CUDA_CALL(cuMemFreeHost, icb->host_buf); + PSMI_CUDA_CALL(cuEventDestroy, icb->copy_status); + } + } + return; +} +#endif + +psm2_error_t +ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, + int num_of_send_bufs, int num_of_send_desc, uint32_t imm_size, + const struct psmi_timer_ctrl *timerq, + const struct ips_epstate *epstate, + void *spioc, struct ips_proto *proto) +{ + uint32_t protoexp_flags, cksum_sz; + union psmi_envvar_val env_tid, env_cksum, env_mtu; + psm2_error_t err = PSM2_OK; + + /* + * Checksum packets within PSM. Default is off. + * This is heavy weight and done in software so not recommended for + * production runs. + */ + + psmi_getenv("PSM3_CHECKSUM", + "Enable checksum of messages (0 disables checksum)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)0, &env_cksum); + + memset(proto, 0, sizeof(struct ips_proto)); + proto->ptl = (ptl_t *) ptl; + proto->ep = context->ep; /* cached */ + proto->mq = context->ep->mq; /* cached */ + proto->pend_sends.proto = proto; + psmi_timer_entry_init(&proto->pend_sends.timer, + ips_proto_timer_pendq_callback, + &proto->pend_sends); + STAILQ_INIT(&proto->pend_sends.pendq); + proto->epstate = (struct ips_epstate *)epstate; + proto->timerq = (struct psmi_timer_ctrl *)timerq; + proto->spioc = spioc; + + // hash for dispersive routing + proto->epinfo.ep_hash = context->ep->verbs_ep.qp->qp_num;// low 8b only + + /* If checksums enabled we insert checksum at end of packet */ + cksum_sz = env_cksum.e_uint ? 
PSM_CRC_SIZE_IN_BYTES : 0; + proto->epinfo.ep_mtu = context->ep->mtu; + /* Decrement checksum */ + proto->epinfo.ep_mtu -= cksum_sz; + + /* See if user specifies a lower MTU to use */ + if (!psmi_getenv("PSM3_MTU", + "Upper bound on packet MTU (<=0 uses port MTU): 1-5,256,512,1024,2048,4096]", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)-1, &env_mtu)) { + if (env_mtu.e_int >= OPA_MTU_MIN && env_mtu.e_int <= OPA_MTU_MAX) //enum + env_mtu.e_int = opa_mtu_enum_to_int((enum opa_mtu)env_mtu.e_int); + else if (env_mtu.e_int < OPA_MTU_MIN) // pick default + env_mtu.e_int = 8192; + else // wash through enum to force round up to next valid MTU + env_mtu.e_int = opa_mtu_enum_to_int(opa_mtu_int_to_enum(env_mtu.e_int)); + if (proto->epinfo.ep_mtu > env_mtu.e_int) { + proto->epinfo.ep_mtu = env_mtu.e_int; + proto->epinfo.ep_mtu -= MAX_PSM_HEADER; + } + } + // ep_mtu is our final choice of local PSM payload we can support, save it + // back to ep->mtu + proto->ep->mtu = proto->epinfo.ep_mtu; + + // create and size the buffer pools based on the selected ep->mtu + err = __psm2_ep_initialize_queues(proto->ep); + if (err) + goto fail; + + /* sdma queue size */ + proto->sdma_queue_size = 16; // hack until we ifdef rest of sdma + /* don't use the last slot */ + + if (proto->sdma_queue_size > 8) { + /* configure sdma_avail_counter */ + proto->sdma_avail_counter = 8; // hack until we ifdef rest of sdma + } else { + err = PSM2_PARAM_ERR; + goto fail; + } + + + proto->sdma_fill_index = 0; + proto->sdma_done_index = 0; + proto->sdma_scb_queue = (struct ips_scb **) + psmi_calloc(proto->ep, UNDEFINED, + proto->sdma_queue_size, sizeof(struct ips_scb *)); + if (proto->sdma_scb_queue == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + + proto->timeout_send = us_2_cycles(IPS_PROTO_SPIO_RETRY_US_DEFAULT); + proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = ~0U; + proto->t_init = get_cycles(); + proto->t_fini = 0; + proto->flags = env_cksum.e_uint ? IPS_PROTO_FLAG_CKSUM : 0; + proto->runid_key = getpid(); + + proto->num_connected_outgoing = 0; + proto->num_connected_incoming = 0; + proto->num_disconnect_requests = 0; + proto->stray_warn_interval = (uint64_t) -1; + proto->done_warning = 0; + proto->done_once = 0; + proto->num_bogus_warnings = 0; + proto->psmi_logevent_tid_send_reqs.interval_secs = 15; + proto->psmi_logevent_tid_send_reqs.next_warning = 0; + proto->psmi_logevent_tid_send_reqs.count = 0; + + { + /* threshold for multirail load balancing */ + union psmi_envvar_val env_thresh_load_balance; + + psmi_getenv("PSM3_MULTIRAIL_THRESH_LOAD_BALANCE", + "Min packet size at which load balance for multi-rail (default is 0)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)0, + &env_thresh_load_balance); + proto->multirail_thresh_load_balance = env_thresh_load_balance.e_uint; + } + + /* Initialize IBTA related stuff (path record, SL2VL, CCA etc.) */ + if ((err = ips_ibta_init(proto))) + goto fail; + + { + /* User asks for HFI loopback? */ + union psmi_envvar_val env_loopback; + + psmi_getenv("PSM3_NIC_LOOPBACK", + "PSM uses NIC loopback (default is disabled i.e. 0)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)0, /* Disabled by default */ + &env_loopback); + + if (env_loopback.e_uint) + proto->flags |= IPS_PROTO_FLAG_LOOPBACK; + } + + + { + /* Disable coalesced ACKs? */ + union psmi_envvar_val env_coalesce_acks; + + psmi_getenv("PSM3_COALESCE_ACKS", "Coalesce ACKs on the wire (default is enabled i.e. 
1)", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)1, /* Enabled by default */ + &env_coalesce_acks); + + if (env_coalesce_acks.e_uint) + proto->flags |= IPS_PROTO_FLAG_COALESCE_ACKS; + } + + { + /* Number of credits per flow */ + union psmi_envvar_val env_flow_credits; + int df_flow_credits = min(PSM2_FLOW_CREDITS, num_of_send_desc); + + psmi_getenv("PSM3_FLOW_CREDITS", + "Number of unacked packets (credits) per flow (default is 64)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)df_flow_credits, + &env_flow_credits); + proto->flow_credits = env_flow_credits.e_uint; + } + + /* + * Pre-calculate the PSN mask to support 31 bit PSN. + */ + proto->psn_mask = 0x7FFFFFFF; + + /* + * Initialize SDMA, otherwise, turn on all PIO. + */ + { + proto->flags |= IPS_PROTO_FLAG_SPIO; + proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = + ~0U; + } + + /* + * Setup the protocol wide short message ep flow. + */ + proto->msgflowid = EP_FLOW_GO_BACK_N_PIO; + + /* + * Clone sendreq mpool configuration for pend sends config + */ + { + uint32_t chunks, maxsz; + + psmi_assert_always(proto->ep->mq->sreq_pool != NULL); + psmi_mpool_get_obj_info(proto->ep->mq->sreq_pool, &chunks, + &maxsz); + + proto->pend_sends_pool = + psmi_mpool_create(sizeof(struct ips_pend_sreq), chunks, + maxsz, 0, DESCRIPTORS, NULL, NULL); + if (proto->pend_sends_pool == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + } + + /* + * Create a pool of CCA timers for path_rec. The timers should not + * exceed the scb number num_of_send_desc(default 4K). + */ + { + uint32_t chunks, maxsz; + + chunks = 256; + maxsz = num_of_send_desc; + + proto->timer_pool = + psmi_mpool_create(sizeof(struct psmi_timer), chunks, maxsz, + 0, DESCRIPTORS, NULL, NULL); + if (proto->timer_pool == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + } + + /* + * Register ips protocol statistics + * + * We put a (*) in the output to denote stats that may cause a drop in + * performance. 
+ * + * We put a (**) in the output of those stats that "should never happen" + */ + { + + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECLU64("pio_busy_count", + &proto->stats.pio_busy_cnt), + /* Throttling by kernel */ + PSMI_STATS_DECLU64("writev_busy_cnt", + &proto->stats.writev_busy_cnt), + PSMI_STATS_DECLU64("scb_unavail_eager_count", + &proto->stats.scb_egr_unavail_cnt), + PSMI_STATS_DECLU64("unknown_packets_(**)", /* shouldn't happen */ + &proto->stats.unknown_packets), + PSMI_STATS_DECLU64("stray_packets_(*)", + &proto->stats.stray_packets), + PSMI_STATS_DECLU64("err_chk_send", + &proto->epaddr_stats.err_chk_send), + PSMI_STATS_DECLU64("err_chk_recv", + &proto->epaddr_stats.err_chk_recv), +#ifdef RNDV_MOD_MR + PSMI_STATS_DECLU64("err_chk_rdma_send", + &proto->epaddr_stats.err_chk_rdma_send), + PSMI_STATS_DECLU64("err_chk_rdma_recv", + &proto->epaddr_stats.err_chk_rdma_recv), +#endif + PSMI_STATS_DECLU64("nak_send", + &proto->epaddr_stats.nak_send), + PSMI_STATS_DECLU64("nak_recv", + &proto->epaddr_stats.nak_recv), + PSMI_STATS_DECLU64("connect_req_send", + &proto->epaddr_stats.connect_req_send), + PSMI_STATS_DECLU64("connect_req_recv", + &proto->epaddr_stats.connect_req_recv), + PSMI_STATS_DECLU64("connect_rep_send", + &proto->epaddr_stats.connect_rep_send), + PSMI_STATS_DECLU64("connect_rep_recv", + &proto->epaddr_stats.connect_rep_recv), + PSMI_STATS_DECLU64("disconnect_req_send", + &proto->epaddr_stats.disconnect_req_send), + PSMI_STATS_DECLU64("disconnect_req_recv", + &proto->epaddr_stats.disconnect_req_recv), + PSMI_STATS_DECLU64("disconnect_rep_send", + &proto->epaddr_stats.disconnect_rep_send), + PSMI_STATS_DECLU64("disconnect_rep_recv", + &proto->epaddr_stats.disconnect_rep_recv), + PSMI_STATS_DECLU64("tids_grant_send", + &proto->epaddr_stats.tids_grant_send), + PSMI_STATS_DECLU64("tids_grant_recv", + &proto->epaddr_stats.tids_grant_recv), + PSMI_STATS_DECLU64("send_rexmit", + &proto->epaddr_stats.send_rexmit), +#ifdef RNDV_MOD_MR + PSMI_STATS_DECLU64("rdma_rexmit", + &proto->epaddr_stats.rdma_rexmit), +#endif + }; + + err = + psmi_stats_register_type + ("PSM_low-level_protocol_stats", + PSMI_STATSTYPE_IPSPROTO, entries, + PSMI_STATS_HOWMANY(entries), proto->ep->epid, proto); + if (err != PSM2_OK) + goto fail; + } + + /* + * Control Queue and messaging + */ + ctrlq_init(&proto->ctrlq, proto); + + /* + * Receive-side handling + */ + if ((err = ips_proto_recv_init(proto))) + goto fail; + + /* If progress thread is enabled, set the proto flag */ + { + if (psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD)) + proto->flags |= IPS_PROTO_FLAG_RCVTHREAD; + } + + /* + * Eager buffers. We don't care to receive a callback when eager buffers + * are newly released since we actively poll for new bufs. + */ + { + /* configure PSM bounce buffer size */ + union psmi_envvar_val env_bbs; + + psmi_getenv("PSM3_BOUNCE_SZ", + "PSM send bounce buffer size (default is 8192B)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)8192, + &env_bbs); + + proto->scb_bufsize = env_bbs.e_uint; + } + + if ((err = ips_scbctrl_init(context, num_of_send_desc, + num_of_send_bufs, imm_size, + proto->scb_bufsize, NULL, NULL, + &proto->scbc_egr))) + goto fail; + + /* + * Expected protocol handling. + * If we enable tid-based expected rendezvous, the expected protocol code + * handles its own rv scb buffers. If not, we have to enable eager-based + * rendezvous and we allocate scb buffers for it. 
+ * For UD PSM3_RDMA (ep->rdmamode) controls our use of RDMA for Rendezvous + * For STL100 PSM3_TID controls use of EXPTID for Rendezvous + */ + env_tid.e_uint = proto->ep->rdmamode; // PSM3_RDMA + protoexp_flags = env_tid.e_uint; + + // protoexp implements RDMA for UD and TID for STL100 native. N/A to UDP + // when proto->protoexp is NULL, we will not attempt to use TID nor RDMA + { + // for UD, even when RDMA is enabled, we may fall back to LONG_DATA + // in which case we want the scbc_rv scb's so we don't exhaust the + // scbc_egr pool + proto->scbc_rv = (struct ips_scbctrl *) + psmi_calloc(proto->ep, DESCRIPTORS, + 1, sizeof(struct ips_scbctrl)); + if (proto->scbc_rv == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + /* + * Rendezvous buffers. We want to get a callback for rendezvous bufs + * since we asynchronously try to make progress on these sends and only + * schedule them on the timerq if there are pending sends and available + * bufs. + */ + if ((err = + ips_scbctrl_init(context, num_of_send_desc, + 0 /* no bufs */ , + 0, 0 /* bufsize==0 */ , + ips_proto_rv_scbavail_callback, + proto, proto->scbc_rv))) + goto fail; + } + if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) { +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED) { + PSMI_CUDA_CALL(cuStreamCreate, + &proto->cudastream_send, CU_STREAM_NON_BLOCKING); + } +#endif + if ((err = ips_protoexp_init(context, proto, protoexp_flags, + num_of_send_bufs, num_of_send_desc, + &proto->protoexp))) + goto fail; + } else { + proto->protoexp = NULL; + } + + // we allocate MR cache here (as opposed to in protoexp) in case we later + // decide to implement RC send for medium messages and use it to register + // medium sized user eager buffers (SDMA-like) + if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) { + union psmi_envvar_val env_mr_cache_size; + uint32_t default_cache_size; // in entries + uint32_t cache_pri_entries; + uint64_t cache_pri_size; // in bytes + + // we can have at most HFI_TF_NFLOWS inbound RDMA and hfi_num_send_rdma + // outbound RDMA. Each of which potentially needs an MR. + // so mr_cache_size should be >= HFI_TF_NFLOWS + ep->hfi_num_send_rdma + // but can survive if it's smaller as we will delay transfer til avail + cache_pri_entries = HFI_TF_NFLOWS + proto->ep->hfi_num_send_rdma; + cache_pri_size = (uint64_t)cache_pri_entries * proto->mq->hfi_base_window_rv; + if (proto->ep->mr_cache_mode == MR_CACHE_MODE_USER) { + // we attempt to cache, so can benefit from more than inflight + default_cache_size = cache_pri_entries * 16; + } else { + // we only reference count + // could benefit from some extra so we can preregister MRs for + // transfers we don't yet have resources for + default_cache_size = cache_pri_entries * 8; + } + /* Size of user space MR Cache + */ + psmi_getenv("PSM3_MR_CACHE_SIZE", + "user space MR table/cache size (num MRs)", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)default_cache_size, &env_mr_cache_size); + + proto->mr_cache = psm2_verbs_alloc_mr_cache(proto->ep, + env_mr_cache_size.e_uint, proto->ep->mr_cache_mode, + cache_pri_entries, cache_pri_size); + if (! proto->mr_cache) { + _HFI_ERROR( "Unable to allocate MR cache (%u entries)\n", + env_mr_cache_size.e_uint); + err = PSM2_NO_MEMORY; + goto fail; + } + } + + + /* Active Message interface. AM requests compete with MQ for eager + * buffers, since request establish the amount of buffering in the + * network (maximum number of requests in flight). 
The AM init function + * does not allow the number of send buffers to be set separately from + * the number of send descriptors, because otherwise it would have to + * impose extremely arcane constraints on the relative amounts to avoid + * a deadlock scenario. Thus, it handles it internally. The constraint + * is: In a node pair, the number of reply send buffers on at least one + * of the nodes must be at least double the number (optimal: double + 1) + * of send descriptors on the other node. */ + if ((err = ips_proto_am_init(proto, + min(num_of_send_bufs, num_of_send_desc), + imm_size, + &proto->proto_am))) + goto fail; + +#if 0 + if (!host_pid) { + char ipbuf[INET_ADDRSTRLEN], *p; + host_pid = (uint32_t) getpid(); + host_ipv4addr = psmi_get_ipv4addr(); /* already be */ + if (host_ipv4addr == 0) { + _HFI_DBG("Unable to obtain local IP address, " + "not fatal but some features may be disabled\n"); + } else if (host_ipv4addr == __cpu_to_be32(0x7f000001)) { + _HFI_INFO("Localhost IP address is set to the " + "loopback address 127.0.0.1, " + "not fatal but some features may be disabled\n"); + } else { + p = (char *)inet_ntop(AF_INET, + (const void *)&host_ipv4addr, + ipbuf, sizeof(ipbuf)); + _HFI_PRDBG("Ethernet Host IP=%s and PID=%d\n", p, + host_pid); + } + + /* Store in big endian for use in ERR_CHK */ + host_pid = __cpu_to_be32(host_pid); + } +#endif +#ifdef PSM_CUDA + union psmi_envvar_val env_gpudirect_rdma; + psmi_getenv("PSM3_GPUDIRECT", + "Use GPUDirect RDMA support to allow the NIC to directly read" + " from the GPU for SDMA and write to the GPU for TID RDMA." + " Requires driver support.(default is disabled i.e. 0)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)0, /* Disabled by default */ + &env_gpudirect_rdma); + /* The following cases need to be handled: + * 1) GPU DIRECT is turned off but GDR COPY is turned on by the user or + * by default - Turn off GDR COPY + * 2) GPU DIRECT is on but GDR COPY is turned off by the user - Leave + *. this config as it is. + */ + if (!env_gpudirect_rdma.e_uint) + is_gdr_copy_enabled = 0; + + /* Default Send threshold for Gpu-direct set to 30000 */ + union psmi_envvar_val env_gpudirect_send_thresh; + psmi_getenv("PSM3_GPUDIRECT_SEND_THRESH", + "GPUDirect feature on send side will be switched off if threshold value is exceeded.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)30000, &env_gpudirect_send_thresh); + gpudirect_send_threshold = env_gpudirect_send_thresh.e_uint; + + union psmi_envvar_val env_gpudirect_recv_thresh; + psmi_getenv("PSM3_GPUDIRECT_RECV_THRESH", + "GPUDirect feature on receive side will be switched off if threshold value is exceeded.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)UINT_MAX, &env_gpudirect_recv_thresh); + gpudirect_recv_threshold = env_gpudirect_recv_thresh.e_uint; + + if (env_gpudirect_rdma.e_uint && device_support_gpudirect) { + if (PSMI_IS_CUDA_DISABLED || + !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) || + PSMI_IS_DRIVER_GPUDIRECT_DISABLED) + err = psmi_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Requires hfi1 driver with GPU-Direct feature enabled.\n"); + proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND; + proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV; + } else { + /* The following environment variables are here for internal + * experimentation and will not be documented for any customers. + */ + /* Use GPUDirect RDMA for SDMA send? 
*/ + union psmi_envvar_val env_gpudirect_rdma_send; + psmi_getenv("PSM3_GPUDIRECT_RDMA_SEND", + "Use GPUDirect RDMA support to allow the NIC to directly" + " read from the GPU for SDMA. Requires driver" + " support.(default is disabled i.e. 0)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)0, /* Disabled by default */ + &env_gpudirect_rdma_send); + + if (env_gpudirect_rdma_send.e_uint && device_support_gpudirect) { + if (PSMI_IS_CUDA_DISABLED + || !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) + ) + err = psmi_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Unable to start run as PSM would require cuda, sdma" + "and TID support\n"); + proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND; + } + /* Use GPUDirect RDMA for recv? */ + union psmi_envvar_val env_gpudirect_rdma_recv; + psmi_getenv("PSM3_GPUDIRECT_RDMA_RECV", + "Use GPUDirect RDMA support to allow the NIC to directly" + " write into GPU. Requires driver support.(default is" + " disabled i.e. 0)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)0, /* Disabled by default */ + &env_gpudirect_rdma_recv); + + if (env_gpudirect_rdma_recv.e_uint && device_support_gpudirect) { + if (PSMI_IS_CUDA_DISABLED || + !(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) + err = psmi_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Unable to start run as PSM would require cuda," + " sdma and TID support\n"); + proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV; + } + } + + if (PSMI_IS_CUDA_ENABLED && + (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) { + struct psmi_rlimit_mpool rlim = CUDA_HOSTBUFFER_LIMITS; + uint32_t maxsz, chunksz, max_elements; + + if ((err = psmi_parse_mpool_env(proto->mq, 1, + &rlim, &maxsz, &chunksz))) + goto fail; + + /* the maxsz is the amount in MB, not the number of entries, + * since the element size depends on the window size */ + max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv; + /* mpool requires max_elements to be power of 2. round down. 
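 * Worked example (hypothetical sizes): maxsz = 256 MB with a 128 KiB rendezvous
 * window gives max_elements = 2048, already a power of two; a value such as 3000
 * would be rounded down to 2048 by the shift below, since
 * 1 << (31 - __builtin_clz(3000)) == 2048.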
*/ + max_elements = 1 << (31 - __builtin_clz(max_elements)); + proto->cuda_hostbuf_send_cfg.bufsz = proto->mq->hfi_base_window_rv; + proto->cuda_hostbuf_pool_send = + psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), + chunksz, max_elements, 0, + UNDEFINED, NULL, NULL, + psmi_cuda_hostbuf_alloc_func, + (void *) + &proto->cuda_hostbuf_send_cfg); + + if (proto->cuda_hostbuf_pool_send == NULL) { + err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate CUDA host send buffer pool"); + goto fail; + } + + /* use the same number of elements for the small pool */ + proto->cuda_hostbuf_small_send_cfg.bufsz = CUDA_SMALLHOSTBUF_SZ; + proto->cuda_hostbuf_pool_small_send = + psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), + chunksz, max_elements, 0, + UNDEFINED, NULL, NULL, + psmi_cuda_hostbuf_alloc_func, + (void *) + &proto->cuda_hostbuf_small_send_cfg); + + if (proto->cuda_hostbuf_pool_small_send == NULL) { + err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate CUDA host small send buffer pool"); + goto fail; + } + + /* Configure the amount of prefetching */ + union psmi_envvar_val env_prefetch_limit; + + psmi_getenv("PSM3_CUDA_PREFETCH_LIMIT", + "How many TID windows to prefetch at RTS time(default is 2)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)CUDA_WINDOW_PREFETCH_DEFAULT, + &env_prefetch_limit); + proto->cuda_prefetch_limit = env_prefetch_limit.e_uint; + } +#endif +fail: + return err; +} + +psm2_error_t +ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in) +{ + struct psmi_eptab_iterator itor; + uint64_t t_start; + uint64_t t_grace_start, t_grace_time, t_grace_interval; + psm2_epaddr_t epaddr; + psm2_error_t err = PSM2_OK; + int i; + union psmi_envvar_val grace_intval; + + /* Poll one more time to attempt to synchronize with the peer ep's. */ + ips_ptl_poll(proto->ptl, 0); + + psmi_getenv("PSM3_CLOSE_GRACE_PERIOD", + "Additional grace period in seconds for closing end-point.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)0, &grace_intval); + + if (getenv("PSM3_CLOSE_GRACE_PERIOD")) { + t_grace_time = grace_intval.e_uint * SEC_ULL; + } else if (timeout_in > 0) { + /* default to half of the close time-out */ + t_grace_time = timeout_in / 2; + } else { + /* propagate the infinite time-out case */ + t_grace_time = 0; + } + + if (t_grace_time > 0 && t_grace_time < PSMI_MIN_EP_CLOSE_TIMEOUT) + t_grace_time = PSMI_MIN_EP_CLOSE_TIMEOUT; + + /* At close we will busy wait for the grace interval to see if any + * receive progress is made. If progress is made we will wait for + * another grace interval, until either no progress is made or the + * entire grace period has passed. If the grace interval is too low + * we may miss traffic and exit too early. If the grace interval is + * too large the additional time spent while closing the program + * will become visible to the user. */ + psmi_getenv("PSM3_CLOSE_GRACE_INTERVAL", + "Grace interval in seconds for closing end-point.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)0, &grace_intval); + + if (getenv("PSM3_CLOSE_GRACE_INTERVAL")) { + t_grace_interval = grace_intval.e_uint * SEC_ULL; + } else { + /* A heuristic is used to scale up the timeout linearly with + * the number of endpoints, and we allow one second per 1000 + * endpoints. 
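+ * For example, 2000 connected endpoints give a two second interval;
+ * the result is then clamped to the MIN/MAX grace interval bounds
+ * just below.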
*/ + t_grace_interval = (proto->ep->connections * SEC_ULL) / 1000; + } + + if (t_grace_interval < PSMI_MIN_EP_CLOSE_GRACE_INTERVAL) + t_grace_interval = PSMI_MIN_EP_CLOSE_GRACE_INTERVAL; + if (t_grace_interval > PSMI_MAX_EP_CLOSE_GRACE_INTERVAL) + t_grace_interval = PSMI_MAX_EP_CLOSE_GRACE_INTERVAL; + + PSMI_LOCK_ASSERT(proto->mq->progress_lock); + + t_start = proto->t_fini = get_cycles(); + + /* Close whatever has been left open */ + if (proto->num_connected_outgoing > 0) { + int num_disc = 0; + int *mask; + psm2_error_t *errs; + psm2_epaddr_t *epaddr_array; + + psmi_epid_itor_init(&itor, proto->ep); + while ((epaddr = psmi_epid_itor_next(&itor))) { + if (epaddr->ptlctl->ptl == proto->ptl) + num_disc++; + } + psmi_epid_itor_fini(&itor); + mask = + (int *)psmi_calloc(proto->ep, UNDEFINED, num_disc, + sizeof(int)); + errs = (psm2_error_t *) + psmi_calloc(proto->ep, UNDEFINED, num_disc, + sizeof(psm2_error_t)); + epaddr_array = (psm2_epaddr_t *) + psmi_calloc(proto->ep, UNDEFINED, num_disc, + sizeof(psm2_epaddr_t)); + + if (errs == NULL || epaddr_array == NULL || mask == NULL) { + if (epaddr_array) + psmi_free(epaddr_array); + if (errs) + psmi_free(errs); + if (mask) + psmi_free(mask); + err = PSM2_NO_MEMORY; + goto fail; + } + psmi_epid_itor_init(&itor, proto->ep); + i = 0; + while ((epaddr = psmi_epid_itor_next(&itor))) { + /* + * if cstate_outgoing is CSTATE_NONE, then we know it + * is an uni-directional connect, in that the peer + * sent a connect request to us, but we never sent one + * out to the peer epid. Ignore handling those in + * ips_proto_disconnect() as we will do the right thing + * when a disconnect request for the epaddr comes in from the peer. + */ + if (epaddr->ptlctl->ptl == proto->ptl && + ((ips_epaddr_t *) epaddr)->cstate_outgoing != CSTATE_NONE) { + mask[i] = 1; + epaddr_array[i] = epaddr; + i++; + IPS_MCTXT_REMOVE((ips_epaddr_t *) epaddr); + } + } + psmi_epid_itor_fini(&itor); + err = ips_proto_disconnect(proto, force, num_disc, epaddr_array, + mask, errs, timeout_in); + psmi_free(mask); + psmi_free(errs); + psmi_free(epaddr_array); + } + + t_grace_start = get_cycles(); + + while (psmi_cycles_left(t_grace_start, t_grace_time)) { + uint64_t t_grace_interval_start = get_cycles(); + int num_disconnect_requests = proto->num_disconnect_requests; + PSMI_BLOCKUNTIL( + proto->ep, err, + proto->num_connected_incoming == 0 || + (!psmi_cycles_left(t_start, timeout_in) && + (!psmi_cycles_left(t_grace_interval_start, + t_grace_interval) || + !psmi_cycles_left(t_grace_start, t_grace_time)))); + if (num_disconnect_requests == proto->num_disconnect_requests) { + /* nothing happened in this grace interval so break out early */ + break; + } + } + +#if _HFI_DEBUGGING + if (_HFI_PRDBG_ON) { + uint64_t t_grace_finish = get_cycles(); + + _HFI_PRDBG_ALWAYS( + "Closing endpoint disconnect left to=%d,from=%d after %d millisec of grace (out of %d)\n", + proto->num_connected_outgoing, proto->num_connected_incoming, + (int)(cycles_to_nanosecs(t_grace_finish - t_grace_start) / + MSEC_ULL), (int)(t_grace_time / MSEC_ULL)); + } +#endif + +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED && proto->cudastream_send) { + PSMI_CUDA_CALL(cuStreamDestroy, proto->cudastream_send); + } +#endif + + if ((err = ips_ibta_fini(proto))) + goto fail; + + if ((err = ips_proto_am_fini(&proto->proto_am))) + goto fail; + + if ((err = ips_scbctrl_fini(&proto->scbc_egr))) + goto fail; + + ips_proto_recv_fini(proto); + + if (proto->protoexp) { + if ((err = ips_protoexp_fini(proto->protoexp))) + goto fail; + } + if 
(proto->scbc_rv) { + ips_scbctrl_fini(proto->scbc_rv); + psmi_free(proto->scbc_rv); + } + + if (proto->mr_cache) { + psm2_verbs_free_mr_cache(proto->mr_cache); + proto->mr_cache = NULL; + } + psmi_stats_deregister_type(PSMI_STATSTYPE_IPSPROTO, proto); + + psmi_mpool_destroy(proto->pend_sends_pool); + psmi_mpool_destroy(proto->timer_pool); + + psmi_free(proto->sdma_scb_queue); + +fail: + proto->t_fini = proto->t_init = 0; + return err; +} + + +static +void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto) +{ + /* clear the ctrl send queue */ + memset(ctrlq, 0, sizeof(*ctrlq)); + + proto->message_type_to_index[OPCODE_ACK] = CTRL_MSG_ACK_QUEUED; + proto->message_type_to_index[OPCODE_NAK] = CTRL_MSG_NAK_QUEUED; + proto->message_type_to_index[OPCODE_BECN] = CTRL_MSG_BECN_QUEUED; + proto->message_type_to_index[OPCODE_ERR_CHK] = CTRL_MSG_ERR_CHK_QUEUED; + proto->message_type_to_index[OPCODE_CONNECT_REQUEST] = + CTRL_MSG_CONNECT_REQUEST_QUEUED; + proto->message_type_to_index[OPCODE_CONNECT_REPLY] = + CTRL_MSG_CONNECT_REPLY_QUEUED; + proto->message_type_to_index[OPCODE_DISCONNECT_REQUEST] = + CTRL_MSG_DISCONNECT_REQUEST_QUEUED; + proto->message_type_to_index[OPCODE_DISCONNECT_REPLY] = + CTRL_MSG_DISCONNECT_REPLY_QUEUED; + + ctrlq->ctrlq_head = ctrlq->ctrlq_tail = 0; + ctrlq->ctrlq_overflow = 0; + ctrlq->ctrlq_proto = proto; + + /* + * We never enqueue ctrl messages with real payload. If we do, + * the queue 'elem_payload' size needs to be big enough. + * Note: enqueue nak/ack is very important for performance. + */ + proto->ctrl_msg_queue_enqueue = + CTRL_MSG_ACK_QUEUED | + CTRL_MSG_NAK_QUEUED | + CTRL_MSG_BECN_QUEUED; + + psmi_timer_entry_init(&ctrlq->ctrlq_timer, + ips_proto_timer_ctrlq_callback, ctrlq); + + return; +} + +static __inline__ void _build_ctrl_message(struct ips_proto *proto, + struct ips_flow *flow, uint8_t message_type, + ips_scb_t *ctrlscb, uint32_t paylen) +{ + uint32_t tot_paywords = (sizeof(struct ips_message_header) + + HFI_CRC_SIZE_IN_BYTES + paylen) >> BYTE2DWORD_SHIFT; + uint32_t slid, dlid; + ips_epaddr_t *ipsaddr = flow->ipsaddr; + struct ips_message_header *p_hdr = &ctrlscb->ips_lrh; + ips_path_rec_t *ctrl_path = + ipsaddr->pathgrp->pg_path[ipsaddr-> + hpp_index][IPS_PATH_HIGH_PRIORITY]; + + if ((proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) && + (++ipsaddr->hpp_index >= + ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY])) + ipsaddr->hpp_index = 0; + + /* + * If the size of the transfer is NOT within the "exclusion range", + * then use the "dispersive routling" slid/dlid. Otherwise + * use the base LIDS. + * + * This is a control message, so it should never be a TID transfer. + */ + slid = ctrl_path->pr_slid; + dlid = ctrl_path->pr_dlid; + if (ctrlscb->scb_flags & IPS_SEND_FLAG_NO_LMC) { + slid = ipsaddr->pathgrp->pg_base_slid; + dlid = ipsaddr->pathgrp->pg_base_dlid; + } + + /* Control messages go over the control path. 
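+ * ctrl_path above is the HIGH_PRIORITY path record chosen for this
+ * ipsaddr (round-robined across the high priority paths when the
+ * adaptive path policy is enabled).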
*/ + p_hdr->lrh[0] = __cpu_to_be16(HFI_LRH_BTH | + ((ctrl_path->pr_sl & HFI_LRH_SL_MASK) << + HFI_LRH_SL_SHIFT) + ); + p_hdr->lrh[1] = dlid; + p_hdr->lrh[2] = __cpu_to_be16(tot_paywords & HFI_LRH_PKTLEN_MASK); + p_hdr->lrh[3] = slid; + + p_hdr->bth[0] = __cpu_to_be32(ctrl_path->pr_pkey | + (message_type << HFI_BTH_OPCODE_SHIFT)); + + p_hdr->bth[1] = __cpu_to_be32(flow->flowid << HFI_BTH_FLOWID_SHIFT); + flow->flags &= ~IPS_FLOW_FLAG_GEN_BECN; + + /* p_hdr->bth[2] already set by caller, or don't care */ + /* p_hdr->ack_seq_num already set by caller, or don't care */ + + p_hdr->connidx = ipsaddr->connidx_outgoing; + p_hdr->flags = 0; + + p_hdr->khdr.kdeth0 = __cpu_to_le32( + (ctrlscb->scb_flags & IPS_SEND_FLAG_INTR) | + (IPS_PROTO_VERSION << HFI_KHDR_KVER_SHIFT)); + p_hdr->khdr.kdeth1 = 0; + + return; +} + +psm2_error_t +ips_proto_timer_ctrlq_callback(struct psmi_timer *timer, uint64_t t_cyc_expire) +{ + struct ips_ctrlq *ctrlq = (struct ips_ctrlq *)timer->context; + struct ips_proto *proto = ctrlq->ctrlq_proto; + struct ips_ctrlq_elem *cqe; + uint32_t have_cksum = proto->flags & IPS_PROTO_FLAG_CKSUM; + psm2_error_t err; + + /* service ctrl send queue first */ + while (ctrlq->ctrlq_cqe[ctrlq->ctrlq_tail].msg_queue_mask) { + cqe = &ctrlq->ctrlq_cqe[ctrlq->ctrlq_tail]; + GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR); /* perf stats */ + if (cqe->msg_scb.flow->transfer == PSM_TRANSFER_PIO) { + err = psmi_hal_spio_transfer_frame(proto, + cqe->msg_scb.flow, &cqe->msg_scb, + cqe->msg_scb.cksum, 0, PSMI_TRUE, + have_cksum, cqe->msg_scb.cksum[0], + proto->ep->context.psm_hw_ctxt +#ifdef PSM_CUDA + , 0 +#endif + ); + } else { + psmi_assert_always(0); + err = PSM2_INTERNAL_ERR; + } + GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); /* perf stats */ + + if (err == PSM2_OK) { + PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&cqe->msg_scb.ips_lrh,"PKT_STRM: err: %d", err); + ips_proto_epaddr_stats_set(proto, cqe->message_type); + *cqe->msg_queue_mask &= + ~message_type2index(proto, cqe->message_type); + cqe->msg_queue_mask = NULL; + ctrlq->ctrlq_tail = + (ctrlq->ctrlq_tail + 1) % CTRL_MSG_QEUEUE_SIZE; + } else { + psmi_assert(err == PSM2_EP_NO_RESOURCES); + + proto->stats.pio_busy_cnt++; + /* re-request a timer expiration */ + psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer, + PSMI_TIMER_PRIO_0); + return PSM2_OK; + } + } + + return PSM2_OK; +} + +/* Update cqe struct which is a single element from pending control message queue */ +PSMI_ALWAYS_INLINE( +void ips_proto_update_cqe(struct ips_ctrlq_elem *cqe, uint16_t *msg_queue_mask, + struct ips_flow *flow, ips_scb_t *ctrlscb, uint8_t message_type)){ + + cqe->message_type = message_type; + cqe->msg_queue_mask = msg_queue_mask; + psmi_mq_mtucpy(&cqe->msg_scb.ips_lrh, + &ctrlscb->ips_lrh, sizeof(ctrlscb->ips_lrh)); + cqe->msg_scb.flow = flow; + cqe->msg_scb.cksum[0] = ctrlscb->cksum[0]; +} + +psm2_error_t +ips_proto_send_ctrl_message(struct ips_flow *flow, uint8_t message_type, + uint16_t *msg_queue_mask, ips_scb_t *ctrlscb, + void *payload, uint32_t paylen) +{ + psm2_error_t err = PSM2_EP_NO_RESOURCES; + ips_epaddr_t *ipsaddr = flow->ipsaddr; + struct ips_proto *proto = ((psm2_epaddr_t) ipsaddr)->proto; + struct ips_ctrlq *ctrlq = &proto->ctrlq; + struct ips_ctrlq_elem *cqe = ctrlq->ctrlq_cqe; + uint32_t have_cksum; + + psmi_assert(message_type >= OPCODE_ACK && + message_type <= OPCODE_DISCONNECT_REPLY); + psmi_assert((paylen & 0x3) == 0); /* require 4-byte multiple */ + psmi_assert(flow->frag_size >= + (paylen + PSM_CRC_SIZE_IN_BYTES)); + + /* Drain queue if non-empty */ + if 
(cqe[ctrlq->ctrlq_tail].msg_queue_mask) + ips_proto_timer_ctrlq_callback(&ctrlq->ctrlq_timer, 0ULL); + + /* finish setup control message header */ + ips_set_LMC_LID_choice(proto, ctrlscb, paylen); + _build_ctrl_message(proto, flow, message_type, ctrlscb, paylen); + + /* If enabled checksum control message */ + have_cksum = proto->flags & IPS_PROTO_FLAG_CKSUM; + if (have_cksum) { + ctrlscb->ips_lrh.flags |= IPS_SEND_FLAG_PKTCKSUM; + ips_do_cksum(proto, &ctrlscb->ips_lrh, + payload, paylen, ctrlscb->cksum); + } + + /* + * for ACK/NAK/BECN, we use the fast flow to send over, otherwise, + * we use the original flow + */ + if (message_type == OPCODE_ACK || + message_type == OPCODE_NAK || + message_type == OPCODE_BECN) + { + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + flow = &ipsaddr->flows[proto->msgflowid]; + } + + switch (flow->transfer) { + case PSM_TRANSFER_PIO: + GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR); /* perf stats */ + err = psmi_hal_spio_transfer_frame(proto, flow, + ctrlscb, payload, paylen, + PSMI_TRUE, have_cksum, ctrlscb->cksum[0], + proto->ep->context.psm_hw_ctxt +#ifdef PSM_CUDA + , 0 +#endif + ); + GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); /* perf stats */ + break; + default: + err = PSM2_INTERNAL_ERR; + break; + } + + if (err == PSM2_OK) + { + PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&ctrlscb->ips_lrh,"PKT_STRM: err: %d", err); + ips_proto_epaddr_stats_set(proto, message_type); + } + + _HFI_VDBG("transfer_frame of opcode=0x%x,remote_lid=%d," + "src=%p,len=%d returns %d\n", + (int)_get_proto_hfi_opcode(&ctrlscb->ips_lrh), + __be16_to_cpu(ctrlscb->ips_lrh.lrh[1]), payload, paylen, err); + + if (err != PSM2_EP_NO_RESOURCES) + return err; + proto->stats.pio_busy_cnt++; + + if (proto->ctrl_msg_queue_enqueue & proto-> + message_type_to_index[message_type]) { + /* We only queue control msg without payload */ + psmi_assert(paylen == 0); + + if ((*msg_queue_mask) & proto-> + message_type_to_index[message_type]) { + + if (message_type == OPCODE_ACK) { + /* Pending queue should contain latest ACK type message, + * overwrite the previous one. */ + ips_proto_update_cqe(&cqe[flow->ack_index], msg_queue_mask, + flow, ctrlscb, message_type); + } + + err = PSM2_OK; + } else if (cqe[ctrlq->ctrlq_head].msg_queue_mask == NULL) { + /* entry is free */ + if (message_type == OPCODE_ACK) { + /* Track the index of last ACK type message in queue*/ + flow->ack_index = ctrlq->ctrlq_head; + } + + *msg_queue_mask |= + message_type2index(proto, message_type); + + ips_proto_update_cqe(&cqe[ctrlq->ctrlq_head], msg_queue_mask, + flow, ctrlscb, message_type); + + ctrlq->ctrlq_head = + (ctrlq->ctrlq_head + 1) % CTRL_MSG_QEUEUE_SIZE; + /* _HFI_INFO("requesting ctrlq timer for msgtype=%d!\n", message_type); */ + psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer, + PSMI_TIMER_PRIO_0); + + err = PSM2_OK; + } else { + proto->ctrl_msg_queue_overflow++; + } + } + + return err; +} + +void MOCKABLE(ips_proto_flow_enqueue)(struct ips_flow *flow, ips_scb_t *scb) +{ + ips_epaddr_t *ipsaddr = flow->ipsaddr; + struct ips_proto *proto = ((psm2_epaddr_t) ipsaddr)->proto; + + ips_scb_prepare_flow_inner(proto, ipsaddr, flow, scb); + if ((proto->flags & IPS_PROTO_FLAG_CKSUM) && + (scb->tidctrl == 0) && (scb->nfrag == 1)) { + scb->ips_lrh.flags |= IPS_SEND_FLAG_PKTCKSUM; + ips_do_cksum(proto, &scb->ips_lrh, + ips_scb_buffer(scb), scb->payload_size, &scb->cksum[0]); + } + + /* If this is the first scb on flow, pull in both timers. 
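+ * (the flow adopts this scb's ack and send timers the first time an
+ * scb is queued on it)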
*/ + if (flow->timer_ack == NULL) { + psmi_assert(flow->timer_send == NULL); + flow->timer_ack = scb->timer_ack; + flow->timer_send = scb->timer_send; + } + psmi_assert(flow->timer_ack != NULL); + psmi_assert(flow->timer_send != NULL); + + /* Every flow has a pending head that points into the unacked queue. + * If sends are already pending, process those first */ + if (SLIST_EMPTY(&flow->scb_pend)) + { + PSM2_LOG_PKT_STRM(PSM2_LOG_PEND,&scb->ips_lrh,"PKT_STRM: pkt in pend list"); + SLIST_FIRST(&flow->scb_pend) = scb; + } + + /* Insert scb into flow's unacked queue */ + STAILQ_INSERT_TAIL(&flow->scb_unacked, scb, nextq); + +#ifdef PSM_DEBUG + /* update scb counters in flow. */ + flow->scb_num_pending++; + flow->scb_num_unacked++; +#endif +} +MOCK_DEF_EPILOGUE(ips_proto_flow_enqueue); + +/* + * This function attempts to flush the current list of pending + * packets through PIO. + * + * Recoverable errors: + * PSM2_OK: Packet triggered through PIO. + * PSM2_EP_NO_RESOURCES: No PIO bufs available or cable pulled. + * + * Unrecoverable errors: + * PSM2_EP_NO_NETWORK: No network, no lid, ... + * PSM2_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc. + */ +psm2_error_t +ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) +{ + struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto; + struct ips_scb_pendlist *scb_pend = &flow->scb_pend; + int num_sent = 0; + uint64_t t_cyc; + ips_scb_t *scb; + psm2_error_t err = PSM2_OK; + + psmi_assert(!SLIST_EMPTY(scb_pend)); + + /* Out of credits - ACKs/NAKs reclaim recredit or congested flow */ + if_pf((flow->credits <= 0) + ) { + if (nflushed) + *nflushed = 0; + return PSM2_EP_NO_RESOURCES; + } + + while (!SLIST_EMPTY(scb_pend) && flow->credits > 0) { + scb = SLIST_FIRST(scb_pend); + psmi_assert(scb->nfrag == 1); + GENERIC_PERF_BEGIN(PSM_TX_SPEEDPATH_CTR); /* perf stats */ + if ((err = psmi_hal_spio_transfer_frame(proto, flow, scb, + ips_scb_buffer(scb), + scb->payload_size, + PSMI_FALSE, + scb->ips_lrh.flags & + IPS_SEND_FLAG_PKTCKSUM, + scb->cksum[0], + proto->ep->context.psm_hw_ctxt +#ifdef PSM_CUDA + , IS_TRANSFER_BUF_GPU_MEM(scb) +#endif + )) + == PSM2_OK) { + GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); /* perf stats */ + t_cyc = get_cycles(); + scb->scb_flags &= ~IPS_SEND_FLAG_PENDING; + scb->ack_timeout = proto->epinfo.ep_timeout_ack; + scb->abs_timeout = proto->epinfo.ep_timeout_ack + t_cyc; + psmi_timer_request(proto->timerq, flow->timer_ack, + scb->abs_timeout); + num_sent++; + flow->credits--; + SLIST_REMOVE_HEAD(scb_pend, next); +#ifdef PSM_DEBUG + flow->scb_num_pending--; +#endif + PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&scb->ips_lrh,"PKT_STRM: err: %d", err); + + } else + { + GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); /* perf stats */ + break; + } + } + + /* If out of flow credits re-schedule send timer */ + if (!SLIST_EMPTY(scb_pend)) { + proto->stats.pio_busy_cnt++; + psmi_timer_request(proto->timerq, flow->timer_send, + get_cycles() + proto->timeout_send); + } + + if (nflushed != NULL) + *nflushed = num_sent; + + return err; +} + +/* + * Flush all packets currently marked as pending + */ +static psm2_error_t scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, + struct ips_scb_pendlist *slist, int *num_sent); + +/* + * Flush all packets queued up on a flow via send DMA. + * + * Recoverable errors: + * PSM2_OK: Able to flush entire pending queue for DMA. + * PSM2_OK_NO_PROGRESS: Flushed at least 1 but not all pending packets for DMA. 
+ * PSM2_EP_NO_RESOURCES: No scb's available to handle unaligned packets + * or writev returned a recoverable error (no mem for + * descriptors, dma interrupted or no space left in dma + * queue). + * + * Unrecoverable errors: + * PSM2_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure, + * rxe/txe parity error. + * PSM2_EP_NO_NETWORK: No network, no lid, ... + */ +psm2_error_t +ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed) +{ + struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto; + struct ips_scb_pendlist *scb_pend = &flow->scb_pend; + ips_scb_t *scb = NULL; + psm2_error_t err = PSM2_OK; + int nsent = 0; + + psmi_assert(!SLIST_EMPTY(scb_pend)); + + /* Out of credits - ACKs/NAKs reclaim recredit or congested flow */ + if_pf((flow->credits <= 0) + ) { + if (nflushed) + *nflushed = 0; + return PSM2_EP_NO_RESOURCES; + } + + // scb will descrbe header needed, which may be TID + err = scb_dma_send(proto, flow, scb_pend, &nsent); + if (err != PSM2_OK && err != PSM2_EP_NO_RESOURCES && + err != PSM2_OK_NO_PROGRESS) + goto fail; + + if (nsent > 0) { + uint64_t t_cyc = get_cycles(); + int i = 0; + /* + * inflight counter proto->iovec_cntr_next_inflight should not drift + * from completion counter proto->iovec_cntr_last_completed away too + * far because we only have very small scb counter compared with + * uint32_t counter value. + */ +#ifdef PSM_DEBUG + flow->scb_num_pending -= nsent; +#endif + SLIST_FOREACH(scb, scb_pend, next) { + if (++i > nsent) + break; + + PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&scb->ips_lrh,"PKT_STRM: (dma)"); + + scb->scb_flags &= ~IPS_SEND_FLAG_PENDING; + scb->ack_timeout = + scb->nfrag * proto->epinfo.ep_timeout_ack; + scb->abs_timeout = + scb->nfrag * proto->epinfo.ep_timeout_ack + t_cyc; + + psmi_assert(proto->sdma_scb_queue + [proto->sdma_fill_index] == NULL); + proto->sdma_scb_queue[proto->sdma_fill_index] = scb; + scb->dma_complete = 0; + + proto->sdma_avail_counter--; + proto->sdma_fill_index++; + if (proto->sdma_fill_index == proto->sdma_queue_size) + proto->sdma_fill_index = 0; + + /* Flow credits can temporarily go to negative for + * packets tracking purpose, because we have sdma + * chunk processing which can't send exact number + * of packets as the number of credits. + */ + flow->credits -= scb->nfrag; + } + SLIST_FIRST(scb_pend) = scb; + } + + if (SLIST_FIRST(scb_pend) != NULL) { + psmi_assert(flow->scb_num_pending > 0); + + switch (flow->protocol) { + case PSM_PROTOCOL_TIDFLOW: + // for UD we use RC QP instead of STL100's TIDFLOW HW + // UDP has no RDMA + psmi_assert_always(0); // we don't allocate ips_flow for TID + + break; + case PSM_PROTOCOL_GO_BACK_N: + default: + if (flow->credits > 0) { + /* Schedule send timer and increment writev_busy_cnt */ + psmi_timer_request(proto->timerq, + flow->timer_send, + get_cycles() + + (proto->timeout_send << 1)); + proto->stats.writev_busy_cnt++; + } else { + /* Schedule ACK timer to reap flow credits */ + psmi_timer_request(proto->timerq, + flow->timer_ack, + get_cycles() + + (proto->epinfo. 
+ ep_timeout_ack >> 2)); + } + break; + } + } else { + /* Schedule ack timer */ + psmi_timer_cancel(proto->timerq, flow->timer_send); + psmi_timer_request(proto->timerq, flow->timer_ack, + get_cycles() + proto->epinfo.ep_timeout_ack); + } + + /* We overwrite error with its new meaning for flushing packets */ + if (nsent > 0) + if (scb) + err = PSM2_OK_NO_PROGRESS; /* partial flush */ + else + err = PSM2_OK; /* complete flush */ + else + err = PSM2_EP_NO_RESOURCES; /* no flush at all */ + +fail: + if (nflushed) + *nflushed = nsent; + + return err; +} + + + + + +/* + * Caller still expects num_sent to always be correctly set in case of an + * error. + * + * Recoverable errors: + * PSM2_OK: At least one packet was successfully queued up for DMA. + * PSM2_EP_NO_RESOURCES: No scb's available to handle unaligned packets + * or writev returned a recoverable error (no mem for + * descriptors, dma interrupted or no space left in dma + * queue). + * PSM2_OK_NO_PROGRESS: Cable pulled. + * + * Unrecoverable errors: + * PSM2_EP_DEVICE_FAILURE: Error calling hfi_sdma_inflight() or unexpected + * error in calling writev(), or chip failure, rxe/txe + * parity error. + * PSM2_EP_NO_NETWORK: No network, no lid, ... + */ +static +psm2_error_t +scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, + struct ips_scb_pendlist *slist, int *num_sent) +{ + psmi_assert_always(0); // should not get here + return PSM2_INTERNAL_ERR; +} + + +psm2_error_t +ips_proto_timer_ack_callback(struct psmi_timer *current_timer, + uint64_t current) +{ + struct ips_flow *flow = ((ips_scb_t *)current_timer->context)->flow; + struct ips_proto *proto = ((psm2_epaddr_t) (flow->ipsaddr))->proto; + uint64_t t_cyc_next = get_cycles(); + psmi_seqnum_t err_chk_seq; + ips_scb_t *scb, ctrlscb; + uint8_t message_type; + + if (STAILQ_EMPTY(&flow->scb_unacked)) + return PSM2_OK; + + scb = STAILQ_FIRST(&flow->scb_unacked); + + if (current >= scb->abs_timeout) { + int done_local = 0; + + done_local = 1; /* Always done for PIO flows */ + + scb->ack_timeout = + min(scb->ack_timeout * proto->epinfo.ep_timeout_ack_factor, + proto->epinfo.ep_timeout_ack_max); + scb->abs_timeout = t_cyc_next + scb->ack_timeout; + if (done_local) { + _HFI_VDBG + ("sending err_chk flow=%d with first=%d,last=%d\n", + flow->flowid, + STAILQ_FIRST(&flow->scb_unacked)->seq_num.psn_num, + STAILQ_LAST(&flow->scb_unacked, ips_scb, + nextq)->seq_num.psn_num); + + ctrlscb.scb_flags = 0; + if (proto->flags & IPS_PROTO_FLAG_RCVTHREAD) + ctrlscb.scb_flags |= IPS_SEND_FLAG_INTR; + + err_chk_seq = (SLIST_EMPTY(&flow->scb_pend)) ? 
+ flow->xmit_seq_num : + SLIST_FIRST(&flow->scb_pend)->seq_num; + + if (flow->protocol == PSM_PROTOCOL_TIDFLOW) { + // for UD we use RC QP instead of STL100's TIDFLOW HW + // UDP has no RDMA + psmi_assert_always(0); // we don't allocate ips_flow for TID + message_type = OPCODE_ERR_CHK; // keep KlockWorks happy + } else { + PSM2_LOG_MSG("sending ERR_CHK message"); + message_type = OPCODE_ERR_CHK; + err_chk_seq.psn_num = (err_chk_seq.psn_num - 1) + & proto->psn_mask; + } + ctrlscb.ips_lrh.bth[2] = + __cpu_to_be32(err_chk_seq.psn_num); + + ips_proto_send_ctrl_message(flow, message_type, + &flow->ipsaddr->ctrl_msg_queued, + &ctrlscb, ctrlscb.cksum, 0); + } + + t_cyc_next = get_cycles() + scb->ack_timeout; + } else + t_cyc_next += (scb->abs_timeout - current); + + psmi_timer_request(proto->timerq, current_timer, t_cyc_next); + + return PSM2_OK; +} + +psm2_error_t +ips_proto_timer_send_callback(struct psmi_timer *current_timer, + uint64_t current) +{ + struct ips_flow *flow = ((ips_scb_t *)current_timer->context)->flow; + + if (!SLIST_EMPTY(&flow->scb_pend)) + flow->flush(flow, NULL); + + return PSM2_OK; +} + diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.h b/prov/psm3/psm3/ptl_ips/ips_proto.h new file mode 100644 index 00000000000..3b0ed8d7e96 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_proto.h @@ -0,0 +1,712 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_PROTO_H +#define _IPS_PROTO_H + +#include "ips_config.h" +#include "psm_user.h" + +#include "ips_tid.h" +#include "ips_recvhdrq.h" +#include "ips_epstate.h" +#include "ips_proto_am.h" +#include "ips_tidflow.h" +#include "ips_path_rec.h" + +typedef enum ips_path_type { + IPS_PATH_LOW_PRIORITY, + IPS_PATH_NORMAL_PRIORITY, + IPS_PATH_HIGH_PRIORITY, + IPS_PATH_MAX_PRIORITY +} ips_path_type_t; + +/* + * Local Endpoint info. + * + * Contains information necessary for composing packets for the local endpoint + */ +struct ips_epinfo { + uint16_t ep_base_lid; + uint8_t ep_hash; // for hashing adaptive dispersive routing +#define EP_HASH ep_hash + uint8_t ep_lmc; + opa_rate ep_link_rate; + uint16_t ep_sl; /* PSM3_NIC_SL only when path record not used */ + uint16_t ep_mtu; // PSM payload after potential hdr & PSM3_MTU decrease + uint16_t ep_pkey; /* PSM3_PKEY only when path record not used */ + uint64_t ep_timeout_ack; /* PSM3_ERRCHK_TIMEOUT if no path record */ + uint64_t ep_timeout_ack_max; + uint32_t ep_timeout_ack_factor; +}; + +/* + * This contains a path record table table that Enumerate the paths available + * between the local node and a remote node associated with an end point. + * Also maintain a state value for each message priority that keeps indicates + * which path should be assigned to the next message of that priority. + * + * For LMC/Torus, keep list of base and max dlid. Used for pkt verification + * + * pg_base_dlid and pg_base_slid are in network byte order. + */ +#define IPS_MAX_PATH_LMC 3 +typedef struct ips_path_grp { + uint16_t pg_base_dlid; + uint16_t pg_base_slid; + uint8_t pg_num_paths[IPS_PATH_MAX_PRIORITY]; + uint8_t pg_next_path[IPS_PATH_MAX_PRIORITY]; + ips_path_rec_t *pg_path[0][IPS_PATH_MAX_PRIORITY]; +} ips_path_grp_t; + +/* + * Start and finish routines for constructing an ips_proto. + */ +struct ips_proto; +psm2_error_t ips_proto_init(const psmi_context_t *context, + const struct ptl *ptl, + int num_of_send_bufs, + int num_of_send_desc, + uint32_t imm_size, + const struct psmi_timer_ctrl *timerq, /* PTL's timerq */ + const struct ips_epstate *epstate, /* PTL's epstate */ + void *spioc, /* PTL's opaque spio control */ + struct ips_proto *proto); /* output protocol */ + +psm2_error_t ips_proto_fini(struct ips_proto *proto, int force, + uint64_t timeout); + +/* + * Control message structures + * + * ips low-level control messages to ensure reliability of eager packets. + */ +#define CTRL_MSG_QEUEUE_SIZE 64 /* power of two */ + +struct ips_ctrlq_elem { + uint8_t message_type; + uint16_t *msg_queue_mask; + ips_scb_t msg_scb; +}; + +struct ips_ctrlq { + /* Queued control messages, queued when pio is busy */ + struct ips_proto *ctrlq_proto; + + uint32_t ctrlq_head; + uint32_t ctrlq_tail; + uint32_t ctrlq_overflow; + + struct ips_ctrlq_elem ctrlq_cqe[CTRL_MSG_QEUEUE_SIZE] PSMI_CACHEALIGN; + struct psmi_timer ctrlq_timer; /* when in timerq */ +}; + +/* Connect/disconnect, as implemented by ips */ + +/* + * Connections are not pairwise but we keep a single 'epaddr' for messages-from + * and messages-to a remote 'epaddr'. 
State transitions for connecting TO and + * FROM 'epaddrs' are the following: + * Connect TO (Connect OUTGOING): + * NONE -> WAITING -> ESTABLISHED -> WAITING_DISC -> DISCONNECTED -> NONE + * + * Connect FROM (we receive a connect request - Connect INCOMING) + * NONE -> ESTABLISHED -> NONE + */ +#define CSTATE_ESTABLISHED 1 +#define CSTATE_NONE 2 +#define CSTATE_OUTGOING_DISCONNECTED 3 +#define CSTATE_OUTGOING_WAITING 4 +#define CSTATE_OUTGOING_WAITING_DISC 5 + +psm2_error_t ips_proto_connect(struct ips_proto *proto, int numep, + const psm2_epid_t *array_of_epid, + const int *array_of_epid_mask, + psm2_error_t *array_of_errors, + psm2_epaddr_t *array_of_epaddr, + uint64_t timeout_in); + +psm2_error_t ips_proto_disconnect(struct ips_proto *proto, int force, int numep, + psm2_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm2_error_t array_of_errors[], + uint64_t timeout_in); + +int ips_proto_isconnected(struct ips_epaddr *ipsaddr); + +/* + * Pending operation structures + */ +struct ips_pend_sreq { + STAILQ_ENTRY(ips_pend_sreq) next; + psm2_mq_req_t req; + uint32_t type; +}; + +#define IPS_PENDSEND_EAGER_DATA 1 +#define IPS_PENDSEND_EAGER_REQ 2 +#define IPS_PENDSEND_EXP_TIDS 3 +#define IPS_PENDSEND_EXP_SENDS 4 + +STAILQ_HEAD(ips_pendsendq, ips_pend_sreq); + +struct ips_pend_sends { + struct ips_proto *proto; /* back ptr */ + struct psmi_timer timer; + struct ips_pendsendq pendq; +}; + +/* + * One instance of the protocol + */ + +struct ips_protoexp; + +struct ips_proto_stats { + uint64_t pio_busy_cnt; + uint64_t writev_busy_cnt; + uint64_t scb_egr_unavail_cnt; + uint64_t unknown_packets; + uint64_t stray_packets; +}; + + +/* + * Updates to these stats must be reflected in ips_ptl_epaddr_stats_init + */ +struct ips_proto_epaddr_stats { + uint64_t err_chk_send; + uint64_t err_chk_recv; +#ifdef RNDV_MOD_MR + uint64_t err_chk_rdma_send; + uint64_t err_chk_rdma_recv; +#endif + uint64_t nak_send; + uint64_t nak_recv; + uint64_t connect_req_send; + uint64_t connect_req_recv; + uint64_t connect_rep_send; + uint64_t connect_rep_recv; + uint64_t disconnect_req_send; + uint64_t disconnect_req_recv; + uint64_t disconnect_rep_send; + uint64_t disconnect_rep_recv; + uint64_t tids_grant_send; + uint64_t tids_grant_recv; + uint64_t send_rexmit; +#ifdef RNDV_MOD_MR + uint64_t rdma_rexmit; +#endif +}; + +/* OPP support structure. 
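+ * Function pointer table for IBTA path record queries; it is stored in
+ * the opp_fn field of struct ips_proto below.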
*/ +struct opp_api { + void *(*op_path_find_hca) (const char *name, void **device); + void *(*op_path_open) (void *device, int port_num); + void (*op_path_close) (void *context); + int (*op_path_get_path_by_rec) (void *context, ibta_path_rec_t *query, + ibta_path_rec_t *response); +}; + +struct ips_ibta_compliance_fn { + psm2_error_t(*get_path_rec) (struct ips_proto *proto, uint16_t slid, + uint16_t dlid, + uint16_t ip_hi, + unsigned long timeout, + ips_path_grp_t **ppathgrp); + psm2_error_t(*fini) (struct ips_proto *proto); +}; + +/* please don't change the flow id order */ +typedef enum ips_epaddr_flow { + EP_FLOW_GO_BACK_N_PIO, + EP_FLOW_TIDFLOW, /* Can either pio or dma for tidflow */ + EP_FLOW_LAST /* Keep this the last endpoint flow */ +} ips_epaddr_flow_t; + +typedef enum psm_transfer_type { + PSM_TRANSFER_PIO, + PSM_TRANSFER_LAST /* Keep this the last transfer type */ +} psm_transfer_type_t; + +typedef enum psm_protocol_type { + PSM_PROTOCOL_GO_BACK_N, + PSM_PROTOCOL_TIDFLOW, + PSM_PROTOCOL_LAST /* Keep this the last protocol type */ +} psm_protocol_type_t; + +struct ips_proto { + struct ptl *ptl; /* cached */ + psm2_ep_t ep; /* cached, for errors */ + psm2_mq_t mq; /* cached, for mq handling */ + /* Pending sends */ + struct ips_pend_sends pend_sends; + struct ips_epstate *epstate; + struct psmi_timer_ctrl *timerq; + + struct ips_protoexp *protoexp; + struct ips_scbctrl *scbc_rv; + struct ips_spio *spioc; + struct ips_scbctrl scbc_egr; + struct ips_epinfo epinfo; + + ips_scb_t **sdma_scb_queue; + uint16_t sdma_queue_size; + uint16_t sdma_fill_index; + uint16_t sdma_done_index; + uint16_t sdma_avail_counter; + + uint64_t timeout_send; + uint32_t flags; + uint32_t iovec_thresh_eager; + uint32_t iovec_thresh_eager_blocking; + uint32_t psn_mask; + uint32_t scb_bufsize; + uint32_t multirail_thresh_load_balance; + uint16_t flow_credits; + mpool_t pend_sends_pool; + mpool_t timer_pool; + struct ips_ibta_compliance_fn ibta; + struct ips_proto_stats stats; + struct ips_proto_epaddr_stats epaddr_stats; + + struct ips_proto_am proto_am; + + struct ips_ctrlq ctrlq; + /* pure sdma mode, use dma flow, otherwise, use pio flow */ + ips_epaddr_flow_t msgflowid; + + // mr_cache is only allocated and used when PSM3_RDMA enabled + psm2_mr_cache_t mr_cache; + + + uint64_t t_init; + uint64_t t_fini; + uint32_t runid_key; /* we use our pid, not ideal */ + + int num_connected_outgoing; + int num_connected_incoming; + int num_disconnect_requests; + + /* misc state variables. */ + + /* Smallest interval in cycles between which we warn about stray + * messages This is a per-endpoint quantity, overridable with + * PSM_STRAY_WARN_INTERVAL We use the same interval to send the "die" + * message. + */ + uint64_t stray_warn_interval; + int done_warning; + int done_once; + int num_bogus_warnings; + struct { + uint32_t interval_secs; + uint64_t next_warning; + uint64_t count; + } psmi_logevent_tid_send_reqs; + + /* + * Disable the LMC based dispersive routing for all message + * sizes in bytes between ips_lmc_disable_low and ips_lmc_disable_high, + * inclusive. 
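+ * ips_set_LMC_LID_choice() below tags sends in this range with
+ * IPS_SEND_FLAG_NO_LMC so that they use the base SLID/DLID.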
+ */ + uint32_t ips_lmc_disable_low; + uint32_t ips_lmc_disable_high; + struct hsearch_data ips_path_rec_hash; + struct hsearch_data ips_path_grp_hash; + void *opp_lib; + void *hndl; + void *device; + void *opp_ctxt; + struct opp_api opp_fn; + +#ifdef PSM_CUDA + struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_send_cfg; + struct ips_cuda_hostbuf_mpool_cb_context cuda_hostbuf_small_send_cfg; + mpool_t cuda_hostbuf_pool_send; + mpool_t cuda_hostbuf_pool_small_send; + CUstream cudastream_send; + unsigned cuda_prefetch_limit; +#endif + +/* + * Control message queue for pending messages. + * + * Control messages are queued as pending when no PIO is available for sending + * the message. They are composed on the fly and do not need buffering. + * + * Variables here are write once (at init) and read afterwards (except the msg + * queue overflow counters). + */ + uint32_t ctrl_msg_queue_overflow; + uint32_t ctrl_msg_queue_enqueue; + uint32_t message_type_to_index[256]; +#define message_type2index(proto, msg_type) (proto->message_type_to_index[(msg_type)]) + + time_t writevFailTime; +}; + + +/* + * Test the payload length against the lmc_disable_low and lmc_disable_hi + * values, to determine if a transfer of this size should use LMC LIDs. + * Set the IPS_SEND_FLAG_NO_LMC flag in the scb. + */ +static inline void +ips_set_LMC_LID_choice(struct ips_proto *proto, ips_scb_t *scb, uint32_t len) +{ + if ((len >= proto->ips_lmc_disable_low) && + (len <= proto->ips_lmc_disable_high)) { + PSM2_LOG_MSG("DISABLE LMC paylen %u\n", len); + scb->scb_flags |= IPS_SEND_FLAG_NO_LMC; + } + + return; +} + +/* + * Endpoint address, encapsulates per-endpoint protocol metadata + * + * Directly implements the ptl epaddr. + */ +typedef psm2_error_t(*ips_flow_flush_fn_t) (struct ips_flow *, int *nflushed); + +/** + * ips_flow is a structure that combines all information regarding a send + * from one endpoint to another one. Specifically, it is the place where + * the Maximum Transmission Unit for a send is calculated, given how many + * factors could possibly influence the MTU calculation. See ips_flow_init + * documentation for more details. 
+ */ +struct ips_flow { + SLIST_ENTRY(ips_flow) next; /* List of flows with pending acks */ + ips_flow_flush_fn_t flush; /* flush function for this flow */ + + struct ips_epaddr *ipsaddr; /* back pointer, remote endpoint */ + ips_path_rec_t *path; /* Path to use for flow */ + + uint16_t frag_size; /* < This flow's fragment size, calculated as the + < minimum of all relevant MTUs involved */ + + uint16_t flowid:2; /* flow id: pio(0) or dma(1) or tidflow(2) */ + uint16_t transfer:3; /* spio or sdma */ + uint16_t protocol:3; /* go-back-n or tidflow */ + uint16_t flags:8; /* flow state flags */ + + uint16_t cwin; /* Size of congestion window */ + uint16_t ack_interval; /* interval to ack packets */ + uint16_t ack_counter; /* counter to ack packets */ + int16_t credits; /* Current credits available to send on flow */ + uint32_t ack_index; /* Index of the last ACK message type in pending message queue */ + + psmi_seqnum_t xmit_seq_num; /* transmit packet sequence number */ + psmi_seqnum_t xmit_ack_num; /* acked packet sequence number */ + psmi_seqnum_t recv_seq_num; /* recieved packet sequence number */ + + psmi_timer *timer_send; /* timer for frames that got a busy PIO */ + psmi_timer *timer_ack; /* timer for unacked frames */ + + STAILQ_HEAD(ips_scb_unackedq, ips_scb) scb_unacked; /* unacked queue */ + SLIST_HEAD(ips_scb_pendlist, ips_scb) scb_pend; /* pending queue */ + +#ifdef PSM_DEBUG + uint32_t scb_num_pending; /* pending scb counter */ + uint32_t scb_num_unacked; /* unacked scb counter */ +#endif +}; + +#define IPS_FLOW_MSG_TOGGLE_OOO_MASK (1 << 0) /* ooo msg check */ +#define IPS_FLOW_MSG_TOGGLE_UNEXP_MASK (1 << 1) /* unexp msg check */ +/* + * Make sure ips_epaddr_t and psm2_epaddr_t can be converted each other. + */ +struct ips_epaddr { + struct psm2_epaddr epaddr; /* inlined psm level epaddr */ + struct ips_msgctl *msgctl; /* ips level msg control */ + + struct ips_epaddr *next; /* linklist */ + + struct ips_flow flows[EP_FLOW_LAST - 1]; /* pio and dma */ + ips_path_grp_t *pathgrp; /* pointer to slid/dlid group in hash */ + + uint32_t connidx_outgoing; /* peer's connection idx */ + uint32_t connidx_incoming; /* my connection idx */ + + uint16_t ctrl_msg_queued; /* bitmap of queued control messages to be send */ + uint32_t window_rv; /* RNDV window size per connection */ + + uint8_t hpp_index; /* high priority index */ + uint8_t msg_toggle; /* only 2 bits used, 6 bits for future */ + // on UD/UDP context only used for hashing adaptive dispersive routing + uint32_t remote_qpn; +#define IPSADDR_HASH remote_qpn +#ifdef RNDV_MOD_MR + union ibv_gid remote_gid; /* GID of dest to use for IB CM */ + psm2_rv_conn_t rv_conn; + uint32_t remote_rv_index; // RV index of dest to use for immed */ + // state of connection - need it here so we don't call kernel to poll + // ! conn - no connection + // conn && ! connected - connection processes started, but not done + // connected - connection established and usable (implies conn) + uint8_t rv_connected:1; + uint8_t reserved:4; + // during error recovery a receiver may be unable to allocate an scb to + // send the respond. In which case the information is stashed here and + // checked in ips_proto_timer_send_callback for the proto->msgflowid flow + // when an scb is available, this info allows the response to be built + // Since we can only stash one such info per ipsaddr, we limit senders + // to one outstanding err_chk_rdma at a time. 
Recovery is infrequent + // and already slow due to QP reconnect so this is a reasonable compromise + // the idea of using the ctrlq (64 entries deep per proto) was explored + // but is not really for "level 2" reliability messages so this approach + // was deemed simpler to implement and lower risk to mature code + uint8_t rv_err_chk_rdma_outstanding:1; /* only one per requestor */ + uint8_t rv_need_send_err_chk_rdma_resp:1; /* is resp info stashed */ + uint8_t rv_err_chk_rdma_resp_need_resend:1; /* info for resp */ + ptl_arg_t rv_err_chk_rdma_resp_rdesc_id; /* info for resp */ + ptl_arg_t rv_err_chk_rdma_resp_sdesc_id; /* info for resp */ + STAILQ_ENTRY(ips_epaddr) pend_err_resp_next; /* queue to send resp */ +#endif + // TBD - to reduce memory footprint, perhaps allocate a separate + // structure only when RC QP enabled and point to it here + struct ibv_qp *rc_qp; + struct psm2_verbs_recv_pool recv_pool; + uint32_t rc_qp_max_recv_wr; // TBD if we allocated recv buffers sooner we + // wouldn't need this field + uint32_t rc_qp_max_inline_data; + struct psm2_verbs_send_allocator send_allocator; + // use_* help avoid if tests in post_send datapath + psm2_verbs_send_allocator_t use_allocator; // points to verbs_ep until + // rc_connected + struct ibv_qp *use_qp; // points to verbs_ep UD QP until + // rc_connected + uint32_t use_max_inline_data; // verbs_ep UD QP value until connected + uint8_t rc_connected; + + /* this portion is only for connect/disconnect */ + uint64_t s_timeout; /* used as a time in close */ + uint32_t runid_key; /* peer process pid */ + uint32_t credit:2; /* credit to connect/disconnect: 0 or 1 */ + uint32_t cstate_outgoing:3; /* connection state to, max 7 */ + uint32_t cstate_incoming:3; /* connection state from, max 7 */ + uint32_t delay_in_ms:8; /* disconnect delay in ms */ + uint32_t cerror_outgoing:8; /* error code during connection */ + uint32_t cerror_incoming:8; /* error code during connection */ +}; + +static inline int +ips_epaddr_connected(struct ips_epaddr *ipsaddr) +{ + if (ipsaddr->rc_connected) + return 1; +#ifdef RNDV_MOD_MR + if (ipsaddr->rv_connected) + return 1; +#endif + return 0; +} + +/* + * ips_msgctl_t is per connection struct. + */ +struct ips_msgctl { + struct ips_epaddr master_epaddr; /* Master rail's epaddr */ + + struct ips_epaddr *ipsaddr_next; /* next ipsaddr to send packet */ + uint16_t mq_send_seqnum; /* next sending message sequence */ + uint16_t mq_recv_seqnum; /* next receiving message sequence */ + uint16_t am_send_seqnum; /* next sending message sequence */ + uint16_t am_recv_seqnum; /* next receiving message sequence */ + uint16_t ipsaddr_count; /* number of ipsaddr to use */ + uint16_t outoforder_count; /* number of outoforder messages */ +}; + +static inline __attribute__ ((unused)) +void IPS_MCTXT_APPEND(ips_epaddr_t *head, ips_epaddr_t *node) +{ + ips_epaddr_t *cur; + + /* The new node is inserted before head. */ + node->next = head; + + /* Circle around the linked list to head's predecessor and update. */ + for (cur = head; cur->next != head; cur = cur->next); + cur->next = node; +} + +static inline __attribute__ ((unused)) +void IPS_MCTXT_REMOVE(ips_epaddr_t *node) +{ + ips_epaddr_t *cur; + + /* Circle around to node's predecessor and update. */ + for (cur = node; cur->next != node; cur = cur->next); + cur->next = node->next; + node->next = node; +} + +/* + * Initialize a flow, setting its attributes. 
Selects the path the flow will + * use as well as calculates the flow's fragment size defined as: + * - min(remote EP MTU, selected path's MTU, local EP MTU) for DMA sends + * - min(remote EP MTU, selected path's MTU, local EP MTU, local PIO bufsize) for PIO sends + */ +void MOCKABLE(ips_flow_init)(struct ips_flow *flow, struct ips_proto *proto, + ips_epaddr_t *ipsaddr, psm_transfer_type_t transfer_type, + psm_protocol_type_t protocol, ips_path_type_t path_type, + uint32_t flow_index); +MOCK_DCL_EPILOGUE(ips_flow_init); + +void ips_scb_prepare_flow(ips_scb_t *scb, ips_epaddr_t *ipsaddr, + struct ips_flow *flow); + +void MOCKABLE(ips_proto_flow_enqueue)(struct ips_flow *flow, ips_scb_t *scb); +MOCK_DCL_EPILOGUE(ips_proto_flow_enqueue); + +psm2_error_t ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed); +psm2_error_t ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed); + +/* Wrapper for enqueue + flush */ +psm2_error_t ips_proto_scb_pio_send(struct ips_flow *flow, ips_scb_t *scb); + +void ips_proto_scb_dma_enqueue(struct ips_proto *proto, ips_scb_t *scb); +psm2_error_t ips_proto_scb_dma_flush(struct ips_proto *proto, + ips_epaddr_t *ipsaddr, int *nflushed); +psm2_error_t ips_proto_dma_wait_until(struct ips_proto *proto, ips_scb_t *scb); + +/* + * Protocol receive processing + * + */ +/* Error handling for unknown packet, packet is unknown when epid doesn't match + * in epstate table */ +int ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev); +/* Exposed for fastpath only */ +int ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev); + +/* + * Protocol exception handling and frame dumps + */ +void ips_proto_show_header(struct ips_message_header *p_hdr, char *msg); +void ips_proto_dump_frame(void *frame, int lenght, char *message); +void ips_proto_dump_data(void *data, int data_length); +void ips_proto_dump_eager(uint32_t *curr_rcv_hdr); + +/* + * Checksum of ips packets + */ +uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc); + +/* + * Matched-Queue processing and sends + */ +psm2_error_t ips_proto_mq_push_cts_req(struct ips_proto *proto, + psm2_mq_req_t req); +psm2_error_t ips_proto_mq_push_rts_data(struct ips_proto *proto, + psm2_mq_req_t req); +int ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev); +void ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl); +int ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev); + +psm2_error_t ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr, + uint32_t flags, psm2_mq_tag_t *tag, + const void *ubuf, uint32_t len); + +psm2_error_t ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, + uint32_t flags_user, uint32_t flags_internal, + psm2_mq_tag_t *tag, const void *ubuf, uint32_t len, + void *context, psm2_mq_req_t *req_o); + +psm2_error_t ips_proto_msg_size_thresh_query (enum psm2_info_query_thresh_et, + uint32_t *out, psm2_mq_t mq, psm2_epaddr_t); + +int ips_proto_am(struct ips_recvhdrq_event *rcv_ev); + +/* + * IPS packet service routine table. 
+ */ +typedef int (*ips_packet_service_fn_t)(struct ips_recvhdrq_event *rcv_ev); +extern ips_packet_service_fn_t + ips_packet_service_routine[OPCODE_FUTURE_FROM-OPCODE_RESERVED]; + +psm2_error_t ips_ibta_link_updown_event(struct ips_proto *proto); + +psm2_error_t +MOCKABLE(ips_ibta_init)(struct ips_proto *proto); +MOCK_DCL_EPILOGUE(ips_ibta_init); + +psm2_error_t ips_ibta_fini(struct ips_proto *proto); + + +#ifdef PSM_CUDA +PSMI_ALWAYS_INLINE( +uint32_t ips_cuda_next_window(uint32_t max_window, uint32_t offset, + uint32_t len)) +{ + uint32_t window_len; + window_len = len - offset; + if (window_len >= max_window) + window_len = max_window; + return window_len; +} +#endif + + +#endif /* _IPS_PROTO_H */ diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_am.c b/prov/psm3/psm3/ptl_ips/ips_proto_am.c new file mode 100644 index 00000000000..995c6862a77 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_proto_am.c @@ -0,0 +1,618 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "psm2_am.h" +#include "psm_am_internal.h" +#include "psm_mq_internal.h" +#include "ips_proto.h" +#include "ips_expected_proto.h" +#include "ips_proto_help.h" + +struct ips_am_token { + struct psmi_am_token tok; + + /* ptl-specific token stuff */ + struct ips_epaddr *epaddr_rail; + struct ips_proto_am *proto_am; +}; + +struct ips_am_message { + struct ips_message_header p_hdr; + struct ips_am_message *next; + struct ips_epaddr *ipsaddr; + struct ips_proto_am *proto_am; + uint64_t *payload; + uint32_t paylen; + uint16_t seqnum; +}; + +/* These variables are shared for all packet flows in a PSM process; they are + * shared across multiple rails. There is no single AM object to hang these + * off of, so they are declared here as globals. */ +static struct { + struct ips_am_message head; + struct ips_am_message *tail; +} ips_am_outoforder_q; + +static mpool_t ips_am_msg_pool; + +/* This calculation ensures that the number of reply slots will always be at + * least twice as large + 1 as the number of request slots. This is optimal: the + * minimum amount required is actually only twice as many, but it is much + * slower. */ +#define calc_optimal_num_reply_slots(nslots) (((nslots)*2 / 3) + 1) + +psm2_error_t +MOCKABLE(ips_proto_am_init)(struct ips_proto *proto, + int num_send_slots, + uint32_t imm_size, + struct ips_proto_am *proto_am) +{ + psm2_error_t err = PSM2_OK; + int send_buf_size = proto->epinfo.ep_mtu; + int num_rep_slots = calc_optimal_num_reply_slots(num_send_slots); + int num_req_slots = num_send_slots - num_rep_slots; + + proto_am->proto = proto; + + /* In a node pair, the number of reply send buffers on at least one of + * the nodes must be at least double the number (optimal: double + 1) of + * send descriptors on the other node. While this constraint applies + * only to the reply send buffers, allowing the caller to tune only the + * number of request send buffers would be awkward, as they have no + * knowledge of the subdivision of the memory into separate mempools for + * requests and replies. It's an internal concern at this point. 
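+ * For example, num_send_slots = 100 yields 67 reply slots and 33
+ * request slots (67 >= 2*33 + 1).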
*/ + if ((err = ips_scbctrl_init(&proto->ep->context, + num_req_slots, + num_req_slots, + imm_size, + send_buf_size, + NULL, + NULL, + &proto_am->scbc_request))) + goto fail; + + if ((err = ips_scbctrl_init(&proto->ep->context, + num_rep_slots, + num_rep_slots, + imm_size, + send_buf_size, + NULL, + NULL, + &proto_am->scbc_reply))) + goto fail; + + if (ips_am_msg_pool == NULL) { + union psmi_envvar_val max_msgs; + + ips_am_outoforder_q.head.next = NULL; + ips_am_outoforder_q.tail = &ips_am_outoforder_q.head; + + psmi_getenv("PSM3_AM_MAX_OOO_MSGS", + "Maximum number of OOO Active Messages to queue before dropping.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)1024, &max_msgs); + + ips_am_msg_pool = psmi_mpool_create( + sizeof(struct ips_am_message), + 32, max_msgs.e_uint, 0, UNDEFINED, NULL, NULL); + } +fail: + return err; +} +MOCK_DEF_EPILOGUE(ips_proto_am_init); + +psm2_error_t ips_proto_am_fini(struct ips_proto_am *proto_am) +{ + ips_scbctrl_fini(&proto_am->scbc_request); + ips_scbctrl_fini(&proto_am->scbc_reply); + if (ips_am_msg_pool != NULL) { + psmi_mpool_destroy(ips_am_msg_pool); + ips_am_msg_pool = NULL; + } + + return PSM2_OK; +} + +/* Fill in AM capabilities parameters */ +psm2_error_t +ips_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters) +{ + int max_nargs = min(1 << IPS_AM_HDR_NARGS_BITS, PSMI_AM_MAX_ARGS); + int max_payload = + ep->mtu - + ((max_nargs - IPS_AM_HDR_NARGS) * sizeof(psm2_amarg_t)); + + if (parameters == NULL) { + return PSM2_PARAM_ERR; + } + + parameters->max_handlers = 1 << IPS_AM_HDR_HIDX_BITS; + parameters->max_nargs = max_nargs; + parameters->max_request_short = max_payload; + parameters->max_reply_short = max_payload; + + return PSM2_OK; +} + +static +psm2_error_t +am_short_reqrep(ips_scb_t *scb, struct ips_epaddr *ipsaddr, + psm2_amarg_t *args, int nargs, uint8_t opcode, + void *src, size_t len, int flags, int pad_bytes) +{ + int i, hdr_qwords = IPS_AM_HDR_NARGS; + struct ips_proto *proto = ((psm2_epaddr_t)ipsaddr)->proto; + + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + + struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; + + /* There are a limited number of bits for nargs in the header, making + overflow very easy. Make sure the values match. */ + psmi_assert(nargs == scb->ips_lrh.amhdr_nargs); + + _HFI_VDBG("%s src=%p len=%d, nargs=%d\n", + ((opcode == OPCODE_AM_REQUEST) || + (opcode == OPCODE_AM_REQUEST_NOREPLY)) ? 
"req" : "rep", + src, (int)len, nargs); + + if (nargs == 1) { /* fastpath */ + scb->ips_lrh.data[0].u64w0 = args[0].u64w0; + hdr_qwords--; + } else if (nargs > 1) { + /* Easily unrollable but leave as is in case we can increase + * qwords on the chip in the near future */ + for (i = 0; i < IPS_AM_HDR_NARGS; i++, hdr_qwords--) + scb->ips_lrh.data[i].u64w0 = args[i].u64w0; + + if (nargs > IPS_AM_HDR_NARGS) { + /* Slow case -- we don't have iovec and not enough + * space in the message header, so we have to copy the + * user's arguments even if the payload is marked ASYNC + */ + uintptr_t bufp = (uintptr_t) ips_scb_buffer(scb); + size_t arg_payload_len = + sizeof(psm2_amarg_t) * (nargs - IPS_AM_HDR_NARGS); + + psmi_mq_mtucpy((void *)bufp, + &args[IPS_AM_HDR_NARGS], + arg_payload_len); + bufp += arg_payload_len; + scb->payload_size = arg_payload_len; + + if (src != NULL && len > 0) { + psmi_mq_mtucpy((void *)bufp, src, len); + scb->payload_size += len; + } + + psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS)); + scb->payload_size += pad_bytes; + scb->ips_lrh.amhdr_len = pad_bytes; + goto send_scb; + } + } + + if (len == 0) { + scb->payload_size = 0; + scb->ips_lrh.amhdr_len = 0; + } else if (len <= (hdr_qwords << 3)) { + /* Inline the payload into the header. */ + /* This path CANNOT handle length = 0 due to limited space + in the header. If IPS_SEND_FLAG_AMISTINY is set, an + amhdr_len value of 0 means a full payload, i.e. + 1 << IPS_AM_HDR_LEN_BITS bytes of packed payload. */ + psmi_assert(len > 0); + + psmi_mq_mtucpy(&scb->ips_lrh. + data[IPS_AM_HDR_NARGS - hdr_qwords], src, len); + scb->payload_size = 0; + psmi_assert(len <= (1 << IPS_AM_HDR_LEN_BITS)); + scb->ips_lrh.amhdr_len = len & ((1 << IPS_AM_HDR_LEN_BITS) - 1); + scb->scb_flags |= IPS_SEND_FLAG_AMISTINY; + } else { /* Whatever's left requires a separate payload */ + if (ips_scb_buffer(scb) == NULL) /* Just attach the buffer */ + ips_scb_buffer(scb) = src; + else /* May need to re-xmit user data, keep it around */ + psmi_mq_mtucpy(ips_scb_buffer(scb), src, len); + + psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS)); + scb->payload_size = len + pad_bytes; + scb->ips_lrh.amhdr_len = pad_bytes; + } + +send_scb: + ips_scb_opcode(scb) = opcode; + scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->am_send_seqnum++; + ips_proto_flow_enqueue(flow, scb); + flow->flush(flow, NULL); + + return PSM2_OK; +} + +static inline int +calculate_pad_bytes(size_t len) +{ + /* Align to dword (4 bytes) */ + size_t dword_aligned_len = (len + 3) & ~3; + return dword_aligned_len - len; +} + +static inline +void +ips_am_scb_init(ips_scb_t *scb, uint8_t handler, int nargs, + int pad_bytes, + psm2_am_completion_fn_t completion_fn, void *completion_ctxt) +{ + psmi_assert(pad_bytes < (1 << IPS_AM_HDR_LEN_BITS)); + + scb->completion_am = completion_fn; + scb->cb_param = completion_ctxt; + scb->ips_lrh.amhdr_hidx = handler; + scb->ips_lrh.amhdr_len = pad_bytes; + scb->ips_lrh.amhdr_nargs = nargs; + scb->ips_lrh.flags = 0; + if (completion_fn) + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + return; +} + +psm2_error_t +ips_am_short_request(psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + struct ips_proto_am *proto_am = &epaddr->proto->proto_am; + psm2_error_t err; + ips_scb_t *scb; + ips_epaddr_t *ipsaddr; + int pad_bytes = calculate_pad_bytes(len); + int payload_sz = (nargs << 3); + + if_pt(!(flags & PSM2_AM_FLAG_ASYNC)) + payload_sz += 
len; + + if (payload_sz > (IPS_AM_HDR_NARGS << 3)) { + /* Payload can't fit in header, allocate buffer to carry data */ + int arg_sz = (nargs > IPS_AM_HDR_NARGS) ? + ((nargs - IPS_AM_HDR_NARGS) << 3) : 0; + + /* len + pad_bytes + overflow_args */ + PSMI_BLOCKUNTIL(epaddr->ptlctl->ep, + err, + ((scb = ips_scbctrl_alloc( + &proto_am->scbc_request, + 1, + len + pad_bytes + arg_sz, + IPS_SCB_FLAG_ADD_BUFFER)) != NULL)); + } else { + PSMI_BLOCKUNTIL(epaddr->ptlctl->ep, + err, + ((scb = ips_scbctrl_alloc_tiny( + &proto_am->scbc_request)) != NULL)); + } + + psmi_assert_always(scb != NULL); + ips_am_scb_init(scb, handler, nargs, pad_bytes, + completion_fn, completion_ctxt); + + if (payload_sz >= epaddr->proto->multirail_thresh_load_balance) { + /* Select the next ipsaddr for multi-rail */ + ipsaddr = ((ips_epaddr_t *)epaddr)->msgctl->ipsaddr_next; + ipsaddr->msgctl->ipsaddr_next = ipsaddr->next; + } else { + ipsaddr = (ips_epaddr_t *)epaddr; + } + + return am_short_reqrep(scb, ipsaddr, args, + nargs, + (flags & PSM2_AM_FLAG_NOREPLY) ? + OPCODE_AM_REQUEST_NOREPLY : OPCODE_AM_REQUEST, + src, len, flags, pad_bytes); +} + +psm2_error_t +ips_am_short_reply(psm2_am_token_t tok, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, void *completion_ctxt) +{ + struct ips_am_token *token = (struct ips_am_token *)tok; + struct ips_proto_am *proto_am = token->proto_am; + struct ips_epaddr *ipsaddr = token->epaddr_rail; + int pad_bytes = calculate_pad_bytes(len); + int scb_flags = 0; + ips_scb_t *scb; + + if (!token->tok.can_reply) { + _HFI_ERROR("Invalid AM reply for request!"); + return PSM2_AM_INVALID_REPLY; + } + + psmi_assert(ips_scbctrl_avail(&proto_am->scbc_reply)); + + if ((nargs << 3) + len <= (IPS_AM_HDR_NARGS << 3)) { + scb = ips_scbctrl_alloc_tiny(&proto_am->scbc_reply); + } else { + int payload_sz = (nargs << 3); + + payload_sz += (flags & PSM2_AM_FLAG_ASYNC) ? + 0 : (len + pad_bytes); + scb_flags |= (payload_sz > (IPS_AM_HDR_NARGS << 3)) ? + IPS_SCB_FLAG_ADD_BUFFER : 0; + + scb = + ips_scbctrl_alloc(&proto_am->scbc_reply, 1, payload_sz, + scb_flags); + } + + psmi_assert_always(scb != NULL); + ips_am_scb_init(scb, handler, nargs, pad_bytes, + completion_fn, completion_ctxt); + am_short_reqrep(scb, ipsaddr, args, nargs, OPCODE_AM_REPLY, + src, len, flags, pad_bytes); + return PSM2_OK; +} + +/* Prepares and runs a handler from a receive event. 
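+ *
+ * For an AMISTINY message the payload is packed into the header directly
+ * after the args, and an amhdr_len of 0 is interpreted as a full
+ * (1 << IPS_AM_HDR_LEN_BITS) bytes of packed data. For non-TINY messages
+ * carrying more than IPS_AM_HDR_NARGS args, the overflow args sit at the
+ * front of the payload and are copied out into a temporary array before
+ * the handler runs; any dword padding recorded in amhdr_len is then
+ * subtracted from paylen.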
*/ +static int +ips_am_run_handler(const struct ips_message_header *p_hdr, + struct ips_epaddr *ipsaddr, struct ips_proto_am *proto_am, + uint64_t *payload, + uint32_t paylen) +{ + struct ips_am_token token; + int nargs = p_hdr->amhdr_nargs; + int ret; + struct psm2_ep_am_handle_entry *hentry; + psm2_amarg_t *args = (psm2_amarg_t *)p_hdr->data; + + token.tok.flags = p_hdr->flags; + token.tok.epaddr_incoming = (psm2_epaddr_t)&ipsaddr->msgctl->master_epaddr; + token.tok.can_reply = + (_get_proto_hfi_opcode(p_hdr) == OPCODE_AM_REQUEST); + token.epaddr_rail = ipsaddr; + token.proto_am = proto_am; + + if (token.tok.flags & IPS_SEND_FLAG_AMISTINY) { + /* Payload is packed into header after args */ + payload = (uint64_t *)&p_hdr->data[nargs].u64; + paylen = p_hdr->amhdr_len; + /* Interpret amhdr_len == 0 as 16 bytes of payload */ + if (paylen == 0) + paylen = 1 << IPS_AM_HDR_LEN_BITS; + } else { + if (nargs > IPS_AM_HDR_NARGS) { + /* Args are split across header and payload */ + int payload_args_len = + (nargs - IPS_AM_HDR_NARGS) * + sizeof(psm2_amarg_t); + + args = alloca(PSMI_AM_MAX_ARGS * sizeof(psm2_amarg_t)); + + args[0].u64 = p_hdr->data[0].u64; + args[1].u64 = p_hdr->data[1].u64; + + memcpy(&args[2], payload, payload_args_len); + + payload += nargs - IPS_AM_HDR_NARGS; + paylen -= payload_args_len; + } + + /* Subtract off padding bytes (dword padding) for non-TINY. */ + paylen -= p_hdr->amhdr_len; + } + + hentry = psm_am_get_handler_function(proto_am->proto->ep, + p_hdr->amhdr_hidx); + + /* Note a guard here for hentry != NULL is not needed because at + * initialization, a psmi_assert_always() assure the entry will be + * non-NULL. */ + + if (likely(hentry->version == PSM2_AM_HANDLER_V2)) { + psm2_am_handler_2_fn_t hfn2 = + (psm2_am_handler_2_fn_t)hentry->hfn; + ret = hfn2(&token, args, nargs, payload, paylen, hentry->hctx); + } else { + psm2_am_handler_fn_t hfn1 = + (psm2_am_handler_fn_t)hentry->hfn; + ret = hfn1(&token, args, nargs, payload, paylen); + } + + return ret; +} + +static int +ips_proto_am_handle_outoforder_queue() +{ + struct ips_am_message *msg, *prev; + int ret = IPS_RECVHDRQ_CONTINUE; + + prev = &ips_am_outoforder_q.head; + msg = ips_am_outoforder_q.head.next; + + while (msg != NULL) { + struct ips_epaddr *ipsaddr = msg->ipsaddr; + if (ipsaddr->msgctl->am_recv_seqnum != msg->seqnum) { + prev = msg; + msg = msg->next; + continue; + } + + ipsaddr->msgctl->am_recv_seqnum++; + + if (ips_am_run_handler(&msg->p_hdr, + ipsaddr, msg->proto_am, + msg->payload, msg->paylen)) + ret = IPS_RECVHDRQ_BREAK; + + prev->next = msg->next; + if (prev->next == NULL) + ips_am_outoforder_q.tail = prev; + + psmi_mq_sysbuf_free(msg->proto_am->proto->mq, msg->payload); + psmi_mpool_put(msg); + + msg = prev->next; + } + + return ret; +} + +static void +ips_proto_am_queue_msg(struct ips_am_message *msg) +{ + msg->next = NULL; + ips_am_outoforder_q.tail->next = msg; + ips_am_outoforder_q.tail = msg; +} + +int ips_proto_am(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ips_epaddr *ipsaddr = rcv_ev->ipsaddr; + struct ips_proto_am *proto_am = &rcv_ev->proto->proto_am; + ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow; + struct ips_am_message *msg = NULL; + int ret = IPS_RECVHDRQ_CONTINUE; + enum ips_msg_order msgorder; + + psmi_assert(flowid < EP_FLOW_LAST); + flow = &ipsaddr->flows[flowid]; + /* + * Based on AM request/reply traffic pattern, if we don't have a reply + * scb slot then we can't process the request packet, we 
just silently + * drop it. Otherwise, it will be a deadlock. note: + * ips_proto_is_expected_or_nak() can not be called in this case. + */ + if (_get_proto_hfi_opcode(p_hdr) == OPCODE_AM_REQUEST && + !ips_scbctrl_avail(&proto_am->scbc_reply)) + return IPS_RECVHDRQ_CONTINUE; + + if (!ips_proto_is_expected_or_nak(rcv_ev)) + return IPS_RECVHDRQ_CONTINUE; + + uint16_t send_msgseq = + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK; + msgorder = ips_proto_check_msg_order(ipsaddr, flow, send_msgseq, + &ipsaddr->msgctl->am_recv_seqnum); + + if (msgorder == IPS_MSG_ORDER_FUTURE) + return IPS_RECVHDRQ_REVISIT; + else if (msgorder == IPS_MSG_ORDER_FUTURE_RECV) { + uint64_t *msg_payload; + uint64_t *payload = ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev); + + psmi_assert(paylen == 0 || payload); + msg = psmi_mpool_get(ips_am_msg_pool); + if (unlikely(msg == NULL)) { + /* Out of memory, drop the packet. */ + flow->recv_seq_num.psn_num = + (flow->recv_seq_num.psn_num - 1) & + rcv_ev->proto->psn_mask; + return IPS_RECVHDRQ_BREAK; + } + msg_payload = psmi_mq_sysbuf_alloc( + proto_am->proto->mq, + ips_recvhdrq_event_paylen(rcv_ev)); + if (unlikely(msg_payload == NULL)) { + /* Out of memory, drop the packet. */ + flow->recv_seq_num.psn_num = + (flow->recv_seq_num.psn_num - 1) & + rcv_ev->proto->psn_mask; + psmi_mpool_put(msg); + return IPS_RECVHDRQ_BREAK; + } + + memcpy(&msg->p_hdr, p_hdr, sizeof(struct ips_message_header)); + memcpy(msg_payload, payload, paylen); + + msg->payload = msg_payload; + msg->ipsaddr = ipsaddr; + msg->proto_am = proto_am; + msg->paylen = paylen; + msg->seqnum = + __le32_to_cpu(p_hdr->khdr.kdeth0) & + HFI_KHDR_MSGSEQ_MASK; + + ips_proto_am_queue_msg(msg); + } else if ((msgorder == IPS_MSG_ORDER_EXPECTED) || + (msgorder == IPS_MSG_ORDER_EXPECTED_MATCH)) { + uint64_t *payload = ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev); + + psmi_assert(paylen == 0 || payload); + if (ips_am_run_handler(p_hdr, ipsaddr, proto_am, + payload, paylen)) + ret = IPS_RECVHDRQ_BREAK; + + ips_proto_am_handle_outoforder_queue(); + } + + /* Look if the handler replied, if it didn't, ack the request */ + if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); + + ips_proto_process_ack(rcv_ev); + return ret; +} diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_am.h b/prov/psm3/psm3/ptl_ips/ips_proto_am.h new file mode 100644 index 00000000000..3e0a2717205 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_proto_am.h @@ -0,0 +1,93 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_PROTO_AM_H +#define _IPS_PROTO_AM_H + +#include "psm_user.h" +#include "ips_scb.h" + +struct ips_proto_am { + struct ips_proto *proto; /* back pointer */ + struct ips_scbctrl scbc_request; + struct ips_scbctrl scbc_reply; +}; + +psm2_error_t +ips_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters); + +psm2_error_t +ips_am_short_reply(psm2_am_token_t tok, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, void *completion_ctxt); + +psm2_error_t +ips_am_short_request(psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, + void *completion_ctxt); + +psm2_error_t +MOCKABLE(ips_proto_am_init)(struct ips_proto *proto, + int num_send_slots, + uint32_t imm_size, + struct ips_proto_am *proto_am); +MOCK_DCL_EPILOGUE(ips_proto_am_init); + +psm2_error_t ips_proto_am_fini(struct ips_proto_am *proto_am); + +#endif /* _IPS_PROTO_AM_H */ diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_connect.c b/prov/psm3/psm3/ptl_ips/ips_proto_connect.c new file mode 100644 index 00000000000..e3adc87ff7a --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_proto_connect.c @@ -0,0 +1,1939 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_proto.h" +#include "psm_mq_internal.h" +#include "ips_proto_internal.h" +#ifdef RNDV_MOD_MR +#include "psm_rndv_mod.h" +#endif +/* + * define connection version. this is the basic version, optimized + * version will be added later for scalability. 
+ * version kept in 2 nibbles in this format: 0xMMmm MM=major, mm=minor version
+ */
+// a little paranoid as a UD or UDP connect can't reach a STL100 PSM recv context
+// but we don't worry about UDP vs UD since they can't reach each other either
+#define IPS_CONNECT_VERNO 0x0100 // 1.0
+
+struct ips_connect_hdr {
+	uint16_t connect_verno;	/* should be ver IPS_CONNECT_VERNO */
+	uint16_t psm_verno;	/* should be 2.0 */
+	uint32_t connidx;	/* ignore if 0xffffffff */
+	uint64_t epid;		/* epid of connector process */
+} PACK_SUFFIX;
+
+struct ips_connect_reqrep {
+	uint16_t connect_verno;	/* should be ver IPS_CONNECT_VERNO */
+	uint16_t psm_verno;	/* should be 2.0 */
+	uint32_t connidx;	/* ignore if 0xffffffff */
+	uint64_t epid;		/* epid of connector process */
+	/* above should be same as ips_connect_hdr */
+
+	// fields below specific to CONNECT_REQUEST/REPLY
+	uint16_t connect_result;	/* error code */
+	uint16_t sl;		/* service level for matching */
+	uint16_t mtu;		/* receive payload */
+	uint16_t job_pkey;	/* partition key for verification */
+
+	uint32_t runid_key;	/* one-time stamp connect key */
+	uint32_t initpsn;	/* initial psn for flow */
+
+	char hostname[128];	/* sender's hostname string */
+	// fields below added as part of IPS_CONNECT_VERNO 1.0
+	// used for rndv and user space RC QP connection (CONNECT_REQUEST/REPLY)
+	uint8_t rdmamode;	/* IPS_PROTOEXP_FLAG_RDMA_MASK portion of rdmamode */
+	uint8_t static_rate;	/* ibv_rate enum */
+	uint8_t reserved[6+16];	// 1st 6 bytes keep fields below 64b aligned
+	// fields below can be zero depending on rdmamode
+
+	// for rndv module connection establishment only set for RNDV_MOD_MR
+	union ibv_gid gid;	/* sender's gid */	// zero if no rndv mod RDMA
+	uint32_t rv_index;	/* sender's process index */	// zero if no rndv mod RDMA
+	uint32_t resv;	// alignment
+	// for user space RC QP connection establishment only set for USE_RC
+	struct psm_rc_qp_attr qp_attr;	// zero if no user space RC QPs
+	// 8 bytes of subnet and 8 bytes of epid may follow for each of up to
+	// PSMI_MAX_QPS in case this is a multi-rail run and/or multiple QPs
+	// are opened per NIC.
+} PACK_SUFFIX;
+
+/* Startup protocol in PSM/IPS
+ *
+ * Start timer.
+ *
+ * For all nodes to connect to:
+ *   Grab connect lock
+ *   Look up epid in table
+ *     MATCH.
+ *       assert cstate_outgoing != CONNECT_WAITING (no re-entrancy)
+ *       If cstate_outgoing == CONNECT_DONE
+ *         return the already connected address.
+ *       else
+ *         assert cstate_outgoing == CONNECT_NONE
+ *         assert cstate_incoming == CONNECT_DONE
+ *         cstate_outgoing := CONNECT_WAITING
+ *         assert connidx_outgoing != UNKNOWN && connidx_incoming != UNKNOWN
+ *         req->connidx := epaddr->connidx_incoming
+ *         add to list of pending connect.
+ *     NO MATCH
+ *       allocate epaddr and put in table
+ *       cstate_outgoing := CONNECT_WAITING
+ *       cstate_incoming := CONNECT_NONE
+ *       connidx_outgoing := UNKNOWN
+ *       req->connidx := epaddr->connidx_incoming := NEW connidx integer
+ *       add to list of pending connect
+ *   Release connect lock
+ *
+ * expected_connect_count = ep->total_connect_count + num_to_connect
+ * while (expected_connect_count != ep->total_connect_count)
+ *    check for timeout
+ *    progress();
+ *
+ * For all connection requests received (within progress loop)
+ *   If uuid doesn't match, NAK the connect and skip request
+ *   Grab connect lock
+ *   Look up epid in table
+ *     MATCH
+ *       if cstate_incoming == CONNECT_DONE
+ *         req->connidx := epaddr->connidx_incoming
+ *         compose reply and send again (this is a dupe request).
+ * else + * assert cstate_incoming == CONNECT_NONE + * assert cstate_outgoing == (CONNECT_WAITING | CONNECT_DONE) + * cstate_incoming := CONNECT_DONE + * epaddr->connidx_outgoing := req->connidx + * req->connidx := epaddr->connidx_incoming + * NO MATCH + * allocate epaddr and put in table + * cstate_incoming := CONNECT_DONE + * epaddr->connidx_outgoing = req->connidx; + * rep->connidx := epaddr->connidx_incoming := NEW connidx integer + * compose connect reply and send + * Release connect lock + * + * For all connection replies received: + * If connect_result != 0, process error and skip. + * assert cstate_outgoing == CONNECT_WAITING + * if cstate_incoming == CONNECT_DONE + * assert rep->connidx == epaddr->connidx_outgoing + * else + * epaddr->connidx_outgoing := rep->connidx + * cstate_outgoing := CONNECT_DONE + * ep->total_connect_count ++ + * + * * Fill in a connection request: + * 1. Set connect protocol version and PSM versions + * 2. Set the uuid attached to current endpoint and add the job_pkey + * the node wishes to communicate post-connect. + * 3. Set our mtu, bitwidth and endianess to detect inconsistencies + * + */ + +static int +ips_proto_build_connect_message(struct ips_proto *proto, + ips_epaddr_t *ipsaddr, uint8_t opcode, void *payload, + size_t max_paylen); + +#ifdef RNDV_MOD_MR +/* on -1 errno is status + * EIO is connection error other values are more serious (invalid call, etc) + */ +static int is_rv_connected(ips_epaddr_t *ipsaddr) +{ + int ret; + + /* ! rv_conn means we don't need a rv connection, otherwise + * return status of the connection + */ + if (! ipsaddr->rv_conn || ipsaddr->rv_connected) + return 1; + ret = __psm2_rv_connected(ipsaddr->rv_conn); + if (ret < 0 && errno != EIO) { + int save_errno = errno; + perror("can't query rv connection\n"); + errno = save_errno; + } + ipsaddr->rv_connected = (1 == ret); + return ret; +} +#else // RNDV_MOD_MR +static inline int is_rv_connected(ips_epaddr_t *ipsaddr) { return 1; } +#endif // RNDV_MOD_MR +/** + * Configure flows for an ipsaddr. + * + * @arg ipsaddr - the ipsaddr to configure the flows for + * @arg proto - the protocol used + * + * @pre proto's flags must be set + * + * Flows should be configured: + * - immediately upon creation of an ipsaddr + * - whenever a connection is established and the receiver's characteristics + * (e.g. mtu) become known + */ +ustatic +void +ips_ipsaddr_configure_flows(struct ips_epaddr *ipsaddr, struct ips_proto *proto) +{ + /* PIO flow uses the normal priority path, to separate low + * priority path for bulk sdma data packets + */ + ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], proto, + ipsaddr, PSM_TRANSFER_PIO, PSM_PROTOCOL_GO_BACK_N, + IPS_PATH_NORMAL_PRIORITY, EP_FLOW_GO_BACK_N_PIO); + +} + +/* + * Teardown any unnecessary timers that could still be active and assign NULL + * to pointers in flow structs. We do this mainly for PIO and DMA flows. 
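+ * Concretely, for each flow this cancels any pending ACK and send timers
+ * and NULLs the flush, path and ipsaddr pointers so that a stale flow
+ * cannot be used once the connection is torn down.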
+ * TidFlow teardowns are conducted in ips_protoexp_fini() + */ +static +void +ips_flow_fini(struct ips_epaddr *ipsaddr, struct ips_proto *proto) +{ + struct ips_flow *flow; + int i; + + for (i = 0; i < EP_FLOW_LAST-1; i++) { + flow = &ipsaddr->flows[i]; + + /* Cancel any stale flow->timers in flight */ + if (flow->timer_ack) { + psmi_timer_cancel(proto->timerq, flow->timer_ack); + flow->timer_ack = NULL; + } + + if (flow->timer_send) { + psmi_timer_cancel(proto->timerq, flow->timer_send); + flow->timer_send = NULL; + } + + flow->flush = NULL; + flow->path = NULL; + flow->ipsaddr = NULL; + } +} + +static +psm2_epaddr_t +ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid, + const char *hostname, + unsigned long timeout, psm2_error_t *err_out); + +/* + * Given a connection request, set mtu, communication index and hdr length + * parameters. + * + * The most subtle parameter is the mtu. When set as 'req->mtu', the mtu + * is our connecting peer's declared mtu (which may not be the same as our + * mtu). The approach is to take the smaller of both mtus when communicating + * with that peer. Also, when using pio, the size can be further restricted by + * the pio send buffer sizes (i.e. 4K IB MTU but only 2K PIO buffers). + */ +static +psm2_error_t +ips_ipsaddr_set_req_params(struct ips_proto *proto, + ips_epaddr_t *ipsaddr, + const struct ips_connect_reqrep *req, + uint32_t paylen) +{ + psm2_ep_t ep; + psm2_epaddr_t epaddr; + psm2_error_t err = PSM2_OK; + int i, start, count; + uint64_t *data; + psmi_assert_always(req->mtu > 0); + // common_mtu will be further reduced by pr_mtu to set frag_size and RC mtu + uint16_t common_mtu = min(req->mtu, proto->epinfo.ep_mtu); + psmi_assert_always(req->static_rate > 0); + enum psm_ibv_rate common_rate = min_rate(req->static_rate, + proto->epinfo.ep_link_rate); + int ptype, pidx; + + ipsaddr->window_rv = proto->mq->hfi_base_window_rv; + + /* + * For static routes i.e. "none" path resolution update all paths to + * have the same profile (mtu, sl etc.). + * + * For path record queries the epr_mtu and epr_sl are setup correctly + * from the path itself. + */ + for (ptype = IPS_PATH_LOW_PRIORITY; + ptype < IPS_PATH_MAX_PRIORITY; ptype++) + for (pidx = 0; + pidx < ipsaddr->pathgrp->pg_num_paths[ptype]; pidx++) { + if (proto->ep->path_res_type == PSM2_PATH_RES_NONE) { + ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu = + common_mtu; + ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_static_rate = + common_rate; + } else { + ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu = + min(common_mtu, + ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_mtu); + ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_static_rate = + min_rate(common_rate, + ipsaddr->pathgrp->pg_path[pidx][ptype]->pr_static_rate); + } + } + + /* + * We've got updated mtu/path records, need to re-initialize the flows to take + * into account _real_ (updated) remote endpoint characteristics + */ + ips_ipsaddr_configure_flows(ipsaddr, proto); + + /* + * Save peer's info. + */ + ipsaddr->connidx_outgoing = req->connidx; + ipsaddr->runid_key = req->runid_key; + /* ipsaddr->initpsn = req->initpsn; */ + + err = + psmi_epid_set_hostname(psm2_epid_nid(((psm2_epaddr_t) ipsaddr)->epid), + (char *)req->hostname, 0); + if (err) + return err; + +#ifdef RNDV_MOD_MR + ipsaddr->remote_gid = req->gid; + ipsaddr->remote_rv_index = req->rv_index; + if (ipsaddr->rv_conn) { + psmi_assert(IPS_PROTOEXP_FLAG_KERNEL_QP(proto->ep->rdmamode)); + psmi_assert(proto->ep->verbs_ep.rv); + if (! 
__psm2_nonzero_gid(&req->gid)) { + _HFI_ERROR("mismatched PSM3_RDMA config, remote end not in mode 1\n"); + return PSM2_INTERNAL_ERR; + // TBD - if we wanted to allow mismatched config to run in UD mode + //__psm2_rv_destroy_conn(ipsaddr->rv_conn); + //ipsaddr->rv_conn = NULL; + } else { + // both sides are ready, so we can start rv_connect now + if (! ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->connecting) { + char buf[80]; + struct ib_user_path_rec path; + _HFI_MMDBG("rv_connect to: %s\n", __psm2_dump_gid(&ipsaddr->remote_gid, buf, sizeof(buf))); + // pg_path has negotiated pr_mtu and pr_static_rate + ips_path_rec_to_ib_user_path_rec(proto->ep, + ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY], + &ipsaddr->remote_gid, &path); + if (__psm2_rv_connect(ipsaddr->rv_conn, &path)) { + _HFI_ERROR("rv_connect failed: %s\n", strerror(errno)); + return PSM2_INTERNAL_ERR; + } + ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->connecting = 1; + } + } + // } else if (__psm2_nonzero_gid(&req->gid)) { + // We could fail here, but we just let remote end decide + // _HFI_ERROR("mismatched PSM3_RDMA config, remote end in mode 1\n"); + // return PSM2_INTERNAL_ERR; + } +#endif // RNDV_MOD_MR + if (ipsaddr->rc_qp) { + psmi_assert(IPS_PROTOEXP_FLAG_USER_RC_QP(proto->ep->rdmamode)); + psmi_assert(proto->ep->verbs_ep.rv + || proto->ep->mr_cache_mode != MR_CACHE_MODE_KERNEL); + if (! req->qp_attr.qpn) { + _HFI_ERROR("mismatched PSM3_RDMA config, remote end not in mode 2 or 3\n"); + return PSM2_INTERNAL_ERR; + // TBD - if we wanted to allow mismatched config to run in UD mode + //rc_qp_destroy(ipsaddr->rc_qp); + //ipsaddr->rc_qp = NULL; + } else { + // we got a REQ or a REP, we can move to RTR + // if we are only doing RDMA, we don't need any buffers, but we need a + // pool object for RQ coallesce, so we create a pool with 0 size buffers + if (PSM2_OK != psm_verbs_alloc_recv_pool(proto->ep, ipsaddr->rc_qp, &ipsaddr->recv_pool, + min(proto->ep->hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION, ipsaddr->rc_qp_max_recv_wr), + (proto->ep->rdmamode == IPS_PROTOEXP_FLAG_RDMA_USER)? 0 + // want to end up with multiple of cache line (64) + // pr_mtu is negotiated max PSM payload, not including hdrs + // pr_mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU + // be conservative (+BUFFER_HEADROOM) + : ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->pr_mtu + + MAX_PSM_HEADER + BUFFER_HEADROOM + )) { + _HFI_ERROR("failed to alloc RC recv buffers\n"); + return PSM2_INTERNAL_ERR; + } + + if (modify_rc_qp_to_init(proto->ep, ipsaddr->rc_qp)) { + _HFI_ERROR("qp_to_init failed\n"); + return PSM2_INTERNAL_ERR; + } + if (PSM2_OK != __psm2_ep_verbs_prepost_recv(&ipsaddr->recv_pool)) { + _HFI_ERROR("prepost failed\n"); + return PSM2_INTERNAL_ERR; + } + // RC QP MTU will be set to min of req->qp_attr and pr_mtu + // TBD - we already factored in req vs pr to update pr no need + // for modify_cq_qp_to_rtr to repeat it + if (modify_rc_qp_to_rtr(proto->ep, ipsaddr->rc_qp, &req->qp_attr, + ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY], //TBD path_rec + req->initpsn)) { + _HFI_ERROR("qp_to_rtr failed\n"); + return PSM2_INTERNAL_ERR; + } + } + // } else if (req->qp_attr.qpn) { + // We could fail here, but we just let remote end decide + // _HFI_ERROR("mismatched PSM3_RDMA config, remote end in mode 2 or 3\n"); + // return PSM2_INTERNAL_ERR; + } + + /* + * Check if there is other rails to setup. 
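+	 * Each additional rail is described by a 16 byte entry appended after
+	 * struct ips_connect_reqrep: a uint64_t subnet/gid_hi followed by the
+	 * uint64_t epid of the peer's slave endpoint. The remaining paylen must
+	 * therefore be a multiple of 16, with at most PSMI_MAX_QPS entries.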
+ */ + paylen -= sizeof(struct ips_connect_reqrep); + if (paylen == 0) + return PSM2_OK; + + /* + * Yes, other rail's gid/epid is attached. + */ + if (paylen % (sizeof(uint64_t) + sizeof(psm2_epid_t))) { + return PSM2_INTERNAL_ERR; + } + count = paylen / (sizeof(uint64_t) + sizeof(psm2_epid_t)); + if (count > PSMI_MAX_QPS) + return PSM2_INTERNAL_ERR; + + /* + * Both side are ordered, so just search from small to big. + */ + start = 0; + data = (uint64_t *) (req + 1); + ep = proto->ep->mctxt_next; + + struct drand48_data drand48_data; + srand48_r((long int)(ipsaddr->epaddr.epid + proto->ep->epid), &drand48_data); + + /* Loop over all slave endpoints */ + while (ep != ep->mctxt_master) { + for (i = start; i < count; i++) { + + /* There is a gid match, create the epaddr */ + if (data[2 * i] == ep->gid_hi + // allow_routers only applied to ethernet epids (V4) + || (psmi_allow_routers && PSMI_EPID_GET_EPID_VERSION(data[2*i+1]) == PSMI_EPID_V4) + ) { + + epaddr = + ips_alloc_epaddr(&((struct ptl_ips *)(ep->ptl_ips.ptl))->proto, 0, + data[2 * i + 1], NULL, + 5000, &err); + if (epaddr == NULL) + return err; + + /* link the ipsaddr */ + IPS_MCTXT_APPEND(ipsaddr, + (ips_epaddr_t *) epaddr); + + /* Setup message control info to the same struct */ + ((ips_epaddr_t *) epaddr)->msgctl = + ipsaddr->msgctl; + ipsaddr->msgctl->ipsaddr_count++; + + /* randomize the rail to start traffic */ + long int rnum; + lrand48_r(&drand48_data, &rnum); + if ((rnum % count) == i) { + ipsaddr->msgctl->ipsaddr_next = + (ips_epaddr_t *) epaddr; + } + + /* update the starting point, + * all previous ones are not valid anymore */ + start = i + 1; + break; + } + } + + ep = ep->mctxt_next; + } + + return PSM2_OK; +} + +static psm2_error_t +ips_proto_send_ctrl_message_request(struct ips_proto *proto, + struct ips_flow *flow, uint8_t message_type, + uint16_t *msg_queue_mask, uint64_t timeout) +{ + psm2_error_t err = PSM2_OK; + ips_scb_t ctrlscb; + + /* msg header plus gid+epid for all rails plus checksum */ + char payload[sizeof(struct ips_connect_reqrep) + + 16*PSMI_MAX_QPS + PSM_CRC_SIZE_IN_BYTES]; + uint32_t paylen; + + ctrlscb.scb_flags = 0; + paylen = ips_proto_build_connect_message(proto, + flow->ipsaddr, message_type, payload, sizeof(payload)); + psmi_assert_always(paylen <= sizeof(payload)); + + do { + err = ips_proto_send_ctrl_message(flow, message_type, + msg_queue_mask, &ctrlscb, payload, paylen); + if (err == PSM2_OK) { + break; + } + if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1)))) { + break; + } + } while (get_cycles() < timeout); + + return err; +} + +static psm2_error_t +ips_proto_send_ctrl_message_reply(struct ips_proto *proto, + struct ips_flow *flow, uint8_t message_type, + uint16_t *msg_queue_mask) +{ + /* This will try up to 100 times until the message is sent. The code + * is persistent because dropping replies will lead to a lack of + * overall progress on the connection/disconnection. We do not want + * to poll from here, and we cannot afford a lengthy timeout, since + * this is called from the receive path. 
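+	 * This differs from ips_proto_send_ctrl_message_request() above, which
+	 * polls the protocol between attempts and keeps retrying until a caller
+	 * supplied cycle-count timeout expires rather than for a fixed number
+	 * of tries.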
+ */ + psm2_error_t err = PSM2_OK; + int i; + ips_scb_t ctrlscb; + /* msg header plus gid+epid for all rails plus checksum */ + char payload[sizeof(struct ips_connect_reqrep) + + 16*PSMI_MAX_QPS + PSM_CRC_SIZE_IN_BYTES]; + uint32_t paylen; + + ctrlscb.scb_flags = 0; + paylen = ips_proto_build_connect_message(proto, + flow->ipsaddr, message_type, payload, sizeof(payload)); + psmi_assert_always(paylen <= sizeof(payload)); + + for (i = 0; i < 100; i++) { + err = ips_proto_send_ctrl_message(flow, message_type, + msg_queue_mask, &ctrlscb, payload, paylen); + if (err == PSM2_OK) { + break; + } + } + + return err; +} + +static int +ips_proto_build_connect_message(struct ips_proto *proto, + ips_epaddr_t *ipsaddr, uint8_t opcode, void *payload, + size_t max_paylen) +{ + struct ips_connect_hdr *hdr = (struct ips_connect_hdr *)payload; + struct ips_connect_reqrep *req = (struct ips_connect_reqrep *)payload; + uint32_t paylen = 0; + + psmi_assert_always(proto != NULL); + + hdr->connect_verno = IPS_CONNECT_VERNO; + hdr->psm_verno = PSMI_VERNO; + hdr->connidx = (uint32_t) ipsaddr->connidx_incoming; + hdr->epid = proto->ep->epid; + + switch (opcode) { + case OPCODE_CONNECT_REPLY: + case OPCODE_CONNECT_REQUEST: + if (opcode == OPCODE_CONNECT_REQUEST) { + req->connect_result = PSM2_OK; + req->runid_key = proto->runid_key; + } else { + req->connect_result = ipsaddr->cerror_incoming; + req->runid_key = ipsaddr->runid_key; + } + + req->sl = proto->epinfo.ep_sl; + // we keep this simple and send our local PSM payload (MTU) + // after connection negotiation of a common_mtu, the MTU will be + // further reduced by pr_mtu to set frag_size and RC QP mtu + req->mtu = proto->epinfo.ep_mtu; + req->job_pkey = proto->epinfo.ep_pkey; + + strncpy(req->hostname, psmi_gethostname(), + sizeof(req->hostname) - 1); + req->hostname[sizeof(req->hostname) - 1] = '\0'; + req->rdmamode = proto->ep->rdmamode & IPS_PROTOEXP_FLAG_RDMA_MASK; + req->static_rate = proto->epinfo.ep_link_rate; + memset(&req->reserved, 0, sizeof(req->reserved)); +#ifdef RNDV_MOD_MR + // only supply gid if we want to use kernel rv + if (IPS_PROTOEXP_FLAG_KERNEL_QP(proto->ep->rdmamode) + && proto->ep->verbs_ep.rv) { + req->gid = proto->ep->verbs_ep.lgid; + req->rv_index = proto->ep->verbs_ep.rv_index; + } else +#endif + { + memset(&req->gid, 0, sizeof(req->gid)); + req->rv_index = 0; + } + if (ipsaddr->rc_qp) { + psmi_assert(IPS_PROTOEXP_FLAG_USER_RC_QP(proto->ep->rdmamode)); + req->initpsn = proto->runid_key;// pid, not ideal, better than const + req->qp_attr.qpn = ipsaddr->rc_qp->qp_num; + req->qp_attr.mtu = opa_mtu_int_to_enum(req->mtu); + req->qp_attr.srq = 0; + req->qp_attr.resv = 0; + req->qp_attr.target_ack_delay = 0; // TBD; - from local device + req->qp_attr.resv2 = 0; + req->qp_attr.responder_resources = 0; + req->qp_attr.initiator_depth = 0; + memset(&req->qp_attr.resv3, 0, sizeof(req->qp_attr.resv3)); + } else + memset(&req->qp_attr, 0, sizeof(req->qp_attr)); + + paylen = sizeof(struct ips_connect_reqrep); + + /* Attach all multi-context subnetids and epids. 
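+		 * Only the master endpoint advertises its slave endpoints: for each
+		 * endpoint on the mctxt ring other than the master, a uint64_t gid_hi
+		 * and a uint64_t epid are appended to the payload, which the peer
+		 * parses in ips_ipsaddr_set_req_params() to build the extra rails.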
*/ + if (proto->ep->mctxt_master == proto->ep) { + psm2_ep_t ep = proto->ep->mctxt_next; + uint64_t *data = (uint64_t *) (req + 1); + while (ep != proto->ep) { + *data = ep->gid_hi; + paylen += sizeof(uint64_t); + data++; + *data = ep->epid; + paylen += sizeof(uint64_t); + data++; + psmi_assert_always(paylen <= max_paylen); + ep = ep->mctxt_next; + } + } + + break; + + case OPCODE_DISCONNECT_REQUEST: + case OPCODE_DISCONNECT_REPLY: + paylen = sizeof(struct ips_connect_hdr); + // TBD - this is redundant if transfer_frame uses UD for all + // control messages, but it also makes sure we stop using + // RC for any non-control messages (should be none) after disconnect + // use the UD QP's allocator and inline now and going forward + ipsaddr->use_allocator = &proto->ep->verbs_ep.send_allocator; + ipsaddr->use_qp = proto->ep->verbs_ep.qp; + ipsaddr->use_max_inline_data = proto->ep->verbs_ep.qp_cap.max_inline_data; + _HFI_MMDBG("RC discon\n"); + // ultimately we will free ipsaddr + // so that will free RC QP and its buffers + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unexpected/unhandled connection opcode 0x%x\n", + opcode); + break; + } + + return paylen; +} + +void +MOCKABLE(ips_flow_init)(struct ips_flow *flow, struct ips_proto *proto, + ips_epaddr_t *ipsaddr, psm_transfer_type_t transfer_type, + psm_protocol_type_t protocol, ips_path_type_t path_type, + uint32_t flow_index) +{ + psmi_assert_always(protocol < PSM_PROTOCOL_LAST); + psmi_assert_always(flow_index < EP_FLOW_LAST); + + SLIST_NEXT(flow, next) = NULL; + if (transfer_type == PSM_TRANSFER_PIO) { + flow->flush = ips_proto_flow_flush_pio; + } else { + flow->flush = ips_proto_flow_flush_dma; + } + + flow->path = + ips_select_path(proto, path_type, ipsaddr, ipsaddr->pathgrp); + + /* Select the fragment size for this flow. Flow is the common + * denominator between the local endpoint, the remote endpoint, + * the path between those and whether it's a PIO or DMA send. + * Hence, it "owns" the maximum transmission unit in its frag_size + * member. + */ + + /* min of local MTU and path MTU */ + flow->frag_size = min(proto->epinfo.ep_mtu, flow->path->pr_mtu); + _HFI_VDBG("[ipsaddr=%p] UD flow->frag_size: %u = min(" + "proto->epinfo.ep_mtu(%u), flow->path->pr_mtu(%u))\n", + ipsaddr, flow->frag_size, proto->epinfo.ep_mtu, + flow->path->pr_mtu); + + flow->ipsaddr = ipsaddr; + flow->transfer = transfer_type; + flow->protocol = protocol; + flow->flowid = flow_index; + flow->xmit_seq_num.psn_val = 0; + flow->recv_seq_num.psn_val = 0; + flow->xmit_ack_num.psn_val = 0; + flow->flags = 0; + flow->credits = flow->cwin = proto->flow_credits; + flow->ack_interval = max((proto->flow_credits >> 2) - 1, 1); + flow->ack_counter = 0; +#ifdef PSM_DEBUG + flow->scb_num_pending = 0; + flow->scb_num_unacked = 0; +#endif + + flow->timer_ack = NULL; + flow->timer_send = NULL; + + STAILQ_INIT(&flow->scb_unacked); + SLIST_INIT(&flow->scb_pend); + return; +} +MOCK_DEF_EPILOGUE(ips_flow_init); + +static +psm2_epaddr_t +ips_alloc_epaddr(struct ips_proto *proto, int master, psm2_epid_t epid, + const char *hostname, + unsigned long timeout, psm2_error_t *err_out) +{ + psm2_error_t err = PSM2_OK; + psm2_epaddr_t epaddr; + ips_epaddr_t *ipsaddr; + ips_path_grp_t *pathgrp; + uint16_t lid; + uint16_t ip_hi; + + /* The PSM/PTL-level epaddr, ips-level epaddr, and per-peer msgctl + * structures are collocated in memory for performance reasons -- this is + * why ips allocates memory for all three together. 
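+	 *
+	 * The casts below assume that master_epaddr is the first member of
+	 * struct ips_msgctl and that the psm2_epaddr sits at the start of
+	 * struct ips_epaddr, so the single allocation can be treated as any of
+	 * the three types.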
+ * + * The PSM/PTL structure data is filled in upon successfully ep connect in + * ips_ptl_connect(). + */ + if (master) { + struct ips_msgctl *msgctl; + _HFI_VDBG("ips_alloc_epaddr for EPID= 0x%"PRIx64"\n", epid); + + /* Although an ips_msgtl is allocated here, it can be safely casted to + both an ips_epaddr and a psm2_epaddr. It is eventually freed as an + ips_epaddr. */ + msgctl = + (struct ips_msgctl *)psmi_calloc(proto->ep, + PER_PEER_ENDPOINT, 1, + sizeof(struct ips_msgctl)); + if (msgctl == NULL) { + *err_out = PSM2_NO_MEMORY; + return NULL; + } + + ipsaddr = &msgctl->master_epaddr; + epaddr = (psm2_epaddr_t) ipsaddr; + + ipsaddr->msgctl = msgctl; + + /* initialize items in ips_msgctl_t */ + msgctl->ipsaddr_next = ipsaddr; + msgctl->mq_send_seqnum = 0; + msgctl->mq_recv_seqnum = 0; + msgctl->am_send_seqnum = 0; + msgctl->am_recv_seqnum = 0; + msgctl->ipsaddr_count = 1; + msgctl->outoforder_count = 0; + } else { + epaddr = + (psm2_epaddr_t) psmi_calloc(proto->ep, PER_PEER_ENDPOINT, 1, + sizeof(struct ips_epaddr)); + if (!epaddr) { + *err_out = PSM2_NO_MEMORY; + return NULL; + } + ipsaddr = (ips_epaddr_t *) epaddr; + } + + epaddr->ptlctl = ((struct ptl_ips *)(proto->ptl))->ctl; + epaddr->proto = proto; + epaddr->epid = epid; + + /* IPS-level epaddr */ + ipsaddr->next = ipsaddr; + + ipsaddr->ctrl_msg_queued = 0; + ipsaddr->msg_toggle = 0; + + ipsaddr->remote_qpn = PSMI_EPID_GET_CONTEXT(epid); + + /* Get path record for tuple */ + lid = PSMI_EPID_GET_LID(epid); + ip_hi = PSMI_EPID_GET_LID(epid) >> 16; + _HFI_UDDBG("qpn=0x%x lid=0x%x ip_hi=0x%x\n", ipsaddr->remote_qpn, lid, ip_hi); + + err = proto->ibta.get_path_rec(proto, proto->epinfo.ep_base_lid, + __cpu_to_be16(lid), + __cpu_to_be16(ip_hi), + timeout, + &pathgrp); + if (err != PSM2_OK) { + goto fail; + } + ipsaddr->pathgrp = pathgrp; + + /* Setup high priority path index, control messages use the high + * priority CONTROL path. + */ + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) + ipsaddr->hpp_index = 0; + else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST) + ipsaddr->hpp_index = ipsaddr->IPSADDR_HASH % + ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY]; + else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC) + ipsaddr->hpp_index = proto->epinfo.EP_HASH % + ipsaddr->pathgrp->pg_num_paths[IPS_PATH_HIGH_PRIORITY]; + else /* Base LID */ + ipsaddr->hpp_index = 0; + + if (IPS_PROTOEXP_FLAG_USER_RC_QP(proto->ep->rdmamode) +#ifdef RNDV_MOD_MR + // if verbs_ep allows us to open w/o rv_open then we can't use RC QP + && (proto->ep->verbs_ep.rv + || proto->ep->mr_cache_mode != MR_CACHE_MODE_KERNEL) +#endif + ) { + struct ibv_qp_cap qp_cap; + ipsaddr->rc_qp = rc_qp_create(proto->ep, ipsaddr, &qp_cap); + if (! 
ipsaddr->rc_qp) { + _HFI_ERROR("unable to create RC QP\n"); + err = PSM2_INTERNAL_ERR; + goto fail; + } + if ((proto->ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + // we need to make sure we can't overflow send Q + if (qp_cap.max_send_wr < proto->ep->verbs_ep.send_pool.send_total) { + _HFI_ERROR("RC QP Send Q too small\n"); + err = PSM2_INTERNAL_ERR; + goto fail; + } + } + ipsaddr->rc_qp_max_recv_wr = qp_cap.max_recv_wr; + ipsaddr->rc_qp_max_inline_data = qp_cap.max_inline_data; + if (PSM2_OK != psm_verbs_init_send_allocator(&ipsaddr->send_allocator, + &proto->ep->verbs_ep.send_pool)) { + _HFI_ERROR("can't init RC QP send allocator\n"); + err = PSM2_INTERNAL_ERR; + goto fail; + } + } + // until our QP is connected, use the UD QP's allocator and inline + ipsaddr->use_allocator = &proto->ep->verbs_ep.send_allocator; + ipsaddr->use_qp = proto->ep->verbs_ep.qp; + ipsaddr->use_max_inline_data = proto->ep->verbs_ep.qp_cap.max_inline_data; + +#ifdef RNDV_MOD_MR + if (IPS_PROTOEXP_FLAG_KERNEL_QP(proto->ep->rdmamode) + && proto->ep->verbs_ep.rv) { + struct ibv_ah_attr ah_attr; + + ipsaddr->rv_connected = 0; // redundant since we calloc above + // Right now we are not doing multi-pathing and + // multi-priority so using path 0 in LOW PRIORITY (TID RDMA) is ok + // we're going to share the same path with all processes. So we + // don't want to apply dispersive routing. Hence we don't use + //ips_select_path(proto, IPS_PATH_LOW_PRIORITY, ipsaddr, ipsaddr->pathgrp); + // nor do we use the ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].path which + // configure_flows will have setup similarly + // we only need 1 connn per remote node, can share same conn for + // all ipsaddr which go to same node. so we track rv_conn at + // path record level + if (ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->rv_conn) { + ipsaddr->rv_conn = ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->rv_conn; + } else { + err = ips_path_rec_to_ah_attr(proto->ep, + ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY], // ???? + &ah_attr); + if (PSM2_OK != err) { + _HFI_ERROR("unable to get ah from path\n"); + goto fail; + } + ipsaddr->rv_conn = __psm2_rv_create_conn(proto->ep->verbs_ep.rv, + &ah_attr, (uint32_t)PSMI_EPID_GET_LID(epid)); + ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->rv_conn = ipsaddr->rv_conn; + if (! ipsaddr->rv_conn) { + _HFI_ERROR("rv_create_conn failed: %s\n", strerror(errno)); + err = PSM2_INTERNAL_ERR; +//TBD - should we make this non-fatal? Just regress to UD mode and output ERROR + goto fail; + } + } + } +#endif // RNDV_MOD_MR + + /* + * Set up the flows on this ipsaddr + */ + ips_ipsaddr_configure_flows(ipsaddr, proto); + + /* clear connection state. 
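+	 * Both directions start out as CSTATE_NONE. The incoming side moves to
+	 * CSTATE_ESTABLISHED when a valid connect request is handled in
+	 * ptl_handle_connect_req() below, while the outgoing side is set to
+	 * CSTATE_OUTGOING_WAITING when a connect is issued and advances to
+	 * CSTATE_ESTABLISHED when the matching reply is processed in
+	 * ips_proto_process_connect().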
*/ + ipsaddr->cstate_outgoing = CSTATE_NONE; + ipsaddr->cstate_incoming = CSTATE_NONE; + + /* Add epaddr to PSM's epid table */ + psmi_epid_add(proto->ep, epaddr->epid, epaddr); + psmi_assert(psmi_epid_lookup(proto->ep, epaddr->epid) == epaddr); + + *err_out = PSM2_OK; + return epaddr; + +fail: + if (ipsaddr->rc_qp) { + rc_qp_destroy(ipsaddr->rc_qp); + ipsaddr->rc_qp = NULL; + } + psmi_free(epaddr); + *err_out = err; + return NULL; +} + +static +void ips_free_epaddr(psm2_epaddr_t epaddr, struct ips_proto *proto) +{ + ips_epaddr_t *ipsaddr = (ips_epaddr_t *) epaddr; + ips_flow_fini(ipsaddr, proto); + + _HFI_VDBG("epaddr=%p,ipsaddr=%p,connidx_incoming=%d\n", epaddr, ipsaddr, + ipsaddr->connidx_incoming); +#ifdef RNDV_MOD_MR + _HFI_MMDBG("free_epaddr\n"); + if (ipsaddr->rv_conn) { + //__psm2_rv_destroy_conn(ipsaddr->rv_conn); + // TBD - call rv_disconnect or maybe rv_destroy_conn + // TBD disconnect and free rv_conn + // TBD - can we do this in a synchronous manner? + // below we free epaddr, so we will lose track of rv_conn + // but maybe rndv module will track it enough that we don't have to + // here, provided we don't confuse ourselves with a discon resp + // because the rv_conn's content we will get in that callback + // may be pointing to a freed rv_conn or freed epaddr + // maybe just call rndv_mod to set context to 0? But could + // be races for callbacks and events already queued + } +#endif // RNDV_MOD_MR + if (ipsaddr->rc_qp) { + rc_qp_destroy(ipsaddr->rc_qp); + ipsaddr->rc_qp = NULL; + } + psm_verbs_free_recv_pool(&ipsaddr->recv_pool); + psmi_epid_remove(epaddr->proto->ep, epaddr->epid); + ips_epstate_del(epaddr->proto->epstate, ipsaddr->connidx_incoming); + psmi_free(epaddr); + return; +} + +static +psm2_error_t +ptl_handle_connect_req(struct ips_proto *proto, + psm2_epaddr_t epaddr, struct ips_connect_reqrep *req, + uint32_t paylen); + +psm2_error_t +ips_proto_process_connect(struct ips_proto *proto, uint8_t opcode, + struct ips_message_header *p_hdr, void *payload, + uint32_t paylen) +{ + struct ips_connect_hdr *hdr = (struct ips_connect_hdr *)payload; + psm2_epaddr_t epaddr; + ips_epaddr_t *ipsaddr; + psm2_error_t err = PSM2_OK; + + PSMI_LOCK_ASSERT(proto->mq->progress_lock); + + epaddr = psmi_epid_lookup(proto->ep, hdr->epid); + ipsaddr = epaddr ? (ips_epaddr_t *) epaddr : NULL; + + _HFI_CONNDBG("Conn Pkt Rcv'd: op=0x%02x from: 0x%lx to: 0x%lx\n", + opcode, hdr->epid, proto->ep->epid); + switch (opcode) { + case OPCODE_CONNECT_REQUEST: + proto->epaddr_stats.connect_req_recv++; + err = ptl_handle_connect_req(proto, epaddr, + (struct ips_connect_reqrep *)hdr, + paylen); + break; + + case OPCODE_CONNECT_REPLY: + { + struct ips_connect_reqrep *req = + (struct ips_connect_reqrep *)payload; + + proto->epaddr_stats.connect_rep_recv++; + if (!ipsaddr || req->runid_key != proto->runid_key) { + _HFI_PRDBG + ("Unknown connectrep (ipsaddr=%p, %d,%d) from epid 0x%"PRIx64": %s\n", + ipsaddr, req->runid_key, proto->runid_key, + hdr->epid, psmi_epaddr_fmt_addr(hdr->epid)); + } else if (ipsaddr->cstate_outgoing != CSTATE_OUTGOING_WAITING) { + /* possible dupe */ + _HFI_VDBG("connect dupe, expected %d got %d\n", + CSTATE_OUTGOING_WAITING, + ipsaddr->cstate_outgoing); + } else { + /* Reply to our request for connection (i.e. 
outgoing connection) */ + if (ipsaddr->cstate_incoming != CSTATE_ESTABLISHED) { + err = + ips_ipsaddr_set_req_params(proto, + ipsaddr, + req, + paylen); + if (err) + goto fail; + } + if (ipsaddr->rc_qp) { + psmi_assert(IPS_PROTOEXP_FLAG_USER_RC_QP(proto->ep->rdmamode)); + psmi_assert(req->qp_attr.qpn); // checked in set_req_params + // we got a a REP, we can move to RTS + if (modify_rc_qp_to_rts(proto->ep, ipsaddr->rc_qp, + &req->qp_attr, proto->runid_key)) { // initpsn we sent + _HFI_ERROR("qp_to_rts failed\n"); + return PSM2_INTERNAL_ERR; + } + if ((proto->ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + // use RC QPs for eager and RDMA + // now we can use our own send Q and send allocator + ipsaddr->use_allocator = &ipsaddr->send_allocator; + ipsaddr->use_qp = ipsaddr->rc_qp; + ipsaddr->use_max_inline_data = ipsaddr->rc_qp_max_inline_data; + _HFI_MMDBG("RC enabled\n"); + } + ipsaddr->rc_connected = 1; + } + ipsaddr->cstate_outgoing = CSTATE_ESTABLISHED; + ipsaddr->cerror_outgoing = req->connect_result; + } + } + break; + + case OPCODE_DISCONNECT_REQUEST: + { + ips_epaddr_t ipsaddr_f; /* fake a ptl addr */ + int epaddr_do_free = 0; + psmi_assert_always(paylen == + sizeof(struct ips_connect_hdr)); + _HFI_VDBG("Got a disconnect from %s\n", + psmi_epaddr_get_name(hdr->epid)); + proto->num_disconnect_requests++; + proto->epaddr_stats.disconnect_req_recv++; + /* It's possible to get a disconnection request on a ipsaddr that + * we've since removed if the request is a dupe. Instead of + * silently dropping the packet, we "echo" the request in the + * reply. */ + if (ipsaddr == NULL) { + ips_path_grp_t *pathgrp; + uint16_t lid; + uint16_t ip_hi; + + ipsaddr = &ipsaddr_f; + memset(&ipsaddr_f, 0, sizeof(ips_epaddr_t)); + ipsaddr_f.IPSADDR_HASH = + PSMI_EPID_GET_CONTEXT(hdr->epid); + + /* Get path record for peer */ + lid = PSMI_EPID_GET_LID(hdr->epid); + ip_hi = PSMI_EPID_GET_LID(hdr->epid) >> 16; + err = proto->ibta.get_path_rec(proto, + proto->epinfo. + ep_base_lid, + __cpu_to_be16(lid), + __cpu_to_be16(ip_hi), + 3000, &pathgrp); + if (err != PSM2_OK) + goto fail; + + ipsaddr_f.pathgrp = pathgrp; + ((psm2_epaddr_t) &ipsaddr_f)->ptlctl = + ((struct ptl_ips *)(proto->ptl))->ctl; + ((psm2_epaddr_t) &ipsaddr_f)->proto = proto; + /* If the send fails because of pio_busy, don't let ips queue + * the request on an invalid ipsaddr, just drop the reply */ + ipsaddr_f.ctrl_msg_queued = ~0; + + psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); + + ips_flow_init(&ipsaddr_f. 
+ flows[proto->msgflowid], proto, + &ipsaddr_f, PSM_TRANSFER_PIO, + PSM_PROTOCOL_GO_BACK_N, + IPS_PATH_LOW_PRIORITY, + EP_FLOW_GO_BACK_N_PIO); + _HFI_VDBG + ("Disconnect on unknown epaddr, just echo request\n"); + } else if (ipsaddr->cstate_incoming != CSTATE_NONE) { + ipsaddr->cstate_incoming = CSTATE_NONE; + proto->num_connected_incoming--; + if (ipsaddr->cstate_outgoing == CSTATE_NONE) { + epaddr_do_free = 1; + } + } + + psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); + // TBD - this is redundant if transfer_frame uses UD for all + // control messages, but it also makes sure we stop using + // RC for any non-control messages (should be none) after disconnect + // use the UD QP's allocator and inline now and going forward + ipsaddr->use_allocator = &proto->ep->verbs_ep.send_allocator; + ipsaddr->use_qp = proto->ep->verbs_ep.qp; + ipsaddr->use_max_inline_data = proto->ep->verbs_ep.qp_cap.max_inline_data; + _HFI_MMDBG("RC discon\n"); + // we will free ipsaddr below for all but "fake ipsaddr" case + // so that will free RC QP and its buffers + + ips_proto_send_ctrl_message_reply(proto, &ipsaddr-> + flows[proto-> + msgflowid], + OPCODE_DISCONNECT_REPLY, + &ipsaddr-> + ctrl_msg_queued); + /* We can safely free the ipsaddr if required since disconnect + * messages are never enqueued so no reference to ipsaddr is kept */ + if (epaddr_do_free) { + ips_free_epaddr(epaddr, proto); + epaddr = NULL; + } + } + break; + + case OPCODE_DISCONNECT_REPLY: + proto->epaddr_stats.disconnect_rep_recv++; + if (!ipsaddr) { + _HFI_CONNDBG + ("Unknown disconnect reply from epid 0x%"PRIx64": %s\n", + hdr->epid, psmi_epaddr_fmt_addr(hdr->epid)); + break; + } else if (ipsaddr->cstate_outgoing == CSTATE_OUTGOING_WAITING_DISC) { + ipsaddr->cstate_outgoing = CSTATE_OUTGOING_DISCONNECTED; + /* Freed in disconnect() if cstate_incoming == NONE */ + } /* else dupe reply */ + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unexpected/unhandled connect opcode 0x%x\n", + opcode); + } + +fail: + return err; +} + +static +psm2_error_t +ptl_handle_connect_req(struct ips_proto *proto, psm2_epaddr_t epaddr, + struct ips_connect_reqrep *req, uint32_t paylen) +{ + ips_epaddr_t *ipsaddr; + psm2_error_t err = PSM2_OK; + uint16_t connect_result; + int newconnect = 0; + + if (req->epid == proto->ep->epid) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_EPID_NETWORK_ERROR, + "Network connectivity problem: Locally detected duplicate " + "LIDs 0x%04x on hosts %s and %s. (Exiting)", + (uint32_t) psm2_epid_nid(req->epid), + psmi_epaddr_get_hostname(req->epid), + psmi_gethostname()); + /* XXX no return */ + abort(); + } else if (epaddr == NULL) { /* new ep connect before we call into connect */ + newconnect = 1; + if ((epaddr = + ips_alloc_epaddr(proto, 1, req->epid, req->hostname, + 5000, &err)) == NULL) { + goto fail; + } + } else if (((ips_epaddr_t *) epaddr)->cstate_incoming == CSTATE_ESTABLISHED) { + ipsaddr = (ips_epaddr_t *) epaddr; + /* Duplicate lid detection. */ + if (ipsaddr->runid_key == req->runid_key) + goto do_reply; /* duplicate request, not duplicate lid */ + else { /* Some out of context message. Just drop it */ + if (!proto->done_warning) { + psmi_syslog(proto->ep, 1, LOG_INFO, + "Non-fatal connection problem: Received an out-of-context " + "connection message from host %s LID=0x%x context=%d. 
(Ignoring)", + req->hostname, + (int)psm2_epid_nid(req->epid), + psm2_epid_context(req->epid)); + proto->done_warning = 1; + } + goto no_reply; + } + } else if (((ips_epaddr_t *) epaddr)->cstate_outgoing == CSTATE_NONE) { + /* pre-created epaddr in multi-rail */ + psmi_assert_always(epaddr->proto->ep != + epaddr->proto->ep->mctxt_master); + newconnect = 1; + } + + ipsaddr = (ips_epaddr_t *) epaddr; + psmi_assert_always(ipsaddr->cstate_incoming == CSTATE_NONE); + + /* Check connect version and psm version */ + // for now we are strict about major rev, if we add additional optional + // features they can be minor revs and may need more sophisticated handling + if ((req->connect_verno >>8) != (IPS_CONNECT_VERNO >>8)) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_EPID_INVALID_VERSION, + "Connect protocol (%x,%x) is incompatible with %x.%x", + (req->connect_verno >> 8) & 0xff, + req->connect_verno & 0xff, + (IPS_CONNECT_VERNO >> 8) & 0xff, + IPS_CONNECT_VERNO & 0xff); + connect_result = PSM2_EPID_INVALID_CONNECT; + } else if (!psmi_verno_isinteroperable(req->psm_verno)) { + connect_result = PSM2_EPID_INVALID_VERSION; + } else if (!(proto->flags & IPS_PROTO_FLAG_QUERY_PATH_REC) && + proto->epinfo.ep_pkey != HFI_DEFAULT_P_KEY && + proto->epinfo.ep_pkey != req->job_pkey) { + connect_result = PSM2_EPID_INVALID_PKEY; + } else if (req->sl != proto->epinfo.ep_sl) { + connect_result = PSM2_EPID_INVALID_CONNECT; + _HFI_ERROR("Connection error: Service Level mismatch (local:%d, remote:%d)\n", proto->epinfo.ep_sl, req->sl); + } else if (req->rdmamode != (proto->ep->rdmamode & IPS_PROTOEXP_FLAG_RDMA_MASK) ) { + connect_result = PSM2_EPID_INVALID_CONNECT; + _HFI_ERROR("Connection error: RDMA Mode mismatch (local:%d, remote:%d)\n", + (proto->ep->rdmamode & IPS_PROTOEXP_FLAG_RDMA_MASK), req->rdmamode); + } else { + connect_result = PSM2_OK; + if (ipsaddr->cstate_outgoing == CSTATE_NONE) { + ips_epstate_idx idx; + psmi_assert_always(newconnect == 1); + err = ips_epstate_add(proto->epstate, ipsaddr, &idx); + if (err) + goto fail; + ipsaddr->connidx_incoming = idx; + } + } + + /* Incoming connection request */ + if (ipsaddr->cstate_outgoing != CSTATE_ESTABLISHED) { + err = ips_ipsaddr_set_req_params(proto, ipsaddr, req, paylen); + if (err) + goto fail; + } + ipsaddr->cstate_incoming = CSTATE_ESTABLISHED; + ipsaddr->cerror_incoming = connect_result; + + ipsaddr->runid_key = req->runid_key; + + proto->num_connected_incoming++; + +do_reply: + _HFI_CONNDBG("Conn Pkt Sent: op=0x%02x from: 0x%lx to: 0x%lx\n", + OPCODE_CONNECT_REPLY, proto->ep->epid, ipsaddr->epaddr.epid); + psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); + ips_proto_send_ctrl_message_reply(proto, + &ipsaddr->flows[proto->msgflowid], + OPCODE_CONNECT_REPLY, + &ipsaddr->ctrl_msg_queued); +no_reply: +fail: + return err; +} + +psm2_error_t +ips_proto_connect(struct ips_proto *proto, int numep, + const psm2_epid_t *array_of_epid, + const int *array_of_epid_mask, psm2_error_t *array_of_errors, + psm2_epaddr_t *array_of_epaddr, uint64_t timeout_in) +{ + int i, n, n_first; + psm2_error_t err = PSM2_OK; + psm2_epaddr_t epaddr; + ips_epaddr_t *ipsaddr; + ips_epstate_idx idx; + int numep_toconnect = 0, numep_left; + union psmi_envvar_val credits_intval; + int connect_credits; + + psmi_getenv("PSM3_CONNECT_CREDITS", + "End-point connect request credits.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)100, &credits_intval); + + connect_credits = credits_intval.e_uint; + + PSMI_LOCK_ASSERT(proto->mq->progress_lock); + + /* All 
timeout values are in cycles */ + uint64_t t_start = get_cycles(); + /* Print a timeout at the warning interval */ + union psmi_envvar_val warn_intval; + uint64_t to_warning_interval; + uint64_t to_warning_next; + + /* Setup warning interval */ + psmi_getenv("PSM3_CONNECT_WARN_INTERVAL", + "Period in seconds to warn if connections are not completed." + "Default is 300 seconds, 0 to disable", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)300, &warn_intval); + + to_warning_interval = nanosecs_to_cycles(warn_intval.e_uint * SEC_ULL); + to_warning_next = t_start + to_warning_interval; + + /* Some sanity checks */ + psmi_assert_always(array_of_epid_mask != NULL); + + /* First pass: make sure array of errors is at least fully defined */ + for (i = 0; i < numep; i++) { + _HFI_VDBG("epid-connect=%s connect to epid 0x%"PRIx64": %s\n", + array_of_epid_mask[i] ? "YES" : " NO", + array_of_epid[i], + psmi_epaddr_fmt_addr(array_of_epid[i])); + if (array_of_epid_mask[i]) { + array_of_errors[i] = PSM2_EPID_UNKNOWN; + array_of_epaddr[i] = NULL; + } + } + + /* Second pass: see what to connect and what is connectable. */ + for (i = 0, numep_toconnect = 0; i < numep; i++) { + if (!array_of_epid_mask[i]) + continue; + + /* Can't send to epid on same lid if not loopback */ + if ((psm2_epid_nid(proto->ep->epid) == + psm2_epid_nid(array_of_epid[i])) && + !(proto->flags & IPS_PROTO_FLAG_LOOPBACK)) { + array_of_errors[i] = PSM2_EPID_UNREACHABLE; + continue; + } + + if ((PSMI_EPID_VERSION == PSMI_EPID_V3 + || (PSMI_EPID_VERSION == PSMI_EPID_V4 && ! psmi_allow_routers)) + && (PSMI_GET_SUBNET_ID(proto->ep->gid_hi) != + PSMI_EPID_GET_SUBNET_ID(array_of_epid[i]))) { + char buf1[INET_ADDRSTRLEN]; + char buf2[INET_ADDRSTRLEN]; + if (PSMI_EPID_VERSION == PSMI_EPID_V3) + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + " Trying to connect to a node (subnet id - %"PRIx64") on a" + " different subnet - %"PRIx64"\n", + PSMI_GET_SUBNET_ID(proto->ep->gid_hi), + (uint64_t)PSMI_EPID_GET_SUBNET_ID(array_of_epid[i])); + else // V4 + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + " Trying to connect to a node (subnet %s) on a" + " different subnet %s\n", + psmi_ipv4_ntop((uint32_t)PSMI_GET_SUBNET_ID(proto->ep->gid_hi), buf1, sizeof(buf1)), + psmi_ipv4_ntop((uint32_t)PSMI_EPID_GET_SUBNET_ID(array_of_epid[i]), buf2, sizeof(buf2))); + } + + epaddr = psmi_epid_lookup(proto->ep, array_of_epid[i]); + if (epaddr == NULL) { + /* We're sending a connect request message before some other node + * has sent its connect message */ + // so we lack it's hostname, rv and qpn info + epaddr = ips_alloc_epaddr(proto, 1, array_of_epid[i], + NULL, + (timeout_in / 1000000UL), &err); + if (epaddr == NULL) { + _HFI_ERROR("Unable to issue connect to %s: %s\n", + psmi_epaddr_get_name(array_of_epid[i]), + psm2_error_get_string(err)); + goto fail; + } + ipsaddr = (ips_epaddr_t *) epaddr; + err = ips_epstate_add(proto->epstate, ipsaddr, &idx); + if (err) + goto fail; + ipsaddr->connidx_incoming = idx; + } else if (((ips_epaddr_t *) epaddr)->cstate_outgoing != CSTATE_NONE) { /* already connected */ + psmi_assert_always(((ips_epaddr_t *) epaddr)-> + cstate_outgoing == CSTATE_ESTABLISHED); + array_of_errors[i] = PSM2_EPID_ALREADY_CONNECTED; + array_of_epaddr[i] = epaddr; + continue; + } else if (((ips_epaddr_t *) epaddr)->cstate_incoming == + CSTATE_NONE) { + /* pre-created epaddr in multi-rail */ + psmi_assert_always(epaddr->proto->ep != + epaddr->proto->ep->mctxt_master); + ipsaddr = (ips_epaddr_t *) epaddr; + err = 
ips_epstate_add(proto->epstate, ipsaddr, &idx); + if (err) + goto fail; + ipsaddr->connidx_incoming = idx; + } else { + /* We've already received a connect request message from a remote + * peer, it's time to send our own. */ + ipsaddr = (ips_epaddr_t *) epaddr; + /* No re-entrancy sanity check and makes sure we are not connected + * twice (caller's precondition) */ + psmi_assert(ipsaddr->cstate_outgoing == CSTATE_NONE); + psmi_assert(ipsaddr->cstate_incoming != CSTATE_NONE); + } + + ipsaddr->cstate_outgoing = CSTATE_OUTGOING_WAITING; + ipsaddr->cerror_outgoing = PSM2_OK; + array_of_epaddr[i] = epaddr; + ipsaddr->s_timeout = get_cycles(); + ipsaddr->delay_in_ms = 1; + ipsaddr->credit = 0; + numep_toconnect++; + } + + /* Second pass: do the actual connect. + * PSM2_EPID_UNKNOWN: Not connected yet. + * PSM2_EPID_UNREACHABLE: Not to be connected. + * PSM2_OK: Successfully connected. + * Start sending connect messages at a random index between 0 and numep-1 + */ + numep_left = numep_toconnect; + n_first = ((uint32_t) get_cycles()) % numep; + while (numep_left > 0) { + for (n = 0; n < numep; n++) { + int keep_polling = 1; + i = (n_first + n) % numep; + if (!array_of_epid_mask[i]) + continue; + switch (array_of_errors[i]) { + case PSM2_EPID_UNREACHABLE: + case PSM2_EPID_ALREADY_CONNECTED: + case PSM2_OK: + continue; + default: + break; + } + psmi_assert_always(array_of_epaddr[i] != NULL); + ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; + if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED) { + if (ipsaddr->credit) { + connect_credits++; + ipsaddr->credit = 0; + } + switch (is_rv_connected(ipsaddr)) { + case 1: + /* This is not the real error code, we only set OK here + * so we know to stop polling for the reply. The actual + * error is in ipsaddr->cerror_outgoing */ + array_of_errors[i] = PSM2_OK; + numep_left--; + continue; + break; + case 0: + // fall through to "keep_polling" loop below to check timers + break; + default: + /* This is not the real error code, we only set OK here + * so we know to stop polling for the reply. The actual + * error is in ipsaddr->cerror_outgoing */ + array_of_errors[i] = PSM2_OK; + numep_left--; + if (ipsaddr->cerror_outgoing == PSM2_OK) + ipsaddr->cerror_outgoing = PSM2_EPID_RV_CONNECT_ERROR; + // EIO is connect error + if (errno != EIO) { + err = PSM2_INTERNAL_ERR; + goto fail; // serious error + } + continue; + break; + } + } + while (keep_polling) { + if (!psmi_cycles_left(t_start, timeout_in)) { + err = PSM2_TIMEOUT; + goto err_timeout; + } + if (to_warning_interval + && get_cycles() >= to_warning_next) { +#if _HFI_DEBUGGING + uint64_t waiting_time = 0; + if (_HFI_INFO_ON) { + waiting_time = cycles_to_nanosecs( + get_cycles() - + t_start) / SEC_ULL; + } +#endif + const char *first_name = NULL; + int num_waiting = 0; + + for (i = 0; i < numep; i++) { + if (!array_of_epid_mask[i] || + array_of_errors[i] != + PSM2_EPID_UNKNOWN) + continue; + if (!first_name) + first_name = + psmi_epaddr_get_name + (array_of_epid[i]); + num_waiting++; + } + if (_HFI_INFO_ON) { + if (first_name) { + _HFI_INFO_ALWAYS + ("Couldn't connect to %s (and %d others). " + "Time elapsed %02i:%02i:%02i. 
Still trying...\n", + first_name, num_waiting, + (int)(waiting_time / 3600), + (int)((waiting_time / 60) - + ((waiting_time / + 3600) * 60)), + (int)(waiting_time - + ((waiting_time / + 60) * 60))); + } + } + to_warning_next = + get_cycles() + to_warning_interval; + } + if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED) { + // just waiting for rv to be connected + if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1)))) + goto fail; + break; // let outer loop start another REQ + } + + if (get_cycles() > ipsaddr->s_timeout) { + if (!ipsaddr->credit && connect_credits) { + ipsaddr->credit = 1; + connect_credits--; + } + if (ipsaddr->credit) { + _HFI_CONNDBG("Conn Pkt Sent: op=0x%02x from: 0x%lx to: 0x%lx\n", + OPCODE_CONNECT_REQUEST, proto->ep->epid, ipsaddr->epaddr.epid); + psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); + if ( + ips_proto_send_ctrl_message_request + (proto, &ipsaddr-> + flows[proto->msgflowid], + OPCODE_CONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + 0) == PSM2_OK) { + keep_polling = 0; + ipsaddr->delay_in_ms = + min(100, + ipsaddr-> + delay_in_ms << + 1); + ipsaddr->s_timeout = + get_cycles() + + nanosecs_to_cycles + (ipsaddr-> + delay_in_ms * + MSEC_ULL); + } + /* If not, send got "busy", keep trying */ + } else { + keep_polling = 0; + } + } + + if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1)))) + goto fail; + + if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED) { + connect_credits++; + ipsaddr->credit = 0; + switch (is_rv_connected(ipsaddr)) { + case 1: + /* This is not the real error code, we only set OK here + * so we know to stop polling for the reply. The actual + * error is in ipsaddr->cerror_outgoing */ + array_of_errors[i] = PSM2_OK; + numep_left--; + break; + case 0: + break; + default: + /* This is not the real error code, we only set OK here + * so we know to stop polling for the reply. The actual + * error is in ipsaddr->cerror_outgoing */ + array_of_errors[i] = PSM2_OK; + numep_left--; + if (ipsaddr->cerror_outgoing == PSM2_OK) + ipsaddr->cerror_outgoing = PSM2_EPID_RV_CONNECT_ERROR; + // EIO is connect error + if (errno != EIO) { + err = PSM2_INTERNAL_ERR; + goto fail; // serious error + } + break; + } + // even if ! rv_connected, let outer loop start next REQ + break; + } + } + } + } + +err_timeout: + /* Find the worst error to report */ + for (i = 0; i < numep; i++) { + if (!array_of_epid_mask[i]) + continue; + switch (array_of_errors[i]) { + /* These are benign */ + case PSM2_EPID_UNREACHABLE: + case PSM2_EPID_ALREADY_CONNECTED: + break; + case PSM2_EPID_UNKNOWN: + array_of_errors[i] = PSM2_TIMEOUT; + err = psmi_error_cmp(err, PSM2_TIMEOUT); + _HFI_CONNDBG("EP has timed out on connect.\n"); + break; + case PSM2_OK: + /* Restore the real connect error */ + ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; + array_of_errors[i] = psmi_error_cmp(ipsaddr->cerror_outgoing, + ipsaddr->cerror_incoming); + psmi_assert_always(ipsaddr->cstate_outgoing == + CSTATE_ESTABLISHED); + if (array_of_errors[i] != PSM2_OK) { + err = psmi_error_cmp(err, array_of_errors[i]); + ips_free_epaddr(array_of_epaddr[i], proto); + array_of_epaddr[i] = NULL; + } else { + proto->num_connected_outgoing++; + psmi_assert_always(ipsaddr->pathgrp-> + pg_path[0] + [IPS_PATH_HIGH_PRIORITY]-> + pr_mtu > 0); + } + break; + default: + _HFI_CONNDBG("EP has error code %d\n", array_of_errors[i]); + break; + } + } + +fail: + return err; +} + +/* Repercussions on MQ. 
+ * + * If num_connected==0, everything that exists in the posted queue should + * complete and the error must be marked epid_was_closed. + * + */ + +psm2_error_t +ips_proto_disconnect(struct ips_proto *proto, int force, int numep, + psm2_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm2_error_t array_of_errors[], uint64_t timeout_in) +{ + ips_epaddr_t *ipsaddr; + int numep_left, numep_todisc, i, n; + int n_first; + int has_pending; + uint64_t timeout; + psm2_error_t err = PSM2_OK; + uint64_t reqs_sent = 0; + union psmi_envvar_val credits_intval; + int disconnect_credits; + uint64_t t_warning, t_start; + union psmi_envvar_val warn_intval; + unsigned warning_secs; + + /* In case of a forced close, we cancel whatever timers are pending + * on the proto so that we don't have zombie timers coming back + * after the internal structures of PSM2 have been destroyed + */ + if (force) { + struct psmi_timer *t_cursor; + TAILQ_FOREACH(t_cursor, &proto->timerq->timerq, timer) { + psmi_timer_cancel(proto->timerq, t_cursor); + } + } + + psmi_assert_always(numep > 0); + + psmi_getenv("PSM3_DISCONNECT_CREDITS", + "End-point disconnect request credits.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)100, &credits_intval); + + disconnect_credits = credits_intval.e_uint; + + /* Setup warning interval */ + psmi_getenv("PSM3_DISCONNECT_WARN_INTERVAL", + "Period in seconds to warn if disconnections are not completed." + "Default is 300 seconds, 0 to disable.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)300, &warn_intval); + + warning_secs = warn_intval.e_uint; + + PSMI_LOCK_ASSERT(proto->mq->progress_lock); + + /* First pass: see what to disconnect and what is disconnectable */ + for (i = 0, numep_todisc = 0; i < numep; i++) { + if (!array_of_epaddr_mask[i]) + continue; + psmi_assert_always(array_of_epaddr[i]->ptlctl->ptl == + proto->ptl); + ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; + ipsaddr->credit = 0; + if (ipsaddr->cstate_outgoing == CSTATE_NONE) { + array_of_errors[i] = PSM2_OK; + continue; + } else { + psmi_assert_always(ipsaddr->cstate_outgoing == + CSTATE_ESTABLISHED); + } + _HFI_VDBG("disconnecting %p\n", array_of_epaddr[i]); + array_of_errors[i] = PSM2_EPID_UNKNOWN; + numep_todisc++; + } + if (numep_todisc == 0) + goto success; + + /* Wait for everyone to ack previous packets before putting */ + if (timeout_in == 0) + timeout = ~0ULL; + else + timeout = get_cycles() + nanosecs_to_cycles(timeout_in); + + t_start = get_cycles(); + t_warning = t_start + nanosecs_to_cycles(warning_secs * SEC_ULL); + + n_first = ((uint32_t) get_cycles()) % numep; + if (!force) { + numep_left = numep_todisc; + do { + for (n = 0; n < numep; n++) { + i = (n_first + n) % numep; + if (!array_of_epaddr_mask[i] + || array_of_errors[i] == PSM2_OK) + continue; + ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; + switch (ipsaddr->cstate_outgoing) { + case CSTATE_OUTGOING_DISCONNECTED: + array_of_errors[i] = PSM2_OK; + numep_left--; + disconnect_credits++; + ipsaddr->credit = 0; + continue; + case CSTATE_OUTGOING_WAITING_DISC: + if (ipsaddr->s_timeout > get_cycles()) + continue; + ipsaddr->delay_in_ms = + min(100, ipsaddr->delay_in_ms << 1); + ipsaddr->s_timeout = get_cycles() + + nanosecs_to_cycles(ipsaddr-> + delay_in_ms * + MSEC_ULL); + psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); + ips_proto_send_ctrl_message_request + (proto, + &ipsaddr->flows[proto->msgflowid], + OPCODE_DISCONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + timeout); + 
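
Both the connect and disconnect retry paths above pace their control-message retransmits with the same doubling-with-cap update, delay_in_ms = min(100, delay_in_ms << 1). The standalone sketch below is not part of the patch (the program and its output format are invented purely for illustration); it only prints the resulting schedule so the pacing is easy to see: starting from 1 ms, the wait doubles on each retry and saturates at 100 ms.

#include <stdio.h>

int main(void)
{
	unsigned delay_ms = 1;	/* ipsaddr->delay_in_ms starts at 1 */
	unsigned total_ms = 0;
	int attempt;

	for (attempt = 1; attempt <= 10; attempt++) {
		/* same update as the connect/disconnect retry paths above */
		delay_ms = delay_ms << 1;
		if (delay_ms > 100)
			delay_ms = 100;
		total_ms += delay_ms;
		printf("retry %2d: wait %3u ms (cumulative %4u ms)\n",
		       attempt, delay_ms, total_ms);
	}
	return 0;
}
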
reqs_sent++; + break; + case CSTATE_ESTABLISHED: + /* Still pending acks, hold off for now */ + has_pending = + !STAILQ_EMPTY(&ipsaddr->flows + [EP_FLOW_GO_BACK_N_PIO]. + scb_unacked) + ; + if (has_pending) + continue; + if (!ipsaddr->credit + && disconnect_credits) { + ipsaddr->credit = 1; + disconnect_credits--; + } + if (!ipsaddr->credit) + continue; + ipsaddr->delay_in_ms = 1; + ipsaddr->cstate_outgoing = + CSTATE_OUTGOING_WAITING_DISC; + ipsaddr->s_timeout = + get_cycles() + + nanosecs_to_cycles(MSEC_ULL); + psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); + ips_proto_send_ctrl_message_request + (proto, + &ipsaddr->flows[proto->msgflowid], + OPCODE_DISCONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + timeout); + reqs_sent++; + break; + default: + psmi_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Unhandled/unknown close state %d", + ipsaddr->cstate_outgoing); + break; + } + } + if (numep_left == 0) + break; + + if ((err = + psmi_err_only(psmi_poll_internal(proto->ep, 1)))) + goto fail; + + if (warning_secs && get_cycles() > t_warning) { + _HFI_INFO + ("graceful close in progress for %d/%d peers " + "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n", + numep_left, numep_todisc, + (int)(cycles_to_nanosecs + (get_cycles() - t_start) / MSEC_ULL), + (int)(timeout_in / MSEC_ULL), + (unsigned long long)reqs_sent); + t_warning = + get_cycles() + + nanosecs_to_cycles(warning_secs * SEC_ULL); + } + } + while (timeout > get_cycles()); + + if (numep_left > 0) { + err = PSM2_TIMEOUT; + for (i = 0; i < numep; i++) { + if (!array_of_epaddr_mask[i]) + continue; + if (array_of_errors[i] == PSM2_EPID_UNKNOWN) { + array_of_errors[i] = PSM2_TIMEOUT; + _HFI_VDBG + ("disc timeout on index %d, epaddr %s\n", + i, + psmi_epaddr_get_name + (array_of_epaddr[i]->epid)); + } + } + _HFI_PRDBG("graceful close incomplete for %d/%d peers " + "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n", + numep_left, numep_todisc, + (int)(cycles_to_nanosecs + (get_cycles() - t_start) / MSEC_ULL), + (int)(timeout_in / MSEC_ULL), + (unsigned long long)reqs_sent); + } else + _HFI_PRDBG + ("graceful close complete from %d peers in %d millisecs, reqs_sent=%lld\n", + numep_todisc, + (int)(cycles_to_nanosecs(get_cycles() - t_start) / + MSEC_ULL), (unsigned long long)reqs_sent); + } else { + psmi_assert_always(proto->msgflowid < EP_FLOW_LAST); + for (n = 0; n < numep; n++) { + i = (n_first + n) % numep; + if (!array_of_epaddr_mask[i]) + continue; + ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; + psmi_assert_always(ipsaddr->cstate_outgoing == + CSTATE_ESTABLISHED); + ips_proto_send_ctrl_message_request(proto, &ipsaddr-> + flows[proto->msgflowid], + OPCODE_DISCONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + 0); + /* Force state to DISCONNECTED */ + ipsaddr->cstate_outgoing = CSTATE_OUTGOING_DISCONNECTED; + array_of_errors[i] = PSM2_OK; + } + _HFI_VDBG("non-graceful close complete from %d peers\n", numep); + } + + for (i = 0; i < numep; i++) { + if (!array_of_epaddr_mask[i] || array_of_errors[i] != PSM2_OK) + continue; + ipsaddr = (ips_epaddr_t *) array_of_epaddr[i]; + if (ipsaddr->cstate_outgoing == CSTATE_NONE) + continue; + psmi_assert_always(ipsaddr->cstate_outgoing == + CSTATE_OUTGOING_DISCONNECTED); + proto->num_connected_outgoing--; + /* Remote disconnect req arrived already, remove this epid. If it + * hasn't arrived yet, that's okay, we'll pick it up later and just + * mark our connect-to status as being "none". 
*/ + if (ipsaddr->cstate_incoming == CSTATE_NONE) { + ips_free_epaddr(array_of_epaddr[i], proto); + array_of_epaddr[i] = NULL; + } else + ipsaddr->cstate_outgoing = CSTATE_NONE; + } + +fail: +success: + return err; +} + +int ips_proto_isconnected(ips_epaddr_t *ipsaddr) +{ + if (ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED || + ipsaddr->cstate_incoming == CSTATE_ESTABLISHED) + return 1; + else + return 0; +} diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_dump.c b/prov/psm3/psm3/ptl_ips/ips_proto_dump.c new file mode 100644 index 00000000000..927a8bde48a --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_proto_dump.c @@ -0,0 +1,150 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_proto.h" +#include "ips_expected_proto.h" +#include "ips_proto_help.h" + +void ips_proto_dump_frame(void *frame, int lenght, char *message) +{ + uint8_t *raw_frame = frame; + int counter; + char default_message[] = ""; + + if (!message) + message = default_message; + + printf("\nHex dump of %i bytes at %p from %s\n", lenght, frame, + message); + + for (counter = 0; counter < lenght; counter++) { + if ((counter % 16) == 0) + printf("\n"); + + if ((counter % 4) == 0) + printf(" "); + + printf("%02X ", raw_frame[counter]); + } + printf("\n"); +} + +void ips_proto_dump_data(void *data, int data_length) +{ + int counter; + uint8_t *payload = (uint8_t *) data; + + printf("\nHex dump of data, length = %i\n", data_length); + + for (counter = 0; counter < data_length; counter++) { + if ((counter % 16) == 0) + printf("\n %04d: ", counter); + + if ((counter % 4) == 0) + printf(" "); + + printf("%02X ", payload[counter]); + } + printf("\n"); +} + +void ips_proto_show_header(struct ips_message_header *p_hdr, char *msg) +{ + psmi_seqnum_t ack_seq_num; + + printf("\nHeader decoding in hex: %s\n", msg ? msg : ""); + + printf("LRH: VL4-LVer4-SL4-Res2-LNH2: %x\n", + __be16_to_cpu(p_hdr->lrh[0])); + printf("LRH: DLID %x\n", __be16_to_cpu(p_hdr->lrh[1])); + printf("LRH: Res4-PktLen12 %x\n", __be16_to_cpu(p_hdr->lrh[2])); + printf("LRH: SLID %x\n", __be16_to_cpu(p_hdr->lrh[3])); + + printf("BTH: OpCode8-SE1-M1-PC2-TVer4-Pkey16 %x\n", + __be32_to_cpu(p_hdr->bth[0])); + printf("BTH: Res24-Flow8 %x\n", __be32_to_cpu(p_hdr->bth[1])); + printf("BTH: A1-PSN31 %x\n", __be32_to_cpu(p_hdr->bth[2])); + + printf("IPH: jkey-hcrc %x\n", __le32_to_cpu(p_hdr->khdr.kdeth1)); + printf("IPH: kver-sh-intr-tidctrl-tid-om-offset %x\n", + __le32_to_cpu(p_hdr->khdr.kdeth0)); + + printf("opcode %x\n", _get_proto_hfi_opcode(p_hdr)); + + ack_seq_num.psn_num = p_hdr->ack_seq_num; + if (GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0))) + printf("TidFlow Flow: %x, Gen: %x, Seq: %x\n", + (__be32_to_cpu(p_hdr->bth[1]) >> + HFI_BTH_FLOWID_SHIFT) & HFI_BTH_FLOWID_MASK, + (__be32_to_cpu(p_hdr->bth[2]) >> + HFI_BTH_GEN_SHIFT) & HFI_BTH_GEN_MASK, + (__be32_to_cpu(p_hdr->bth[2]) >> + HFI_BTH_SEQ_SHIFT) & HFI_BTH_SEQ_MASK); + else if (ips_proto_flowid(p_hdr) == EP_FLOW_TIDFLOW) + printf("ack_seq_num gen %x, seq %x\n", + ack_seq_num.psn_gen, ack_seq_num.psn_seq); + else + printf("ack_seq_num %x\n", ack_seq_num.psn_num); + + printf("src_rank/connidx %x\n", p_hdr->connidx); + if (GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0))) + printf("tid_session_gen %d\n", p_hdr->exp_rdescid_genc); + printf("flags %x\n", p_hdr->flags); +} + + + diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_expected.c b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c new file mode 100644 index 00000000000..4d8ffd406b8 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c @@ -0,0 +1,2399 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2016 Intel Corporation. All rights reserved. */ + +// This file implements the TID protocol for STL100 and the RDMA +// protocol for UD mode. The majority of functons in this file (perhaps all) +// are not used when TID/RDMA is disabled via PSM3_TID o PSM3_RDMA respectively +// RDMA is N/A for UDP, so it will behave as if PSM3_RDMA is disabled +// and not use functions in this file. + +#include "psm_user.h" +#include "psm2_hal.h" + +#include "ips_scb.h" +#include "ips_tid.h" +#include "ips_tidflow.h" +#include "ips_proto.h" +#include "ips_expected_proto.h" +#include "ips_proto_help.h" +#include "psm_mq_internal.h" + +/* + * Timer callbacks. When we need work to be done out of the receive process + * loop, we schedule work on timers to be done at a later time. 
+ */ +static psm2_error_t +ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current); + +static psm2_error_t +ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current); + +#ifdef RNDV_MOD_MR +static void ips_protoexp_send_err_chk_rdma_resp(struct ips_flow *flow); +static void ips_tid_reissue_rdma_write(struct ips_tid_send_desc *tidsendc); +#endif + +static void ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context); +static void ips_tidflow_avail_callback(struct ips_tf *tfc, void *context); + + +static psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc); +static psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc); + +#ifdef PSM_CUDA +static +void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp, + struct ips_tid_send_desc *tidsendc); +static void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, + psm2_mq_req_t req, + struct ips_tid_send_desc *tidsendc, + struct ips_cuda_hostbuf *chb_prev, + uint32_t tsess_srcoff, + uint32_t tsess_length, + uint32_t tsess_unaligned_start, + psm2_chb_match_type_t type); +#endif + +psm2_error_t +MOCKABLE(ips_protoexp_init)(const psmi_context_t *context, + const struct ips_proto *proto, + uint32_t protoexp_flags, + int num_of_send_bufs, + int num_of_send_desc, struct ips_protoexp **protoexp_o) +{ + struct ips_protoexp *protoexp = NULL; + psm2_error_t err = PSM2_OK; + + protoexp = (struct ips_protoexp *) + psmi_calloc(context->ep, UNDEFINED, 1, sizeof(struct ips_protoexp)); + if (protoexp == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + *protoexp_o = protoexp; + + protoexp->ptl = (const struct ptl *)proto->ptl; + protoexp->proto = (struct ips_proto *)proto; + protoexp->timerq = proto->timerq; + protoexp->tid_flags = protoexp_flags; + + if (context->ep->memmode == PSMI_MEMMODE_MINIMAL) { + protoexp->tid_flags |= IPS_PROTOEXP_FLAG_CTS_SERIALIZED; + } + + + /* Must be initialized already */ + /* Comment out because of Klockwork scanning critical error. CQ 11/16/2012 + psmi_assert_always(proto->ep != NULL && proto->ep->mq != NULL && + proto->ep->mq->rreq_pool != NULL && + proto->ep->mq->sreq_pool != NULL); + */ + psmi_assert_always(proto->timerq != NULL); + + /* These request pools are managed by the MQ component */ + protoexp->tid_sreq_pool = proto->ep->mq->sreq_pool; + protoexp->tid_rreq_pool = proto->ep->mq->rreq_pool; + + protoexp->ctrl_xfer_type = PSM_TRANSFER_PIO; + + /* Initialize tid flow control. */ + err = ips_tf_init(protoexp, context, &protoexp->tfc, + ips_tidflow_avail_callback); + if (err != PSM2_OK) + goto fail; + + + if ((err = ips_scbctrl_init(context, num_of_send_desc, 0, + 0, 0, ips_tid_scbavail_callback, + protoexp, &protoexp->tid_scbc_rv))) + goto fail; + + + { + union psmi_envvar_val env_rts_cts_interleave; + + psmi_getenv("PSM3_RTS_CTS_INTERLEAVE", + "Interleave the handling of RTS to provide a fair distribution between multiple senders", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val)0, &env_rts_cts_interleave); + if (env_rts_cts_interleave.e_uint) + protoexp->tid_flags |= IPS_PROTOEXP_FLAG_RTS_CTS_INTERLEAVE; + } + + /* Send descriptors. + * + * There can be up to 2^32 of these send descriptors. We conservatively + * allocate 256 but large node configurations can allocate up to sdesc_num + * of these (they are about 2k each). + * We impose a theoretical limit of 2^30. 
+ */ + { + struct psmi_rlimit_mpool rlim = TID_SENDSESSIONS_LIMITS; + uint32_t maxsz, chunksz; + + if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1, + &rlim, &maxsz, &chunksz))) + goto fail; + + protoexp->tid_desc_send_pool = + psmi_mpool_create(sizeof(struct ips_tid_send_desc), chunksz, + maxsz, 0, DESCRIPTORS, NULL, NULL); + + if (protoexp->tid_desc_send_pool == NULL) { + err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate tid descriptor memory pool"); + goto fail; + } + } + + /* Receive descriptors are an array in tidflow structure. */ + + /* This pool can never be smaller than the max number of rreqs that can be + * allocated. */ + { + uint32_t rreq_per_chunk, rreq_max; + + psmi_assert_always(protoexp->proto->mq->rreq_pool != NULL); + + psmi_mpool_get_obj_info(protoexp->proto->mq->rreq_pool, + &rreq_per_chunk, &rreq_max); + + protoexp->tid_getreq_pool = + psmi_mpool_create(sizeof(struct ips_tid_get_request), + rreq_per_chunk, rreq_max, 0, DESCRIPTORS, + NULL, NULL); + + if (protoexp->tid_getreq_pool == NULL) { + err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate getreq descriptor memory pool"); + goto fail; + } + } + + /* Timers to handle requeueing of work out of the receive path */ + psmi_timer_entry_init(&protoexp->timer_send, + ips_tid_pendsend_timer_callback, protoexp); + STAILQ_INIT(&protoexp->pend_sendq); + psmi_timer_entry_init(&protoexp->timer_getreqs, + ips_tid_pendtids_timer_callback, protoexp); + STAILQ_INIT(&protoexp->pend_getreqsq); +#ifdef RNDV_MOD_MR + STAILQ_INIT(&protoexp->pend_err_resp); +#endif + + +#ifdef PSM_CUDA + { + if (PSMI_IS_CUDA_ENABLED && + !(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) { + struct psmi_rlimit_mpool rlim = CUDA_HOSTBUFFER_LIMITS; + uint32_t maxsz, chunksz, max_elements; + + if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1, + &rlim, &maxsz, &chunksz))) + goto fail; + + /* the maxsz is the amount in MB, not the number of entries, + * since the element size depends on the window size */ + max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv; + /* mpool requires max_elements to be power of 2. round down. 
*/ + max_elements = 1 << (31 - __builtin_clz(max_elements)); + protoexp->cuda_hostbuf_recv_cfg.bufsz = + proto->mq->hfi_base_window_rv; + + protoexp->cuda_hostbuf_pool_recv = + psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), + chunksz, max_elements, 0, + UNDEFINED, NULL, NULL, + psmi_cuda_hostbuf_alloc_func, + (void *) + &protoexp->cuda_hostbuf_recv_cfg); + + if (protoexp->cuda_hostbuf_pool_recv == NULL) { + err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate CUDA host receive buffer pool"); + goto fail; + } + + protoexp->cuda_hostbuf_small_recv_cfg.bufsz = + CUDA_SMALLHOSTBUF_SZ; + protoexp->cuda_hostbuf_pool_small_recv = + psmi_mpool_create_for_cuda(sizeof(struct ips_cuda_hostbuf), + chunksz, max_elements, 0, + UNDEFINED, NULL, NULL, + psmi_cuda_hostbuf_alloc_func, + (void *) + &protoexp->cuda_hostbuf_small_recv_cfg); + + if (protoexp->cuda_hostbuf_pool_small_recv == NULL) { + err = psmi_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate CUDA host small receive buffer pool"); + goto fail; + } + + PSMI_CUDA_CALL(cuStreamCreate, + &protoexp->cudastream_recv, + CU_STREAM_NON_BLOCKING); + STAILQ_INIT(&protoexp->cudapend_getreqsq); + } else { + protoexp->cuda_hostbuf_pool_recv = NULL; + protoexp->cuda_hostbuf_pool_small_recv = NULL; + } + } +#endif + psmi_assert(err == PSM2_OK); + return err; + +fail: +#ifdef PSM_CUDA + if (protoexp != NULL && protoexp->cuda_hostbuf_pool_recv != NULL) + psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv); + if (protoexp != NULL && protoexp->cuda_hostbuf_pool_small_recv != NULL) + psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv); +#endif + if (protoexp != NULL && protoexp->tid_getreq_pool != NULL) + psmi_mpool_destroy(protoexp->tid_getreq_pool); + if (protoexp != NULL && protoexp->tid_desc_send_pool != NULL) + psmi_mpool_destroy(protoexp->tid_desc_send_pool); + if (protoexp != NULL) + ips_scbctrl_fini(&protoexp->tid_scbc_rv); + if (protoexp != NULL) + psmi_free(protoexp); + return err; +} +MOCK_DEF_EPILOGUE(ips_protoexp_init); + +psm2_error_t ips_protoexp_fini(struct ips_protoexp *protoexp) +{ + psm2_error_t err = PSM2_OK; + +#ifdef PSM_CUDA + if(PSMI_IS_CUDA_ENABLED && + !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) { + psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_small_recv); + psmi_mpool_destroy(protoexp->cuda_hostbuf_pool_recv); + PSMI_CUDA_CALL(cuStreamDestroy, protoexp->cudastream_recv); + } +#endif + psmi_mpool_destroy(protoexp->tid_getreq_pool); + psmi_mpool_destroy(protoexp->tid_desc_send_pool); + + if ((err = ips_scbctrl_fini(&protoexp->tid_scbc_rv))) + goto fail; + + + /* finalize tid flow control. */ + if ((err = ips_tf_fini(&protoexp->tfc))) + goto fail; + + + psmi_free(protoexp); + +fail: + return err; +} + +/* New scbs now available. If we have pending sends or pending get requests, + * turn on the timer so it can be processed. */ +/* for RDMA we can also use this routine when an MR is freed. 
scbc is not used + */ +static +void ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context) +{ + struct ips_protoexp *protoexp = (struct ips_protoexp *)context; + + if (!STAILQ_EMPTY(&protoexp->pend_sendq)) + psmi_timer_request(protoexp->timerq, + &protoexp->timer_send, PSMI_TIMER_PRIO_1); + if (!STAILQ_EMPTY(&protoexp->pend_getreqsq) +#ifdef RNDV_MOD_MR + || !STAILQ_EMPTY(&protoexp->pend_err_resp) +#endif + ) + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); + return; +} + +void ips_tid_mravail_callback(struct ips_proto *proto) +{ + ips_tid_scbavail_callback(NULL, proto->protoexp); +} + + +// On STL100 ips_tf is a user space control for the HW tidflow which +// would fully process most valid inbound EXPTID packets within an RV Window. +// For UD we maintain the user space control to help manage each active +// RV window. +// There is one CTS per RV window (typically 128K). +// For UD with RV, RDMA is used instread of EXPTID, with 1 RDMA per RV window. +// Typically there are 32 (HFI_TF_NFLOWS) configured. +// The 32 is hard coded, could make it tunable. +// The tidflow provides a natural pacing mechanism and limits the total amount +// of inflight EXPTID or RDMA incoming to given receiver. +// In addition on STL100 there is an upper bound on TIDs which limited total +// inbound DMA for a receiver to avoid 4MB. For smaller messages tidflow +// count may be the limit, for larger messages TIDs would be the limit. + +/* New Tid Flows are available. If there are pending get requests put the + * get timer on the timerq so it can be processed. */ +static +void ips_tidflow_avail_callback(struct ips_tf *tfc, void *context) +{ + struct ips_protoexp *protoexp = (struct ips_protoexp *)context; + + if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) + { + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); + } + return; +} + +// this is called from ips_proto_mq_rts_match_callback when a RTS is matched +// and we chose to use the TID receive mechanism +// this kicks off the receiver side protocol for preparing TIDs and issuing a +// CTS which requests use of TID +/* + * The tid get request is always issued from within the receive progress loop, + * which is why we always enqueue the request instead of issuing it directly. + * Eventually, if we expose tid_get to users, we will want to differentiate + * when the request comes from the receive progress loop from cases where the + * tid_get is issued directly from user code. 
+ * + */ +psm2_error_t +ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, + void *buf, + uint32_t length, + psm2_epaddr_t epaddr, + uint32_t remote_tok, + uint32_t flags, + ips_tid_completion_callback_t callback, + psm2_mq_req_t req) +{ + struct ips_tid_get_request *getreq; + int count; + int tidflows; + uint64_t nbytes; + + PSM2_LOG_MSG("entering"); + psmi_assert((((ips_epaddr_t *) epaddr)->window_rv % PSMI_PAGESIZE) == 0); + getreq = (struct ips_tid_get_request *) + psmi_mpool_get(protoexp->tid_getreq_pool); + + /* We can't *really* run out of these here because we always allocate as + * much as available receive reqs */ + if_pf(getreq == NULL) + { + PSM2_LOG_MSG("leaving"); + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Ran out of 'getreq' descriptors"); + } + + getreq->tidgr_protoexp = protoexp; + getreq->tidgr_epaddr = epaddr; + getreq->tidgr_lbuf = buf; + getreq->tidgr_length = length; + getreq->tidgr_sendtoken = remote_tok; + getreq->tidgr_req = req; + getreq->tidgr_callback = callback; + getreq->tidgr_offset = 0; + getreq->tidgr_bytesdone = 0; + getreq->tidgr_flags = flags; + +#ifdef PSM_CUDA + if ((req->is_buf_gpu_mem && + !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) || + ((req->is_buf_gpu_mem && + (protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) && + gpudirect_recv_threshold && + length > gpudirect_recv_threshold))) { + getreq->cuda_hostbuf_used = 1; + getreq->tidgr_cuda_bytesdone = 0; + STAILQ_INIT(&getreq->pend_cudabuf); + } else + getreq->cuda_hostbuf_used = 0; +#endif + + /* nbytes is the bytes each channel should transfer. */ + count = ((ips_epaddr_t *) epaddr)->msgctl->ipsaddr_count; +#ifdef PSM_CUDA + if (req->is_buf_gpu_mem) + nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_GPU_PAGESIZE); + else +#endif + nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_PAGESIZE); + getreq->tidgr_rndv_winsz = + min(nbytes, ((ips_epaddr_t *) epaddr)->window_rv); + _HFI_MMDBG("posting TID get request: nbytes=%"PRIu64" winsz=%u len=%u\n", + nbytes, getreq->tidgr_rndv_winsz, getreq->tidgr_length); + // we have now computed the size of each TID sequence (tidgr_rndv_winsz) + + STAILQ_INSERT_TAIL(&protoexp->pend_getreqsq, getreq, tidgr_next); + // by using tidflow we also constrain amount of concurrent RDMA to our NIC + tidflows = ips_tf_available(&protoexp->tfc); + _HFI_MMDBG("available tidflow %u\n", tidflows); + + if ( + tidflows > 0) + // get the actual TIDs and tidflows and send the CTS + ips_tid_pendtids_timer_callback(&protoexp->timer_getreqs, 0); + else if ( + tidflows != -1) + // out of TIDs, set a timer to try again later + psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} + +/* List of perf events */ +#define _ips_logeventid_tid_send_reqs 0 /* out of tid send descriptors */ + +#define ips_logevent_id(event) _ips_logeventid_ ## event +#define ips_logevent(proto, event, ptr) ips_logevent_inner(proto, ips_logevent_id(event), ptr) + +static +void ips_logevent_inner(struct ips_proto *proto, int eventid, void *context) +{ + uint64_t t_now = get_cycles(); + + switch (eventid) { + case ips_logevent_id(tid_send_reqs):{ + psm2_epaddr_t epaddr = (psm2_epaddr_t) context; + proto->psmi_logevent_tid_send_reqs.count++; + + if (t_now >= + proto->psmi_logevent_tid_send_reqs.next_warning) { + psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_OK, + "Non-fatal temporary exhaustion of send rdma descriptors " + "(elapsed=%.3fs, source LID=0x%x/context=%d, 
count=%lld)", + (double) + cycles_to_nanosecs(t_now - + proto-> + t_init) / + 1.0e9, + (int)psm2_epid_nid(epaddr-> + epid), + (int)psm2_epid_context(epaddr-> + epid), + (long long)proto-> + psmi_logevent_tid_send_reqs. + count); + proto->psmi_logevent_tid_send_reqs. + next_warning = + t_now + + sec_2_cycles(proto-> + psmi_logevent_tid_send_reqs. + interval_secs); + } + } + break; + + default: + break; + } + + return; +} + +/* + * Expected Protocol. + * + * We're granted tids (as part of a tid get request) and expected to fulfill + * the request by associating the request's sendtoken to a tid send descriptor. + * + * It's possible to be out of tid send descriptors when somehow all allocated + * descriptors can't complete all of their sends. For example, the targets of + * the sends may be busy in computation loops and not processing incoming + * packets. + */ + +// build and issue CTS +void +ips_protoexp_send_tid_grant(struct ips_tid_recv_desc *tidrecvc) +{ + ips_epaddr_t *ipsaddr = tidrecvc->ipsaddr; + struct ips_proto *proto = tidrecvc->protoexp->proto; + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; + ips_scb_t *scb; + + scb = tidrecvc->grantscb; + ips_scb_opcode(scb) = OPCODE_LONG_CTS; + scb->ips_lrh.khdr.kdeth0 = 0; + scb->ips_lrh.mdata = tidrecvc->tidflow_genseq.psn_val; + scb->ips_lrh.data[0] = tidrecvc->rdescid; + scb->ips_lrh.data[1].u32w1 = tidrecvc->getreq->tidgr_length; + scb->ips_lrh.data[1].u32w0 = tidrecvc->getreq->tidgr_sendtoken; + + ips_scb_buffer(scb) = (void *)&tidrecvc->tid_list; + ips_scb_length(scb) = sizeof(tidrecvc->tid_list); + _HFI_MMDBG("sending CTS\n"); + + PSM2_LOG_EPM(OPCODE_LONG_CTS,PSM2_LOG_TX, proto->ep->epid, + flow->ipsaddr->epaddr.epid ,"tidrecvc->getreq->tidgr_sendtoken; %d", + tidrecvc->getreq->tidgr_sendtoken); + + ips_proto_flow_enqueue(flow, scb); + flow->flush(flow, NULL); +} + + +#ifdef PSM_CUDA +static +void psmi_deallocate_chb(struct ips_cuda_hostbuf* chb) +{ + PSMI_CUDA_CALL(cuMemFreeHost, chb->host_buf); + PSMI_CUDA_CALL(cuEventDestroy, chb->copy_status); + psmi_free(chb); + return; +} +#endif + +// indicate the given tidsendc has been completed and cleanup after it +static void +ips_protoexp_tidsendc_complete(struct ips_tid_send_desc *tidsendc) +{ + struct ips_protoexp *protoexp = tidsendc->protoexp; + psm2_mq_req_t req = tidsendc->mqreq; + + _HFI_MMDBG("ips_protoexp_tidsendc_complete\n"); + PSM2_LOG_MSG("entering"); + + req->send_msgoff += tidsendc->length; + + if (tidsendc->mr) { + _HFI_MMDBG("send chunk complete, releasing MR: rkey: 0x%x\n", tidsendc->mr->rkey); + psm2_verbs_release_mr(tidsendc->mr); + tidsendc->mr = NULL; + } + +#ifdef PSM_CUDA + if (req->cuda_hostbuf_used) { + if (tidsendc->cuda_num_buf == 1) { + tidsendc->cuda_hostbuf[0]->bytes_read += + tidsendc->tid_list.tsess_length; + if(tidsendc->cuda_hostbuf[0]->bytes_read == + tidsendc->cuda_hostbuf[0]->size){ + STAILQ_REMOVE(&req->sendreq_prefetch, + tidsendc->cuda_hostbuf[0], + ips_cuda_hostbuf, req_next); + if (tidsendc->cuda_hostbuf[0]->is_tempbuf) + psmi_deallocate_chb(tidsendc->cuda_hostbuf[0]); + else { + tidsendc->cuda_hostbuf[0]->req = NULL; + tidsendc->cuda_hostbuf[0]->offset = 0; + tidsendc->cuda_hostbuf[0]->bytes_read = 0; + psmi_mpool_put(tidsendc->cuda_hostbuf[0]); + } + psmi_cuda_run_prefetcher(protoexp, tidsendc); + } + } else + psmi_free(tidsendc->userbuf); + } +#endif + /* Check if we can complete the send request. 
*/ + if (req->send_msgoff == req->req_data.send_msglen) { + psmi_mq_handle_rts_complete(req); + } + + psmi_mpool_put(tidsendc); + /* we freed an MR If we have pending sends or pending get requests, + * turn on the timer so it can be processed. */ + ips_tid_mravail_callback(protoexp->proto); + + PSM2_LOG_MSG("leaving"); +} + +// our RDMA Write has completed on our send Q (RV or user space RC QP) +// This is called by the send CQE polling which might be within a send +// so it cannot issue any sends directly, otherwise we will have a recursive +// situation and potentially deeper recursion if more send CQEs found +// key notes in this regard: +// OPA100 code which may send acks here is ifdef'ed out since N/A to RC QP RDMA +// psmi_mq_handle_rts_complete - sets flags in req and queues it, no callbacks +// psmi_mpool_put(tidsendc) - tid_desc_send_pool has no callback configured +// ips_tid_mravail_callback - psmi_timer_request call queues timer for future +// callback (no immediate callback) +// psmi_mpool_put(tidsendc->cuda_hostbuf[0]) - cuda_hostbuf_pool_send has a +// callback of psmi_cuda_hostbuf_alloc_func which +// manages cuda buffers but does not issue any sends + +int +ips_protoexp_rdma_write_completion(uint64_t wr_id) +{ + struct ips_tid_send_desc *tidsendc = (struct ips_tid_send_desc *)(uintptr_t)wr_id; + + _HFI_MMDBG("ips_protoexp_rdma_write_completion\n"); + PSM2_LOG_MSG("entering"); + + ips_protoexp_tidsendc_complete(tidsendc); + + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; +} + +#ifdef RNDV_MOD_MR +// our RV RDMA Write has completed with error on our send Q +// This is called by the send CQE polling which might be within a send +// so it cannot issue any sends directly, otherwise we will have a recursive +// situation and potentially deeper recursion if more send CQEs found +// key notes in this regard: +// if we don't return PSM2_OK, caller will consider it an unrecoverable error +int +ips_protoexp_rdma_write_completion_error(psm2_ep_t ep, uint64_t wr_id, + enum ibv_wc_status wc_status) +{ + struct ips_tid_send_desc *tidsendc = (struct ips_tid_send_desc *)(uintptr_t)wr_id; + struct ips_protoexp *protoexp; + + PSM2_LOG_MSG("entering"); + if (! tidsendc) { + psmi_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "rv RDMA Write with invalid tidsendc: status: '%s' (%d)\n", + ibv_wc_status_str(wc_status),(int)wc_status); + goto fail_ret; + } + protoexp = tidsendc->protoexp; + _HFI_MMDBG("failed rv RDMA Write on %s to %s status: '%s' (%d)\n", + ep->verbs_ep.ib_devname, + psmi_epaddr_get_name(tidsendc->ipsaddr->epaddr.epid), + ibv_wc_status_str(wc_status),(int)wc_status); + + if (! protoexp->proto->ep->rv_reconnect_timeout) + goto fail; /* reconnect disabled, can't recover */ + + // perhaps depending on wc_status + // IBV_WC_REM_ACCESS_ERR and others unrecoverable + // IBV_WC_RETRY_EXC_ERR may be recoverable + // IBV_WC_RNR_RETRY_EXC_ERR may be recoverable + // IBV_WC_RESP_TIMEOUT_ERR may be recoverable (is this applicable?) + // any others? IB_WC_GENERAL_ERR? 
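
The comment above leaves the per-status classification open. Purely as a hedged illustration (this helper does not exist in the patch; the real code only checks rv_reconnect_timeout here and defers verification to the ERR_CHK_RDMA exchange), the recoverable-versus-fatal split being discussed could be expressed as:

#include <infiniband/verbs.h>

/* Hypothetical helper, not part of this patch: classify a failed RDMA Write
 * completion.  Retry/timeout-style failures may succeed after a reconnect,
 * while access/protection errors will not. */
static int wc_status_may_be_recoverable(enum ibv_wc_status status)
{
	switch (status) {
	case IBV_WC_RETRY_EXC_ERR:	/* transport retry count exhausted */
	case IBV_WC_RNR_RETRY_EXC_ERR:	/* receiver-not-ready retries exhausted */
	case IBV_WC_RESP_TIMEOUT_ERR:	/* responder timeout */
		return 1;		/* worth a reconnect + ERR_CHK_RDMA */
	case IBV_WC_REM_ACCESS_ERR:	/* bad rkey or permissions */
	default:
		return 0;		/* treat as unrecoverable */
	}
}
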
+ + tidsendc->rv_need_err_chk_rdma = 1; + tidsendc->is_complete = 0; // status of send of err_chk_rdma + + /* Add as a pending op and ring up the timer */ + /* ips_tid_pendsend_timer_callback timer will issue ERR_CHK_RDMA */ + STAILQ_INSERT_TAIL(&protoexp->pend_sendq, tidsendc, next); + psmi_timer_request(protoexp->timerq, &protoexp->timer_send, + PSMI_TIMER_PRIO_1); + + return PSM2_OK; + +fail: + psmi_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "failed rv RDMA Write on %s to %s status: '%s' (%d)\n", + ep->verbs_ep.ib_devname, + psmi_epaddr_get_name(tidsendc->ipsaddr->epaddr.epid), + ibv_wc_status_str(wc_status),(int)wc_status); +fail_ret: + PSM2_LOG_MSG("leaving"); + return PSM2_INTERNAL_ERR; +} +#endif // RNDV_MOD_MR + +#ifdef RNDV_MOD_MR +static psm2_error_t ips_protoexp_send_err_chk_rdma(struct ips_tid_send_desc *tidsendc) +{ + ips_scb_t *scb = NULL; + struct ips_protoexp *protoexp = tidsendc->protoexp; + struct ips_proto *proto = protoexp->proto; + ips_epaddr_t *ipsaddr = tidsendc->ipsaddr; + struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; + psm2_error_t err = PSM2_OK; + uint32_t conn_count; + + PSM2_LOG_MSG("entering"); + _HFI_MMDBG("ips_protoexp_send_err_chk_rdma\n"); + + // we delay our sending of err chk rdma until after the connection is + // restored as reflected by an increment of conn_count relative to when + // tidsendc issued the rdma_write. This also forms a barrier to + // ensure our err chk rdma does not arrive at receiver prior to the + // rdma completion (eg. in case we timeded out for RC QP ack but + // receiver got the full rdma write). + if (__psm2_rv_get_conn_count(proto->ep->verbs_ep.rv, ipsaddr->rv_conn, + tidsendc->rv_sconn_index, &conn_count)) { + psmi_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "send_err_chk_rdma: Connect unrecoverable on %s to %s\n", + proto->ep->verbs_ep.ib_devname, + psmi_epaddr_get_name(ipsaddr->epaddr.epid)); + err = PSM2_TIMEOUT; /* force a resend reschedule */ + goto done; + } + + // conn_count only advances. Only need to test for equality. + // 32b reconnect_count sufficient for 13 years of constant reconnect + // at 100ms intervals (eg. RV_DELAY) before wrapping + if (conn_count == tidsendc->rv_conn_count) { + err = PSM2_TIMEOUT; /* force a resend reschedule */ + goto done; + } + + // limit to 1 outstanding per remote connection. + // receiver can only queue 1 response if it's low on scb's + if (ipsaddr->rv_err_chk_rdma_outstanding) { + err = PSM2_TIMEOUT; /* force a resend reschedule */ + goto done; + } + + scb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0); + if (scb == NULL) { + // ips_tid_scbavail_callback will trigger pend_sendq again + // and call ips_tid_pendsend_timer_callback + err = PSM2_EP_NO_RESOURCES; + goto done; + } + + _HFI_MMDBG("sending ERR_CHK_RDMA\n"); + PSM2_LOG_EPM(OPCODE_ERR_CHK_RDMA,PSM2_LOG_TX, proto->ep->epid, + ipsaddr->epaddr.epid, + "psmi_mpool_get_obj_index(tidsendc->mqreq): %d, tidsendc->rdescid. 
_desc_genc %d _desc_idx: %d, tidsendc->sdescid._desc_idx: %d", + psmi_mpool_get_obj_index(tidsendc->mqreq), + tidsendc->rdesc_id._dsc_genc,tidsendc->rdescid._desc_idx, + tidsendc->sdescid._desc_idx); + + ips_scb_opcode(scb) = OPCODE_ERR_CHK_RDMA; + scb->ips_lrh.khdr.kdeth0 = 0; + // providing our REQ index gives receiver an extra sanity check + scb->ips_lrh.mdata = psmi_mpool_get_obj_index(tidsendc->mqreq); + scb->ips_lrh.data[0] = tidsendc->rdescid; + scb->ips_lrh.data[1] = tidsendc->sdescid; + /* path is having issue, ask for ack */ + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + /* INTR makes sure remote end works on it immediately */ + if (proto->flags & IPS_PROTO_FLAG_RCVTHREAD) + scb->scb_flags |= IPS_SEND_FLAG_INTR; + + ipsaddr->rv_err_chk_rdma_outstanding = 1; + tidsendc->is_complete = 1; // status of send of err_chk_rdma + + proto->epaddr_stats.err_chk_rdma_send++; + + ips_proto_flow_enqueue(flow, scb); + flow->flush(flow, NULL); + + /* inbound ack will free scb */ +done: + PSM2_LOG_MSG("leaving"); + return err; +} +#endif // RNDV_MOD_MR + +#ifdef RNDV_MOD_MR +// scan all alternate addresses for "expected" (multi-QP and multi-EP) +// to see if a match for "got" can be found +static +int ips_protoexp_ipsaddr_match(ips_epaddr_t *expected, ips_epaddr_t *got) +{ + ips_epaddr_t *p = expected; + + do { + if (p == got) + return 1; + p = p->next; + } while (p != expected); + + return 0; +} +#endif // RNDV_MOD_MR + +#ifdef RNDV_MOD_MR +int ips_protoexp_process_err_chk_rdma(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_proto *proto = rcv_ev->proto; + struct ips_protoexp *protoexp = proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + __u32 sendtoken = p_hdr->mdata; + ptl_arg_t rdesc_id = p_hdr->data[0]; + ptl_arg_t sdesc_id = p_hdr->data[1]; + struct ips_tid_recv_desc *tidrecvc; + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; + + PSM2_LOG_MSG("entering"); + _HFI_MMDBG("ips_protoexp_process_err_chk_rdma\n"); + + /* normal packet reliabilty protocol handling */ + if (!ips_proto_is_expected_or_nak(rcv_ev)) + goto done; + + /* processing specific to err chk rdma packet */ + proto->epaddr_stats.err_chk_rdma_recv++; + + _HFI_MMDBG("received ERR_CHK_RDMA\n"); + PSM2_LOG_EPM(OPCODE_ERR_CHK_RDMA,PSM2_LOG_RX,ipsaddr->epaddr.epid, + proto->ep->epid, + "rdescid._desc_genc %d _desc_idx: %d, sdescid._desc_idx: %d", + rdesc_id._dsc_genc,rdescid._desc_idx, sdescid._desc_idx); + + if (ipsaddr->rv_need_send_err_chk_rdma_resp) { + /* sender has >1 err chk rdma outstanding: protocol violation */ + psmi_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "process_err_chk_rdma: Protocol Violation: > 1 outstanding from remote node %s on %s\n", + psmi_epaddr_get_name(ipsaddr->epaddr.epid), + proto->ep->verbs_ep.ib_devname); + goto do_acks; + } + + /* Get receive descriptor */ + psmi_assert(rdesc_id._desc_idx < HFI_TF_NFLOWS); + tidrecvc = &protoexp->tfc.tidrecvc[rdesc_id._desc_idx]; + + tidrecvc->stats.nErrChkReceived++; + + // stash information to build resp in ipsaddr + psmi_assert(! 
ipsaddr->rv_need_send_err_chk_rdma_resp); + ipsaddr->rv_need_send_err_chk_rdma_resp = 1; + ipsaddr->rv_err_chk_rdma_resp_rdesc_id = rdesc_id; + ipsaddr->rv_err_chk_rdma_resp_sdesc_id = sdesc_id; + + // for the rare case that err_chk_rdma has a rdescid which we completed + // a while ago, we need to sanity check not only rdescid, but also + // the identity of the sender and the sendtoken for the senders RTS + // this protects us in case rdescid generation has wrapped + if (tidrecvc->rdescid._desc_genc != rdesc_id._desc_genc + || tidrecvc->state != TIDRECVC_STATE_BUSY + || ! ips_protoexp_ipsaddr_match(tidrecvc->ipsaddr, ipsaddr) + || tidrecvc->getreq->tidgr_sendtoken != sendtoken + ) { + /* Receive descriptor mismatch in time and space. + * Must have completed recv for this RDMA + * (eg. sender timeout waiting for RC QP ack) + */ + ipsaddr->rv_err_chk_rdma_resp_need_resend = 0; + } else if (__psm2_rv_scan_cq(proto->ep->verbs_ep.rv, RV_WC_RECV_RDMA_WITH_IMM, + RDMA_IMMED_DESC_MASK, + RDMA_PACK_IMMED(tidrecvc->rdescid._desc_genc, + tidrecvc->rdescid._desc_idx, 0))) { + // the CQ scan above solves a very rare race where the receiving QP is + // very slow to issue CQEs and PSM happens to poll the UD QP and find + // the err chk rdma before finding a succesful RDMA Write received. + // Due to reconnection essentially being a barrier, we know the + // CQE must be processed in RV drain prior to the new connection and + // hence prior to the err chk rdma on UD QP. So we scan the RV CQ + // to close the race, if we find a matching completion we can + // respond with resend_needed=0 and know we will process the CQE + // soon to fully complete the RDMA receipt. + // We ignore RV_IDX in this scan, it should always match us and better + // to not ask for a resend and fail when we process the completion + // than to ask for an a resend into a freed buffer + ipsaddr->rv_err_chk_rdma_resp_need_resend = 0; + } else { + tidrecvc->stats.nReXmit++; + ipsaddr->rv_err_chk_rdma_resp_need_resend = 1; + } + + // try to send it now, will remain "queued" until we can send + ips_protoexp_send_err_chk_rdma_resp(flow); + if (ipsaddr->rv_need_send_err_chk_rdma_resp) + // ips_tid_scbavail_callback will trigger pend_err_resp again + // and call ips_tid_pendtids_timer_callback + STAILQ_INSERT_TAIL(&protoexp->pend_err_resp, ipsaddr, pend_err_resp_next); + +do_acks: + if (__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, + &ipsaddr->flows[ips_proto_flowid(p_hdr)]); + + ips_proto_process_ack(rcv_ev); +done: + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; +} +#endif // RNDV_MOD_MR + +#ifdef RNDV_MOD_MR +static +void ips_protoexp_send_err_chk_rdma_resp(struct ips_flow *flow) +{ + ips_epaddr_t *ipsaddr = flow->ipsaddr; + struct ips_proto *proto = ipsaddr->epaddr.proto; + struct ips_protoexp *protoexp = proto->protoexp; + ips_scb_t *scb; + + PSM2_LOG_MSG("entering"); + _HFI_MMDBG("ips_protoexp_send_err_chk_rdma_resp\n"); + psmi_assert(ipsaddr->rv_need_send_err_chk_rdma_resp); + scb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0); + if (scb == NULL) { + /* ips_tid_scbavail_callback() will reschedule */ + return; + } + + _HFI_MMDBG("sending ERR_CHK_RDMA_RESP\n"); + PSM2_LOG_EPM(OPCODE_ERR_CHK_RDMA,PSM2_LOG_TX, proto->ep->epid, + ipsaddr->epaddr.epid, + "need_resend %d rdescid. 
_desc_genc %d _desc_idx: %d, sdescid._desc_idx: %d", + ipsaddr->rv_err_chk_rdma_resp_need_resend, + ipsaddr->rv_err_chk_rdma_resp_rdesc_id._desc_genc, + ipsaddr->rv_err_chk_rdma_resp_rdesc_id._desc_idx, + ipsaddr->rv_err_chk_rdma_resp_sdesc_id._desc_idx) + + ips_scb_opcode(scb) = OPCODE_ERR_CHK_RDMA_RESP; + scb->ips_lrh.khdr.kdeth0 = 0; + scb->ips_lrh.mdata = ipsaddr->rv_err_chk_rdma_resp_need_resend; + scb->ips_lrh.data[0] = ipsaddr->rv_err_chk_rdma_resp_rdesc_id; + scb->ips_lrh.data[1] = ipsaddr->rv_err_chk_rdma_resp_sdesc_id; + /* path is having issue, ask for ack */ + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + /* INTR makes sure remote end works on it immediately */ + if (proto->flags & IPS_PROTO_FLAG_RCVTHREAD) + scb->scb_flags |= IPS_SEND_FLAG_INTR; + + // The scb will own reliable transmission of resp, we can clear flag + ipsaddr->rv_need_send_err_chk_rdma_resp = 0; + + ips_proto_flow_enqueue(flow, scb); + flow->flush(flow, NULL); + + PSM2_LOG_MSG("leaving"); + return; +} +#endif // RNDV_MOD_MR + +#ifdef RNDV_MOD_MR +int ips_protoexp_process_err_chk_rdma_resp(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_tid_send_desc *tidsendc; + uint32_t need_resend = p_hdr->mdata; + //ptl_arg_t rdesc_id = p_hdr->data[0]; + ptl_arg_t sdesc_id = p_hdr->data[1]; + + PSM2_LOG_MSG("entering"); + _HFI_MMDBG("ips_protoexp_process_err_chk_rdma_resp\n"); + + /* normal packet reliabilty protocol handling */ + if (!ips_proto_is_expected_or_nak(rcv_ev)) + goto done; + + /* processing specific to err chk rdma resp packet */ + + _HFI_MMDBG("received ERR_CHK_RDMA_RESP\n"); + PSM2_LOG_EPM(OPCODE_ERR_CHK_RDMA,PSM2_LOG_RX,ipsaddr->epaddr.epid, + protoexp->proto->ep->epid, + "rdescid. _desc_genc %d _desc_idx: %d, sdescid._desc_idx: %d", + p_hdr->data[0]._dsc_genc,p_hdr->data[0]._desc_idx, + sdescid._desc_idx); + /* Get the session send descriptor + * a subset of get_tidflow in ips_proto_recv.c since we don't + * have tidflow sequence numbers to check + */ + tidsendc = (struct ips_tid_send_desc *) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_send_pool, + sdesc_id._desc_idx); + _HFI_VDBG("desc_id=%d (%p)\n", sdesc_id._desc_idx, tidsendc); + if (tidsendc == NULL) { + _HFI_ERROR("err_chk_rdma_resp: Index %d is out of range\n", + sdesc_id._desc_idx); + goto do_acks; + } else { + ptl_arg_t desc_tidsendc; + + psmi_mpool_get_obj_index_gen_count(tidsendc, + &desc_tidsendc._desc_idx, &desc_tidsendc._desc_genc); + + _HFI_VDBG("sdesc_req:id=%d,gen=%d desc_sendc:id=%d,gen=%d\n", + sdesc_id._desc_idx, sdesc_id._desc_genc, + desc_tidsendc._desc_idx, desc_tidsendc._desc_genc); + + /* See if the reference is still live and valid */ + if (desc_tidsendc.u64 != sdesc_id.u64) { + _HFI_ERROR("err_chk_rdma_resp: Genc %d does not match\n", + sdesc_id._desc_genc); + goto do_acks; + } + } + + ipsaddr->rv_err_chk_rdma_outstanding = 0; + tidsendc->rv_need_err_chk_rdma = 0; + if (need_resend) + ips_tid_reissue_rdma_write(tidsendc); + else + ips_protoexp_tidsendc_complete(tidsendc); + +do_acks: + if (__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, + &ipsaddr->flows[ips_proto_flowid(p_hdr)]); + + ips_proto_process_ack(rcv_ev); +done: + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; +} +#endif // RNDV_MOD_MR + +// Intermediate STL100 EXTID packets can be delivered to software when +// acks are requested. 
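
As an aside to the comment block that begins here and continues below (contrasting STL100 EXPTID completion with the RDMA-Write-with-immediate scheme used by the verbs path), the sketch that follows shows how such a write typically surfaces at the receiver. It is not the provider's actual polling loop; 'cq' and 'handle_immed' are placeholders invented for this example, and only standard libibverbs calls are used.

#include <arpa/inet.h>		/* ntohl */
#include <infiniband/verbs.h>

/* Sketch only: a completed RDMA Write with immediate data appears at the
 * receiver as an IBV_WC_RECV_RDMA_WITH_IMM completion; the 32-bit immediate
 * identifies which receive descriptor the write belongs to. */
static void poll_rdma_write_completions(struct ibv_cq *cq,
		void (*handle_immed)(uint32_t immed, uint32_t len))
{
	struct ibv_wc wc;

	while (ibv_poll_cq(cq, 1, &wc) > 0) {
		if (wc.status != IBV_WC_SUCCESS)
			continue;	/* error handling omitted in this sketch */
		if (wc.opcode == IBV_WC_RECV_RDMA_WITH_IMM)
			/* the verbs API carries imm_data in network byte order */
			handle_immed(ntohl(wc.imm_data), wc.byte_len);
	}
}
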
+// The final packet in a STL100 EXTID flow is also delivered to software +// to indicate the completion of the flow and can contain unaligned data. +// for RDMA Write we will simply use immediate data in the write +// to indicate the completed receive of the RDMA Write +// if we use RDMA Read, the local SQ Completion will indicate this +// could build and pass a ips_recvhdrq_event or pass struct ips_recvhdrq +// but all we really need is proto and len +// conn indicates where we received RDMA Write, just for quick sanity check +// for RV module conn will be the psm2_rv_conn_t +// for user RC QPs conn will be will be the RC struct ibv_qp* +int ips_protoexp_handle_immed_data(struct ips_proto *proto, uint64_t conn_ref, + int conn_type, uint32_t immed, uint32_t len) +{ + struct ips_tid_recv_desc *tidrecvc; + struct ips_protoexp *protoexp = proto->protoexp; + ptl_arg_t desc_id; + _HFI_MMDBG("ips_protoexp_immed_data\n"); + PSM2_LOG_MSG("entering"); + desc_id._desc_genc = RDMA_UNPACK_IMMED_GENC(immed); + desc_id._desc_idx = RDMA_UNPACK_IMMED_IDX(immed); + + tidrecvc = &protoexp->tfc.tidrecvc[desc_id._desc_idx]; + + if ((tidrecvc->rdescid._desc_genc & IPS_HDR_RDESCID_GENC_MASK) + != desc_id._desc_genc) { + _HFI_ERROR("stale inbound rv RDMA generation: expected %u got %u\n", + tidrecvc->rdescid._desc_genc, desc_id._desc_genc); + tidrecvc->stats.nGenErr++; + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; /* skip */ + } + + // maybe should use assert below so don't add test in production code + if (tidrecvc->state != TIDRECVC_STATE_BUSY) { + _HFI_ERROR("stale inbound rv RDMA (tidrecvc not busy)\n"); + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; /* skip */ + } + // some sanity checks + // maybe this should be an assert so don't add test in production code + if (len != tidrecvc->recv_msglen) { + // RDMA Write does not match what we asked for in CTS + _HFI_ERROR("incorrect RDMA Write Len: expected %u got %u\n", + tidrecvc->recv_msglen, len); + // TBD - what to do? + } + psmi_assert(IPS_PROTOEXP_FLAG_ENABLED & tidrecvc->protoexp->proto->ep->rdmamode); +#ifdef RNDV_MOD_MR + if (conn_type == RDMA_IMMED_RV + && RDMA_UNPACK_IMMED_RV_IDX(immed) != proto->ep->verbs_ep.rv_index) { + // RV module should not have delivered this CQE to us + _HFI_ERROR("incorrect RDMA RV Index: expected %u got %u\n", + proto->ep->verbs_ep.rv_index, RDMA_UNPACK_IMMED_RV_IDX(immed)); + return IPS_RECVHDRQ_CONTINUE; /* skip */ + } +#endif + // For User RC conn_ref is context we set in rc_qp_create (*ipsaddr) + // For Kernel RC, conn_ref is the conn handle (psm2_rv_conn_get_conn_handle) + // maybe this should be an assert so don't add test in production code + if ((conn_type == RDMA_IMMED_USER_RC) + && (uint64_t)tidrecvc->ipsaddr != conn_ref) { + // RDWA Write is not on expected RC QP from remote node + _HFI_ERROR("RDMA Write on Wrong User QP 0x%"PRIx64", expect 0x%"PRIx64"\n", + conn_ref, (uint64_t)tidrecvc->ipsaddr); + // TBD - what to do? + } +#ifdef RNDV_MOD_MR + if (conn_type == RDMA_IMMED_RV + && psm2_rv_conn_get_conn_handle(tidrecvc->ipsaddr->rv_conn) + != conn_ref) { + // RDWA Write is not on expected RV QP from remote node + _HFI_ERROR("RDMA Write on Wrong RV QP 0x%"PRIx64", expect 0x%"PRIx64"\n", + conn_ref, + psm2_rv_conn_get_conn_handle(tidrecvc->ipsaddr->rv_conn)); + // TBD - what to do? 
+ } +#endif + if (_HFI_PDBG_ON) + __psm2_dump_buf(tidrecvc->buffer, len); + + /* Reset the swapped generation count as we received a valid packet */ + tidrecvc->tidflow_nswap_gen = 0; + + /* Do some sanity checking */ + psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_BUSY); + // STL100 does this at the end of ips_protoexp_send_tid_completion + // TBD - seems like this should be done after ips_tid_recv_free + // so we have more likelihood of getting freshly freed resources? + if (tidrecvc->protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) { + tidrecvc->ipsaddr->flows[protoexp->proto->msgflowid].flags &= ~IPS_FLOW_FLAG_SKIP_CTS; /* Let the next CTS be processed */ + ips_tid_pendtids_timer_callback(&tidrecvc->protoexp->timer_getreqs, 0); /* and make explicit progress for it. */ + } + + /* Mark receive as done */ + ips_tid_recv_free(tidrecvc); + _HFI_MMDBG("tidrecv done\n"); + PSM2_LOG_MSG("leaving"); + + return IPS_RECVHDRQ_CONTINUE; +} + + + +#ifdef PSM_CUDA +static +psm2_error_t +psmi_cuda_reclaim_hostbufs(struct ips_tid_get_request *getreq) +{ + struct ips_protoexp *protoexp = getreq->tidgr_protoexp; + struct ips_tid_getreq_cuda_hostbuf_pend *cmemcpyhead = + &getreq->pend_cudabuf; + struct ips_cuda_hostbuf *chb; + CUresult status; + + /* Get the getreq's first memcpy op */ + while (!STAILQ_EMPTY(cmemcpyhead)) { + chb = STAILQ_FIRST(cmemcpyhead); + PSMI_CUDA_CHECK_EVENT(chb->copy_status, status); + if (status != CUDA_SUCCESS) { + /* At least one of the copies is still + * in progress. Schedule the timer, + * then leave the CUDA progress phase + * and check for other pending TID work. + */ + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); + return PSM2_OK_NO_PROGRESS; + } + /* The getreq's oldest cudabuf is done. Reclaim it. */ + getreq->tidgr_cuda_bytesdone += chb->size; + STAILQ_REMOVE_HEAD(cmemcpyhead, next); + psmi_mpool_put(chb); + } + return PSM2_OK; +} + +static +struct ips_cuda_hostbuf* psmi_allocate_chb(uint32_t window_len) +{ + struct ips_cuda_hostbuf* chb = (struct ips_cuda_hostbuf*) + psmi_calloc(PSMI_EP_NONE, + UNDEFINED, 1, + sizeof(struct ips_cuda_hostbuf)); + if (chb == NULL) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, + "Couldn't allocate cuda host buffers "); + } + PSMI_CUDA_CALL(cuMemHostAlloc, + (void **) &chb->host_buf, + window_len, + CU_MEMHOSTALLOC_PORTABLE); + PSMI_CUDA_CALL(cuEventCreate, &chb->copy_status, CU_EVENT_DEFAULT); + return chb; +} + +static +void psmi_cuda_run_prefetcher(struct ips_protoexp *protoexp, + struct ips_tid_send_desc *tidsendc) +{ + struct ips_proto *proto = protoexp->proto; + struct ips_cuda_hostbuf *chb = NULL; + psm2_mq_req_t req = tidsendc->mqreq; + uint32_t offset, window_len; + + /* try to push the prefetcher forward */ + if (req->prefetch_send_msgoff < req->req_data.send_msglen) { + /* some data remains to be sent */ + offset = req->prefetch_send_msgoff; + window_len = + ips_cuda_next_window(tidsendc->ipsaddr->window_rv, + offset, req->req_data.buf_len); + if (window_len <= CUDA_SMALLHOSTBUF_SZ) + chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( + proto->cuda_hostbuf_pool_small_send); + if (chb == NULL) + chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( + proto->cuda_hostbuf_pool_send); + /* were any buffers available for the prefetcher? 
*/ + if (chb == NULL) + return; + req->prefetch_send_msgoff += window_len; + chb->offset = offset; + chb->size = window_len; + chb->req = req; + chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset; + chb->bytes_read = 0; + PSMI_CUDA_CALL(cuMemcpyDtoHAsync, + chb->host_buf, chb->gpu_buf, + window_len, + proto->cudastream_send); + PSMI_CUDA_CALL(cuEventRecord, chb->copy_status, + proto->cudastream_send); + + STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); + return; + } + return; +} + +static +void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, + psm2_mq_req_t req, + struct ips_tid_send_desc *tidsendc, + struct ips_cuda_hostbuf *chb_prev, + uint32_t tsess_srcoff, + uint32_t tsess_length, + uint32_t tsess_unaligned_start, + psm2_chb_match_type_t type) +{ + struct ips_proto *proto = protoexp->proto; + struct ips_cuda_hostbuf *chb = NULL; + uint32_t offset, window_len, attached=0; + + /* try to push the prefetcher forward */ + while (req->prefetch_send_msgoff < tsess_srcoff + tsess_length) { + /* some data remains to be sent */ + offset = req->prefetch_send_msgoff; + window_len = + ips_cuda_next_window(tidsendc->ipsaddr->window_rv, + offset, req->req_data.buf_len); + if (window_len <= CUDA_SMALLHOSTBUF_SZ) + chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( + proto->cuda_hostbuf_pool_small_send); + if (chb == NULL) + chb = (struct ips_cuda_hostbuf *) psmi_mpool_get( + proto->cuda_hostbuf_pool_send); + + /* were any buffers available? If not force allocate */ + if (chb == NULL) { + chb = psmi_allocate_chb(window_len); + psmi_assert(chb); + chb->is_tempbuf = 1; + } + req->prefetch_send_msgoff += window_len; + chb->offset = offset; + chb->size = window_len; + chb->req = req; + chb->gpu_buf = (CUdeviceptr) req->req_data.buf + offset; + chb->bytes_read = 0; + PSMI_CUDA_CALL(cuMemcpyDtoHAsync, + chb->host_buf, chb->gpu_buf, + window_len, + proto->cudastream_send); + PSMI_CUDA_CALL(cuEventRecord, chb->copy_status, + proto->cudastream_send); + + STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); + if (type == PSMI_CUDA_PARTIAL_MATCH_FOUND) { + if ((tsess_srcoff < chb->offset) + && ((tsess_srcoff + tsess_length) > chb->offset)) { + tidsendc->cuda_hostbuf[0] = chb_prev; + tidsendc->cuda_hostbuf[1] = chb; + tidsendc->cuda_num_buf = 2; + void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, + tsess_length); + tidsendc->userbuf = + (void *)((uintptr_t) buffer); + tidsendc->buffer = + (void *)((uintptr_t)tidsendc->userbuf + + tsess_unaligned_start); + return; + } + } else { + if (attached) { + tidsendc->cuda_hostbuf[0] = chb_prev; + tidsendc->cuda_hostbuf[1] = chb; + tidsendc->cuda_num_buf = 2; + void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, + tsess_length); + tidsendc->userbuf = + (void *)((uintptr_t) buffer); + tidsendc->buffer = + (void *)((uintptr_t)tidsendc->userbuf + + tsess_unaligned_start); + attached = 0; + return; + } + if ((tsess_srcoff > chb->offset) + && (tsess_srcoff < (chb->offset + chb->size)) + && ((tsess_srcoff + tsess_length) > (chb->offset + chb->size))) { + chb_prev = chb; + attached = 1; + chb = NULL; + continue; + } else if ((chb->offset <= tsess_srcoff) && + ((tsess_srcoff + tsess_length) <= + (chb->offset+chb->size))) { + tidsendc->cuda_hostbuf[0] = chb; + tidsendc->cuda_hostbuf[1] = NULL; + tidsendc->cuda_num_buf = 1; + tidsendc->userbuf = + (void *)((uintptr_t) chb->host_buf + + tsess_srcoff - chb->offset); + tidsendc->buffer = + (void *)((uintptr_t)tidsendc->userbuf + + tsess_unaligned_start ); + return; + } else + chb = NULL; + } + } +} + + 
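/*
 * A minimal standalone sketch of the prefetched-buffer match decision used
 * just below: a CTS range [srcoff, srcoff + len) is compared against one
 * prefetched host buffer covering [chb_off, chb_off + chb_size) and against
 * the overall prefetch high-water mark (prefetch_send_msgoff in the patch).
 * The function name classify_chb_match, the enum values, and the demo window
 * sizes here are illustrative stand-ins only; the patch's
 * psmi_find_match_in_prefeteched_chb below expresses the same comparisons in
 * terms of struct ips_cuda_hostbuf and ips_tid_session_list, and it is
 * invoked once per buffer while walking the request's prefetch STAILQ.
 */
#include <stdint.h>
#include <stdio.h>

enum chb_match { MATCH_NONE, MATCH_PARTIAL, MATCH_SPLIT, MATCH_FULL };

static enum chb_match
classify_chb_match(uint32_t chb_off, uint32_t chb_size,
		   uint32_t srcoff, uint32_t len,
		   uint32_t prefetched_bytes)
{
	uint32_t cts_end = srcoff + len;

	if (chb_off > srcoff)
		return MATCH_NONE;	/* CTS starts before this buffer */
	if (chb_off + chb_size >= cts_end)
		return MATCH_FULL;	/* one buffer covers the whole CTS */
	if (chb_off + chb_size <= srcoff)
		return MATCH_NONE;	/* buffer ends before the CTS starts */
	/* The CTS spills past this buffer: "split" if the following window is
	 * already prefetched, "partial" if prefetching must still catch up. */
	if (chb_off + 2 * chb_size > cts_end && prefetched_bytes >= cts_end)
		return MATCH_SPLIT;
	if (cts_end > prefetched_bytes)
		return MATCH_PARTIAL;
	return MATCH_NONE;
}

int main(void)
{
	/* 128 KB windows, CTS for bytes [96K, 160K): straddles two windows,
	 * both already prefetched -> split match (prints 2) */
	printf("%d\n", classify_chb_match(0, 131072, 98304, 65536, 262144));
	/* Same CTS, but only the first window prefetched so far -> partial
	 * match (prints 1) */
	printf("%d\n", classify_chb_match(0, 131072, 98304, 65536, 131072));
	return 0;
}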
+static +psm2_chb_match_type_t psmi_find_match_in_prefeteched_chb(struct ips_cuda_hostbuf* chb, + ips_tid_session_list *tid_list, + uint32_t prefetch_send_msgoff) +{ + /* To get a match: + * 1. Tid list offset + length is contained within a chb + * 2. Tid list offset + length is contained within + * the prefetched offset of this req. + * 3. Tid list offset + length is partially prefetched + * within one chb. (A partial match) + */ + if (chb->offset <= tid_list->tsess_srcoff) { + if ((chb->offset + chb->size) >= + (tid_list->tsess_srcoff + tid_list->tsess_length)) { + return PSMI_CUDA_FULL_MATCH_FOUND; + } else { + if((chb->offset + chb->size) > tid_list->tsess_srcoff){ + if(((chb->offset + (2 * chb->size)) > + (tid_list->tsess_srcoff + tid_list->tsess_length)) && + ((prefetch_send_msgoff) >= + (tid_list->tsess_srcoff + tid_list->tsess_length))){ + return PSMI_CUDA_SPLIT_MATCH_FOUND; + } else if((tid_list->tsess_srcoff + tid_list->tsess_length) + > prefetch_send_msgoff) { + return PSMI_CUDA_PARTIAL_MATCH_FOUND; + } + } + } + } + return PSMI_CUDA_CONTINUE; +} +#endif + +// sender handling of a CTS which indicates use of TID protocol +psm2_error_t +ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, + ips_epaddr_t *ipsaddr, + psm2_mq_req_t req, + ptl_arg_t rdescid, + uint32_t tidflow_genseq, + ips_tid_session_list *tid_list, + uint32_t tid_list_size) +{ + struct ips_tid_send_desc *tidsendc; + _HFI_MMDBG("ips_tid_send_handle_tidreq\n"); + + PSM2_LOG_MSG("entering"); + psmi_assert(tid_list_size == sizeof(ips_tid_session_list)); + + tidsendc = (struct ips_tid_send_desc *) + psmi_mpool_get(protoexp->tid_desc_send_pool); + if (tidsendc == NULL) { + PSM2_LOG_MSG("leaving"); + ips_logevent(protoexp->proto, tid_send_reqs, ipsaddr); + return PSM2_EP_NO_RESOURCES; + } + + req->ptl_req_ptr = (void *)tidsendc; + tidsendc->protoexp = protoexp; + + /* Uniquely identify this send descriptor in space and time */ + tidsendc->sdescid._desc_idx = psmi_mpool_get_obj_index(tidsendc); + tidsendc->sdescid._desc_genc = psmi_mpool_get_obj_gen_count(tidsendc); + tidsendc->rdescid = rdescid; + tidsendc->ipsaddr = ipsaddr; + tidsendc->mqreq = req; + + /* Copy received tidinfo to local tidsendc buffer. */ + psmi_mq_mtucpy_host_mem(&tidsendc->tid_list, tid_list, + sizeof(ips_tid_session_list)); + // for UD we do not need a ips_flow since we will use the RC QP and + // then will use our main flow for the final RV completion control msg + // The path record for use by RDMA will be selected when the connection + // is established + tidsendc->mr = NULL; // be safe,but should be NULL since clear on release + _HFI_VDBG("recv'd CTS: rkey 0x%x srcoff %u raddr 0x%"PRIx64" len %u\n", + tid_list->tsess_rkey, tid_list->tsess_srcoff, tid_list->tsess_raddr, + tid_list->tsess_length); + + tidsendc->userbuf = + (void *)((uintptr_t) req->req_data.buf + tid_list->tsess_srcoff); + tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf + ); + tidsendc->length = tid_list->tsess_length; + _HFI_MMDBG("tidsendc created userbuf %p buffer %p length %u\n", + tidsendc->userbuf, tidsendc->buffer, tidsendc->length); + +#ifdef PSM_CUDA + /* Matching on previous prefetches and initiating next prefetch */ + struct ips_cuda_hostbuf *chb = NULL, *chb_next = NULL; + psm2_chb_match_type_t rc = PSMI_CUDA_CONTINUE; + + /* check if the prefetcher has a buffer ready to use */ + tidsendc->cuda_hostbuf[0] = NULL; + tidsendc->cuda_hostbuf[1] = NULL; + tidsendc->cuda_num_buf = 0; + if (req->cuda_hostbuf_used) { + /* To get a match: + * 1. 
Tid list offset + length is contained within a chb + * 2. Tid list offset + length is contained within + * the prefetched offset of this req. + * 3. Tid list offset + length is partially prefetched + * within one chb. (A partial match) + */ + STAILQ_FOREACH(chb, &req->sendreq_prefetch, req_next) { + rc = psmi_find_match_in_prefeteched_chb(chb, + tid_list, + req->prefetch_send_msgoff); + if (rc < PSMI_CUDA_CONTINUE) + break; + } + if (rc == PSMI_CUDA_FULL_MATCH_FOUND) { + tidsendc->userbuf = + (void *)((uintptr_t) chb->host_buf+ + tid_list->tsess_srcoff - chb->offset); + tidsendc->buffer = + (void *)((uintptr_t)tidsendc->userbuf + ); + /* now associate the buffer with the tidsendc */ + tidsendc->cuda_hostbuf[0] = chb; + tidsendc->cuda_hostbuf[1] = NULL; + tidsendc->cuda_num_buf = 1; + } else if (rc == PSMI_CUDA_SPLIT_MATCH_FOUND){ + void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, + tid_list->tsess_length); + tidsendc->userbuf = + (void *)((uintptr_t) buffer); + tidsendc->buffer = + (void *)((uintptr_t)tidsendc->userbuf + ); + chb_next = STAILQ_NEXT(chb, req_next); + tidsendc->cuda_hostbuf[0] = chb; + tidsendc->cuda_hostbuf[1] = chb_next; + tidsendc->cuda_num_buf = 2; + } else if (rc == PSMI_CUDA_PARTIAL_MATCH_FOUND) { + psmi_attach_chb_to_tidsendc(protoexp, req, + tidsendc, + chb, + tid_list->tsess_srcoff, + tid_list->tsess_length, + 0, + rc); + } else { + psmi_attach_chb_to_tidsendc(protoexp, req, + tidsendc, + NULL, + tid_list->tsess_srcoff, + tid_list->tsess_length, + 0, + PSMI_CUDA_CONTINUE); + } + } +#endif // PSM_CUDA + + tidsendc->is_complete = 0; + tidsendc->reserved = 0; +#ifdef RNDV_MOD_MR + tidsendc->rv_need_err_chk_rdma = 0; + tidsendc->rv_sconn_index = 0; + tidsendc->rv_conn_count = 0; +#endif + + _HFI_EXP + ("alloc tidsend=%4d tidrecv=%4d srcoff=%6d length=%6d" + "\n", + tidsendc->sdescid._desc_idx, rdescid._desc_idx, + tid_list->tsess_srcoff, tid_list->tsess_length + ); + + // start sending TIDEXP packets + ips_tid_send_exp(tidsendc); + + /* Add as a pending op and ring up the timer */ + if (tidsendc->is_complete == 0) { + STAILQ_INSERT_TAIL(&protoexp->pend_sendq, tidsendc, next); + psmi_timer_request(protoexp->timerq, &protoexp->timer_send, + PSMI_TIMER_PRIO_1); + } + + PSM2_LOG_MSG("leaving"); + /* Consider breaking out of progress engine here */ + return PSM2_OK; +} + + +/* + * Returns: + * + * PSM2_OK: scb was allocated for at least one frame, the packet may be queued + * or actually sent. + * + * PSM2_OK_NO_PROGRESS: Reached a limit on the maximum number of sends we allow + * to be enqueued before polling receive queue. + * + * PSM2_EP_NO_RESOURCES: No scbs, available, a callback will be issued when more + * scbs become available. + * + * PSM2_TIMEOUT: PIO-busy or DMA-busy, stop trying to send for now. + * + */ + +// issue RDMA Write in response to a CTS +psm2_error_t ips_tid_issue_rdma_write(struct ips_tid_send_desc *tidsendc) +{ + struct ips_protoexp *protoexp = tidsendc->protoexp; + struct ips_proto *proto = protoexp->proto; + psm2_error_t err = PSM2_OK; + + // for STL100 native we would loop on ips_scb_prepare_tid_sendctrl and + // ips_proto_flow_enqueue to prepare EXPTID scbs for the TIDFLOW protocol + // and queue and issue them. Once they were all posted the is_complete + // flag would be set. 
For larger messages, it might take multiple + // attempts to get resources to queue everything in which case callbacks + // and timers ensure progress + // For verbs we are delegating the RC Write "flow" to the NIC's RC QP + // it will manage segmentation, sequence numbers and acks for the flow + // so our job is done here after one call. + // we use immediate data with the rdescid to trigger a CQE on receiver + // so it knows when RDMA is done + // if too many RDMA in flight retry later when next RDMA completes + if (protoexp->proto->ep->verbs_ep.send_rdma_outstanding + >= protoexp->proto->ep->hfi_num_send_rdma) { + err = PSM2_EP_NO_RESOURCES; // try again on next RDMA completion + } else if (tidsendc->mr) { + // registered or referenced in previous failed ips_tid_send_exp attempt + // no need to register again + err = PSM2_OK; + } else if ( +#ifdef PSM_CUDA + ! tidsendc->mqreq->cuda_hostbuf_used && +#endif + // separate MR cache's per EP, so this confirms we have the same EP + tidsendc->mqreq->mr && tidsendc->mqreq->mr->cache == proto->mr_cache) { + // we can use the same MR as the whole mqreq + _HFI_MMDBG("CTS send chunk reference send: %p %u bytes via %p %"PRIu64"\n", tidsendc->buffer, tidsendc->length, tidsendc->mqreq->mr->addr, tidsendc->mqreq->mr->length); + tidsendc->mr = psm2_verbs_ref_mr(tidsendc->mqreq->mr); + } else { + // we need an MR for this chunk + _HFI_MMDBG("CTS send chunk register send: %p %u bytes\n", tidsendc->buffer , tidsendc->length); + tidsendc->mr = psm2_verbs_reg_mr(proto->mr_cache, 1, + proto->ep->verbs_ep.pd, + tidsendc->buffer, tidsendc->length, 0 +#ifdef PSM_CUDA + | ((tidsendc->mqreq->is_buf_gpu_mem + && !tidsendc->mqreq->cuda_hostbuf_used) + ?IBV_ACCESS_IS_GPU_ADDR:0) +#endif + ); + if (! tidsendc->mr) + err = PSM2_TIMEOUT; /* force a resend reschedule */ + } + + // if post_send fails below, we'll try again later + // completion handler decides how to handle any WQE/CQE errors + _HFI_MMDBG("tidsendc prior to post userbuf %p buffer %p length %u\n", + tidsendc->userbuf, tidsendc->buffer, tidsendc->length); +#ifdef RNDV_MOD_MR + if (err == PSM2_OK) { + psmi_assert(IPS_PROTOEXP_FLAG_ENABLED & protoexp->proto->ep->rdmamode); + if (IPS_PROTOEXP_FLAG_KERNEL_QP(protoexp->proto->ep->rdmamode)) + err = psm2_verbs_post_rv_rdma_write_immed( + protoexp->proto->ep, + tidsendc->ipsaddr->rv_conn, + tidsendc->buffer, tidsendc->mr, + tidsendc->tid_list.tsess_raddr, tidsendc->tid_list.tsess_rkey, + tidsendc->tid_list.tsess_length, + RDMA_PACK_IMMED(tidsendc->rdescid._desc_genc, + tidsendc->rdescid._desc_idx, + tidsendc->ipsaddr->remote_rv_index), + (uintptr_t)tidsendc, + &tidsendc->rv_sconn_index, &tidsendc->rv_conn_count); + else if (IPS_PROTOEXP_FLAG_USER_RC_QP(protoexp->proto->ep->rdmamode)) + err = psm2_verbs_post_rdma_write_immed( + protoexp->proto->ep, + tidsendc->ipsaddr->rc_qp, + tidsendc->buffer, tidsendc->mr, + tidsendc->tid_list.tsess_raddr, tidsendc->tid_list.tsess_rkey, + tidsendc->tid_list.tsess_length, + RDMA_PACK_IMMED(tidsendc->rdescid._desc_genc, + tidsendc->rdescid._desc_idx, 0), + (uintptr_t)tidsendc); + } + if (err == PSM2_OK) + tidsendc->is_complete = 1; // send queued +#else // RNDV_MOD_MR + if (err == PSM2_OK) { + psmi_assert(IPS_PROTOEXP_FLAG_ENABLED & protoexp->proto->ep->rdmamode); + if (IPS_PROTOEXP_FLAG_USER_RC_QP(protoexp->proto->ep->rdmamode)) + err = psm2_verbs_post_rdma_write_immed( + protoexp->proto->ep, + tidsendc->ipsaddr->rc_qp, + tidsendc->buffer, tidsendc->mr, + tidsendc->tid_list.tsess_raddr, tidsendc->tid_list.tsess_rkey, + 
tidsendc->tid_list.tsess_length, + RDMA_PACK_IMMED(tidsendc->rdescid._desc_genc, + tidsendc->rdescid._desc_idx, 0), + (uintptr_t)tidsendc); + } + if (err == PSM2_OK) + tidsendc->is_complete = 1; // send queued +#endif // RNDV_MOD_MR + return err; +} + +/* + * Returns: + * + * PSM2_OK: scb was allocated for at least one frame, the packet may be queued + * or actually sent. + * + * PSM2_OK_NO_PROGRESS: Reached a limit on the maximum number of sends we allow + * to be enqueued before polling receive queue. + * + * PSM2_EP_NO_RESOURCES: No scbs, available, a callback will be issued when more + * scbs become available. + * + * PSM2_TIMEOUT: PIO-busy or DMA-busy, stop trying to send for now. + * + */ + +// we got a CTS and processed it. Now we can start sending EXPTID packets. +// For UD we will use RDMA instead of EXPTID +static +psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) +{ + psm2_error_t err = PSM2_OK; +#if defined(PSM_CUDA) + struct ips_protoexp *protoexp = tidsendc->protoexp; +#endif + + _HFI_MMDBG("ips_tid_send_exp\n"); +#ifdef PSM_CUDA + struct ips_cuda_hostbuf *chb, *chb_next; + CUresult chb_status; + uint32_t offset_in_chb, i; + for (i = 0; i < tidsendc->cuda_num_buf; i++) { + chb = tidsendc->cuda_hostbuf[i]; + if (chb) { + PSMI_CUDA_CHECK_EVENT(chb->copy_status, chb_status); + if (chb_status != CUDA_SUCCESS) { + err = PSM2_OK_NO_PROGRESS; + PSM2_LOG_MSG("leaving"); + return err; + } + } + } + + if (tidsendc->cuda_num_buf == 2) { + chb = tidsendc->cuda_hostbuf[0]; + chb_next = tidsendc->cuda_hostbuf[1]; + offset_in_chb = tidsendc->tid_list.tsess_srcoff - chb->offset; + /* Copying data from multiple cuda + * host buffers into a bounce buffer. + */ + memcpy(tidsendc->buffer, chb->host_buf + + offset_in_chb, chb->size-offset_in_chb); + memcpy(tidsendc->buffer+ chb->size - + offset_in_chb, chb_next->host_buf, + tidsendc->tid_list.tsess_srcoff + + tidsendc->tid_list.tsess_length - chb_next->offset); + + chb->bytes_read += chb->size - offset_in_chb; + chb_next->bytes_read += tidsendc->tid_list.tsess_srcoff + + tidsendc->tid_list.tsess_length - + chb_next->offset; + if(chb->bytes_read == chb->size) { + STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb, + ips_cuda_hostbuf, req_next); + if (chb->is_tempbuf) + psmi_deallocate_chb(chb); + else { + chb->req = NULL; + chb->offset = 0; + chb->bytes_read = 0; + psmi_mpool_put(chb); + } + psmi_cuda_run_prefetcher(protoexp, tidsendc); + } + if(chb_next->bytes_read == chb_next->size) { + STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb_next, + ips_cuda_hostbuf, req_next); + if (chb_next->is_tempbuf) + psmi_deallocate_chb(chb_next); + else{ + chb_next->req = NULL; + chb_next->offset = 0; + chb_next->bytes_read = 0; + psmi_mpool_put(chb_next); + } + psmi_cuda_run_prefetcher(protoexp, tidsendc); + } + } +#endif + err = ips_tid_issue_rdma_write(tidsendc); + + PSM2_LOG_MSG("leaving"); + return err; +} + +#ifdef RNDV_MOD_MR +// Used when err chk rdma resp indicates we must resend the rdma +static +void ips_tid_reissue_rdma_write(struct ips_tid_send_desc *tidsendc) +{ + struct ips_protoexp *protoexp = tidsendc->protoexp; + + _HFI_MMDBG("ips_tid_reissue_rdma_write\n"); + + PSM2_LOG_MSG("entering"); + protoexp->proto->epaddr_stats.rdma_rexmit++; + tidsendc->is_complete = 0; // sends not yet queued + + ips_tid_issue_rdma_write(tidsendc); + + /* Add as a pending op and ring up the timer */ + if (tidsendc->is_complete == 0) { + STAILQ_INSERT_TAIL(&protoexp->pend_sendq, tidsendc, next); + psmi_timer_request(protoexp->timerq, 
&protoexp->timer_send, + PSMI_TIMER_PRIO_1); + } + + PSM2_LOG_MSG("leaving"); +} +#endif // RNDV_MOD_MR + +static +psm2_error_t +ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current) +{ + struct ips_protoexp *protoexp = (struct ips_protoexp *)timer->context; + struct ips_tid_send_pend *phead = &protoexp->pend_sendq; + struct ips_tid_send_desc *tidsendc; + psm2_error_t err = PSM2_OK; + _HFI_MMDBG("ips_tid_pendsend_timer_callback\n"); + + while (!STAILQ_EMPTY(phead)) { + tidsendc = STAILQ_FIRST(phead); + + // we have some scb's and can use them to queue some more EXPTID packets +#ifdef RNDV_MOD_MR + if (tidsendc->rv_need_err_chk_rdma) + err = ips_protoexp_send_err_chk_rdma(tidsendc); + else +#endif + err = ips_tid_send_exp(tidsendc); + + if (tidsendc->is_complete) + STAILQ_REMOVE_HEAD(phead, next); + + if (err == PSM2_OK) { + /* Was able to complete the send, keep going */ + } else if (err == PSM2_EP_NO_RESOURCES) { + /* No more sendbufs available, sendbuf callback will requeue this + * timer */ + break; + } else if (err == PSM2_TIMEOUT + || err == PSM2_EPID_RV_CONNECT_RECOVERING + || err == PSM2_EPID_RV_CONNECT_ERROR) { + /* Always a case of try later: + * On PIO flow, means no send pio bufs available + * On DMA flow, means kernel can't queue request or would have to block + * On RV or user RDMA QP is full or connection recovery/issues + */ + psmi_timer_request(protoexp->proto->timerq, + &protoexp->timer_send, + get_cycles() + + protoexp->proto->timeout_send); + break; + } else { + /* Forced to reschedule later so we can check receive queue */ + psmi_assert(err == PSM2_OK_NO_PROGRESS); + psmi_timer_request(protoexp->proto->timerq, + &protoexp->timer_send, + PSMI_TIMER_PRIO_1); + break; + } + } + + return PSM2_OK; +} + +/* Right now, in the kernel we are allowing for virtually non-contiguous pages, + in a single call, and we are therefore locking one page at a time, but since + the intended use of this routine is for a single group of + virtually contiguous pages, that should change to improve + performance. That means possibly changing the calling MPI code. + Doing so gets rid of some of the loop stuff here, and in the driver, + and allows for a single call to the core VM code in the kernel, + rather than one per page, definitely improving performance. */ + + +static +psm2_error_t +ips_tid_recv_alloc(struct ips_protoexp *protoexp, + ips_epaddr_t *ipsaddr, + const struct ips_tid_get_request *getreq, + uint32_t nbytes_this, struct ips_tid_recv_desc **ptidrecvc) +{ + psm2_error_t err; + ips_scb_t *grantscb; + psm2_mq_req_t req = getreq->tidgr_req; + struct ips_proto *proto = protoexp->proto; + + struct ips_tid_recv_desc *tidrecvc; + + PSM2_LOG_MSG("entering"); + /* Allocate all necessary resources. */ + _HFI_MMDBG("tid_recv_alloc\n"); + + // allocate what we need to handle TID or RDMA on receive side + // we do this before we issue CTS + + /* 1. allocate a tid grant (CTS) scb. */ + grantscb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0); + if (grantscb == NULL) { + _HFI_MMDBG("Wait: NO GRANT SCB\n"); + /* ips_tid_scbavail_callback() will reschedule */ + PSM2_LOG_MSG("leaving"); + return PSM2_EP_NO_RESOURCES; + } + + + /* 3. allocate a tid flow entry. */ + err = ips_tf_allocate(&protoexp->tfc, &tidrecvc); + if (err != PSM2_OK) { + _HFI_MMDBG("Wait: NO tid flow\n"); + ips_scbctrl_free(grantscb); + /* Unable to get a tidflow for expected protocol. 
*/ + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); + PSM2_LOG_MSG("leaving"); + return err; + } + tidrecvc->mr = NULL; // be safe,but should be NULL since clear on release + +#ifdef PSM_CUDA + if (req->is_buf_gpu_mem) + tidrecvc->is_ptr_gpu_backed = !getreq->cuda_hostbuf_used; + else + tidrecvc->is_ptr_gpu_backed = req->is_buf_gpu_mem; + + /* 4. allocate a cuda bounce buffer, if required */ + struct ips_cuda_hostbuf *chb = NULL; + if (getreq->cuda_hostbuf_used) { + if (nbytes_this <= CUDA_SMALLHOSTBUF_SZ) + chb = (struct ips_cuda_hostbuf *) + psmi_mpool_get( + protoexp->cuda_hostbuf_pool_small_recv); + if (chb == NULL) + chb = (struct ips_cuda_hostbuf *) + psmi_mpool_get( + protoexp->cuda_hostbuf_pool_recv); + if (chb == NULL) { + /* Unable to get a cudahostbuf for TID. + * Release the resources we're holding and reschedule.*/ + ips_tf_deallocate(&protoexp->tfc, + tidrecvc->rdescid._desc_idx); + ips_scbctrl_free(grantscb); + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); + PSM2_LOG_MSG("leaving"); + return PSM2_EP_NO_RESOURCES; + } + + tidrecvc->cuda_hostbuf = chb; + tidrecvc->buffer = chb->host_buf; + chb->size = 0; + chb->gpu_buf = (CUdeviceptr) getreq->tidgr_lbuf + + getreq->tidgr_offset; + } else { + chb = NULL; + tidrecvc->buffer = (void *)((uintptr_t) getreq->tidgr_lbuf + + getreq->tidgr_offset); + tidrecvc->cuda_hostbuf = NULL; + } +#else // PSM_CUDA + tidrecvc->buffer = + (void *)((uintptr_t) getreq->tidgr_lbuf + getreq->tidgr_offset); +#endif // PSM_CUDA + + // separate MR cache's per EP, so this confirms we have the same EP + if ( +#ifdef PSM_CUDA + ! getreq->cuda_hostbuf_used && +#endif + req->mr && req->mr->cache == proto->mr_cache) { + _HFI_MMDBG("CTS chunk reference recv: %p %u bytes via %p %"PRIu64"\n", tidrecvc->buffer, nbytes_this, req->mr->addr, req->mr->length); + tidrecvc->mr = psm2_verbs_ref_mr(req->mr); + } else { + _HFI_MMDBG("CTS chunk register recv: %p %u bytes\n", tidrecvc->buffer, nbytes_this); + tidrecvc->mr = psm2_verbs_reg_mr(proto->mr_cache, 1, + proto->ep->verbs_ep.pd, + tidrecvc->buffer, nbytes_this, IBV_ACCESS_REMOTE_WRITE +#ifdef PSM_CUDA + | (tidrecvc->is_ptr_gpu_backed?IBV_ACCESS_IS_GPU_ADDR:0) +#endif + ); + if (! tidrecvc->mr) { +#ifdef PSM_CUDA + if (chb) + psmi_mpool_put(chb); +#endif + ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx); + //ips_scbctrl_free(completescb); + ips_scbctrl_free(grantscb); + /* Unable to register MR */ + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); + PSM2_LOG_MSG("leaving"); + return PSM2_TIMEOUT; // make sure we try again + } + _HFI_MMDBG("CTS chunk registered: addr %p len %d rkey 0x%x\n", tidrecvc->buffer , nbytes_this, tidrecvc->mr->rkey); + } + + tidrecvc->recv_msglen = nbytes_this; + + /* Initialize recv descriptor */ + tidrecvc->ipsaddr = ipsaddr; + tidrecvc->getreq = (struct ips_tid_get_request *)getreq; + + + tidrecvc->tidflow_nswap_gen = 0; + tidrecvc->tidflow_genseq.psn_gen = tidrecvc->tidflow_active_gen; + tidrecvc->tidflow_genseq.psn_seq = 0; /* Always start sequence number at 0 (zero), + in order to prevent wraparound sequence numbers */ + + tidrecvc->tid_list.tsess_srcoff = getreq->tidgr_offset; + tidrecvc->tid_list.tsess_length = tidrecvc->recv_msglen; + // when using kernel PD/MR for kernel rendezvous QP, we need to xlat + // our buffer to the kernel PD/MR iova space. 
+ // for user space PD/MR iova == addr and xlat is a noop + tidrecvc->tid_list.tsess_rkey = tidrecvc->mr->rkey; + tidrecvc->tid_list.tsess_raddr = tidrecvc->mr->iova + ((uintptr_t)tidrecvc->buffer - (uintptr_t)tidrecvc->mr->addr); + + tidrecvc->state = TIDRECVC_STATE_BUSY; + + tidrecvc->stats.nSeqErr = 0; + tidrecvc->stats.nGenErr = 0; + tidrecvc->stats.nReXmit = 0; + tidrecvc->stats.nErrChkReceived = 0; + + _HFI_EXP("alloc tidrecv=%d\n", + tidrecvc->rdescid._desc_idx); + + tidrecvc->grantscb = grantscb; + + *ptidrecvc = tidrecvc; /* return to caller */ + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} + +// process receiver side TID queue +// If we have TID resources, we will acquire them, setup TID HW, +// prepare a CTS and send it +// If we run out of resources with more to do, we reschedule ourselves on timer +// called directly or on timer +static +psm2_error_t +ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) +{ + struct ips_tid_get_pend *phead = + &((struct ips_protoexp *)timer->context)->pend_getreqsq; + struct ips_protoexp *protoexp; + struct ips_tid_get_request *getreq; + struct ips_tid_recv_desc *tidrecvc; + ips_epaddr_t *ipsaddr; + uint32_t nbytes_this, count; +#ifdef RNDV_MOD_MR + struct ips_tid_err_resp_pend *phead_resp = + &((struct ips_protoexp *)timer->context)->pend_err_resp; +#endif + int ret; + + PSM2_LOG_MSG("entering"); + _HFI_MMDBG("ips_tid_pendtids_timer_callback\n"); + +#ifdef RNDV_MOD_MR + while (!STAILQ_EMPTY(phead_resp)) { + ipsaddr = STAILQ_FIRST(phead_resp); + protoexp = ipsaddr->epaddr.proto->protoexp; + psmi_assert(ipsaddr->rv_need_send_err_chk_rdma_resp); + ips_protoexp_send_err_chk_rdma_resp(&ipsaddr->flows[protoexp->proto->msgflowid]); + if (! ipsaddr->rv_need_send_err_chk_rdma_resp) + STAILQ_REMOVE_HEAD(phead_resp, pend_err_resp_next); + else + break; // ips_tid_scbavail_callback will trigger us again + } +#endif + +#ifdef PSM_CUDA + if (!(((struct ips_protoexp *)timer->context)->proto->flags + & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) || + ((((struct ips_protoexp *)timer->context)->proto->flags & + IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) && + gpudirect_recv_threshold)) { + /* Before processing pending TID requests, first try to free up + * any CUDA host buffers that are now idle. */ + struct ips_tid_get_cudapend *cphead = + &((struct ips_protoexp *)timer->context)->cudapend_getreqsq; + psm2_error_t err; + + /* See if any CUDA memcpys are in progress. Grab the first getreq... */ + while (!STAILQ_EMPTY(cphead)) { + getreq = STAILQ_FIRST(cphead); + + err = psmi_cuda_reclaim_hostbufs(getreq); + if (err == PSM2_OK_NO_PROGRESS) + goto cudapend_exit; + + /* This pending cuda getreq has no more CUDA ops queued up. + * Either it's completely done, or the CUDA copies have caught + * up with the TID data xfer, but the TID xfer itself is not + * finished. + */ + if (getreq->tidgr_cuda_bytesdone == getreq->tidgr_length) { + /* TID xfer is done. + * We should only get here if: + * this was involved a cuda copy, and + * the TIX xfer is done. + */ + psmi_assert(getreq->cuda_hostbuf_used); + psmi_assert(getreq->tidgr_length == + getreq->tidgr_offset); + + /* Remove from the cudapend list, and reclaim */ + getreq->tidgr_protoexp = NULL; + getreq->tidgr_epaddr = NULL; + STAILQ_REMOVE_HEAD(cphead, tidgr_next); + + /* mark the req as done */ + if (getreq->tidgr_callback) + getreq->tidgr_callback(getreq->tidgr_req); + psmi_mpool_put(getreq); + } else + break; /* CUDA xfers in progress. Leave. 
*/ + } + } +cudapend_exit: +#endif + + while (!STAILQ_EMPTY(phead)) { + getreq = STAILQ_FIRST(phead); + ipsaddr = (ips_epaddr_t *) (getreq->tidgr_epaddr); + count = ipsaddr->msgctl->ipsaddr_count; + +ipsaddr_next: + // always stripe for rendezvous, ignore multirail_thresh_load_balance + // TBD - for multi-rail does this eliminate any advantages of + // registering the MR in ips_proto_mq_rts_match_callback + ipsaddr = ipsaddr->msgctl->ipsaddr_next; + ipsaddr->msgctl->ipsaddr_next = ipsaddr->next; + protoexp = ((psm2_epaddr_t) ipsaddr)->proto->protoexp; + + if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) { + psmi_assert(protoexp->proto->msgflowid < EP_FLOW_LAST); + struct ips_flow *flow = &ipsaddr->flows[protoexp->proto->msgflowid]; + if (flow->flags & IPS_FLOW_FLAG_SKIP_CTS) { + break; /* skip sending next CTS */ + } + } + +#ifdef PSM_CUDA + if (getreq->cuda_hostbuf_used) { + /* If this is a large transfer, we may be able to + * start reclaiming before all of the data is sent. */ + psmi_cuda_reclaim_hostbufs(getreq); + } +#endif + /* + * Calculate the next window size, avoid the last + * window too small. + */ + nbytes_this = getreq->tidgr_length - getreq->tidgr_offset; + if (nbytes_this >= 2 * getreq->tidgr_rndv_winsz) + nbytes_this = getreq->tidgr_rndv_winsz; + else if (nbytes_this > getreq->tidgr_rndv_winsz) + nbytes_this /= 2; + _HFI_MMDBG("ips_tid_pendtids_timer_callback: getreq len %u offset %u nbytes_this %u\n", getreq->tidgr_length, getreq->tidgr_offset, nbytes_this); + + /* + * If there is a next window and the next window + * length is greater than PAGESIZE, make sure the window + * starts on a page boundary. + */ +#ifdef PSM_CUDA + psm2_mq_req_t req = getreq->tidgr_req; + if (req->is_buf_gpu_mem){ + if (((getreq->tidgr_offset + nbytes_this) < + getreq->tidgr_length) && + nbytes_this > PSMI_GPU_PAGESIZE) { + uint32_t pageoff = + (((uintptr_t)getreq->tidgr_lbuf) & + (PSMI_GPU_PAGESIZE - 1)) + + getreq->tidgr_offset + nbytes_this; + nbytes_this -= pageoff & (PSMI_GPU_PAGESIZE - 1); + } + } else +#endif +// TBD - we may not need this page alignment of nbytes_this + { + if ((getreq->tidgr_offset + nbytes_this) < + getreq->tidgr_length && + nbytes_this > PSMI_PAGESIZE) { + uint32_t pageoff = + (((uintptr_t)getreq->tidgr_lbuf) & + (PSMI_PAGESIZE - 1)) + + getreq->tidgr_offset + nbytes_this; + nbytes_this -= pageoff & (PSMI_PAGESIZE - 1); + } + } + _HFI_MMDBG("ips_tid_pendtids_timer_callback: page align nbytes_this %u\n", nbytes_this); + + psmi_assert(nbytes_this >= 4); + + // for STL native the tids and tidflows available pace incoming TIDs + // for UD we still use tidflows available to pace incoming RDMA + if ((ret = ips_tf_available(&protoexp->tfc)) <= 0) { + /* We're out of tidflow. If this process used all the resource, + * the free callback will reschedule the operation, otherwise, + * we reschedule it here */ + if (ret == 0) + { + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); + } + } else if (ips_tid_recv_alloc(protoexp, ipsaddr, + getreq, nbytes_this, &tidrecvc) == PSM2_OK) { + // send the actual CTS + ips_protoexp_send_tid_grant(tidrecvc); + _HFI_VDBG("GRANT sent tididx=%d srcoff=%d nbytes=%d/%d\n", + tidrecvc->rdescid._desc_idx, + getreq->tidgr_offset, tidrecvc->recv_msglen, + getreq->tidgr_length); + + if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) { + /* + * Once the CTS was sent, we mark it per 'flow' object + * not to proceed with next CTSes until that one is done. 
+ */ + struct ips_proto *proto = tidrecvc->protoexp->proto; + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; + flow->flags |= IPS_FLOW_FLAG_SKIP_CTS; + } + + /* + * nbytes_this is the asked length for this session, + * ips_tid_recv_alloc() might register less pages, the + * real length is in tidrecvc->recv_msglen. + */ + getreq->tidgr_offset += tidrecvc->recv_msglen; + psmi_assert(getreq->tidgr_offset <= + getreq->tidgr_length); + + if (getreq->tidgr_offset == getreq->tidgr_length) { +#ifdef PSM_CUDA + if (getreq->cuda_hostbuf_used) { + /* this completes the tid xfer setup. + move to the pending cuda ops queue, + set the timer to catch completion */ + STAILQ_REMOVE_HEAD(phead, tidgr_next); + STAILQ_INSERT_TAIL( + &getreq->tidgr_protoexp->cudapend_getreqsq, + getreq, tidgr_next); + psmi_timer_request(getreq->tidgr_protoexp->timerq, + &getreq->tidgr_protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); + continue; + } +#endif + getreq->tidgr_protoexp = NULL; + getreq->tidgr_epaddr = NULL; + STAILQ_REMOVE_HEAD(phead, tidgr_next); + continue; /* try next grant request */ + } + else if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_RTS_CTS_INTERLEAVE) { + /* In case of multi rail, PSM sends one CTS per request + * per card after which the request is moved to the end + * of the queue. + */ + count--; + if (count) + goto ipsaddr_next; + STAILQ_REMOVE_HEAD(phead, tidgr_next); + STAILQ_INSERT_TAIL(phead, getreq ,tidgr_next); + continue; + } + + /* created a tidrecvc, reset count */ + count = ipsaddr->msgctl->ipsaddr_count; + goto ipsaddr_next; /* try next fragment on next ipsaddr */ + } + + /* + * We need to loop until we can't get a tidrecvc on all + * ipsaddrs, then the callbacks on the home protoexp where + * getreq is linked can resume this routine. Otherwise, we + * might make this getreq to be orphaned and cause deadlock. 
+ */ + count--; + if (count) + goto ipsaddr_next; + break; + } + PSM2_LOG_MSG("leaving"); + return PSM2_OK; /* XXX err-broken */ +} + +#ifdef PSM_CUDA +static +void psmi_cudamemcpy_tid_to_device(struct ips_tid_recv_desc *tidrecvc) +{ + struct ips_protoexp *protoexp = tidrecvc->protoexp; + struct ips_cuda_hostbuf *chb; + + chb = tidrecvc->cuda_hostbuf; + chb->size += tidrecvc->recv_msglen; + ; + + PSMI_CUDA_CALL(cuMemcpyHtoDAsync, + chb->gpu_buf, chb->host_buf, + tidrecvc->recv_msglen + , + protoexp->cudastream_recv); + PSMI_CUDA_CALL(cuEventRecord, chb->copy_status, + protoexp->cudastream_recv); + + STAILQ_INSERT_TAIL(&tidrecvc->getreq->pend_cudabuf, chb, next); + tidrecvc->cuda_hostbuf = NULL; + ips_tid_pendtids_timer_callback(&tidrecvc->getreq->tidgr_protoexp->timer_getreqs,0); +} +#endif + +// we have completed receipt of the TIDs for a given CTS +// For RC QP, this is indicated by RDMA completion w/immediate +static +psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc) +{ + struct ips_protoexp *protoexp = tidrecvc->protoexp; + struct ips_tid_get_request *getreq = tidrecvc->getreq; + psm2_error_t err = PSM2_OK; + + psmi_assert(getreq != NULL); + psmi_assert(tidrecvc->state == TIDRECVC_STATE_BUSY); + +#ifdef PSM_CUDA + if (tidrecvc->cuda_hostbuf) + psmi_cudamemcpy_tid_to_device(tidrecvc); +#endif + + if (tidrecvc->mr) { + _HFI_MMDBG("CTS recv chunk complete, releasing MR: rkey: 0x%x\n", tidrecvc->mr->rkey); + psm2_verbs_release_mr(tidrecvc->mr); + tidrecvc->mr = NULL; + } + + getreq->tidgr_bytesdone += tidrecvc->recv_msglen; + + _HFI_EXP("req=%p bytes=%d/%d\n", + getreq->tidgr_req, + getreq->tidgr_bytesdone, getreq->tidgr_length); + + tidrecvc->state = TIDRECVC_STATE_FREE; + + /* finally free the tidflow */ + ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx); + + if (getreq->tidgr_bytesdone == getreq->tidgr_length) { +#ifdef PSM_CUDA + /* if cuda, we handle callbacks when the cuda xfer is done */ + if (!getreq->cuda_hostbuf_used) { + if (getreq->tidgr_callback) + getreq->tidgr_callback(getreq->tidgr_req); + psmi_mpool_put(getreq); + } +#else + if (getreq->tidgr_callback) + getreq->tidgr_callback(getreq->tidgr_req); + psmi_mpool_put(getreq); +#endif + } else { + /* We just released some tids. + * If requests are waiting on tids to be + * freed, queue up the timer */ + if (getreq->tidgr_offset < getreq->tidgr_length) { + ips_tid_pendtids_timer_callback(&getreq-> + tidgr_protoexp-> + timer_getreqs, 0); + } + } + + /* we freed some an MR If we have pending sends or pending get requests, + * turn on the timer so it can be processed. */ + ips_tid_mravail_callback(protoexp->proto); + + return err; +} + + + + + + + diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_header.h b/prov/psm3/psm3/ptl_ips/ips_proto_header.h new file mode 100644 index 00000000000..8c4ae1d6df9 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_proto_header.h @@ -0,0 +1,202 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_PROTO_HEADER_H +#define _IPS_PROTO_HEADER_H + +/* Although defined as macros, the *_BITS values below are NOT meant to be + changed. They are defined this way so that their values are written in + exactly one place. These macros are used in struct ips_message_header + below, as well as in the active messages code for the purpose of + establishing how many arguments/handlers are supported, and to assert that + values written into the header fields are not too large for the number of + bits available. The preprocessor check below ensures less than 32 bits are + used. + */ + +/* Number of bits to use for the amhdr_len field. */ +#define IPS_AM_HDR_LEN_BITS 4 + +/* Number of bits to use for the amhdr_hidx field. Bounds the number of + * handlers supported (1 << IPS_AM_HDR_HIDX_BITS). */ +#define IPS_AM_HDR_HIDX_BITS 8 + +/* Number of bits to use for the amhdr_nargs field. Bounds the number of + arguments supported (1 << IPS_AM_HDR_NARGS_BITS). */ +#define IPS_AM_HDR_NARGS_BITS 4 + +#if (IPS_AM_HDR_LEN_BITS + IPS_AM_HDR_HIDX_BITS + IPS_AM_HDR_NARGS_BITS) > 32 +#error "Bad IPS header definition: AM fields must use 32 bits or less" +#endif + +/* Number of AM arguments that can be packets into struct_ips_message_header. + Remaining arguments up to the max (1 << IPS_AM_HDR_NARGS_BITS) are placed in + the data payload. 
*/ +#define IPS_AM_HDR_NARGS \ + (sizeof(((struct ips_message_header *)0)->data) / sizeof(psm2_amarg_t)) + +/* The actual size of the message header is determined by three paramters: + * IPS_HEADER_QUEUE_IWORDS (fixed at 5 by hardware) + * OPA words contain LRH and BTH + * IPS_HEADER_QUEUE_HWORDS (fixed at 2 by ips protocol) + * IPS hardware words contain ips-protocol-specific data + * IPS_HEADER_QUEUE_UWORDS (fixed at 7 by ips protocol) + * IPS user words contain ips-protocol-specific data + * + * The header message size is determined to as IWORDS + HWORDS + UWORDS + */ +struct ips_message_header { + __be16 lrh[4]; + __be32 bth[3]; + + // 32b alignment + /* fields below this point are in host byte order */ + struct hfi_kdeth khdr; + + // 32b alignment + struct { + __u32 flags:6; + __u32 connidx:26; /* connection idx */ + } PACK_SUFFIX; + + // 64b alignment + union { + struct { + struct { + __u32 ack_seq_num:31; + __u32 reserved:1; + } PACK_SUFFIX; + + // 32b alignment + union { + struct { /* for active message */ + __u32 amhdr_len:IPS_AM_HDR_LEN_BITS; + __u32 amhdr_nargs:IPS_AM_HDR_NARGS_BITS; + __u32 amhdr_hidx:IPS_AM_HDR_HIDX_BITS; + } PACK_SUFFIX; + __u32 mdata; /* for misc data */ + }; + + // 64b alignment + /* Inline arguments and/or message payload */ + union { + ptl_arg_t data[2]; + __u32 uwords[4]; + }; + } PACK_SUFFIX; + + /* for message header packet only */ + struct { + __u32 pad1; + __u32 tag[3]; /* 96 bits psm tag */ + ptl_arg_t hdr_data; + } PACK_SUFFIX; + + /* for expected tid packet only */ + struct { + __u8 exp_ustart[3]; /* unaligned start bytes */ + __u8 exp_uend[3]; /* unaligned end bytes */ + __u16 exp_rdescid_genc; /* tidrecvc gen count */ + ptl_arg_t exp_sdescid; /* sender descriptor id */ + __u32 exp_cksum; /* optional checksum */ + __u32 exp_offset; /* packet offset */ + } PACK_SUFFIX; + }; +} PACK_SUFFIX; +/* desc_genc is up to 32 bits, but EXPTID header (and RDMA immediate data) + * only has room for 16 bits + */ +#define IPS_HDR_RDESCID_GENC_MASK 0xffff + +/* + * OpCodes in BTH[0], 24-31 bits. Order is important!!! 
+ */ +#define OPCODE_RESERVED 0xC0 /* reserved */ +/* TINY to EXPTID_COMPLETION/ERR_CHK_RDMA_RESP are level 2 packets */ +/* sending queue keeps a copy and resends if timeout waiting for ack */ +/* order and reliability maintained */ +#define OPCODE_TINY 0xC1 /* 0 <= msglen <= 8 */ +#define OPCODE_SHORT 0xC2 /* 8 < msglen <= MTU */ +#define OPCODE_EAGER 0xC3 /* eager packet */ +#define OPCODE_LONG_RTS 0xC4 /* ready to send */ +#define OPCODE_LONG_CTS 0xC5 /* confirm to send */ +#define OPCODE_LONG_DATA 0xC6 /* long data packets */ +#define OPCODE_ERR_CHK_RDMA 0xC7 /* RDMA error recovery */ +#define OPCODE_ERR_CHK_RDMA_RESP 0xC8 /* RDMA error recovery response */ +/* ACK to ERR_CHK_GEN are "level 0 control packets" state machine driven send */ +/* reissue if given state persists */ +/* duplicates can occur with no consequences */ +#define OPCODE_ACK 0xC9 /* explicit ACK packet */ +#define OPCODE_NAK 0xCA /* explicit NAK packet */ +#define OPCODE_BECN 0xCB /* congestion control */ +#define OPCODE_ERR_CHK 0xCC /* query eager receiving */ +// 0xCD /* reserved */ +/* CONNECT_REQUEST to DISCONNECT_REPLY are "level 1 control packets" */ +/* timer based resend, but rebuild on fly when resend */ +/* consumer must deal with duplicates */ +#define OPCODE_CONNECT_REQUEST 0xCE /* connect request */ +#define OPCODE_CONNECT_REPLY 0xCF /* connect reply */ +#define OPCODE_DISCONNECT_REQUEST 0xD0 /* disconnect request */ +#define OPCODE_DISCONNECT_REPLY 0xD1 /* disconnect reply */ +/* AM_REQUEST_NOREPLY to AM_REPLY are level 2 packets */ +/* sending queue keeps a copy and resends if timeout waiting for ack */ +/* order and reliability maintained */ +#define OPCODE_AM_REQUEST_NOREPLY 0xD2 /* AM request w/o reply */ +#define OPCODE_AM_REQUEST 0xD3 /* AM request */ +#define OPCODE_AM_REPLY 0xD4 /* AM reply */ +#define OPCODE_FUTURE_FROM 0xD5 /* reserved for expansion */ +#define OPCODE_FUTURE_TO 0xDF /* reserved for expansion */ + +#endif /* _IPS_PROTO_HEADER_H */ diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_help.h b/prov/psm3/psm3/ptl_ips/ips_proto_help.h new file mode 100644 index 00000000000..a7b90faa41b --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_proto_help.h @@ -0,0 +1,557 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2017 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2017 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_PROTO_HELP_H +#define _IPS_PROTO_HELP_H + +#include "ptl_ips.h" + +/* hfi_opcode is not the ips-level opcode. */ +PSMI_ALWAYS_INLINE( +uint8_t +_get_proto_hfi_opcode(const struct ips_message_header *p_hdr)) +{ + return ((__be32_to_cpu(p_hdr->bth[0]) >> + HFI_BTH_OPCODE_SHIFT) & HFI_BTH_OPCODE_MASK); +} + +PSMI_ALWAYS_INLINE( +uint8_t +ips_flow_gen_ackflags(ips_scb_t *scb, struct ips_flow *flow)) +{ + /* + * Setup ACK request if more than ack_interval packets + * have not been requested an ACK + */ + if (scb->scb_flags & IPS_SEND_FLAG_ACKREQ || scb->nfrag > 1) { + flow->ack_counter = 0; + } else { + flow->ack_counter++; + if (flow->ack_counter > flow->ack_interval) { + flow->ack_counter = 0; + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + } + } + + /* Bottom 6 bits wind up in protocol header fields, other bits + * control other aspects of packet composition */ + return (uint8_t) (scb->scb_flags & IPS_SEND_FLAG_PROTO_OPTS); +} + +PSMI_ALWAYS_INLINE( +ips_epaddr_flow_t +ips_proto_flowid(struct ips_message_header *p_hdr)) +{ + return (ips_epaddr_flow_t) ((__be32_to_cpu(p_hdr->bth[1]) >> + HFI_BTH_FLOWID_SHIFT) & + HFI_BTH_FLOWID_MASK); +} + +PSMI_ALWAYS_INLINE( +int +ips_do_cksum(struct ips_proto *proto, struct ips_message_header *p_hdr, + void *payload, uint32_t paylen, uint32_t *cksum)) +{ + uint16_t paywords; + + /* Update the payload words in header */ + paywords = (sizeof(struct ips_message_header) + paylen + + PSM_CRC_SIZE_IN_BYTES + HFI_CRC_SIZE_IN_BYTES) >> + BYTE2DWORD_SHIFT; + p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK); + + /* Need to regenerate KDETH checksum after updating payload length */ + /* ips_kdeth_cksum(p_hdr); */ + + *cksum = 0xffffffff; + + /* Checksum header */ + *cksum = ips_crc_calculate(sizeof(struct ips_message_header), + (uint8_t *) p_hdr, *cksum); + + /* Checksum payload (if any) */ + if (paylen) { + psmi_assert_always(payload); + *cksum = ips_crc_calculate(paylen, (uint8_t *) payload, *cksum); + } + + return 0; +} + + +PSMI_ALWAYS_INLINE( +void +ips_proto_hdr(struct ips_proto *proto, struct ips_epaddr *ipsaddr, + struct ips_flow *flow, ips_scb_t *scb, uint8_t flags)) +{ + uint16_t slid, dlid; + uint32_t paywords = (sizeof(struct ips_message_header) + + scb->payload_size + HFI_CRC_SIZE_IN_BYTES) >> + BYTE2DWORD_SHIFT; + struct ips_message_header *p_hdr = &scb->ips_lrh; +#if 0 + /* + * This scb has been used by this connection last time, + * so some of the header fields are already set. 
+ */ + if (scb->flow == flow) { + p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK); + + p_hdr->bth[0] = __cpu_to_be32(flow->path->pr_pkey | + (scb-> + opcode << BTH_OPCODE_SHIFT) | + (extra_bytes << + BTH_EXTRA_BYTE_SHIFT)); + p_hdr->bth[2] = + __cpu_to_be32(flow->xmit_seq_num. + psn | (scb->scb_flags & IPS_SEND_FLAG_ACKREQ)); + + p_hdr->khdr.kdeth0 = __cpu_to_le32(scb->offset | + (scb-> + offset_mode << + HFI_KHDR_OM_SHIFT) | (scb-> + tid << + HFI_KHDR_TID_SHIFT) + | (scb-> + tidctrl << + HFI_KHDR_TIDCTRL_SHIFT) | + (scb-> + flags & IPS_SEND_FLAG_INTR) + | (scb-> + flags & + IPS_SEND_FLAG_HDR_SUPPRESS) + | (IPS_PROTO_VERSION << + HFI_KHDR_KVER_SHIFT)); + + /* ips_kdeth_cksum(p_hdr); // Generate KDETH checksum */ + + p_hdr->ack_seq_num = flow->recv_seq_num.psn; + p_hdr->flags = flags; + + return; + } +#endif + slid = flow->path->pr_slid; + dlid = flow->path->pr_dlid; + if (scb->scb_flags & IPS_SEND_FLAG_NO_LMC) { + slid = ipsaddr->pathgrp->pg_base_slid; + dlid = ipsaddr->pathgrp->pg_base_dlid; + } + + /* Setup LRH fields */ + p_hdr->lrh[0] = __cpu_to_be16(HFI_LRH_BTH | + ((flow->path->pr_sl & HFI_LRH_SL_MASK) << + HFI_LRH_SL_SHIFT) + ); + p_hdr->lrh[1] = dlid; + p_hdr->lrh[2] = __cpu_to_be16(paywords & HFI_LRH_PKTLEN_MASK); + p_hdr->lrh[3] = slid; + + /* Setup BTH fields */ + p_hdr->bth[0] = __cpu_to_be32(flow->path->pr_pkey | + (scb->opcode << HFI_BTH_OPCODE_SHIFT)); + p_hdr->bth[2] = __cpu_to_be32(flow->xmit_seq_num.psn_num | + (scb->scb_flags & IPS_SEND_FLAG_ACKREQ)); + + if (scb->tidctrl) { /* expected receive packet */ + psmi_assert(scb->tidsendc != NULL); + p_hdr->bth[1] = __cpu_to_be32((scb->tidsendc-> + rdescid._desc_idx + << HFI_BTH_FLOWID_SHIFT)); + + /* Setup KHDR fields */ + p_hdr->khdr.kdeth0 = __cpu_to_le32(p_hdr->khdr.kdeth0 | + (scb->tidctrl << + HFI_KHDR_TIDCTRL_SHIFT) | + (scb->scb_flags & + IPS_SEND_FLAG_INTR) + | (IPS_PROTO_VERSION << + HFI_KHDR_KVER_SHIFT)); + } else { /* eager receive packet */ + p_hdr->bth[1] = __cpu_to_be32((flow->flowid + << HFI_BTH_FLOWID_SHIFT)); + /* Setup KHDR fields */ + p_hdr->khdr.kdeth0 = __cpu_to_le32(p_hdr->khdr.kdeth0 | + (scb->scb_flags & + IPS_SEND_FLAG_INTR) + | (IPS_PROTO_VERSION << + HFI_KHDR_KVER_SHIFT)); + + p_hdr->ack_seq_num = flow->recv_seq_num.psn_num; + } + + p_hdr->khdr.job_key = 0; + p_hdr->connidx = ipsaddr->connidx_outgoing; + p_hdr->flags = flags; + + scb->flow = flow; + + return; +} + +/* + * Assumes that the following fields are already set in scb: + * payload + * payload_size + * flags + */ +PSMI_INLINE( +void +ips_scb_prepare_flow_inner(struct ips_proto *proto, struct ips_epaddr *ipsaddr, + struct ips_flow *flow, ips_scb_t *scb)) +{ + psmi_assert((scb->payload_size & 3) == 0); + ips_proto_hdr(proto, ipsaddr, flow, scb, + ips_flow_gen_ackflags(scb, flow)); + + scb->ack_timeout = proto->epinfo.ep_timeout_ack; + scb->abs_timeout = TIMEOUT_INFINITE; + scb->scb_flags |= IPS_SEND_FLAG_PENDING; + + if (flow->protocol == PSM_PROTOCOL_TIDFLOW) { + flow->xmit_seq_num.psn_seq += scb->nfrag; + scb->seq_num = flow->xmit_seq_num; + scb->seq_num.psn_seq--; + } else { + flow->xmit_seq_num.psn_num = + (flow->xmit_seq_num.psn_num + scb->nfrag) & proto->psn_mask; + scb->seq_num.psn_num = + (flow->xmit_seq_num.psn_num - 1) & proto->psn_mask; + } + + return; +} + +PSMI_ALWAYS_INLINE( +void +ips_proto_epaddr_stats_set(struct ips_proto *proto, uint8_t msgtype)) +{ + switch (msgtype) { + case OPCODE_ACK: + break; + case OPCODE_ERR_CHK: + proto->epaddr_stats.err_chk_send++; + break; + case OPCODE_NAK: + 
proto->epaddr_stats.nak_send++; + break; + case OPCODE_CONNECT_REQUEST: + proto->epaddr_stats.connect_req_send++; + break; + case OPCODE_CONNECT_REPLY: + proto->epaddr_stats.connect_rep_send++; + break; + case OPCODE_DISCONNECT_REQUEST: + proto->epaddr_stats.disconnect_req_send++; + break; + case OPCODE_DISCONNECT_REPLY: + proto->epaddr_stats.disconnect_rep_send++; + break; + default: + break; + } + return; +} + +/* + * Exported there solely for inlining is_expected_or_nak and mq_tiny handling + */ +extern +psm2_error_t ips_proto_send_ctrl_message(struct ips_flow *flow, + uint8_t message_type, uint16_t *msg_queue_mask, + ips_scb_t *ctrlscb, void *payload, uint32_t paylen); + +PSMI_ALWAYS_INLINE( +void +ips_proto_send_ack(struct ips_recvhdrq *recvq, struct ips_flow *flow)) +{ + if_pt(recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) { + if (flow->flags & IPS_FLOW_FLAG_PENDING_NAK) { + flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK; /* ACK clears NAK */ + } else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_ACK)) { + SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next); + } + + flow->flags |= IPS_FLOW_FLAG_PENDING_ACK; + } + else { + ips_scb_t ctrlscb; + + ctrlscb.scb_flags = 0; + ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num; + /* Coalesced ACKs disabled. Send ACK immediately */ + ips_proto_send_ctrl_message(flow, OPCODE_ACK, + &flow->ipsaddr->ctrl_msg_queued, + &ctrlscb, ctrlscb.cksum, 0); + } +} + +PSMI_ALWAYS_INLINE( +void +ips_proto_send_nak(struct ips_recvhdrq *recvq, struct ips_flow *flow)) +{ + if_pt(recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) { + if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) { + flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK; /* NAK clears ACK */ + } else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_NAK)) { + SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next); + } + + flow->flags |= IPS_FLOW_FLAG_PENDING_NAK; + } + else { + ips_scb_t ctrlscb; + + ctrlscb.scb_flags = 0; + ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num; + /* Coalesced ACKs disabled. Send NAK immediately */ + ips_proto_send_ctrl_message(flow, OPCODE_NAK, + &flow->ipsaddr->ctrl_msg_queued, + &ctrlscb, ctrlscb.cksum, 0); + } +} + +/* return 1 if packet is next expected in flow + * return 0 if packet is not next expected in flow (and nak packet). + */ +PSMI_ALWAYS_INLINE( +int +ips_proto_is_expected_or_nak(struct ips_recvhdrq_event *rcv_ev)) +{ + struct ips_proto *proto = rcv_ev->proto; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow; + psmi_seqnum_t sequence_num; + + psmi_assert(flowid == EP_FLOW_GO_BACK_N_PIO); + flow = &ipsaddr->flows[flowid]; + + sequence_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); + if_pf(flow->recv_seq_num.psn_num == sequence_num.psn_num) { + flow->flags &= ~IPS_FLOW_FLAG_NAK_SEND; + + flow->recv_seq_num.psn_num = + (flow->recv_seq_num.psn_num + 1) & proto->psn_mask; + + /* don't process ack, caller will do it. */ + return 1; + + } + + int16_t diff = (int16_t) (sequence_num.psn_num - + flow->recv_seq_num.psn_num); + if (diff > 0) { + if (!(flow->flags & IPS_FLOW_FLAG_NAK_SEND)) { + /* Queue/Send NAK to peer */ + ips_proto_send_nak((struct ips_recvhdrq *) + rcv_ev->recvq, flow); + flow->flags |= IPS_FLOW_FLAG_NAK_SEND; + } + } + + /* process ack if packet is not in sequence. */ + ips_proto_process_ack(rcv_ev); + + return 0; +} + +/* + * Note, some code depends on the literal values specified in this enum. 
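
Both ips_proto_is_expected_or_nak() above and ips_proto_check_msg_order() below lean on the same trick: the difference between the received and the expected sequence number is cast to int16_t, so the expected / past / future decision keeps working after the counter wraps. A minimal standalone sketch of that comparison (the helper name is illustrative, and the example assumes a plain 16-bit sequence space for brevity; the real code additionally masks PSNs with proto->psn_mask):

#include <assert.h>
#include <stdint.h>

/* < 0: old duplicate (past), 0: next expected, > 0: future (a gap, so NAK) */
static int16_t seq_compare(uint16_t received, uint16_t expected)
{
	return (int16_t)(received - expected);
}

int main(void)
{
	assert(seq_compare(100, 100) == 0);   /* in sequence */
	assert(seq_compare(105, 100) > 0);    /* packets missing, queue a NAK */
	assert(seq_compare(95, 100) < 0);     /* stale duplicate, drop */
	assert(seq_compare(3, 0xfffd) > 0);   /* still "future" across the wrap */
	return 0;
}
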
+ */ +enum ips_msg_order { + IPS_MSG_ORDER_PAST = 3, /* Old message, recv & drop */ + IPS_MSG_ORDER_EXPECTED_MATCH = 2, /* Expected message, recv on match */ + IPS_MSG_ORDER_EXPECTED = 1, /* Expected message, always recv */ + IPS_MSG_ORDER_FUTURE_RECV = 0, /* Future message, buffer in OOO Q */ + IPS_MSG_ORDER_FUTURE = -1, /* Future message, leave on RHQ */ +}; + +PSMI_ALWAYS_INLINE( +enum ips_msg_order +ips_proto_check_msg_order(ips_epaddr_t *ipsaddr, + struct ips_flow *flow, + uint16_t send_seqnum, + uint16_t *recv_seqnum)) + +{ + int16_t diff = (int16_t) (*recv_seqnum - send_seqnum); + + if (likely(diff == 0)) { + *recv_seqnum += 1; + + ipsaddr->msg_toggle ^= IPS_FLOW_MSG_TOGGLE_UNEXP_MASK; + if (ipsaddr->msg_toggle & IPS_FLOW_MSG_TOGGLE_UNEXP_MASK) + return IPS_MSG_ORDER_EXPECTED_MATCH; + + return IPS_MSG_ORDER_EXPECTED; + } else if (diff > 0) { + return IPS_MSG_ORDER_PAST; + } + + ipsaddr->msg_toggle ^= IPS_FLOW_MSG_TOGGLE_OOO_MASK; + if (!(ipsaddr->msg_toggle & IPS_FLOW_MSG_TOGGLE_OOO_MASK)) { + /* + * Second time to see the same ooo message, receive and put + * into OOO queue. + */ + return IPS_MSG_ORDER_FUTURE_RECV; + } + + /* The first time to see an OOO message, leave it there and try + * next time. But we need to revert back the receiving flow PSN. */ + uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask; + flow->recv_seq_num.psn_num = + (flow->recv_seq_num.psn_num - 1) & psn_mask; + return IPS_MSG_ORDER_FUTURE; +} + +PSMI_INLINE( +int +ips_proto_process_packet(const struct ips_recvhdrq_event *rcv_ev)) +{ + uint32_t index; + +#ifdef PSM_FI + + if_pf(PSMI_FAULTINJ_ENABLED_EP(rcv_ev->proto->ep)) { + PSMI_FAULTINJ_STATIC_DECL(fi_recv, "recvlost", + "drop " + "RC eager or any " + "UD packet at recv", + 1, IPS_FAULTINJ_RECVLOST); + if (psmi_faultinj_is_fault(fi_recv)) + return IPS_RECVHDRQ_CONTINUE; + } +#endif /* #ifdef PSM_FI */ + /* see file ips_proto_header.h for details */ + index = _get_proto_hfi_opcode(rcv_ev->p_hdr) - OPCODE_RESERVED; + if (index >= (OPCODE_FUTURE_FROM - OPCODE_RESERVED)) + index = 0; + + return ips_packet_service_routine[index] + ((struct ips_recvhdrq_event *)rcv_ev); +} + +/* + * Breaks header encapsulation but needed in mq sends so we can pay + * "near-equal" attention to putting sends on the wire and servicing the + * receive queue. + */ + +PSMI_ALWAYS_INLINE( +psm2_error_t +ips_recv_progress_if_busy(ptl_t *ptl_gen, psm2_error_t err)) +{ + struct ptl_ips *ptl = (struct ptl_ips *) ptl_gen; + + if (err == PSM2_EP_NO_RESOURCES) { + ptl->ctl->ep_poll(ptl_gen, 0); + return PSM2_OK; + } else + return err; +} + +/* Find next lowest power of a two for a 32 bit number*/ +PSMI_ALWAYS_INLINE( +unsigned int +ips_next_low_pow2(unsigned int v)) +{ + + const unsigned int b[] = { 0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000 }; + const unsigned int S[] = { 1, 2, 4, 8, 16 }; + register unsigned int r = 1; + int i; + + for (i = 4; i >= 0; i--) { + if (v & b[i]) { + v >>= S[i]; + r <<= S[i]; + } + } + + return r; +} + +PSMI_ALWAYS_INLINE( +ips_path_rec_t * +ips_select_path(struct ips_proto *proto, ips_path_type_t path_type, + ips_epaddr_t *ipsaddr, ips_path_grp_t *pathgrp)) +{ + uint32_t path_idx; + + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) { + /* If dispersive routes are configured then select the routes + * in round robin order. We may want to use congestion + * information to select the least lightly loaded path. 
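
Returning to ips_next_low_pow2() above: it rounds a 32-bit value down to the nearest power of two by testing progressively narrower bit bands, halving v and doubling r for every populated band, which effectively computes 1 << floor(log2(v)). A standalone copy of the bit trick with a brute-force cross-check, offered only as an illustration:

#include <assert.h>

/* Highest power of two <= v, for v >= 1. */
static unsigned int low_pow2(unsigned int v)
{
	const unsigned int b[] = { 0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000 };
	const unsigned int s[] = { 1, 2, 4, 8, 16 };
	unsigned int r = 1;
	int i;

	for (i = 4; i >= 0; i--) {
		if (v & b[i]) {
			v >>= s[i];
			r <<= s[i];
		}
	}
	return r;
}

int main(void)
{
	unsigned int v;

	for (v = 1; v < (1u << 16); v++) {
		unsigned int ref = 1;
		while ((ref << 1) <= v)   /* brute-force reference */
			ref <<= 1;
		assert(low_pow2(v) == ref);
	}
	return 0;
}
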
+ */ + path_idx = pathgrp->pg_next_path[path_type]; + if (++pathgrp->pg_next_path[path_type] >= + pathgrp->pg_num_paths[path_type]) + pathgrp->pg_next_path[path_type] = 0; + } else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST) + path_idx = /* Key on destination context */ + ipsaddr->IPSADDR_HASH % pathgrp->pg_num_paths[path_type]; + else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC) + path_idx = /* Key off src context */ + proto->epinfo.EP_HASH % pathgrp->pg_num_paths[path_type]; + else /* Base LID routed - Default in Infinhfi 2.5 (Oct 09). */ + path_idx = 0; + + return pathgrp->pg_path[path_idx][path_type]; +} + +#endif /* _IPS_PROTO_HELP_H */ diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_internal.h b/prov/psm3/psm3/ptl_ips/ips_proto_internal.h new file mode 100644 index 00000000000..917e098842a --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_proto_internal.h @@ -0,0 +1,85 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_PROTO_INTERNAL_H +#define _IPS_PROTO_INTERNAL_H + +#include "ips_expected_proto.h" +#include "ips_proto_help.h" + +/* + * Connect protocol. + * + * On receive, handled by upcalling into the connect interface. + * On send, handled by ips_proto by having connect compose the message. 
+ */ +psm2_error_t ips_proto_process_connect(struct ips_proto *proto, + uint8_t opcode, + struct ips_message_header *p_hdr, + void *payload, uint32_t paylen); +psm2_error_t ips_proto_timer_ack_callback(struct psmi_timer *, uint64_t); +psm2_error_t ips_proto_timer_send_callback(struct psmi_timer *, uint64_t); +psm2_error_t ips_proto_timer_ctrlq_callback(struct psmi_timer *, uint64_t); +psm2_error_t ips_proto_timer_pendq_callback(struct psmi_timer *, uint64_t); +void ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context); + +psm2_error_t ips_proto_recv_init(struct ips_proto *proto); +psm2_error_t ips_proto_recv_fini(struct ips_proto *proto); + +int ips_proto_process_err_chk(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_connect_disconnect(struct ips_recvhdrq_event *rcv_ev); +int ips_proto_process_unknown_opcode(struct ips_recvhdrq_event *rcv_ev); + +#endif /* _IPS_PROTO_INTERNAL_H */ diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_mq.c b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c new file mode 100644 index 00000000000..1a71f3156c9 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c @@ -0,0 +1,2013 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ + +#include "psm_user.h" +#include "psm2_hal.h" +#ifdef PSM_CUDA +#include "psm_gdrcpy.h" +#endif +#include "ips_scb.h" +#include "ips_proto.h" +#include "psm_mq_internal.h" +#include "ips_expected_proto.h" +#include "ips_proto_help.h" + +PSMI_NEVER_INLINE(ips_scb_t * + ips_poll_scb(struct ips_proto *proto, + int npkts, int len, uint32_t flags, int istiny)) +{ + ips_scb_t *scb = NULL; + psmi_assert(npkts > 0); + psm2_error_t err; + + proto->stats.scb_egr_unavail_cnt++; + + PSMI_BLOCKUNTIL(proto->ep, err, + ((scb = + (istiny ? + ips_scbctrl_alloc_tiny(&proto->scbc_egr) : + ips_scbctrl_alloc(&proto->scbc_egr, npkts, len, + flags))) != NULL)); + psmi_assert(scb != NULL); + return scb; +} + +PSMI_ALWAYS_INLINE(ips_scb_t *mq_alloc_tiny(struct ips_proto *proto)) +{ + ips_scb_t *scb = ips_scbctrl_alloc_tiny(&proto->scbc_egr); + /* common case should branch right through */ + if_pt(scb != NULL) + return scb; + else + return ips_poll_scb(proto, 1, 0, 0, 1); +} + +PSMI_ALWAYS_INLINE( +ips_scb_t * +mq_alloc_pkts(struct ips_proto *proto, int npkts, int len, uint32_t flags)) +{ + psmi_assert(npkts > 0); + ips_scb_t *scb = ips_scbctrl_alloc(&proto->scbc_egr, npkts, len, flags); + if_pt(scb != NULL) { + return scb; + } + else { + return ips_poll_scb(proto, npkts, len, flags, + 0 /* not tiny scb */); + } +} + +// handle end to end completion of eager and LONG_DATA sends +static +int ips_proto_mq_eager_complete(void *reqp, uint32_t nbytes) +{ + psm2_mq_req_t req = (psm2_mq_req_t) reqp; + + /* This code path is executed when the send is on a device buffer + * and the receive is completed using eager buffers. As there is no + * completion notification sent to the sender, this is the only place + * where send side chb's can be freed and put back into the mpool. + */ +#ifdef PSM_CUDA + struct ips_cuda_hostbuf *chb; + if (req->cuda_hostbuf_used) { + while (!STAILQ_EMPTY(&req->sendreq_prefetch)) { + /* If any prefetched buffers weren't used, they + must be reclaimed here. */ + chb = STAILQ_FIRST(&req->sendreq_prefetch); + STAILQ_REMOVE_HEAD(&req->sendreq_prefetch, + req_next); + psmi_mpool_put(chb); + } + } +#endif + + req->send_msgoff += nbytes; + /* + * the reason to use >= is because + * we may have DW pad in nbytes. 
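
A small numeric illustration of the comment above: per-fragment byte counts are rounded up to dword (4-byte) multiples, so the accumulated send offset may overshoot the application length and the completion check must use ">=" rather than "==". This is only a sketch of the arithmetic, not PSM3 code:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t send_msglen = 10;                  /* application bytes */
	uint32_t acked = (send_msglen + 3) & ~3u;   /* 12 bytes incl. dword pad */
	uint32_t send_msgoff = 0;

	send_msgoff += acked;                       /* completion callback input */
	assert(send_msgoff >= send_msglen);         /* 12 >= 10: message complete */
	assert(send_msgoff != send_msglen);         /* an "==" test would never fire */
	return 0;
}
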
+ */ + if (req->send_msgoff >= req->req_data.send_msglen) { + // If we predicted use of RDMA and pre-registered our buffer when we + // sent RTS, and receiver chose LONG_DATA in CTS, we can end up here + // and need to release our MR + if (req->mr) { + _HFI_MMDBG("RTS complete, releasing MR: rkey: 0x%x\n", req->mr->rkey); + psm2_verbs_release_mr(req->mr); + req->mr = NULL; + ips_tid_mravail_callback(req->rts_peer->proto); + } + req->state = MQ_STATE_COMPLETE; + ips_barrier(); + if(!psmi_is_req_internal(req)) + mq_qq_append(&req->mq->completed_q, req); + } + return IPS_RECVHDRQ_CONTINUE; +} + +static +void ips_proto_mq_rv_complete(psm2_mq_req_t req) +{ + psmi_mq_handle_rts_complete(req); +} + +PSMI_ALWAYS_INLINE( +void +ips_shortcpy(void *vdest, const void *vsrc, uint32_t nchars)) +{ + unsigned char *dest = vdest; + const unsigned char *src = vsrc; + +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED && (PSMI_IS_CUDA_MEM(vdest) || PSMI_IS_CUDA_MEM((void *) vsrc))) { + PSMI_CUDA_CALL(cuMemcpy, + (CUdeviceptr)vdest, (CUdeviceptr)vsrc, nchars); + return; + } +#endif + + if (nchars >> 2) + hfi_dwordcpy((uint32_t *) dest, (uint32_t *) src, nchars >> 2); + dest += (nchars >> 2) << 2; + src += (nchars >> 2) << 2; + switch (nchars & 0x03) { + case 3: *dest++ = *src++; + /* fall through */ + case 2: *dest++ = *src++; + /* fall through */ + case 1: *dest++ = *src++; + } + return; +} + +#ifdef PSM_CUDA +PSMI_ALWAYS_INLINE( +void +ips_shortcpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars)) +{ + unsigned char *dest = vdest; + const unsigned char *src = vsrc; + + if (nchars >> 2) + hfi_dwordcpy((uint32_t *) dest, (uint32_t *) src, nchars >> 2); + dest += (nchars >> 2) << 2; + src += (nchars >> 2) << 2; + switch (nchars & 0x03) { + case 3: *dest++ = *src++; + /* fall through */ + case 2: *dest++ = *src++; + /* fall through */ + case 1: *dest++ = *src++; + } + return; +} +#endif + +extern psm2_error_t ips_ptl_poll(ptl_t *ptl, int _ignored); + +/* + * Mechanism to capture PIO-ing or DMA-ing the MQ message envelope + * + * Recoverable errors: + * PSM2_OK: If PIO, envelope is sent. + * If DMA, all queued up packets on flow were flushed. + * + * Recoverable errors converted to PSM2_OK just before return: + * PSM2_OK_NO_PROGRESS: DMA-only, flushed 1 but not all queued packets. + * PSM2_EP_NO_RESOURCES: + * If PIO, no pio available or cable currently pulled. + * If DMA, can be that no scb's available to handle unaligned packets + * or writev returned a recoverable error (no mem for + * descriptors, dma interrupted or no space left in dma queue). + * + * Unrecoverable errors (PIO or DMA). + * PSM2_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure, + * rxe/txe parity error. + * PSM2_EP_NO_NETWORK: No network, no lid, ... + */ +PSMI_ALWAYS_INLINE( +psm2_error_t +ips_mq_send_envelope(struct ips_proto *proto, struct ips_flow *flow, + struct ips_scb *scb, int do_flush)) +{ + psm2_error_t err = PSM2_OK; + + ips_proto_flow_enqueue(flow, scb); + + if ((flow->transfer == PSM_TRANSFER_PIO) || do_flush) + err = flow->flush(flow, NULL); + + if (do_flush) + err = ips_recv_progress_if_busy(proto->ptl, err); + + /* As per the PSM error model (or lack thereof), PSM clients expect to see + * only PSM2_OK as a recoverable error */ + if (err == PSM2_EP_NO_RESOURCES || err == PSM2_OK_NO_PROGRESS) + err = PSM2_OK; + return err; +} + +/* + * We don't use message striping for middle message protocol, + * Tests on sandy-bridge two HFIs show lower bandwidth if + * message striping is used. 
+ */ +ustatic +psm2_error_t +ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req, + struct ips_flow *flow, psm2_mq_tag_t *tag, const void *ubuf, + uint32_t len) +{ + ips_epaddr_t *ipsaddr = flow->ipsaddr; + psm2_error_t err = PSM2_OK; + uintptr_t buf = (uintptr_t) ubuf; + uint32_t nbytes_left, pktlen, offset, chunk_size; + uint16_t msgseq, padding; + ips_scb_t *scb; + uint32_t is_non_dw_mul_allowed = 0; + + psmi_assert(len > 0); + psmi_assert(req != NULL); + + chunk_size = flow->frag_size; + msgseq = ipsaddr->msgctl->mq_send_seqnum++; + + nbytes_left = len; + offset = 0; + do { + if (is_non_dw_mul_allowed) { + /* No need to care about padding if non-double word + * multiple message size is allowed. + */ + padding = 0; + } else { + padding = nbytes_left & 0x3; + } + + if (padding) { + psmi_assert(nbytes_left > flow->frag_size); + /* over reading should be OK on sender because + * the padding area is within the whole buffer, + * receiver will discard the extra bytes via + * padcnt in packet header + */ + padding = 4 - padding; + pktlen = flow->frag_size - padding; + } else { + pktlen = min(chunk_size, nbytes_left); + psmi_assert(((pktlen & 0x3) == 0) || (is_non_dw_mul_allowed)); + } + + scb = mq_alloc_pkts(proto, 1, 0, 0); + psmi_assert(scb != NULL); + ips_scb_opcode(scb) = OPCODE_EAGER; + ips_set_LMC_LID_choice(proto, scb, len); + scb->ips_lrh.khdr.kdeth0 = msgseq; + ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); + scb->ips_lrh.hdr_data.u32w1 = len; + scb->ips_lrh.hdr_data.u32w0 = offset; /* initial offset */ + + _HFI_VDBG + ("payload=%p, thislen=%d, frag_size=%d, nbytes_left=%d\n", + (void *)buf, pktlen, flow->frag_size, nbytes_left); + ips_scb_buffer(scb) = (void *)buf; + +#ifdef PSM_CUDA + /* PSM would never send packets using eager protocol + * if GPU Direct RDMA is turned off, which makes setting + * these flags safe. + */ + if (req->is_buf_gpu_mem) { + ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; + ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; + } +#endif + + buf += pktlen; + offset += pktlen; + nbytes_left -= pktlen; + + pktlen += padding; + psmi_assert(((pktlen & 0x3) == 0) || (is_non_dw_mul_allowed)); + + scb->frag_size = flow->frag_size; + scb->nfrag = (pktlen + flow->frag_size - 1) / flow->frag_size; + if (scb->nfrag > 1) { + ips_scb_length(scb) = flow->frag_size; + scb->nfrag_remaining = scb->nfrag; + scb->chunk_size = + scb->chunk_size_remaining = pktlen; + } else + ips_scb_length(scb) = pktlen; + + if (nbytes_left == 0) { /* last segment/packet */ + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + + /* Set ACKREQ if single packet per scb. For multi + * packets per scb, it is SDMA, driver will set + * ACKREQ in last packet, we only need ACK for + * last packet. + */ + if (scb->nfrag == 1) + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ; + } else { + req->send_msgoff += pktlen; + } + + ips_proto_flow_enqueue(flow, scb); + if (flow->transfer == PSM_TRANSFER_PIO) { + /* we need to flush the pio pending queue as quick as possible */ + err = flow->flush(flow, NULL); + } + + } while (nbytes_left); + + + /* Before return, try to make some progress as long as the operation is + * not a fast path isend. If this is a fast path isend we cannot call + * progress functions since that will cause recursion into recvhdrq_progress + * and cause messages to be lost. 
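
The fragmentation loop above absorbs a non-dword message tail into the first fragment: that fragment carries frag_size - padding application bytes (the sender over-reads up to 3 pad bytes, the receiver drops them via the pad count in the header), after which every remaining fragment length is a dword multiple. A compile-and-run sketch of just that arithmetic, using made-up sizes:

#include <assert.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	const uint32_t frag_size = 4096;   /* assumed wire fragment payload */
	uint32_t nbytes_left = 10001;      /* message length, not a dword multiple */
	uint32_t padding, pktlen;

	while (nbytes_left) {
		padding = nbytes_left & 0x3;
		if (padding) {
			/* only reachable while more than one fragment remains;
			 * shorter messages never enter this loop */
			assert(nbytes_left > frag_size);
			padding = 4 - padding;
			pktlen = frag_size - padding;    /* application bytes this fragment */
		} else {
			pktlen = MIN(frag_size, nbytes_left);
		}
		nbytes_left -= pktlen;
		pktlen += padding;                       /* bytes on the wire */
		assert((pktlen & 0x3) == 0);             /* wire length dword aligned */
		assert((nbytes_left & 0x3) == 0);        /* remainder dword aligned */
	}
	return 0;
}
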
Instead, for fast path if the operation + * was successfully enqueued, but flush returned PSM2_OK_NO_PROGRESS we return + * PSM2_OK since the user will progress the queue once the fast path call is + * complete. + */ + if (err == PSM2_EP_NO_RESOURCES || err == PSM2_OK_NO_PROGRESS) { + if (likely(!(req->flags_internal & PSMI_REQ_FLAG_FASTPATH))) { + err = ips_recv_progress_if_busy(proto->ptl, PSM2_EP_NO_RESOURCES); + } else if (err == PSM2_EP_NO_RESOURCES) { + err = PSM2_OK; + } + } + + return err; +} + +static +psm2_error_t +ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, + ips_epaddr_t *ipsaddr, const void *buf, uint32_t len) +{ + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + struct ips_flow *flow = &ipsaddr->flows[proto->msgflowid]; + psm2_error_t err = PSM2_OK; + ips_scb_t *scb; + + PSM2_LOG_MSG("entering"); + req->req_data.buf = (void *)buf; + req->req_data.buf_len = len; + req->req_data.send_msglen = len; + req->recv_msgoff = 0; + req->rts_peer = (psm2_epaddr_t) ipsaddr; + + scb = mq_alloc_pkts(proto, 1, 0, 0); + psmi_assert(scb); + ips_scb_opcode(scb) = OPCODE_LONG_RTS; + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ; + if (req->type & MQE_TYPE_WAITING) + ips_scb_flags(scb) |= IPS_SEND_FLAG_BLOCKING; + scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++; + ips_scb_copy_tag(scb->ips_lrh.tag, req->req_data.tag.tag); + scb->ips_lrh.hdr_data.u32w1 = len; + scb->ips_lrh.hdr_data.u32w0 = psmi_mpool_get_obj_index(req); + + // small well aligned synchronous payload is sent in RTS itself + // CTS becomes the synchronous ACK + if (len <= flow->frag_size && +#ifdef PSM_CUDA + !req->is_buf_gpu_mem && +#endif + !(len & 0x3)) { + ips_scb_buffer(scb) = (void *)buf; + ips_scb_length(scb) = len; + req->send_msgoff = len; + } else { + ips_scb_length(scb) = 0; + req->send_msgoff = 0; + } + +#ifdef PSM_CUDA + /* Used to indicate to the receiver that the send + * is issued on a device buffer. This helps the + * receiver select TID instead of using eager buffers. + */ + if (req->is_buf_gpu_mem) { + ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; + scb->mq_req = req; /* request comes from GPU domain (device) ... */ + } + req->cuda_hostbuf_used = 0; + if ((!(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) && + req->is_buf_gpu_mem && + (len > GPUDIRECT_THRESH_RV)) || + ((proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) && + req->is_buf_gpu_mem && + (len > gpudirect_send_threshold))) { + /* send from intermediate host buffer */ + struct ips_cuda_hostbuf *chb; + uint32_t offset, window_len; + int prefetch_lookahead = 0; + + STAILQ_INIT(&req->sendreq_prefetch); + offset = 0; + req->cuda_hostbuf_used = 1; + /* start prefetching */ + req->prefetch_send_msgoff = 0; + while ((offset < len) && + (prefetch_lookahead < proto->cuda_prefetch_limit)) { + chb = NULL; + window_len = + ips_cuda_next_window(ipsaddr->window_rv, + offset, len); + + if (window_len <= CUDA_SMALLHOSTBUF_SZ) + chb = (struct ips_cuda_hostbuf *) + psmi_mpool_get( + proto->cuda_hostbuf_pool_small_send); + if (chb == NULL) + chb = (struct ips_cuda_hostbuf *) + psmi_mpool_get( + proto->cuda_hostbuf_pool_send); + + /* any buffers available? 
*/ + if (chb == NULL) + break; + + req->prefetch_send_msgoff += window_len; + + chb->offset = offset; + chb->size = window_len; + chb->req = req; + chb->gpu_buf = (CUdeviceptr) buf + offset; + chb->bytes_read = 0; + + PSMI_CUDA_CALL(cuMemcpyDtoHAsync, + chb->host_buf, chb->gpu_buf, + window_len, + proto->cudastream_send); + PSMI_CUDA_CALL(cuEventRecord, + chb->copy_status, + proto->cudastream_send); + + STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, + req_next); + offset += window_len; + prefetch_lookahead++; + } + } +#endif + + PSM2_LOG_EPM_COND((len > proto->mq->hfi_thresh_rv) && + proto->protoexp, + OPCODE_LONG_RTS,PSM2_LOG_TX,proto->ep->epid, req->rts_peer->epid, + "scb->ips_lrh.hdr_data.u32w0: %d",scb->ips_lrh.hdr_data.u32w0); + + _HFI_VDBG("sending with rndv %u\n", len); + /* If this is a fast path isend, then we cannot poll or + * allow progressing of the mq from within the fast path + * call otherwise messages will be lost. Therefore given fast path + * we will avoid calling poll_internal and not set PSMI_TRUE which would + * call ips_recv_progress_if_busy. + */ + if ((err = ips_mq_send_envelope(proto, flow, scb, + ! unlikely(req->flags_internal & PSMI_REQ_FLAG_FASTPATH)))) + goto fail; +// TBD - we may want to include odd bytes at start +// and end of message in the RTS itself as opposed to being in last +// EXPTID payload packet's header +// then the RDMA Write can be better aligned and may perform better + // Start registering memory for anticipated CTS requesting RDMA + // TBD - we could reduce duation of memory pin by doing this only + // once we receive CTS, but that will put this call in the critical + // path. If done after getting CTS we don't have to predict + // if remote end will chose RDMA vs LONG DATA approach (eg. if tests of + // length, etc below) + // + // register buffer we will use as source for RDMA Write + // for PSM_CUDA, a group of host bounce buffers may be used above + // ips_scb_buffer catches when RTS contains the data, in which case no + // need for memory registration. While unlkely we also skip + // registration for zero length sync messages + // PSM3_RDMA if disabled causes proto->protoexp == NULL + if (! ips_scb_buffer(scb) && len + && len > proto->mq->hfi_thresh_rv + && proto->protoexp /* expected tid recieve enabled */ + && ips_epaddr_connected(ipsaddr) +#ifdef PSM_CUDA + && len > GPUDIRECT_THRESH_RV + && ! req->cuda_hostbuf_used +#endif + ) { + req->mr = psm2_verbs_reg_mr(proto->mr_cache, 0, proto->ep->verbs_ep.pd, + req->req_data.buf, req->req_data.send_msglen, 0 +#ifdef PSM_CUDA + | (req->is_buf_gpu_mem?IBV_ACCESS_IS_GPU_ADDR:0) +#endif + ); + // if we failed to register memory we will try again when + // we get the CTS. + } + + if_pt (! (req->flags_internal & PSMI_REQ_FLAG_FASTPATH)) { + /* Assume that we already put a few rndv requests in flight. 
This helps + * for bibw microbenchmarks and doesn't hurt the 'blocking' case since + * we're going to poll anyway */ + psmi_poll_internal(proto->ep, 1); + } + +fail: + _HFI_VDBG + ("[rndv][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p/%d]: %s\n", + psmi_epaddr_get_name(proto->ep->epid), + psmi_epaddr_get_name(req->rts_peer->epid), buf, len, + req->req_data.tag.tag[0], req->req_data.tag.tag[1], req->req_data.tag.tag[2], req, + psmi_mpool_get_obj_index(req), psm2_error_get_string(err)); + PSM2_LOG_MSG("leaving"); + return err; +} + +#ifdef PSM_CUDA +static inline +int psmi_cuda_is_buffer_gpu_mem(void *ubuf) +{ + return (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)); +} + +static inline +int psmi_cuda_is_needed_rendezvous(struct ips_proto *proto, uint32_t len) +{ + if (!(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) || + !PSMI_IS_GDR_COPY_ENABLED || + len < 1 || len > cuda_thresh_rndv){ + return 1; + } + + return 0; +} +#endif + +/* Find the correct flow (PIO/DMA) */ +static inline +ips_epaddr_flow_t +flow_select_type(struct ips_proto *proto, uint32_t len, int gpu_mem, + uint32_t eager_thresh) +{ + // minor optimization, we don't use SDMA for UD or UDP yet, so just return a + // constant and compiler will optimize + return EP_FLOW_GO_BACK_N_PIO; +} + +psm2_error_t ips_proto_msg_size_thresh_query (enum psm2_info_query_thresh_et qt, + uint32_t *out, psm2_mq_t mq, psm2_epaddr_t epaddr) +{ + struct ptl_ips *ptl = (struct ptl_ips *) epaddr->ptlctl->ptl; + psm2_error_t rv = PSM2_INTERNAL_ERR; + + switch (qt) + { + case PSM2_INFO_QUERY_THRESH_IPS_PIO_DMA: + *out = ptl->proto.iovec_thresh_eager; + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_THRESH_IPS_TINY: + *out = mq->hfi_thresh_tiny; + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_THRESH_IPS_PIO_FRAG_SIZE: + { + ips_epaddr_t *ipsaddr = ((ips_epaddr_t *) epaddr)->msgctl->ipsaddr_next; + *out = ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].frag_size; + } + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_THRESH_IPS_DMA_FRAG_SIZE: + *out = 0; + rv = PSM2_OK; + break; + case PSM2_INFO_QUERY_THRESH_IPS_RNDV: + *out = mq->hfi_thresh_rv; + rv = PSM2_OK; + break; + default: + break; + } + + return rv; +} + +psm2_error_t +ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user, + uint32_t flags_internal, psm2_mq_tag_t *tag, const void *ubuf, + uint32_t len, void *context, psm2_mq_req_t *req_o) +{ + psm2_error_t err = PSM2_OK; + ips_epaddr_flow_t flow_type; + struct ips_proto *proto; + struct ips_flow *flow; + ips_epaddr_t *ipsaddr; + ips_scb_t *scb; + psm2_mq_req_t req; +#if defined(PSM_CUDA) + int converted = 0; +#endif // PSM_CUDA + + req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); + if_pf(req == NULL) + return PSM2_NO_MEMORY; + + _HFI_VDBG("(req=%p) ubuf=%p len=%u\n", req, ubuf, len); + + req->flags_user = flags_user; + req->flags_internal = flags_internal; + if (len >= mepaddr->proto->multirail_thresh_load_balance) { + ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next; + ipsaddr->msgctl->ipsaddr_next = ipsaddr->next; + } else { + ipsaddr = (ips_epaddr_t *)mepaddr; + } + psmi_assert(ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED); + + proto = ((psm2_epaddr_t) ipsaddr)->proto; + + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + req->req_data.send_msglen = len; + req->req_data.tag = *tag; + req->req_data.context = context; + +#ifdef PSM_CUDA + req->is_buf_gpu_mem = psmi_cuda_is_buffer_gpu_mem((void*)ubuf); + req->cuda_hostbuf_used = 0; + if (req->is_buf_gpu_mem) { + psmi_cuda_set_attr_sync_memops(ubuf); + if 
(psmi_cuda_is_needed_rendezvous(proto, len)) + goto do_rendezvous; + } +#else + req->is_buf_gpu_mem = 0; +#endif + flow_type = flow_select_type(proto, len, req->is_buf_gpu_mem, + proto->iovec_thresh_eager); + flow = &ipsaddr->flows[flow_type]; + + if (flags_user & PSM2_MQ_FLAG_SENDSYNC) { + goto do_rendezvous; + } else if (len <= mq->hfi_thresh_tiny) { + scb = mq_alloc_tiny(proto); + psmi_assert(scb); + ips_scb_opcode(scb) = OPCODE_TINY; + ips_set_LMC_LID_choice(proto, scb, len); + scb->ips_lrh.khdr.kdeth0 = + ((len & HFI_KHDR_TINYLEN_MASK) << HFI_KHDR_TINYLEN_SHIFT) | + ipsaddr->msgctl->mq_send_seqnum++; + ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); + + const void *user_buffer = ubuf; +#ifdef PSM_CUDA + if (req->is_buf_gpu_mem) { + /* The following functions PINS the GPU pages + * and mmaps the pages into the process virtual + * space. This allows PSM to issue a standard + * memcpy to move data between HFI resources + * and the GPU + */ + ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; + user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD, + (unsigned long)ubuf, len, 0, proto); + converted = 1; + } + mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data, + (uint32_t *) user_buffer, len); + if (converted) { + gdr_unmap_gpu_host_addr(GDR_FD, user_buffer, len, proto); + } +#else + mq_copy_tiny((uint32_t *) &scb->ips_lrh.hdr_data, + (uint32_t *) user_buffer, len); +#endif + + /* If this is a fast path isend, then we cannot allow + * progressing of the mq from within the fast path + * call otherwise messages will be lost. Therefore given fast path + * we will set PSMI_FALSE which will prevent the call to + * ips_recv_progress_if_busy. + */ + err = ips_mq_send_envelope(proto, flow, scb, !(flags_internal & PSMI_REQ_FLAG_FASTPATH)); + if (err != PSM2_OK) + return err; + + /* We can mark this op complete since all the data is now copied + * into an SCB that remains live until it is remotely acked */ + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + _HFI_VDBG + ("[itiny][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf, + len, tag->tag[0], tag->tag[1], tag->tag[2], req); + } else if (len <= flow->frag_size) { + uint32_t paylen = len & ~0x3; + + scb = mq_alloc_pkts(proto, 1, 0, 0); + psmi_assert(scb); + ips_scb_opcode(scb) = OPCODE_SHORT; + ips_set_LMC_LID_choice(proto, scb, len); + scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++; + scb->ips_lrh.hdr_data.u32w1 = len; + ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); + const void * user_buffer = ubuf; +#ifdef PSM_CUDA + if (req->is_buf_gpu_mem && len <= gdr_copy_threshold_send){ + ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; + user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD, + (unsigned long)ubuf, len , 0, proto); + converted = 1; + } +#endif + + ips_scb_buffer(scb) = (void *)user_buffer; + + ips_scb_length(scb) = paylen; + if (len > paylen) { + /* there are nonDW bytes, copy to header */ + mq_copy_tiny + ((uint32_t *)&scb->ips_lrh.hdr_data.u32w0, + (uint32_t *)((uintptr_t)ubuf + paylen), + len - paylen); + + /* for complete callback */ + req->send_msgoff = len - paylen; + } else { + req->send_msgoff = 0; + } + +#if defined(PSM_CUDA) + if (converted) { + gdr_unmap_gpu_host_addr(GDR_FD, user_buffer, len, proto); + } +#endif // PSM_CUDA + /* + * Need ack for send side completion because we + * send from user buffer. 
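
The OPCODE_SHORT path above keeps the payload dword aligned by sending only the len & ~0x3 prefix from the user buffer and carrying the 1-3 trailing bytes in a header word. A self-contained sketch of that split (a plain memcpy stands in for mq_copy_tiny; the buffer and sizes are made up):

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	const unsigned char ubuf[11] = "0123456789";   /* 11-byte message (incl. NUL) */
	uint32_t len = sizeof(ubuf);
	uint32_t paylen = len & ~0x3u;                 /* 8 bytes travel as payload */
	uint32_t hdr_word = 0;                         /* stands in for hdr_data.u32w0 */

	memcpy(&hdr_word, ubuf + paylen, len - paylen);    /* 3 tail bytes into header */

	assert(paylen == 8 && len - paylen == 3);
	assert(memcmp(&hdr_word, ubuf + paylen, 3) == 0);  /* receiver can restore tail */
	return 0;
}
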
+ */ + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ; +#ifdef PSM_CUDA + if (req->is_buf_gpu_mem && len > gdr_copy_threshold_send) { + ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; + ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; + } +#endif + /* If this is a fast path isend, then we cannot allow + * progressing of the mq from within the fast path + * call otherwise messages will be lost. Therefore given fast path + * we will set PSMI_FALSE which will prevent the call to + * ips_recv_progress_if_busy. + */ + err = ips_mq_send_envelope(proto, flow, scb, !(flags_internal & PSMI_REQ_FLAG_FASTPATH)); + if (err != PSM2_OK) + return err; + + /* + * It should be OK to check the buffer address in + * 'scb' to be changed, when this scb is done, the + * address is set to NULL when scb is put back to + * scb pool. Even if the same scb is re-used, it + * is not possible to set to this 'buf' address. + */ + if (ips_scb_buffer(scb) == (void *)user_buffer) { + /* continue to send from user buffer */ + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + } else { + /* mark the message done */ + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + } + _HFI_VDBG + ("[ishrt][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf, + len, tag->tag[0], tag->tag[1], tag->tag[2], req); + } else if (len <= mq->hfi_thresh_rv) { + req->send_msgoff = 0; + err = ips_ptl_mq_eager(proto, req, flow, tag, ubuf, len); + if (err != PSM2_OK) + return err; + + _HFI_VDBG + ("[ilong][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x][req=%p]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), ubuf, + len, tag->tag[0], tag->tag[1], tag->tag[2], req); + } else { /* skip eager accounting below */ +do_rendezvous: + err = ips_ptl_mq_rndv(proto, req, ipsaddr, ubuf, len); + *req_o = req; + return err; + } + + *req_o = req; + mq->stats.tx_num++; + mq->stats.tx_eager_num++; + mq->stats.tx_eager_bytes += len; + + return err; +} + +psm2_error_t +ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, + psm2_mq_tag_t *tag, const void *ubuf, uint32_t len) +{ + psm2_error_t err = PSM2_OK; + ips_epaddr_flow_t flow_type; + struct ips_proto *proto; + struct ips_flow *flow; + ips_epaddr_t *ipsaddr; + ips_scb_t *scb; + int gpu_mem = 0; +#if defined(PSM_CUDA) + int converted = 0; +#endif // PSM_CUDA + + _HFI_VDBG("ubuf=%p len=%u\n", ubuf, len); + + if (len >= mepaddr->proto->multirail_thresh_load_balance) { + ipsaddr = ((ips_epaddr_t *) mepaddr)->msgctl->ipsaddr_next; + ipsaddr->msgctl->ipsaddr_next = ipsaddr->next; + } else { + ipsaddr = (ips_epaddr_t *)mepaddr; + } + psmi_assert(ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED); + + proto = ((psm2_epaddr_t) ipsaddr)->proto; + + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + +#ifdef PSM_CUDA + gpu_mem = psmi_cuda_is_buffer_gpu_mem((void*)ubuf); + if (gpu_mem) { + psmi_cuda_set_attr_sync_memops(ubuf); + if (psmi_cuda_is_needed_rendezvous(proto, len)) + goto do_rendezvous; + } +#endif + flow_type = flow_select_type(proto, len, gpu_mem, + proto->iovec_thresh_eager_blocking); + flow = &ipsaddr->flows[flow_type]; + + if (flags & PSM2_MQ_FLAG_SENDSYNC) { + goto do_rendezvous; + } else if (len <= mq->hfi_thresh_tiny) { + scb = mq_alloc_tiny(proto); + psmi_assert(scb); + ips_scb_opcode(scb) = OPCODE_TINY; + ips_set_LMC_LID_choice(proto, scb, len); + scb->ips_lrh.khdr.kdeth0 = + ((len & HFI_KHDR_TINYLEN_MASK) << 
HFI_KHDR_TINYLEN_SHIFT) | + ipsaddr->msgctl->mq_send_seqnum++; + ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); +#ifdef PSM_CUDA + const void *user_buffer = ubuf; + if (gpu_mem){ + /* The following functions PINS the GPU pages + * and mmaps the pages into the process virtual + * space. This allows PSM to issue a standard + * memcpy to move data between HFI resources + * and the GPU + */ + ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; + user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD, + (unsigned long)ubuf, len, 0, proto); + converted = 1; + } + mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data, + (uint32_t *) user_buffer, len); + if (converted) { + gdr_unmap_gpu_host_addr(GDR_FD, user_buffer, len, proto); + } +#else // PSM_CUDA + mq_copy_tiny + ((uint32_t *) &scb->ips_lrh.hdr_data, + (uint32_t *) ubuf, len); +#endif // PSM_CUDA + err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE); + if (err != PSM2_OK) + return err; + + _HFI_VDBG("[tiny][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), + ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]); + } else if (len <= flow->frag_size) { + uint32_t paylen = len & ~0x3; + + scb = mq_alloc_pkts(proto, 1, 0, 0); + psmi_assert(scb); + ips_scb_opcode(scb) = OPCODE_SHORT; + ips_set_LMC_LID_choice(proto, scb, len); + scb->ips_lrh.khdr.kdeth0 = ipsaddr->msgctl->mq_send_seqnum++; + scb->ips_lrh.hdr_data.u32w1 = len; + ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); + + const void * user_buffer = ubuf; +#ifdef PSM_CUDA + if (gpu_mem && len <= gdr_copy_threshold_send) { + ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; + user_buffer = gdr_convert_gpu_to_host_addr(GDR_FD, + (unsigned long)ubuf, len, 0, proto); + converted = 1; + } +#endif + + + ips_scb_buffer(scb) = (void *)user_buffer; + ips_scb_length(scb) = paylen; + if (len > paylen) { + /* there are nonDW bytes, copy to header */ + mq_copy_tiny + ((uint32_t *)&scb->ips_lrh.hdr_data.u32w0, + (uint32_t *)((uintptr_t)ubuf + paylen), + len - paylen); + } +#if defined(PSM_CUDA) + if (converted) { + gdr_unmap_gpu_host_addr(GDR_FD, user_buffer, len, proto); + } +#endif // PSM_CUDA + + /* + * Need ack for send side completion because we + * send from user buffer. + */ + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ; +#ifdef PSM_CUDA + if (gpu_mem && len > gdr_copy_threshold_send) { + ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; + ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; + } +#endif + err = ips_mq_send_envelope(proto, flow, scb, PSMI_TRUE); + if (err != PSM2_OK) + return err; + + /* + * It should be OK to check the buffer address in + * 'scb' to be changed, when this scb is done, the + * address is set to NULL when scb is put back to + * scb pool. Even if the same scb is re-used, it + * is not possible to set to this 'ubuf' address. 
+ */ + if (ips_scb_buffer(scb) == (void *)user_buffer) { + if (flow->transfer != PSM_TRANSFER_PIO || + paylen > proto->scb_bufsize || + !ips_scbctrl_bufalloc(scb)) { + /* sdma transfer (can't change user buffer), + * or, payload is larger than bounce buffer, + * or, can't allocate bounce buffer, + * send from user buffer till complete */ + PSMI_BLOCKUNTIL(mq->ep, err, + ips_scb_buffer(scb) != (void*)user_buffer); + if (err > PSM2_OK_NO_PROGRESS) + return err; + err = PSM2_OK; + } else { + /* copy to bounce buffer */ +#ifdef PSM_CUDA + ips_shortcpy_host_mem +#else + ips_shortcpy +#endif + (ips_scb_buffer(scb), + (void*)user_buffer, paylen); + } + } + _HFI_VDBG("[shrt][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), + ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]); + + } else if (len <= mq->hfi_thresh_rv) { + psm2_mq_req_t req; + + /* Block until we can get a req */ + PSMI_BLOCKUNTIL(mq->ep, err, + (req = + psmi_mq_req_alloc(mq, MQE_TYPE_SEND))); + if (err > PSM2_OK_NO_PROGRESS) + return err; + +#ifdef PSM_CUDA + req->cuda_hostbuf_used = 0; + if (gpu_mem) { + req->is_buf_gpu_mem = 1; + } else + req->is_buf_gpu_mem = 0; +#endif + + req->type |= MQE_TYPE_WAITING; + req->req_data.send_msglen = len; + req->req_data.tag = *tag; + req->send_msgoff = 0; + req->flags_user = flags; + req->flags_internal |= PSMI_REQ_FLAG_IS_INTERNAL; + + err = ips_ptl_mq_eager(proto, req, flow, tag, ubuf, len); + if (err != PSM2_OK) + return err; + + psmi_mq_wait_internal(&req); + + _HFI_VDBG("[long][%s->%s][b=%p][m=%d][t=%08x.%08x.%08x]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), + ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]); + } else { + psm2_mq_req_t req; +do_rendezvous: + /* Block until we can get a req */ + PSMI_BLOCKUNTIL(mq->ep, err, + (req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND))); + if (err > PSM2_OK_NO_PROGRESS) + return err; + + req->type |= MQE_TYPE_WAITING; + req->req_data.tag = *tag; + req->flags_user = flags; + req->flags_internal |= PSMI_REQ_FLAG_IS_INTERNAL; + +#ifdef PSM_CUDA + if (gpu_mem) { + req->is_buf_gpu_mem = 1; + } else + req->is_buf_gpu_mem = 0; +#endif + + err = ips_ptl_mq_rndv(proto, req, ipsaddr, ubuf, len); + if (err != PSM2_OK) + return err; + psmi_mq_wait_internal(&req); + return err; /* skip accounting, done separately at completion time */ + } + + mq->stats.tx_num++; + mq->stats.tx_eager_num++; + mq->stats.tx_eager_bytes += len; + + return err; +} + +static +psm2_error_t +ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) +{ + psm2_epaddr_t epaddr = req->rts_peer; + struct ips_proto *proto = epaddr->proto; + + /* We have a match. + * We may already set with first packet, + * If we're doing eager-based r-v, just send back the sreq and length and + * have the sender complete the send. + */ + PSM2_LOG_MSG("entering"); + _HFI_MMDBG("rts_match_callback\n"); +#ifdef PSM_CUDA + /* Cases where we do not use TIDs: + * 1) Recv on a host buffer, Send on a gpu buffer and len is less than 3 bytes + * 2) Recv on a host buffer, Send on a host buffer and len is less than hfi_thresh_rv + * 3) Recv on gpu buf and len is less than 3 bytes + * 4) Expected protocol not initialized. 
+ */ + if ((!req->is_buf_gpu_mem && ((req->is_sendbuf_gpu_mem && + req->req_data.recv_msglen <= GPUDIRECT_THRESH_RV)|| + (!req->is_sendbuf_gpu_mem && + req->req_data.recv_msglen <= proto->mq->hfi_thresh_rv))) || + (req->is_buf_gpu_mem && req->req_data.recv_msglen <= GPUDIRECT_THRESH_RV) || + ! ips_epaddr_connected((ips_epaddr_t *) epaddr) || + proto->protoexp == NULL) { /* no expected tid recieve */ +#else // PSM_CUDA + if ( + ! ips_epaddr_connected((ips_epaddr_t *) epaddr) || + req->req_data.recv_msglen <= proto->mq->hfi_thresh_rv || /* less rv theshold */ + proto->protoexp == NULL) { /* no expected tid recieve */ +#endif // PSM_CUDA +//do_long_data: + // send CTS asking for use of LONG_DATA send of large message + + /* there is no order requirement, try to push CTS request + * directly, if fails, then queue it for later try. */ + _HFI_VDBG("pushing CTS\n"); + if (ips_proto_mq_push_cts_req(proto, req) != PSM2_OK) { + struct ips_pend_sends *pends = &proto->pend_sends; + struct ips_pend_sreq *sreq = + psmi_mpool_get(proto->pend_sends_pool); + psmi_assert(sreq != NULL); + if (sreq == NULL) + { + PSM2_LOG_MSG("leaving"); + return PSM2_NO_MEMORY; + } + sreq->type = IPS_PENDSEND_EAGER_REQ; + sreq->req = req; + + STAILQ_INSERT_TAIL(&pends->pendq, sreq, next); + psmi_timer_request(proto->timerq, &pends->timer, + PSMI_TIMER_PRIO_1); + } + } else { + // send CTS asking for use of TID send of large message + // register buffer we will use as destination for remote RDMA Write + // We choose not to register memory when recv is posted since + // that could pin memory for a long time waiting for a tag match + // and recv buffers could be much larger than the messages they tag + // match with, resulting in unnecessary MR registration. + // req->req_data.buf is app buffer + // req->req_data.buf_len is app buffer length + // req->req_data.send_msglen is agreed amount to transfer (<= buf_len) + // TBD - if we were tight on MR resources, this could tie up more + // resources than needed, in which case skipping this and registering + // per CTS below could be better + // TBD - it might help MR cache hit rate if we registered the whole + // receive buffer (req->req_data.buf_len), this way large receive + // buffers which match smaller messages can get MR cache hit for + // various sized messages which may arrive in the buffer + psmi_assert(req->req_data.send_msglen); // 0 len uses LONG_DATA above +#ifdef PSM_CUDA + // for GPU receive buffer we need to sort things out at a lower level + // since may use a host bounce buffer for RDMA and need to register it + if (! req->is_buf_gpu_mem) { +#else + { +#endif + req->mr = psm2_verbs_reg_mr(proto->mr_cache, 0, + proto->ep->verbs_ep.pd, + req->req_data.buf, req->req_data.send_msglen, + IBV_ACCESS_REMOTE_WRITE); + if (! req->mr) { + // ips_protoexp_tid_get_from_token will try to get MR again + // and will retry via ips_tid_pendtids_timer_callback. So we + // can just fall through with req->mr == NULL. + // The alternative would be to goto and force use of LONG_DATA + //goto do_long_data; + } else { + _HFI_MMDBG("rbuf registered: addr %p len %d rkey 0x%x\n", req->req_data.buf, req->req_data.send_msglen, req->mr->rkey); + } + } + _HFI_VDBG("matched rts, trying TID\n"); + ips_protoexp_tid_get_from_token(proto->protoexp, req->req_data.buf, + req->req_data.recv_msglen, epaddr, + req->rts_reqidx_peer, + req->type & MQE_TYPE_WAITING_PEER ? 
+ IPS_PROTOEXP_TIDGET_PEERWAIT : + 0, ips_proto_mq_rv_complete, + req); + } + + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} + +psm2_error_t +ips_proto_mq_push_cts_req(struct ips_proto *proto, psm2_mq_req_t req) +{ + ips_epaddr_t *ipsaddr = (ips_epaddr_t *) (req->rts_peer); + struct ips_flow *flow; + ips_scb_t *scb; + ptl_arg_t *args; + + PSM2_LOG_MSG("entering"); + psmi_assert(proto->msgflowid < EP_FLOW_LAST); + flow = &ipsaddr->flows[proto->msgflowid]; + scb = ips_scbctrl_alloc(&proto->scbc_egr, 1, 0, 0); + if (scb == NULL) + { + PSM2_LOG_MSG("leaving"); + return PSM2_OK_NO_PROGRESS; + } + args = (ptl_arg_t *) scb->ips_lrh.data; + + ips_scb_opcode(scb) = OPCODE_LONG_CTS; + scb->ips_lrh.khdr.kdeth0 = 0; + args[0].u32w0 = psmi_mpool_get_obj_index(req); + args[1].u32w1 = req->req_data.recv_msglen; + args[1].u32w0 = req->rts_reqidx_peer; + + PSM2_LOG_EPM(OPCODE_LONG_CTS,PSM2_LOG_TX, proto->ep->epid, + flow->ipsaddr->epaddr.epid ,"req->rts_reqidx_peer: %d", + req->rts_reqidx_peer); + + ips_proto_flow_enqueue(flow, scb); + flow->flush(flow, NULL); + + /* have already received enough bytes */ + if (req->recv_msgoff == req->req_data.recv_msglen) { + ips_proto_mq_rv_complete(req); + } + + PSM2_LOG_MSG("leaving"); + return PSM2_OK; +} + +// rendezvous using LONG DATA "eager push" instead of TID +psm2_error_t +ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) +{ + psm2_error_t err = PSM2_OK; + uintptr_t buf = (uintptr_t) req->req_data.buf + req->recv_msgoff; + ips_epaddr_t *ipsaddr = (ips_epaddr_t *) (req->rts_peer); + uint32_t nbytes_left = req->req_data.send_msglen - req->recv_msgoff; + uint32_t nbytes_sent = 0; + uint32_t nbytes_this, chunk_size; + uint16_t frag_size, unaligned_bytes; + struct ips_flow *flow; + ips_scb_t *scb; + + psmi_assert(nbytes_left > 0); + + PSM2_LOG_MSG("entering."); + { + /* use PIO transfer */ + flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; + chunk_size = frag_size = flow->frag_size; + } + + do { + /* + * don't try to call progression routine such as: + * ips_recv_progress_if_busy() in this loop, + * it will cause recursive call of this function. + */ + + /* + * When tid code path is enabled, we don’t allocate scbc_rv + * objects. If the message is less than the hfi_thresh_rv, + * we normally use eager protocol to do the transfer. + * However, if it is sync send, we use the rendezvous + * rts/cts/rts-data protocol. + * In this case, because scbc_rv is null, + * we use scbc_egr instead. + */ + + scb = ips_scbctrl_alloc(proto->scbc_rv ? proto->scbc_rv + : &proto->scbc_egr, 1, 0, 0); + if (scb == NULL) { + err = PSM2_OK_NO_PROGRESS; + break; + } + ips_scb_opcode(scb) = OPCODE_LONG_DATA; + scb->ips_lrh.khdr.kdeth0 = 0; + scb->ips_lrh.data[0].u32w0 = req->rts_reqidx_peer; + scb->ips_lrh.data[1].u32w1 = req->req_data.send_msglen; + + /* attached unaligned bytes into packet header */ + unaligned_bytes = nbytes_left & 0x3; + if (unaligned_bytes) { + mq_copy_tiny((uint32_t *)&scb->ips_lrh.mdata, + (uint32_t *)buf, unaligned_bytes); + + /* position to send */ + buf += unaligned_bytes; + req->recv_msgoff += unaligned_bytes; + psmi_assert(req->recv_msgoff < 4); + + /* for complete callback */ + req->send_msgoff += unaligned_bytes; + + nbytes_left -= unaligned_bytes; + nbytes_sent += unaligned_bytes; + } + scb->ips_lrh.data[1].u32w0 = req->recv_msgoff; + ips_scb_buffer(scb) = (void *)buf; +#ifdef PSM_CUDA + // SDMA identifies GPU buffers itself. 
But PIO path needs flags + if (req->is_buf_gpu_mem + ) { + ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; + ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; + } +#endif + + scb->frag_size = frag_size; + nbytes_this = min(chunk_size, nbytes_left); + if (nbytes_this > 0) + scb->nfrag = (nbytes_this + frag_size - 1) / frag_size; + else + scb->nfrag = 1; + + if (scb->nfrag > 1) { + ips_scb_length(scb) = frag_size; + scb->nfrag_remaining = scb->nfrag; + scb->chunk_size = + scb->chunk_size_remaining = nbytes_this; + } else + ips_scb_length(scb) = nbytes_this; + + buf += nbytes_this; + req->recv_msgoff += nbytes_this; + nbytes_sent += nbytes_this; + nbytes_left -= nbytes_this; + if (nbytes_left == 0) { + /* because of scb callback, use eager complete */ + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + + /* Set ACKREQ if single packet per scb. For multi + * packets per scb, it is SDMA, driver will set + * ACKREQ in last packet, we only need ACK for + * last packet. + */ + if (scb->nfrag == 1) + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACKREQ; + } else { + req->send_msgoff += nbytes_this; + } + + ips_proto_flow_enqueue(flow, scb); + if (flow->transfer == PSM_TRANSFER_PIO) { + /* we need to flush the pio pending queue as quick as possible */ + flow->flush(flow, NULL); + } + + } while (nbytes_left); + + + PSM2_LOG_MSG("leaving."); + + return err; +} + +// received a CTS +int +ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ips_proto *proto = rcv_ev->proto; + psm2_mq_t mq = proto->ep->mq; + struct ips_flow *flow; + psm2_mq_req_t req; + uint32_t paylen; + + /* + * if PSN does not match, drop the packet. + */ + PSM2_LOG_MSG("entering"); + if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) + { + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; + } + req = psmi_mpool_find_obj_by_index(mq->sreq_pool, p_hdr->data[1].u32w0); + psmi_assert(req != NULL); + + /* + * if there is payload, it is expected tid protocol + * with tid session info as the payload. + */ + paylen = ips_recvhdrq_event_paylen(rcv_ev); + if (paylen > 0) { + // we will use TID RDMA + ips_tid_session_list *payload = + ips_recvhdrq_event_payload(rcv_ev); + psmi_assert(paylen == 0 || payload); + PSM2_LOG_EPM(OPCODE_LONG_CTS,PSM2_LOG_RX,rcv_ev->ipsaddr->epaddr.epid, + mq->ep->epid,"p_hdr->data[1].u32w0 %d", + p_hdr->data[1].u32w0); + proto->epaddr_stats.tids_grant_recv++; + + psmi_assert(p_hdr->data[1].u32w1 > mq->hfi_thresh_rv); // msglen + psmi_assert(proto->protoexp != NULL); + + /* ptl_req_ptr will be set to each tidsendc */ + if (req->ptl_req_ptr == NULL) { + req->req_data.send_msglen = p_hdr->data[1].u32w1; + } + psmi_assert(req->req_data.send_msglen == p_hdr->data[1].u32w1); + + if (! req->mr +#ifdef PSM_CUDA + && ! req->cuda_hostbuf_used +#endif + ) { + // we predicted use of LONG DATA and remote side chose RDMA + // or we failed to register memory previously. 
+ req->mr = psm2_verbs_reg_mr(proto->mr_cache, 0, + proto->ep->verbs_ep.pd, + req->req_data.buf, req->req_data.send_msglen, 0 +#ifdef PSM_CUDA + | (req->is_buf_gpu_mem?IBV_ACCESS_IS_GPU_ADDR:0) +#endif + ); + // if we still don't have an MR, we will try again later + } + _HFI_MMDBG("ips_proto_mq_handle_cts for TID CTS\n"); + if (ips_tid_send_handle_tidreq(proto->protoexp, + rcv_ev->ipsaddr, req, p_hdr->data[0], + p_hdr->mdata, payload, paylen) == 0) { + proto->psmi_logevent_tid_send_reqs.next_warning = 0; + } else { + flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)]; + flow->recv_seq_num.psn_num -= 1; /* Decrement seq number to NAK proper CTS */ + ips_proto_send_nak((struct ips_recvhdrq *)rcv_ev->recvq, flow); + static unsigned int msg_cnt = 0; + if (msg_cnt++ == 0) { /* Report the message only once */ + _HFI_INFO("PSM3 memory shortage detected. Please consider modifying PSM3_MEMORY setting\n"); + } + return PSM2_EP_NO_RESOURCES; + } + } else { + // we will use LONG DATA push + req->rts_reqidx_peer = p_hdr->data[0].u32w0; /* eager receive only */ + req->req_data.send_msglen = p_hdr->data[1].u32w1; + + if (req->send_msgoff >= req->req_data.send_msglen) { + /* already sent enough bytes, may truncate so using >= */ + ips_proto_mq_rv_complete(req); + } else if (ips_proto_mq_push_rts_data(proto, req) != PSM2_OK) { + /* there is no order requirement, tried to push RTS data + * directly and not done, so queue it for later try. */ + struct ips_pend_sreq *sreq = + psmi_mpool_get(proto->pend_sends_pool); + psmi_assert(sreq != NULL); + + sreq->type = IPS_PENDSEND_EAGER_DATA; + sreq->req = req; + STAILQ_INSERT_TAIL(&proto->pend_sends.pendq, sreq, next); + /* Make sure it's processed by timer */ + psmi_timer_request(proto->timerq, &proto->pend_sends.timer, + PSMI_TIMER_PRIO_1); + } + } + + flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)]; + if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); + + ips_proto_process_ack(rcv_ev); + + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; +} + +// received an RTS +int +ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev) +{ + int ret = IPS_RECVHDRQ_CONTINUE; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)]; + psm2_mq_t mq = rcv_ev->proto->mq; + ips_msgctl_t *msgctl = ipsaddr->msgctl; + enum ips_msg_order msgorder; + char *payload; + uint32_t paylen; + psm2_mq_req_t req; + + /* + * if PSN does not match, drop the packet. + */ + PSM2_LOG_MSG("entering"); + _HFI_MMDBG("got rts\n"); + if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) + { + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; + } + + msgorder = ips_proto_check_msg_order(ipsaddr, flow, + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK, + &ipsaddr->msgctl->mq_recv_seqnum); + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE)) + { + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_REVISIT; + } + + payload = ips_recvhdrq_event_payload(rcv_ev); + paylen = ips_recvhdrq_event_paylen(rcv_ev); + /* either no payload or whole message */ + psmi_assert(paylen == 0 || paylen >= p_hdr->data[1].u32w1); + + /* + * We can't have past message sequence here. For eager message, + * it must always have an eager queue matching because even in + * truncation case the code logic will wait till all packets + * have been received. 
+ */ + psmi_assert(msgorder != IPS_MSG_ORDER_PAST); + + _HFI_VDBG("tag=%llx reqidx_peer=%d, msglen=%d\n", + (long long)p_hdr->data[0].u64, + p_hdr->data[1].u32w0, p_hdr->data[1].u32w1); + + int rc = psmi_mq_handle_rts(mq, + (psm2_epaddr_t) &ipsaddr->msgctl-> + master_epaddr, + (psm2_mq_tag_t *) p_hdr->tag, + p_hdr->data[1].u32w1, payload, paylen, + msgorder, ips_proto_mq_rts_match_callback, + &req); + if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) { + // as a performance optimization, the 1st time we process an + // unmatched RTS, we ask to REVISIT it next poll loop hoping for + // a match due to a slightly late MPI_recv call + uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask; + + flow->recv_seq_num.psn_num = + (flow->recv_seq_num.psn_num - 1) & psn_mask; + ipsaddr->msgctl->mq_recv_seqnum--; + + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_REVISIT; + } + + req->rts_peer = (psm2_epaddr_t) ipsaddr; + req->rts_reqidx_peer = p_hdr->data[1].u32w0; + if (req->req_data.send_msglen > mq->hfi_thresh_rv) + { + PSM2_LOG_EPM(OPCODE_LONG_RTS,PSM2_LOG_RX,req->rts_peer->epid,mq->ep->epid, + "req->rts_reqidx_peer: %d",req->rts_reqidx_peer); + } + if (p_hdr->flags & IPS_SEND_FLAG_BLOCKING) + req->type |= MQE_TYPE_WAITING_PEER; + +#ifdef PSM_CUDA + if (p_hdr->flags & IPS_SEND_FLAG_USER_BUF_GPU) + req->is_sendbuf_gpu_mem = 1; + else + req->is_sendbuf_gpu_mem = 0; +#endif + + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) { + /* for out of order matching only */ + req->msg_seqnum = + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK; + req->ptl_req_ptr = (void *)msgctl; + + msgctl->outoforder_count++; + mq_qq_append(&mq->outoforder_q, req); + + ret = IPS_RECVHDRQ_BREAK; + } else { + ipsaddr->msg_toggle = 0; + if (rc == MQ_RET_MATCH_OK) + ips_proto_mq_rts_match_callback(req, 1); + /* XXX if blocking, break out of progress loop */ + + if (msgctl->outoforder_count) + ips_proto_mq_handle_outoforder_queue(mq, msgctl); + + if (rc == MQ_RET_UNEXP_OK) + ret = IPS_RECVHDRQ_BREAK; + } + + if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); + + ips_proto_process_ack(rcv_ev); + + PSM2_LOG_MSG("leaving"); + return ret; +} + +int +ips_proto_mq_handle_tiny(struct ips_recvhdrq_event *rcv_ev) +{ + int ret = IPS_RECVHDRQ_CONTINUE; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)]; + psm2_mq_t mq = rcv_ev->proto->mq; + ips_msgctl_t *msgctl = ipsaddr->msgctl; + enum ips_msg_order msgorder; + char *payload; + uint32_t paylen; + psm2_mq_req_t req; + + /* + * if PSN does not match, drop the packet. + */ + if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) + return IPS_RECVHDRQ_CONTINUE; + + msgorder = ips_proto_check_msg_order(ipsaddr, flow, + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK, + &ipsaddr->msgctl->mq_recv_seqnum); + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE)) + return IPS_RECVHDRQ_REVISIT; + + payload = (void *)&p_hdr->hdr_data; + paylen = (__le32_to_cpu(p_hdr->khdr.kdeth0) >> + HFI_KHDR_TINYLEN_SHIFT) & HFI_KHDR_TINYLEN_MASK; + + /* + * We can't have past message sequence here. For eager message, + * it must always have an eager queue matching because even in + * truncation case the code logic will wait till all packets + * have been received. 
+ */ + psmi_assert(msgorder != IPS_MSG_ORDER_PAST); + + _HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n", + p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2], + OPCODE_TINY, p_hdr->hdr_data.u32w1); + + /* store in req below too! */ + int rc = psmi_mq_handle_envelope(mq, + (psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr, + (psm2_mq_tag_t *) p_hdr->tag, paylen, 0, + payload, paylen, msgorder, OPCODE_TINY, &req); + if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) { + uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask; + + flow->recv_seq_num.psn_num = + (flow->recv_seq_num.psn_num - 1) & psn_mask; + ipsaddr->msgctl->mq_recv_seqnum--; + + return IPS_RECVHDRQ_REVISIT; + } + + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) { + /* for out of order matching only */ + req->msg_seqnum = + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK; + req->ptl_req_ptr = (void *)msgctl; + + msgctl->outoforder_count++; + mq_qq_append(&mq->outoforder_q, req); + + ret = IPS_RECVHDRQ_BREAK; + } else { + ipsaddr->msg_toggle = 0; + + if (msgctl->outoforder_count) + ips_proto_mq_handle_outoforder_queue(mq, msgctl); + + if (rc == MQ_RET_UNEXP_OK) + ret = IPS_RECVHDRQ_BREAK; + } + + if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); + + ips_proto_process_ack(rcv_ev); + + return ret; +} + +int +ips_proto_mq_handle_short(struct ips_recvhdrq_event *rcv_ev) +{ + int ret = IPS_RECVHDRQ_CONTINUE; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)]; + psm2_mq_t mq = rcv_ev->proto->mq; + ips_msgctl_t *msgctl = ipsaddr->msgctl; + enum ips_msg_order msgorder; + char *payload; + uint32_t paylen; + psm2_mq_req_t req; + + /* + * if PSN does not match, drop the packet. + */ + if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) + return IPS_RECVHDRQ_CONTINUE; + + msgorder = ips_proto_check_msg_order(ipsaddr, flow, + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK, + &ipsaddr->msgctl->mq_recv_seqnum); + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE)) + return IPS_RECVHDRQ_REVISIT; + + payload = ips_recvhdrq_event_payload(rcv_ev); + paylen = ips_recvhdrq_event_paylen(rcv_ev); + psmi_assert(paylen == 0 || payload); + + /* + * We can't have past message sequence here. For eager message, + * it must always have an eager queue matching because even in + * truncation case the code logic will wait till all packets + * have been received. + */ + psmi_assert(msgorder != IPS_MSG_ORDER_PAST); + + _HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n", + p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2], + OPCODE_SHORT, p_hdr->hdr_data.u32w1); + + /* store in req below too! 
*/ + int rc = psmi_mq_handle_envelope(mq, + (psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr, + (psm2_mq_tag_t *) p_hdr->tag, + p_hdr->hdr_data.u32w1, p_hdr->hdr_data.u32w0, + payload, paylen, msgorder, OPCODE_SHORT, &req); + if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) { + uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask; + + flow->recv_seq_num.psn_num = + (flow->recv_seq_num.psn_num - 1) & psn_mask; + ipsaddr->msgctl->mq_recv_seqnum--; + + return IPS_RECVHDRQ_REVISIT; + } + + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) { + /* for out of order matching only */ + req->msg_seqnum = + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK; + req->ptl_req_ptr = (void *)msgctl; + + msgctl->outoforder_count++; + mq_qq_append(&mq->outoforder_q, req); + + ret = IPS_RECVHDRQ_BREAK; + } else { + ipsaddr->msg_toggle = 0; + + if (msgctl->outoforder_count) + ips_proto_mq_handle_outoforder_queue(mq, msgctl); + + if (rc == MQ_RET_UNEXP_OK) + ret = IPS_RECVHDRQ_BREAK; + } + + if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); + + ips_proto_process_ack(rcv_ev); + + return ret; +} + +int +ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev) +{ + int ret = IPS_RECVHDRQ_CONTINUE; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_flow *flow = &ipsaddr->flows[ips_proto_flowid(p_hdr)]; + psm2_mq_t mq = rcv_ev->proto->mq; + ips_msgctl_t *msgctl = ipsaddr->msgctl; + enum ips_msg_order msgorder; + char *payload; + uint32_t paylen; + psm2_mq_req_t req; +#if defined(PSM_CUDA) + int converted = 0; +#endif // PSM_CUDA + + /* + * if PSN does not match, drop the packet. + */ + if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) + return IPS_RECVHDRQ_CONTINUE; + + msgorder = ips_proto_check_msg_order(ipsaddr, flow, + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK, + &ipsaddr->msgctl->mq_recv_seqnum); + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE)) + return IPS_RECVHDRQ_REVISIT; + + payload = ips_recvhdrq_event_payload(rcv_ev); + paylen = ips_recvhdrq_event_paylen(rcv_ev); + psmi_assert(paylen == 0 || payload); + + if (msgorder == IPS_MSG_ORDER_PAST || + msgorder == IPS_MSG_ORDER_FUTURE_RECV) { + req = mq_eager_match(mq, msgctl, + __le32_to_cpu(p_hdr->khdr.kdeth0)&HFI_KHDR_MSGSEQ_MASK); + /* + * It is future message sequence or past message sequence, + * and there is request matching in eager queue, we handle + * the packet data and return. We can't go continue to + * match envelope. + * Past message sequence must always have a matching!!! + * error is caught below. 
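+		 * For a GPU destination buffer, the copy below may go through a
+		 * temporary GDR host mapping of the user buffer, which is
+		 * unmapped once the data has been delivered.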
+ */ + if (req) { +#ifdef PSM_CUDA + if (PSMI_USE_GDR_COPY(req, req->req_data.send_msglen)) { + req->req_data.buf = gdr_convert_gpu_to_host_addr(GDR_FD, + (unsigned long)req->user_gpu_buffer, + req->req_data.send_msglen, 1, rcv_ev->proto); + converted = 1; + } +#endif + psmi_mq_handle_data(mq, req, + p_hdr->data[1].u32w0, payload, paylen); +#if defined(PSM_CUDA) + if (converted) { + gdr_unmap_gpu_host_addr(GDR_FD, req->req_data.buf, + req->req_data.send_msglen, rcv_ev->proto); + } +#endif // PSM_CUDA + + if (msgorder == IPS_MSG_ORDER_FUTURE_RECV) + ret = IPS_RECVHDRQ_BREAK; + + if ((__be32_to_cpu(p_hdr->bth[2]) & + IPS_SEND_FLAG_ACKREQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *) + rcv_ev->recvq, flow); + + ips_proto_process_ack(rcv_ev); + + return ret; + } + + psmi_assert(msgorder == IPS_MSG_ORDER_FUTURE_RECV); + /* + * For future message sequence, since there is no eager + * queue matching yet, this must be the first packet for + * the message sequence. And of course, expected message + * sequence is always the first packet for the sequence. + */ + } + + /* + * We can't have past message sequence here. For eager message, + * it must always have an eager queue matching because even in + * truncation case the code logic will wait till all packets + * have been received. + */ + psmi_assert(msgorder != IPS_MSG_ORDER_PAST); + + _HFI_VDBG("tag=%08x.%08x.%08x opcode=%d, msglen=%d\n", + p_hdr->tag[0], p_hdr->tag[1], p_hdr->tag[2], + OPCODE_EAGER, p_hdr->hdr_data.u32w1); + + /* store in req below too! */ + int rc = psmi_mq_handle_envelope(mq, + (psm2_epaddr_t) &ipsaddr->msgctl->master_epaddr, + (psm2_mq_tag_t *) p_hdr->tag, + p_hdr->hdr_data.u32w1, p_hdr->hdr_data.u32w0, + payload, paylen, msgorder, OPCODE_EAGER, &req); + if (unlikely(rc == MQ_RET_UNEXP_NO_RESOURCES)) { + uint32_t psn_mask = ((psm2_epaddr_t)ipsaddr)->proto->psn_mask; + + flow->recv_seq_num.psn_num = + (flow->recv_seq_num.psn_num - 1) & psn_mask; + ipsaddr->msgctl->mq_recv_seqnum--; + + return IPS_RECVHDRQ_REVISIT; + } + + /* for both outoforder matching and eager matching */ + req->msg_seqnum = + __le32_to_cpu(p_hdr->khdr.kdeth0) & HFI_KHDR_MSGSEQ_MASK; + req->ptl_req_ptr = (void *)msgctl; + + if (unlikely(msgorder == IPS_MSG_ORDER_FUTURE_RECV)) { + msgctl->outoforder_count++; + mq_qq_append(&mq->outoforder_q, req); + + ret = IPS_RECVHDRQ_BREAK; + } else { + ipsaddr->msg_toggle = 0; + + if (msgctl->outoforder_count) + ips_proto_mq_handle_outoforder_queue(mq, msgctl); + + if (rc == MQ_RET_UNEXP_OK) + ret = IPS_RECVHDRQ_BREAK; + } + + if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); + + ips_proto_process_ack(rcv_ev); + + return ret; +} + +/* + * Progress the out of order queue to see if any message matches + * current receiving sequence number. 
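+ * Each match is handed to psmi_mq_handle_outoforder() and advances
+ * mq_recv_seqnum; the loop stops at the first gap in the sequence or
+ * once the out of order count drains to zero.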
+ */ +void +ips_proto_mq_handle_outoforder_queue(psm2_mq_t mq, ips_msgctl_t *msgctl) +{ + psm2_mq_req_t req; + + do { + req = + mq_ooo_match(&mq->outoforder_q, msgctl, + msgctl->mq_recv_seqnum); + if (req == NULL) + return; + + msgctl->outoforder_count--; + msgctl->mq_recv_seqnum++; + + psmi_mq_handle_outoforder(mq, req); + + } while (msgctl->outoforder_count > 0); + + return; +} + +int +ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + psm2_mq_t mq = rcv_ev->proto->mq; + char *payload; + uint32_t paylen; + psm2_mq_req_t req; + struct ips_flow *flow; + + /* + * if PSN does not match, drop the packet. + */ + if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event *)rcv_ev)) + return IPS_RECVHDRQ_CONTINUE; + + req = psmi_mpool_find_obj_by_index(mq->rreq_pool, p_hdr->data[0].u32w0); + psmi_assert(req != NULL); + psmi_assert(p_hdr->data[1].u32w1 == req->req_data.send_msglen); + + /* + * if a packet has very small offset, it must have unaligned data + * attached in the packet header, and this must be the first packet + * for that message. + */ + if (p_hdr->data[1].u32w0 < 4 && p_hdr->data[1].u32w0 > 0) { + psmi_assert(p_hdr->data[1].u32w0 == (req->req_data.send_msglen&0x3)); + mq_copy_tiny((uint32_t *)req->req_data.buf, + (uint32_t *)&p_hdr->mdata, + p_hdr->data[1].u32w0); + req->send_msgoff += p_hdr->data[1].u32w0; + } + + payload = ips_recvhdrq_event_payload(rcv_ev); + paylen = ips_recvhdrq_event_paylen(rcv_ev); + psmi_assert(paylen == 0 || payload); + + psmi_mq_handle_data(mq, req, p_hdr->data[1].u32w0, payload, paylen); + + flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)]; + if ((__be32_to_cpu(p_hdr->bth[2]) & IPS_SEND_FLAG_ACKREQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *)rcv_ev->recvq, flow); + + ips_proto_process_ack(rcv_ev); + + return IPS_RECVHDRQ_CONTINUE; +} diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_params.h b/prov/psm3/psm3/ptl_ips/ips_proto_params.h new file mode 100644 index 00000000000..0ad8b6d04c8 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_proto_params.h @@ -0,0 +1,245 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_PROTO_PARAMS_H +#define _IPS_PROTO_PARAMS_H + +/* + * send method: dma, pio; + * recv method: tid, egr; + * + * send-recv mode combinations: 1=on, 0=off + * A: dma:1, pio=1, tid=1, egr=1; + * B: dma:0, pio=1, tid=1, egr=1; + * C: dma:1, pio=0, tid=1, egr=1; + * D: dma:1, pio=1, tid=0, egr=1; + * E: dma:0, pio=1, tid=0, egr=1; + * F: dma:1, pio=0, tid=0, egr=1; + * + * message packet type: + * T: tiny; S: short; E: eager; + * LR: long rts; LC: long cts; LD: long data; + * ED: expected data; EC: expected completion; + * C: ctrl msg; + * + * send,recv method for each packet type and each send-recv mode + * ------------------------------------------------------------------- + * | | A | B | C | D | E | F | + * ------------------------------------------------------------------- + * | T | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr | + * ------------------------------------------------------------------- + * | S | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr | + * ------------------------------------------------------------------- + * | E | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr |threshold + * ------------------------------------------------------------------- + * | LR | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr | + * ------------------------------------------------------------------- + * | LC | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr | + * ------------------------------------------------------------------- + * | LD | x | x | x | pio,egr | pio,egr | dma,egr |threshold + * ------------------------------------------------------------------- + * | ED | dma,tid | pio,tid | dma,tid | x | x | x | + * ------------------------------------------------------------------- + * | EC | pio,egr | pio,egr | dma,egr | x | x | x | + * ------------------------------------------------------------------- + * | C | pio,egr | pio,egr | dma,egr | pio,egr | pio,egr | dma,egr | + * ------------------------------------------------------------------- + */ + +/* Constants */ +#define BYTE2DWORD_SHIFT 2 +#define LOWER_16_BITS 0xFFFF +#define PSM_CACHE_LINE_BYTES 64 +#define PSM2_FLOW_CREDITS 64 +#define PSM_CRC_SIZE_IN_BYTES 8 + +/* + * version of protocol header (known to chip also). + * This value for OPA is defined in spec. 
+ */ +#define IPS_PROTO_VERSION 0x1 + +/* time conversion macros */ +#define us_2_cycles(us) nanosecs_to_cycles(1000ULL*(us)) +#define ms_2_cycles(ms) nanosecs_to_cycles(1000000ULL*(ms)) +#define sec_2_cycles(sec) nanosecs_to_cycles(1000000000ULL*(sec)) + +/* Per-flow flags */ +#define IPS_FLOW_FLAG_NAK_SEND 0x01 +#define IPS_FLOW_FLAG_PENDING_ACK 0x02 +#define IPS_FLOW_FLAG_PENDING_NAK 0x04 +#define IPS_FLOW_FLAG_GEN_BECN 0x08 +#define IPS_FLOW_FLAG_SKIP_CTS 0x20 + +/* tid session expected send flags */ +#define EXP_SEND_FLAG_CLEAR_ALL 0x00 +#define EXP_SEND_FLAG_FREE_TIDS 0x01 + +#define TIMEOUT_INFINITE 0xFFFFFFFFFFFFFFFFULL /* 64 bit all-one's */ + +/* + * scb flags for wire, + * Only the lower 6 bits are wire-protocol options + */ +#define IPS_SEND_FLAG_NONE 0x00 +#define IPS_SEND_FLAG_BLOCKING 0x01 /* blocking send */ +#define IPS_SEND_FLAG_PKTCKSUM 0x02 /* Has packet checksum */ +#define IPS_SEND_FLAG_AMISTINY 0x04 /* AM is tiny, exclusive */ + +#ifdef PSM_CUDA +/* This flag is used to indicate to the reciever when + * the send is issued on a device buffer. This helps in + * selecting TID path on the recieve side regardless of + * the receive buffers locality. It is used + * in a special case where the send is on a device + * buffer and the receive is on a host buffer. + */ +#define IPS_SEND_FLAG_USER_BUF_GPU 0x08 +#endif + +#define IPS_SEND_FLAG_PROTO_OPTS 0x3f /* only 6bits wire flags */ + +/* scb flags */ +#define IPS_SEND_FLAG_PENDING 0x0100 +#define IPS_SEND_FLAG_PERSISTENT 0x0200 +#define IPS_SEND_FLAG_NO_LMC 0x0400 + +#ifdef PSM_CUDA +/* This flag is used to indicate if the send is on + * a GPU buffer. This helps PIO/SDMA paths to detect + * if payload is GPU buffer without having to call + * cudaGetPointerAttribute. + */ +#define IPS_SEND_FLAG_PAYLOAD_BUF_GPU 0x0800 +#endif + +/* 0x10000000, interrupt when done */ +#define IPS_SEND_FLAG_INTR (1< 0) + proto->stray_warn_interval = sec_2_cycles(interval_secs); + else + proto->stray_warn_interval = 0; + + return PSM2_OK; +} + +psm2_error_t ips_proto_recv_fini(struct ips_proto *proto) +{ + ips_report_strays(proto); + return PSM2_OK; +} + +#define cycles_to_sec_f(cycles) \ + (((double)cycles_to_nanosecs(cycles)) / 1000000000.0) + +struct ips_stray_epid { + psm2_epid_t epid; + uint32_t err_check_bad_sent; + uint32_t ipv4_addr; + uint32_t pid; + uint32_t num_messages; + uint64_t t_warn_next; + uint64_t t_first; + uint64_t t_last; +}; + +static +void ips_report_strays(struct ips_proto *proto) +{ + struct ips_stray_epid *sepid; + struct psmi_eptab_iterator itor; + psmi_epid_itor_init(&itor, PSMI_EP_CROSSTALK); + +#if _HFI_DEBUGGING + double t_first = 0; + double t_last = 0; + double t_runtime = 0; + if (_HFI_INFO_ON) { + t_runtime = cycles_to_sec_f(proto->t_fini - proto->t_init); + } +#endif + + while ((sepid = psmi_epid_itor_next(&itor))) { + char ipbuf[INET_ADDRSTRLEN], *ip = NULL; + char bufpid[32]; + uint32_t lid = psm2_epid_nid(sepid->epid); +#if _HFI_DEBUGGING + if (_HFI_INFO_ON) { + t_first = + cycles_to_sec_f(sepid->t_first - proto->t_init); + t_last = + cycles_to_sec_f(sepid->t_last - proto->t_init); + } +#endif + if (sepid->ipv4_addr) + ip = (char *) + inet_ntop(AF_INET, &sepid->ipv4_addr, ipbuf, + sizeof(ipbuf)); + if (!ip) + snprintf(ipbuf, sizeof(ipbuf), "%d (%x)", lid, lid); + + if (sepid->pid) + snprintf(bufpid, sizeof(bufpid), "PID=%d", sepid->pid); + else + snprintf(bufpid, sizeof(bufpid), "PID unknown"); + + if (_HFI_INFO_ON) { + _HFI_INFO_ALWAYS + ("Process %s on host %s=%s sent %d stray message(s) and " + "was told 
so %d time(s) (first stray message at %.1fs " + "(%d%%), last at %.1fs (%d%%) into application run)\n", + bufpid, ip ? "IP" : "LID", ipbuf, sepid->num_messages, + sepid->err_check_bad_sent, t_first, + (int)(t_first * 100.0 / t_runtime), t_last, + (int)(t_last * 100.0 / t_runtime)); + } + + psmi_epid_remove(PSMI_EP_CROSSTALK, sepid->epid); + psmi_free(sepid); + } + psmi_epid_itor_fini(&itor); + return; +} + +/* New scbs now available. If we have pending sends because we were out of + * scbs, put the pendq on the timerq so it can be processed. */ +void ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context) +{ + struct ips_proto *proto = (struct ips_proto *)context; + struct ips_pend_sreq *sreq = STAILQ_FIRST(&proto->pend_sends.pendq); + if (sreq != NULL) + psmi_timer_request(proto->timerq, + &proto->pend_sends.timer, PSMI_TIMER_PRIO_1); + return; +} + +psm2_error_t +ips_proto_timer_pendq_callback(struct psmi_timer *timer, uint64_t current) +{ + psm2_error_t err = PSM2_OK; + struct ips_pend_sends *pend_sends = + (struct ips_pend_sends *)timer->context; + struct ips_pendsendq *phead = &pend_sends->pendq; + struct ips_proto *proto = (struct ips_proto *)pend_sends->proto; + struct ips_pend_sreq *sreq; + + while (!STAILQ_EMPTY(phead)) { + sreq = STAILQ_FIRST(phead); + switch (sreq->type) { + case IPS_PENDSEND_EAGER_REQ: + err = ips_proto_mq_push_cts_req(proto, sreq->req); + break; + case IPS_PENDSEND_EAGER_DATA: + err = ips_proto_mq_push_rts_data(proto, sreq->req); + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unknown pendq state %d\n", + sreq->type); + } + + if (err == PSM2_OK) { + STAILQ_REMOVE_HEAD(phead, next); + psmi_mpool_put(sreq); + } else { /* out of scbs. wait for the next scb_avail callback */ + /* printf("!!!!! breaking out of pendq progress\n"); */ + break; + } + } + + return err; +} + +PSMI_INLINE( +int +between(int first_seq, int last_seq, int seq)) +{ + if (last_seq >= first_seq) { + if (seq < first_seq || seq > last_seq) { + return 0; + } + } else { + if (seq > last_seq && seq < first_seq) { + return 0; + } + } + return 1; +} + +PSMI_INLINE( +int +pio_dma_ack_valid(struct ips_proto *proto, struct ips_flow *flow, + psmi_seqnum_t ack_seq_num)) +{ + uint32_t last_num; + struct ips_scb_unackedq *unackedq = &flow->scb_unacked; + + if (STAILQ_EMPTY(unackedq)) + return 0; + + /* scb_pend will be moved back when an nak is received, but + * the packet may actually be received and acked after the nak, + * so we use the tail of unacked queue, which may include packets + * not being sent out yet, this is over do, but it should be OK. */ + last_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.psn_num; + + return between(flow->xmit_ack_num.psn_num, + last_num, ack_seq_num.psn_num); +} + + + +/* NAK post process for dma flow */ +void ips_dmaflow_nak_post_process(struct ips_proto *proto, + struct ips_flow *flow) +{ + ips_scb_t *scb; + uint32_t first_num, ack_num; + uint16_t padding = 0; + + scb = STAILQ_FIRST(&flow->scb_unacked); + first_num = __be32_to_cpu(scb->ips_lrh.bth[2]) & proto->psn_mask; + ack_num = (flow->xmit_ack_num.psn_num - 1) & proto->psn_mask; + + + /* If the ack PSN falls into a multi-packets scb, + * don't re-send the packets already acked. 
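+	 * Instead, advance the scb in place: drop the acked fragments from
+	 * the remaining fragment/chunk counts, move the buffer pointer and
+	 * packet offset forward, and rewrite the starting PSN so the flush
+	 * resumes at the first unacked packet (re-padding the tail to keep
+	 * it dword aligned when needed).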
*/ + psmi_assert(scb->nfrag > 1); + if (between(first_num, scb->seq_num.psn_num, ack_num)) { + uint32_t npkt, pktlen, nbytes; + + /* how many packets acked in this scb */ + npkt = ((ack_num - first_num) & proto->psn_mask) + 1; + + /* how many bytes already acked in this scb, for eager receive + * packets, all payload size is frag_size except the last packet + * which is not acked yet */ + pktlen = scb->frag_size; + nbytes = (((ack_num - first_num) & + proto->psn_mask) + 1) * pktlen; + + /* 0. update scb info */ + psmi_assert(scb->nfrag_remaining > npkt); + scb->nfrag_remaining -= npkt; + psmi_assert(scb->chunk_size_remaining > nbytes); + scb->chunk_size_remaining -= nbytes; + ips_scb_buffer(scb) = (void *)((char *)ips_scb_buffer(scb) + nbytes); + + /* 1. if last packet in sequence, set IPS_SEND_FLAG_ACKREQ */ + if (scb->chunk_size_remaining <= scb->frag_size) { + psmi_assert(scb->nfrag_remaining == 1); + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + + /* last packet is what remaining */ + /* check if padding is required*/ + padding = scb->chunk_size_remaining & 0x3; + if_pf(padding) { + /* how much to pad with also equals how many bytes we need + * to rewind the source buffer offset by to keep it dw aligned */ + padding = 4 - padding; + ips_scb_buffer(scb) = (void *)((char*)ips_scb_buffer(scb) - padding); + scb->chunk_size_remaining += padding; + } + pktlen = scb->chunk_size_remaining; + } + + /* 2. set new packet sequence number */ + scb->ips_lrh.bth[2] = __cpu_to_be32( + ((ack_num + 1) & proto->psn_mask) | + (scb->scb_flags & IPS_SEND_FLAG_ACKREQ)); + + /* 3. set new packet offset adjusted with padding */ + scb->ips_lrh.hdr_data.u32w0 += nbytes - padding; + + /* 4. if packet length is changed, set new length */ + if (scb->payload_size != pktlen) { + scb->payload_size = pktlen; + scb->ips_lrh.lrh[2] = __cpu_to_be16(( + (scb->payload_size + + sizeof(struct ips_message_header) + + HFI_CRC_SIZE_IN_BYTES) >> + BYTE2DWORD_SHIFT) & HFI_LRH_PKTLEN_MASK); + } + } +} + +/* process an incoming ack message. Separate function to allow */ +/* for better optimization by compiler */ +int +ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_proto *proto = rcv_ev->proto; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ips_flow *flow = NULL; + struct ips_scb_unackedq *unackedq; + struct ips_scb_pendlist *scb_pend; + psmi_seqnum_t ack_seq_num, last_seq_num; + ips_epaddr_flow_t flowid; + ips_scb_t *scb; + uint32_t tidctrl; + + ack_seq_num.psn_num = p_hdr->ack_seq_num; + tidctrl = GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0)); + if (!tidctrl && ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW)) { + ack_seq_num.psn_num = + (ack_seq_num.psn_num - 1) & proto->psn_mask; + psmi_assert(flowid < EP_FLOW_LAST); + flow = &ipsaddr->flows[flowid]; + if (!pio_dma_ack_valid(proto, flow, ack_seq_num)) + goto ret; + } else { + // we don't use tidflow on UD nor UDP, shouldn't get ACKs about it + _HFI_ERROR("Got ack for TID flow, not allowed for UD\n"); + goto ret; + } + flow->xmit_ack_num.psn_num = p_hdr->ack_seq_num; + + unackedq = &flow->scb_unacked; + scb_pend = &flow->scb_pend; + + if (STAILQ_EMPTY(unackedq)) + goto ret; + + last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num; + + INC_TIME_SPEND(TIME_SPEND_USER2); + + /* For tidflow, psn_gen matches. So for all flows, tid/pio/dma, + * we can used general psn_num to compare the PSN. 
*/ + while (between((scb = STAILQ_FIRST(unackedq))->seq_num.psn_num, + last_seq_num.psn_num, ack_seq_num.psn_num) + ) { + + /* take it out of the xmit queue and .. */ + if (scb == SLIST_FIRST(scb_pend)) { +#ifdef PSM_DEBUG + flow->scb_num_pending--; +#endif + SLIST_REMOVE_HEAD(scb_pend, next); + } + + STAILQ_REMOVE_HEAD(unackedq, nextq); +#ifdef PSM_DEBUG + flow->scb_num_unacked--; + psmi_assert(flow->scb_num_unacked >= flow->scb_num_pending); +#endif + flow->credits += scb->nfrag; + + + if (scb->callback) + (*scb->callback) (scb->cb_param, scb->nfrag > 1 ? + scb->chunk_size : scb->payload_size); + + if (!(scb->scb_flags & IPS_SEND_FLAG_PERSISTENT)) + ips_scbctrl_free(scb); + + /* set all index pointer to NULL if all frames have been + * acked */ + if (STAILQ_EMPTY(unackedq)) { + psmi_timer_cancel(proto->timerq, flow->timer_ack); + flow->timer_ack = NULL; + psmi_timer_cancel(proto->timerq, flow->timer_send); + flow->timer_send = NULL; + + SLIST_FIRST(scb_pend) = NULL; + psmi_assert(flow->scb_num_pending == 0); + /* Reset congestion window - all packets ACK'd */ + flow->credits = flow->cwin = proto->flow_credits; + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + goto ret; + } else if (flow->timer_ack == scb->timer_ack) { + /* + * Exchange timers with last scb on unackedq. + * timer in scb is used by flow, cancelling current + * timer and then requesting a new timer takes more + * time, instead, we exchange the timer between current + * freeing scb and the last scb on unacked queue. + */ + psmi_timer *timer; + ips_scb_t *last = STAILQ_LAST(unackedq, ips_scb, nextq); + + timer = scb->timer_ack; + scb->timer_ack = last->timer_ack; + last->timer_ack = timer; + timer = scb->timer_send; + scb->timer_send = last->timer_send; + last->timer_send = timer; + + scb->timer_ack->context = scb; + scb->timer_send->context = scb; + last->timer_ack->context = last; + last->timer_send->context = last; + } + } + + psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */ + + { + /* Increase congestion window if flow is not congested */ + if_pf(flow->cwin < proto->flow_credits) { + flow->credits += + min(flow->cwin << 1, + proto->flow_credits) - flow->cwin; + flow->cwin = min(flow->cwin << 1, proto->flow_credits); + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + } + } + + /* Reclaimed some credits - attempt to flush flow */ + if (!SLIST_EMPTY(scb_pend)) + flow->flush(flow, NULL); + + /* + * If the next packet has not even been put on the wire, cancel the + * retransmission timer since we're still presumably waiting on free + * pio bufs + */ + if (STAILQ_FIRST(unackedq)->abs_timeout == TIMEOUT_INFINITE) + psmi_timer_cancel(proto->timerq, flow->timer_ack); + +ret: + return IPS_RECVHDRQ_CONTINUE; +} + +/* process an incoming nack message. 
Separate function to allow */ +/* for better optimization by compiler */ +int ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_proto *proto = rcv_ev->proto; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ips_flow *flow = NULL; + struct ips_scb_unackedq *unackedq; + struct ips_scb_pendlist *scb_pend; + psmi_seqnum_t ack_seq_num, last_seq_num; + psm_protocol_type_t protocol; + ips_epaddr_flow_t flowid; + ips_scb_t *scb; + uint32_t tidctrl; + + INC_TIME_SPEND(TIME_SPEND_USER3); + + ack_seq_num.psn_num = p_hdr->ack_seq_num; + tidctrl = GET_HFI_KHDR_TIDCTRL(__le32_to_cpu(p_hdr->khdr.kdeth0)); + if (!tidctrl && ((flowid = ips_proto_flowid(p_hdr)) < EP_FLOW_TIDFLOW)) { + protocol = PSM_PROTOCOL_GO_BACK_N; + psmi_assert(flowid < EP_FLOW_LAST); + flow = &ipsaddr->flows[flowid]; + if (!pio_dma_ack_valid(proto, flow, ack_seq_num)) + goto ret; + ack_seq_num.psn_num = + (ack_seq_num.psn_num - 1) & proto->psn_mask; + flow->xmit_ack_num.psn_num = p_hdr->ack_seq_num; + } else { + // we don't use tidflow on UD nor UDP, shouldn't get NAKs about it + _HFI_ERROR("Got nak for TID flow, not allowed for UD\n"); + goto ret; /* Invalid ack for flow */ + ack_seq_num.psn_seq--; + + psmi_assert(flow->xmit_seq_num.psn_gen == ack_seq_num.psn_gen); + psmi_assert(flow->xmit_ack_num.psn_gen == ack_seq_num.psn_gen); + /* Update xmit_ack_num with both new generation and new + * acked sequence; update xmit_seq_num with the new flow + * generation, don't change the sequence number. */ + flow->xmit_ack_num = (psmi_seqnum_t) p_hdr->data[1].u32w0; + flow->xmit_seq_num.psn_gen = flow->xmit_ack_num.psn_gen; + psmi_assert(flow->xmit_seq_num.psn_gen != ack_seq_num.psn_gen); + } + + unackedq = &flow->scb_unacked; + scb_pend = &flow->scb_pend; + + if (STAILQ_EMPTY(unackedq)) + goto ret; + + last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num; + + proto->epaddr_stats.nak_recv++; + + _HFI_VDBG("got a nack %d on flow %d, " + "first is %d, last is %d\n", ack_seq_num.psn_num, + flow->flowid, + STAILQ_EMPTY(unackedq) ? -1 : STAILQ_FIRST(unackedq)->seq_num. + psn_num, STAILQ_EMPTY(unackedq) ? -1 : STAILQ_LAST(unackedq, + ips_scb, + nextq)-> + seq_num.psn_num); + + /* For tidflow, psn_gen matches. So for all flows, tid/pio/dma, + * we can use general psn_num to compare the PSN. */ + while (between((scb = STAILQ_FIRST(unackedq))->seq_num.psn_num, + last_seq_num.psn_num, ack_seq_num.psn_num) + ) { + /* take it out of the xmit queue and .. */ + if (scb == SLIST_FIRST(scb_pend)) { +#ifdef PSM_DEBUG + flow->scb_num_pending--; +#endif + SLIST_REMOVE_HEAD(scb_pend, next); + } + + STAILQ_REMOVE_HEAD(unackedq, nextq); +#ifdef PSM_DEBUG + flow->scb_num_unacked--; + psmi_assert(flow->scb_num_unacked >= flow->scb_num_pending); +#endif + + + if (scb->callback) + (*scb->callback) (scb->cb_param, scb->nfrag > 1 ? 
+ scb->chunk_size : scb->payload_size); + + if (!(scb->scb_flags & IPS_SEND_FLAG_PERSISTENT)) + ips_scbctrl_free(scb); + + /* set all index pointer to NULL if all frames has been acked */ + if (STAILQ_EMPTY(unackedq)) { + psmi_timer_cancel(proto->timerq, flow->timer_ack); + flow->timer_ack = NULL; + psmi_timer_cancel(proto->timerq, flow->timer_send); + flow->timer_send = NULL; + + SLIST_FIRST(scb_pend) = NULL; + psmi_assert(flow->scb_num_pending == 0); + /* Reset congestion window if all packets acknowledged */ + flow->credits = flow->cwin = proto->flow_credits; + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + goto ret; + } else if (flow->timer_ack == scb->timer_ack) { + /* + * Exchange timers with last scb on unackedq. + * timer in scb is used by flow, cancelling current + * timer and then requesting a new timer takes more + * time, instead, we exchange the timer between current + * freeing scb and the last scb on unacked queue. + */ + psmi_timer *timer; + ips_scb_t *last = STAILQ_LAST(unackedq, ips_scb, nextq); + + timer = scb->timer_ack; + scb->timer_ack = last->timer_ack; + last->timer_ack = timer; + timer = scb->timer_send; + scb->timer_send = last->timer_send; + last->timer_send = timer; + + scb->timer_ack->context = scb; + scb->timer_send->context = scb; + last->timer_ack->context = last; + last->timer_send->context = last; + } + } + + psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */ + + if (protocol == PSM_PROTOCOL_TIDFLOW) + _HFI_ERROR("post processing, Got nak for TID flow, not allowed for UD\n"); + else if (scb->nfrag > 1) + ips_dmaflow_nak_post_process(proto, flow); + + /* Always cancel ACK timer as we are going to restart the flow */ + psmi_timer_cancel(proto->timerq, flow->timer_ack); + + /* What's now pending is all that was unacked */ + SLIST_FIRST(scb_pend) = scb; +#ifdef PSM_DEBUG + flow->scb_num_pending = flow->scb_num_unacked; +#endif + while (scb && !(scb->scb_flags & IPS_SEND_FLAG_PENDING)) { + + scb->scb_flags |= IPS_SEND_FLAG_PENDING; + scb = SLIST_NEXT(scb, next); + } + + { + int num_resent = 0; + + /* Reclaim all credits upto congestion window only */ + flow->credits = flow->cwin; + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + + /* Flush pending scb's */ + flow->flush(flow, &num_resent); + + proto->epaddr_stats.send_rexmit += num_resent; + } + +ret: + return IPS_RECVHDRQ_CONTINUE; +} + +int +ips_proto_process_err_chk(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_recvhdrq *recvq = (struct ips_recvhdrq *)rcv_ev->recvq; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + ips_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow; + psmi_seqnum_t seq_num; + int16_t seq_off; + + INC_TIME_SPEND(TIME_SPEND_USER4); + PSM2_LOG_MSG("entering"); + psmi_assert(flowid < EP_FLOW_LAST); + flow = &ipsaddr->flows[flowid]; + recvq->proto->epaddr_stats.err_chk_recv++; + + seq_num.psn_val = __be32_to_cpu(p_hdr->bth[2]); + seq_off = (int16_t) (flow->recv_seq_num.psn_num - seq_num.psn_num); + + if_pf(seq_off <= 0) { + _HFI_VDBG("naking for seq=%d, off=%d on flowid %d\n", + seq_num.psn_num, seq_off, flowid); + + if (seq_off < -flow->ack_interval) + flow->flags |= IPS_FLOW_FLAG_GEN_BECN; + + ips_proto_send_nak(recvq, flow); + flow->flags |= IPS_FLOW_FLAG_NAK_SEND; + } + else { + ips_scb_t ctrlscb; + + ctrlscb.scb_flags = 0; + ctrlscb.ips_lrh.ack_seq_num = flow->recv_seq_num.psn_num; + ips_proto_send_ctrl_message(flow, OPCODE_ACK, + &ipsaddr->ctrl_msg_queued, + &ctrlscb, 
ctrlscb.cksum, 0); + } + + PSM2_LOG_MSG("leaving"); + return IPS_RECVHDRQ_CONTINUE; +} + + + +static void ips_bad_opcode(uint8_t op_code, struct ips_message_header *proto) +{ + _HFI_DBG("Discarding message with bad opcode 0x%x\n", op_code); + + if (hfi_debug & __HFI_DBG) { + ips_proto_show_header(proto, "received bad opcode"); + ips_proto_dump_frame(proto, sizeof(struct ips_message_header), + "Opcode error protocol header dump"); + } +} + +int +ips_proto_process_unknown_opcode(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *protocol_header = rcv_ev->p_hdr; + struct ips_proto *proto = rcv_ev->proto; + + proto->stats.unknown_packets++; + ips_bad_opcode(_get_proto_hfi_opcode(protocol_header), protocol_header); + + return IPS_RECVHDRQ_CONTINUE; +} + +int +ips_proto_connect_disconnect(struct ips_recvhdrq_event *rcv_ev) +{ + psm2_error_t err = PSM2_OK; + char *payload = ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev); + + psmi_assert(payload); + err = ips_proto_process_connect(rcv_ev->proto, + _get_proto_hfi_opcode(rcv_ev->p_hdr), + rcv_ev->p_hdr, + payload, + paylen); + if (err != PSM2_OK) + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Process connect/disconnect error: %d, opcode %d\n", + err, _get_proto_hfi_opcode(rcv_ev->p_hdr)); + + return IPS_RECVHDRQ_CONTINUE; +} + +/* Return 1 if packet is ok. */ +/* Return 0 if packet should be skipped */ +int ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ips_proto *proto = rcv_ev->proto; + int opcode = (int)_get_proto_hfi_opcode(p_hdr); + + /* + * If the protocol is disabled or not yet enabled, no processing happens + * We set it t_init to 0 when disabling the protocol + */ + if (proto->t_init == 0) + return IPS_RECVHDRQ_CONTINUE; + + /* Connect messages don't have to be from a known epaddr */ + switch (opcode) { + case OPCODE_CONNECT_REQUEST: + case OPCODE_CONNECT_REPLY: + case OPCODE_DISCONNECT_REQUEST: + case OPCODE_DISCONNECT_REPLY: + ips_proto_connect_disconnect( + (struct ips_recvhdrq_event *)rcv_ev); + return IPS_RECVHDRQ_CONTINUE; + default: + break; + } + + /* Packet from "unknown" peer. Log the packet and payload if at appropriate + * verbose level. + */ + { + char *payload = ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) + + ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3); + + if (hfi_debug & __HFI_PKTDBG) { + ips_proto_dump_frame(rcv_ev->p_hdr, + HFI_MESSAGE_HDR_SIZE, "header"); + if (paylen) + ips_proto_dump_frame(payload, paylen, "data"); + } + } + + + proto->stats.stray_packets++; + + /* If we have debug mode, print the complete packet every time */ + if (hfi_debug & __HFI_PKTDBG) + ips_proto_show_header(p_hdr, "invalid connidx"); + + psmi_handle_error(PSMI_EP_LOGEVENT, PSM2_EPID_NETWORK_ERROR, + "Received message(s) opcode=0x%x from an unknown process", opcode); + + return 0; /* Always skip this packet unless the above call was a noreturn + * call */ +} + + diff --git a/prov/psm3/psm3/ptl_ips/ips_recvhdrq.c b/prov/psm3/psm3/ptl_ips/ips_recvhdrq.c new file mode 100644 index 00000000000..1fd124346e5 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_recvhdrq.c @@ -0,0 +1,404 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" + +#include "ips_epstate.h" +#include "ips_proto.h" +#include "ips_expected_proto.h" +#include "ips_proto_help.h" +#include "ips_proto_internal.h" + +/* + * Receive header queue initialization. + */ +psm2_error_t +ips_recvhdrq_init(const psmi_context_t *context, + const struct ips_epstate *epstate, + const struct ips_proto *proto, + const struct ips_recvhdrq_callbacks *callbacks, + struct ips_recvhdrq *recvq + ) +{ + psm2_error_t err = PSM2_OK; + + memset(recvq, 0, sizeof(*recvq)); + recvq->proto = (struct ips_proto *)proto; + recvq->context = context; + pthread_spin_init(&recvq->hdrq_lock, PTHREAD_PROCESS_SHARED); + + recvq->epstate = epstate; + recvq->recvq_callbacks = *callbacks; /* deep copy */ + SLIST_INIT(&recvq->pending_acks); + + return err; +} + + +/* flush the eager buffers, by setting the eager index head to eager index tail + if eager buffer queue is full. + + Called when we had eager buffer overflows (ERR_TID/HFI_RHF_H_TIDERR + was set in RHF errors), and no good eager packets were received, so + that eager head wasn't advanced. 
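+
+   This routine is kept under #if 0 for reference; the verbs receive
+   path below polls a completion queue rather than an eager index ring,
+   so this flush is currently unused.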
+*/ +#if 0 +static void ips_flush_egrq_if_required(struct ips_recvhdrq *recvq) +{ + const uint32_t tail = ips_recvq_tail_get(&recvq->egrq); + const uint32_t head = ips_recvq_head_get(&recvq->egrq); + uint32_t egr_cnt = recvq->egrq.elemcnt; + + if ((head % egr_cnt) == ((tail + 1) % egr_cnt)) { + _HFI_DBG("eager array full after overflow, flushing " + "(head %llx, tail %llx)\n", + (long long)head, (long long)tail); + recvq->proto->stats.egr_overflow++; + } + return; +} +#endif + +/* + * Helpers for ips_recvhdrq_progress. + */ + + + + +#ifdef PSM_DEBUG +#endif + + +PSMI_ALWAYS_INLINE( +void +process_pending_acks(struct ips_recvhdrq *recvq)) +{ + ips_scb_t ctrlscb; + struct ips_message_header *msg_hdr = NULL; + + /* If any pending acks, dispatch them now */ + while (!SLIST_EMPTY(&recvq->pending_acks)) { + struct ips_flow *flow = SLIST_FIRST(&recvq->pending_acks); + + SLIST_REMOVE_HEAD(&recvq->pending_acks, next); + SLIST_NEXT(flow, next) = NULL; + + ctrlscb.scb_flags = 0; + msg_hdr = &ctrlscb.ips_lrh; + msg_hdr->ack_seq_num = flow->recv_seq_num.psn_num; + + if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) { + psmi_assert_always((flow-> + flags & IPS_FLOW_FLAG_PENDING_NAK) + == 0); + + flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK; + ips_proto_send_ctrl_message(flow, OPCODE_ACK, + &flow->ipsaddr-> + ctrl_msg_queued, + &ctrlscb, ctrlscb.cksum, 0); + } else { + psmi_assert_always(flow-> + flags & IPS_FLOW_FLAG_PENDING_NAK); + + flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK; + ips_proto_send_ctrl_message(flow, OPCODE_NAK, + &flow->ipsaddr-> + ctrl_msg_queued, + &ctrlscb, ctrlscb.cksum, 0); + } + } +} + +#ifdef RNDV_MOD_MR +// check for and process RV RDMA sends and RDMA recv +psm2_error_t check_rv_completion(psm2_ep_t ep, struct ips_proto *proto) +{ + struct rv_event ev; + psm2_error_t ret = PSM2_OK_NO_PROGRESS; + + if (! 
IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)) + return ret; + + while (__psm2_rv_poll_cq(ep->verbs_ep.rv, &ev) > 0) { + ret = PSM2_OK; + switch (ev.event_type) { + case RV_WC_RDMA_WRITE: + ep->verbs_ep.send_rdma_outstanding--; + if_pf (ev.wc.status || ev.wc.wr_id == 0) { + if (PSM2_OK != ips_protoexp_rdma_write_completion_error( + ep, ev.wc.wr_id, ev.wc.status)) + return PSM2_INTERNAL_ERR; + + } else { + ips_protoexp_rdma_write_completion( ev.wc.wr_id); + } + break; + case RV_WC_RECV_RDMA_WITH_IMM: + if_pf (ev.wc.status) { + if (ep->rv_reconnect_timeout) + break; /* let sender handle errors */ + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "failed rv recv RDMA '%s' (%d) on epid 0x%lx\n", + ibv_wc_status_str(ev.wc.status), (int)ev.wc.status, ep->epid); + return PSM2_INTERNAL_ERR; + } + _HFI_MMDBG("got RV RDMA Write Immediate RQ CQE %u bytes\n", + ev.wc.byte_len); + ips_protoexp_handle_immed_data(proto, + ev.wc.conn_handle, RDMA_IMMED_RV, + ev.wc.imm_data, ev.wc.byte_len); + break; + default: + psmi_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "unexpected rv event %d status '%s' (%d) on epid 0x%lx\n", + ev.event_type, ibv_wc_status_str(ev.wc.status), + (int)ev.wc.status, ep->epid); + break; + } + } + return ret;; +} +#endif // RNDV_MOD_MR + +psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq) +{ + GENERIC_PERF_BEGIN(PSM_RX_SPEEDPATH_CTR); /* perf stats */ + + int ret = IPS_RECVHDRQ_CONTINUE; + struct ips_epstate_entry *epstaddr; + psm2_ep_t ep = recvq->proto->ep; + PSMI_CACHEALIGN struct ips_recvhdrq_event rcv_ev = { + .proto = recvq->proto, + .recvq = recvq, + //.ptype = RCVHQ_RCV_TYPE_ERROR + }; + rbuf_t buf; + uint32_t num_done = 0; + int err; + +#ifdef RNDV_MOD_MR + // rv completes are for larger RDMAs and should be infrequent, give + // them 1st chance + switch (check_rv_completion(ep, recvq->proto)) { + case PSM2_OK: + num_done=1; // triggers PSM_OK return below + break; + case PSM2_OK_NO_PROGRESS: + break; + default: + goto fail; + break; + } +#endif +#if VERBS_RECV_CQE_BATCH > 1 + int done = 0; + do { + struct ibv_wc *wc; +// a little atypical, but allows ifdef to be smaller scope +#undef WC +#define WC(field) ((wc)->field) + if (! ep->verbs_ep.recv_wc_count) { + // TBD - negative error return is possible but unlikely + if (0 == (err = ibv_poll_cq(ep->verbs_ep.recv_cq, VERBS_RECV_CQE_BATCH, ep->verbs_ep.recv_wc_list))) + break; + else if_pf (err < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK + || errno == EBUSY || errno = EINTR) + break; + _HFI_ERROR("failed ibv_poll_cq '%s' (%d) on epid 0x%lx\n", + strerror(errno), errno, ep->epid); + goto fail; + } + ep->verbs_ep.recv_wc_count = err; + ep->verbs_ep.recv_wc_next = 0; + // once drained break out of loop w/o polling CQ again + // don't worry about small race of new pkt arriving while we + // process the CQEs. 
poll_cq is expensive so avoid doing it + // an extra time because it will usually be empty + done = (ep->verbs_ep.recv_wc_count < VERBS_RECV_CQE_BATCH); + } + // consume next wc + wc = &(ep->verbs_ep.recv_wc_list[ep->verbs_ep.recv_wc_next++]); + ep->verbs_ep.recv_wc_count--; + { +#else // VERBS_RECV_CQE_BATCH > 1 + while (1) { + struct ibv_wc wc; +// a little atypical, but allows ifdef to be smaller scope +#undef WC +#define WC(field) ((wc).field) + // TBD really only need to check this on 1st loop + if_pf (ep->verbs_ep.revisit_buf) { + buf = ep->verbs_ep.revisit_buf; + ep->verbs_ep.revisit_buf = NULL; + rcv_ev.payload_size = ep->verbs_ep.revisit_payload_size; + } else if (0 == (err = ibv_poll_cq(ep->verbs_ep.recv_cq, 1, &wc))) { + break; + } else if_pf (err < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK + || errno == EBUSY || errno == EINTR) + break; + _HFI_ERROR("failed ibv_poll_cq '%s' (%d) on epid 0x%lx\n", + strerror(errno), errno, ep->epid); + goto fail; + } else { +#endif // VERBS_RECV_CQE_BATCH > 1 + psmi_assert_always(WC(wr_id)); + buf = (rbuf_t)WC(wr_id); + if_pf (WC(status)) { + if (WC(status) != IBV_WC_WR_FLUSH_ERR) + _HFI_ERROR("failed recv '%s' (%d) on epid 0x%lx QP %u\n", + ibv_wc_status_str(WC(status)), (int)WC(status), ep->epid, WC(qp_num)); + goto fail; + } + switch (WC(opcode)) { + case IBV_WC_RECV_RDMA_WITH_IMM: + _HFI_MMDBG("got RDMA Write Immediate RQ CQE %u bytes\n", + WC(byte_len)); + // wc.byte_len is len of inbound rdma write not including immed + // wc.qp_num - local QP + ips_protoexp_handle_immed_data(rcv_ev.proto, + (uint64_t)(rbuf_qp(ep, buf)->qp_context), + RDMA_IMMED_USER_RC, WC(imm_data), WC(byte_len)); + goto repost; + break; + default: + _HFI_ERROR("unexpected recv opcode %d on epid 0x%lx QP %u\n", + WC(opcode), ep->epid, WC(qp_num)); + goto repost; + break; + case IBV_WC_RECV: + _HFI_VDBG("got CQE %u bytes\n", WC(byte_len)); + // wc.byte_len is length of data including rbuf_addition + // actual data starts after rbuf_addition in posted recv buffer + // if we need it wc has: + // qp_num - local QP + // src_qp - remote QP + // slid - remote SLID + // probably have GRH at start of buffer with remote GID + if_pf (_HFI_PDBG_ON) + __psm2_dump_buf(rbuf_to_buffer(buf), WC(byte_len)); + if_pf (WC(byte_len) < rbuf_addition(buf)+sizeof(struct ips_message_header)) { + _HFI_ERROR( "unexpected small recv: %u\n", WC(byte_len)); + goto repost; + } + rcv_ev.payload_size = WC(byte_len) - rbuf_addition(buf) - sizeof(struct ips_message_header); + break; + } + // fall through to process recv pkt in buf of rcv_ev.payload_size + } + rcv_ev.p_hdr = (struct ips_message_header *)(rbuf_to_buffer(buf)+rbuf_addition(buf)); + rcv_ev.payload = (rbuf_to_buffer(buf) + rbuf_addition(buf) + sizeof(struct ips_message_header)); + _HFI_VDBG("%s receive - opcode %x\n", qp_type_str(rbuf_qp(ep, buf)), + _get_proto_hfi_opcode(rcv_ev.p_hdr)); + + epstaddr = ips_epstate_lookup(recvq->epstate, rcv_ev.p_hdr->connidx); + + if_pf((epstaddr == NULL) || (epstaddr->ipsaddr == NULL)) { + rcv_ev.ipsaddr = NULL; + recvq->recvq_callbacks.callback_packet_unknown(&rcv_ev); + } else { + rcv_ev.ipsaddr = epstaddr->ipsaddr; + ret = ips_proto_process_packet(&rcv_ev); + if_pf (ret == IPS_RECVHDRQ_REVISIT) + { + // try processing on next progress call + _HFI_VDBG("REVISIT returned on process_packet\n"); + // process this CQE again next time called +#if VERBS_RECV_CQE_BATCH > 1 + ep->verbs_ep.recv_wc_next--; + ep->verbs_ep.recv_wc_count++; +#else + ep->verbs_ep.revisit_buf = buf; + 
ep->verbs_ep.revisit_payload_size = rcv_ev.payload_size; +#endif + GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR); /* perf stats */ + return PSM2_OK_NO_PROGRESS; + } + } +repost: + num_done++; + // buffer processing is done, we can requeue it on QP + if_pf (PSM2_OK != __psm2_ep_verbs_post_recv( + buf)) + _HFI_ERROR( "unable to post recv\n"); // leak the buffer + + // if we can't process this now (such as an RTS we revisited and + // ended up queueing on unexpected queue) we're told + // to stop processing, we'll look at the rest later + if_pf (ret == IPS_RECVHDRQ_BREAK) { + _HFI_VDBG("stop rcvq\n"); + break; + } +#if VERBS_RECV_CQE_BATCH > 1 + } while(! done); +#else + } +#endif + + /* Process any pending acks before exiting */ + process_pending_acks(recvq); + GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR); /* perf stats */ + + return num_done?PSM2_OK:PSM2_OK_NO_PROGRESS; + +fail: + GENERIC_PERF_END(PSM_RX_SPEEDPATH_CTR); /* perf stats */ + return PSM2_INTERNAL_ERR; +} + diff --git a/prov/psm3/psm3/ptl_ips/ips_recvhdrq.h b/prov/psm3/psm3/ptl_ips/ips_recvhdrq.h new file mode 100644 index 00000000000..9fcbc112669 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_recvhdrq.h @@ -0,0 +1,166 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2015 Intel Corporation. All rights reserved. 
*/ + +#include "psm_user.h" +#include "ips_proto_params.h" +#include "ips_proto_header.h" + +#ifndef _IPS_RECVHDRQ_H +#define _IPS_RECVHDRQ_H + +struct ips_recvhdrq; +struct ips_recvhdrq_state; +struct ips_epstate; + +/* process current packet, continue on next packet */ +#define IPS_RECVHDRQ_CONTINUE 0 +/* process current packet, break and return to caller */ +#define IPS_RECVHDRQ_BREAK 1 +/* keep current packet, revisit the same packet next time */ +#define IPS_RECVHDRQ_REVISIT 2 + + +struct ips_recvhdrq_event { + struct ips_proto *proto; + const struct ips_recvhdrq *recvq; /* where message received */ + struct ips_message_header *p_hdr; /* protocol header in rcv_hdr */ + struct ips_epaddr *ipsaddr; /* peer ipsaddr, if available */ + // we point to the payload part of our recv buffer + uint8_t *payload; + uint32_t payload_size; +}; + +struct ips_recvhdrq_callbacks { + int (*callback_packet_unknown) (const struct ips_recvhdrq_event *); +}; + +psm2_error_t +ips_recvhdrq_init(const psmi_context_t *context, + const struct ips_epstate *epstate, + const struct ips_proto *proto, + const struct ips_recvhdrq_callbacks *callbacks, + struct ips_recvhdrq *recvq + ); + +psm2_error_t ips_recvhdrq_progress(struct ips_recvhdrq *recvq); + + +/* + * Structure containing state for recvhdrq reading. This is logically + * part of ips_recvhdrq but needs to be separated out for context + * sharing so that it can be put in a shared memory page and hence + * be available to all processes sharing the context. Generally, do not + * put pointers in here since the address map of each process can be + * different. + */ +#define NO_EAGER_UPDATE ~0U +struct ips_recvhdrq_state { +}; + +/* + * Structure to read from recvhdrq + */ +struct ips_recvhdrq { + struct ips_proto *proto; + const psmi_context_t *context; /* error handling, epid id, etc. */ + /* Header queue handling */ + pthread_spinlock_t hdrq_lock; /* Lock for thread-safe polling */ + /* Lookup endpoints epid -> ptladdr (rank)) */ + const struct ips_epstate *epstate; + + /* Callbacks to handle recvq events */ + struct ips_recvhdrq_callbacks recvq_callbacks; + + /* List of flows with pending acks for receive queue */ + SLIST_HEAD(pending_flows, ips_flow) pending_acks; + +}; + + +PSMI_INLINE( +void * +ips_recvhdrq_event_payload(const struct ips_recvhdrq_event *rcv_ev)) +{ + psmi_assert(rcv_ev); + return rcv_ev->payload; +} + +PSMI_INLINE( +uint32_t +ips_recvhdrq_event_paylen(const struct ips_recvhdrq_event *rcv_ev)) +{ + psmi_assert(rcv_ev); + return rcv_ev->payload_size; +} + +PSMI_INLINE(int ips_recvhdrq_trylock(struct ips_recvhdrq *recvq)) +{ + int ret = pthread_spin_trylock(&recvq->hdrq_lock); + return !ret; +} + +PSMI_INLINE(int ips_recvhdrq_lock(struct ips_recvhdrq *recvq)) +{ + int ret = pthread_spin_lock(&recvq->hdrq_lock); + return !ret; +} + +PSMI_INLINE(int ips_recvhdrq_unlock(struct ips_recvhdrq *recvq)) +{ + int ret = pthread_spin_unlock(&recvq->hdrq_lock); + return !ret; +} + +#endif /* _IPS_RECVHDRQ_H */ diff --git a/prov/psm3/psm3/ptl_ips/ips_recvq.c b/prov/psm3/psm3/ptl_ips/ips_recvq.c new file mode 100644 index 00000000000..be7248db7be --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_recvq.c @@ -0,0 +1,92 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "ips_recvq.h" + +/* We return a table of pointer indexes. + * + * From the point of view of the returned pointer, index -1 always points to + * the address to call psmi_free on (since we force page-alignment). + */ +void **ips_recvq_egrbuf_table_alloc(psm2_ep_t ep, void *baseptr, + uint32_t bufnum, uint32_t bufsize) +{ + unsigned i; + void *ptr_alloc; + uintptr_t *buft; + uintptr_t base = (uintptr_t) baseptr; + + ptr_alloc = psmi_malloc(ep, UNDEFINED, + PSMI_PAGESIZE + sizeof(uintptr_t) * (bufnum + + 1)); + if (ptr_alloc == NULL) + return NULL; + /* First pointer is to the actual allocated address, so we can free it but + * buft[1] is first on the page boundary + */ + buft = (uintptr_t *) PSMI_ALIGNUP((uint8_t *)ptr_alloc + 1, PSMI_PAGESIZE); + buft[-1] = (uintptr_t) ptr_alloc; + for (i = 0; i < bufnum; i++) + buft[i] = (uintptr_t) ((char *)base + i * bufsize); + return (void **)buft; +} + +void ips_recvq_egrbuf_table_free(void **buftable) +{ + uintptr_t *buft = (uintptr_t *) buftable; + void *ptr_alloc = (void *)buft[-1]; + psmi_free(ptr_alloc); +} diff --git a/prov/psm3/psm3/ptl_ips/ips_recvq.h b/prov/psm3/psm3/ptl_ips/ips_recvq.h new file mode 100644 index 00000000000..7d1a990d433 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_recvq.h @@ -0,0 +1,73 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. 
+ + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_RECVQ_H +#define _IPS_RECVQ_H + +#include "psm_user.h" + +/* + * Tables to map eager indexes into their buffer addresses + * + * If function returns NULL, no memory has been allocated and the error handler + * has been executed on 'ep' and hence assume status PSM2_NO_MEMORY. + */ +void **ips_recvq_egrbuf_table_alloc(psm2_ep_t ep, + void *base, uint32_t bufnum, + uint32_t bufsize); +void ips_recvq_egrbuf_table_free(void **buftable); + + +#endif /* _IPS_RECVQ_H */ diff --git a/prov/psm3/psm3/ptl_ips/ips_scb.c b/prov/psm3/psm3/ptl_ips/ips_scb.c new file mode 100644 index 00000000000..83517aca1ba --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_scb.c @@ -0,0 +1,347 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_proto.h" +#include "ips_scb.h" +#include "ips_proto_internal.h" + +psm2_error_t +ips_scbctrl_init(const psmi_context_t *context, + uint32_t numscb, uint32_t numbufs, + uint32_t imm_size, uint32_t bufsize, + ips_scbctrl_avail_callback_fn_t scb_avail_callback, + void *scb_avail_context, struct ips_scbctrl *scbc) +{ + int i; + struct ips_scb *scb; + size_t scb_size; + size_t alloc_sz; + uintptr_t base, imm_base; + psm2_ep_t ep = context->ep; + /* scbc->context = context; */ + psm2_error_t err = PSM2_OK; + + psmi_assert_always(numscb > 0); + scbc->sbuf_num = scbc->sbuf_num_cur = numbufs; + SLIST_INIT(&scbc->sbuf_free); + scbc->sbuf_buf_size = bufsize; + scbc->sbuf_buf_base = NULL; + scbc->sbuf_buf_alloc = NULL; + scbc->sbuf_buf_last = NULL; + + /* send buffers are not mandatory but when allocating them, make sure they + * are on a page boundary */ + if (numbufs > 0) { + struct ips_scbbuf *sbuf; + + bufsize = PSMI_ALIGNUP(bufsize, 64); + + alloc_sz = numbufs * bufsize + PSMI_PAGESIZE; + scbc->sbuf_buf_alloc = + psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz); + if (scbc->sbuf_buf_alloc == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + base = (uintptr_t) scbc->sbuf_buf_alloc; + base = PSMI_ALIGNUP(base, PSMI_PAGESIZE); + scbc->sbuf_buf_base = (void *)base; + scbc->sbuf_buf_last = (void *)(base + bufsize * (numbufs - 1)); + _HFI_VDBG + ("sendbufs=%d, (size=%d),base=[%p..%p)\n", + numbufs, bufsize, + (void *)scbc->sbuf_buf_base, (void *)scbc->sbuf_buf_last); + + for (i = 0; i < numbufs; i++) { + sbuf = (struct ips_scbbuf *)(base + bufsize * i); + SLIST_NEXT(sbuf, next) = NULL; + SLIST_INSERT_HEAD(&scbc->sbuf_free, sbuf, next); + } + } + + imm_base = 0; + scbc->scb_imm_size = imm_size; + if (scbc->scb_imm_size) { + scbc->scb_imm_size = PSMI_ALIGNUP(imm_size, 64); + alloc_sz = numscb * 
scbc->scb_imm_size + 64; + scbc->scb_imm_buf = psmi_memalign(ep, NETWORK_BUFFERS, 64, + alloc_sz); + + if (scbc->scb_imm_buf == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + + memset(scbc->scb_imm_buf, 0, alloc_sz); + imm_base = PSMI_ALIGNUP(scbc->scb_imm_buf, 64); + } else + scbc->scb_imm_buf = NULL; + + scbc->scb_num = scbc->scb_num_cur = numscb; + SLIST_INIT(&scbc->scb_free); + + scb_size = PSMI_ALIGNUP(sizeof(*scb), 64); + alloc_sz = numscb * scb_size; + + scbc->scb_base = psmi_memalign(ep, NETWORK_BUFFERS, 64, alloc_sz); + if (scbc->scb_base == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + + memset(scbc->scb_base, 0, alloc_sz); + base = (uintptr_t) scbc->scb_base; + + /* + * Allocate ack/send timer for each scb object. + */ + scbc->timers = (struct psmi_timer *) + psmi_calloc(ep, UNDEFINED, 2*numscb, + sizeof(struct psmi_timer)); + if (scbc->timers == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + + for (i = 0; i < numscb; i++) { + scb = (struct ips_scb *)(base + i * scb_size); + + scb->scbc = scbc; + if (scbc->scb_imm_buf) + scb->imm_payload = + (void *)(imm_base + (i * scbc->scb_imm_size)); + else + scb->imm_payload = NULL; + + SLIST_INSERT_HEAD(&scbc->scb_free, scb, next); + + /* + * Initialize timers. + * Associate the timers to each scb, the association is + * not fixed because later PSM may exchange the timers + * between scb, the reason for exchanging is that the + * timer is currently using by flow, but the scb is to + * be freed. see ack/nak processing in file ips_prot_recv.c + */ + scb->timer_ack = &scbc->timers[2*i]; + psmi_timer_entry_init(scb->timer_ack, + ips_proto_timer_ack_callback, scb); + + scb->timer_send = &scbc->timers[2*i+1]; + psmi_timer_entry_init(scb->timer_send, + ips_proto_timer_send_callback, scb); + } + scbc->scb_avail_callback = scb_avail_callback; + scbc->scb_avail_context = scb_avail_context; + + +fail: + return err; +} + +psm2_error_t ips_scbctrl_fini(struct ips_scbctrl *scbc) +{ + if (scbc->scb_base != NULL) { + psmi_free(scbc->scb_base); + } + if (scbc->sbuf_buf_alloc) { + psmi_free(scbc->sbuf_buf_alloc); + } + if (scbc->timers != NULL) { + psmi_free(scbc->timers); + } + if (scbc->scb_imm_buf) { + psmi_free(scbc->scb_imm_buf); + } + return PSM2_OK; +} + +int ips_scbctrl_bufalloc(ips_scb_t *scb) +{ + struct ips_scbctrl *scbc = scb->scbc; + + psmi_assert(scbc->sbuf_num > 0); + psmi_assert(!((ips_scb_buffer(scb) >= scbc->sbuf_buf_base) && + (ips_scb_buffer(scb) <= scbc->sbuf_buf_last))); + psmi_assert(scb->payload_size <= scbc->sbuf_buf_size); + + if (scb->payload_size <= scbc->scb_imm_size) { + /* Attach immediate buffer */ + ips_scb_buffer(scb) = scb->imm_payload; + return 1; + } + + if (SLIST_EMPTY(&scbc->sbuf_free)) + return 0; + else { + psmi_assert(scbc->sbuf_num_cur); + ips_scb_buffer(scb) = SLIST_FIRST(&scbc->sbuf_free); + scbc->sbuf_num_cur--; + + /* If under memory pressure request ACK for packet to reclaim + * credits. + */ + if (scbc->sbuf_num_cur < (scbc->sbuf_num >> 1)) + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + + SLIST_REMOVE_HEAD(&scbc->sbuf_free, next); + return 1; + } +} + +int ips_scbctrl_avail(struct ips_scbctrl *scbc) +{ + return (!SLIST_EMPTY(&scbc->scb_free) && scbc->sbuf_num_cur > 0); +} + +ips_scb_t *MOCKABLE(ips_scbctrl_alloc)(struct ips_scbctrl *scbc, int scbnum, int len, + uint32_t flags) +{ + ips_scb_t *scb, *scb_head = NULL; + + psmi_assert(flags & IPS_SCB_FLAG_ADD_BUFFER ? 
(scbc->sbuf_num > 0) : 1); + psmi_assert(scbc->sbuf_buf_size >= len); + + while (scbnum--) { + if (SLIST_EMPTY(&scbc->scb_free)) + break; + scb = SLIST_FIRST(&scbc->scb_free); + /* Need to set this here as bufalloc may request + * an ACK under memory pressure + */ + scb->scb_flags = 0; + if (flags & IPS_SCB_FLAG_ADD_BUFFER) { + scb->payload_size = len; + if (!ips_scbctrl_bufalloc(scb)) + break; + } else { + ips_scb_buffer(scb) = NULL; + scb->payload_size = 0; + } + + scb->tidsendc = NULL; + scb->callback = NULL; + scb->tidctrl = 0; + scb->nfrag = 1; + scb->frag_size = 0; +#ifdef PSM_CUDA + scb->mq_req = NULL; +#endif + + scbc->scb_num_cur--; + if (scbc->scb_num_cur < (scbc->scb_num >> 1)) + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + + SLIST_REMOVE_HEAD(&scbc->scb_free, next); + SLIST_NEXT(scb, next) = scb_head; + scb_head = scb; + } + return scb_head; +} +MOCK_DEF_EPILOGUE(ips_scbctrl_alloc); + +void ips_scbctrl_free(ips_scb_t *scb) +{ + struct ips_scbctrl *scbc = scb->scbc; + if (scbc->sbuf_num && (ips_scb_buffer(scb) >= scbc->sbuf_buf_base) && + (ips_scb_buffer(scb) <= scbc->sbuf_buf_last)) { + scbc->sbuf_num_cur++; + SLIST_INSERT_HEAD(&scbc->sbuf_free, scb->sbuf, next); + } + + ips_scb_buffer(scb) = NULL; + scb->tidsendc = NULL; + scb->payload_size = 0; + scbc->scb_num_cur++; + if (SLIST_EMPTY(&scbc->scb_free)) { + SLIST_INSERT_HEAD(&scbc->scb_free, scb, next); + if (scbc->scb_avail_callback != NULL) + scbc->scb_avail_callback(scbc, scbc->scb_avail_context); + } else + SLIST_INSERT_HEAD(&scbc->scb_free, scb, next); + + return; +} + +ips_scb_t *MOCKABLE(ips_scbctrl_alloc_tiny)(struct ips_scbctrl *scbc) +{ + ips_scb_t *scb; + if (SLIST_EMPTY(&scbc->scb_free)) + return NULL; + scb = SLIST_FIRST(&scbc->scb_free); + + SLIST_REMOVE_HEAD(&scbc->scb_free, next); + SLIST_NEXT(scb, next) = NULL; + + ips_scb_buffer(scb) = NULL; + scb->payload_size = 0; + scb->scb_flags = 0; + scb->tidsendc = NULL; + scb->callback = NULL; + scb->tidctrl = 0; + scb->nfrag = 1; + scb->frag_size = 0; +#ifdef PSM_CUDA + scb->mq_req = NULL; +#endif + + scbc->scb_num_cur--; + if (scbc->scb_num_cur < (scbc->scb_num >> 1)) + scb->scb_flags |= IPS_SEND_FLAG_ACKREQ; + return scb; +} +MOCK_DEF_EPILOGUE(ips_scbctrl_alloc_tiny); diff --git a/prov/psm3/psm3/ptl_ips/ips_scb.h b/prov/psm3/psm3/ptl_ips/ips_scb.h new file mode 100644 index 00000000000..8d1eb49d119 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_scb.h @@ -0,0 +1,202 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_SCB_H +#define _IPS_SCB_H + +#include "psm2_mock_testing.h" +#include "psm_user.h" +#include "ips_proto_header.h" + +/* ips_alloc_scb flags */ +#define IPS_SCB_FLAG_NONE 0x0 +#define IPS_SCB_FLAG_ADD_BUFFER 0x1 + +/* macros to update scb */ +#define ips_scb_opcode(scb) scb->opcode +#define ips_scb_buffer(scb) scb->payload +#define ips_scb_length(scb) scb->payload_size +#define ips_scb_flags(scb) scb->scb_flags +#define ips_scb_dma_cntr(scb) scb->dma_cntr +#define ips_scb_epaddr(scb) scb->epaddr +#define ips_scb_cb(scb) scb->callback +#define ips_scb_cb_param(scb) scb->cb_param + +#define ips_scb_copy_tag(dst, src) \ + (dst)[0] = (src)[0]; \ + (dst)[1] = (src)[1]; \ + (dst)[2] = (src)[2]; + +struct ips_scbbuf; +struct ips_scb; +struct ips_scbctrl; +struct ips_tid_send_desc; + +typedef void (*ips_scbctrl_avail_callback_fn_t) (struct ips_scbctrl *, + void *context); + +STAILQ_HEAD(ips_scb_stailq, ips_scb); +SLIST_HEAD(ips_scb_slist, ips_scb); + +struct ips_scbctrl { + /* const psmi_context_t *context; */ + + /* Send control blocks for each send */ + uint32_t scb_num; + uint32_t scb_num_cur; + SLIST_HEAD(scb_free, ips_scb) scb_free; + void *scb_base; + ips_scbctrl_avail_callback_fn_t scb_avail_callback; + void *scb_avail_context; + + /* Immediate data for send buffers */ + uint32_t scb_imm_size; + void *scb_imm_buf; + psmi_timer *timers; /* ack/send timers */ + + /* + * Send buffers (or bounce buffers) to keep user data if we need to + * retransmit. 
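+ * Small payloads (those no larger than scb_imm_size) are sent from the + * scb's preallocated imm_payload area; larger payloads take a bounce + * buffer from sbuf_free (see ips_scbctrl_bufalloc()), which also sets + * IPS_SEND_FLAG_ACKREQ once fewer than half of the bounce buffers + * remain, so that buffers can be reclaimed.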
+ */ + uint32_t sbuf_num; + uint32_t sbuf_num_cur; + SLIST_HEAD(sbuf_free, ips_scbbuf) sbuf_free; + void *sbuf_buf_alloc; + uint32_t sbuf_buf_size; + void *sbuf_buf_base; + void *sbuf_buf_last; +}; + +struct ips_scbbuf { + SLIST_ENTRY(ips_scbbuf) next; +}; + +typedef struct ips_scb ips_scb_t; + +struct ips_scb { + union { + SLIST_ENTRY(ips_scb) next; + STAILQ_ENTRY(ips_scb) nextq; + }; + union { + void *payload; // used for UD and UDP + struct ips_scbbuf *sbuf; // linkage for free scb's + }; + uint64_t ack_timeout; /* in cycles */ + uint64_t abs_timeout; /* in cycles */ + + psmi_timer *timer_send; /* for sending packets */ + psmi_timer *timer_ack; /* for acking packets */ + + /* Used when composing packet */ + psmi_seqnum_t seq_num; + uint32_t cksum[2]; + uint32_t scb_flags; + uint32_t payload_size; /* remaining first packet size */ + uint32_t chunk_size; /* total buffer size if nfrag > 1 */ + /* initially chunk_size_remaining = chunk_size. */ + uint32_t chunk_size_remaining; /* buffer size to re-transmit */ + uint16_t nfrag; /* total packets in sequence */ + /* initially nfrag_remaining = nfrag */ + uint16_t nfrag_remaining; /* number packets to re-transmit */ + uint16_t dma_complete; + uint16_t tidctrl; + uint16_t frag_size; /* max packet size in sequence */ + uint16_t opcode; + uint16_t tsess_length; + uint32_t *tsess; + struct ips_flow *flow; + struct ips_tid_send_desc *tidsendc; + + struct ips_scbctrl *scbc; + void *imm_payload; + + union { + int (*callback) (void *, uint32_t); + psm2_am_completion_fn_t completion_am; + }; + void *cb_param; +#ifdef PSM_CUDA + psm2_mq_req_t mq_req; /* back pointer to original request */ +#endif + struct { + struct ips_message_header ips_lrh; + } PSMI_CACHEALIGN; +}; + + +#ifdef PSM_CUDA +#define IS_TRANSFER_BUF_GPU_MEM(scb) (ips_scb_flags(scb) & IPS_SEND_FLAG_PAYLOAD_BUF_GPU) +#endif + +void ips_scbctrl_free(ips_scb_t *scb); +int ips_scbctrl_bufalloc(ips_scb_t *scb); +int ips_scbctrl_avail(struct ips_scbctrl *scbc); +ips_scb_t *MOCKABLE(ips_scbctrl_alloc)(struct ips_scbctrl *scbc, + int scbnum, int len, uint32_t flags); +MOCK_DCL_EPILOGUE(ips_scbctrl_alloc); +ips_scb_t *MOCKABLE(ips_scbctrl_alloc_tiny)(struct ips_scbctrl *scbc); +MOCK_DCL_EPILOGUE(ips_scbctrl_alloc_tiny); + +psm2_error_t ips_scbctrl_init(const psmi_context_t *context, + uint32_t numscb, uint32_t numbufs, + uint32_t imm_size, uint32_t bufsize, + ips_scbctrl_avail_callback_fn_t, + void *avail_context, struct ips_scbctrl *); +psm2_error_t ips_scbctrl_fini(struct ips_scbctrl *); + +psm2_error_t ips_scbctrl_writev(struct ips_scb_slist *slist, int fd); + +#endif /* _IPS_SCB_H */ diff --git a/prov/psm3/psm3/ptl_ips/ips_stats.h b/prov/psm3/psm3/ptl_ips/ips_stats.h new file mode 100644 index 00000000000..046e0c38a2f --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_stats.h @@ -0,0 +1,83 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_STATS_H +#define _IPS_STATS_H + +struct psm2_epaddr; /* for non-PSM clients */ + +/* Old stats */ +typedef struct { + uint64_t err_chk_send; + uint64_t err_chk_recv; + uint64_t send_failed; + uint64_t recv_dropped; + union { + uint64_t recv_copied; /* obsolete */ + uint64_t nak_sent; + }; + uint64_t nak_recv; + uint64_t total_send_eager; + uint64_t total_send_exp; + uint64_t acks_sent; + uint64_t retransmits; + uint64_t recv_matched; + uint64_t recv_unmatched; + uint64_t scb_alloc_yields; +} ips_sess_stat; + +int ips_get_stat(struct psm2_epaddr *epaddr, ips_sess_stat *stats); + +#endif /* _IPS_STATS_H */ diff --git a/prov/psm3/psm3/ptl_ips/ips_subcontext.h b/prov/psm3/psm3/ptl_ips/ips_subcontext.h new file mode 100644 index 00000000000..4f5afcbab0e --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_subcontext.h @@ -0,0 +1,79 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef __IPS_SUBCONTEXT_H +#define __IPS_SUBCONTEXT_H + +#include "psm_user.h" +#include "ips_recvhdrq.h" +#include "ips_writehdrq.h" + +/* This data structure is allocated in ureg page of each subcontext process */ + +struct ips_subcontext_ureg { + /* head/eager head/tail register storage, one per cacheline + (member is unused by PSM, but needed here to match driver structures). */ + uint64_t subcontext_uregbase[40 /* i.e. ur_maxreg * 8 */]; + struct ips_writehdrq_state writeq_state; /* used in all ureg pages */ +} __attribute__ ((aligned(64))); + +struct ips_hwcontext_ctrl { + pthread_spinlock_t context_lock; /* lock shared by all subctxts */ + struct ips_recvhdrq_state recvq_state; /* state shared by all subctxts */ + uint32_t rx_hdrq_rhf_seq; /* rhf seq for the hw hdrq shared + by all subctxts */ +} __attribute__ ((aligned(64))); + +#endif diff --git a/prov/psm3/psm3/ptl_ips/ips_tid.c b/prov/psm3/psm3/ptl_ips/ips_tid.c new file mode 100644 index 00000000000..e7349dde133 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_tid.c @@ -0,0 +1,55 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + diff --git a/prov/psm3/psm3/ptl_ips/ips_tid.h b/prov/psm3/psm3/ptl_ips/ips_tid.h new file mode 100644 index 00000000000..6d31defc872 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_tid.h @@ -0,0 +1,61 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +/* included header files */ + +#ifndef _IPS_TID_H +#define _IPS_TID_H + +#endif /* _IPS_TID_H */ diff --git a/prov/psm3/psm3/ptl_ips/ips_tidcache.c b/prov/psm3/psm3/ptl_ips/ips_tidcache.c new file mode 100644 index 00000000000..f7588b83fe0 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_tidcache.c @@ -0,0 +1,53 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + diff --git a/prov/psm3/psm3/ptl_ips/ips_tidcache.h b/prov/psm3/psm3/ptl_ips/ips_tidcache.h new file mode 100644 index 00000000000..20d45bf5fa1 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_tidcache.h @@ -0,0 +1,158 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. 
+ + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef _IPS_TIDCACHE_H +#define _IPS_TIDCACHE_H + +#include +#include +#include +#include + +/* + * Design notes. + * + * PSM needs to call into the driver to program receiving buffer pages into + * HFI gen1 hardware; each tid can be programmed with physically contiguous + * power-of-two pages, from 1 page to 512 pages. This procedure takes + * time. + * + * Many applications tend to re-use the same receiving buffer, so caching + * such programmed tids in the user space process saves time and improves + * application performance. + * + * This PSM tid registration caching design requires cooperation between + * PSM and the driver. Here is what happens between PSM and the driver: + * + * 1. PSM calls into the driver with a chunk of buffer with a virtual address + * and length. + * 2. The driver pins the buffer pages, programs the hardware with the physical + * pages, and gets a list of tids. + * 3. The driver caches the tids with the corresponding virtual address in + * user space for each tid, and returns the list of tids back to PSM. + * 4. PSM also caches the list of tids with the corresponding virtual + * address for each tid, and uses the list of tids for transmission. + * 5. When the process frees a buffer, the kernel VM catches the event and + * calls the callback in the driver to notify it that the virtual address + * range is gone in the process. + * 6.
The driver searches its cache system and finds the tids with the + * removed virtual address, puts these tids in an invalidation queue + * and notifies PSM of the event. + * 7. PSM picks up the event and removes the tids from its own cache + * as well. + * 8. PSM must check for such invalidation events every time before searching + * its caching system to match tids for a 'new' buffer chunk. + * 9. When the caching system is full and a new buffer chunk is asked + * to be registered, PSM picks a victim to remove. + */ + +typedef struct +{ + unsigned long start; /* start virtual address */ + uint32_t tidinfo; /* tid encoding */ + uint16_t length; /* length in pages */ + uint16_t invalidate; /* invalidate flag */ + uint16_t refcount; /* usage reference count */ + uint16_t i_prev; /* idle queue previous */ + uint16_t i_next; /* idle queue next */ +} rbtree_tidcache_mapitem_pl_t; + +typedef struct { + uint32_t ntid; /* number of tids cached */ + uint32_t nidle; /* number of idle tids */ +} rbtree_tidcache_map_pl_t; + +#define RBTREE_MI_PL rbtree_tidcache_mapitem_pl_t +#define RBTREE_MAP_PL rbtree_tidcache_map_pl_t + +#include "rbtree.h" + +/* + * Macro definitions for easy programming. + */ + +#define NTID p_map->payload.ntid +#define REFCNT(x) p_map->root[x].payload.refcount +#define INVALIDATE(x) p_map->root[x].payload.invalidate + +#define LENGTH(x) p_map->root[x].payload.length +#define START(x) p_map->root[x].payload.start +#define END(x) (START(x) + (LENGTH(x)<<12)) + +/* + * Macros for idle tid queue management. + */ +#define NIDLE p_map->payload.nidle +#define IHEAD 0 +#define INEXT(x) p_map->root[x].payload.i_next +#define IPREV(x) p_map->root[x].payload.i_prev + +#define IDLE_REMOVE(x) do { \ + INEXT(IPREV(x)) = INEXT(x); \ + IPREV(INEXT(x)) = IPREV(x); \ + NIDLE--; \ + } while (0) + +#define IDLE_INSERT(x) do { \ + INEXT(x) = INEXT(IHEAD); \ + IPREV(x) = IHEAD; \ + IPREV(INEXT(IHEAD)) = x; \ + INEXT(IHEAD) = x; \ + NIDLE++; \ + } while (0) + +extern void ips_tidcache_map_init(cl_qmap_t *p_map, + cl_map_item_t* const root, + cl_map_item_t* const nil_item); + +#endif diff --git a/prov/psm3/psm3/ptl_ips/ips_tidflow.c b/prov/psm3/psm3/ptl_ips/ips_tidflow.c new file mode 100644 index 00000000000..9d671fd0b9c --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_tidflow.c @@ -0,0 +1,273 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution.
+ * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_proto.h" +#include "ips_expected_proto.h" +#include "ips_tidflow.h" + +psm2_error_t ips_tf_init(struct ips_protoexp *protoexp, + const psmi_context_t *context, + struct ips_tf *tfc, + ips_tf_avail_cb_fn_t cb) +{ + int tf_idx; + +#if TF_ADD + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECL("tidflow_update_count", + MPSPAWN_STATS_REDUCTION_ALL, + NULL, &tfc->tf_num_total), + }; +#endif + + tfc->context = context; + tfc->tf_num_total = 0; + tfc->tf_num_inuse = 0; + tfc->tf_avail_cb = cb; + tfc->tf_avail_context = (void *)protoexp; + tfc->tf_gen_mask = 0xFFFFF; + + /* Allocate and initialize the tidrecvc array. */ + tfc->tidrecvc = (struct ips_tid_recv_desc *) + psmi_calloc(context->ep, UNDEFINED, 1, + sizeof(struct ips_tid_recv_desc)*HFI_TF_NFLOWS); + if (tfc->tidrecvc == NULL) + return PSM2_NO_MEMORY; + + for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { + tfc->tidrecvc[tf_idx].context = context; + tfc->tidrecvc[tf_idx].protoexp = protoexp; + tfc->tidrecvc[tf_idx].rdescid._desc_idx = tf_idx; + tfc->tidrecvc[tf_idx].rdescid._desc_genc = tf_idx; + } + + /* Shared control structure; it will be in shared memory + * for context sharing, otherwise calloc() it */ + tfc->tf_ctrl = (struct ips_tf_ctrl *)context->tf_ctrl; + if (!tfc->tf_ctrl) { + tfc->tf_ctrl = (struct ips_tf_ctrl *) + psmi_calloc(context->ep, UNDEFINED, 1, + sizeof(struct ips_tf_ctrl)); + if (tfc->tf_ctrl == NULL) { + return PSM2_NO_MEMORY; + } + } + + /* + * Only the master process can initialize. + */ + { + pthread_spin_init(&tfc->tf_ctrl->tf_ctrl_lock, + PTHREAD_PROCESS_SHARED); + tfc->tf_ctrl->tf_num_max = HFI_TF_NFLOWS; + tfc->tf_ctrl->tf_num_avail = HFI_TF_NFLOWS; + + for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { +// USE_RC TBD this is bizarre. For native mode it works fine, +// for UD/UDP mode it crashes at the next_free assignment below on some systems, +// but adding this print or moving the next_free assignment to a separate +// loop works fine. Really odd if this is a compiler issue, but +// I don't see any other reason.
We should be single threaded here +// enabling the empty call to tidflow_reset doesn't help +// stubbing tidflow_reset on native works fine, can't explain crash +// nor workaround + /* Update flow state */ + tfc->tf_ctrl->tf[tf_idx].state = TF_STATE_DEALLOCATED; + tfc->tf_ctrl->tf[tf_idx].tf_idx = tf_idx; + tfc->tf_ctrl->tf[tf_idx].next_gen = 0; +#if 0 + tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1; +#endif + + } +#if 1 + for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { + tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1; + } +#endif + tfc->tf_ctrl->tf_head = 0; + } + +#if TF_ADD + /* TF_ADD: Add a new stats type for tid flows in psm_stats.h */ + return psmi_stats_register_type("TID_Flow_Statistics", + PSMI_STATSTYPE_TIDS, + entries, + PSMI_STATS_HOWMANY(entries), + protoexp->proto->ep->epid, tfc); +#else + return PSM2_OK; +#endif +} + +psm2_error_t ips_tf_fini(struct ips_tf *tfc) +{ + psmi_stats_deregister_type(PSMI_STATSTYPE_TIDS, tfc); + if (!tfc->context->tf_ctrl) + psmi_free(tfc->tf_ctrl); + psmi_free(tfc->tidrecvc); + return PSM2_OK; +} + +/* Allocate a tidflow */ +psm2_error_t ips_tf_allocate(struct ips_tf *tfc, + struct ips_tid_recv_desc **tidrecvc) +{ + struct ips_tf_ctrl *ctrl = tfc->tf_ctrl; + struct ips_tf_entry *entry; + + if (tfc->context->tf_ctrl) + pthread_spin_lock(&ctrl->tf_ctrl_lock); + + if (!ctrl->tf_num_avail) { + psmi_assert(ctrl->tf_head == HFI_TF_NFLOWS); + *tidrecvc = NULL; + + if (tfc->context->tf_ctrl) + pthread_spin_unlock(&ctrl->tf_ctrl_lock); + + return PSM2_EP_NO_RESOURCES; + } + + entry = &ctrl->tf[ctrl->tf_head]; + ctrl->tf_head = entry->next_free; + ctrl->tf_num_avail--; + + if (tfc->context->tf_ctrl) + pthread_spin_unlock(&ctrl->tf_ctrl_lock); + + tfc->tf_num_total++; + tfc->tf_num_inuse++; + + psmi_assert(entry->state == TF_STATE_DEALLOCATED); + entry->state = TF_STATE_ALLOCATED; + + *tidrecvc = &(tfc->tidrecvc[entry->tf_idx]); + /* initial tidflow generation */ + (*tidrecvc)->tidflow_active_gen = entry->next_gen; + + psmi_assert((*tidrecvc)->rdescid._desc_idx == entry->tf_idx); + psmi_assert_always(entry->next_gen < tfc->tf_gen_mask); + + entry->next_gen++; + if (entry->next_gen == tfc->tf_gen_mask) + entry->next_gen = 0; + + return PSM2_OK; +} + +/* Deallocate a tidflow */ +psm2_error_t ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx) +{ + struct ips_tf_ctrl *ctrl = tfc->tf_ctrl; + struct ips_tf_entry *entry; + + psmi_assert(tf_idx < HFI_TF_NFLOWS); + psmi_assert(tf_idx >= 0); + + entry = &ctrl->tf[tf_idx]; + psmi_assert(entry->state == TF_STATE_ALLOCATED); + entry->state = TF_STATE_DEALLOCATED; + + /* we track all 32 bits to improve ability for err_chk_rdma + * to identify completed requests vs rdescid reuse + * however only low 16 bits are used in RDMA immediate data + */ + tfc->tidrecvc[tf_idx].rdescid.u32w1++; + + + if (tfc->context->tf_ctrl) + pthread_spin_lock(&ctrl->tf_ctrl_lock); + + entry->next_free = ctrl->tf_head; + ctrl->tf_head = tf_idx; + ctrl->tf_num_avail++; + + if (tfc->context->tf_ctrl) + pthread_spin_unlock(&ctrl->tf_ctrl_lock); + + tfc->tf_num_inuse--; + /* If an available callback is registered invoke it */ + if (((tfc->tf_num_inuse + 1) == ctrl->tf_num_max) && tfc->tf_avail_cb) + tfc->tf_avail_cb(tfc, tfc->tf_avail_context); + + return PSM2_OK; +} + +/* Allocate a generation for a flow */ +psm2_error_t ips_tfgen_allocate(struct ips_tf *tfc, + uint32_t tf_idx, uint32_t *tfgen) +{ + struct ips_tf_entry *entry; + int ret = PSM2_OK; + + psmi_assert(tf_idx < HFI_TF_NFLOWS); + psmi_assert(tf_idx >= 0); + + entry = 
&tfc->tf_ctrl->tf[tf_idx]; + psmi_assert(entry->state == TF_STATE_ALLOCATED); + + *tfgen = entry->next_gen; + + entry->next_gen++; + if (entry->next_gen == tfc->tf_gen_mask) + entry->next_gen = 0; + + psmi_assert_always(*tfgen < tfc->tf_gen_mask); + + return ret; +} diff --git a/prov/psm3/psm3/ptl_ips/ips_tidflow.h b/prov/psm3/psm3/ptl_ips/ips_tidflow.h new file mode 100644 index 00000000000..2a93fb7b090 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_tidflow.h @@ -0,0 +1,131 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2016 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2016 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. 
*/ + +#ifndef _IPS_TIDFLOW_H +#define _IPS_TIDFLOW_H + +#include "psm_user.h" + +#define TF_ADD 1 // enable code for tidflow statistics + +struct ips_tf; +struct ips_protoexp; + +typedef void (*ips_tf_avail_cb_fn_t) (struct ips_tf *, void *context); +typedef enum { + TF_STATE_INVALID = 0, + TF_STATE_ALLOCATED = 1, + TF_STATE_DEALLOCATED = 2 +} tf_state_t; + +struct ips_tf_entry { + tf_state_t state; + uint32_t tf_idx; + uint32_t next_gen; + uint32_t next_free; +}; + +struct ips_tf_ctrl { + pthread_spinlock_t tf_ctrl_lock; + uint32_t tf_num_max; + uint32_t tf_num_avail; + uint32_t tf_head; + struct ips_tf_entry tf[HFI_TF_NFLOWS]; +} __attribute__ ((aligned(64))); + +struct ips_tf { + const psmi_context_t *context; + ips_tf_avail_cb_fn_t tf_avail_cb; + void *tf_avail_context; + struct ips_tf_ctrl *tf_ctrl; + + uint64_t tf_num_total; + uint32_t tf_num_inuse; + uint32_t tf_gen_mask; + + /* Pointer to array of size HFI_TF_NFLOWS */ + struct ips_tid_recv_desc *tidrecvc; +}; + +PSMI_ALWAYS_INLINE(int ips_tf_available(struct ips_tf *tf)) +{ + if (tf->tf_ctrl->tf_num_avail == 0) { + if (tf->tf_ctrl->tf_num_max == tf->tf_num_inuse) + return -1; + else + return 0; + } + + return tf->tf_ctrl->tf_num_avail; +} + +psm2_error_t ips_tf_init(struct ips_protoexp *protoexp, + const psmi_context_t *context, + struct ips_tf *tfc, + ips_tf_avail_cb_fn_t cb); +psm2_error_t ips_tf_fini(struct ips_tf *tfc); + +/* Allocate a tidflow */ +psm2_error_t ips_tf_allocate(struct ips_tf *tfc, + struct ips_tid_recv_desc **tidrecvc); + +/* Deallocate a tidflow */ +psm2_error_t ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx); + +/* Allocate a generation for a flow */ +psm2_error_t ips_tfgen_allocate(struct ips_tf *tfc, + uint32_t tf_idx, uint32_t *tfgen); + +#endif diff --git a/prov/psm3/psm3/ptl_ips/ips_writehdrq.c b/prov/psm3/psm3/ptl_ips/ips_writehdrq.c new file mode 100644 index 00000000000..fc30d546d5e --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_writehdrq.c @@ -0,0 +1,61 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include "psm_user.h" +#include "psm2_hal.h" +#include "ips_writehdrq.h" +#include "ips_proto_params.h" + + diff --git a/prov/psm3/psm3/ptl_ips/ips_writehdrq.h b/prov/psm3/psm3/ptl_ips/ips_writehdrq.h new file mode 100644 index 00000000000..0ad489a3805 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ips_writehdrq.h @@ -0,0 +1,83 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ + +#ifndef _IPS_WRITEHDRQ_H +#define _IPS_WRITEHDRQ_H + +#include "psm_user.h" +#include "ips_recvq.h" + +/* + * Structure containing state for writehdrq writing. This is logically + * part of ips_writehdrq but needs to be separated out for context + * sharing so that it can be put in a shared memory page and hence + * be available to all processes sharing the port. Generally, do not + * put pointers in here since the address map of each process can be + * different. + */ +struct ips_writehdrq_state { + uint32_t hdrq_rhf_seq; /* last seq */ + uint32_t egrq_offset; /* in bytes unit, not 64B */ + uint32_t enabled; /* enables writing */ +}; + +struct ips_writehdrq { + const psmi_context_t *context; + struct ips_writehdrq_state *state; + uint32_t hdrq_elemlast; +}; + + +#endif /* _IPS_WRITEHDRQ_H */ diff --git a/prov/psm3/psm3/ptl_ips/ptl.c b/prov/psm3/psm3/ptl_ips/ptl.c new file mode 100644 index 00000000000..b8ff1becc0a --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ptl.c @@ -0,0 +1,622 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ + +/* This file implements the PSM PTL for ips */ +#include "psm_user.h" +#include "psm2_hal.h" +#include "ptl_ips.h" +#include "psm_mq_internal.h" + + +static size_t ips_ptl_sizeof(void) +{ + return sizeof(struct ptl_ips); +} + +#if 0 // unused code, specific to QLogic MPI +static +int ips_ptl_epaddr_stats_num(void) +{ + return sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t); +} + +static +int ips_ptl_epaddr_stats_init(char **desc, uint16_t *flags) +{ + int num_stats = + sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t); + int i; + int j=0; + + /* All stats are uint64_t */ + for (i = 0; i < num_stats; i++) + flags[i] = MPSPAWN_STATS_REDUCTION_ALL | + MPSPAWN_STATS_SKIP_IF_ZERO; + + desc[j++] = "errchecks send"; + desc[j++] = "errchecks recv"; +#if defined(RNDV_MOD_MR) + desc[j++] = "err_chk_rdma send"; + desc[j++] = "err_chk_rdma recv"; +#endif + desc[j++] = "nak send"; + desc[j++] = "nak recv"; + desc[j++] = "connect req send"; + desc[j++] = "connect req recv"; + desc[j++] = "connect rep send"; + desc[j++] = "connect rep recv"; + desc[j++] = "disconnect req send"; + desc[j++] = "disconnect req recv"; + desc[j++] = "disconnect rep send"; + desc[j++] = "disconnect rep recv"; + desc[j++] = "tid grants send"; + desc[j++] = "tid grants recv"; + desc[j++] = "send rexmit"; +#if defined(RNDV_MOD_MR) + desc[j++] = "rdma rexmit"; +#endif + desc[j++] = "congestion packets"; + + psmi_assert(num_stats == j); + return num_stats; +} + +int ips_ptl_epaddr_stats_get(psm2_epaddr_t epaddr, uint64_t *stats_o) +{ + int i, num_stats = + sizeof(struct ips_proto_epaddr_stats) / sizeof(uint64_t); + uint64_t *stats_i = (uint64_t *) &epaddr->proto->epaddr_stats; + + for (i = 0; i < num_stats; i++) + stats_o[i] = stats_i[i]; + + return num_stats; +} +#endif // 0 // unused code, specific to QLogic MPI + +static +psm2_error_t +psmi_context_check_status_callback(struct psmi_timer *t, uint64_t current) +{ + struct ptl_ips *ptl = (struct ptl_ips *)t->context; + const uint64_t current_count = get_cycles(); + psm2_error_t err; + + err = psmi_context_check_status(ptl->context); + if (err == PSM2_OK || err == PSM2_OK_NO_PROGRESS) + { + int rc = psmi_hal_spio_process_events((struct ptl *)ptl); + err = rc >= 0 ?
PSM2_OK : PSM2_INTERNAL_ERR; + } + psmi_timer_request_always(&ptl->timerq, &ptl->status_timer, + current_count + ptl->status_cyc_timeout); + + return err; +} + +static +psm2_error_t ips_ptl_init(const psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + psm2_error_t err = PSM2_OK; + uint32_t num_of_send_bufs = ep->hfi_num_sendbufs; + uint32_t num_of_send_desc = ep->hfi_num_descriptors; + uint32_t imm_size = ep->hfi_imm_size; + const psmi_context_t *context = &ep->context; + const int enable_shcontexts = 0; + const uint64_t current_count = get_cycles(); + + /* Preconditions */ + psmi_assert_always(ep != NULL); + psmi_assert_always(ep->epaddr != NULL); + psmi_assert_always(ep->epid != 0); + psmi_assert_always(ep->hfi_num_sendbufs > 0); + + memset(ptl, 0, sizeof(struct ptl_ips)); + + ptl->ep = ep; /* back pointer */ + ptl->epid = ep->epid; /* cache epid */ + ptl->epaddr = ep->epaddr; /* cache a copy */ + ptl->ctl = ctl; + ptl->context = context; + + memset(ctl, 0, sizeof(*ctl)); + /* Fill in the control structure */ + ctl->ep = ep; + ctl->ptl = ptl_gen; + ctl->ep_poll = ips_ptl_poll; + ctl->ep_connect = ips_ptl_connect; + ctl->ep_disconnect = ips_ptl_disconnect; + ctl->mq_send = ips_proto_mq_send; + ctl->mq_isend = ips_proto_mq_isend; + + ctl->am_get_parameters = ips_am_get_parameters; + + ctl->am_short_request = ips_am_short_request; + ctl->am_short_reply = ips_am_short_reply; + +#if 0 // unused code, specific to QLogic MPI + ctl->epaddr_stats_num = ips_ptl_epaddr_stats_num; + ctl->epaddr_stats_init = ips_ptl_epaddr_stats_init; + ctl->epaddr_stats_get = ips_ptl_epaddr_stats_get; +#endif + + ctl->msg_size_thresh_query = ips_proto_msg_size_thresh_query; + + /* + * Runtime flags in 'ptl' are different from runtime flags in 'context'. + * In 'context', runtime flags reflect what the driver is capable of. + * In 'ptl', runtime flags reflect the features we can or want to use in + * the driver's supported runtime flags. + */ + + /* + * This timer is to be used to check the context's status at every + * PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS. This is useful to detect when + * the link transitions from the DOWN state to the UP state. We can thus + * stop aggregating link failure messages once we detect that the link is + * up. + */ + psmi_timer_entry_init(&ptl->status_timer, + psmi_context_check_status_callback, ptl); + + /* cache the context's status timeout in cycles */ + ptl->status_cyc_timeout = + ms_2_cycles(PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS); + + /* + * Retransmissions and pending operations are kept in a timer structure + * (queue). The timerq is shared to various internal IPS interfaces so + * that they too may schedule events on the timer queue. The timerq is + * drained in the progress function. + */ + if ((err = psmi_timer_init(&ptl->timerq))) + goto fail; + + /* start the context's status timer */ + psmi_timer_request_always(&ptl->timerq, &ptl->status_timer, + current_count + ptl->status_cyc_timeout); + + /* + * Epstate maps endpoint ids (epid integers) to ipsaddr (structs). Mappings + * are added/removed by the connect portion of the ips protocol and lookup + * is made by the receive queue processing component. + */ + if ((err = ips_epstate_init(&ptl->epstate, context))) + goto fail; + + + /* + * Actual ips protocol handling. 
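+ * ips_proto_init() below consumes the timer queue and epstate initialized
+ * above; the receive header queue and the receive thread set up next are
+ * layered on top of that proto instance.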
+ */ + if ((err = + ips_proto_init(context, ptl_gen, num_of_send_bufs, num_of_send_desc, + imm_size, &ptl->timerq, &ptl->epstate, ptl->spioc, + &ptl->proto))) + goto fail; + + /* + * Hardware receive hdr/egr queue, services incoming packets and issues + * callbacks for protocol handling in proto_recv. It uses the epstate + * interface to determine if a packet is known or unknown. + */ + if (!enable_shcontexts) { + struct ips_recvhdrq_callbacks recvq_callbacks; + recvq_callbacks.callback_packet_unknown = + ips_proto_process_unknown; + if ((err = + ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto, + &recvq_callbacks, + &ptl->recvq + ))) + goto fail; + } + + /* + * Receive thread, always initialized but not necessary creates a + * pthread. + */ + if ((err = ips_ptl_rcvthread_init(ptl_gen, &ptl->recvq))) + goto fail; +fail: + return err; +} + +static psm2_error_t ips_ptl_fini(ptl_t *ptl_gen, int force, uint64_t timeout_in) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + psm2_error_t err = PSM2_OK; + + if ((err = ips_proto_fini(&ptl->proto, force, timeout_in))) + goto fail; + + /* We have to cancel the thread after terminating the protocol because + * connect/disconnect packets use interrupts and the kernel doesn't + * like to have no pollers waiting */ + if ((err = ips_ptl_rcvthread_fini(ptl_gen))) + goto fail; + + if ((err = ips_epstate_fini(&ptl->epstate))) + goto fail; + + + if ((err = psmi_timer_fini(&ptl->timerq))) + goto fail; + + + +fail: + return err; +} + +static +psm2_error_t +ips_ptl_optctl(const void *core_obj, int optname, + void *optval, uint64_t *optlen, int get) +{ + psm2_error_t err = PSM2_OK; + + switch (optname) { + case PSM2_IB_OPT_EP_SL: + { + /* Core object is psm2_epaddr */ + psm2_epaddr_t epaddr = (psm2_epaddr_t) core_obj; + ips_epaddr_t *ipsaddr = (ips_epaddr_t *) epaddr; + + /* If endpoint does not use IB ignore for set, complain for get */ + if (epaddr->ptlctl->ep_connect != ips_ptl_connect) { + if (get) + err = + psmi_handle_error(PSMI_EP_LOGEVENT, + PSM2_PARAM_ERR, + "Invalid EP transport"); + goto exit_fn; + } + + /* Sanity check option length */ + if (*optlen < sizeof(uint8_t)) { + err = + psmi_handle_error(PSMI_EP_LOGEVENT, + PSM2_PARAM_ERR, + "Option value length error"); + *optlen = sizeof(unsigned); + goto exit_fn; + } + + if (get) { + /* Get returns the SL for the PIO flow */ + *((uint8_t *) optval) = + (uint8_t) ipsaddr-> + flows[EP_FLOW_GO_BACK_N_PIO].path->pr_sl; + } else { + uint16_t new_sl; + + /* Sanity check if SL is within range */ + new_sl = (uint16_t) *(uint8_t *) optval; + if (new_sl > PSMI_SL_MAX) { + err = + psmi_handle_error(PSMI_EP_LOGEVENT, + PSM2_PARAM_ERR, + "Invalid SL value %u. 
%d<= SL <=%d.", + new_sl, PSMI_SL_MIN, PSMI_SL_MAX); + goto exit_fn; + } + + /* Set new SL for all flows */ + ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].path-> + pr_sl = new_sl; + } + } + break; + case PSM2_IB_OPT_DF_SL: + { + /* Set default SL to be used by an endpoint for all communication */ + /* Core object is psm2_epaddr */ + psm2_ep_t ep = (psm2_ep_t) core_obj; + + /* Make sure ep is specified */ + if (!ep) { + err = + psmi_handle_error(PSMI_EP_LOGEVENT, + PSM2_PARAM_ERR, + "Invalid PSM Endpoint"); + goto exit_fn; + } + + /* Sanity check option length */ + if (*optlen < sizeof(uint8_t)) { + err = + psmi_handle_error(PSMI_EP_LOGEVENT, + PSM2_PARAM_ERR, + "Option value length error"); + *optlen = sizeof(uint8_t); + goto exit_fn; + } + + if (get) { + *((uint8_t *) optval) = + ((struct ptl_ips *)(ep->ptl_ips.ptl))->proto.epinfo.ep_sl; + } else { + uint16_t new_sl; + + /* Sanity check if SL is within range */ + new_sl = (uint16_t) *(uint8_t *) optval; + if (new_sl > PSMI_SL_MAX) { + err = + psmi_handle_error(PSMI_EP_LOGEVENT, + PSM2_PARAM_ERR, + "Invalid SL value %u. %d<= SL <=%d.", + new_sl, PSMI_SL_MIN, PSMI_SL_MAX); + goto exit_fn; + } + + ((struct ptl_ips *)(ep->ptl_ips.ptl))->proto.epinfo.ep_sl = + (uint8_t) new_sl; + } + } + break; + default: + err = + psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Unknown PSM3_IB option %u.", optname); + } + +exit_fn: + return err; +} + +static +psm2_error_t +ips_ptl_setopt(const void *component_obj, int optname, + const void *optval, uint64_t optlen) +{ + return ips_ptl_optctl(component_obj, optname, (void *)optval, &optlen, + 0); +} + +static +psm2_error_t +ips_ptl_getopt(const void *component_obj, int optname, + void *optval, uint64_t *optlen) +{ + return ips_ptl_optctl(component_obj, optname, optval, optlen, 1); +} + +static +uint32_t +ips_ptl_rcvthread_is_enabled(const ptl_t *ptl) +{ + return psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); +} + +psm2_error_t ips_ptl_poll(ptl_t *ptl_gen, int _ignored) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + const uint64_t current_count = get_cycles(); + const int do_lock = PSMI_LOCK_DISABLED && + psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); + psm2_error_t err = PSM2_OK_NO_PROGRESS; + psm2_error_t err2; + + if (do_lock && !ips_recvhdrq_trylock(&ptl->recvq)) + return err; + err = ips_recvhdrq_progress(&ptl->recvq); + if (do_lock) + ips_recvhdrq_unlock(&ptl->recvq); + if_pf(err > PSM2_OK_NO_PROGRESS) + return err; + err2 = psmi_timer_process_if_expired(&(ptl->timerq), current_count); + if (err2 != PSM2_OK_NO_PROGRESS) + return err2; + else { + // TBD - where to best poll for this + (void)psm2_verbs_completion_update(ptl->ep); + return err; + } +} + + + + +/* + * Legacy ips_get_stat -- do nothing. 
+ */ +int ips_get_stat(psm2_epaddr_t epaddr, ips_sess_stat *stats) +{ + memset(stats, 0, sizeof(ips_sess_stat)); + return 0; +} + + + +psm2_error_t +ips_ptl_connect(ptl_t *ptl_gen, int numep, const psm2_epid_t *array_of_epid, + const int *array_of_epid_mask, psm2_error_t *array_of_errors, + psm2_epaddr_t *array_of_epaddr, uint64_t timeout_in) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + psm2_error_t err; + psm2_ep_t ep; + psm2_epid_t *epid_array = NULL; + psm2_error_t *error_array = NULL; + psm2_epaddr_t *epaddr_array = NULL; + ips_epaddr_t *ipsaddr_master, *ipsaddr; + int *mask_array = NULL; + int i; + + PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock); + err = ips_proto_connect(&ptl->proto, numep, array_of_epid, + array_of_epid_mask, array_of_errors, + array_of_epaddr, timeout_in); + if (err) + return err; + + psmi_assert_always(ptl->ep->mctxt_master == ptl->ep); + if (ptl->ep->mctxt_next == ptl->ep) + return err; + + /* make the additional multi-context connections. */ + epid_array = (psm2_epid_t *) + psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_epid_t) * numep); + mask_array = (int *) + psmi_malloc(ptl->ep, UNDEFINED, sizeof(int) * numep); + error_array = (psm2_error_t *) + psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_error_t) * numep); + epaddr_array = (psm2_epaddr_t *) + psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm2_epaddr_t) * numep); + if (!epid_array || !mask_array || !error_array || !epaddr_array) { + goto fail; + } + + ep = ptl->ep->mctxt_next; + while (ep != ep->mctxt_master) { + + /* Setup the mask array and epid array. */ + for (i = 0; i < numep; i++) { + if (array_of_epid_mask[i] + && array_of_errors[i] == PSM2_OK) { + ipsaddr_master = + (ips_epaddr_t *) array_of_epaddr[i]; + ipsaddr = ipsaddr_master->next; + mask_array[i] = 0; + while (ipsaddr != ipsaddr_master) { + if (((psm2_epaddr_t) ipsaddr)->proto-> + ep == ep) { + mask_array[i] = 1; + epid_array[i] = + ((psm2_epaddr_t) ipsaddr)-> + epid; + break; + } + ipsaddr = ipsaddr->next; + } + } else { + mask_array[i] = 0; + } + } + + /* Make the real protocol connections. */ + err = + ips_proto_connect(&((struct ptl_ips *)(ep->ptl_ips.ptl))->proto, + numep, epid_array, mask_array, error_array, + epaddr_array, timeout_in); + if (err) + goto fail; + + ep = ep->mctxt_next; + } + +fail: + if (epid_array) + psmi_free(epid_array); + if (mask_array) + psmi_free(mask_array); + if (error_array) + psmi_free(error_array); + if (epaddr_array) + psmi_free(epaddr_array); + + return err; +} + +psm2_error_t +ips_ptl_disconnect(ptl_t *ptl_gen, int force, int numep, + psm2_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm2_error_t array_of_errors[], uint64_t timeout_in) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + int *array_of_epaddr_mask_internal, i; + psm2_error_t err; + + /* + * Copy true values from array_of_epaddr_mask, provided that their + * respective epaddr is an ips one. + * Newly created mask will be used for the protocol disconnect call + * instead.
+ */ + PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock); + array_of_epaddr_mask_internal = psmi_calloc(ptl->ep, UNDEFINED, + sizeof(int), numep); + if (!array_of_epaddr_mask_internal) + return PSM2_NO_MEMORY; + + for (i = 0; i < numep; ++i) { + if (array_of_epaddr_mask[i] && array_of_epaddr[i] + && array_of_epaddr[i]->ptlctl->ptl == ptl_gen) { + array_of_epaddr_mask_internal[i] = 1; + } + } + + err = ips_proto_disconnect(&ptl->proto, force, numep, array_of_epaddr, + array_of_epaddr_mask_internal, + array_of_errors, timeout_in); + + psmi_free(array_of_epaddr_mask_internal); + return err; +} + +/* Only symbol we expose out of here */ +struct ptl_ctl_init +psmi_ptl_ips = { + ips_ptl_sizeof, ips_ptl_init, ips_ptl_fini, ips_ptl_setopt, + ips_ptl_getopt +}; + +struct ptl_ctl_rcvthread +psmi_ptl_ips_rcvthread = { + ips_ptl_rcvthread_is_enabled, + ips_ptl_rcvthread_transfer_ownership, +}; diff --git a/prov/psm3/psm3/ptl_ips/ptl_fwd.h b/prov/psm3/psm3/ptl_ips/ptl_fwd.h new file mode 100644 index 00000000000..b7742609f3a --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ptl_fwd.h @@ -0,0 +1,67 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. 
*/ + +#ifndef _PTL_FWD_IPS_H +#define _PTL_FWD_IPS_H +#include "ptl.h" + +typedef struct ips_epaddr ips_epaddr_t; +typedef struct ips_msgctl ips_msgctl_t; + +/* Symbol in ips ptl */ +extern struct ptl_ctl_init psmi_ptl_ips; + +extern struct ptl_ctl_rcvthread psmi_ptl_ips_rcvthread; +#endif /* _PTL_FWD_IPS_H */ diff --git a/prov/psm3/psm3/ptl_ips/ptl_ips.h b/prov/psm3/psm3/ptl_ips/ptl_ips.h new file mode 100644 index 00000000000..185f0ec0791 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ptl_ips.h @@ -0,0 +1,161 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _IPS_PTL_H +#define _IPS_PTL_H + +#include "psm_user.h" + +#include "ips_proto.h" +#include "ips_stats.h" +#include "ips_subcontext.h" + +struct ptl_shared; + +/* + * PTL at the ips level (for OPA) + * + * This PTL structure glues all the ips components together. + * + * * ips timer, shared by various components, allows each component to + * schedule time-based expiration callbacks on the timerq. 
+ * * HW receive queue + * * send control block to handle eager messages + * * instantiation of the ips protocol + * * endpoint state, to map endpoint indexes into structures + * + * Receive-side + * + * ----[ proto ] + * / ^ ^ + * | | | + * | packet packet + * | known unknown + * add_endpt \ / + * | | + * `----> [epstate] + * ^ + * | + * lookup_endpt + * | + * [recvq] + * | + * poll + * + */ +/* Updates to this struct must be reflected in PTL_IPS_SIZE in ptl_fwd.h */ +/* IPS knows it functions as a PTL whenever ptl->ep is non-NULL */ +struct ptl_ips { + psm2_ep_t ep; /* back ptr */ + psm2_epid_t epid; /* cached from ep */ + psm2_epaddr_t epaddr; /* cached from ep */ + ips_epaddr_t *ipsaddr; /* cached from epaddr */ + ptl_ctl_t *ctl; /* cached from init */ + const psmi_context_t *context; /* cached from init */ + + void *spioc; /* PIO send control (opaque ptr) */ + struct ips_proto proto; /* protocol instance: timerq, epstate, spio */ + + struct psmi_timer_ctrl timerq; + struct ips_epstate epstate; /* map incoming packets */ + struct ips_recvhdrq_state recvq_state; + struct ips_recvhdrq recvq; /* HW recvq: epstate, proto */ + + /* timer to check the context's status */ + struct psmi_timer status_timer; + + /* context's status check timeout in cycles -- cached */ + uint64_t status_cyc_timeout; + /* Shared contexts context */ + struct ptl_shared *recvshc; + /* Rcv thread context */ + struct ptl_rcvthread *rcvthread; +} +#ifndef PACK_STRUCT_STL +#define PACK_STRUCT_STL /* nothing */ +#endif + __attribute__ ((PACK_STRUCT_STL aligned(16))); + + +/* + * Connect/disconnect are wrappers around psm proto's connect/disconnect, + * mostly to abstract away PSM-specific stuff from ips internal structures + */ +psm2_error_t ips_ptl_connect(ptl_t *ptl, int numep, + const psm2_epid_t *array_of_epid, + const int *array_of_epid_mask, + psm2_error_t *array_of_errors, + psm2_epaddr_t *array_of_epaddr, + uint64_t timeout_in); + +psm2_error_t ips_ptl_disconnect(ptl_t *ptl, int force, int numep, + psm2_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm2_error_t array_of_errors[], + uint64_t timeout_in); + +/* + * Generic Poll function for ips-level ptl + */ +psm2_error_t ips_ptl_poll(ptl_t *ptl, int _ignored); + +/* + * Support for receive thread + */ +psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl, struct ips_recvhdrq *recvq); +psm2_error_t ips_ptl_rcvthread_fini(ptl_t *ptl); +void ips_ptl_rcvthread_transfer_ownership(ptl_t *from_ptl, ptl_t *to_ptl); + +#endif /* _IPS_PTL_H */ diff --git a/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c b/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c new file mode 100644 index 00000000000..f68fd5885e5 --- /dev/null +++ b/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c @@ -0,0 +1,631 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#include <sys/poll.h> + +#include "psm_user.h" +#include "psm2_hal.h" +#include "psm_mq_internal.h" +#include "ptl_ips.h" +#include "ips_proto.h" + +struct ptl_rcvthread; + +static void *ips_ptl_pollintr(void *recvthreadc); +static psm2_error_t rcvthread_initstats(ptl_t *ptl); +static psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc); + +struct ptl_rcvthread { + const psmi_context_t *context; + const ptl_t *ptl; + struct ips_recvhdrq *recvq; + + pthread_t hdrq_threadid; + uint64_t t_start_cyc; + int pipefd[2]; + + /* stats and some for scheduling */ + uint64_t pollcnt; + uint64_t pollcnt_to; + uint64_t pollcyc; + uint64_t pollok; + + /* For scheduling interrupt thread */ + int timeout_period_min; + int timeout_period_max; + int timeout_shift; + uint64_t pollok_last; + uint64_t pollcnt_last; + uint32_t last_timeout; +}; + +#ifdef PSM_CUDA + /* This is a global cuda context (extern declaration in psm_user.h) + * stored to provide hints during a cuda failure + * due to a null cuda context. + */ + CUcontext ctxt; +#endif + +/* + * The receive thread knows about the ptl interface, so it can muck with it + * directly.
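+ * Note: struct ptl_rcvthread keeps raw pointers back into the owning ptl
+ * (recvq, context, ptl); ips_ptl_rcvthread_transfer_ownership() below has to
+ * re-point all of them when the thread is handed over to another ptl.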
+ */ +psm2_error_t ips_ptl_rcvthread_init(ptl_t *ptl_gen, struct ips_recvhdrq *recvq) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + psm2_error_t err = PSM2_OK; + struct ptl_rcvthread *rcvc; + + ptl->rcvthread = + psmi_calloc(ptl->ep, UNDEFINED, 1, sizeof(struct ptl_rcvthread)); + if (ptl->rcvthread == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + rcvc = ptl->rcvthread; + + rcvc->recvq = recvq; + rcvc->ptl = ptl_gen; + rcvc->context = ptl->context; + rcvc->t_start_cyc = get_cycles(); + +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED) + PSMI_CUDA_CALL(cuCtxGetCurrent, &ctxt); +#endif + + if (psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD) && + (!psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED))){ + + if ((err = rcvthread_initsched(rcvc))) + goto fail; + + /* Create a pipe so we can synchronously terminate the thread */ + if (pipe(rcvc->pipefd) != 0) { + err = psmi_handle_error(ptl->ep, PSM2_EP_DEVICE_FAILURE, + "Cannot create a pipe for receive thread: %s\n", + strerror(errno)); + goto fail; + } + + psmi_hal_add_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); + if (pthread_create(&rcvc->hdrq_threadid, NULL, + ips_ptl_pollintr, ptl->rcvthread)) { + close(rcvc->pipefd[0]); + close(rcvc->pipefd[1]); + err = psmi_handle_error(ptl->ep, PSM2_EP_DEVICE_FAILURE, + "Cannot start receive thread: %s\n", + strerror(errno)); + goto fail; + } + + } + + if ((err = rcvthread_initstats(ptl_gen))) + goto fail; + +fail: + return err; +} + +psm2_error_t ips_ptl_rcvthread_fini(ptl_t *ptl_gen) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)ptl->rcvthread; + uint64_t t_now; + psm2_error_t err = PSM2_OK; + + PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock); + + if (ptl->rcvthread == NULL) + return err; + + psmi_stats_deregister_type(PSMI_STATSTYPE_RCVTHREAD, rcvc); + if (rcvc->hdrq_threadid && psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED)) { + t_now = get_cycles(); + + /* Disable interrupts then kill the receive thread */ + if (psmi_context_interrupt_isenabled + ((psmi_context_t *) ptl->context)) + if ((err = + psmi_context_interrupt_set((psmi_context_t *) ptl-> + context, 0))) + goto fail; + + /* Close the pipe so we can have the thread synchronously exit. + On Linux just closing the pipe does not wake up the receive + thread. 
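+ * Writing a token into the pipe before closing it guarantees that the
+ * poll() in ips_ptl_pollintr() sees POLLIN on its termination fd, so the
+ * thread exits even where the POLLHUP from close() alone is not delivered.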
+ */ + if (write(rcvc->pipefd[1], (const void *)&t_now, + sizeof(uint64_t)) == -1 || + close(rcvc->pipefd[1]) == -1) { + _HFI_VDBG + ("unable to close pipe to receive thread cleanly\n"); + } + pthread_join(rcvc->hdrq_threadid, NULL); + psmi_hal_sub_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); + rcvc->hdrq_threadid = 0; + if (_HFI_PRDBG_ON) { + _HFI_PRDBG_ALWAYS + ("rcvthread poll success %lld/%lld times, " + "thread cancelled in %.3f us\n", + (long long)rcvc->pollok, (long long)rcvc->pollcnt, + (double)cycles_to_nanosecs(get_cycles() - t_now) / 1e3); + } + } + + psmi_free(ptl->rcvthread); + ptl->rcvthread = NULL; +fail: + return err; +} + +void ips_ptl_rcvthread_transfer_ownership(ptl_t *from_ptl_gen, ptl_t *to_ptl_gen) +{ + struct ptl_rcvthread *rcvc; + + psmi_hal_sub_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED); + struct ptl_ips *from_ptl = (struct ptl_ips *)from_ptl_gen; + struct ptl_ips *to_ptl = (struct ptl_ips *)to_ptl_gen; + to_ptl->rcvthread = from_ptl->rcvthread; + from_ptl->rcvthread = NULL; + + rcvc = to_ptl->rcvthread; + + rcvc->recvq = &to_ptl->recvq; + rcvc->context = to_ptl->context; + rcvc->ptl = to_ptl_gen; +} + +psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc) +{ + union psmi_envvar_val env_to; + char buf[192]; + char *rcv_freq = buf; + int no_timeout = 0; + int tvals[3] = { RCVTHREAD_TO_MIN_FREQ, + RCVTHREAD_TO_MAX_FREQ, + RCVTHREAD_TO_SHIFT + }; + snprintf(buf, sizeof(buf) - 1, "%d:%d:%d", RCVTHREAD_TO_MIN_FREQ, + RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT); + buf[sizeof(buf) - 1] = '\0'; + + if (!psmi_getenv("PSM3_RCVTHREAD_FREQ", + "Recv Thread frequency (per sec) ", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)rcv_freq, &env_to)) { + /* not using default values */ + int nparsed = psmi_parse_str_tuples(env_to.e_str, 3, tvals); + int invalid = 0; + + if (nparsed < 1 || (nparsed > 0 && tvals[0] == 0) || + (nparsed > 1 && tvals[1] == 0)) { + no_timeout = 1; + } else { + if (nparsed > 0 && tvals[0] > 1000) + invalid = 1; + if (nparsed > 1 + && (tvals[1] > 1000 || tvals[1] < tvals[0])) + invalid = 1; + if (nparsed > 2 && tvals[2] > 10) + invalid = 1; + } + + if (invalid) { + _HFI_INFO + ("Overriding invalid request for RcvThread frequency" + " settings of %s to be <%d:%d:%d>\n", env_to.e_str, + RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ, + RCVTHREAD_TO_SHIFT); + tvals[0] = RCVTHREAD_TO_MIN_FREQ; + tvals[1] = RCVTHREAD_TO_MAX_FREQ; + tvals[2] = RCVTHREAD_TO_SHIFT; + } + } + + if (no_timeout) { + rcvc->last_timeout = -1; + _HFI_PRDBG("PSM3_RCVTHREAD_FREQ set to only interrupt " + "(no timeouts)\n"); + } else { + /* Convert freq to period in milliseconds (for poll()) */ + rcvc->timeout_period_max = 1000 / tvals[0]; + rcvc->timeout_period_min = 1000 / tvals[1]; + rcvc->timeout_shift = tvals[2]; + /* Start in the middle of min and max */ + rcvc->last_timeout = (rcvc->timeout_period_min + + rcvc->timeout_period_max) / 2; + _HFI_PRDBG("PSM3_RCVTHREAD_FREQ converted to period " + "min=%dms,max=%dms,shift=%d\n", + rcvc->timeout_period_min, rcvc->timeout_period_max, + rcvc->timeout_shift); + } + return PSM2_OK; +} + +static +int rcvthread_next_timeout(struct ptl_rcvthread *rcvc) +{ + uint64_t pollok_diff = rcvc->pollok - rcvc->pollok_last; + + if (pollok_diff > 0) { + if (rcvc->last_timeout > rcvc->timeout_period_min) + /* By default, be less aggressive, but there's a more aggressive + * alternative if need be */ +#if 1 + rcvc->last_timeout >>= rcvc->timeout_shift; +#else + rcvc->last_timeout = rcvc->timeout_period_min; +#endif + } 
else { /* we had less progress */ + if (rcvc->last_timeout < rcvc->timeout_period_max) + rcvc->last_timeout <<= rcvc->timeout_shift; + } + + rcvc->pollok_last = rcvc->pollok; + rcvc->pollcnt_last = rcvc->pollcnt; + return (int)rcvc->last_timeout; +} + +extern int ips_in_rcvthread; + +static void process_async_event(psm2_ep_t ep) +{ + struct ibv_async_event async_event; + const char* errstr = NULL; + + if (ibv_get_async_event(ep->verbs_ep.context, &async_event)) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Receive thread ibv_get_async_event() error: %s", strerror(errno)); + } + /* Ack the event */ + ibv_ack_async_event(&async_event); + + switch (async_event.event_type) { + case IBV_EVENT_CQ_ERR: + if (async_event.element.cq == ep->verbs_ep.send_cq) + errstr = "Send CQ"; + else if (async_event.element.cq == ep->verbs_ep.recv_cq) + errstr = "Recv CQ"; + else + errstr = "CQ"; + break; + case IBV_EVENT_QP_FATAL: + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + if (async_event.element.qp == ep->verbs_ep.qp) + errstr = "UD QP"; + else + errstr = "RC QP"; // qp->context will be an ipsaddr + break; + case IBV_EVENT_DEVICE_FATAL: + errstr = "NIC"; + break; + default: + // be silent about other events + break; + } + if (errstr) + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Fatal %s Async Event: %s", errstr, + ibv_event_type_str(async_event.event_type)); +} + +static void rearm_cq_event(psm2_ep_t ep) +{ + struct ibv_cq *ev_cq; + void *ev_ctx; + + _HFI_VDBG("rcvthread got solicited event\n"); + if (ibv_get_cq_event(ep->verbs_ep.recv_comp_channel, &ev_cq, &ev_ctx)) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Receive thread ibv_get_cq_event() error: %s", + strerror(errno)); + } + + /* Ack the event */ + ibv_ack_cq_events(ev_cq, 1); + psmi_assert_always(ev_cq == ep->verbs_ep.recv_cq); + psmi_assert_always(ev_ctx == ep); + // we only use solicited, so just reenable it + // TBD - during shutdown events get disabled and we could check + // psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED) + // to make sure we still want enabled. But given verbs events + // are one-shots, that seems like overkill + if (ibv_req_notify_cq(ep->verbs_ep.recv_cq, 1)) { + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Receive thread ibv_req_notify_cq() error: %s", + strerror(errno)); + } +} + +// poll for async events for all rails/QPs within a given end user opened EP +static void poll_async_events(psm2_ep_t ep) +{ + struct pollfd pfd[PSMI_MAX_QPS]; + psm2_ep_t pep[PSMI_MAX_QPS]; + int num_ep = 0; + psm2_ep_t first; + int ret; + int i; + + first = ep; + do { + pfd[num_ep].fd = ep->verbs_ep.context->async_fd; + pfd[num_ep].events = POLLIN; + pfd[num_ep].revents = 0; + pep[num_ep++] = ep; + ep = ep->mctxt_next; + } while (ep != first); + + ret = poll(pfd, num_ep, 0); + if_pf(ret < 0) { + if (errno == EINTR) + _HFI_DBG("got signal, keep polling\n"); + else + psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Receive thread poll() error: %s", strerror(errno)); + } else if_pf (ret > 0) { + for (i=0; i < num_ep; i++) { + if (pfd[i].revents & POLLIN) + process_async_event(pep[i]); + } + } +} + +/* + * Receiver thread support. + * + * By default, polling in the driver asks the chip to generate an interrupt on + * every packet. When the driver supports POLLURG we can switch the poll mode + * to one that requests interrupts only for packets that contain an urgent bit + * (and optionally enable interrupts for hdrq overflow events). 
When poll + * returns an event, we *try* to make progress on the receive queue but simply + * go back to sleep if we notice that the main thread is already making + * progress. + */ +static +void *ips_ptl_pollintr(void *rcvthreadc) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)rcvthreadc; + struct ips_recvhdrq *recvq = rcvc->recvq; + int fd_pipe = rcvc->pipefd[0]; + psm2_ep_t ep; + struct pollfd pfd[3]; + int ret; + int next_timeout = rcvc->last_timeout; + uint64_t t_cyc; + psm2_error_t err; + +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED && ctxt != NULL) + PSMI_CUDA_CALL(cuCtxSetCurrent, ctxt); +#endif + + PSM2_LOG_MSG("entering"); + /* No reason to have many of these, keep this as a backup in case the + * recvhdrq init function is misused */ + psmi_assert_always(psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED)); + + /* Switch driver to a mode where it can interrupt on urgent packets */ + if (psmi_context_interrupt_set((psmi_context_t *) + rcvc->context, 1) == PSM2_EP_NO_RESOURCES) { + _HFI_PRDBG + ("poll_type feature not present in driver, turning " + "off internal progress thread\n"); + return NULL; + } + + _HFI_PRDBG("Enabled communication thread on URG packets\n"); + + while (1) { + // pfd[0] is for urgent inbound packets (NAK, urgent ACK, etc) + // pfd[1] is for rcvthread termination + // pfd[2] is for verbs async events (PSM_UD only) + // on timeout (poll() returns 0), we do background process checks + // for non urgent inbound packets + pfd[0].fd = rcvc->context->ep->verbs_ep.recv_comp_channel->fd; + pfd[0].events = POLLIN; + pfd[0].revents = 0; + pfd[1].fd = fd_pipe; + pfd[1].events = POLLIN; + pfd[1].revents = 0; + pfd[2].fd = rcvc->context->ep->verbs_ep.context->async_fd; + pfd[2].events = POLLIN; + pfd[2].revents = 0; + + ret = poll(pfd, 3, next_timeout); + t_cyc = get_cycles(); + if_pf(ret < 0) { + if (errno == EINTR) + _HFI_DBG("got signal, keep polling\n"); + else + psmi_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Receive thread poll() error: %s", + strerror(errno)); + } else if (pfd[1].revents) { + /* Any type of event on this fd means exit, should be POLLHUP */ + _HFI_DBG("close thread: revents=0x%x\n", pfd[1].revents); + close(fd_pipe); + break; + } else { + // we got an async event + if (pfd[2].revents & POLLIN) + process_async_event(rcvc->context->ep); + + // we got here due to a CQ event (as opposed to timeout) + // consume the event and rearm, we'll poll cq below + if (pfd[0].revents & POLLIN) + rearm_cq_event(rcvc->context->ep); + + rcvc->pollcnt++; + if (!PSMI_LOCK_TRY(psmi_creation_lock)) { + if (ret == 0 || pfd[0].revents & (POLLIN | POLLERR)) { + if (PSMI_LOCK_DISABLED) { + // this path is not supported. having rcvthread + // and PSMI_PLOCK_IS_NOLOCK define not allowed. + // TBD - would be good if we could quickly + // check for ep->verbs_ep.recv_wc_count == 0 + // && nothing on CQ without doing a ibv_poll_cq + // ibv_poll_cq(cq, 0, NULL) always returns 0, so that + // doesn't help + // ibv_poll_cq would consume a CQE and require a lock so + // must call our main recv progress function below + // maybe if we open the can on HW verbs driver we could + // quickly check Q without polling. 
Main benefit would + // be avoiding spinlock contention with main PSM + // thread and perhaps using the trylock style inside + // poll_cq much like we do for WFR + if (!ips_recvhdrq_trylock(recvq)) + continue; + err = ips_recvhdrq_progress(recvq); + if (err == PSM2_OK) + rcvc->pollok++; + else + rcvc->pollcyc += get_cycles() - t_cyc; + ips_recvhdrq_unlock(recvq); + } else { + + ep = psmi_opened_endpoint; + + /* Go through all master endpoints. */ + do{ + if (!PSMI_LOCK_TRY(ep->mq->progress_lock)) { + /* If we time out, we service shm and NIC. + * If not, we assume to have received an urgent + * packet and service only NIC. + */ + err = psmi_poll_internal(ep, + ret == 0 ? PSMI_TRUE : PSMI_FALSE); + + if (err == PSM2_OK) + rcvc->pollok++; + else + rcvc->pollcyc += get_cycles() - t_cyc; + PSMI_UNLOCK(ep->mq->progress_lock); + } + poll_async_events(ep); + + /* get next endpoint from multi endpoint list */ + ep = ep->user_ep_next; + } while(NULL != ep); + } + } + PSMI_UNLOCK(psmi_creation_lock); + } + if (ret == 0) { /* change timeout only on timed out poll */ + rcvc->pollcnt_to++; + next_timeout = rcvthread_next_timeout(rcvc); + } + } + } + + PSM2_LOG_MSG("leaving"); + return NULL; +} + +static uint64_t rcvthread_stats_pollok(void *context) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)context; + //double ratio = 0.0; + uint64_t ratio_u = 0; + if (rcvc->pollcnt > 0) + //ratio = (double)rcvc->pollok * 100.0 / rcvc->pollcnt; + ratio_u = (uint64_t)((double)rcvc->pollok * 100.0 / rcvc->pollcnt); + //memcpy(&ratio_u, &ratio, sizeof(uint64_t)); + return ratio_u; +} + +static uint64_t rcvthread_stats_pollcyc(void *context) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)context; + /* log in milliseconds */ + return (uint64_t) ((double)cycles_to_nanosecs(rcvc->pollcyc) / 1.0e6); +} + +static psm2_error_t rcvthread_initstats(ptl_t *ptl_gen) +{ + struct ptl_ips *ptl = (struct ptl_ips *)ptl_gen; + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *)ptl->rcvthread; + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECLU64("intrthread_schedule_count", &rcvc->pollcnt), + PSMI_STATS_DECL("intrthread_schedule_success_(%)", + MPSPAWN_STATS_REDUCTION_ALL, + rcvthread_stats_pollok, NULL), + PSMI_STATS_DECLU64("intrthread_timeout_count", &rcvc->pollcnt_to), + PSMI_STATS_DECL("intrthread_wasted_time_(ms)", + MPSPAWN_STATS_REDUCTION_ALL, + rcvthread_stats_pollcyc, NULL) + }; + + /* If we don't want a thread, make sure we still initialize the counters + * but set them to NaN instead */ + if (!psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED)) { + int i; + static uint64_t ctr_nan = MPSPAWN_NAN; + for (i = 0; i < (int)PSMI_STATS_HOWMANY(entries); i++) { + entries[i].getfn = NULL; + entries[i].u.val = &ctr_nan; + } + } + + return psmi_stats_register_type("RcvThread_statistics", + PSMI_STATSTYPE_RCVTHREAD, + entries, + PSMI_STATS_HOWMANY(entries), ptl->ep->epid, rcvc); +} diff --git a/prov/psm3/psm3/ptl_self/ptl.c b/prov/psm3/psm3/ptl_self/ptl.c new file mode 100644 index 00000000000..040e6d4612c --- /dev/null +++ b/prov/psm3/psm3/ptl_self/ptl.c @@ -0,0 +1,412 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +/* + * This file implements the PSM PTL for self (loopback) + */ + +#include "psm_user.h" +#include "psm_mq_internal.h" +#include "psm_am_internal.h" + +struct ptl_self { + psm2_ep_t ep; + psm2_epid_t epid; + psm2_epaddr_t epaddr; + ptl_ctl_t *ctl; +} __attribute__((aligned(16))); + +static +psm2_error_t +ptl_handle_rtsmatch(psm2_mq_req_t recv_req, int was_posted) +{ + psm2_mq_req_t send_req = (psm2_mq_req_t) recv_req->ptl_req_ptr; + + if (recv_req->req_data.recv_msglen > 0) { + psmi_mq_mtucpy(recv_req->req_data.buf, send_req->req_data.buf, + recv_req->req_data.recv_msglen); + } + + psmi_mq_handle_rts_complete(recv_req); + + /* If the send is already marked complete, that's because it was internally + * buffered. */ + if (send_req->state == MQ_STATE_COMPLETE) { + psmi_mq_stats_rts_account(send_req); + if (send_req->req_data.buf != NULL && send_req->req_data.send_msglen > 0) + psmi_mq_sysbuf_free(send_req->mq, send_req->req_data.buf); + /* req was left "live" even though the sender was told that the + * send was done */ + psmi_mq_req_free(send_req); + } else + psmi_mq_handle_rts_complete(send_req); + + _HFI_VDBG("[self][complete][b=%p][sreq=%p][rreq=%p]\n", + recv_req->req_data.buf, send_req, recv_req); + return PSM2_OK; +} + +static +psm2_error_t self_mq_send_testwait(psm2_mq_req_t *ireq) +{ + uint8_t *ubuf; + psm2_mq_req_t req = *ireq; + + PSMI_LOCK_ASSERT(req->mq->progress_lock); + + /* We're waiting on a send request, and the matching receive has not been + * posted yet. This is a deadlock condition in MPI but we accommodate it + * here in the "self ptl" by using system-allocated memory. 
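+ * The user buffer is copied into a sysbuf below and the send is marked
+ * complete; ptl_handle_rtsmatch() later copies that sysbuf to the receiver
+ * and frees both the sysbuf and the still-live send request.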
+ */ + req->testwait_callback = NULL; /* no more calls here */ + + ubuf = req->req_data.buf; + if (ubuf != NULL && req->req_data.send_msglen > 0) { + req->req_data.buf = psmi_mq_sysbuf_alloc(req->mq, req->req_data.send_msglen); + if (req->req_data.buf == NULL) + return PSM2_NO_MEMORY; + psmi_mq_mtucpy(req->req_data.buf, ubuf, req->req_data.send_msglen); + } + + /* Mark it complete but don't free the req, it's freed when the receiver + * does the match */ + req->state = MQ_STATE_COMPLETE; + *ireq = PSM2_MQ_REQINVALID; + return PSM2_OK; +} + +/* Self is different. We do everything as rendezvous. */ +static +psm2_error_t +self_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags_user, + uint32_t flags_internal, psm2_mq_tag_t *tag, const void *ubuf, + uint32_t len, void *context, psm2_mq_req_t *req_o) +{ + psm2_mq_req_t send_req; + psm2_mq_req_t recv_req; + int rc; + + send_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); + if_pf(send_req == NULL) + return PSM2_NO_MEMORY; + +#ifdef PSM_CUDA + if (PSMI_IS_CUDA_ENABLED && PSMI_IS_CUDA_MEM(ubuf)) { + psmi_cuda_set_attr_sync_memops(ubuf); + send_req->is_buf_gpu_mem = 1; + } else + send_req->is_buf_gpu_mem = 0; +#endif + + rc = psmi_mq_handle_rts(mq, epaddr, tag, + len, NULL, 0, 1, + ptl_handle_rtsmatch, &recv_req); + send_req->req_data.tag = *tag; + send_req->req_data.buf = (void *)ubuf; + send_req->req_data.send_msglen = len; + send_req->req_data.context = context; + recv_req->ptl_req_ptr = (void *)send_req; + recv_req->rts_sbuf = (uintptr_t) ubuf; + recv_req->rts_peer = epaddr; + if (rc == MQ_RET_MATCH_OK) + ptl_handle_rtsmatch(recv_req, 1); + else + send_req->testwait_callback = self_mq_send_testwait; + + _HFI_VDBG("[self][b=%p][m=%d][t=%08x.%08x.%08x][match=%s][req=%p]\n", + ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2], + rc == MQ_RET_MATCH_OK ? "YES" : "NO", send_req); + *req_o = send_req; + return PSM2_OK; +} + +static +psm2_error_t +self_mq_send(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags, + psm2_mq_tag_t *tag, const void *ubuf, uint32_t len) +{ + psm2_error_t err; + psm2_mq_req_t req; + err = self_mq_isend(mq, epaddr, flags, PSMI_REQ_FLAG_NORMAL, tag, ubuf, len, NULL, &req); + psmi_mq_wait_internal(&req); + return err; +} + +/* Fill in AM capabilities parameters */ +static psm2_error_t +self_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters) +{ + if (parameters == NULL) { + return PSM2_PARAM_ERR; + } + + /* Self is just a loop-back and has no restrictions. */ + parameters->max_handlers = INT_MAX; + parameters->max_nargs = INT_MAX; + parameters->max_request_short = INT_MAX; + parameters->max_reply_short = INT_MAX; + + return PSM2_OK; +} + +static +psm2_error_t +self_am_short_request(psm2_epaddr_t epaddr, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + struct psm2_ep_am_handle_entry *hentry; + psm2_ep_t ep = ((struct ptl_self *)(epaddr->ptlctl->ptl))->ep; + struct psmi_am_token tok; + + tok.epaddr_incoming = epaddr; + + hentry = psm_am_get_handler_function(ep, handler); + + /* Note a guard here for hentry != NULL is not needed because at + * initialization, a psmi_assert_always() assure the entry will be + * non-NULL. 
*/ + + if (likely(hentry->version == PSM2_AM_HANDLER_V2)) { + psm2_am_handler_2_fn_t hfn2 = + (psm2_am_handler_2_fn_t)hentry->hfn; + hfn2(&tok, args, nargs, src, len, hentry->hctx); + } else { + psm2_am_handler_fn_t hfn1 = + (psm2_am_handler_fn_t)hentry->hfn; + hfn1(&tok, args, nargs, src, len); + } + + if (completion_fn) { + completion_fn(completion_ctxt); + } + + return PSM2_OK; +} + +static +psm2_error_t +self_am_short_reply(psm2_am_token_t token, + psm2_handler_t handler, psm2_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm2_am_completion_fn_t completion_fn, void *completion_ctxt) +{ + struct psm2_ep_am_handle_entry *hentry; + struct psmi_am_token *tok = token; + struct ptl_self *ptl = (struct ptl_self *)tok->epaddr_incoming->ptlctl->ptl; + psm2_ep_t ep = ptl->ep; + + hentry = psm_am_get_handler_function(ep, handler); + + /* Note a guard here for hentry != NULL is not needed because at + * initialization, a psmi_assert_always() assure the entry will be + * non-NULL. */ + + if (likely(hentry->version == PSM2_AM_HANDLER_V2)) { + psm2_am_handler_2_fn_t hfn2 = + (psm2_am_handler_2_fn_t)hentry->hfn; + hfn2(token, args, nargs, src, len, hentry->hctx); + } else { + psm2_am_handler_fn_t hfn1 = + (psm2_am_handler_fn_t)hentry->hfn; + hfn1(token, args, nargs, src, len); + } + + if (completion_fn) { + completion_fn(completion_ctxt); + } + + return PSM2_OK; +} + +static +psm2_error_t +self_connect(ptl_t *ptl_gen, + int numep, + const psm2_epid_t array_of_epid[], + const int array_of_epid_mask[], + psm2_error_t array_of_errors[], + psm2_epaddr_t array_of_epaddr[], uint64_t timeout_ns) +{ + struct ptl_self *ptl = (struct ptl_self *)ptl_gen; + psmi_assert_always(ptl->epaddr != NULL); + psm2_error_t err = PSM2_OK; + int i; + + PSMI_LOCK_ASSERT(ptl->ep->mq->progress_lock); + + for (i = 0; i < numep; i++) { + if (!array_of_epid_mask[i]) + continue; + + if (array_of_epid[i] == ptl->epid) { + array_of_epaddr[i] = ptl->epaddr; + array_of_epaddr[i]->ptlctl = ptl->ctl; + array_of_epaddr[i]->epid = ptl->epid; + if (psmi_epid_set_hostname(psm2_epid_nid(ptl->epid), + psmi_gethostname(), 0)) { + err = PSM2_NO_MEMORY; + goto fail; + } + psmi_epid_add(ptl->ep, ptl->epid, ptl->epaddr); + array_of_errors[i] = PSM2_OK; + } else { + array_of_epaddr[i] = NULL; + array_of_errors[i] = PSM2_EPID_UNREACHABLE; + } + } + +fail: + return err; +} + +static +psm2_error_t +self_disconnect(ptl_t *ptl_gen, int force, int numep, + psm2_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm2_error_t array_of_errors[], uint64_t timeout_in) +{ + struct ptl_self *ptl = (struct ptl_self *)ptl_gen; + int i; + for (i = 0; i < numep; i++) { + if (array_of_epaddr_mask[i] == 0) + continue; + + if (array_of_epaddr[i] == ptl->epaddr) { + psmi_epid_remove(ptl->ep, ptl->epid); + array_of_errors[i] = PSM2_OK; + } + } + return PSM2_OK; +} + +static +size_t self_ptl_sizeof(void) +{ + return sizeof(struct ptl_self); +} + +ustatic +psm2_error_t self_ptl_init(const psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) +{ + struct ptl_self *ptl = (struct ptl_self *)ptl_gen; + psmi_assert_always(ep != NULL); + psmi_assert_always(ep->epaddr != NULL); + psmi_assert_always(ep->epid != 0); + + ptl->ep = ep; + ptl->epid = ep->epid; + ptl->epaddr = ep->epaddr; + ptl->ctl = ctl; + + memset(ctl, 0, sizeof(*ctl)); + /* Fill in the control structure */ + ctl->ptl = ptl_gen; + ctl->ep = ep; + ctl->ep_poll = NULL; + ctl->ep_connect = self_connect; + ctl->ep_disconnect = self_disconnect; + + ctl->mq_send = self_mq_send; + ctl->mq_isend = 
self_mq_isend; + + ctl->am_get_parameters = self_am_get_parameters; + ctl->am_short_request = self_am_short_request; + ctl->am_short_reply = self_am_short_reply; + +#if 0 // unused code, specific to QLogic MPI + /* No stats in self */ + ctl->epaddr_stats_num = NULL; + ctl->epaddr_stats_init = NULL; + ctl->epaddr_stats_get = NULL; +#endif + + return PSM2_OK; +} + +static psm2_error_t self_ptl_fini(ptl_t *ptl, int force, uint64_t timeout_ns) +{ + return PSM2_OK; /* nothing to do */ +} + +static +psm2_error_t +self_ptl_setopt(const void *component_obj, int optname, + const void *optval, uint64_t optlen) +{ + /* No options for SELF PTL at the moment */ + return psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Unknown SELF ptl option %u.", optname); +} + +static +psm2_error_t +self_ptl_getopt(const void *component_obj, int optname, + void *optval, uint64_t *optlen) +{ + /* No options for SELF PTL at the moment */ + return psmi_handle_error(NULL, PSM2_PARAM_ERR, + "Unknown SELF ptl option %u.", optname); +} + +/* Only symbol we expose out of here */ +struct ptl_ctl_init +psmi_ptl_self = { + self_ptl_sizeof, self_ptl_init, self_ptl_fini, self_ptl_setopt, + self_ptl_getopt +}; diff --git a/prov/psm3/psm3/ptl_self/ptl_fwd.h b/prov/psm3/psm3/ptl_self/ptl_fwd.h new file mode 100644 index 00000000000..7ee6b732a3f --- /dev/null +++ b/prov/psm3/psm3/ptl_self/ptl_fwd.h @@ -0,0 +1,62 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2015 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2015 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ + +#ifndef _PTL_FWD_SELF_H +#define _PTL_FWD_SELF_H + +/* Symbol in am ptl */ +extern struct ptl_ctl_init psmi_ptl_self; + +#endif diff --git a/prov/psm3/shared b/prov/psm3/shared new file mode 120000 index 00000000000..929cb3dc9ba --- /dev/null +++ b/prov/psm3/shared @@ -0,0 +1 @@ +../../src \ No newline at end of file diff --git a/prov/psm3/src/.gitignore b/prov/psm3/src/.gitignore new file mode 100644 index 00000000000..89260b1273c --- /dev/null +++ b/prov/psm3/src/.gitignore @@ -0,0 +1 @@ +psm3_revision.c diff --git a/prov/psm3/src/psm3_revision.c.in b/prov/psm3/src/psm3_revision.c.in new file mode 100644 index 00000000000..74082cd81fe --- /dev/null +++ b/prov/psm3/src/psm3_revision.c.in @@ -0,0 +1,21 @@ +#ifndef PSMX3_IFS_VERSION +#define PSMX3_IFS_VERSION "@IFS_VERSION@" +#endif + +#ifndef PSMX3_BUILD_TIMESTAMP +#define PSMX3_BUILD_TIMESTAMP "@BUILD_TIMESTAMP@" +#endif + +#ifndef PSMX3_SRC_CHECKSUM +#define PSMX3_SRC_CHECKSUM "@SRC_CHECKSUM@" +#endif + +#ifndef PSMX3_GIT_CHECKSUM +#define PSMX3_GIT_CHECKSUM "@GIT_HASH@" +#endif + +char psmi_hfi_IFS_version[] = PSMX3_IFS_VERSION; +char psmi_hfi_build_timestamp[] = PSMX3_BUILD_TIMESTAMP; +char psmi_hfi_sources_checksum[] = PSMX3_SRC_CHECKSUM; +char psmi_hfi_git_checksum[] = PSMX3_GIT_CHECKSUM; + diff --git a/prov/psm3/src/psmx3.h b/prov/psm3/src/psmx3.h new file mode 100644 index 00000000000..95fb4754b4a --- /dev/null +++ b/prov/psm3/src/psmx3.h @@ -0,0 +1,1234 @@ +/* + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _FI_PSM2_H +#define _FI_PSM2_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ofi.h" +#include "ofi_atomic.h" +#include "ofi_enosys.h" +#include "ofi_list.h" +#include "ofi_util.h" +#include "ofi_mem.h" +#include "rbtree.h" +#include "version.h" +#include "psm_config.h" + +#ifdef FABRIC_DIRECT_ENABLED +#define DIRECT_FN __attribute__((visibility ("default"))) +#define STATIC +#else +#define DIRECT_FN +#define STATIC static +#endif + +extern struct fi_provider psmx3_prov; + + +#define PSMX3_OP_FLAGS (FI_INJECT | FI_MULTI_RECV | FI_COMPLETION | \ + FI_TRIGGER | FI_INJECT_COMPLETE | \ + FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE) + +#define PSMX3_TX_CAPS (OFI_TX_MSG_CAPS | FI_TAGGED | OFI_TX_RMA_CAPS | FI_ATOMICS | \ + FI_NAMED_RX_CTX | FI_TRIGGER) +#define PSMX3_RX_CAPS (FI_SOURCE | FI_SOURCE_ERR | FI_RMA_EVENT | OFI_RX_MSG_CAPS | \ + FI_TAGGED | OFI_RX_RMA_CAPS | FI_ATOMICS | FI_DIRECTED_RECV | \ + FI_MULTI_RECV | FI_TRIGGER) +#define PSMX3_DOM_CAPS (FI_SHARED_AV | FI_LOCAL_COMM | FI_REMOTE_COMM) +#define PSMX3_CAPS (PSMX3_TX_CAPS | PSMX3_RX_CAPS | PSMX3_DOM_CAPS) + +#define PSMX3_RMA_TX_CAPS (PSMX3_TX_CAPS & ~(FI_TAGGED | FI_MSG | FI_SEND)) +#define PSMX3_RMA_RX_CAPS (PSMX3_RX_CAPS & ~(FI_TAGGED | FI_MSG | FI_RECV | \ + FI_DIRECTED_RECV | FI_MULTI_RECV)) +#define PSMX3_RMA_CAPS (PSMX3_RMA_TX_CAPS | PSMX3_RMA_RX_CAPS | PSMX3_DOM_CAPS) + +#define PSMX3_SUB_CAPS (FI_SEND | FI_RECV | FI_READ | FI_WRITE | \ + FI_REMOTE_READ | FI_REMOTE_WRITE) + +#define PSMX3_ALL_TRX_CTXT ((void *)-1) +#define PSMX3_MAX_MSG_SIZE ((0x1ULL << 32) - 1) +#define PSMX3_RMA_ORDER_SIZE (4096) +#define PSMX3_MSG_ORDER (FI_ORDER_SAS | OFI_ORDER_RAR_SET | OFI_ORDER_RAW_SET | \ + OFI_ORDER_WAR_SET | OFI_ORDER_WAW_SET) +#define PSMX3_COMP_ORDER FI_ORDER_NONE + +/* + * Four bits are reserved from the 64-bit tag space as a flags to identify the + * type and properties of the messages. + * + * To conserve tag bits, we use a couple otherwise invalid bit combinations + * to distinguish RMA long reads from long writes and distinguish iovec + * payloads from regular messages. + * + * We never match on the immediate bit. Regular tagged and untagged messages + * do not match on the iov bit, but the iov and imm bits are checked when we + * process completions. + * + * MSG RMA IOV IMM + * tagged message 0 0 x x + * untagged message 1 0 x x + * rma long read 0 1 0 x + * rma long write 0 1 1 x + * iov payload 1 1 x x + */ + +#define PSMX3_MSG_BIT (0x80000000) +#define PSMX3_RMA_BIT (0x40000000) +#define PSMX3_IOV_BIT (0x20000000) +#define PSMX3_IMM_BIT (0x10000000) + +/* Top two bits of the flag are the message type */ +#define PSMX3_TYPE_TAGGED (0) +#define PSMX3_TYPE_MSG PSMX3_MSG_BIT +#define PSMX3_TYPE_RMA PSMX3_RMA_BIT +#define PSMX3_TYPE_IOV_PAYLOAD (PSMX3_MSG_BIT | PSMX3_RMA_BIT) +#define PSMX3_TYPE_MASK (PSMX3_MSG_BIT | PSMX3_RMA_BIT) + +/* + * For RMA protocol, use the IOV bit to distinguish between long RMA write + * and long RMA read. This prevents tag collisions between reads/writes issued + * locally and the writes/reads issued by peers. RMA doesn't use this bit for + * IOV support so it's safe to do so. 
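 *
 * Worked example: a long RMA read is sent with flag bits PSMX3_RMA_TYPE_READ
 * (0x40000000, RMA bit only) and a long RMA write with PSMX3_RMA_TYPE_WRITE
 * (0x60000000, RMA and IOV bits); matching both against PSMX3_RMA_TYPE_MASK
 * (0xE0000000) keeps the two directions from ever matching each other.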
+ */ +#define PSMX3_RMA_TYPE_READ PSMX3_TYPE_RMA +#define PSMX3_RMA_TYPE_WRITE (PSMX3_TYPE_RMA | PSMX3_IOV_BIT) +#define PSMX3_RMA_TYPE_MASK (PSMX3_TYPE_MASK | PSMX3_IOV_BIT) + +/* IOV header is only possible when the RMA bit is 0 */ +#define PSMX3_IOV_HEADER_MASK (PSMX3_IOV_BIT | PSMX3_RMA_BIT) + +#define PSMX3_IS_IOV_HEADER(flags) (((flags) & PSMX3_IOV_HEADER_MASK) == PSMX3_IOV_BIT) +#define PSMX3_IS_IOV_PAYLOAD(flags) (((flags) & PSMX3_TYPE_MASK) == PSMX3_TYPE_IOV_PAYLOAD) +#define PSMX3_IS_RMA(flags) (((flags) & PSMX3_TYPE_MASK) == PSMX3_TYPE_RMA) +#define PSMX3_IS_MSG(flags) (((flags) & PSMX3_TYPE_MASK) == PSMX3_TYPE_MSG) +#define PSMX3_IS_TAGGED(flags) (((flags) & PSMX3_TYPE_MASK) == PSMX3_TYPE_TAGGED) +#define PSMX3_HAS_IMM(flags) ((flags) & PSMX3_IMM_BIT) + +/* Set a bit conditionally without branching. Flag must be 1 or 0. */ +#define PSMX3_MSG_BIT_SET(flag) (-(uint32_t)flag & PSMX3_MSG_BIT) +#define PSMX3_RMA_BIT_SET(flag) (-(uint32_t)flag & PSMX3_RMA_BIT) +#define PSMX3_IOV_BIT_SET(flag) (-(uint32_t)flag & PSMX3_IOV_BIT) +#define PSMX3_IMM_BIT_SET(flag) (-(uint32_t)flag & PSMX3_IMM_BIT) + +/* + * Different ways to use the 96 bit tag: + * TAG60: 32/4/60 for data/flags/tag + * TAG64: 4/28/64 for flags/data/tag + * RUNTIME: make the choice at runtime + */ +#define PSMX3_TAG_LAYOUT_RUNTIME 0 +#define PSMX3_TAG_LAYOUT_TAG60 1 +#define PSMX3_TAG_LAYOUT_TAG64 2 + +#ifndef PSMX3_TAG_LAYOUT +#define PSMX3_TAG_LAYOUT PSMX3_TAG_LAYOUT_RUNTIME +#elif (PSMX3_TAG_LAYOUT < 0 || PSMX3_TAG_LAYOUT > 2) +#warning "Invalid PSMX3_TAG_LAYOUT definition, using default." +#undef PSMX3_TAG_LAYOUT +#define PSMX3_TAG_LAYOUT PSMX3_TAG_LAYOUT_RUNTIME +#endif + +#define PSMX3_TAG_MASK_60 (0x0FFFFFFFFFFFFFFFULL) +#define PSMX3_TAG_UPPER_MASK_60 ((uint32_t)0x0FFFFFFF) +#define PSMX3_DATA_MASK_60 ((uint32_t)0xFFFFFFFF) +#define PSMX3_FLAGS_IDX_60 (1) + +#define PSMX3_TAG_MASK_64 (0xFFFFFFFFFFFFFFFFULL) +#define PSMX3_TAG_UPPER_MASK_64 ((uint32_t)0xFFFFFFFF) +#define PSMX3_DATA_MASK_64 ((uint32_t)0x0FFFFFFF) +#define PSMX3_FLAGS_IDX_64 (2) + +#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_TAG60) +#define PSMX3_TAG_MASK PSMX3_TAG_MASK_60 +#define PSMX3_TAG_UPPER_MASK PSMX3_TAG_UPPER_MASK_60 +#define PSMX3_DATA_MASK PSMX3_DATA_MASK_60 +#define PSMX3_FLAGS_IDX PSMX3_FLAGS_IDX_60 +#endif + +#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_TAG64) +#define PSMX3_TAG_MASK PSMX3_TAG_MASK_64 +#define PSMX3_TAG_UPPER_MASK PSMX3_TAG_UPPER_MASK_64 +#define PSMX3_DATA_MASK PSMX3_DATA_MASK_64 +#define PSMX3_FLAGS_IDX PSMX3_FLAGS_IDX_64 +#endif + +#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME) +#define PSMX3_TAG_MASK psmx3_tag_mask +#define PSMX3_TAG_UPPER_MASK psmx3_tag_upper_mask +#define PSMX3_DATA_MASK psmx3_data_mask +#define PSMX3_FLAGS_IDX psmx3_flags_idx +extern uint64_t psmx3_tag_mask; +extern uint32_t psmx3_tag_upper_mask; +extern uint32_t psmx3_data_mask; +extern int psmx3_flags_idx; +extern int psmx3_tag_layout_locked; +#endif + +#define PSMX3_FLAGS_MASK ((uint32_t)0xF0000000) + +#define PSMX3_MAX_TAG PSMX3_TAG_MASK +#define PSMX3_MATCH_ALL (-1ULL) +#define PSMX3_MATCH_NONE (0ULL) + +#define PSMX3_PRINT_TAG(tag96) \ + printf("%s: %08x %08x %08x\n", __func__, tag96.tag0, tag96.tag1, tag96.tag2) + +/* + * psm2_mq_tag_t is a union type of 96 bits. These functions are used to + * access the first 64 bits without generating the warning "dereferencing + * type-punned pointer will break strict-aliasing rules". This is faster + * than combining two 32-bit values with bit operations. 
+ * + * Notice: + * (1) *(uint64_t *)tag96 works, but *(uint64_t *)tag96->tag doesn't; + * (2) putting these statements directly inside the macros won't work. + */ +__attribute__((always_inline)) +static inline void psmx3_set_tag64(psm2_mq_tag_t *tag96, uint64_t tag64) +{ + tag96->tag64 = tag64; +} + +__attribute__((always_inline)) +static inline uint64_t psmx3_get_tag64(psm2_mq_tag_t *tag96) +{ + return tag96->tag64; +} + +#define PSMX3_SET_TAG_INTERNAL(tag96,_tag_,cq_data,flags) \ + do { \ + psmx3_set_tag64(&(tag96),(_tag_) & PSMX3_TAG_MASK); \ + (tag96).tag2 = ((cq_data) & PSMX3_DATA_MASK); \ + (tag96).tag[PSMX3_FLAGS_IDX] |= (flags); \ + } while (0) + +#define PSMX3_SET_TAG(tag96,tag,cq_data,flags) \ + PSMX3_SET_TAG_INTERNAL(tag96,tag,cq_data,flags) + +#define PSMX3_SET_MASK(tagsel96,tag_mask,flag_mask) \ + PSMX3_SET_TAG_INTERNAL(tagsel96,tag_mask,0,flag_mask) + +#define PSMX3_GET_TAG64(tag96) (psmx3_get_tag64(&(tag96)) & PSMX3_TAG_MASK) +#define PSMX3_GET_FLAGS(tag96) ((tag96).tag[PSMX3_FLAGS_IDX] & PSMX3_FLAGS_MASK) +#define PSMX3_GET_CQDATA(tag96) ((tag96).tag2 & PSMX3_DATA_MASK) + +#define PSMX3_MAX_RX_CTX_BITS (12) +#define PSMX3_ADDR_IDX_MASK (0x000FFFFFFFFFFFFFUL) +#define PSMX3_ADDR_CTXT_MASK (0xFFF0000000000000UL) +#define PSMX3_ADDR_IDX(addr) ((addr) & PSMX3_ADDR_IDX_MASK) +#define PSMX3_ADDR_CTXT(addr, ctxt_bits) \ + (((addr) & PSMX3_ADDR_CTXT_MASK) >> (64-(ctxt_bits))) + +/* Bits 60 .. 63 of the flag are provider specific */ +#define PSMX3_NO_COMPLETION (1ULL << 60) + +enum psmx3_context_type { + PSMX3_NOCOMP_SEND_CONTEXT = 1, + PSMX3_NOCOMP_RECV_CONTEXT, + PSMX3_NOCOMP_TSEND_CONTEXT, + PSMX3_NOCOMP_TRECV_CONTEXT, + PSMX3_NOCOMP_WRITE_CONTEXT, + PSMX3_NOCOMP_READ_CONTEXT, + PSMX3_SEND_CONTEXT, + PSMX3_RECV_CONTEXT, + PSMX3_MULTI_RECV_CONTEXT, + PSMX3_TSEND_CONTEXT, + PSMX3_TRECV_CONTEXT, + PSMX3_WRITE_CONTEXT, + PSMX3_READ_CONTEXT, + PSMX3_REMOTE_WRITE_CONTEXT, + PSMX3_REMOTE_READ_CONTEXT, + PSMX3_SENDV_CONTEXT, + PSMX3_IOV_SEND_CONTEXT, + PSMX3_IOV_RECV_CONTEXT, + PSMX3_MAX_CONTEXT_TYPE +}; + +union psmx3_pi { + void *p; + uint32_t i[2]; +}; + +#define PSMX3_CTXT_REQ(fi_context) ((fi_context)->internal[0]) +#define PSMX3_CTXT_TYPE(fi_context) (((union psmx3_pi *)&(fi_context)->internal[1])->i[0]) +#define PSMX3_CTXT_SIZE(fi_context) (((union psmx3_pi *)&(fi_context)->internal[1])->i[1]) +#define PSMX3_CTXT_USER(fi_context) ((fi_context)->internal[2]) +#define PSMX3_CTXT_EP(fi_context) ((fi_context)->internal[3]) + +/* + * Use per-protocol versioning to avoid unnecessary version checking. Only perform + * version checking when the current version is greater than zero. 
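 *
 * For example, PSMX3_AM_SEP_VERSION below is 1, so SEP query messages carry
 * a version field (set with PSMX3_AM_SET_VER) that the receiving side checks,
 * while the RMA, atomic, and TRX context protocols remain at version 0 and
 * skip the check.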
+ */ +#define PSMX3_AM_RMA_VERSION 0 +#define PSMX3_AM_ATOMIC_VERSION 0 +#define PSMX3_AM_SEP_VERSION 1 +#define PSMX3_AM_TRX_CTXT_VERSION 0 + +#define PSMX3_AM_RMA_HANDLER 0 +#define PSMX3_AM_ATOMIC_HANDLER 1 +#define PSMX3_AM_SEP_HANDLER 2 +#define PSMX3_AM_TRX_CTXT_HANDLER 3 + +#define PSMX3_AM_OP_MASK 0x000000FF +#define PSMX3_AM_FLAG_MASK 0xFF000000 +#define PSMX3_AM_VER_MASK 0x00FF0000 +#define PSMX3_AM_VER_SHIFT 16 +#define PSMX3_AM_EOM 0x40000000 +#define PSMX3_AM_DATA 0x20000000 +#define PSMX3_AM_FORCE_ACK 0x10000000 + +#define PSMX3_AM_SET_OP(u32w0,op) do {(u32w0) &= ~PSMX3_AM_OP_MASK; (u32w0) |= (op);} while (0) +#define PSMX3_AM_SET_FLAG(u32w0,flag) do {(u32w0) &= ~PSMX3_AM_FLAG_MASK; (u32w0) |= (flag);} while (0) +#define PSMX3_AM_SET_VER(u32w0,ver) do {(u32w0) &= ~PSMX3_AM_VER_MASK; (u32w0) |= (ver << PSMX3_AM_VER_SHIFT);} while (0) +#define PSMX3_AM_GET_OP(u32w0) ((u32w0) & PSMX3_AM_OP_MASK) +#define PSMX3_AM_GET_FLAG(u32w0) ((u32w0) & PSMX3_AM_FLAG_MASK) +#define PSMX3_AM_GET_VER(u32w0) (((u32w0) & PSMX3_AM_VER_MASK) >> PSMX3_AM_VER_SHIFT) + +enum { + PSMX3_AM_REQ_WRITE = 1, + PSMX3_AM_REQ_WRITE_LONG, + PSMX3_AM_REP_WRITE, + PSMX3_AM_REQ_READ, + PSMX3_AM_REQ_READ_LONG, + PSMX3_AM_REP_READ, + PSMX3_AM_REQ_ATOMIC_WRITE, + PSMX3_AM_REP_ATOMIC_WRITE, + PSMX3_AM_REQ_ATOMIC_READWRITE, + PSMX3_AM_REP_ATOMIC_READWRITE, + PSMX3_AM_REQ_ATOMIC_COMPWRITE, + PSMX3_AM_REP_ATOMIC_COMPWRITE, + PSMX3_AM_REQ_WRITEV, + PSMX3_AM_REQ_READV, + PSMX3_AM_REQ_SEP_QUERY, + PSMX3_AM_REP_SEP_QUERY, + PSMX3_AM_REQ_TRX_CTXT_DISCONNECT, +}; + +struct psmx3_am_request { + int op; + union { + struct { + uint8_t *buf; + size_t len; + uint64_t addr; + uint64_t key; + void *context; + void *peer_addr; + uint64_t data; + } write; + struct { + union { + uint8_t *buf; /* for read */ + size_t iov_count; /* for readv */ + }; + size_t len; + uint64_t addr; + uint64_t key; + void *context; + void *peer_addr; + size_t len_read; + } read; + struct { + union { + uint8_t *buf; /* for result_count == 1 */ + size_t iov_count; /* for result_count > 1 */ + }; + size_t len; + uint64_t addr; + uint64_t key; + void *context; + uint8_t *result; + int datatype; + } atomic; + }; + uint64_t cq_flags; + struct fi_context fi_context; + struct psmx3_fid_ep *ep; + int no_event; + int error; + struct slist_entry list_entry; + union { + struct iovec *iov; /* for readv */ + struct fi_ioc *ioc; /* for atomic read */ + }; + void *tmpbuf; +}; + +#define PSMX3_IOV_PROTO_PACK 0 +#define PSMX3_IOV_PROTO_MULTI 1 +#define PSMX3_IOV_MAX_SEQ_NUM 0x7fffffff +#define PSMX3_IOV_BUF_SIZE 64 +#define PSMX3_IOV_MAX_COUNT (PSMX3_IOV_BUF_SIZE / sizeof(uint32_t) - 3) + +struct psmx3_iov_info { + uint32_t seq_num; + uint32_t total_len; + uint32_t count; + uint32_t len[PSMX3_IOV_MAX_COUNT]; +}; + +struct psmx3_sendv_request { + struct fi_context fi_context; + struct fi_context fi_context_iov; + void *user_context; + int iov_protocol; + int no_completion; + int comp_flag; + uint32_t iov_done; + psm2_mq_tag_t tag; + union { + struct psmx3_iov_info iov_info; + char buf[PSMX3_IOV_BUF_SIZE]; + }; +}; + +struct psmx3_sendv_reply { + struct fi_context fi_context; + int no_completion; + int multi_recv; + psm2_mq_tag_t tag; + uint8_t *buf; + void *user_context; + size_t iov_done; + size_t bytes_received; + size_t msg_length; + int error_code; + int comp_flag; + struct psmx3_iov_info iov_info; +}; + +struct psmx3_req_queue { + fastlock_t lock; + struct slist list; +}; + +struct psmx3_multi_recv { + psm2_epaddr_t src_addr; + psm2_mq_tag_t tag; + psm2_mq_tag_t tagsel; + 
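	/* FI_MULTI_RECV bookkeeping: offset tracks how much of the posted
	 * buffer has been consumed, and the buffer is retired once fewer than
	 * min_buf_size bytes remain. */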
uint8_t *buf; + size_t len; + size_t offset; + int min_buf_size; + int flag; + void *context; +}; + +struct psmx3_fid_fabric { + struct util_fabric util_fabric; + psm2_uuid_t uuid; + struct util_ns name_server; + + /* list of all opened domains */ + fastlock_t domain_lock; + struct dlist_entry domain_list; +}; + +#define PSMX3_TX (1) +#define PSMX3_RX (2) +#define PSMX3_TX_RX (PSMX3_TX | PSMX3_RX) + +struct psmx3_trx_ctxt { + psm2_ep_t psm2_ep; + psm2_epid_t psm2_epid; + psm2_mq_t psm2_mq; + int am_initialized; + int am_progress; + int am_poll_count; + int id; + int usage_flags; + struct psm2_am_parameters psm2_am_param; + + struct psmx3_fid_domain *domain; + struct psmx3_fid_ep *ep; + + /* triggered operations that are ready to be processed */ + struct psmx3_req_queue trigger_queue; + + /* request pool for RMA/atomic ops */ + struct ofi_bufpool *am_req_pool; + fastlock_t am_req_pool_lock; + + /* lock to prevent the sequence of psm2_mq_ipeek and psm2_mq_test be + * interleaved in a multithreaded environment. + */ + fastlock_t poll_lock; + + /* list of peers connected to this tx/rx context */ + struct dlist_entry peer_list; + fastlock_t peer_lock; + + /* number of pathes this tx/rx context can be polled. this include + * CQs and counters, as well as domain->trx_ctxt_list. + */ + ofi_atomic32_t poll_refcnt; + int poll_active; + + psm2_uuid_t uuid; + + struct dlist_entry entry; +}; + +typedef void (*psmx3_lock_fn_t) (fastlock_t *lock, int lock_level); +typedef int (*psmx3_trylock_fn_t) (fastlock_t *lock, int lock_level); +typedef void (*psmx3_unlock_fn_t) (fastlock_t *lock, int lock_level); + +struct psmx3_fid_domain { + struct util_domain util_domain; + struct psmx3_fid_fabric *fabric; + uint64_t mode; + uint64_t caps; + + enum fi_mr_mode mr_mode; + fastlock_t mr_lock; + uint64_t mr_reserved_key; + RbtHandle mr_map; + + /* list of hw contexts opened for this domain */ + fastlock_t trx_ctxt_lock; + struct dlist_entry trx_ctxt_list; + + ofi_atomic32_t sep_cnt; + fastlock_t sep_lock; + struct dlist_entry sep_list; + + int progress_thread_enabled; + pthread_t progress_thread; + + int addr_format; + uint32_t max_atomic_size; + + struct dlist_entry entry; + + /* Lock/Unlock function pointers set based on FI_THREAD model */ + psmx3_lock_fn_t av_lock_fn; + psmx3_unlock_fn_t av_unlock_fn; + psmx3_lock_fn_t am_req_pool_lock_fn; + psmx3_unlock_fn_t am_req_pool_unlock_fn; + psmx3_lock_fn_t trx_ctxt_lock_fn; + psmx3_unlock_fn_t trx_ctxt_unlock_fn; + psmx3_lock_fn_t rma_queue_lock_fn; + psmx3_unlock_fn_t rma_queue_unlock_fn; + psmx3_lock_fn_t trigger_queue_lock_fn; + psmx3_unlock_fn_t trigger_queue_unlock_fn; + psmx3_lock_fn_t peer_lock_fn; + psmx3_unlock_fn_t peer_unlock_fn; + psmx3_lock_fn_t sep_lock_fn; + psmx3_unlock_fn_t sep_unlock_fn; + psmx3_lock_fn_t trigger_lock_fn; + psmx3_unlock_fn_t trigger_unlock_fn; + psmx3_lock_fn_t cq_lock_fn; + psmx3_unlock_fn_t cq_unlock_fn; + psmx3_lock_fn_t mr_lock_fn; + psmx3_unlock_fn_t mr_unlock_fn; + psmx3_lock_fn_t context_lock_fn; + psmx3_unlock_fn_t context_unlock_fn; + psmx3_trylock_fn_t poll_trylock_fn; + psmx3_unlock_fn_t poll_unlock_fn; +}; + +#define PSMX3_EP_REGULAR 0 +#define PSMX3_EP_SCALABLE 1 +#define PSMX3_EP_SRC_ADDR 2 + +#define PSMX3_RESERVED_EPID (0xFFFFULL) +#define PSMX3_DEFAULT_UNIT (-1) +#define PSMX3_DEFAULT_PORT 0 +#define PSMX3_ANY_SERVICE 0 + +struct psmx3_ep_name { + psm2_epid_t epid; + uint8_t type; + union { + uint8_t sep_id; /* for scalable ep */ + int8_t unit; /* for src addr. start from 0. 
-1 means any */ + }; + uint8_t port; /* for src addr. start from 1, 0 means any */ + uint8_t padding; + uint32_t service; /* for src addr. 0 means any */ +}; + +#define PSMX3_MAX_STRING_NAME_LEN 64 /* "fi_addr_psmx3://:" */ + +struct psmx3_status_data { + struct psmx3_fid_cq *poll_cq; + struct psmx3_trx_ctxt *trx_ctxt; + fi_addr_t *src_addr; + void *event_buffer; +}; + +struct psmx3_cq_event { + union { + struct fi_cq_entry context; + struct fi_cq_msg_entry msg; + struct fi_cq_data_entry data; + struct fi_cq_tagged_entry tagged; + struct fi_cq_err_entry err; + } cqe; + int error; + int8_t source_is_valid; + uint8_t source_sep_id; + psm2_epaddr_t source; + struct psmx3_fid_av *source_av; + struct slist_entry list_entry; +}; + +#define PSMX3_ERR_DATA_SIZE 64 /* large enough to hold a string address */ + +struct psmx3_poll_ctxt { + struct psmx3_trx_ctxt *trx_ctxt; + struct slist_entry list_entry; +}; + +struct psmx3_fid_cq { + struct fid_cq cq; + struct psmx3_fid_domain *domain; + struct slist poll_list; + int format; + int entry_size; + size_t event_count; + struct slist event_queue; + struct slist free_list; + fastlock_t lock; + struct psmx3_cq_event *pending_error; + struct util_wait *wait; + int wait_cond; + int wait_is_local; + ofi_atomic32_t signaled; + uint8_t error_data[PSMX3_ERR_DATA_SIZE]; +}; + +struct psmx3_trigger; + +struct psmx3_fid_cntr { + union { + struct fid_cntr cntr; + struct util_cntr util_cntr; /* for util_poll_run */ + }; + struct psmx3_fid_domain *domain; + struct slist poll_list; + int poll_all; + int events; + uint64_t flags; + ofi_atomic64_t counter; + ofi_atomic64_t error_counter; + int error_avail; + int wait_is_local; + struct util_wait *wait; + struct psmx3_trigger *trigger; + fastlock_t trigger_lock; +}; + +#define PSMX3_AV_DEFAULT_SIZE 64 + +#define PSMX3_AV_TABLE_SIZE(count, shared) \ + (sizeof(struct psmx3_av_hdr) + \ + ((shared) ? 
(count) * sizeof(fi_addr_t) : 0) + \ + (count) * sizeof(struct psmx3_av_addr)) + +struct psmx3_av_hdr { + uint64_t size; + uint64_t last; +}; + +struct psmx3_av_addr { + psm2_epid_t epid; + uint8_t type; + uint8_t sep_id; + uint8_t valid; +}; + +struct psmx3_av_sep { + int ctxt_cnt; + psm2_epid_t *epids; +}; + +struct psmx3_av_conn { + struct psmx3_trx_ctxt *trx_ctxt; + psm2_epaddr_t *epaddrs; + psm2_epaddr_t **sepaddrs; +}; + +struct psmx3_fid_av { + struct fid_av av; + int type; + struct psmx3_fid_domain *domain; + struct fid_eq *eq; + int addr_format; + int rx_ctx_bits; + int max_trx_ctxt; + int shared; + uint64_t flags; + size_t addrlen; + size_t count; + fastlock_t lock; + struct psmx3_trx_ctxt *av_map_trx_ctxt; + struct util_shm shm; + struct psmx3_av_hdr *hdr; /* shared AV header */ + fi_addr_t *map; /* shared AV address mapping */ + struct psmx3_av_addr *table; /* shared AV address table */ + struct psmx3_av_sep *sep_info; + struct psmx3_av_conn conn_info[]; +}; + +struct psmx3_fid_ep { + struct fid_ep ep; + int type; + struct psmx3_fid_domain *domain; + /* above fields are common with sep */ + + struct psmx3_trx_ctxt *tx; + struct psmx3_trx_ctxt *rx; + struct psmx3_fid_ep *base_ep; + struct psmx3_fid_stx *stx; + struct psmx3_fid_av *av; + struct psmx3_fid_cq *send_cq; + struct psmx3_fid_cq *recv_cq; + struct psmx3_fid_cntr *send_cntr; + struct psmx3_fid_cntr *recv_cntr; + struct psmx3_fid_cntr *write_cntr; + struct psmx3_fid_cntr *read_cntr; + struct psmx3_fid_cntr *remote_write_cntr; + struct psmx3_fid_cntr *remote_read_cntr; + unsigned send_selective_completion:1; + unsigned recv_selective_completion:1; + unsigned enabled:1; + uint64_t tx_flags; + uint64_t rx_flags; + uint64_t caps; + ofi_atomic32_t ref; + struct fi_context nocomp_send_context; + struct fi_context nocomp_tsend_context; + + PSMX3_EP_DECL_OP_CONTEXT + + size_t min_multi_recv; + uint32_t iov_seq_num; + int service; + int sep_id; +}; + +struct psmx3_sep_ctxt { + struct psmx3_trx_ctxt *trx_ctxt; + struct psmx3_fid_ep *ep; +}; + +struct psmx3_fid_sep { + struct fid_ep ep; + int type; + struct psmx3_fid_domain *domain; + /* above fields are common with regular ep */ + + struct dlist_entry entry; + + ofi_atomic32_t ref; + int service; + uint8_t id; + uint8_t enabled; + size_t ctxt_cnt; + struct psmx3_sep_ctxt ctxts[]; /* must be last element */ +}; + +struct psmx3_fid_stx { + struct fid_stx stx; + struct psmx3_fid_domain *domain; + struct psmx3_trx_ctxt *tx; + ofi_atomic32_t ref; +}; + +struct psmx3_fid_mr { + struct fid_mr mr; + struct psmx3_fid_domain *domain; + struct psmx3_fid_cntr *cntr; + uint64_t access; + uint64_t flags; + uint64_t offset; + size_t iov_count; + struct iovec iov[]; /* must be the last field */ +}; + +struct psmx3_epaddr_context { + struct psmx3_trx_ctxt *trx_ctxt; + psm2_epid_t epid; + psm2_epaddr_t epaddr; + struct dlist_entry entry; +}; + +struct psmx3_env { + int name_server; + int tagged_rma; + char *uuid; + int uuid_override; + int delay; + int timeout; + int conn_timeout; + int prog_interval; + char *prog_affinity; + int multi_ep; + int inject_size; + int lock_level; + int lazy_conn; + int disconnect; +#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME) + char *tag_layout; +#endif +}; + +#define PSMX3_MAX_UNITS PSMI_MAX_RAILS /* from psm_config.h */ +struct psmx3_hfi_info { + int max_trx_ctxt; + int free_trx_ctxt; + int num_units; + int num_active_units; + int active_units[PSMX3_MAX_UNITS]; + int unit_is_active[PSMX3_MAX_UNITS]; + int unit_nctxts[PSMX3_MAX_UNITS]; + int 
unit_nfreectxts[PSMX3_MAX_UNITS]; + char default_domain_name[PSMX3_MAX_UNITS * NAME_MAX]; /* hfi1_0;hfi1_1;...;hfi1_n */ +}; + +extern struct fi_ops_mr psmx3_mr_ops; +extern struct fi_ops_cm psmx3_cm_ops; +extern struct fi_ops_tagged psmx3_tagged_ops; +extern struct fi_ops_tagged psmx3_tagged_ops_no_flag_directed; +extern struct fi_ops_tagged psmx3_tagged_ops_no_event_directed; +extern struct fi_ops_tagged psmx3_tagged_ops_no_send_event_directed; +extern struct fi_ops_tagged psmx3_tagged_ops_no_recv_event_directed; +extern struct fi_ops_tagged psmx3_tagged_ops_no_flag_undirected; +extern struct fi_ops_tagged psmx3_tagged_ops_no_event_undirected; +extern struct fi_ops_tagged psmx3_tagged_ops_no_send_event_undirected; +extern struct fi_ops_tagged psmx3_tagged_ops_no_recv_event_undirected; +extern struct fi_ops_tagged psmx3_tagged_ops_no_flag_directed_av_map; +extern struct fi_ops_tagged psmx3_tagged_ops_no_event_directed_av_map; +extern struct fi_ops_tagged psmx3_tagged_ops_no_send_event_directed_av_map; +extern struct fi_ops_tagged psmx3_tagged_ops_no_recv_event_directed_av_map; +extern struct fi_ops_tagged psmx3_tagged_ops_no_flag_undirected_av_map; +extern struct fi_ops_tagged psmx3_tagged_ops_no_event_undirected_av_map; +extern struct fi_ops_tagged psmx3_tagged_ops_no_send_event_undirected_av_map; +extern struct fi_ops_tagged psmx3_tagged_ops_no_recv_event_undirected_av_map; +extern struct fi_ops_msg psmx3_msg_ops; +extern struct fi_ops_msg psmx3_msg2_ops; +extern struct fi_ops_rma psmx3_rma_ops; +extern struct fi_ops_atomic psmx3_atomic_ops; +extern struct psmx3_env psmx3_env; +extern struct psmx3_hfi_info psmx3_hfi_info; +extern struct psmx3_fid_fabric *psmx3_active_fabric; + +/* + * Lock levels: + * 0 -- always lock + * 1 -- lock needed if there is more than one thread (including internal threads) + * 2 -- lock needed if more then one thread accesses the same psm2 ep + */ +static inline void psmx3_lock(fastlock_t *lock, int lock_level) +{ + if (psmx3_env.lock_level >= lock_level) + fastlock_acquire(lock); +} + +static inline int psmx3_trylock(fastlock_t *lock, int lock_level) +{ + if (psmx3_env.lock_level >= lock_level) + return fastlock_tryacquire(lock); + else + return 0; +} + +static inline void psmx3_unlock(fastlock_t *lock, int lock_level) +{ + if (psmx3_env.lock_level >= lock_level) + fastlock_release(lock); +} + +/* Specialized lock functions used based on FI_THREAD model */ + +static inline void psmx3_lock_disabled(fastlock_t *lock, int lock_level) +{ + return; +} + +static inline int psmx3_trylock_disabled(fastlock_t *lock, int lock_level) +{ + return 0; +} + +static inline void psmx3_lock_enabled(fastlock_t *lock, int lock_level) +{ + fastlock_acquire(lock); +} + +static inline void psmx3_unlock_enabled(fastlock_t *lock, int lock_level) +{ + fastlock_release(lock); +} + +static inline int psmx3_trylock_enabled(fastlock_t *lock, int lock_level) +{ + return fastlock_tryacquire(lock); +} + +int psmx3_init_prov_info(const struct fi_info *hints, struct fi_info **info); +void psmx3_update_prov_info(struct fi_info *info, + struct psmx3_ep_name *src_addr, + struct psmx3_ep_name *dest_addr); +int psmx3_check_prov_info(uint32_t api_version, const struct fi_info *hints, + struct fi_info **info); +void psmx3_alter_prov_info(uint32_t api_version, const struct fi_info *hints, + struct fi_info *info); + +void psmx3_init_tag_layout(struct fi_info *info); +int psmx3_get_round_robin_unit(int idx); + +int psmx3_fabric(struct fi_fabric_attr *attr, + struct fid_fabric **fabric, void *context); 
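/*
 * Illustrative sketch (not from the patch): the conditional-locking idiom of
 * psmx3_lock()/psmx3_unlock() above, shown standalone with pthread_mutex_t
 * standing in for fastlock_t and a plain int for psmx3_env.lock_level. A call
 * site asks for the level it needs; the lock is taken only when the
 * configured level is at least that high, so a run configured with level 0
 * skips every level-1 and level-2 acquisition.
 */
#include <pthread.h>

static int demo_lock_level = 2;		/* 0, 1 or 2, as described above */

static inline void demo_lock(pthread_mutex_t *lock, int lock_level)
{
	if (demo_lock_level >= lock_level)
		pthread_mutex_lock(lock);
}

static inline void demo_unlock(pthread_mutex_t *lock, int lock_level)
{
	if (demo_lock_level >= lock_level)
		pthread_mutex_unlock(lock);
}

/* Usage: serialize a poll sequence only when more than one thread may poll
 * the same psm2 endpoint (level 2). */
static pthread_mutex_t demo_poll_lock = PTHREAD_MUTEX_INITIALIZER;

static void demo_poll_once(void)
{
	demo_lock(&demo_poll_lock, 2);
	/* an ipeek/test sequence would run here without being interleaved */
	demo_unlock(&demo_poll_lock, 2);
}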
+int psmx3_domain_open(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **domain, void *context); +int psmx3_ep_open(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context); +int psmx3_sep_open(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **sep, void *context); +int psmx3_stx_ctx(struct fid_domain *domain, struct fi_tx_attr *attr, + struct fid_stx **stx, void *context); +int psmx3_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq, void *context); +int psmx3_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **av, void *context); +int psmx3_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, + struct fid_cntr **cntr, void *context); +int psmx3_wait_open(struct fid_fabric *fabric, struct fi_wait_attr *attr, + struct fid_wait **waitset); +int psmx3_wait_trywait(struct fid_fabric *fabric, struct fid **fids, + int count); +int psmx3_query_atomic(struct fid_domain *doamin, enum fi_datatype datatype, + enum fi_op op, struct fi_atomic_attr *attr, + uint64_t flags); + +static inline void psmx3_fabric_acquire(struct psmx3_fid_fabric *fabric) +{ + ofi_atomic_inc32(&fabric->util_fabric.ref); +} + +static inline void psmx3_fabric_release(struct psmx3_fid_fabric *fabric) +{ + ofi_atomic_dec32(&fabric->util_fabric.ref); +} + +static inline void psmx3_domain_acquire(struct psmx3_fid_domain *domain) +{ + ofi_atomic_inc32(&domain->util_domain.ref); +} + +static inline void psmx3_domain_release(struct psmx3_fid_domain *domain) +{ + ofi_atomic_dec32(&domain->util_domain.ref); +} + +int psmx3_domain_enable_ep(struct psmx3_fid_domain *domain, struct psmx3_fid_ep *ep); + +void psmx3_trx_ctxt_free(struct psmx3_trx_ctxt *trx_ctxt, int usage_flags); +struct psmx3_trx_ctxt *psmx3_trx_ctxt_alloc(struct psmx3_fid_domain *domain, + struct psmx3_ep_name *src_addr, + int sep_ctxt_idx, int usage_flags, + uint8_t *uuid); + +static inline +int psmx3_ns_service_cmp(void *svc1, void *svc2) +{ + int service1 = *(int *)svc1, service2 = *(int *)svc2; + if (service1 == PSMX3_ANY_SERVICE || + service2 == PSMX3_ANY_SERVICE) + return 0; + return (service1 < service2) ? 
+ -1 : (service1 > service2); +} +static inline +int psmx3_ns_is_service_wildcard(void *svc) +{ + return (*(int *)svc == PSMX3_ANY_SERVICE); +} +void psmx3_get_uuid(psm2_uuid_t uuid); +int psmx3_override_uuid(void); +int psmx3_uuid_to_port(psm2_uuid_t uuid); +char *psmx3_uuid_to_string(psm2_uuid_t uuid); +void *psmx3_ep_name_to_string(const struct psmx3_ep_name *name, size_t *len); +struct psmx3_ep_name *psmx3_string_to_ep_name(const void *s); +int psmx3_errno(int err); +void psmx3_query_mpi(void); + +void psmx3_cq_enqueue_event(struct psmx3_fid_cq *cq, struct psmx3_cq_event *event); +struct psmx3_cq_event *psmx3_cq_create_event(struct psmx3_fid_cq *cq, + void *op_context, void *buf, + uint64_t flags, size_t len, + uint64_t data, uint64_t tag, + size_t olen, int err); +int psmx3_cq_poll_mq(struct psmx3_fid_cq *cq, struct psmx3_trx_ctxt *trx_ctxt, + struct psmx3_cq_event *event, int count, fi_addr_t *src_addr); + +void psmx3_epid_to_epaddr(struct psmx3_trx_ctxt *trx_ctxt, + psm2_epid_t epid, psm2_epaddr_t *epaddr); + +int psmx3_av_add_trx_ctxt(struct psmx3_fid_av *av, struct psmx3_trx_ctxt *trx_ctxt); + +void psmx3_av_remove_conn(struct psmx3_fid_av *av, struct psmx3_trx_ctxt *trx_ctxt, + psm2_epaddr_t epaddr); + +int psmx3_av_query_sep(struct psmx3_fid_av *av, struct psmx3_trx_ctxt *trx_ctxt, + size_t idx); + +static inline +psm2_epaddr_t psmx3_av_translate_addr(struct psmx3_fid_av *av, + struct psmx3_trx_ctxt *trx_ctxt, + fi_addr_t addr, + int av_type) +{ + psm2_epaddr_t epaddr; + size_t idx; + int ctxt; + + if (av_type == FI_AV_MAP) + return (psm2_epaddr_t) addr; + + av->domain->av_lock_fn(&av->lock, 1); + + idx = PSMX3_ADDR_IDX(addr); + assert(idx < av->hdr->last && av->table[idx].valid); + + if (OFI_UNLIKELY(av->table[idx].type == PSMX3_EP_SCALABLE)) { + if (OFI_UNLIKELY(!av->sep_info[idx].epids)) { + psmx3_av_query_sep(av, trx_ctxt, idx); + assert(av->sep_info[idx].epids); + } + + if (OFI_UNLIKELY(!av->conn_info[trx_ctxt->id].sepaddrs[idx])) { + av->conn_info[trx_ctxt->id].sepaddrs[idx] = + calloc(av->sep_info[idx].ctxt_cnt, sizeof(psm2_epaddr_t)); + assert(av->conn_info[trx_ctxt->id].sepaddrs[idx]); + } + + ctxt = PSMX3_ADDR_CTXT(addr, av->rx_ctx_bits); + assert(ctxt < av->sep_info[idx].ctxt_cnt); + + if (OFI_UNLIKELY(!av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt])) + psmx3_epid_to_epaddr(trx_ctxt, + av->sep_info[idx].epids[ctxt], + &av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt]); + epaddr = av->conn_info[trx_ctxt->id].sepaddrs[idx][ctxt]; + } else { + if (OFI_UNLIKELY(!av->conn_info[trx_ctxt->id].epaddrs[idx])) + psmx3_epid_to_epaddr(trx_ctxt, av->table[idx].epid, + &av->conn_info[trx_ctxt->id].epaddrs[idx]); + epaddr = av->conn_info[trx_ctxt->id].epaddrs[idx]; + } + + av->domain->av_unlock_fn(&av->lock, 1); + return epaddr; +} + +void psmx3_am_global_init(void); +void psmx3_am_global_fini(void); +int psmx3_am_init(struct psmx3_trx_ctxt *trx_ctxt); +void psmx3_am_fini(struct psmx3_trx_ctxt *trx_ctxt); +int psmx3_am_progress(struct psmx3_trx_ctxt *trx_ctxt); +int psmx3_am_process_send(struct psmx3_trx_ctxt *trx_ctxt, + struct psmx3_am_request *req); +int psmx3_am_process_rma(struct psmx3_trx_ctxt *trx_ctxt, + struct psmx3_am_request *req); +int psmx3_am_rma_handler(psm2_am_token_t token, psm2_amarg_t *args, + int nargs, void *src, uint32_t len, + void *hctx); +int psmx3_am_atomic_handler(psm2_am_token_t token, + psm2_amarg_t *args, int nargs, void *src, + uint32_t len, void *hctx); +int psmx3_am_sep_handler(psm2_am_token_t token, psm2_amarg_t *args, int nargs, + void *src, 
uint32_t len, void *hctx); +int psmx3_am_trx_ctxt_handler(psm2_am_token_t token, + psm2_amarg_t *args, int nargs, void *src, uint32_t len, + void *hctx); +void psmx3_atomic_global_init(void); +void psmx3_atomic_global_fini(void); + +void psmx3_am_ack_rma(struct psmx3_am_request *req); + +static inline +struct psmx3_am_request *psmx3_am_request_alloc(struct psmx3_trx_ctxt *trx_ctxt) +{ + struct psmx3_am_request *req; + + trx_ctxt->domain->am_req_pool_lock_fn(&trx_ctxt->am_req_pool_lock, 0); + req = ofi_buf_alloc(trx_ctxt->am_req_pool); + trx_ctxt->domain->am_req_pool_unlock_fn(&trx_ctxt->am_req_pool_lock, 0); + + if (req) + memset(req, 0, sizeof(*req)); + + return req; +} + +static inline void psmx3_am_request_free(struct psmx3_trx_ctxt *trx_ctxt, + struct psmx3_am_request *req) +{ + trx_ctxt->domain->am_req_pool_lock_fn(&trx_ctxt->am_req_pool_lock, 0); + ofi_buf_free(req); + trx_ctxt->domain->am_req_pool_unlock_fn(&trx_ctxt->am_req_pool_lock, 0); +} + +struct psmx3_fid_mr *psmx3_mr_get(struct psmx3_fid_domain *domain, uint64_t key); +int psmx3_mr_validate(struct psmx3_fid_mr *mr, uint64_t addr, size_t len, uint64_t access); +void psmx3_cntr_check_trigger(struct psmx3_fid_cntr *cntr); +void psmx3_cntr_add_trigger(struct psmx3_fid_cntr *cntr, struct psmx3_trigger *trigger); + +int psmx3_handle_sendv_req(struct psmx3_fid_ep *ep, PSMX3_STATUS_TYPE *status, + int multi_recv); + +static inline void psmx3_cntr_inc(struct psmx3_fid_cntr *cntr, int error) +{ + if (OFI_UNLIKELY(error)) { + ofi_atomic_inc64(&cntr->error_counter); + cntr->error_avail = 1; + } else { + ofi_atomic_inc64(&cntr->counter); + } + psmx3_cntr_check_trigger(cntr); + if (cntr->wait) + cntr->wait->signal(cntr->wait); +} + +fi_addr_t psmx3_av_translate_source(struct psmx3_fid_av *av, + psm2_epaddr_t source, int source_sep_id); + +static inline void psmx3_get_source_name(psm2_epaddr_t source, + int source_sep_id, + struct psmx3_ep_name *name) +{ + memset(name, 0, sizeof(*name)); + psm2_epaddr_to_epid(source, &name->epid); + name->sep_id = source_sep_id; + name->type = source_sep_id ? PSMX3_EP_SCALABLE : PSMX3_EP_REGULAR; +} + +static inline void psmx3_get_source_string_name(psm2_epaddr_t source, + int source_sep_id, + char *name, size_t *len) +{ + struct psmx3_ep_name ep_name; + + memset(&ep_name, 0, sizeof(ep_name)); + psm2_epaddr_to_epid(source, &ep_name.epid); + ep_name.sep_id = source_sep_id; + ep_name.type = source_sep_id ? PSMX3_EP_SCALABLE : PSMX3_EP_REGULAR; + + ofi_straddr(name, len, FI_ADDR_PSMX3, &ep_name); +} + +static inline void psmx3_progress(struct psmx3_trx_ctxt *trx_ctxt) +{ + if (trx_ctxt && trx_ctxt->poll_active) { + psmx3_cq_poll_mq(NULL, trx_ctxt, NULL, 1, NULL); + if (trx_ctxt->am_progress) + psmx3_am_progress(trx_ctxt); + } +} + +static inline void psmx3_progress_all(struct psmx3_fid_domain *domain) +{ + struct dlist_entry *item; + struct psmx3_trx_ctxt *trx_ctxt; + + domain->trx_ctxt_lock_fn(&domain->trx_ctxt_lock, 1); + dlist_foreach(&domain->trx_ctxt_list, item) { + trx_ctxt = container_of(item, struct psmx3_trx_ctxt, entry); + psmx3_progress(trx_ctxt); + } + domain->trx_ctxt_unlock_fn(&domain->trx_ctxt_lock, 1); +} + +/* + * There is a limitation in PSM2 AM implementation that can cause significant + * delay if too many AM requests are enqueued in a row without progress calls + * being made in between. As a workaround, call this function after each AM + * request is enqueued whenever possible. 
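 *
 * (With PSMX3_AM_POLL_INTERVAL set to 64 below, this amounts to roughly one
 * psm2_poll() call per 64 AM requests enqueued on a given tx/rx context.)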
+ */ +#define PSMX3_AM_POLL_INTERVAL 64 +static inline void psmx3_am_poll(struct psmx3_trx_ctxt *trx_ctxt) +{ + if (OFI_UNLIKELY(++trx_ctxt->am_poll_count > PSMX3_AM_POLL_INTERVAL)) { + trx_ctxt->am_poll_count = 0; + psm2_poll(trx_ctxt->psm2_ep); + } +} + +static inline int psmx3_peer_match(struct dlist_entry *item, const void *arg) +{ + struct psmx3_epaddr_context *peer; + + peer = container_of(item, struct psmx3_epaddr_context, entry); + return (peer->epaddr == arg); +} + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/prov/psm3/src/psmx3_am.c b/prov/psm3/src/psmx3_am.c new file mode 100644 index 00000000000..680a5fabf4c --- /dev/null +++ b/prov/psm3/src/psmx3_am.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psmx3.h" +#include "psmx3_trigger.h" + +int psmx3_am_progress(struct psmx3_trx_ctxt *trx_ctxt) +{ + struct slist_entry *item; + struct psmx3_trigger *trigger; + + trx_ctxt->domain->trigger_queue_lock_fn(&trx_ctxt->trigger_queue.lock, 2); + while (!slist_empty(&trx_ctxt->trigger_queue.list)) { + item = slist_remove_head(&trx_ctxt->trigger_queue.list); + trigger = container_of(item, struct psmx3_trigger, list_entry); + trx_ctxt->domain->trigger_queue_unlock_fn(&trx_ctxt->trigger_queue.lock, 2); + psmx3_process_trigger(trx_ctxt, trigger); + trx_ctxt->domain->trigger_queue_lock_fn(&trx_ctxt->trigger_queue.lock, 2); + } + trx_ctxt->domain->trigger_queue_unlock_fn(&trx_ctxt->trigger_queue.lock, 2); + + return 0; +} + +int psmx3_am_init(struct psmx3_trx_ctxt *trx_ctxt) +{ + psm2_am_handler_2_fn_t psmx3_am_handlers[4]; + struct psmx3_trx_ctxt *hctx[4]; + int psmx3_am_handlers_idx[4]; + int num_handlers = 4; + + psm2_ep_t psm2_ep = trx_ctxt->psm2_ep; + size_t size; + int err = 0; + uint32_t max_atomic_size; + + FI_INFO(&psmx3_prov, FI_LOG_CORE, "epid %016lx\n", trx_ctxt->psm2_epid); + + if (!trx_ctxt->am_initialized) { + err = psm2_am_get_parameters(psm2_ep, &trx_ctxt->psm2_am_param, + sizeof(struct psm2_am_parameters), + &size); + if (err) + return psmx3_errno(err); + + max_atomic_size = trx_ctxt->psm2_am_param.max_request_short; + if (trx_ctxt->domain->max_atomic_size > max_atomic_size) + trx_ctxt->domain->max_atomic_size = max_atomic_size; + + psmx3_am_handlers[0] = psmx3_am_rma_handler; + hctx[0] = trx_ctxt; + psmx3_am_handlers[1] = psmx3_am_atomic_handler; + hctx[1] = trx_ctxt; + psmx3_am_handlers[2] = psmx3_am_sep_handler; + hctx[2] = trx_ctxt; + psmx3_am_handlers[3] = psmx3_am_trx_ctxt_handler; + hctx[3] = trx_ctxt; + + err = psm2_am_register_handlers_2(psm2_ep, psmx3_am_handlers, + num_handlers, (void **)hctx, psmx3_am_handlers_idx); + if (err) + return psmx3_errno(err); + + if ((psmx3_am_handlers_idx[0] != PSMX3_AM_RMA_HANDLER) || + (psmx3_am_handlers_idx[1] != PSMX3_AM_ATOMIC_HANDLER) || + (psmx3_am_handlers_idx[2] != PSMX3_AM_SEP_HANDLER) || + (psmx3_am_handlers_idx[3] != PSMX3_AM_TRX_CTXT_HANDLER)) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "failed to register one or more AM handlers " + "at indecies %d, %d, %d, %d\n", PSMX3_AM_RMA_HANDLER, + PSMX3_AM_ATOMIC_HANDLER, PSMX3_AM_SEP_HANDLER, + PSMX3_AM_TRX_CTXT_HANDLER); + return -FI_EBUSY; + } + + trx_ctxt->am_initialized = 1; + } + + return err; +} + +void psmx3_am_fini(struct psmx3_trx_ctxt *trx_ctxt) +{ + /* there is no way to unregister AM handlers */ +} + diff --git a/prov/psm3/src/psmx3_atomic.c b/prov/psm3/src/psmx3_atomic.c new file mode 100644 index 00000000000..639377e559b --- /dev/null +++ b/prov/psm3/src/psmx3_atomic.c @@ -0,0 +1,2101 @@ +/* + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx3.h" +#include "psmx3_trigger.h" + +/* Atomics protocol: + * + * Atomics REQ: + * args[0].u32w0 cmd + * args[0].u32w1 count + * args[1].u64 req + * args[2].u64 addr + * args[3].u64 key + * args[4].u32w0 datatype + * args[4].u32w1 op + * + * Atomics REP: + * args[0].u32w0 cmd + * args[0].u32w1 error + * args[1].u64 req + */ + +static fastlock_t psmx3_atomic_lock; + +void psmx3_atomic_global_init(void) +{ + fastlock_init(&psmx3_atomic_lock); +} + +void psmx3_atomic_global_fini(void) +{ + fastlock_destroy(&psmx3_atomic_lock); +} + +static inline void psmx3_ioc_read(const struct fi_ioc *ioc, size_t count, + int datatype, uint8_t *buf, size_t len) +{ + int i; + size_t copy_len; + + for (i=0; i len) + copy_len = len; + memcpy(buf, ioc[i].addr, copy_len); + buf += copy_len; + len -= copy_len; + } +} + +static inline void psmx3_ioc_write(struct fi_ioc *ioc, size_t count, + int datatype, const uint8_t *buf, size_t len) +{ + int i; + size_t copy_len; + + for (i=0; i len) + copy_len = len; + memcpy(ioc[i].addr, buf, copy_len); + buf += copy_len; + len -= copy_len; + } +} + +static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, + int datatype) +{ + int i; + size_t len = 0; + + for (i=0; i (src)) (dst) = (src) +#define PSMX3_MAX(dst,src) if ((dst) < (src)) (dst) = (src) +#define PSMX3_SUM(dst,src) (dst) += (src) +#define PSMX3_PROD(dst,src) (dst) *= (src) +#define PSMX3_LOR(dst,src) (dst) = (dst) || (src) +#define PSMX3_LAND(dst,src) (dst) = (dst) && (src) +#define PSMX3_BOR(dst,src) (dst) |= (src) +#define PSMX3_BAND(dst,src) (dst) &= (src) +#define PSMX3_LXOR(dst,src) (dst) = ((dst) && !(src)) || (!(dst) && (src)) +#define PSMX3_BXOR(dst,src) (dst) ^= (src) +#define PSMX3_COPY(dst,src) (dst) = (src) + +#define PSMX3_ATOMIC_READ(dst,res,cnt,TYPE) \ + do { \ + int i; \ + TYPE *d = (dst); \ + TYPE *r = (res); \ + psmx3_lock(&psmx3_atomic_lock, 1); \ + for (i=0; i<(cnt); i++) \ + r[i] = d[i]; \ + psmx3_unlock(&psmx3_atomic_lock, 1); \ + } while (0) + +#define PSMX3_ATOMIC_WRITE(dst,src,cnt,OP,TYPE) \ + do { \ + int i; \ + TYPE *d = (dst); \ + TYPE *s = (src); \ + psmx3_lock(&psmx3_atomic_lock, 1); \ + for (i=0; i=); + break; + + case FI_CSWAP_GT: + SWITCH_ORD_TYPE(datatype,PSMX3_ATOMIC_CSWAP, + dest,src,compare,result,count,>); + break; + + case FI_MSWAP: + SWITCH_INT_TYPE(datatype,PSMX3_ATOMIC_MSWAP, + dest,src,compare,result,count); + break; + + default: + return -FI_EOPNOTSUPP; + } + + return 0; +} + +int psmx3_am_atomic_handler(psm2_am_token_t token, + psm2_amarg_t *args, int nargs, void *src, + uint32_t len, void *hctx) +{ + psm2_amarg_t rep_args[8]; + int count; + uint8_t *addr; + uint64_t key; + int datatype, op; + int err = 0; + int op_error = 0; + struct psmx3_am_request *req; + struct psmx3_cq_event *event; + struct psmx3_fid_mr *mr; + struct 
psmx3_fid_cntr *cntr = NULL; + struct psmx3_fid_cntr *mr_cntr = NULL; + void *tmp_buf; + psm2_epaddr_t epaddr; + int cmd; + struct psmx3_trx_ctxt *rx; + + psm2_am_get_source(token, &epaddr); + cmd = PSMX3_AM_GET_OP(args[0].u32w0); + + switch (cmd) { + case PSMX3_AM_REQ_ATOMIC_WRITE: + rx = (struct psmx3_trx_ctxt *)hctx; + count = args[0].u32w1; + addr = (uint8_t *)(uintptr_t)args[2].u64; + key = args[3].u64; + datatype = args[4].u32w0; + op = args[4].u32w1; + assert(len == ofi_datatype_size(datatype) * count); + + mr = psmx3_mr_get(rx->domain, key); + op_error = mr ? + psmx3_mr_validate(mr, (uint64_t)addr, len, FI_REMOTE_WRITE) : + -FI_EINVAL; + + if (!op_error) { + addr += mr->offset; + psmx3_atomic_do_write(addr, src, datatype, op, count); + + if (rx->ep->caps & FI_RMA_EVENT) { + cntr = rx->ep->remote_write_cntr; + mr_cntr = mr->cntr; + + if (cntr) + psmx3_cntr_inc(cntr, 0); + + if (mr_cntr && mr_cntr != cntr) + psmx3_cntr_inc(mr_cntr, 0); + } + } + + rep_args[0].u32w0 = PSMX3_AM_REP_ATOMIC_WRITE; + rep_args[0].u32w1 = op_error; + rep_args[1].u64 = args[1].u64; + err = psm2_am_reply_short(token, PSMX3_AM_ATOMIC_HANDLER, + rep_args, 2, NULL, 0, 0, + NULL, NULL ); + break; + + case PSMX3_AM_REQ_ATOMIC_READWRITE: + rx = (struct psmx3_trx_ctxt *)hctx; + count = args[0].u32w1; + addr = (uint8_t *)(uintptr_t)args[2].u64; + key = args[3].u64; + datatype = args[4].u32w0; + op = args[4].u32w1; + + if (op == FI_ATOMIC_READ) + len = ofi_datatype_size(datatype) * count; + + assert(len == ofi_datatype_size(datatype) * count); + + mr = psmx3_mr_get(rx->domain, key); + op_error = mr ? + psmx3_mr_validate(mr, (uint64_t)addr, len, + FI_REMOTE_READ|FI_REMOTE_WRITE) : + -FI_EINVAL; + + if (!op_error) { + addr += mr->offset; + tmp_buf = malloc(len); + if (tmp_buf) + psmx3_atomic_do_readwrite(addr, src, tmp_buf, + datatype, op, count); + else + op_error = -FI_ENOMEM; + + if (rx->ep->caps & FI_RMA_EVENT) { + if (op == FI_ATOMIC_READ) { + cntr = rx->ep->remote_read_cntr; + } else { + cntr = rx->ep->remote_write_cntr; + mr_cntr = mr->cntr; + } + + if (cntr) + psmx3_cntr_inc(cntr, 0); + + if (mr_cntr && mr_cntr != cntr) + psmx3_cntr_inc(mr_cntr, 0); + } + } else { + tmp_buf = NULL; + } + + rep_args[0].u32w0 = PSMX3_AM_REP_ATOMIC_READWRITE; + rep_args[0].u32w1 = op_error; + rep_args[1].u64 = args[1].u64; + err = psm2_am_reply_short(token, PSMX3_AM_ATOMIC_HANDLER, + rep_args, 2, tmp_buf, + (tmp_buf ? len : 0), + 0, free, tmp_buf ); + break; + + case PSMX3_AM_REQ_ATOMIC_COMPWRITE: + rx = (struct psmx3_trx_ctxt *)hctx; + count = args[0].u32w1; + addr = (uint8_t *)(uintptr_t)args[2].u64; + key = args[3].u64; + datatype = args[4].u32w0; + op = args[4].u32w1; + len /= 2; + assert(len == ofi_datatype_size(datatype) * count); + + mr = psmx3_mr_get(rx->domain, key); + op_error = mr ? 
+ psmx3_mr_validate(mr, (uint64_t)addr, len, + FI_REMOTE_READ|FI_REMOTE_WRITE) : + -FI_EINVAL; + + if (!op_error) { + addr += mr->offset; + tmp_buf = malloc(len); + if (tmp_buf) + psmx3_atomic_do_compwrite(addr, src, (uint8_t *)src + len, + tmp_buf, datatype, + op, count); + else + op_error = -FI_ENOMEM; + + if (rx->ep->caps & FI_RMA_EVENT) { + cntr = rx->ep->remote_write_cntr; + mr_cntr = mr->cntr; + + if (cntr) + psmx3_cntr_inc(cntr, 0); + + if (mr_cntr && mr_cntr != cntr) + psmx3_cntr_inc(mr_cntr, 0); + } + } else { + tmp_buf = NULL; + } + + rep_args[0].u32w0 = PSMX3_AM_REP_ATOMIC_READWRITE; + rep_args[0].u32w1 = op_error; + rep_args[1].u64 = args[1].u64; + err = psm2_am_reply_short(token, PSMX3_AM_ATOMIC_HANDLER, + rep_args, 2, tmp_buf, + (tmp_buf ? len : 0), + 0, free, tmp_buf ); + break; + + case PSMX3_AM_REP_ATOMIC_WRITE: + req = (struct psmx3_am_request *)(uintptr_t)args[1].u64; + op_error = (int)args[0].u32w1; + assert(req->op == PSMX3_AM_REQ_ATOMIC_WRITE); + if (req->ep->send_cq && (!req->no_event || op_error)) { + event = psmx3_cq_create_event( + req->ep->send_cq, + req->atomic.context, + req->atomic.buf, + req->cq_flags, + req->atomic.len, + 0, /* data */ + 0, /* tag */ + 0, /* olen */ + op_error); + if (event) + psmx3_cq_enqueue_event(req->ep->send_cq, event); + else + err = -FI_ENOMEM; + } + + if (req->ep->write_cntr) + psmx3_cntr_inc(req->ep->write_cntr, op_error); + + free(req->tmpbuf); + psmx3_am_request_free(req->ep->tx, req); + break; + + case PSMX3_AM_REP_ATOMIC_READWRITE: + case PSMX3_AM_REP_ATOMIC_COMPWRITE: + req = (struct psmx3_am_request *)(uintptr_t)args[1].u64; + op_error = (int)args[0].u32w1; + assert(op_error || req->atomic.len == len); + + if (!op_error) { + if (req->atomic.result) + memcpy(req->atomic.result, src, len); + else + psmx3_ioc_write(req->ioc, req->atomic.iov_count, + req->atomic.datatype, src, len); + } + + if (req->ep->send_cq && (!req->no_event || op_error)) { + event = psmx3_cq_create_event( + req->ep->send_cq, + req->atomic.context, + req->atomic.buf, + req->cq_flags, + req->atomic.len, + 0, /* data */ + 0, /* tag */ + 0, /* olen */ + op_error); + if (event) + psmx3_cq_enqueue_event(req->ep->send_cq, event); + else + err = -FI_ENOMEM; + } + + if (req->ep->read_cntr) + psmx3_cntr_inc(req->ep->read_cntr, op_error); + + free(req->tmpbuf); + psmx3_am_request_free(req->ep->tx, req); + break; + + default: + err = -FI_EINVAL; + } + return err; +} + +static int psmx3_atomic_self(int am_cmd, + struct psmx3_fid_ep *ep, + const void *buf, + size_t count, void *desc, + const void *compare, void *compare_desc, + void *result, void *result_desc, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags) +{ + struct psmx3_fid_mr *mr; + struct psmx3_cq_event *event; + struct psmx3_fid_cntr *cntr = NULL; + struct psmx3_fid_cntr *mr_cntr = NULL; + void *tmp_buf; + size_t len; + int no_event; + int err = 0; + int op_error; + int access; + uint64_t cq_flags = 0; + + if (am_cmd == PSMX3_AM_REQ_ATOMIC_WRITE) + access = FI_REMOTE_WRITE; + else + access = FI_REMOTE_READ | FI_REMOTE_WRITE; + + len = ofi_datatype_size(datatype) * count; + mr = psmx3_mr_get(ep->domain, key); + op_error = mr ? 
psmx3_mr_validate(mr, addr, len, access) : -FI_EINVAL; + + if (op_error) + goto gen_local_event; + + addr += mr->offset; + + switch (am_cmd) { + case PSMX3_AM_REQ_ATOMIC_WRITE: + err = psmx3_atomic_do_write((void *)addr, (void *)buf, + (int)datatype, (int)op, (int)count); + cq_flags = FI_WRITE | FI_ATOMIC; + break; + + case PSMX3_AM_REQ_ATOMIC_READWRITE: + if (result != buf) { + err = psmx3_atomic_do_readwrite((void *)addr, (void *)buf, + (void *)result, (int)datatype, + (int)op, (int)count); + } else { + tmp_buf = malloc(len); + if (tmp_buf) { + memcpy(tmp_buf, result, len); + err = psmx3_atomic_do_readwrite((void *)addr, (void *)buf, + tmp_buf, (int)datatype, + (int)op, (int)count); + memcpy(result, tmp_buf, len); + free(tmp_buf); + } else { + err = -FI_ENOMEM; + } + } + if (op == FI_ATOMIC_READ) + cq_flags = FI_READ | FI_ATOMIC; + else + cq_flags = FI_WRITE | FI_ATOMIC; + break; + + case PSMX3_AM_REQ_ATOMIC_COMPWRITE: + if (result != buf && result != compare) { + err = psmx3_atomic_do_compwrite((void *)addr, (void *)buf, + (void *)compare, (void *)result, + (int)datatype, (int)op, (int)count); + } else { + tmp_buf = malloc(len); + if (tmp_buf) { + memcpy(tmp_buf, result, len); + err = psmx3_atomic_do_compwrite((void *)addr, (void *)buf, + (void *)compare, tmp_buf, + (int)datatype, (int)op, (int)count); + memcpy(result, tmp_buf, len); + free(tmp_buf); + } else { + err = -FI_ENOMEM; + } + } + cq_flags = FI_WRITE | FI_ATOMIC; + break; + } + + if (ep->caps & FI_RMA_EVENT) { + if (op == FI_ATOMIC_READ) { + cntr = ep->remote_read_cntr; + } else { + cntr = ep->remote_write_cntr; + mr_cntr = mr->cntr; + } + + if (cntr) + psmx3_cntr_inc(cntr, 0); + + if (mr_cntr && mr_cntr != cntr) + psmx3_cntr_inc(mr_cntr, 0); + } + + op_error = err; + +gen_local_event: + no_event = ((flags & PSMX3_NO_COMPLETION) || + (ep->send_selective_completion && !(flags & FI_COMPLETION))); + if (ep->send_cq && (!no_event || op_error)) { + event = psmx3_cq_create_event( + ep->send_cq, + context, + (void *)buf, + cq_flags, + len, + 0, /* data */ + 0, /* tag */ + 0, /* olen */ + op_error); + if (event) + psmx3_cq_enqueue_event(ep->send_cq, event); + else + err = -FI_ENOMEM; + } + + switch (am_cmd) { + case PSMX3_AM_REQ_ATOMIC_WRITE: + if (ep->write_cntr) + psmx3_cntr_inc(ep->write_cntr, op_error); + break; + case PSMX3_AM_REQ_ATOMIC_READWRITE: + case PSMX3_AM_REQ_ATOMIC_COMPWRITE: + if (ep->read_cntr) + psmx3_cntr_inc(ep->read_cntr, op_error); + break; + } + + return err; +} + +ssize_t psmx3_atomic_write_generic(struct fid_ep *ep, + const void *buf, + size_t count, void *desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + struct psmx3_am_request *req; + psm2_amarg_t args[8]; + psm2_epaddr_t psm2_epaddr; + psm2_epid_t psm2_epid; + int am_flags = PSM2_AM_FLAG_ASYNC; + int chunk_size, len; + int err; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_atomic_write(ep, buf, count, desc, + dest_addr, addr, key, + datatype, op, context, + flags); + + assert(buf); + assert((int)datatype >= 0 && (int)datatype < FI_DATATYPE_LAST); + assert((int)op >= 0 && (int)op < FI_ATOMIC_OP_LAST); + + av = ep_priv->av; + assert(av); + + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); + psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid); + + if (psm2_epid == ep_priv->tx->psm2_epid) + return 
psmx3_atomic_self(PSMX3_AM_REQ_ATOMIC_WRITE, ep_priv, + buf, count, desc, NULL, NULL, NULL, + NULL, addr, key, datatype, op, + context, flags); + + chunk_size = ep_priv->tx->psm2_am_param.max_request_short; + len = ofi_datatype_size(datatype)* count; + if (len > chunk_size) + return -FI_EMSGSIZE; + + req = psmx3_am_request_alloc(ep_priv->tx); + if (!req) + return -FI_ENOMEM; + + if (flags & FI_INJECT) { + req->tmpbuf = malloc(len); + if (!req->tmpbuf) { + psmx3_am_request_free(ep_priv->tx, req); + return -FI_ENOMEM; + } + + memcpy(req->tmpbuf, (void *)buf, len); + buf = req->tmpbuf; + } + + req->no_event = (flags & PSMX3_NO_COMPLETION) || + (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)); + + req->op = PSMX3_AM_REQ_ATOMIC_WRITE; + req->atomic.buf = (void *)buf; + req->atomic.len = len; + req->atomic.addr = addr; + req->atomic.key = key; + req->atomic.context = context; + req->atomic.datatype = datatype; + req->ep = ep_priv; + req->cq_flags = FI_WRITE | FI_ATOMIC; + + args[0].u32w0 = PSMX3_AM_REQ_ATOMIC_WRITE; + args[0].u32w1 = count; + args[1].u64 = (uint64_t)(uintptr_t)req; + args[2].u64 = addr; + args[3].u64 = key; + args[4].u32w0 = datatype; + args[4].u32w1 = op; + err = psm2_am_request_short(psm2_epaddr, + PSMX3_AM_ATOMIC_HANDLER, args, 5, + (void *)buf, len, am_flags, NULL, NULL); + if (err) { + free(req->tmpbuf); + psmx3_am_request_free(ep_priv->tx, req); + return psmx3_errno(err); + } + + psmx3_am_poll(ep_priv->tx); + return 0; +} + +ssize_t psmx3_atomic_writev_generic(struct fid_ep *ep, + const struct fi_ioc *iov, + void **desc, size_t count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + struct psmx3_am_request *req; + psm2_amarg_t args[8]; + psm2_epaddr_t psm2_epaddr; + psm2_epid_t psm2_epid; + int am_flags = PSM2_AM_FLAG_ASYNC; + int chunk_size; + size_t len; + uint8_t *buf; + int err; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_atomic_writev(ep, iov, desc, count, + dest_addr, addr, key, + datatype, op, context, + flags); + + assert(iov); + assert(count); + assert((int)datatype >= 0 && (int)datatype < FI_DATATYPE_LAST); + assert((int)op >= 0 && (int)op < FI_ATOMIC_OP_LAST); + + while (count && !iov[count-1].count) + count--; + + av = ep_priv->av; + assert(av); + + len = psmx3_ioc_size(iov, count, datatype); + + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); + psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid); + + if (psm2_epid == ep_priv->tx->psm2_epid) { + buf = malloc(len); + if (!buf) + return -FI_ENOMEM; + + psmx3_ioc_read(iov, count, datatype, buf, len); + + err = psmx3_atomic_self(PSMX3_AM_REQ_ATOMIC_WRITE, ep_priv, + buf, len / ofi_datatype_size(datatype), + NULL, NULL, NULL, NULL, NULL, addr, + key, datatype, op, context, flags); + + free(buf); + return err; + } + + chunk_size = ep_priv->tx->psm2_am_param.max_request_short; + if (len > chunk_size) + return -FI_EMSGSIZE; + + req = psmx3_am_request_alloc(ep_priv->tx); + if (!req) + return -FI_ENOMEM; + + if (count > 1) { + req->tmpbuf = malloc(len); + if (!req->tmpbuf) { + psmx3_am_request_free(ep_priv->tx, req); + return -FI_ENOMEM; + } + + buf = req->tmpbuf; + psmx3_ioc_read(iov, count, datatype, buf, len); + } else { + buf = iov[0].addr; + } + + req->no_event = (flags & PSMX3_NO_COMPLETION) || + (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)); + 
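+	/*
+	 * Descriptive note on the AM request built below (a reading aid, not
+	 * part of the original patch): the short AM atomic write carries
+	 *   args[0].u32w0  PSMX3_AM_REQ_ATOMIC_WRITE
+	 *   args[0].u32w1  element count (len / datatype size)
+	 *   args[1].u64    pointer to this request, echoed back in the reply
+	 *   args[2].u64    target address/offset within the remote MR
+	 *   args[3].u64    remote MR key
+	 *   args[4]        datatype (u32w0) and atomic op (u32w1)
+	 * and the packed source data of "len" bytes travels as the AM payload
+	 * to PSMX3_AM_ATOMIC_HANDLER on the target.
+	 */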
+ req->op = PSMX3_AM_REQ_ATOMIC_WRITE; + req->atomic.buf = (void *)buf; + req->atomic.len = len; + req->atomic.addr = addr; + req->atomic.key = key; + req->atomic.context = context; + req->atomic.datatype = datatype; + req->ep = ep_priv; + req->cq_flags = FI_WRITE | FI_ATOMIC; + + args[0].u32w0 = PSMX3_AM_REQ_ATOMIC_WRITE; + args[0].u32w1 = len / ofi_datatype_size(datatype); + args[1].u64 = (uint64_t)(uintptr_t)req; + args[2].u64 = addr; + args[3].u64 = key; + args[4].u32w0 = datatype; + args[4].u32w1 = op; + err = psm2_am_request_short(psm2_epaddr, + PSMX3_AM_ATOMIC_HANDLER, args, 5, + (void *)buf, len, am_flags, NULL, NULL); + if (err) { + free(req->tmpbuf); + psmx3_am_request_free(ep_priv->tx, req); + return psmx3_errno(err); + } + + psmx3_am_poll(ep_priv->tx); + return 0; +} + +DIRECT_FN +STATIC ssize_t psmx3_atomic_write(struct fid_ep *ep, + const void *buf, + size_t count, void *desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + return psmx3_atomic_write_generic(ep, buf, count, desc, dest_addr, + addr, key, datatype, op, context, + ep_priv->tx_flags); +} + +DIRECT_FN +STATIC ssize_t psmx3_atomic_writemsg(struct fid_ep *ep, + const struct fi_msg_atomic *msg, + uint64_t flags) +{ + assert(msg); + assert(msg->iov_count); + assert(msg->msg_iov); + assert(msg->rma_iov); + assert(msg->rma_iov_count == 1); + + if (msg->iov_count > 1) + return psmx3_atomic_writev_generic(ep, msg->msg_iov, msg->desc, + msg->iov_count, msg->addr, + msg->rma_iov[0].addr, + msg->rma_iov[0].key, + msg->datatype, msg->op, + msg->context, flags); + + return psmx3_atomic_write_generic(ep, msg->msg_iov[0].addr, + msg->msg_iov[0].count, + msg->desc ? msg->desc[0] : NULL, + msg->addr, msg->rma_iov[0].addr, + msg->rma_iov[0].key, msg->datatype, + msg->op, msg->context, flags); +} + +DIRECT_FN +STATIC ssize_t psmx3_atomic_writev(struct fid_ep *ep, + const struct fi_ioc *iov, + void **desc, size_t count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + assert(iov); + assert(count); + + if (count > 1) + return psmx3_atomic_writev_generic(ep, iov, desc, count, + dest_addr, addr, key, + datatype, op, context, + ep_priv->tx_flags); + + return psmx3_atomic_write_generic(ep, iov->addr, iov->count, + desc ? 
desc[0] : NULL, dest_addr, + addr, key, datatype, op, context, + ep_priv->tx_flags); +} + +DIRECT_FN +STATIC ssize_t psmx3_atomic_inject(struct fid_ep *ep, + const void *buf, + size_t count, /*void *desc,*/ + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + return psmx3_atomic_write_generic(ep, buf, count, NULL/*desc*/, + dest_addr, addr, key, + datatype, op, NULL, + ep_priv->tx_flags | FI_INJECT | PSMX3_NO_COMPLETION); +} + +ssize_t psmx3_atomic_readwrite_generic(struct fid_ep *ep, + const void *buf, + size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + struct psmx3_am_request *req; + psm2_amarg_t args[8]; + psm2_epaddr_t psm2_epaddr; + psm2_epid_t psm2_epid; + int am_flags = PSM2_AM_FLAG_ASYNC; + int chunk_size, len; + int err; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_atomic_readwrite(ep, buf, count, + desc, result, + result_desc, + dest_addr, addr, + key, datatype, op, + context, flags); + + assert(buf || op == FI_ATOMIC_READ); + assert((int)datatype >= 0 && (int)datatype < FI_DATATYPE_LAST); + assert((int)op >= 0 && (int)op < FI_ATOMIC_OP_LAST); + + av = ep_priv->av; + assert(av); + + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); + psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid); + + if (psm2_epid == ep_priv->tx->psm2_epid) + return psmx3_atomic_self(PSMX3_AM_REQ_ATOMIC_READWRITE, ep_priv, + buf, count, desc, NULL, NULL, result, + result_desc, addr, key, datatype, op, + context, flags); + + chunk_size = ep_priv->tx->psm2_am_param.max_request_short; + len = ofi_datatype_size(datatype) * count; + if (len > chunk_size) + return -FI_EMSGSIZE; + + req = psmx3_am_request_alloc(ep_priv->tx); + if (!req) + return -FI_ENOMEM; + + if ((flags & FI_INJECT) && op != FI_ATOMIC_READ) { + req->tmpbuf = malloc(len); + if (!req->tmpbuf) { + psmx3_am_request_free(ep_priv->tx, req); + return -FI_ENOMEM; + } + + memcpy(req->tmpbuf, (void *)buf, len); + buf = req->tmpbuf; + } + + req->no_event = (flags & PSMX3_NO_COMPLETION) || + (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)); + + req->op = PSMX3_AM_REQ_ATOMIC_READWRITE; + req->atomic.buf = (void *)buf; + req->atomic.len = len; + req->atomic.addr = addr; + req->atomic.key = key; + req->atomic.context = context; + req->atomic.result = result; + req->atomic.datatype = datatype; + req->ep = ep_priv; + if (op == FI_ATOMIC_READ) + req->cq_flags = FI_READ | FI_ATOMIC; + else + req->cq_flags = FI_WRITE | FI_ATOMIC; + + args[0].u32w0 = PSMX3_AM_REQ_ATOMIC_READWRITE; + args[0].u32w1 = count; + args[1].u64 = (uint64_t)(uintptr_t)req; + args[2].u64 = addr; + args[3].u64 = key; + args[4].u32w0 = datatype; + args[4].u32w1 = op; + err = psm2_am_request_short(psm2_epaddr, + PSMX3_AM_ATOMIC_HANDLER, args, 5, + (void *)buf, (buf?len:0), am_flags, NULL, + NULL); + if (err) { + free(req->tmpbuf); + psmx3_am_request_free(ep_priv->tx, req); + return psmx3_errno(err); + } + + psmx3_am_poll(ep_priv->tx); + return 0; +} + +ssize_t psmx3_atomic_readwritev_generic(struct fid_ep *ep, + const struct fi_ioc *iov, + void **desc, size_t count, + struct fi_ioc *resultv, + void **result_desc, + size_t result_count, + fi_addr_t 
dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + struct psmx3_am_request *req; + psm2_amarg_t args[8]; + psm2_epaddr_t psm2_epaddr; + psm2_epid_t psm2_epid; + int am_flags = PSM2_AM_FLAG_ASYNC; + int chunk_size; + size_t len, result_len, iov_size; + uint8_t *buf, *result; + void *desc0, *result_desc0; + int err; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_atomic_readwritev(ep, iov, desc, + count, resultv, + result_desc, + result_count, + dest_addr, addr, + key, datatype, op, + context, flags); + + assert((iov && count) || op == FI_ATOMIC_READ); + assert(resultv); + assert(result_count); + assert((int)datatype >= 0 && (int)datatype < FI_DATATYPE_LAST); + assert((int)op >= 0 && (int)op < FI_ATOMIC_OP_LAST); + + if (iov) { + while (count && !iov[count-1].count) + count--; + } + + while (result_count && !resultv[result_count-1].count) + result_count--; + + result_len = psmx3_ioc_size(resultv, result_count, datatype); + + if (op != FI_ATOMIC_READ) { + buf = iov[0].addr; /* as default for count == 1 */ + len = psmx3_ioc_size(iov, count, datatype); + desc0 = desc ? desc[0] : NULL; + } else { + buf = NULL; + len = result_len; + desc0 = NULL; + } + + assert(result_len >= len); + + av = ep_priv->av; + assert(av); + + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); + psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid); + + if (psm2_epid == ep_priv->tx->psm2_epid) { + if (buf && count > 1) { + buf = malloc(len); + psmx3_ioc_read(iov, count, datatype, buf, len); + desc0 = NULL; + } + + if (result_count > 1) { + result = malloc(len); + if (!result) { + if (buf && count > 1) + free(buf); + return -FI_ENOMEM; + } + result_desc0 = result_desc ? result_desc[0] : NULL; + } else { + result = resultv[0].addr; + result_desc0 = NULL; + } + + err = psmx3_atomic_self(PSMX3_AM_REQ_ATOMIC_READWRITE, ep_priv, + buf, len / ofi_datatype_size(datatype), + desc0, NULL, NULL, result, result_desc0, + addr, key, datatype, op, context, flags); + + if (result_count > 1) { + psmx3_ioc_write(resultv, result_count, datatype, result, len); + free(result); + } + + if (buf && count > 1) + free(buf); + + return err; + } + + chunk_size = ep_priv->tx->psm2_am_param.max_request_short; + if (len > chunk_size) + return -FI_EMSGSIZE; + + iov_size = result_count > 1 ? 
result_count * sizeof(struct fi_ioc) : 0; + + req = psmx3_am_request_alloc(ep_priv->tx); + if (!req) + return -FI_ENOMEM; + + if (((flags & FI_INJECT) || count > 1) && op != FI_ATOMIC_READ) { + req->tmpbuf = malloc(iov_size + len); + if (!req->tmpbuf) { + psmx3_am_request_free(ep_priv->tx, req); + return -FI_ENOMEM; + } + + buf = (uint8_t *)req->tmpbuf + iov_size; + psmx3_ioc_read(iov, count, datatype, buf, len); + } else { + req->tmpbuf = malloc(iov_size); + if (!req->tmpbuf) { + psmx3_am_request_free(ep_priv->tx, req); + return -FI_ENOMEM; + } + } + + req->ioc = req->tmpbuf; + if (iov_size) { + memcpy(req->ioc, resultv, iov_size); + req->atomic.iov_count = result_count; + req->atomic.result = NULL; + } else { + req->atomic.buf = buf; + req->atomic.result = resultv[0].addr; + } + + req->no_event = (flags & PSMX3_NO_COMPLETION) || + (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)); + + req->op = PSMX3_AM_REQ_ATOMIC_READWRITE; + req->atomic.buf = (void *)buf; + req->atomic.len = len; + req->atomic.addr = addr; + req->atomic.key = key; + req->atomic.context = context; + req->atomic.datatype = datatype; + req->ep = ep_priv; + if (op == FI_ATOMIC_READ) + req->cq_flags = FI_READ | FI_ATOMIC; + else + req->cq_flags = FI_WRITE | FI_ATOMIC; + + args[0].u32w0 = PSMX3_AM_REQ_ATOMIC_READWRITE; + args[0].u32w1 = len / ofi_datatype_size(datatype); + args[1].u64 = (uint64_t)(uintptr_t)req; + args[2].u64 = addr; + args[3].u64 = key; + args[4].u32w0 = datatype; + args[4].u32w1 = op; + err = psm2_am_request_short(psm2_epaddr, + PSMX3_AM_ATOMIC_HANDLER, args, 5, + (void *)buf, (buf?len:0), am_flags, NULL, + NULL); + if (err) { + free(req->tmpbuf); + psmx3_am_request_free(ep_priv->tx, req); + return psmx3_errno(err); + } + + psmx3_am_poll(ep_priv->tx); + return 0; +} + +DIRECT_FN +STATIC ssize_t psmx3_atomic_readwrite(struct fid_ep *ep, + const void *buf, + size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + return psmx3_atomic_readwrite_generic(ep, buf, count, desc, + result, result_desc, dest_addr, + addr, key, datatype, op, + context, ep_priv->tx_flags); +} + +DIRECT_FN +STATIC ssize_t psmx3_atomic_readwritemsg(struct fid_ep *ep, + const struct fi_msg_atomic *msg, + struct fi_ioc *resultv, + void **result_desc, + size_t result_count, + uint64_t flags) +{ + void *buf; + size_t count; + void *desc; + + assert(msg); + assert(msg->rma_iov); + assert(msg->rma_iov_count ==1); + assert(resultv); + assert(result_count); + assert((msg->msg_iov && msg->iov_count) || msg->op == FI_ATOMIC_READ); + + if ((msg->op != FI_ATOMIC_READ && msg->iov_count > 1) || + result_count > 1) + return psmx3_atomic_readwritev_generic(ep, msg->msg_iov, msg->desc, + msg->iov_count, resultv, + result_desc, result_count, + msg->addr, + msg->rma_iov[0].addr, + msg->rma_iov[0].key, + msg->datatype, msg->op, + msg->context, flags); + + if (msg->op == FI_ATOMIC_READ) { + buf = NULL; + count = resultv[0].count; + desc = result_desc ? result_desc[0] : NULL; + } else { + buf = msg->msg_iov[0].addr; + count = msg->msg_iov[0].count; + desc = msg->desc ? msg->desc[0] : NULL; + } + + return psmx3_atomic_readwrite_generic(ep, buf, count, desc, resultv[0].addr, + result_desc ? 
result_desc[0] : NULL, + msg->addr, msg->rma_iov[0].addr, + msg->rma_iov[0].key, msg->datatype, + msg->op, msg->context, flags); +} + +DIRECT_FN +STATIC ssize_t psmx3_atomic_readwritev(struct fid_ep *ep, + const struct fi_ioc *iov, + void **desc, size_t count, + struct fi_ioc *resultv, + void **result_desc, size_t result_count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct psmx3_fid_ep *ep_priv; + void *buf; + void *src_desc; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + assert(resultv); + assert(result_count); + assert((iov && count) || op == FI_ATOMIC_READ); + + if ((op != FI_ATOMIC_READ && count > 1) || result_count > 1) + return psmx3_atomic_readwritev_generic(ep, iov, desc, count, + resultv, result_desc, result_count, + dest_addr, addr, key, datatype, op, + context, ep_priv->tx_flags); + + if (op == FI_ATOMIC_READ) { + buf = NULL; + count = resultv[0].count; + src_desc = result_desc ? result_desc[0] : NULL; + } else { + buf = iov[0].addr; + count = iov[0].count; + src_desc = desc ? desc[0] : NULL; + } + + return psmx3_atomic_readwrite_generic(ep, buf, count, src_desc, resultv[0].addr, + result_desc ? result_desc[0] : NULL, + dest_addr, addr, key, datatype, op, + context, ep_priv->tx_flags); +} + +ssize_t psmx3_atomic_compwrite_generic(struct fid_ep *ep, + const void *buf, + size_t count, void *desc, + const void *compare, void *compare_desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + struct psmx3_am_request *req; + psm2_amarg_t args[8]; + psm2_epaddr_t psm2_epaddr; + psm2_epid_t psm2_epid; + int am_flags = PSM2_AM_FLAG_ASYNC; + int chunk_size, len; + int err; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_atomic_compwrite(ep, buf, count, + desc, compare, + compare_desc, + result, result_desc, + dest_addr, addr, + key, datatype, op, + context, flags); + + assert(buf); + assert((int)datatype >= 0 && (int)datatype < FI_DATATYPE_LAST); + assert((int)op >= 0 && (int)op < FI_ATOMIC_OP_LAST); + + av = ep_priv->av; + assert(av); + + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); + psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid); + + if (psm2_epid == ep_priv->tx->psm2_epid) + return psmx3_atomic_self(PSMX3_AM_REQ_ATOMIC_COMPWRITE, ep_priv, + buf, count, desc, compare, + compare_desc, result, result_desc, + addr, key, datatype, op, + context, flags); + + chunk_size = ep_priv->tx->psm2_am_param.max_request_short; + len = ofi_datatype_size(datatype) * count; + if (len * 2 > chunk_size) + return -FI_EMSGSIZE; + + req = psmx3_am_request_alloc(ep_priv->tx); + if (!req) + return -FI_ENOMEM; + + if ((flags & FI_INJECT) || + ((uintptr_t)compare != (uintptr_t)buf + len)) { + req->tmpbuf = malloc(len * 2); + if (!req->tmpbuf) { + psmx3_am_request_free(ep_priv->tx, req); + return -FI_ENOMEM; + } + memcpy(req->tmpbuf, buf, len); + memcpy((uint8_t *)req->tmpbuf + len, compare, len); + buf = req->tmpbuf; + compare = (uint8_t *)buf + len; + } + + req->no_event = (flags & PSMX3_NO_COMPLETION) || + (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)); + + req->op = PSMX3_AM_REQ_ATOMIC_COMPWRITE; + req->atomic.buf = (void *)buf; + req->atomic.len = len; + req->atomic.addr = addr; + req->atomic.key = key; + req->atomic.context = 
context; + req->atomic.result = result; + req->atomic.datatype = datatype; + req->ep = ep_priv; + req->cq_flags = FI_WRITE | FI_ATOMIC; + + args[0].u32w0 = PSMX3_AM_REQ_ATOMIC_COMPWRITE; + args[0].u32w1 = count; + args[1].u64 = (uint64_t)(uintptr_t)req; + args[2].u64 = addr; + args[3].u64 = key; + args[4].u32w0 = datatype; + args[4].u32w1 = op; + err = psm2_am_request_short(psm2_epaddr, + PSMX3_AM_ATOMIC_HANDLER, args, 5, + (void *)buf, len * 2, am_flags, + NULL, NULL); + if (err) { + free(req->tmpbuf); + psmx3_am_request_free(ep_priv->tx, req); + return psmx3_errno(err); + } + + psmx3_am_poll(ep_priv->tx); + return 0; +} + +ssize_t psmx3_atomic_compwritev_generic(struct fid_ep *ep, + const struct fi_ioc *iov, + void **desc, size_t count, + const struct fi_ioc *comparev, + void **compare_desc, + size_t compare_count, + struct fi_ioc *resultv, + void **result_desc, + size_t result_count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + struct psmx3_am_request *req; + psm2_amarg_t args[8]; + psm2_epaddr_t psm2_epaddr; + psm2_epid_t psm2_epid; + int am_flags = PSM2_AM_FLAG_ASYNC; + int chunk_size; + size_t len, iov_size; + uint8_t *buf, *compare, *result; + void *desc0, *compare_desc0, *result_desc0; + int err; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_atomic_compwritev(ep, iov, desc, + count, comparev, + compare_desc, + compare_count, + resultv, + result_desc, + result_count, + dest_addr, addr, + key, datatype, op, + context, flags); + + assert(iov); + assert(count); + assert(comparev); + assert(compare_count); + assert(resultv); + assert(result_count); + assert((int)datatype >= 0 && (int)datatype < FI_DATATYPE_LAST); + assert((int)op >= 0 && (int)op < FI_ATOMIC_OP_LAST); + + while (count && !iov[count-1].count) + count--; + + while (compare_count && !comparev[compare_count-1].count) + compare_count--; + + while (result_count && !resultv[result_count-1].count) + result_count--; + + len = psmx3_ioc_size(iov, count, datatype); + + assert(psmx3_ioc_size(comparev, compare_count, datatype) >= len); + assert(psmx3_ioc_size(resultv, result_count, datatype) >= len); + + av = ep_priv->av; + assert(av); + + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); + psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid); + + if (psm2_epid == ep_priv->tx->psm2_epid) { + if (count > 1) { + buf = malloc(len); + if (!buf) + return -FI_ENOMEM; + psmx3_ioc_read(iov, count, datatype, buf, len); + desc0 = NULL; + } else { + buf = iov[0].addr; + desc0 = desc ? desc[0] : NULL; + } + + if (compare_count > 1) { + compare = malloc(len); + if (!compare) { + if (count > 1) + free(buf); + return -FI_ENOMEM; + } + psmx3_ioc_read(comparev, compare_count, datatype, compare, len); + compare_desc0 = NULL; + } else { + compare = comparev[0].addr; + compare_desc0 = compare_desc ? compare_desc[0] : NULL; + } + + if (result_count > 1) { + result = malloc(len); + if (!result) { + if (compare_count > 1) + free(compare); + if (count > 1) + free(buf); + return -FI_ENOMEM; + } + result_desc0 = NULL; + } else { + result = resultv[0].addr; + result_desc0 = result_desc ? 
result_desc[0] : NULL; + } + + err = psmx3_atomic_self(PSMX3_AM_REQ_ATOMIC_COMPWRITE, ep_priv, + buf, len / ofi_datatype_size(datatype), desc0, + compare, compare_desc0, result, result_desc0, + addr, key, datatype, op, context, flags); + + if (result_count > 1) { + psmx3_ioc_write(resultv, result_count, datatype, result, len); + free(result); + } + + if (compare_count > 1) + free(compare); + + if (count > 1) + free(buf); + + return err; + } + + chunk_size = ep_priv->tx->psm2_am_param.max_request_short; + if (len * 2 > chunk_size) + return -FI_EMSGSIZE; + + iov_size = result_count > 1 ? result_count * sizeof(struct fi_ioc) : 0; + + req = psmx3_am_request_alloc(ep_priv->tx); + if (!req) + return -FI_ENOMEM; + + if ((flags & FI_INJECT) || count > 1 || compare_count > 1 || + (uintptr_t)comparev[0].addr != (uintptr_t)iov[0].addr + len) { + req->tmpbuf = malloc(iov_size + len + len); + if (!req->tmpbuf) { + psmx3_am_request_free(ep_priv->tx, req); + return -FI_ENOMEM; + } + buf = (uint8_t *)req->tmpbuf + iov_size; + psmx3_ioc_read(iov, count, datatype, buf, len); + psmx3_ioc_read(comparev, compare_count, datatype, buf + len, len); + } else { + req->tmpbuf = malloc(iov_size); + if (!req->tmpbuf) { + psmx3_am_request_free(ep_priv->tx, req); + return -FI_ENOMEM; + } + buf = iov[0].addr; + } + + req->ioc = req->tmpbuf; + if (iov_size) { + memcpy(req->ioc, resultv, iov_size); + req->atomic.iov_count = result_count; + req->atomic.result = NULL; + } else { + req->atomic.buf = buf; + req->atomic.result = resultv[0].addr; + } + + req->no_event = (flags & PSMX3_NO_COMPLETION) || + (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)); + + req->op = PSMX3_AM_REQ_ATOMIC_COMPWRITE; + req->atomic.len = len; + req->atomic.addr = addr; + req->atomic.key = key; + req->atomic.context = context; + req->atomic.datatype = datatype; + req->ep = ep_priv; + req->cq_flags = FI_WRITE | FI_ATOMIC; + + args[0].u32w0 = PSMX3_AM_REQ_ATOMIC_COMPWRITE; + args[0].u32w1 = len / ofi_datatype_size(datatype); + args[1].u64 = (uint64_t)(uintptr_t)req; + args[2].u64 = addr; + args[3].u64 = key; + args[4].u32w0 = datatype; + args[4].u32w1 = op; + err = psm2_am_request_short(psm2_epaddr, + PSMX3_AM_ATOMIC_HANDLER, args, 5, + buf, len * 2, am_flags, NULL, NULL); + if (err) { + free(req->tmpbuf); + psmx3_am_request_free(ep_priv->tx, req); + return psmx3_errno(err); + } + + psmx3_am_poll(ep_priv->tx); + return 0; +} + +DIRECT_FN +STATIC ssize_t psmx3_atomic_compwrite(struct fid_ep *ep, + const void *buf, + size_t count, void *desc, + const void *compare, void *compare_desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + return psmx3_atomic_compwrite_generic(ep, buf, count, desc, + compare, compare_desc, + result, result_desc, + dest_addr, addr, key, + datatype, op, context, ep_priv->tx_flags); +} + +DIRECT_FN +STATIC ssize_t psmx3_atomic_compwritemsg(struct fid_ep *ep, + const struct fi_msg_atomic *msg, + const struct fi_ioc *comparev, + void **compare_desc, + size_t compare_count, + struct fi_ioc *resultv, + void **result_desc, + size_t result_count, + uint64_t flags) +{ + assert(msg); + assert(msg->msg_iov); + assert(msg->iov_count); + assert(msg->rma_iov); + assert(msg->rma_iov_count == 1); + assert(comparev); + assert(compare_count); + assert(resultv); + assert(result_count); + + if (msg->iov_count > 1 || compare_count > 1 || 
result_count > 1) + return psmx3_atomic_compwritev_generic(ep, msg->msg_iov, msg->desc, + msg->iov_count, comparev, + compare_desc, compare_count, + resultv, result_desc, result_count, + msg->addr, msg->rma_iov[0].addr, + msg->rma_iov[0].key, msg->datatype, + msg->op, msg->context, flags); + + return psmx3_atomic_compwrite_generic(ep, msg->msg_iov[0].addr, + msg->msg_iov[0].count, + msg->desc ? msg->desc[0] : NULL, + comparev[0].addr, + compare_desc ? compare_desc[0] : NULL, + resultv[0].addr, + result_desc ? result_desc[0] : NULL, + msg->addr, msg->rma_iov[0].addr, + msg->rma_iov[0].key, msg->datatype, + msg->op, msg->context, flags); +} + +DIRECT_FN +STATIC ssize_t psmx3_atomic_compwritev(struct fid_ep *ep, + const struct fi_ioc *iov, + void **desc, size_t count, + const struct fi_ioc *comparev, + void **compare_desc, + size_t compare_count, + struct fi_ioc *resultv, + void **result_desc, + size_t result_count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + assert(iov); + assert(count); + assert(comparev); + assert(compare_count); + assert(resultv); + assert(result_count); + + if (count > 1 || compare_count > 1 || result_count > 1) + return psmx3_atomic_compwritev_generic(ep, iov, desc, count, + comparev, compare_desc, + compare_count, resultv, + result_desc, result_count, + dest_addr, addr, key, + datatype, op, context, + ep_priv->tx_flags); + + return psmx3_atomic_compwrite_generic(ep, iov->addr, iov->count, + desc ? desc[0] : NULL, + comparev[0].addr, + compare_desc ? compare_desc[0] : NULL, + resultv[0].addr, + result_desc ? result_desc[0] : NULL, + dest_addr, addr, key, datatype, op, + context, ep_priv->tx_flags); +} + +static int psmx3_atomic_writevalid_internal(size_t chunk_size, + enum fi_datatype datatype, + enum fi_op op, size_t *count) +{ + if (datatype >= FI_DATATYPE_LAST) + return -FI_EOPNOTSUPP; + + switch (op) { + case FI_MIN: + case FI_MAX: + case FI_SUM: + case FI_PROD: + case FI_LOR: + case FI_LAND: + case FI_BOR: + case FI_BAND: + case FI_LXOR: + case FI_BXOR: + case FI_ATOMIC_WRITE: + break; + + default: + return -FI_EOPNOTSUPP; + } + + if (count) + *count = chunk_size / ofi_datatype_size(datatype); + + return 0; +} + +static int psmx3_atomic_readwritevalid_internal(size_t chunk_size, + enum fi_datatype datatype, + enum fi_op op, size_t *count) +{ + if (datatype >= FI_DATATYPE_LAST) + return -FI_EOPNOTSUPP; + + switch (op) { + case FI_MIN: + case FI_MAX: + case FI_SUM: + case FI_PROD: + case FI_LOR: + case FI_LAND: + case FI_BOR: + case FI_BAND: + case FI_LXOR: + case FI_BXOR: + case FI_ATOMIC_READ: + case FI_ATOMIC_WRITE: + break; + + default: + return -FI_EOPNOTSUPP; + } + + if (count) + *count = chunk_size / ofi_datatype_size(datatype); + + return 0; +} + +static int psmx3_atomic_compwritevalid_internal(size_t chunk_size, + enum fi_datatype datatype, + enum fi_op op, size_t *count) +{ + + if (datatype >= FI_DATATYPE_LAST) + return -FI_EOPNOTSUPP; + + switch (op) { + case FI_CSWAP: + case FI_CSWAP_NE: + break; + + case FI_CSWAP_LE: + case FI_CSWAP_LT: + case FI_CSWAP_GE: + case FI_CSWAP_GT: + if (datatype == FI_FLOAT_COMPLEX || + datatype == FI_DOUBLE_COMPLEX || + datatype == FI_LONG_DOUBLE_COMPLEX) + return -FI_EOPNOTSUPP; + break; + + case FI_MSWAP: + if (datatype == FI_FLOAT_COMPLEX || + datatype == FI_DOUBLE_COMPLEX || + datatype == FI_LONG_DOUBLE_COMPLEX || + datatype == FI_FLOAT || + datatype == 
FI_DOUBLE || + datatype == FI_LONG_DOUBLE) + return -FI_EOPNOTSUPP; + break; + + default: + return -FI_EOPNOTSUPP; + } + + if (count) + *count = chunk_size / (2 * ofi_datatype_size(datatype)); + + return 0; +} + +DIRECT_FN +STATIC int psmx3_atomic_writevalid(struct fid_ep *ep, + enum fi_datatype datatype, + enum fi_op op, size_t *count) +{ + struct psmx3_fid_ep *ep_priv; + size_t chunk_size; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + chunk_size = ep_priv->tx->psm2_am_param.max_request_short; + return psmx3_atomic_writevalid_internal(chunk_size, datatype, op, count); +} + +DIRECT_FN +STATIC int psmx3_atomic_readwritevalid(struct fid_ep *ep, + enum fi_datatype datatype, + enum fi_op op, size_t *count) +{ + struct psmx3_fid_ep *ep_priv; + size_t chunk_size; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + chunk_size = ep_priv->tx->psm2_am_param.max_request_short; + return psmx3_atomic_readwritevalid_internal(chunk_size, datatype, op, count); +} + +DIRECT_FN +STATIC int psmx3_atomic_compwritevalid(struct fid_ep *ep, + enum fi_datatype datatype, + enum fi_op op, size_t *count) +{ + struct psmx3_fid_ep *ep_priv; + size_t chunk_size; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + chunk_size = ep_priv->tx->psm2_am_param.max_request_short; + return psmx3_atomic_compwritevalid_internal(chunk_size, datatype, op, count); +} + +DIRECT_FN +int psmx3_query_atomic(struct fid_domain *domain, enum fi_datatype datatype, + enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags) +{ + struct psmx3_fid_domain *domain_priv; + size_t chunk_size; + size_t count; + int ret; + + domain_priv = container_of(domain, struct psmx3_fid_domain, util_domain.domain_fid); + chunk_size = domain_priv->max_atomic_size; + + if (flags & FI_TAGGED) + return -FI_EOPNOTSUPP; + + if (flags & FI_COMPARE_ATOMIC) { + if (flags & FI_FETCH_ATOMIC) + return -FI_EINVAL; + ret = psmx3_atomic_compwritevalid_internal(chunk_size, datatype, + op, &count); + } else if (flags & FI_FETCH_ATOMIC) { + ret = psmx3_atomic_readwritevalid_internal(chunk_size, datatype, + op, &count); + } else { + ret = psmx3_atomic_writevalid_internal(chunk_size, datatype, + op, &count); + } + + if (attr && !ret) { + attr->size = ofi_datatype_size(datatype); + attr->count = count; + } + + return ret; +} + +struct fi_ops_atomic psmx3_atomic_ops = { + .size = sizeof(struct fi_ops_atomic), + .write = psmx3_atomic_write, + .writev = psmx3_atomic_writev, + .writemsg = psmx3_atomic_writemsg, + .inject = psmx3_atomic_inject, + .readwrite = psmx3_atomic_readwrite, + .readwritev = psmx3_atomic_readwritev, + .readwritemsg = psmx3_atomic_readwritemsg, + .compwrite = psmx3_atomic_compwrite, + .compwritev = psmx3_atomic_compwritev, + .compwritemsg = psmx3_atomic_compwritemsg, + .writevalid = psmx3_atomic_writevalid, + .readwritevalid = psmx3_atomic_readwritevalid, + .compwritevalid = psmx3_atomic_compwritevalid, +}; + diff --git a/prov/psm3/src/psmx3_attr.c b/prov/psm3/src/psmx3_attr.c new file mode 100644 index 00000000000..30c673e065a --- /dev/null +++ b/prov/psm3/src/psmx3_attr.c @@ -0,0 +1,594 @@ +/* + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx3.h" + +/* + * Default provider attributes are defined for: + * + * full set of capabilities + * ep type = FI_EP_RDM + * addr format = FI_ADDR_PSMX3 + * cq_data_size = 0 + * + * This is used as a template to create actual provider info, which will + * have some fields modified for different configurations and some fields + * updated to environment settings. + */ + +static struct fi_tx_attr psmx3_tx_attr = { + .caps = PSMX3_TX_CAPS, /* PSMX3_RMA_TX_CAPS */ + .mode = FI_CONTEXT, /* 0 */ + .op_flags = PSMX3_OP_FLAGS, + .msg_order = PSMX3_MSG_ORDER, + .comp_order = PSMX3_COMP_ORDER, + .inject_size = 64, /* psmx3_env.inject_size */ + .size = UINT64_MAX, + .iov_limit = PSMX3_IOV_MAX_COUNT, + .rma_iov_limit = 1, +}; + +static struct fi_rx_attr psmx3_rx_attr = { + .caps = PSMX3_RX_CAPS, /* PSMX3_RMA_RX_CAPS */ + .mode = FI_CONTEXT, /* 0 */ + .op_flags = PSMX3_OP_FLAGS, + .msg_order = PSMX3_MSG_ORDER, + .comp_order = PSMX3_COMP_ORDER, + .total_buffered_recv = UINT64_MAX, + .size = UINT64_MAX, + .iov_limit = 1, +}; + +static struct fi_ep_attr psmx3_ep_attr = { + .type = FI_EP_RDM, /* FI_EP_DGRAM */ + .protocol = FI_PROTO_PSMX3, + .protocol_version = PSM2_VERNO, + .max_msg_size = PSMX3_MAX_MSG_SIZE & ~0x0FFF, + .msg_prefix_size = 0, + .max_order_raw_size = PSMX3_RMA_ORDER_SIZE, + .max_order_war_size = PSMX3_RMA_ORDER_SIZE, + .max_order_waw_size = PSMX3_RMA_ORDER_SIZE, + .mem_tag_format = FI_TAG_GENERIC, /* >>= 4 */ + .tx_ctx_cnt = 1, + .rx_ctx_cnt = 1, + .auth_key_size = sizeof(psm2_uuid_t), + .auth_key = NULL, +}; + +static struct fi_domain_attr psmx3_domain_attr = { + .domain = NULL, + .name = PSMX3_DOMAIN_NAME, + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_AUTO, + .data_progress = FI_PROGRESS_AUTO, + .resource_mgmt = FI_RM_ENABLED, + .av_type = FI_AV_UNSPEC, + .mr_mode = FI_MR_SCALABLE | FI_MR_BASIC, + .mr_key_size = sizeof(uint64_t), + .cq_data_size = 0, /* 4, 8 */ + .cq_cnt = 65535, + .ep_cnt = 65535, + .tx_ctx_cnt = 1, /* psmx3_hfi_info.free_trx_ctxt */ + .rx_ctx_cnt = 1, /* psmx3_hfi_info.free_trx_ctxt */ + .max_ep_tx_ctx = 1, /* psmx3_hfi_info.max_trx_ctxt */ + .max_ep_rx_ctx = 1, /* psmx3_hfi_info.max_trx_ctxt */ + .max_ep_stx_ctx = 1, /* psmx3_hfi_info.max_trx_ctxt */ + .max_ep_srx_ctx = 0, + .cntr_cnt = 65535, + .mr_iov_limit = 65535, + 
.caps = PSMX3_DOM_CAPS, + .mode = 0, + .auth_key = NULL, + .auth_key_size = sizeof(psm2_uuid_t), + .max_err_data = PSMX3_ERR_DATA_SIZE, + .mr_cnt = 65535, +}; + +static struct fi_fabric_attr psmx3_fabric_attr = { + .name = PSMX3_FABRIC_NAME, + .prov_version = OFI_VERSION_DEF_PROV, +}; + +static struct fi_info psmx3_prov_info = { + .next = NULL, + .caps = PSMX3_CAPS, /* PSMX3_RMA_CAPS */ + .mode = FI_CONTEXT, /* 0 */ + .addr_format = FI_ADDR_PSMX3, /* FI_ADDR_STR */ + .src_addrlen = sizeof(struct psmx3_ep_name), + .dest_addrlen = sizeof(struct psmx3_ep_name), + .src_addr = NULL, + .dest_addr = NULL, + .handle = NULL, + .tx_attr = &psmx3_tx_attr, + .rx_attr = &psmx3_rx_attr, + .ep_attr = &psmx3_ep_attr, + .domain_attr = &psmx3_domain_attr, + .fabric_attr = &psmx3_fabric_attr, +}; + +#ifdef HAVE_PSM3_DL +static struct fi_info *psmx3_allocinfo_internal(void) +{ + struct fi_info *info; + + info = calloc(1, sizeof(*info)); + if (!info) + return NULL; + + info->tx_attr = calloc(1, sizeof(*info->tx_attr)); + info->rx_attr = calloc(1, sizeof(*info->rx_attr)); + info->ep_attr = calloc(1, sizeof(*info->ep_attr)); + info->domain_attr = calloc(1, sizeof(*info->domain_attr)); + info->fabric_attr = calloc(1, sizeof(*info->fabric_attr)); + if (!info->tx_attr|| !info->rx_attr || !info->ep_attr || + !info->domain_attr || !info->fabric_attr) + goto err; + + return info; +err: + fi_freeinfo(info); + return NULL; +} +static struct fi_info *psmx3_dupinfo(const struct fi_info *info) +{ + struct fi_info *dup; + int ret; + + if (!info) + return psmx3_allocinfo_internal(); + + dup = mem_dup(info, sizeof(*dup)); + if (dup == NULL) { + return NULL; + } + dup->src_addr = NULL; + dup->dest_addr = NULL; + dup->tx_attr = NULL; + dup->rx_attr = NULL; + dup->ep_attr = NULL; + dup->domain_attr = NULL; + dup->fabric_attr = NULL; + dup->next = NULL; + + if (info->src_addr != NULL) { + dup->src_addr = mem_dup(info->src_addr, info->src_addrlen); + if (dup->src_addr == NULL) + goto fail; + } + if (info->dest_addr != NULL) { + dup->dest_addr = mem_dup(info->dest_addr, info->dest_addrlen); + if (dup->dest_addr == NULL) + goto fail; + } + if (info->tx_attr != NULL) { + dup->tx_attr = mem_dup(info->tx_attr, sizeof(*info->tx_attr)); + if (dup->tx_attr == NULL) + goto fail; + } + if (info->rx_attr != NULL) { + dup->rx_attr = mem_dup(info->rx_attr, sizeof(*info->rx_attr)); + if (dup->rx_attr == NULL) + goto fail; + } + if (info->ep_attr != NULL) { + dup->ep_attr = mem_dup(info->ep_attr, sizeof(*info->ep_attr)); + if (dup->ep_attr == NULL) + goto fail; + if (info->ep_attr->auth_key != NULL) { + dup->ep_attr->auth_key = + mem_dup(info->ep_attr->auth_key, + info->ep_attr->auth_key_size); + if (dup->ep_attr->auth_key == NULL) + goto fail; + } + } + if (info->domain_attr) { + dup->domain_attr = mem_dup(info->domain_attr, + sizeof(*info->domain_attr)); + if (dup->domain_attr == NULL) + goto fail; + dup->domain_attr->name = NULL; + dup->domain_attr->auth_key = NULL; + if (info->domain_attr->name != NULL) { + dup->domain_attr->name = strdup(info->domain_attr->name); + if (dup->domain_attr->name == NULL) + goto fail; + } + if (info->domain_attr->auth_key != NULL) { + dup->domain_attr->auth_key = + mem_dup(info->domain_attr->auth_key, + info->domain_attr->auth_key_size); + if (dup->domain_attr->auth_key == NULL) + goto fail; + } + } + if (info->fabric_attr) { + dup->fabric_attr = mem_dup(info->fabric_attr, + sizeof(*info->fabric_attr)); + if (dup->fabric_attr == NULL) + goto fail; + dup->fabric_attr->name = NULL; + dup->fabric_attr->prov_name = 
NULL; + if (info->fabric_attr->name != NULL) { + dup->fabric_attr->name = strdup(info->fabric_attr->name); + if (dup->fabric_attr->name == NULL) + goto fail; + } + if (info->fabric_attr->prov_name != NULL) { + dup->fabric_attr->prov_name = strdup(info->fabric_attr->prov_name); + if (dup->fabric_attr->prov_name == NULL) + goto fail; + } + } + + if (info->nic) { + ret = fi_control(&info->nic->fid, FI_DUP, &dup->nic); + if (ret && ret != -FI_ENOSYS) + goto fail; + } + + return dup; + +fail: + fi_freeinfo(dup); + return NULL; +} +#else +#define psmx3_dupinfo fi_dupinfo +#endif /* HAVE_PSM3_DL */ + +/* + * Possible provider variations: + * + * (1) FI_ADDR_PSMX3, FI_EP_RDM, tag64 (cq_data_size 0, FI_CONTEXT) + * (2) FI_ADDR_PSMX3, FI_EP_RDM, tag60 (cq_data_size 4, FI_CONTEXT) + * (3) FI_ADDR_PSMX3, FI_EP_RDM, rma (cq_data_size 8) + * (4) FI_ADDR_PSMX3, FI_EP_DGRAM, tag64 (cq_data_size 0, FI_CONTEXT) + * (5) FI_ADDR_PSMX3, FI_EP_DGRAM, tag60 (cq_data_size 4, FI_CONTEXT) + * (6) FI_ADDR_PSMX3, FI_EP_DGRAM, rma (cq_data_size 8) + * (7) FI_ADDR_STR, FI_EP_RDM, tag64 (cq_data_size 0, FI_CONTEXT) + * (8) FI_ADDR_STR, FI_EP_RDM, tag60 (cq_data_size 4, FI_CONTEXT) + * (9) FI_ADDR_STR, FI_EP_RDM, rma (cq_data_size 8) + * (10) FI_ADDR_STR, FI_EP_DGRAM, tag64 (cq_data_size 0, FI_CONTEXT) + * (11) FI_ADDR_STR, FI_EP_DGRAM, tag60 (cq_data_size 4, FI_CONTEXT) + * (12) FI_ADDR_STR, FI_EP_DGRAM, rma (cq_data_size 8) + * + * To avoid returning all 12 provider variations for an unrestricted query, + * "addr_format" and "ep_type" are checked first and a single value is set + * for each of them. As the result, at most three provider instances (tag64, + * tag60, rma) are returned. + * + * This also bypasses queries obviously unsuitable for this provider and + * avoid unnecessary initialization steps. 
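+ *
+ * For example (following the checks below): a query with
+ * hints->addr_format = FI_ADDR_PSMX3, hints->ep_attr->type = FI_EP_RDM and
+ * hints->caps = FI_TAGGED | FI_RMA skips the RMA-only instance because
+ * FI_TAGGED is requested, always includes the tag60 instance, and adds the
+ * tag64 instance only when the hints do not require a non-zero
+ * domain_attr->cq_data_size.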
+ */ + +int psmx3_init_prov_info(const struct fi_info *hints, struct fi_info **info) +{ + struct fi_fabric_attr *fabric_attr = &psmx3_fabric_attr; + struct fi_info *prov_info = &psmx3_prov_info; + struct fi_info *info_out, *info_new; + int addr_format = FI_ADDR_PSMX3; + int addr_format2 = FI_ADDR_STR; + int ep_type = FI_EP_RDM; + int ep_type2 = FI_EP_DGRAM; + + if (!hints) + goto alloc_info; + + if (hints->fabric_attr && hints->fabric_attr->name && + strcasecmp(hints->fabric_attr->name, fabric_attr->name)) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, "Unknown fabric name\n"); + FI_INFO_NAME(&psmx3_prov, fabric_attr, hints->fabric_attr); + return -FI_ENODATA; + } + + if (hints->ep_attr) { + switch (hints->ep_attr->type) { + case FI_EP_UNSPEC: + case FI_EP_RDM: + break; + case FI_EP_DGRAM: + ep_type = FI_EP_DGRAM; + break; + default: + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "Unsupported endpoint type\n"); + FI_INFO(&psmx3_prov, FI_LOG_CORE, "Supported: %s\n", + fi_tostr(&ep_type, FI_TYPE_EP_TYPE)); + FI_INFO(&psmx3_prov, FI_LOG_CORE, "Supported: %s\n", + fi_tostr(&ep_type2, FI_TYPE_EP_TYPE)); + FI_INFO(&psmx3_prov, FI_LOG_CORE, "Requested: %s\n", + fi_tostr(&hints->ep_attr->type, FI_TYPE_EP_TYPE)); + return -FI_ENODATA; + } + } + + switch (hints->addr_format) { + case FI_FORMAT_UNSPEC: + case FI_ADDR_PSMX3: + break; + case FI_ADDR_STR: + addr_format = FI_ADDR_STR; + break; + default: + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "Unsupported address format\n"); + FI_INFO(&psmx3_prov, FI_LOG_CORE, "Supported: %s\n", + fi_tostr(&addr_format, FI_TYPE_ADDR_FORMAT)); + FI_INFO(&psmx3_prov, FI_LOG_CORE, "Supported: %s\n", + fi_tostr(&addr_format2, FI_TYPE_ADDR_FORMAT)); + FI_INFO(&psmx3_prov, FI_LOG_CORE, "Requested: %s\n", + fi_tostr(&hints->addr_format, FI_TYPE_ADDR_FORMAT)); + return -FI_ENODATA; + } + + if ((hints->caps & PSMX3_CAPS) != hints->caps) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, "caps not supported\n"); + FI_INFO_CHECK(&psmx3_prov, prov_info, hints, caps, FI_TYPE_CAPS); + return -FI_ENODATA; + } + +alloc_info: + info_out = NULL; + if (!hints || !(hints->caps & (FI_TAGGED | FI_MSG))) { + info_new = psmx3_dupinfo(&psmx3_prov_info); + if (info_new) { + /* rma only, 64 bit CQ data */ + info_new->addr_format = addr_format; + info_new->ep_attr->type = ep_type; + info_new->caps = PSMX3_RMA_CAPS; + info_new->mode = 0; + info_new->tx_attr->caps = PSMX3_RMA_TX_CAPS; + info_new->tx_attr->mode = 0; + info_new->rx_attr->caps = PSMX3_RMA_RX_CAPS; + info_new->rx_attr->mode = 0; + info_new->domain_attr->cq_data_size = 8; + info_out = info_new; + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "RMA only instance included\n"); + } + } + + info_new = psmx3_dupinfo(&psmx3_prov_info); + if (info_new) { + /* 60 bit tag, 32 bit CQ data */ + info_new->addr_format = addr_format; + info_new->ep_attr->type = ep_type; + info_new->ep_attr->mem_tag_format >>= 4; + info_new->domain_attr->cq_data_size = 4; + info_new->next = info_out; + info_out = info_new; + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "TAG60 instance included\n"); + } + + if (!hints || !hints->domain_attr || + !hints->domain_attr->cq_data_size) { + info_new = psmx3_dupinfo(&psmx3_prov_info); + if (info_new) { + /* 64 bit tag, no CQ data */ + info_new->addr_format = addr_format; + info_new->ep_attr->type = ep_type; + info_new->next = info_out; + info_out = info_new; + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "TAG64 instance included\n"); + } + } + + *info = info_out; + return info_out ? 
0 : -FI_ENODATA; +} + +static void psmx3_dup_addr(int format, struct psmx3_ep_name *addr, + void **addr_out, size_t *len) +{ + if (!addr) + return; + + if (format == FI_ADDR_STR) { + *addr_out = psmx3_ep_name_to_string(addr, len); + } else { + *addr_out = mem_dup(addr, sizeof(*addr)); + *len = sizeof(*addr); + } +} + +static void psmx3_expand_default_unit(struct fi_info *info) +{ + struct fi_info *p, *next; + struct psmx3_ep_name *src_addr; + int i; + + p = info; + while (p) { + next = p->next; + src_addr = p->src_addr; + if (src_addr->unit == PSMX3_DEFAULT_UNIT) { + if (psmx3_hfi_info.num_active_units == 1) { + src_addr->unit = psmx3_hfi_info.active_units[0]; + } else { + for (i = 0; i < psmx3_hfi_info.num_active_units; i++) { + p->next = psmx3_dupinfo(p); + if (!p->next) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "Failed to duplicate info for HFI unit %d\n", + psmx3_hfi_info.active_units[i]); + break; + } + p = p->next; + src_addr = p->src_addr; + src_addr->unit = psmx3_hfi_info.active_units[i]; + } + } + } + p->next = next; + p = next; + } +} + +void psmx3_update_prov_info(struct fi_info *info, + struct psmx3_ep_name *src_addr, + struct psmx3_ep_name *dest_addr) +{ + struct fi_info *p; + + for (p = info; p; p = p->next) { + psmx3_dup_addr(p->addr_format, src_addr, + &p->src_addr, &p->src_addrlen); + psmx3_dup_addr(p->addr_format, dest_addr, + &p->dest_addr, &p->dest_addrlen); + } + + psmx3_expand_default_unit(info); + + for (p = info; p; p = p->next) { + int unit = ((struct psmx3_ep_name *)p->src_addr)->unit; + + if (unit == PSMX3_DEFAULT_UNIT || !psmx3_env.multi_ep) { + p->domain_attr->tx_ctx_cnt = psmx3_hfi_info.free_trx_ctxt; + p->domain_attr->rx_ctx_cnt = psmx3_hfi_info.free_trx_ctxt; + p->domain_attr->max_ep_tx_ctx = psmx3_hfi_info.max_trx_ctxt; + p->domain_attr->max_ep_rx_ctx = psmx3_hfi_info.max_trx_ctxt; + p->domain_attr->max_ep_stx_ctx = psmx3_hfi_info.max_trx_ctxt; + } else { + p->domain_attr->tx_ctx_cnt = psmx3_hfi_info.unit_nfreectxts[unit]; + p->domain_attr->rx_ctx_cnt = psmx3_hfi_info.unit_nfreectxts[unit]; + p->domain_attr->max_ep_tx_ctx = psmx3_hfi_info.unit_nctxts[unit]; + p->domain_attr->max_ep_rx_ctx = psmx3_hfi_info.unit_nctxts[unit]; + p->domain_attr->max_ep_stx_ctx = psmx3_hfi_info.unit_nctxts[unit]; + } + + free(p->domain_attr->name); + if (unit == PSMX3_DEFAULT_UNIT) + p->domain_attr->name = strdup(psmx3_hfi_info.default_domain_name); + else { + char unit_name[NAME_MAX]; + psm2_info_query_arg_t args[2]; + + args[0].unit = unit; + args[1].length = sizeof(unit_name); + + if (PSM2_OK != psm2_info_query(PSM2_INFO_QUERY_UNIT_NAME, + unit_name, 2, args)) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "Failed to read unit name for NIC unit %d\n", unit); + if (asprintf(&p->domain_attr->name, "UNKNOWN") < 0) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "Failed to allocate memory for unit name for NIC unit %d\n", unit); + } + } else { + if (asprintf(&p->domain_attr->name, "%s", unit_name) <0) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "Failed to allocate memory for unit name for NIC unit %d\n", unit); + } + } + } + + p->tx_attr->inject_size = psmx3_env.inject_size; + } +} + +int psmx3_check_prov_info(uint32_t api_version, + const struct fi_info *hints, + struct fi_info **info) +{ + struct util_prov util_prov = { .prov = &psmx3_prov }; + struct fi_info *next; + struct fi_info *prev = NULL; + struct fi_info *curr = *info; + struct fi_info *new_info = *info; + + while (curr) { + next = curr->next; + if (ofi_check_info(&util_prov, curr, api_version, hints)) { + if (prev) + prev->next = next; + 
else + new_info = next; + curr->next = NULL; + fi_freeinfo(curr); + } else { + prev = curr; + } + curr = next; + } + + *info = new_info; + return new_info ? 0 : -FI_ENODATA; +} + +void psmx3_alter_prov_info(uint32_t api_version, + const struct fi_info *hints, + struct fi_info *info) +{ + int cnt = 0; + int cq_data_cnt = 0; + + ofi_alter_info(info, hints, api_version); + + /* + * Some of the default values are set to simplify info + * checking. Now change them back to the preferred values. + */ + for (; info; info = info->next) { + if (!hints || !hints->domain_attr || + !hints->domain_attr->control_progress) + info->domain_attr->control_progress = + FI_PROGRESS_MANUAL; + + if (!hints || !hints->domain_attr || + !hints->domain_attr->data_progress) + info->domain_attr->data_progress = + FI_PROGRESS_MANUAL; + + if (info->domain_attr->mr_mode == (FI_MR_BASIC | FI_MR_SCALABLE)) + info->domain_attr->mr_mode = FI_MR_SCALABLE; + + /* + * Avoid automatically adding secondary caps that may negatively + * impact performance. + */ + if (hints && hints->caps && !(hints->caps & FI_TRIGGER)) + info->caps &= ~FI_TRIGGER; + + if (info->domain_attr->cq_data_size) + cq_data_cnt++; + + cnt++; + } + + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "%d instances available, %d with CQ data flag set\n", + cnt, cq_data_cnt); +} + diff --git a/prov/psm3/src/psmx3_av.c b/prov/psm3/src/psmx3_av.c new file mode 100644 index 00000000000..22374e5cd13 --- /dev/null +++ b/prov/psm3/src/psmx3_av.c @@ -0,0 +1,1194 @@ +/* + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "psmx3.h"
+
+/*
+ * SEP address query protocol:
+ *
+ * SEP Query REQ:
+ *	args[0].u32w0	cmd, version
+ *	args[0].u32w1	id
+ *	args[1].u64	sep_info
+ *	args[2].u64	status
+ *
+ * SEP Query REP:
+ *	args[0].u32w0	cmd, version
+ *	args[0].u32w1	error
+ *	args[1].u64	sep_info
+ *	args[2].u64	status
+ *	args[3].u64	n
+ *	data		epids
+ */
+
+static int psmx3_am_sep_match(struct dlist_entry *entry, const void *arg)
+{
+	struct psmx3_fid_sep *sep;
+
+	sep = container_of(entry, struct psmx3_fid_sep, entry);
+	return ((uintptr_t)sep->id == (uintptr_t)arg);
+}
+
+static void psmx3_am_sep_completion(void *buf)
+{
+	free(buf);
+}
+
+int psmx3_am_sep_handler(psm2_am_token_t token, psm2_amarg_t *args,
+			 int nargs, void *src, uint32_t len, void *hctx)
+{
+	struct psmx3_fid_domain *domain;
+	psm2_amarg_t rep_args[4];
+	int op_error = 0;
+	int err = 0;
+	int cmd, version;
+	int n, i, j;
+	uint8_t sep_id;
+	struct psmx3_fid_sep *sep;
+	struct psmx3_av_sep *sep_info;
+	ofi_atomic32_t *status;
+	psm2_epid_t *epids;
+	psm2_epid_t *buf = NULL;
+	int buflen;
+	struct dlist_entry *entry;
+	struct psmx3_trx_ctxt *trx_ctxt = hctx;
+
+	cmd = PSMX3_AM_GET_OP(args[0].u32w0);
+	version = PSMX3_AM_GET_VER(args[0].u32w0);
+	if (version != PSMX3_AM_SEP_VERSION) {
+		FI_WARN(&psmx3_prov, FI_LOG_AV,
+			"AM SEP protocol version mismatch: request %d handler %d\n",
+			version, PSMX3_AM_SEP_VERSION);
+		return -FI_EINVAL;
+	}
+
+	domain = trx_ctxt->domain;
+
+	switch (cmd) {
+	case PSMX3_AM_REQ_SEP_QUERY:
+		sep_id = args[0].u32w1;
+		domain->sep_lock_fn(&domain->sep_lock, 1);
+		entry = dlist_find_first_match(&domain->sep_list, psmx3_am_sep_match,
+					       (void *)(uintptr_t)sep_id);
+		if (!entry) {
+			op_error = PSM2_EPID_UNKNOWN;
+			n = 0;
+			buflen = 0;
+		} else {
+			sep = container_of(entry, struct psmx3_fid_sep, entry);
+			n = sep->ctxt_cnt;
+			buflen = n * sizeof(psm2_epid_t);
+			if (n) {
+				buf = malloc(buflen);
+				if (!buf) {
+					op_error = PSM2_NO_MEMORY;
+					buflen = 0;
+					n = 0;
+				}
+				for (i=0; i< n; i++)
+					buf[i] = sep->ctxts[i].trx_ctxt->psm2_epid;
+			}
+		}
+		domain->sep_unlock_fn(&domain->sep_lock, 1);
+
+		rep_args[0].u32w0 = PSMX3_AM_REP_SEP_QUERY;
+		PSMX3_AM_SET_VER(rep_args[0].u32w0, PSMX3_AM_SEP_VERSION);
+		rep_args[0].u32w1 = op_error;
+		rep_args[1].u64 = args[1].u64;
+		rep_args[2].u64 = args[2].u64;
+		rep_args[3].u64 = n;
+		err = psm2_am_reply_short(token, PSMX3_AM_SEP_HANDLER,
+					  rep_args, 4, buf, buflen, 0,
+					  psmx3_am_sep_completion, buf);
+		break;
+
+	case PSMX3_AM_REP_SEP_QUERY:
+		op_error = args[0].u32w1;
+		sep_info = (struct psmx3_av_sep *)(uintptr_t)args[1].u64;
+		status = (void *)(uintptr_t)args[2].u64;
+		if (op_error) {
+			ofi_atomic_set32(status, psmx3_errno(op_error));
+		} else {
+			n = args[3].u64;
+			epids = malloc(n * sizeof(psm2_epid_t));
+			if (!epids) {
+				ofi_atomic_set32(status, -FI_ENOMEM);
+			} else {
+				for (j=0; j<n; j++)
+					epids[j] = ((psm2_epid_t *)src)[j];
+				sep_info->ctxt_cnt = n;
+				sep_info->epids = epids;
+				ofi_atomic_set32(status, 0);
+			}
+		}
+		break;
+
+	default:
+		err = -FI_EINVAL;
+		break;
+	}
+
+	return err;
+}
+
+static void psmx3_set_epaddr_context(struct psmx3_trx_ctxt *trx_ctxt,
+				     psm2_epid_t epid, psm2_epaddr_t epaddr)
+{
+	struct psmx3_epaddr_context *context;
+	struct psmx3_epaddr_context *old_context = NULL;
+
+	context = (void *)psm2_epaddr_getctxt(epaddr);
+	if (context) {
+		if (context->trx_ctxt != trx_ctxt || context->epid != epid) {
+			FI_WARN(&psmx3_prov, FI_LOG_AV,
+				"trx_ctxt or epid doesn't match\n");
+			old_context = context;
+			context = NULL;
+		}
+	}
+
+	if (context)
+		return;
+
+	context = malloc(sizeof *context);
+	if (!context) {
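+		/*
+		 * Not fatal: without a cached context, the next
+		 * psmx3_epid_to_epaddr() call for this peer falls through to
+		 * psm2_ep_connect(), which also accepts
+		 * PSM2_EPID_ALREADY_CONNECTED.
+		 */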
FI_WARN(&psmx3_prov, FI_LOG_AV, + "cannot allocate context\n"); + return; + } + + context->trx_ctxt = trx_ctxt; + context->epid = epid; + context->epaddr = epaddr; + psm2_epaddr_setctxt(epaddr, context); + free(old_context); + + trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2); + dlist_insert_before(&context->entry, &trx_ctxt->peer_list); + trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2); +} + +void psmx3_epid_to_epaddr(struct psmx3_trx_ctxt *trx_ctxt, + psm2_epid_t epid, psm2_epaddr_t *epaddr) +{ + int err; + psm2_error_t errors; + psm2_epconn_t epconn; + struct psmx3_epaddr_context *context; + + err = psm2_ep_epid_lookup2(trx_ctxt->psm2_ep, epid, &epconn); + if (err == PSM2_OK) { + context = psm2_epaddr_getctxt(epconn.addr); + if (context && context->epid == epid) { + *epaddr = epconn.addr; + return; + } + } + + err = psm2_ep_connect(trx_ctxt->psm2_ep, 1, &epid, NULL, &errors, epaddr, + (int64_t) psmx3_env.conn_timeout * 1000000000LL); + if (err == PSM2_OK || err == PSM2_EPID_ALREADY_CONNECTED) { + psmx3_set_epaddr_context(trx_ctxt, epid, *epaddr); + return; + } + + /* call fi_log() directly to always generate the output */ + if (err == PSM2_TIMEOUT) + fi_log(&psmx3_prov, FI_LOG_WARN, FI_LOG_AV, __func__, __LINE__, + "psm2_ep_connect returned error %s, remote epid=%lx." + "Try setting FI_PSM3_CONN_TIMEOUT " + "to a larger value (current: %d seconds).\n", + psm2_error_get_string(err), epid, psmx3_env.conn_timeout); + else + fi_log(&psmx3_prov, FI_LOG_WARN, FI_LOG_AV, __func__, __LINE__, + "psm2_ep_connect returned error %s, remote epid=%lx.\n", + psm2_error_get_string(err), epid); + + abort(); +} + +/* + * Must be called with av->lock held + */ +static int psmx3_av_check_space(struct psmx3_fid_av *av, size_t count) +{ + psm2_epaddr_t *new_epaddrs; + psm2_epaddr_t **new_sepaddrs; + struct psmx3_av_hdr *new_hdr; + struct psmx3_av_sep *new_sep_info; + size_t new_count; + size_t old_table_size, new_table_size; + int i; + + new_count = av->count; + while (new_count < av->hdr->last + count) + new_count = new_count * 2; + + if ((new_count <= av->count) && av->table) + return 0; + + old_table_size = PSMX3_AV_TABLE_SIZE(av->count, av->shared); + new_table_size = PSMX3_AV_TABLE_SIZE(new_count, av->shared); + if (av->shared) { + new_hdr = mremap(av->hdr, old_table_size, new_table_size, 0); + if (new_hdr == MAP_FAILED) + return -FI_ENOMEM; + av->hdr = new_hdr; + av->map = (fi_addr_t *)(av->hdr + 1); + av->table = (struct psmx3_av_addr *)(av->map + new_count); + for (i = 0; i < new_count; i++) + av->map[i] = i; + } else { + new_hdr = realloc(av->hdr, new_table_size); + if (!new_hdr) + return -FI_ENOMEM; + av->hdr = new_hdr; + av->table = (struct psmx3_av_addr *)(av->hdr + 1); + } + + new_sep_info = realloc(av->sep_info, new_count * sizeof(*new_sep_info)); + if (!new_sep_info) + return -FI_ENOMEM; + av->sep_info = new_sep_info; + + for (i = 0; i < av->max_trx_ctxt; i++) { + if (!av->conn_info[i].trx_ctxt) + continue; + + new_epaddrs = realloc(av->conn_info[i].epaddrs, + new_count * sizeof(*new_epaddrs)); + if (!new_epaddrs) + return -FI_ENOMEM; + memset(new_epaddrs + av->hdr->last, 0, + (new_count - av->hdr->last) * sizeof(*new_epaddrs)); + av->conn_info[i].epaddrs = new_epaddrs; + + new_sepaddrs = realloc(av->conn_info[i].sepaddrs, + new_count * sizeof(*new_sepaddrs)); + if (!new_sepaddrs) + return -FI_ENOMEM; + memset(new_sepaddrs + av->hdr->last, 0, + (new_count - av->hdr->last) * sizeof(*new_sepaddrs)); + av->conn_info[i].sepaddrs = new_sepaddrs; + } + + av->count = av->hdr->size = 
new_count; + return 0; +} + +static void psmx3_av_post_completion(struct psmx3_fid_av *av, void *context, + uint64_t data, int prov_errno) +{ + if (prov_errno) { + struct fi_eq_err_entry entry; + entry.fid = &av->av.fid; + entry.context = context; + entry.data = data; + entry.err = -psmx3_errno(prov_errno); + entry.prov_errno = prov_errno; + entry.err_data = NULL; + entry.err_data_size = 0; + fi_eq_write(av->eq, FI_AV_COMPLETE, &entry, sizeof(entry), + UTIL_FLAG_ERROR); + } else { + struct fi_eq_entry entry; + entry.fid = &av->av.fid; + entry.context = context; + entry.data = data; + fi_eq_write(av->eq, FI_AV_COMPLETE, &entry, sizeof(entry), 0); + } +} + +/* + * Must be called with av->lock held + */ +int psmx3_av_query_sep(struct psmx3_fid_av *av, + struct psmx3_trx_ctxt *trx_ctxt, + size_t idx) +{ + ofi_atomic32_t status; /* 1: pending, 0: succ, <0: error */ + psm2_amarg_t args[3]; + int error; + + if (!av->conn_info[trx_ctxt->id].epaddrs[idx]) + psmx3_epid_to_epaddr(trx_ctxt, av->table[idx].epid, + &av->conn_info[trx_ctxt->id].epaddrs[idx]); + + psmx3_am_init(trx_ctxt); /* check AM handler installation */ + + ofi_atomic_initialize32(&status, 1); + + args[0].u32w0 = PSMX3_AM_REQ_SEP_QUERY; + PSMX3_AM_SET_VER(args[0].u32w0, PSMX3_AM_SEP_VERSION); + args[0].u32w1 = av->table[idx].sep_id; + args[1].u64 = (uint64_t)(uintptr_t)&av->sep_info[idx]; + args[2].u64 = (uint64_t)(uintptr_t)&status; + error = psm2_am_request_short(av->conn_info[trx_ctxt->id].epaddrs[idx], + PSMX3_AM_SEP_HANDLER, args, 3, NULL, + 0, 0, NULL, NULL); + + if (error) + return error; + + /* + * make sure AM is progressed promptly. don't call + * psmx3_progress() which may call functions that + * need to access the address vector. + */ + while (ofi_atomic_get32(&status) == 1) + psm2_poll(trx_ctxt->psm2_ep); + + error = (int)(int32_t)ofi_atomic_get32(&status); + + return error; +} + +int psmx3_av_add_trx_ctxt(struct psmx3_fid_av *av, + struct psmx3_trx_ctxt *trx_ctxt) +{ + int id; + int err = 0; + + av->domain->av_lock_fn(&av->lock, 1); + + if (av->type == FI_AV_MAP) { + av->av_map_trx_ctxt = trx_ctxt; + goto out; + } + + id = trx_ctxt->id; + if (id >= av->max_trx_ctxt) { + FI_WARN(&psmx3_prov, FI_LOG_AV, + "trx_ctxt->id(%d) exceeds av->max_trx_ctxt(%d).\n", + id, av->max_trx_ctxt); + err = -FI_EINVAL; + goto out; + } + + if (av->conn_info[id].trx_ctxt) { + if (av->conn_info[id].trx_ctxt == trx_ctxt) { + FI_INFO(&psmx3_prov, FI_LOG_AV, + "trx_ctxt(%p) with id(%d) already added.\n", + trx_ctxt, id); + goto out; + } else { + FI_INFO(&psmx3_prov, FI_LOG_AV, + "different trx_ctxt(%p) with same id(%d) already added.\n", + trx_ctxt, id); + err = -FI_EINVAL; + goto out; + } + } + + av->conn_info[id].epaddrs = (psm2_epaddr_t *) calloc(av->count, + sizeof(psm2_epaddr_t)); + if (!av->conn_info[id].epaddrs) { + err = -FI_ENOMEM; + goto out; + } + + av->conn_info[id].sepaddrs = (psm2_epaddr_t **)calloc(av->count, + sizeof(psm2_epaddr_t *)); + if (!av->conn_info[id].sepaddrs) { + err = -FI_ENOMEM; + goto out; + } + + av->conn_info[id].trx_ctxt = trx_ctxt; + +out: + av->domain->av_unlock_fn(&av->lock, 1); + return err; +} + +DIRECT_FN +STATIC int psmx3_av_insert(struct fid_av *av, const void *addr, + size_t count, fi_addr_t *fi_addr, + uint64_t flags, void *context) +{ + struct psmx3_fid_av *av_priv; + struct psmx3_ep_name *ep_name; + const struct psmx3_ep_name *names = addr; + const char **string_names = (void *)addr; + psm2_error_t *errors = NULL; + int error_count = 0; + int i, idx, ret; + + assert(addr || !count); + + av_priv = 
container_of(av, struct psmx3_fid_av, av); + + av_priv->domain->av_lock_fn(&av_priv->lock, 1); + + if ((av_priv->flags & FI_EVENT) && !av_priv->eq) { + ret = -FI_ENOEQ; + goto out; + } + + if (av_priv->flags & FI_READ) { + ret = -FI_EINVAL; + goto out; + } + + if (psmx3_av_check_space(av_priv, count)) { + ret = -FI_ENOMEM; + goto out; + } + + errors = calloc(count, sizeof(*errors)); + if (!errors) { + ret = -FI_ENOMEM; + goto out; + } + + /* save the peer address information */ + for (i = 0; i < count; i++) { + idx = av_priv->hdr->last + i; + if (av_priv->addr_format == FI_ADDR_STR) { + ep_name = psmx3_string_to_ep_name(string_names[i]); + if (!ep_name) { + ret = -FI_EINVAL; + goto out; + } + av_priv->table[idx].type = ep_name->type; + av_priv->table[idx].epid = ep_name->epid; + av_priv->table[idx].sep_id = ep_name->sep_id; + av_priv->table[idx].valid = 1; + free(ep_name); + } else { + av_priv->table[idx].type = names[i].type; + av_priv->table[idx].epid = names[i].epid; + av_priv->table[idx].sep_id = names[i].sep_id; + av_priv->table[idx].valid = 1; + } + av_priv->sep_info[idx].ctxt_cnt = 1; + av_priv->sep_info[idx].epids = NULL; + } + + if (fi_addr) { + for (i = 0; i < count; i++) { + idx = av_priv->hdr->last + i; + if (errors[i] != PSM2_OK) + fi_addr[i] = FI_ADDR_NOTAVAIL; + else + fi_addr[i] = idx; + } + } + + av_priv->hdr->last += count; + + if (av_priv->flags & FI_EVENT) { + if (error_count) { + for (i = 0; i < count; i++) + psmx3_av_post_completion(av_priv, context, i, errors[i]); + } + psmx3_av_post_completion(av_priv, context, count - error_count, 0); + ret = 0; + } else { + if (flags & FI_SYNC_ERR) { + int *fi_errors = context; + for (i=0; idomain->av_unlock_fn(&av_priv->lock, 1); + return ret; +} + +DIRECT_FN +STATIC int psmx3_av_map_insert(struct fid_av *av, const void *addr, + size_t count, fi_addr_t *fi_addr, + uint64_t flags, void *context) +{ + struct psmx3_fid_av *av_priv; + struct psmx3_trx_ctxt *trx_ctxt; + struct psmx3_ep_name *ep_name; + const struct psmx3_ep_name *names = addr; + const char **string_names = (void *)addr; + psm2_epid_t *epids = NULL; + psm2_epaddr_t *epaddrs = NULL; + psm2_error_t *errors = NULL; + int error_count = 0; + int i, ret, err = 0; + + assert(addr || !count); + + av_priv = container_of(av, struct psmx3_fid_av, av); + + av_priv->domain->av_lock_fn(&av_priv->lock, 1); + + if (!count) + goto out; + + epids = calloc(count, sizeof(*epids)); + errors = calloc(count, sizeof(*errors)); + if (!epids || !errors) { + err = -FI_ENOMEM; + goto out; + } + + for (i=0; iaddr_format == FI_ADDR_STR) { + ep_name = psmx3_string_to_ep_name(string_names[i]); + if (!ep_name) { + err = -FI_EINVAL; + goto out; + } + epids[i] = ep_name->epid; + free(ep_name); + } else { + epids[i] = names[i].epid; + } + } + + epaddrs = (psm2_epaddr_t *)fi_addr; + + trx_ctxt = av_priv->av_map_trx_ctxt; + if (!trx_ctxt) { + FI_WARN(&psmx3_prov, FI_LOG_AV, + "unable to map address without AV-EP binding\n"); + err = -FI_ENODEV; + goto out; + } + + psm2_ep_connect(trx_ctxt->psm2_ep, count, epids, NULL, errors, epaddrs, + (int64_t) psmx3_env.conn_timeout * count * 1000000000LL); + + for (i=0; iflags & FI_EVENT) { + if (!err) { + if (error_count) { + for (i = 0; i < count; i++) + psmx3_av_post_completion(av_priv, context, i, errors[i]); + } + psmx3_av_post_completion(av_priv, context, count - error_count, 0); + } + ret = err; + } else { + if (flags & FI_SYNC_ERR) { + int *fi_errors = context; + for (i=0; idomain->av_unlock_fn(&av_priv->lock, 1); + + return ret; +} + +static int 
psmx3_av_disconnect_addr(int trx_ctxt_id, psm2_epid_t epid, + psm2_epaddr_t epaddr) +{ + struct psmx3_epaddr_context *epaddr_context; + struct psmx3_trx_ctxt *trx_ctxt; + psm2_error_t errors; + int err; + + if (!epaddr) + return 0; + + FI_INFO(&psmx3_prov, FI_LOG_AV, + "trx_ctxt_id %d epid %lx epaddr %p\n", trx_ctxt_id, epid, epaddr); + + epaddr_context = psm2_epaddr_getctxt(epaddr); + if (!epaddr_context) + return -FI_EINVAL; + + trx_ctxt = epaddr_context->trx_ctxt; + if (trx_ctxt_id != trx_ctxt->id) + return -FI_EINVAL; + + if (epid != epaddr_context->epid) + return -FI_EINVAL; + + trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2); + dlist_remove_first_match(&trx_ctxt->peer_list, + psmx3_peer_match, epaddr); + trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2); + + psm2_epaddr_setctxt(epaddr, NULL); + + err = psm2_ep_disconnect2(trx_ctxt->psm2_ep, 1, &epaddr, + NULL, &errors, PSM2_EP_DISCONNECT_FORCE, 0); + + free(epaddr_context); + return psmx3_errno(err); +} + +DIRECT_FN +STATIC int psmx3_av_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, + uint64_t flags) +{ + struct psmx3_fid_av *av_priv; + int idx, i, j, k; + int err; + + av_priv = container_of(av, struct psmx3_fid_av, av); + + av_priv->domain->av_lock_fn(&av_priv->lock, 1); + + for (i = 0; i < count; i++) { + idx = PSMX3_ADDR_IDX(fi_addr[i]); + if (idx >= av_priv->hdr->last) { + FI_WARN(&psmx3_prov, FI_LOG_AV, + "AV index out of range: fi_addr %lx idx %d last %ld\n", + fi_addr[i], idx, av_priv->hdr->last); + continue; + } + + if (av_priv->table[idx].type == PSMX3_EP_REGULAR) { + for (j = 0; j < av_priv->max_trx_ctxt; j++) { + if (!av_priv->conn_info[j].trx_ctxt) + continue; + + err = psmx3_av_disconnect_addr( + j, av_priv->table[idx].epid, + av_priv->conn_info[j].epaddrs[idx]); + if (!err) + av_priv->conn_info[j].epaddrs[idx] = NULL; + } + av_priv->table[idx].epid = 0; + } else { + if (!av_priv->sep_info[idx].epids) + continue; + + for (j = 0; j < av_priv->max_trx_ctxt; j++) { + if (!av_priv->conn_info[j].trx_ctxt) + continue; + + if (!av_priv->conn_info[j].sepaddrs[idx]) + continue; + + for (k = 0; k < av_priv->sep_info[idx].ctxt_cnt; k++) { + err = psmx3_av_disconnect_addr( + j, av_priv->sep_info[idx].epids[k], + av_priv->conn_info[j].sepaddrs[idx][k]); + if (!err) + av_priv->conn_info[j].sepaddrs[idx][k] = NULL; + } + } + free(av_priv->sep_info[idx].epids); + av_priv->sep_info[idx].epids = NULL; + } + av_priv->table[idx].valid = 0; + } + + av_priv->domain->av_unlock_fn(&av_priv->lock, 1); + + return 0; +} + +DIRECT_FN +STATIC int psmx3_av_map_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, + uint64_t flags) +{ + struct psmx3_fid_av *av_priv; + struct psmx3_trx_ctxt *trx_ctxt; + psm2_error_t *errors; + int i; + + av_priv = container_of(av, struct psmx3_fid_av, av); + + if (!count) + return 0; + + trx_ctxt = av_priv->av_map_trx_ctxt; + if (!trx_ctxt) + return -FI_ENODEV; + + errors = calloc(count, sizeof(*errors)); + if (!errors) + return -FI_ENOMEM; + + trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2); + for (i=0; ipeer_list, + psmx3_peer_match, + (psm2_epaddr_t)(fi_addr[i])); + } + trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2); + + for (i=0; ipsm2_ep, count, (psm2_epaddr_t *)fi_addr, + NULL, errors, PSM2_EP_DISCONNECT_FORCE, 0); + + free(errors); + return 0; +} + +DIRECT_FN +STATIC int psmx3_av_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, + size_t *addrlen) +{ + struct psmx3_fid_av *av_priv; + struct psmx3_ep_name name; + int idx = PSMX3_ADDR_IDX(fi_addr); + int 
err = 0; + + assert(addr); + assert(addrlen); + + av_priv = container_of(av, struct psmx3_fid_av, av); + + memset(&name, 0, sizeof(name)); + + av_priv->domain->av_lock_fn(&av_priv->lock, 1); + + if (idx >= av_priv->hdr->last) { + err = -FI_EINVAL; + goto out; + } + + if (!av_priv->table[idx].valid) { + err = -FI_EINVAL; + goto out; + } + + name.type = av_priv->table[idx].type; + name.epid = av_priv->table[idx].epid; + name.sep_id = av_priv->table[idx].sep_id; + + if (av_priv->addr_format == FI_ADDR_STR) { + ofi_straddr(addr, addrlen, FI_ADDR_PSMX3, &name); + } else { + memcpy(addr, &name, MIN(*addrlen, sizeof(name))); + *addrlen = sizeof(name); + } + +out: + av_priv->domain->av_unlock_fn(&av_priv->lock, 1); + return err; +} + +DIRECT_FN +STATIC int psmx3_av_map_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, + size_t *addrlen) +{ + struct psmx3_fid_av *av_priv; + struct psmx3_ep_name name; + + assert(addr); + assert(addrlen); + + av_priv = container_of(av, struct psmx3_fid_av, av); + + memset(&name, 0, sizeof(name)); + psm2_epaddr_to_epid((psm2_epaddr_t)fi_addr, &name.epid); + name.type = PSMX3_EP_REGULAR; + + if (av_priv->addr_format == FI_ADDR_STR) { + ofi_straddr(addr, addrlen, FI_ADDR_PSMX3, &name); + } else { + memcpy(addr, &name, MIN(*addrlen, sizeof(name))); + *addrlen = sizeof(name); + } + + return 0; +} + +fi_addr_t psmx3_av_translate_source(struct psmx3_fid_av *av, + psm2_epaddr_t source, int source_sep_id) +{ + psm2_epid_t epid; + fi_addr_t ret; + int i, j, found; + int ep_type = source_sep_id ? PSMX3_EP_SCALABLE : PSMX3_EP_REGULAR; + + if (av->type == FI_AV_MAP) + return (fi_addr_t) source; + + psm2_epaddr_to_epid(source, &epid); + + av->domain->av_lock_fn(&av->lock, 1); + + ret = FI_ADDR_NOTAVAIL; + found = 0; + for (i = av->hdr->last - 1; i >= 0 && !found; i--) { + if (!av->table[i].valid) + continue; + + if (av->table[i].type == PSMX3_EP_REGULAR) { + if (ep_type == PSMX3_EP_SCALABLE) + continue; + if (av->table[i].epid == epid) { + ret = (fi_addr_t)i; + found = 1; + } + } else { + /* + * scalable endpoint must match sep_id exactly. + * regular endpoint can match a context of any + * scalable endpoint. 
+			 */
+			if (ep_type == PSMX3_EP_SCALABLE &&
+			    av->table[i].sep_id != source_sep_id)
+				continue;
+
+			if (!av->sep_info[i].epids) {
+				for (j = 0; j < av->max_trx_ctxt; j++) {
+					if (av->conn_info[j].trx_ctxt)
+						break;
+				}
+				if (j >= av->max_trx_ctxt)
+					continue;
+				psmx3_av_query_sep(av, av->conn_info[j].trx_ctxt, i);
+				if (!av->sep_info[i].epids)
+					continue;
+			}
+
+			for (j = 0; j < av->sep_info[i].ctxt_cnt; j++) {
+				if (av->sep_info[i].epids[j] == epid) {
+					ret = fi_rx_addr((fi_addr_t)i, j,
+							 av->rx_ctx_bits);
+					found = 1;
+					break;
+				}
+			}
+		}
+	}
+
+	av->domain->av_unlock_fn(&av->lock, 1);
+	return ret;
+}
+
+void psmx3_av_remove_conn(struct psmx3_fid_av *av,
+			  struct psmx3_trx_ctxt *trx_ctxt,
+			  psm2_epaddr_t epaddr)
+{
+	psm2_epid_t epid;
+	int i, j;
+
+	if (av->type == FI_AV_MAP)
+		return;
+
+	psm2_epaddr_to_epid(epaddr, &epid);
+
+	av->domain->av_lock_fn(&av->lock, 1);
+
+	for (i = 0; i < av->hdr->last; i++) {
+		if (!av->table[i].valid)
+			continue;
+		if (av->table[i].type == PSMX3_EP_REGULAR) {
+			if (av->table[i].epid == epid &&
+			    av->conn_info[trx_ctxt->id].epaddrs[i] == epaddr)
+				av->conn_info[trx_ctxt->id].epaddrs[i] = NULL;
+		} else {
+			if (!av->sep_info[i].epids)
+				continue;
+			for (j = 0; j < av->sep_info[i].ctxt_cnt; j++) {
+				if (av->sep_info[i].epids[j] == epid &&
+				    av->conn_info[trx_ctxt->id].sepaddrs[i] &&
+				    av->conn_info[trx_ctxt->id].sepaddrs[i][j] == epaddr)
+					av->conn_info[trx_ctxt->id].sepaddrs[i][j] = NULL;
+			}
+		}
+	}
+
+	av->domain->av_unlock_fn(&av->lock, 1);
+}
+
+DIRECT_FN
+STATIC const char *psmx3_av_straddr(struct fid_av *av, const void *addr,
+				    char *buf, size_t *len)
+{
+	return ofi_straddr(buf, len, FI_ADDR_PSMX3, addr);
+}
+
+static int psmx3_av_close(fid_t fid)
+{
+	struct psmx3_fid_av *av;
+	int i, j;
+	int err;
+
+	av = container_of(fid, struct psmx3_fid_av, av.fid);
+	psmx3_domain_release(av->domain);
+	fastlock_destroy(&av->lock);
+
+	if (av->type == FI_AV_MAP)
+		goto out;
+
+	for (i = 0; i < av->max_trx_ctxt; i++) {
+		if (!av->conn_info[i].trx_ctxt)
+			continue;
+		free(av->conn_info[i].epaddrs);
+		if (av->conn_info[i].sepaddrs) {
+			for (j = 0; j < av->hdr->last; j++)
+				free(av->conn_info[i].sepaddrs[j]);
+		}
+		free(av->conn_info[i].sepaddrs);
+	}
+	if (av->shared) {
+		err = ofi_shm_unmap(&av->shm);
+		if (err)
+			FI_INFO(&psmx3_prov, FI_LOG_AV,
+				"Failed to unmap shared AV: %s.\n",
+				strerror(ofi_syserr()));
+	} else {
+		free(av->hdr);
+	}
+
+	free(av->sep_info);
+out:
+	free(av);
+	return 0;
+}
+
+DIRECT_FN
+STATIC int psmx3_av_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
+{
+	struct psmx3_fid_av *av;
+
+	av = container_of(fid, struct psmx3_fid_av, av.fid);
+
+	assert(bfid);
+
+	switch (bfid->fclass) {
+	case FI_CLASS_EQ:
+		av->eq = (struct fid_eq *)bfid;
+		break;
+
+	default:
+		return -FI_ENOSYS;
+	}
+
+	return 0;
+}
+
+static struct fi_ops psmx3_fi_ops = {
+	.size = sizeof(struct fi_ops),
+	.close = psmx3_av_close,
+	.bind = psmx3_av_bind,
+	.control = fi_no_control,
+	.ops_open = fi_no_ops_open,
+};
+
+static struct fi_ops_av psmx3_av_ops = {
+	.size = sizeof(struct fi_ops_av),
+	.insert = psmx3_av_insert,
+	.insertsvc = fi_no_av_insertsvc,
+	.insertsym = fi_no_av_insertsym,
+	.remove = psmx3_av_remove,
+	.lookup = psmx3_av_lookup,
+	.straddr = psmx3_av_straddr,
+};
+
+static struct fi_ops_av psmx3_av_map_ops = {
+	.size = sizeof(struct fi_ops_av),
+	.insert = psmx3_av_map_insert,
+	.insertsvc = fi_no_av_insertsvc,
+	.insertsym = fi_no_av_insertsym,
+	.remove = psmx3_av_map_remove,
+	.lookup = psmx3_av_map_lookup,
+	.straddr = psmx3_av_straddr,
+};
+
+DIRECT_FN
+int psmx3_av_open(struct fid_domain *domain, struct fi_av_attr *attr, + struct fid_av **av, void *context) +{ + struct psmx3_fid_domain *domain_priv; + struct psmx3_fid_av *av_priv; + size_t count = PSMX3_AV_DEFAULT_SIZE; + uint64_t flags = 0; + int shared = 0; + int rx_ctx_bits = PSMX3_MAX_RX_CTX_BITS; + size_t conn_size; + size_t table_size; + int av_type = FI_AV_TABLE; + int err; + int i; + + domain_priv = container_of(domain, struct psmx3_fid_domain, + util_domain.domain_fid); + + if (attr) { + if (attr->type == FI_AV_MAP) { + if (psmx3_env.multi_ep) { + FI_INFO(&psmx3_prov, FI_LOG_AV, + "FI_AV_MAP asked, but force FI_AV_TABLE for multi-EP support\n"); + } else if (psmx3_env.lazy_conn) { + FI_INFO(&psmx3_prov, FI_LOG_AV, + "FI_AV_MAP asked, but force FI_AV_TABLE for lazy connection\n"); + } else if (attr->name) { + FI_INFO(&psmx3_prov, FI_LOG_AV, + "FI_AV_MAP asked, but force FI_AV_TABLE for shared AV\n"); + } else { + FI_INFO(&psmx3_prov, FI_LOG_AV, + "FI_AV_MAP asked, and granted\n"); + av_type = FI_AV_MAP; + } + } + + if (attr->count) + count = attr->count; + + if (attr->name) + shared = 1; + + flags = attr->flags; + if (flags & FI_SYMMETRIC) { + FI_INFO(&psmx3_prov, FI_LOG_AV, + "FI_SYMMETRIC flags is no supported\n"); + return -FI_ENOSYS; + } + + if (attr->rx_ctx_bits > PSMX3_MAX_RX_CTX_BITS) { + FI_INFO(&psmx3_prov, FI_LOG_AV, + "attr->rx_ctx_bits=%d, maximum allowed is %d\n", + attr->rx_ctx_bits, PSMX3_MAX_RX_CTX_BITS); + return -FI_ENOSYS; + } + + rx_ctx_bits = attr->rx_ctx_bits; + } + + if (av_type == FI_AV_MAP) + conn_size = 0; + else + conn_size = psmx3_hfi_info.max_trx_ctxt * sizeof(struct psmx3_av_conn); + + av_priv = (struct psmx3_fid_av *) calloc(1, sizeof(*av_priv) + conn_size); + if (!av_priv) + return -FI_ENOMEM; + + if (av_type == FI_AV_MAP) + goto init_lock; + + av_priv->sep_info = calloc(count, sizeof(struct psmx3_av_sep)); + if (!av_priv->sep_info) { + err = -FI_ENOMEM; + goto errout_free; + } + + table_size = PSMX3_AV_TABLE_SIZE(count, shared); + if (attr && attr->name) { + err = ofi_shm_map(&av_priv->shm, attr->name, table_size, + flags & FI_READ, (void**)&av_priv->hdr); + if (err || av_priv->hdr == MAP_FAILED) { + FI_WARN(&psmx3_prov, FI_LOG_AV, + "failed to map shared AV: %s\n", attr->name); + err = -FI_EINVAL; + goto errout_free; + } + + if (flags & FI_READ) { + if (av_priv->hdr->size != count) { + FI_WARN(&psmx3_prov, FI_LOG_AV, + "AV size doesn't match: shared %ld, asking %ld\n", + av_priv->hdr->size, count); + err = -FI_EINVAL; + goto errout_free; + } + } else { + av_priv->hdr->size = count; + av_priv->hdr->last = 0; + } + av_priv->shared = 1; + av_priv->map = (fi_addr_t *)(av_priv->hdr + 1); + av_priv->table = (struct psmx3_av_addr *)(av_priv->map + count); + for (i = 0; i < count; i++) + av_priv->map[i] = i; + } else { + av_priv->hdr = calloc(1, table_size); + if (!av_priv->hdr) { + err = -FI_ENOMEM; + goto errout_free; + } + av_priv->hdr->size = count; + av_priv->table = (struct psmx3_av_addr *)(av_priv->hdr + 1); + } + +init_lock: + fastlock_init(&av_priv->lock); + + psmx3_domain_acquire(domain_priv); + + av_priv->domain = domain_priv; + av_priv->addrlen = sizeof(psm2_epaddr_t); + av_priv->count = count; + av_priv->flags = flags; + av_priv->rx_ctx_bits = rx_ctx_bits; + av_priv->max_trx_ctxt = psmx3_hfi_info.max_trx_ctxt; + av_priv->addr_format = domain_priv->addr_format; + av_priv->type = av_type; + + av_priv->av.fid.fclass = FI_CLASS_AV; + av_priv->av.fid.context = context; + av_priv->av.fid.ops = &psmx3_fi_ops; + if (av_type == FI_AV_MAP) + 
av_priv->av.ops = &psmx3_av_map_ops; + else + av_priv->av.ops = &psmx3_av_ops; + + *av = &av_priv->av; + if (attr) { + attr->type = av_type; + if (shared) + attr->map_addr = av_priv->map; + } + + return 0; + +errout_free: + free(av_priv->sep_info); + free(av_priv); + return err; +} + diff --git a/prov/psm3/src/psmx3_cm.c b/prov/psm3/src/psmx3_cm.c new file mode 100644 index 00000000000..c5e2d3d40c1 --- /dev/null +++ b/prov/psm3/src/psmx3_cm.c @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx3.h" + +DIRECT_FN +STATIC int psmx3_cm_getname(fid_t fid, void *addr, size_t *addrlen) +{ + struct psmx3_fid_ep *ep; + struct psmx3_fid_sep *sep; + struct psmx3_ep_name epname; + size_t addr_size; + int err = 0; + + ep = container_of(fid, struct psmx3_fid_ep, ep.fid); + if (!ep->domain) + return -FI_EBADF; + + memset(&epname, 0, sizeof(epname)); + + if (ep->type == PSMX3_EP_REGULAR) { + epname.epid = ep->rx ? ep->rx->psm2_epid : 0; + epname.type = ep->type; + } else { + sep = (struct psmx3_fid_sep *)ep; + epname.epid = sep->ctxts[0].trx_ctxt->psm2_epid; + epname.sep_id = sep->id; + epname.type = sep->type; + } + + if (ep->domain->addr_format == FI_ADDR_STR) { + addr_size = *addrlen; + ofi_straddr(addr, &addr_size, FI_ADDR_PSMX3, &epname); + } else { + addr_size = sizeof(epname); + memcpy(addr, &epname, MIN(*addrlen, addr_size)); + } + + if (*addrlen < addr_size) + err = -FI_ETOOSMALL; + + *addrlen = addr_size; + return err; +} + +struct fi_ops_cm psmx3_cm_ops = { + .size = sizeof(struct fi_ops_cm), + .setname = fi_no_setname, + .getname = psmx3_cm_getname, + .getpeer = fi_no_getpeer, + .connect = fi_no_connect, + .listen = fi_no_listen, + .accept = fi_no_accept, + .reject = fi_no_reject, + .shutdown = fi_no_shutdown, + .join = fi_no_join, +}; + diff --git a/prov/psm3/src/psmx3_cntr.c b/prov/psm3/src/psmx3_cntr.c new file mode 100644 index 00000000000..a1a92d9bb4b --- /dev/null +++ b/prov/psm3/src/psmx3_cntr.c @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx3.h" +#include "psmx3_trigger.h" + +void psmx3_cntr_check_trigger(struct psmx3_fid_cntr *cntr) +{ + struct psmx3_trigger *trigger; + struct psmx3_trx_ctxt *trx_ctxt; + struct psmx3_fid_ep *ep; + + if (!cntr->trigger) + return; + + cntr->domain->trigger_lock_fn(&cntr->trigger_lock, 2); + + trigger = cntr->trigger; + while (trigger) { + if (ofi_atomic_get64(&cntr->counter) < trigger->threshold) + break; + + cntr->trigger = trigger->next; + + /* 'ep' is the first field of the union regardless of the op type */ + ep = container_of(trigger->send.ep, struct psmx3_fid_ep, ep); + + switch (trigger->op) { + case PSMX3_TRIGGERED_RECV: + case PSMX3_TRIGGERED_TRECV: + trx_ctxt = ep->rx; + break; + default: + trx_ctxt = ep->tx; + break; + } + + if (trx_ctxt->am_initialized) { + cntr->domain->trigger_queue_lock_fn(&trx_ctxt->trigger_queue.lock, 2); + slist_insert_tail(&trigger->list_entry, + &trx_ctxt->trigger_queue.list); + cntr->domain->trigger_queue_unlock_fn(&trx_ctxt->trigger_queue.lock, 2); + } else { + psmx3_process_trigger(trx_ctxt, trigger); + } + + trigger = cntr->trigger; + } + + cntr->domain->trigger_unlock_fn(&cntr->trigger_lock, 2); +} + +void psmx3_cntr_add_trigger(struct psmx3_fid_cntr *cntr, + struct psmx3_trigger *trigger) +{ + struct psmx3_trigger *p, *q; + + cntr->domain->trigger_lock_fn(&cntr->trigger_lock, 2); + + q = NULL; + p = cntr->trigger; + while (p && p->threshold <= trigger->threshold) { + q = p; + p = p->next; + } + if (q) + q->next = trigger; + else + cntr->trigger = trigger; + trigger->next = p; + + cntr->domain->trigger_unlock_fn(&cntr->trigger_lock, 2); + + psmx3_cntr_check_trigger(cntr); +} + +DIRECT_FN +STATIC uint64_t psmx3_cntr_read(struct fid_cntr *cntr) +{ + struct psmx3_fid_cntr *cntr_priv; + struct psmx3_poll_ctxt *poll_ctxt; + struct slist_entry *item, *prev; + + cntr_priv = container_of(cntr, struct psmx3_fid_cntr, cntr); + + if (cntr_priv->poll_all) { + psmx3_progress_all(cntr_priv->domain); + } else { + slist_foreach(&cntr_priv->poll_list, item, prev) { + poll_ctxt = container_of(item, + struct psmx3_poll_ctxt, + list_entry); + psmx3_progress(poll_ctxt->trx_ctxt); + (void) prev; /* suppress compiler warning */ + } + } + + return ofi_atomic_get64(&cntr_priv->counter); +} + +DIRECT_FN +STATIC uint64_t psmx3_cntr_readerr(struct 
fid_cntr *cntr) +{ + struct psmx3_fid_cntr *cntr_priv; + + cntr_priv = container_of(cntr, struct psmx3_fid_cntr, cntr); + cntr_priv->error_avail = 0; + + return ofi_atomic_get64(&cntr_priv->error_counter); +} + +DIRECT_FN +STATIC int psmx3_cntr_add(struct fid_cntr *cntr, uint64_t value) +{ + struct psmx3_fid_cntr *cntr_priv; + + cntr_priv = container_of(cntr, struct psmx3_fid_cntr, cntr); + ofi_atomic_add64(&cntr_priv->counter, value); + + psmx3_cntr_check_trigger(cntr_priv); + + if (cntr_priv->wait) + cntr_priv->wait->signal(cntr_priv->wait); + + return 0; +} + +DIRECT_FN +STATIC int psmx3_cntr_set(struct fid_cntr *cntr, uint64_t value) +{ + struct psmx3_fid_cntr *cntr_priv; + + cntr_priv = container_of(cntr, struct psmx3_fid_cntr, cntr); + ofi_atomic_set64(&cntr_priv->counter, value); + + psmx3_cntr_check_trigger(cntr_priv); + + if (cntr_priv->wait) + cntr_priv->wait->signal(cntr_priv->wait); + + return 0; +} + +DIRECT_FN +STATIC int psmx3_cntr_adderr(struct fid_cntr *cntr, uint64_t value) +{ + struct psmx3_fid_cntr *cntr_priv; + + cntr_priv = container_of(cntr, struct psmx3_fid_cntr, cntr); + ofi_atomic_add64(&cntr_priv->error_counter, value); + cntr_priv->error_avail = 1; + + psmx3_cntr_check_trigger(cntr_priv); + + if (cntr_priv->wait) + cntr_priv->wait->signal(cntr_priv->wait); + + return 0; +} + +DIRECT_FN +STATIC int psmx3_cntr_seterr(struct fid_cntr *cntr, uint64_t value) +{ + struct psmx3_fid_cntr *cntr_priv; + + cntr_priv = container_of(cntr, struct psmx3_fid_cntr, cntr); + ofi_atomic_set64(&cntr_priv->error_counter, value); + cntr_priv->error_avail = 1; + + psmx3_cntr_check_trigger(cntr_priv); + + if (cntr_priv->wait) + cntr_priv->wait->signal(cntr_priv->wait); + + return 0; +} + +DIRECT_FN +STATIC int psmx3_cntr_wait(struct fid_cntr *cntr, uint64_t threshold, int timeout) +{ + struct psmx3_fid_cntr *cntr_priv; + struct psmx3_poll_ctxt *poll_ctxt; + struct slist_entry *item, *prev; + struct timespec ts0, ts; + int msec_passed = 0; + int ret = 0; + + cntr_priv = container_of(cntr, struct psmx3_fid_cntr, cntr); + + clock_gettime(CLOCK_REALTIME, &ts0); + + while (ofi_atomic_get64(&cntr_priv->counter) < threshold) { + if (cntr_priv->error_avail) { + ret = -FI_EAVAIL; + break; + } + + if (cntr_priv->wait) { + ret = fi_wait((struct fid_wait *)cntr_priv->wait, + timeout - msec_passed); + if (ret == -FI_ETIMEDOUT) + break; + } else if (cntr_priv->poll_all) { + psmx3_progress_all(cntr_priv->domain); + } else { + slist_foreach(&cntr_priv->poll_list, item, prev) { + poll_ctxt = container_of(item, + struct psmx3_poll_ctxt, + list_entry); + psmx3_progress(poll_ctxt->trx_ctxt); + (void) prev; /* suppress compiler warning */ + } + } + + if (cntr_priv->error_avail) { + ret = -FI_EAVAIL; + break; + } + + if (ofi_atomic_get64(&cntr_priv->counter) >= threshold) + break; + + if (timeout < 0) + continue; + + clock_gettime(CLOCK_REALTIME, &ts); + msec_passed = (ts.tv_sec - ts0.tv_sec) * 1000 + + (ts.tv_nsec - ts0.tv_nsec) / 1000000; + + if (msec_passed >= timeout) { + ret = -FI_ETIMEDOUT; + break; + } + } + + return ret; +} + +static int psmx3_cntr_close(fid_t fid) +{ + struct psmx3_fid_cntr *cntr; + struct psmx3_poll_ctxt *item; + struct slist_entry *entry; + + cntr = container_of(fid, struct psmx3_fid_cntr, cntr.fid); + + while (!slist_empty(&cntr->poll_list)) { + entry = slist_remove_head(&cntr->poll_list); + item = container_of(entry, struct psmx3_poll_ctxt, list_entry); + if (!ofi_atomic_dec32(&item->trx_ctxt->poll_refcnt)) + free(item->trx_ctxt); + free(item); + } + + if (cntr->wait) { + 
fi_poll_del(&cntr->wait->pollset->poll_fid, &cntr->cntr.fid, 0); + if (cntr->wait_is_local) + fi_close((fid_t)cntr->wait); + } + + fastlock_destroy(&cntr->trigger_lock); + psmx3_domain_release(cntr->domain); + free(cntr); + + return 0; +} + +static int psmx3_cntr_control(fid_t fid, int command, void *arg) +{ + struct psmx3_fid_cntr *cntr; + int ret = 0; + + cntr = container_of(fid, struct psmx3_fid_cntr, cntr.fid); + + switch (command) { + case FI_SETOPSFLAG: + cntr->flags = *(uint64_t *)arg; + break; + + case FI_GETOPSFLAG: + if (!arg) + return -FI_EINVAL; + *(uint64_t *)arg = cntr->flags; + break; + + case FI_GETWAIT: + if (cntr->wait) + ret = fi_control(&cntr->wait->wait_fid.fid, FI_GETWAIT, arg); + else + return -FI_EINVAL; + break; + default: + return -FI_ENOSYS; + } + + return ret; +} + +static struct fi_ops psmx3_fi_ops = { + .size = sizeof(struct fi_ops), + .close = psmx3_cntr_close, + .bind = fi_no_bind, + .control = psmx3_cntr_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_cntr psmx3_cntr_ops = { + .size = sizeof(struct fi_ops_cntr), + .read = psmx3_cntr_read, + .readerr = psmx3_cntr_readerr, + .add = psmx3_cntr_add, + .set = psmx3_cntr_set, + .wait = psmx3_cntr_wait, + .adderr = psmx3_cntr_adderr, + .seterr = psmx3_cntr_seterr, +}; + +DIRECT_FN +int psmx3_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, + struct fid_cntr **cntr, void *context) +{ + struct psmx3_fid_domain *domain_priv; + struct psmx3_fid_cntr *cntr_priv; + struct fid_wait *wait = NULL; + struct fi_wait_attr wait_attr; + int wait_is_local = 0; + int events; + uint64_t flags; + int err; + + flags = 0; + domain_priv = container_of(domain, struct psmx3_fid_domain, + util_domain.domain_fid); + + switch (attr->events) { + case FI_CNTR_EVENTS_COMP: + events = attr->events; + break; + + default: + FI_INFO(&psmx3_prov, FI_LOG_CQ, + "attr->events=%d, supported=%d\n", + attr->events, FI_CNTR_EVENTS_COMP); + return -FI_EINVAL; + } + + switch (attr->wait_obj) { + case FI_WAIT_NONE: + case FI_WAIT_UNSPEC: + break; + + case FI_WAIT_SET: + if (!attr->wait_set) { + FI_INFO(&psmx3_prov, FI_LOG_CQ, + "FI_WAIT_SET is specified but attr->wait_set is NULL\n"); + return -FI_EINVAL; + } + wait = attr->wait_set; + break; + + case FI_WAIT_FD: + case FI_WAIT_MUTEX_COND: + wait_attr.wait_obj = attr->wait_obj; + wait_attr.flags = 0; + err = fi_wait_open(&domain_priv->fabric->util_fabric.fabric_fid, + &wait_attr, (struct fid_wait **)&wait); + if (err) + return err; + wait_is_local = 1; + break; + + default: + FI_INFO(&psmx3_prov, FI_LOG_CQ, + "attr->wait_obj=%d, supported=%d...%d\n", + attr->wait_obj, FI_WAIT_NONE, FI_WAIT_MUTEX_COND); + return -FI_EINVAL; + } + + cntr_priv = (struct psmx3_fid_cntr *) calloc(1, sizeof *cntr_priv); + if (!cntr_priv) { + err = -FI_ENOMEM; + goto fail; + } + + + cntr_priv->domain = domain_priv; + cntr_priv->events = events; + if (wait) + cntr_priv->wait = container_of(wait, struct util_wait, wait_fid); + cntr_priv->wait_is_local = wait_is_local; + cntr_priv->flags = flags; + cntr_priv->cntr.fid.fclass = FI_CLASS_CNTR; + cntr_priv->cntr.fid.context = context; + cntr_priv->cntr.fid.ops = &psmx3_fi_ops; + cntr_priv->cntr.ops = &psmx3_cntr_ops; + ofi_atomic_initialize64(&cntr_priv->counter, 0); + ofi_atomic_initialize64(&cntr_priv->error_counter, 0); + + slist_init(&cntr_priv->poll_list); + fastlock_init(&cntr_priv->trigger_lock); + + if (wait) + fi_poll_add(&cntr_priv->wait->pollset->poll_fid, + &cntr_priv->cntr.fid, 0); + + psmx3_domain_acquire(domain_priv); + *cntr = 
&cntr_priv->cntr; + return 0; +fail: + if (wait && wait_is_local) + fi_close(&wait->fid); + return err; +} + diff --git a/prov/psm3/src/psmx3_cq.c b/prov/psm3/src/psmx3_cq.c new file mode 100644 index 00000000000..759fa2ae03c --- /dev/null +++ b/prov/psm3/src/psmx3_cq.c @@ -0,0 +1,1299 @@ +/* + * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx3.h" + +void psmx3_cq_enqueue_event(struct psmx3_fid_cq *cq, + struct psmx3_cq_event *event) +{ + cq->domain->cq_lock_fn(&cq->lock, 2); + slist_insert_tail(&event->list_entry, &cq->event_queue); + cq->event_count++; + cq->domain->cq_unlock_fn(&cq->lock, 2); + + if (cq->wait) + cq->wait->signal(cq->wait); +} + +static struct psmx3_cq_event *psmx3_cq_dequeue_event(struct psmx3_fid_cq *cq) +{ + struct slist_entry *entry; + + cq->domain->cq_lock_fn(&cq->lock, 2); + if (slist_empty(&cq->event_queue)) { + cq->domain->cq_unlock_fn(&cq->lock, 2); + return NULL; + } + entry = slist_remove_head(&cq->event_queue); + cq->event_count--; + cq->domain->cq_unlock_fn(&cq->lock, 2); + + return container_of(entry, struct psmx3_cq_event, list_entry); +} + +static struct psmx3_cq_event *psmx3_cq_alloc_event(struct psmx3_fid_cq *cq) +{ + struct psmx3_cq_event *event; + + cq->domain->cq_lock_fn(&cq->lock, 2); + if (!slist_empty(&cq->free_list)) { + event = container_of(slist_remove_head(&cq->free_list), + struct psmx3_cq_event, list_entry); + cq->domain->cq_unlock_fn(&cq->lock, 2); + return event; + } + + cq->domain->cq_unlock_fn(&cq->lock, 2); + event = calloc(1, sizeof(*event)); + if (!event) + FI_WARN(&psmx3_prov, FI_LOG_CQ, "out of memory.\n"); + + return event; +} + +static void psmx3_cq_free_event(struct psmx3_fid_cq *cq, + struct psmx3_cq_event *event) +{ + memset(event, 0, sizeof(*event)); + + cq->domain->cq_lock_fn(&cq->lock, 2); + slist_insert_tail(&event->list_entry, &cq->free_list); + cq->domain->cq_unlock_fn(&cq->lock, 2); +} + +struct psmx3_cq_event *psmx3_cq_create_event(struct psmx3_fid_cq *cq, + void *op_context, void *buf, + uint64_t flags, size_t len, + uint64_t data, uint64_t tag, + size_t olen, int err) +{ + struct psmx3_cq_event *event; + + event = psmx3_cq_alloc_event(cq); + if (!event) + return NULL; + + if 
((event->error = !!err)) { + event->cqe.err.op_context = op_context; + event->cqe.err.err = -err; + event->cqe.err.data = data; + event->cqe.err.tag = tag; + event->cqe.err.olen = olen; + event->cqe.err.flags = flags; + event->cqe.err.prov_errno = PSM2_INTERNAL_ERR; + goto out; + } + + switch (cq->format) { + case FI_CQ_FORMAT_CONTEXT: + event->cqe.context.op_context = op_context; + break; + + case FI_CQ_FORMAT_MSG: + event->cqe.msg.op_context = op_context; + event->cqe.msg.flags = flags; + event->cqe.msg.len = len; + break; + + case FI_CQ_FORMAT_DATA: + event->cqe.data.op_context = op_context; + event->cqe.data.buf = buf; + event->cqe.data.flags = flags; + event->cqe.data.len = len; + event->cqe.data.data = data; + break; + + case FI_CQ_FORMAT_TAGGED: + event->cqe.tagged.op_context = op_context; + event->cqe.tagged.buf = buf; + event->cqe.tagged.flags = flags; + event->cqe.tagged.len = len; + event->cqe.tagged.data = data; + event->cqe.tagged.tag = tag; + break; + + default: + FI_WARN(&psmx3_prov, FI_LOG_CQ, + "unsupported CQ format %d\n", cq->format); + psmx3_cq_free_event(cq, event); + return NULL; + } + +out: + return event; +} + +static uint64_t psmx3_comp_flags[PSMX3_MAX_CONTEXT_TYPE] = { + [PSMX3_NOCOMP_SEND_CONTEXT] = FI_SEND | FI_MSG, + [PSMX3_NOCOMP_RECV_CONTEXT] = FI_RECV | FI_MSG, + [PSMX3_NOCOMP_TSEND_CONTEXT] = FI_SEND | FI_TAGGED, + [PSMX3_NOCOMP_TRECV_CONTEXT] = FI_RECV | FI_TAGGED, + [PSMX3_NOCOMP_WRITE_CONTEXT] = FI_WRITE | FI_RMA, + [PSMX3_NOCOMP_READ_CONTEXT] = FI_READ | FI_RMA, + [PSMX3_SEND_CONTEXT] = FI_SEND | FI_MSG, + [PSMX3_RECV_CONTEXT] = FI_RECV | FI_MSG, + [PSMX3_MULTI_RECV_CONTEXT] = FI_RECV | FI_MSG, + [PSMX3_TSEND_CONTEXT] = FI_SEND | FI_TAGGED, + [PSMX3_TRECV_CONTEXT] = FI_RECV | FI_TAGGED, + [PSMX3_WRITE_CONTEXT] = FI_WRITE | FI_RMA, + [PSMX3_READ_CONTEXT] = FI_READ | FI_RMA, + [PSMX3_REMOTE_WRITE_CONTEXT] = FI_REMOTE_WRITE | FI_RMA, + [PSMX3_REMOTE_READ_CONTEXT] = FI_REMOTE_READ | FI_RMA, + [PSMX3_SENDV_CONTEXT] = FI_SEND, + [PSMX3_IOV_SEND_CONTEXT] = FI_SEND, + [PSMX3_IOV_RECV_CONTEXT] = FI_RECV, +}; + +/* + * Translate "status" into completion event. A few factors determine where to + * save the event. + * + * If: + * + * (1) the CQE is for the CQ being polled; and + * (2) event buffer is supplied (event_in != NULL); and + * (3) the CQE is not an error entry, + * + * then the event is written to the event buffer directly. Otherwise a CQE is + * allocated on the corresponding CQ. + * + * The function doesn't use PSMX3_STATUS_CONTEXT(status) because the context + * field could refer to an allocated descriptor that may have already been + * freed. All the information that are dependent on the field are obtained + * in advance and passed in as separate parameters ("op_context", "buf", + * "flags", "data", and "is_recv"). + * + * The flag "event_saved" is set to indicate to the caller that the event + * was saved to the user's provided buffer, otherwise the event was an error + * or the event has been saved to the comp_cq slist. 
+ */ + +__attribute__((always_inline)) +static inline int psmx3_cq_any_complete(struct psmx3_fid_cq *poll_cq, + struct psmx3_fid_cq *comp_cq, + struct psmx3_fid_av *av, + PSMX3_STATUS_TYPE *status, + void *op_context, + void *buf, + uint64_t flags, + uint64_t data, + struct psmx3_cq_event *event_in, + int *event_saved, + fi_addr_t *src_addr, + int is_recv) +{ + struct psmx3_cq_event *event = event_in; + + *event_saved = 1; + + if (OFI_UNLIKELY(PSMX3_STATUS_ERROR(status))) { + *event_saved = 0; + event = psmx3_cq_alloc_event(comp_cq); + if (!event) + return -FI_ENOMEM; + + event->error = 1; + event->cqe.err.op_context = op_context; + event->cqe.err.flags = flags; + event->cqe.err.err = -psmx3_errno(PSMX3_STATUS_ERROR(status)); + event->cqe.err.prov_errno = PSMX3_STATUS_ERROR(status); + event->cqe.err.tag = PSMX3_GET_TAG64(PSMX3_STATUS_TAG(status)); + event->cqe.err.olen = PSMX3_STATUS_SNDLEN(status) - PSMX3_STATUS_RCVLEN(status); + event->cqe.err.data = data; + + psmx3_cq_enqueue_event(comp_cq, event); + return 0; + } + + if (OFI_UNLIKELY(poll_cq != comp_cq || !event)) { + *event_saved = 0; + event = psmx3_cq_alloc_event(comp_cq); + if (!event) + return -FI_ENOMEM; + + event->error = 0; + } + + if (is_recv) { + psm2_epaddr_t source = PSMX3_STATUS_PEER(status); + int source_sep_id = (flags & FI_REMOTE_CQ_DATA) ? 0 : data; + + if (event == event_in) { + if (src_addr) { + src_addr[0] = psmx3_av_translate_source(av, source, + source_sep_id); + if (src_addr[0] == FI_ADDR_NOTAVAIL) { + *event_saved = 0; + event = psmx3_cq_alloc_event(comp_cq); + if (!event) + return -FI_ENOMEM; + + event->cqe = event_in->cqe; + event->cqe.err.err = FI_EADDRNOTAVAIL; + event->cqe.err.err_data = &comp_cq->error_data; + event->error = !!event->cqe.err.err; + if (av->addr_format == FI_ADDR_STR) { + event->cqe.err.err_data_size = PSMX3_ERR_DATA_SIZE; + psmx3_get_source_string_name( + source, source_sep_id, + (void *)&comp_cq->error_data, + &event->cqe.err.err_data_size); + } else { + psmx3_get_source_name( + source, source_sep_id, + (void *)&comp_cq->error_data); + event->cqe.err.err_data_size = sizeof(struct psmx3_ep_name); + } + } + } + } else { + event->source_is_valid = 1; + event->source_sep_id = source_sep_id; + event->source = source; + event->source_av = av; + } + } + + switch (comp_cq->format) { + case FI_CQ_FORMAT_CONTEXT: + event->cqe.context.op_context = op_context; + break; + + case FI_CQ_FORMAT_MSG: + event->cqe.msg.op_context = op_context; + event->cqe.msg.flags = flags; + event->cqe.msg.len = PSMX3_STATUS_RCVLEN(status); + break; + + case FI_CQ_FORMAT_DATA: + event->cqe.data.op_context = op_context; + event->cqe.data.buf = buf; + event->cqe.data.flags = flags; + event->cqe.data.len = PSMX3_STATUS_RCVLEN(status); + event->cqe.data.data = data; + break; + + case FI_CQ_FORMAT_TAGGED: + event->cqe.tagged.op_context = op_context; + event->cqe.tagged.buf = buf; + event->cqe.tagged.flags = flags; + event->cqe.tagged.len = PSMX3_STATUS_RCVLEN(status); + event->cqe.tagged.data = data; + event->cqe.tagged.tag = PSMX3_GET_TAG64(PSMX3_STATUS_TAG(status)); + break; + + default: + FI_WARN(&psmx3_prov, FI_LOG_CQ, + "unsupported CQ format %d\n", comp_cq->format); + if (event != event_in) + psmx3_cq_free_event(comp_cq, event); + return -FI_EINVAL; + } + + if (OFI_UNLIKELY(event != event_in)) + psmx3_cq_enqueue_event(comp_cq, event); + + return 0; +} + +static inline int psmx3_cq_tx_complete(struct psmx3_fid_cq *poll_cq, + struct psmx3_fid_cq *comp_cq, + struct psmx3_fid_av *av, + PSMX3_STATUS_TYPE *status, + void 
*op_context, + void *buf, + uint64_t flags, + uint64_t data, + struct psmx3_cq_event *event_in, + int *event_saved) +{ + return psmx3_cq_any_complete(poll_cq, comp_cq, av, status, + op_context, buf, flags, data, + event_in, event_saved, NULL, 0); +} + +static inline int psmx3_cq_rx_complete(struct psmx3_fid_cq *poll_cq, + struct psmx3_fid_cq *comp_cq, + struct psmx3_fid_av *av, + PSMX3_STATUS_TYPE *status, + void *op_context, + void *buf, + uint64_t flags, + uint64_t data, + struct psmx3_cq_event *event_in, + fi_addr_t *src_addr, + int *event_saved) +{ + return psmx3_cq_any_complete(poll_cq, comp_cq, av, status, + op_context, buf, flags, data, + event_in, event_saved, src_addr, 1); +} + +int +psmx3_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry_index) +{ + struct fi_context *fi_context; + struct psmx3_fid_ep *ep; + struct psmx3_fid_mr *mr; + struct psmx3_am_request *am_req; + struct psmx3_multi_recv *multi_recv_req; + struct psmx3_sendv_request *sendv_req; + struct psmx3_sendv_reply *sendv_rep; + psm2_mq_req_t psm2_req; + size_t len_remaining; + void *op_context; + void *buf; + uint64_t flags; + uint64_t data; + int err; + int context_type; + int event_saved = 0; + void *entry = NULL; + + struct psmx3_status_data *status_data = status_array; + + if (OFI_LIKELY(status_data->event_buffer && status_data->poll_cq)) + entry = (uint8_t *)status_data->event_buffer + + (entry_index * status_data->poll_cq->entry_size); + + fi_context = PSMX3_STATUS_CONTEXT(req); + + if (OFI_UNLIKELY(!fi_context)) + return 0; + + context_type = (int)PSMX3_CTXT_TYPE(fi_context); + flags = psmx3_comp_flags[context_type]; + ep = PSMX3_CTXT_EP(fi_context); + + switch (context_type) { + case PSMX3_SEND_CONTEXT: + case PSMX3_TSEND_CONTEXT: + if (ep->send_cq) { + op_context = fi_context; + buf = PSMX3_CTXT_USER(fi_context); + err = psmx3_cq_tx_complete( + status_data->poll_cq, ep->send_cq, ep->av, + req, op_context, buf, flags, 0, + entry, &event_saved); + if (OFI_UNLIKELY(err)) + return err; + } + if (ep->send_cntr) + psmx3_cntr_inc(ep->send_cntr, PSMX3_STATUS_ERROR(req)); + + /* Bi-directional send/recv performance tweak for KNL */ + if (event_saved && PSMX3_STATUS_SNDLEN(req) > 16384) + event_saved++; + break; + + case PSMX3_NOCOMP_SEND_CONTEXT: + case PSMX3_NOCOMP_TSEND_CONTEXT: + if (OFI_UNLIKELY(ep->send_cq && PSMX3_STATUS_ERROR(req))) { + err = psmx3_cq_tx_complete( + status_data->poll_cq, ep->send_cq, ep->av, + req, NULL, NULL, flags, 0, + entry, &event_saved); + if (OFI_UNLIKELY(err)) + return err; + } + if (ep->send_cntr) + psmx3_cntr_inc(ep->send_cntr, PSMX3_STATUS_ERROR(req)); + break; + + case PSMX3_RECV_CONTEXT: + if (OFI_UNLIKELY(PSMX3_IS_IOV_HEADER(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req))) && + !psmx3_handle_sendv_req(ep, req, 0))) { + return 0; + } + if (ep->recv_cq) { + op_context = fi_context; + buf = PSMX3_CTXT_USER(fi_context); + data = PSMX3_GET_CQDATA(PSMX3_STATUS_TAG(req)); + if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req)))) + flags |= FI_REMOTE_CQ_DATA; + err = psmx3_cq_rx_complete( + status_data->poll_cq, ep->recv_cq, ep->av, + req, op_context, buf, flags, data, + entry, status_data->src_addr, &event_saved); + if (OFI_UNLIKELY(err)) + return err; + } + if (ep->recv_cntr) + psmx3_cntr_inc(ep->recv_cntr, PSMX3_STATUS_ERROR(req)); + break; + + case PSMX3_TRECV_CONTEXT: + if (OFI_UNLIKELY(PSMX3_IS_IOV_HEADER(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req))) && + !psmx3_handle_sendv_req(ep, req, 0))) { + return 0; + } + if (ep->recv_cq) { + op_context = fi_context; + buf = 
PSMX3_CTXT_USER(fi_context); + data = PSMX3_GET_CQDATA(PSMX3_STATUS_TAG(req)); + if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req)))) + flags |= FI_REMOTE_CQ_DATA; + err = psmx3_cq_rx_complete( + status_data->poll_cq, ep->recv_cq, ep->av, + req, op_context, buf, flags, data, + entry, status_data->src_addr, &event_saved); + if (OFI_UNLIKELY(err)) + return err; + } + if (ep->recv_cntr) + psmx3_cntr_inc(ep->recv_cntr, PSMX3_STATUS_ERROR(req)); + break; + + case PSMX3_NOCOMP_RECV_CONTEXT: + if (OFI_UNLIKELY(PSMX3_IS_IOV_HEADER(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req))) && + !psmx3_handle_sendv_req(ep, req, 0))) { + PSMX3_EP_PUT_OP_CONTEXT(ep, fi_context); + return 0; + } + PSMX3_EP_PUT_OP_CONTEXT(ep, fi_context); + if (OFI_UNLIKELY(ep->recv_cq && PSMX3_STATUS_ERROR(req))) { + data = PSMX3_GET_CQDATA(PSMX3_STATUS_TAG(req)); + if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req)))) + flags |= FI_REMOTE_CQ_DATA; + err = psmx3_cq_rx_complete( + status_data->poll_cq, ep->recv_cq, ep->av, + req, NULL, NULL, flags, data, + entry, status_data->src_addr, &event_saved); + if (OFI_UNLIKELY(err)) + return err; + } + if (ep->recv_cntr) + psmx3_cntr_inc(ep->recv_cntr, PSMX3_STATUS_ERROR(req)); + break; + + case PSMX3_NOCOMP_TRECV_CONTEXT: + if (OFI_UNLIKELY(PSMX3_IS_IOV_HEADER(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req))) && + !psmx3_handle_sendv_req(ep, req, 0))) { + PSMX3_EP_PUT_OP_CONTEXT(ep, fi_context); + return 0; + } + PSMX3_EP_PUT_OP_CONTEXT(ep, fi_context); + if (OFI_UNLIKELY(ep->recv_cq && PSMX3_STATUS_ERROR(req))) { + data = PSMX3_GET_CQDATA(PSMX3_STATUS_TAG(req)); + if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req)))) + flags |= FI_REMOTE_CQ_DATA; + err = psmx3_cq_rx_complete( + status_data->poll_cq, ep->recv_cq, ep->av, + req, NULL, NULL, flags, data, + entry, status_data->src_addr, &event_saved); + if (OFI_UNLIKELY(err)) + return err; + } + if (ep->recv_cntr) + psmx3_cntr_inc(ep->recv_cntr, PSMX3_STATUS_ERROR(req)); + break; + + case PSMX3_WRITE_CONTEXT: + am_req = container_of(fi_context, struct psmx3_am_request, + fi_context); + op_context = PSMX3_CTXT_USER(fi_context); + free(am_req->tmpbuf); + psmx3_am_request_free(status_data->trx_ctxt, am_req); + if (ep->send_cq) { + err = psmx3_cq_tx_complete( + status_data->poll_cq, ep->send_cq, ep->av, + req, op_context, NULL, flags, 0, + entry, &event_saved); + if (OFI_UNLIKELY(err)) + return err; + } + if (ep->write_cntr) + psmx3_cntr_inc(ep->write_cntr, PSMX3_STATUS_ERROR(req)); + break; + + case PSMX3_NOCOMP_WRITE_CONTEXT: + am_req = container_of(fi_context, struct psmx3_am_request, + fi_context); + op_context = PSMX3_CTXT_USER(fi_context); + free(am_req->tmpbuf); + psmx3_am_request_free(status_data->trx_ctxt, am_req); + if (OFI_UNLIKELY(ep->send_cq && PSMX3_STATUS_ERROR(req))) { + err = psmx3_cq_tx_complete( + status_data->poll_cq, ep->send_cq, ep->av, + req, op_context, NULL, flags, 0, + entry, &event_saved); + if (OFI_UNLIKELY(err)) + return err; + } + if (ep->write_cntr) + psmx3_cntr_inc(ep->write_cntr, PSMX3_STATUS_ERROR(req)); + break; + + case PSMX3_READ_CONTEXT: + am_req = container_of(fi_context, struct psmx3_am_request, + fi_context); + if (OFI_UNLIKELY(am_req->op == PSMX3_AM_REQ_READV)) { + am_req->read.len_read += PSMX3_STATUS_RCVLEN(req); + if (am_req->read.len_read < am_req->read.len) { + FI_INFO(&psmx3_prov, FI_LOG_EP_DATA, + "readv: long protocol finishes early\n"); + if (PSMX3_STATUS_ERROR(req)) + am_req->error = psmx3_errno(PSMX3_STATUS_ERROR(req)); + /* Request to be freed in AM handler */ + return 0; + } + } + op_context 
= PSMX3_CTXT_USER(fi_context); + free(am_req->tmpbuf); + psmx3_am_request_free(status_data->trx_ctxt, am_req); + if (ep->send_cq) { + err = psmx3_cq_tx_complete( + status_data->poll_cq, ep->send_cq, ep->av, + req, op_context, NULL, flags, 0, + entry, &event_saved); + if (OFI_UNLIKELY(err)) + return err; + } + if (ep->read_cntr) + psmx3_cntr_inc(ep->read_cntr, PSMX3_STATUS_ERROR(req)); + break; + + case PSMX3_NOCOMP_READ_CONTEXT: + am_req = container_of(fi_context, struct psmx3_am_request, + fi_context); + if (OFI_UNLIKELY(am_req->op == PSMX3_AM_REQ_READV)) { + am_req->read.len_read += PSMX3_STATUS_RCVLEN(req); + if (am_req->read.len_read < am_req->read.len) { + FI_INFO(&psmx3_prov, FI_LOG_EP_DATA, + "readv: long protocol finishes early\n"); + if (PSMX3_STATUS_ERROR(req)) + am_req->error = psmx3_errno(PSMX3_STATUS_ERROR(req)); + /* Request to be freed in AM handler */ + return 0; + } + } + op_context = PSMX3_CTXT_USER(fi_context); + free(am_req->tmpbuf); + psmx3_am_request_free(status_data->trx_ctxt, am_req); + if (OFI_UNLIKELY(ep->send_cq && PSMX3_STATUS_ERROR(req))) { + err = psmx3_cq_tx_complete( + status_data->poll_cq, ep->send_cq, ep->av, + req, op_context, NULL, flags, 0, + entry, &event_saved); + if (OFI_UNLIKELY(err)) + return err; + } + if (ep->read_cntr) + psmx3_cntr_inc(ep->read_cntr, PSMX3_STATUS_ERROR(req)); + break; + + case PSMX3_MULTI_RECV_CONTEXT: + if (OFI_UNLIKELY(PSMX3_IS_IOV_HEADER(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req))) && + !psmx3_handle_sendv_req(ep, req, 1))) { + return 0; + } + multi_recv_req = PSMX3_CTXT_USER(fi_context); + if (ep->recv_cq) { + op_context = fi_context; + buf = multi_recv_req->buf + multi_recv_req->offset; + data = PSMX3_GET_CQDATA(PSMX3_STATUS_TAG(req)); + if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req)))) + flags |= FI_REMOTE_CQ_DATA; + if (multi_recv_req->offset + PSMX3_STATUS_RCVLEN(req) + + multi_recv_req->min_buf_size > multi_recv_req->len) + flags |= FI_MULTI_RECV; /* buffer used up */ + err = psmx3_cq_rx_complete( + status_data->poll_cq, ep->recv_cq, ep->av, + req, op_context, buf, flags, data, + entry, status_data->src_addr, &event_saved); + if (OFI_UNLIKELY(err)) + return err; + } + if (ep->recv_cntr) + psmx3_cntr_inc(ep->recv_cntr, PSMX3_STATUS_ERROR(req)); + + /* repost multi-recv buffer */ + multi_recv_req->offset += PSMX3_STATUS_RCVLEN(req); + len_remaining = multi_recv_req->len - multi_recv_req->offset; + if (len_remaining >= multi_recv_req->min_buf_size) { + if (len_remaining > PSMX3_MAX_MSG_SIZE) + len_remaining = PSMX3_MAX_MSG_SIZE; + err = psm2_mq_irecv2(ep->rx->psm2_mq, + multi_recv_req->src_addr, &multi_recv_req->tag, + &multi_recv_req->tagsel, multi_recv_req->flag, + multi_recv_req->buf + multi_recv_req->offset, + len_remaining, + (void *)fi_context, &psm2_req); + if (OFI_UNLIKELY(err != PSM2_OK)) + return psmx3_errno(err); + PSMX3_CTXT_REQ(fi_context) = psm2_req; + } else { + free(multi_recv_req); + } + break; + + case PSMX3_REMOTE_WRITE_CONTEXT: + am_req = container_of(fi_context, struct psmx3_am_request, fi_context); + if (am_req->op & PSMX3_AM_FORCE_ACK) { + am_req->error = psmx3_errno(PSMX3_STATUS_ERROR(req)); + psmx3_am_ack_rma(am_req); + } + + if (am_req->ep->recv_cq && (am_req->cq_flags & FI_REMOTE_CQ_DATA)) { + flags |= FI_REMOTE_CQ_DATA; + err = psmx3_cq_rx_complete( + status_data->poll_cq, am_req->ep->recv_cq, am_req->ep->av, + req, NULL, NULL, flags, am_req->write.data, + entry, status_data->src_addr, &event_saved); + if (OFI_UNLIKELY(err)) { + psmx3_am_request_free(status_data->trx_ctxt, am_req); + return 
err; + } + } + + if (am_req->ep->caps & FI_RMA_EVENT) { + if (am_req->ep->remote_write_cntr) + psmx3_cntr_inc(am_req->ep->remote_write_cntr, 0); + + mr = PSMX3_CTXT_USER(fi_context); + if (mr->cntr && mr->cntr != am_req->ep->remote_write_cntr) + psmx3_cntr_inc(mr->cntr, 0); + } + + /* NOTE: am_req->tmpbuf is unused here */ + psmx3_am_request_free(status_data->trx_ctxt, am_req); + break; + + case PSMX3_REMOTE_READ_CONTEXT: + am_req = container_of(fi_context, struct psmx3_am_request, fi_context); + if (am_req->ep->caps & FI_RMA_EVENT) { + if (am_req->ep->remote_read_cntr) + psmx3_cntr_inc(am_req->ep->remote_read_cntr, 0); + } + + /* NOTE: am_req->tmpbuf is unused here */ + psmx3_am_request_free(status_data->trx_ctxt, am_req); + break; + + case PSMX3_SENDV_CONTEXT: + sendv_req = PSMX3_CTXT_USER(fi_context); + sendv_req->iov_done++; + if (sendv_req->iov_protocol == PSMX3_IOV_PROTO_MULTI && + sendv_req->iov_done < sendv_req->iov_info.count + 1) { + sendv_req->tag = PSMX3_STATUS_TAG(req); + return 0; + } + if (ep->send_cq && !sendv_req->no_completion) { + op_context = sendv_req->user_context; + flags |= sendv_req->comp_flag; + err = psmx3_cq_tx_complete( + status_data->poll_cq, ep->send_cq, ep->av, + req, op_context, NULL, flags, 0, + entry, &event_saved); + if (OFI_UNLIKELY(err)) { + free(sendv_req); + return err; + } + } + if (ep->send_cntr) + psmx3_cntr_inc(ep->send_cntr, PSMX3_STATUS_ERROR(req)); + free(sendv_req); + break; + + case PSMX3_IOV_SEND_CONTEXT: + sendv_req = PSMX3_CTXT_USER(fi_context); + sendv_req->iov_done++; + if (sendv_req->iov_done < sendv_req->iov_info.count + 1) + return 0; + PSMX3_STATUS_TAG(req) = sendv_req->tag; + if (ep->send_cq && !sendv_req->no_completion) { + op_context = sendv_req->user_context; + flags |= sendv_req->comp_flag; + err = psmx3_cq_tx_complete( + status_data->poll_cq, ep->send_cq, ep->av, + req, op_context, NULL, flags, 0, + entry, &event_saved); + if (OFI_UNLIKELY(err)) { + free(sendv_req); + return err; + } + } + if (ep->send_cntr) + psmx3_cntr_inc(ep->send_cntr, PSMX3_STATUS_ERROR(req)); + free(sendv_req); + break; + + case PSMX3_IOV_RECV_CONTEXT: + sendv_rep = PSMX3_CTXT_USER(fi_context); + sendv_rep->iov_done++; + sendv_rep->msg_length += PSMX3_STATUS_SNDLEN(req); + sendv_rep->bytes_received += PSMX3_STATUS_RCVLEN(req); + if (PSMX3_STATUS_ERROR(req) != PSM2_OK) + sendv_rep->error_code = PSMX3_STATUS_ERROR(req); + if (sendv_rep->iov_done < sendv_rep->iov_info.count) + return 0; + + PSMX3_STATUS_TAG(req) = sendv_rep->tag; + PSMX3_STATUS_RCVLEN(req) = sendv_rep->bytes_received; + PSMX3_STATUS_SNDLEN(req) = sendv_rep->msg_length; + PSMX3_STATUS_ERROR(req) = sendv_rep->error_code; + + if (ep->recv_cq && !sendv_rep->no_completion) { + op_context = sendv_rep->user_context; + buf = sendv_rep->buf; + flags |= sendv_rep->comp_flag; + err = psmx3_cq_rx_complete( + status_data->poll_cq, ep->recv_cq, ep->av, + req, op_context, buf, flags, 0, + entry, status_data->src_addr, &event_saved); + if (OFI_UNLIKELY(err)) { + free(sendv_rep); + return err; + } + } + if (ep->recv_cntr) + psmx3_cntr_inc(ep->recv_cntr, PSMX3_STATUS_ERROR(req)); + + if (sendv_rep->multi_recv) { + /* repost the multi-recv buffer */ + fi_context = sendv_rep->user_context; + multi_recv_req = PSMX3_CTXT_USER(fi_context); + multi_recv_req->offset += PSMX3_STATUS_RCVLEN(req); + len_remaining = multi_recv_req->len - multi_recv_req->offset; + if (len_remaining >= multi_recv_req->min_buf_size) { + if (len_remaining > PSMX3_MAX_MSG_SIZE) + len_remaining = PSMX3_MAX_MSG_SIZE; + err = 
psm2_mq_irecv2(ep->rx->psm2_mq, + multi_recv_req->src_addr, &multi_recv_req->tag, + &multi_recv_req->tagsel, multi_recv_req->flag, + multi_recv_req->buf + multi_recv_req->offset, + len_remaining, + (void *)fi_context, &psm2_req); + if (OFI_UNLIKELY(err != PSM2_OK)) { + free(sendv_rep); + return psmx3_errno(err); + } + PSMX3_CTXT_REQ(fi_context) = psm2_req; + } else { + free(multi_recv_req); + } + } + + free(sendv_rep); + break; + } + + return event_saved; +} + +int psmx3_cq_poll_mq(struct psmx3_fid_cq *cq, + struct psmx3_trx_ctxt *trx_ctxt, + struct psmx3_cq_event *event_in, + int count, fi_addr_t *src_addr) +{ + struct psmx3_status_data status_data; + + /* psm2_mq_ipeek_dequeue_multi needs non-zero count to make progress */ + if (!count) { + event_in = NULL; + count = 1; + } + + status_data.poll_cq = cq; + status_data.event_buffer = event_in; + status_data.src_addr = src_addr; + status_data.trx_ctxt = trx_ctxt; + + psm2_mq_ipeek_dequeue_multi(trx_ctxt->psm2_mq, &status_data, + psmx3_mq_status_copy, &count); + return count; +} + +DIRECT_FN +STATIC ssize_t psmx3_cq_readfrom(struct fid_cq *cq, void *buf, size_t count, + fi_addr_t *src_addr) +{ + struct psmx3_fid_cq *cq_priv; + struct psmx3_cq_event *event; + struct psmx3_poll_ctxt *poll_ctxt; + struct slist_entry *item, *prev; + int ret; + ssize_t read_count; + fi_addr_t source; + int i; + + cq_priv = container_of(cq, struct psmx3_fid_cq, cq); + + if (slist_empty(&cq_priv->event_queue) || !buf) { + slist_foreach(&cq_priv->poll_list, item, prev) { + poll_ctxt = container_of(item, struct psmx3_poll_ctxt, + list_entry); + + if (OFI_UNLIKELY(!poll_ctxt->trx_ctxt->poll_active)) + continue; + + ret = psmx3_cq_poll_mq(cq_priv, poll_ctxt->trx_ctxt, + (struct psmx3_cq_event *)buf, + count, src_addr); + if (ret > 0) + return ret; + + if (poll_ctxt->trx_ctxt->am_progress) + psmx3_am_progress(poll_ctxt->trx_ctxt); + + (void) prev; /* suppress compiler warning */ + } + } + + if (OFI_UNLIKELY(cq_priv->pending_error != NULL)) + return -FI_EAVAIL; + + assert(buf || !count); + + read_count = 0; + for (i = 0; i < count; i++) { + if (slist_empty(&cq_priv->event_queue)) + break; + + event = psmx3_cq_dequeue_event(cq_priv); + if (event) { + if (!event->error) { + if (src_addr && event->source_is_valid) { + source = psmx3_av_translate_source( + event->source_av, event->source, + event->source_sep_id); + if (source == FI_ADDR_NOTAVAIL) { + if (cq_priv->domain->addr_format == FI_ADDR_STR) { + event->cqe.err.err_data_size = PSMX3_ERR_DATA_SIZE; + psmx3_get_source_string_name( + event->source, event->source_sep_id, + (void *)&cq_priv->error_data, + &event->cqe.err.err_data_size); + } else { + psmx3_get_source_name( + event->source, + event->source_sep_id, + (void *)&cq_priv->error_data); + event->cqe.err.err_data_size = sizeof(struct psmx3_ep_name); + } + event->cqe.err.err_data = &cq_priv->error_data; + event->cqe.err.err = FI_EADDRNOTAVAIL; + event->error = !!event->cqe.err.err; + cq_priv->pending_error = event; + if (!read_count) + read_count = -FI_EAVAIL; + break; + } + + *src_addr = source; + } + + memcpy(buf, (void *)&event->cqe, cq_priv->entry_size); + psmx3_cq_free_event(cq_priv, event); + + read_count++; + buf = (uint8_t *)buf + cq_priv->entry_size; + if (src_addr) + src_addr++; + continue; + } else { + cq_priv->pending_error = event; + if (!read_count) + read_count = -FI_EAVAIL; + break; + } + } else { + break; + } + } + + /* + * Return 0 if and only if the input count is 0 and the CQ is not empty. 
+ * This is used by the util poll code to check the poll state. + */ + if (!read_count && (count || slist_empty(&cq_priv->event_queue))) + read_count = -FI_EAGAIN; + + return read_count; +} + +DIRECT_FN +STATIC ssize_t psmx3_cq_read(struct fid_cq *cq, void *buf, size_t count) +{ + return psmx3_cq_readfrom(cq, buf, count, NULL); +} + +DIRECT_FN +STATIC ssize_t psmx3_cq_readerr(struct fid_cq *cq, struct fi_cq_err_entry *buf, + uint64_t flags) +{ + struct psmx3_fid_cq *cq_priv; + uint32_t api_version; + size_t size; + + cq_priv = container_of(cq, struct psmx3_fid_cq, cq); + + cq_priv->domain->cq_lock_fn(&cq_priv->lock, 2); + if (cq_priv->pending_error) { + api_version = cq_priv->domain->fabric->util_fabric. + fabric_fid.api_version; + size = FI_VERSION_GE(api_version, FI_VERSION(1, 5)) ? + sizeof(*buf) : sizeof(struct fi_cq_err_entry_1_0); + + memcpy(buf, &cq_priv->pending_error->cqe, size); + free(cq_priv->pending_error); + cq_priv->pending_error = NULL; + psmx3_unlock(&cq_priv->lock, 2); + return 1; + } + cq_priv->domain->cq_unlock_fn(&cq_priv->lock, 2); + + return -FI_EAGAIN; +} + +DIRECT_FN +STATIC ssize_t psmx3_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count, + fi_addr_t *src_addr, const void *cond, + int timeout) +{ + struct psmx3_fid_cq *cq_priv; + struct psmx3_poll_ctxt *poll_ctxt; + struct slist_entry *item, *prev; + struct timespec ts0, ts; + size_t threshold, event_count; + int msec_passed = 0; + int sth_happened = 0; + + cq_priv = container_of(cq, struct psmx3_fid_cq, cq); + if (cq_priv->wait_cond == FI_CQ_COND_THRESHOLD) + threshold = (size_t) cond; + else + threshold = 1; + + /* NOTE: "cond" is only a hint, not a mandatory condition. */ + event_count = cq_priv->event_count; + if (event_count < threshold) { + if (cq_priv->wait) { + if (ofi_atomic_get32(&cq_priv->signaled)) { + ofi_atomic_set32(&cq_priv->signaled, 0); + return -FI_ECANCELED; + } + fi_wait((struct fid_wait *)cq_priv->wait, timeout); + } else { + clock_gettime(CLOCK_REALTIME, &ts0); + while (!sth_happened) { + slist_foreach(&cq_priv->poll_list, item, prev) { + poll_ctxt = container_of(item, + struct psmx3_poll_ctxt, + list_entry); + + if (OFI_UNLIKELY(!poll_ctxt->trx_ctxt->poll_active)) + continue; + + sth_happened = + psmx3_cq_poll_mq(cq_priv, + poll_ctxt->trx_ctxt, + NULL, 0, NULL); + if (sth_happened) + break; + + (void) prev; /* suppress compiler warning */ + } + + /* CQ may be updated asynchronously by the AM handlers */ + if (cq_priv->event_count > event_count) + break; + + if (ofi_atomic_get32(&cq_priv->signaled)) { + ofi_atomic_set32(&cq_priv->signaled, 0); + return -FI_ECANCELED; + } + + if (timeout < 0) + continue; + + clock_gettime(CLOCK_REALTIME, &ts); + msec_passed = (ts.tv_sec - ts0.tv_sec) * 1000 + + (ts.tv_nsec - ts0.tv_nsec) / 1000000; + + if (msec_passed >= timeout) + break; + } + } + } + + return psmx3_cq_readfrom(cq, buf, count, src_addr); +} + +DIRECT_FN +STATIC ssize_t psmx3_cq_sread(struct fid_cq *cq, void *buf, size_t count, + const void *cond, int timeout) +{ + return psmx3_cq_sreadfrom(cq, buf, count, NULL, cond, timeout); +} + +DIRECT_FN +STATIC int psmx3_cq_signal(struct fid_cq *cq) +{ + struct psmx3_fid_cq *cq_priv; + cq_priv = container_of(cq, struct psmx3_fid_cq, cq); + + ofi_atomic_set32(&cq_priv->signaled, 1); + if (cq_priv->wait) + cq_priv->wait->signal(cq_priv->wait); + + return 0; +} + +DIRECT_FN +STATIC const char *psmx3_cq_strerror(struct fid_cq *cq, int prov_errno, const void *prov_data, + char *buf, size_t len) +{ + return psm2_error_get_string(prov_errno); +} + 
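For reference (not part of this patch), the CQ entry points above are reached through the generic libfabric API: fi_cq_read() lands in psmx3_cq_read()/psmx3_cq_readfrom(), a return of -FI_EAVAIL signals a pending error entry that must be drained with fi_cq_readerr(), and -FI_EAGAIN means the queue is empty. A minimal consumer-side sketch of that pattern is shown below; the drain_cq() helper name and the abbreviated error handling are illustrative only.

#include <stdio.h>
#include <rdma/fabric.h>
#include <rdma/fi_eq.h>
#include <rdma/fi_errno.h>

/* Poll a tagged-format CQ until it is empty, reporting any error entries. */
static int drain_cq(struct fid_cq *cq)
{
	struct fi_cq_tagged_entry comp;
	struct fi_cq_err_entry err_entry = { 0 };
	ssize_t ret;

	for (;;) {
		ret = fi_cq_read(cq, &comp, 1);	/* -> psmx3_cq_read/readfrom */
		if (ret > 0)
			continue;		/* one completion consumed */
		if (ret == -FI_EAGAIN)
			return 0;		/* CQ drained */
		if (ret == -FI_EAVAIL) {	/* pending error entry */
			if (fi_cq_readerr(cq, &err_entry, 0) == 1)
				fprintf(stderr, "cq error: %s\n",
					fi_cq_strerror(cq, err_entry.prov_errno,
						       err_entry.err_data, NULL, 0));
			continue;
		}
		return (int)ret;		/* unexpected failure */
	}
}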
+static int psmx3_cq_close(fid_t fid) +{ + struct psmx3_fid_cq *cq; + struct slist_entry *entry; + struct psmx3_cq_event *item; + struct psmx3_poll_ctxt *poll_item; + + cq = container_of(fid, struct psmx3_fid_cq, cq.fid); + + while (!slist_empty(&cq->poll_list)) { + entry = slist_remove_head(&cq->poll_list); + poll_item = container_of(entry, struct psmx3_poll_ctxt, list_entry); + if (!ofi_atomic_dec32(&poll_item->trx_ctxt->poll_refcnt)) + free(poll_item->trx_ctxt); + free(poll_item); + } + + while (!slist_empty(&cq->free_list)) { + entry = slist_remove_head(&cq->free_list); + item = container_of(entry, struct psmx3_cq_event, list_entry); + free(item); + } + + while (!slist_empty(&cq->event_queue)) { + entry = slist_remove_head(&cq->event_queue); + item = container_of(entry, struct psmx3_cq_event, list_entry); + free(item); + } + + fastlock_destroy(&cq->lock); + + if (cq->wait) { + fi_poll_del(&cq->wait->pollset->poll_fid, &cq->cq.fid, 0); + if (cq->wait_is_local) + fi_close(&cq->wait->wait_fid.fid); + } + + psmx3_domain_release(cq->domain); + free(cq); + + return 0; +} + +static int psmx3_cq_control(struct fid *fid, int command, void *arg) +{ + struct psmx3_fid_cq *cq; + int ret = 0; + + cq = container_of(fid, struct psmx3_fid_cq, cq.fid); + + switch (command) { + case FI_GETWAIT: + if (cq->wait) + ret = fi_control(&cq->wait->wait_fid.fid, FI_GETWAIT, arg); + else + return -FI_EINVAL; + break; + + default: + return -FI_ENOSYS; + } + + return ret; +} + +static struct fi_ops psmx3_fi_ops = { + .size = sizeof(struct fi_ops), + .close = psmx3_cq_close, + .bind = fi_no_bind, + .control = psmx3_cq_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_cq psmx3_cq_ops = { + .size = sizeof(struct fi_ops_cq), + .read = psmx3_cq_read, + .readfrom = psmx3_cq_readfrom, + .readerr = psmx3_cq_readerr, + .sread = psmx3_cq_sread, + .sreadfrom = psmx3_cq_sreadfrom, + .signal = psmx3_cq_signal, + .strerror = psmx3_cq_strerror, +}; + +DIRECT_FN +int psmx3_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, + struct fid_cq **cq, void *context) +{ + struct psmx3_fid_domain *domain_priv; + struct psmx3_fid_cq *cq_priv; + struct fid_wait *wait = NULL; + struct psmx3_cq_event *event; + struct fi_wait_attr wait_attr; + int wait_is_local = 0; + int entry_size; + int err; + int i; + + domain_priv = container_of(domain, struct psmx3_fid_domain, + util_domain.domain_fid); + switch (attr->format) { + case FI_CQ_FORMAT_UNSPEC: + attr->format = FI_CQ_FORMAT_TAGGED; + entry_size = sizeof(struct fi_cq_tagged_entry); + break; + + case FI_CQ_FORMAT_CONTEXT: + entry_size = sizeof(struct fi_cq_entry); + break; + + case FI_CQ_FORMAT_MSG: + entry_size = sizeof(struct fi_cq_msg_entry); + break; + + case FI_CQ_FORMAT_DATA: + entry_size = sizeof(struct fi_cq_data_entry); + break; + + case FI_CQ_FORMAT_TAGGED: + entry_size = sizeof(struct fi_cq_tagged_entry); + break; + + default: + FI_INFO(&psmx3_prov, FI_LOG_CQ, + "attr->format=%d, supported=%d...%d\n", attr->format, + FI_CQ_FORMAT_UNSPEC, FI_CQ_FORMAT_TAGGED); + return -FI_EINVAL; + } + + switch (attr->wait_obj) { + case FI_WAIT_NONE: + break; + + case FI_WAIT_SET: + if (!attr->wait_set) { + FI_INFO(&psmx3_prov, FI_LOG_CQ, + "FI_WAIT_SET is specified but attr->wait_set is NULL\n"); + return -FI_EINVAL; + } + wait = attr->wait_set; + break; + + case FI_WAIT_UNSPEC: + case FI_WAIT_FD: + case FI_WAIT_MUTEX_COND: + wait_attr.wait_obj = attr->wait_obj; + wait_attr.flags = 0; + err = fi_wait_open(&domain_priv->fabric->util_fabric.fabric_fid, + &wait_attr, (struct 
fid_wait **)&wait); + if (err) + return err; + wait_is_local = 1; + break; + + default: + FI_INFO(&psmx3_prov, FI_LOG_CQ, + "attr->wait_obj=%d, supported=%d...%d\n", attr->wait_obj, + FI_WAIT_NONE, FI_WAIT_MUTEX_COND); + return -FI_EINVAL; + } + + if (wait) { + switch (attr->wait_cond) { + case FI_CQ_COND_NONE: + case FI_CQ_COND_THRESHOLD: + break; + + default: + FI_INFO(&psmx3_prov, FI_LOG_CQ, + "attr->wait_cond=%d, supported=%d...%d\n", + attr->wait_cond, FI_CQ_COND_NONE, FI_CQ_COND_THRESHOLD); + return -FI_EINVAL; + } + } + + cq_priv = (struct psmx3_fid_cq *) calloc(1, sizeof *cq_priv); + if (!cq_priv) { + if (wait) + free(wait); + return -FI_ENOMEM; + } + + psmx3_domain_acquire(domain_priv); + + cq_priv->domain = domain_priv; + cq_priv->format = attr->format; + cq_priv->entry_size = entry_size; + if (wait) { + cq_priv->wait = container_of(wait, struct util_wait, wait_fid); + cq_priv->wait_cond = attr->wait_cond; + } + cq_priv->wait_is_local = wait_is_local; + ofi_atomic_initialize32(&cq_priv->signaled, 0); + + cq_priv->cq.fid.fclass = FI_CLASS_CQ; + cq_priv->cq.fid.context = context; + cq_priv->cq.fid.ops = &psmx3_fi_ops; + cq_priv->cq.ops = &psmx3_cq_ops; + + slist_init(&cq_priv->poll_list); + slist_init(&cq_priv->event_queue); + slist_init(&cq_priv->free_list); + fastlock_init(&cq_priv->lock); + +#define PSMX3_FREE_LIST_SIZE 64 + for (i=0; i<PSMX3_FREE_LIST_SIZE; i++) { + event = calloc(1, sizeof(*event)); + if (!event) { + FI_WARN(&psmx3_prov, FI_LOG_CQ, "out of memory.\n"); + exit(-1); + } + slist_insert_tail(&event->list_entry, &cq_priv->free_list); + } + + if (wait) + fi_poll_add(&cq_priv->wait->pollset->poll_fid, &cq_priv->cq.fid, 0); + + *cq = &cq_priv->cq; + return 0; +} + diff --git a/prov/psm3/src/psmx3_domain.c b/prov/psm3/src/psmx3_domain.c new file mode 100644 index 00000000000..f0f187e77f8 --- /dev/null +++ b/prov/psm3/src/psmx3_domain.c @@ -0,0 +1,544 @@ +/* + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psmx3.h" + +static inline int normalize_core_id(int core_id, int num_cores) +{ + if (core_id < 0) + core_id += num_cores; + + if (core_id < 0) + core_id = 0; + + if (core_id >= num_cores) + core_id = num_cores - 1; + + return core_id; +} + +static int psmx3_progress_set_affinity(char *affinity) +{ + int num_cores = sysconf(_SC_NPROCESSORS_ONLN); + int core_id; + cpu_set_t cpuset; + char *triplet; + int n, start, end, stride; + int set_count = 0; + + if (!affinity) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "progress thread affinity not set\n"); + return 0; + } + + CPU_ZERO(&cpuset); + + for (triplet = affinity; triplet; triplet = strchr(triplet, 'c')) { + if (triplet[0] == ',') + triplet++; + + stride = 1; + n = sscanf(triplet, "%d:%d:%d", &start, &end, &stride); + if (n < 1) + continue; + + if (n < 2) + end = start; + + if (stride < 1) + stride = 1; + + start = normalize_core_id(start, num_cores); + end = normalize_core_id(end, num_cores); + + for (core_id = start; core_id <= end; core_id += stride) { + CPU_SET(core_id, &cpuset); + set_count++; + } + + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "core set [%d:%d:%d] added to progress thread affinity set\n", + start, end, stride); + } + + if (set_count) + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); + else + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "progress thread affinity not set due to invalid format\n"); + + return set_count; +} + +static void *psmx3_progress_func(void *args) +{ + struct psmx3_fid_domain *domain = args; + int affinity_set; + int sleep_usec; + struct timespec ts; + + FI_INFO(&psmx3_prov, FI_LOG_CORE, "\n"); + + affinity_set = psmx3_progress_set_affinity(psmx3_env.prog_affinity); + + /* Negative sleep time means let the system choose the default. + * If affinity is set, sleep a short time to get better latency. + * If affinity is not set, short sleep time doesn't make difference. + */ + sleep_usec = psmx3_env.prog_interval; + if (sleep_usec < 0) { + if (affinity_set) + sleep_usec = 1; + else + sleep_usec = 1000; + } + + ts.tv_sec = sleep_usec / 1000000; + ts.tv_nsec = (sleep_usec % 1000000) * 1000; + + while (1) { + psmx3_progress_all(domain); + nanosleep(&ts, NULL); + } + + return NULL; +} + +static void psmx3_domain_start_progress(struct psmx3_fid_domain *domain) +{ + int err; + + err = pthread_create(&domain->progress_thread, NULL, + psmx3_progress_func, (void *)domain); + if (err) { + domain->progress_thread = pthread_self(); + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "pthread_create returns %d\n", err); + } else { + FI_INFO(&psmx3_prov, FI_LOG_CORE, "progress thread started\n"); + } +} + +static void psmx3_domain_stop_progress(struct psmx3_fid_domain *domain) +{ + int err; + void *exit_code; + + if (!pthread_equal(domain->progress_thread, pthread_self())) { + err = pthread_cancel(domain->progress_thread); + if (err) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "pthread_cancel returns %d\n", err); + } + err = pthread_join(domain->progress_thread, &exit_code); + if (err) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "pthread_join returns %d\n", err); + } else { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "progress thread exited with code %ld (%s)\n", + (uintptr_t)exit_code, + (exit_code == PTHREAD_CANCELED) ? 
+ "PTHREAD_CANCELED" : "?"); + } + } +} + +static int psmx3_domain_close(fid_t fid) +{ + struct psmx3_fid_domain *domain; + + domain = container_of(fid, struct psmx3_fid_domain, + util_domain.domain_fid.fid); + + FI_INFO(&psmx3_prov, FI_LOG_DOMAIN, "refcnt=%d\n", + ofi_atomic_get32(&domain->util_domain.ref)); + + if (ofi_domain_close(&domain->util_domain)) + return 0; + + if (domain->progress_thread_enabled) + psmx3_domain_stop_progress(domain); + + fastlock_destroy(&domain->sep_lock); + fastlock_destroy(&domain->mr_lock); + rbtDelete(domain->mr_map); + + psmx3_lock(&domain->fabric->domain_lock, 1); + dlist_remove(&domain->entry); + psmx3_unlock(&domain->fabric->domain_lock, 1); + psmx3_fabric_release(domain->fabric); + + free(domain); + return 0; +} + +DIRECT_FN +STATIC int psmx3_domain_control(fid_t fid, int command, void *arg) +{ + struct fi_mr_map_raw *map; + + switch (command) { + case FI_MAP_RAW_MR: + map = arg; + if (!map || !map->key || !map->raw_key) + return -FI_EINVAL; + *(uint64_t *)map->key = *(uint64_t *)map->raw_key; + break; + + case FI_UNMAP_KEY: + /* Nothing to do here */ + break; + + default: + return -FI_ENOSYS; + } + + return 0; +} + +static struct fi_ops psmx3_fi_ops = { + .size = sizeof(struct fi_ops), + .close = psmx3_domain_close, + .bind = fi_no_bind, + .control = psmx3_domain_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_domain psmx3_domain_ops = { + .size = sizeof(struct fi_ops_domain), + .av_open = psmx3_av_open, + .cq_open = psmx3_cq_open, + .endpoint = psmx3_ep_open, + .scalable_ep = psmx3_sep_open, + .cntr_open = psmx3_cntr_open, + .poll_open = fi_poll_create, + .stx_ctx = psmx3_stx_ctx, + .srx_ctx = fi_no_srx_context, + .query_atomic = psmx3_query_atomic, + .query_collective = fi_no_query_collective, +}; + +static int psmx3_key_compare(void *key1, void *key2) +{ + return (key1 < key2) ? -1 : (key1 > key2); +} + +static int psmx3_domain_init(struct psmx3_fid_domain *domain, + struct psmx3_ep_name *src_addr) +{ + int err; + + err = fastlock_init(&domain->mr_lock); + if (err) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "fastlock_init(mr_lock) returns %d\n", err); + goto err_out; + } + + domain->mr_map = rbtNew(&psmx3_key_compare); + if (!domain->mr_map) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "rbtNew failed\n"); + goto err_out_destroy_mr_lock; + } + + domain->mr_reserved_key = 1; + domain->max_atomic_size = INT_MAX; + + ofi_atomic_initialize32(&domain->sep_cnt, 0); + fastlock_init(&domain->sep_lock); + dlist_init(&domain->sep_list); + dlist_init(&domain->trx_ctxt_list); + fastlock_init(&domain->trx_ctxt_lock); + + if (domain->progress_thread_enabled) + psmx3_domain_start_progress(domain); + + return 0; + +err_out_destroy_mr_lock: + fastlock_destroy(&domain->mr_lock); + +err_out: + return err; +} + +DIRECT_FN +int psmx3_domain_open(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **domain, void *context) +{ + struct psmx3_fid_fabric *fabric_priv; + struct psmx3_fid_domain *domain_priv; + struct psmx3_ep_name *src_addr = info->src_addr; + int mr_mode = (info->domain_attr->mr_mode & FI_MR_BASIC) ? 
FI_MR_BASIC : 0; + int err, tmp; + + FI_INFO(&psmx3_prov, FI_LOG_DOMAIN, "\n"); + + fabric_priv = container_of(fabric, struct psmx3_fid_fabric, + util_fabric.fabric_fid); + +#if 0 + if (!info->domain_attr->name || + strncmp(info->domain_attr->name, PSMX3_DOMAIN_NAME, strlen(PSMX3_DOMAIN_NAME))) { + err = -FI_EINVAL; + goto err_out; + } +#endif /* 0 */ + + domain_priv = (struct psmx3_fid_domain *) calloc(1, sizeof *domain_priv); + if (!domain_priv) { + err = -FI_ENOMEM; + goto err_out; + } + + err = ofi_domain_init(fabric, info, &domain_priv->util_domain, context); + if (err) + goto err_out_free_domain; + + /* fclass & context are set in ofi_domain_init */ + domain_priv->util_domain.domain_fid.fid.ops = &psmx3_fi_ops; + domain_priv->util_domain.domain_fid.ops = &psmx3_domain_ops; + domain_priv->util_domain.domain_fid.mr = &psmx3_mr_ops; + domain_priv->mr_mode = mr_mode; + domain_priv->mode = info->mode; + domain_priv->caps = info->caps; + domain_priv->fabric = fabric_priv; + domain_priv->progress_thread_enabled = + (info->domain_attr->data_progress == FI_PROGRESS_AUTO); + domain_priv->addr_format = info->addr_format; + + if (info->addr_format == FI_ADDR_STR) + src_addr = psmx3_string_to_ep_name(info->src_addr); + + /* Use generic lock/unlock functions by default */ + domain_priv->av_lock_fn = psmx3_lock; + domain_priv->am_req_pool_lock_fn = psmx3_lock; + domain_priv->trx_ctxt_lock_fn = psmx3_lock; + domain_priv->rma_queue_lock_fn = psmx3_lock; + domain_priv->trigger_queue_lock_fn = psmx3_lock; + domain_priv->peer_lock_fn = psmx3_lock; + domain_priv->sep_lock_fn = psmx3_lock; + domain_priv->trigger_lock_fn = psmx3_lock; + domain_priv->cq_lock_fn = psmx3_lock; + domain_priv->mr_lock_fn = psmx3_lock; + domain_priv->context_lock_fn = psmx3_lock; + domain_priv->poll_trylock_fn = psmx3_trylock; + + domain_priv->av_unlock_fn = psmx3_unlock; + domain_priv->am_req_pool_unlock_fn = psmx3_unlock; + domain_priv->trx_ctxt_unlock_fn = psmx3_unlock; + domain_priv->rma_queue_unlock_fn = psmx3_unlock; + domain_priv->trigger_queue_unlock_fn = psmx3_unlock; + domain_priv->peer_unlock_fn = psmx3_unlock; + domain_priv->sep_unlock_fn = psmx3_unlock; + domain_priv->trigger_unlock_fn = psmx3_unlock; + domain_priv->cq_unlock_fn = psmx3_unlock; + domain_priv->mr_unlock_fn = psmx3_unlock; + domain_priv->context_unlock_fn = psmx3_unlock; + domain_priv->poll_unlock_fn = psmx3_unlock; + + /* If lock_level env is unset, then set locks based off threading model*/ + err = fi_param_get_bool(&psmx3_prov, "lock_level", &tmp); + if (err < 0) { + switch (info->domain_attr->threading) { + case FI_THREAD_DOMAIN: + /* Disable locks not required when serializing access to a domain */ + domain_priv->av_lock_fn = psmx3_lock_disabled; + domain_priv->trx_ctxt_lock_fn = psmx3_lock_disabled; + domain_priv->trigger_queue_lock_fn = psmx3_lock_disabled; + domain_priv->sep_lock_fn = psmx3_lock_disabled; + domain_priv->trigger_lock_fn = psmx3_lock_disabled; + domain_priv->cq_lock_fn = psmx3_lock_disabled; + domain_priv->mr_lock_fn = psmx3_lock_disabled; + domain_priv->context_lock_fn = psmx3_lock_disabled; + domain_priv->poll_trylock_fn = psmx3_trylock_disabled; + + domain_priv->av_unlock_fn = psmx3_lock_disabled; + domain_priv->trx_ctxt_unlock_fn = psmx3_lock_disabled; + domain_priv->trigger_queue_unlock_fn = psmx3_lock_disabled; + domain_priv->sep_unlock_fn = psmx3_lock_disabled; + domain_priv->trigger_unlock_fn = psmx3_lock_disabled; + domain_priv->cq_unlock_fn = psmx3_lock_disabled; + domain_priv->mr_unlock_fn = psmx3_lock_disabled; + 
domain_priv->context_unlock_fn = psmx3_lock_disabled; + domain_priv->poll_unlock_fn = psmx3_lock_disabled; + + /* Enable lock accessed by the disconnection thread */ + domain_priv->peer_lock_fn = psmx3_lock_enabled; + domain_priv->peer_unlock_fn = psmx3_unlock_enabled; + + /* + * If FI_RMA or FI_ATOMIC caps are enabled, then locks are + * required for the CQ, am_req_pool, & rma_queue + * due to the PSM2 Recv thread. + * NOTE: am_req_pool & rma_queue are only used when FI_RMA + * and FI_ATOMIC capabilities are enabled. + */ + if ((info->caps & FI_RMA) || (info->caps & FI_ATOMIC)) { + domain_priv->cq_lock_fn = psmx3_lock_enabled; + domain_priv->am_req_pool_lock_fn = psmx3_lock_enabled; + domain_priv->rma_queue_lock_fn = psmx3_lock_enabled; + domain_priv->cq_unlock_fn = psmx3_unlock_enabled; + domain_priv->am_req_pool_unlock_fn = psmx3_unlock_enabled; + domain_priv->rma_queue_unlock_fn = psmx3_unlock_enabled; + } + + /* + * Locks accessed by the progress thread are required because + * they are outside the scope of domain access serialization + * implied by FI_THREAD_DOMAIN. + */ + if (domain_priv->progress_thread_enabled) { + domain_priv->trx_ctxt_lock_fn = psmx3_lock_enabled; + domain_priv->poll_trylock_fn = psmx3_trylock_enabled; + domain_priv->cq_lock_fn = psmx3_lock_enabled; + domain_priv->trx_ctxt_unlock_fn = psmx3_unlock_enabled; + domain_priv->poll_unlock_fn = psmx3_unlock_enabled; + domain_priv->cq_unlock_fn = psmx3_unlock_enabled; + if (info->caps & FI_TRIGGER) { + domain_priv->trigger_queue_lock_fn = psmx3_lock_enabled; + domain_priv->trigger_lock_fn = psmx3_lock_enabled; + domain_priv->av_lock_fn = psmx3_lock_enabled; + domain_priv->mr_lock_fn = psmx3_lock_enabled; + domain_priv->context_lock_fn = psmx3_lock_enabled; + domain_priv->trigger_queue_unlock_fn = psmx3_unlock_enabled; + domain_priv->trigger_unlock_fn = psmx3_unlock_enabled; + domain_priv->av_unlock_fn = psmx3_unlock_enabled; + domain_priv->mr_unlock_fn = psmx3_unlock_enabled; + domain_priv->context_unlock_fn = psmx3_unlock_enabled; + } + } + break; + default: + /* Otherwise, enable all locks */ + domain_priv->av_lock_fn = psmx3_lock_enabled; + domain_priv->am_req_pool_lock_fn = psmx3_lock_enabled; + domain_priv->trx_ctxt_lock_fn = psmx3_lock_enabled; + domain_priv->rma_queue_lock_fn = psmx3_lock_enabled; + domain_priv->trigger_queue_lock_fn = psmx3_lock_enabled; + domain_priv->peer_lock_fn = psmx3_lock_enabled; + domain_priv->sep_lock_fn = psmx3_lock_enabled; + domain_priv->trigger_lock_fn = psmx3_lock_enabled; + domain_priv->cq_lock_fn = psmx3_lock_enabled; + domain_priv->mr_lock_fn = psmx3_lock_enabled; + domain_priv->context_lock_fn = psmx3_lock_enabled; + domain_priv->poll_trylock_fn = psmx3_trylock_enabled; + + domain_priv->av_unlock_fn = psmx3_unlock_enabled; + domain_priv->am_req_pool_unlock_fn = psmx3_unlock_enabled; + domain_priv->trx_ctxt_unlock_fn = psmx3_unlock_enabled; + domain_priv->rma_queue_unlock_fn = psmx3_unlock_enabled; + domain_priv->trigger_queue_unlock_fn = psmx3_unlock_enabled; + domain_priv->peer_unlock_fn = psmx3_unlock_enabled; + domain_priv->sep_unlock_fn = psmx3_unlock_enabled; + domain_priv->trigger_unlock_fn = psmx3_unlock_enabled; + domain_priv->cq_unlock_fn = psmx3_unlock_enabled; + domain_priv->mr_unlock_fn = psmx3_unlock_enabled; + domain_priv->context_unlock_fn = psmx3_unlock_enabled; + domain_priv->poll_unlock_fn = psmx3_unlock_enabled; + break; + } + } + + err = psmx3_domain_init(domain_priv, src_addr); + if (info->addr_format == FI_ADDR_STR) + free(src_addr); + if (err) + goto 
err_out_close_domain; + + psmx3_fabric_acquire(fabric_priv); + psmx3_lock(&fabric_priv->domain_lock, 1); + dlist_insert_before(&domain_priv->entry, &fabric_priv->domain_list); + psmx3_unlock(&fabric_priv->domain_lock, 1); + + psmx3_init_tag_layout(info); + + *domain = &domain_priv->util_domain.domain_fid; + return 0; + +err_out_close_domain: + ofi_domain_close(&domain_priv->util_domain); + +err_out_free_domain: + free(domain_priv); + +err_out: + return err; +} + +static int psmx3_domain_check_features(struct psmx3_fid_domain *domain, + uint64_t ep_caps) +{ + uint64_t domain_caps = domain->caps & ~PSMX3_SUB_CAPS; + + ep_caps &= ~PSMX3_SUB_CAPS; + + if ((domain_caps & ep_caps) != ep_caps) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "caps mismatch: domain_caps=%s;\n", + fi_tostr(&domain_caps, FI_TYPE_CAPS)); + + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "caps mismatch: ep_caps=%s.\n", + fi_tostr(&ep_caps, FI_TYPE_CAPS)); + + return -FI_EOPNOTSUPP; + } + + return 0; +} + +int psmx3_domain_enable_ep(struct psmx3_fid_domain *domain, + struct psmx3_fid_ep *ep) +{ + int err; + + err = psmx3_domain_check_features(domain, ep->caps); + if (err) + return err; + + if ((ep->caps & FI_RMA) || (ep->caps & FI_ATOMICS)) { + if (ep->tx) { + err = psmx3_am_init(ep->tx); + if (err) + return err; + } + if (ep->rx && ep->rx != ep->tx) + return psmx3_am_init(ep->rx); + } + + return 0; +} + diff --git a/prov/psm3/src/psmx3_ep.c b/prov/psm3/src/psmx3_ep.c new file mode 100644 index 00000000000..d196abfd1c6 --- /dev/null +++ b/prov/psm3/src/psmx3_ep.c @@ -0,0 +1,1108 @@ +/* + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psmx3.h" + +#define PSMX3_EP_SET_TAGGED_OPS(suffix, msg_suffix) \ + do { \ + if (!send_completion && !recv_completion) { \ + ep->ep.tagged = &psmx3_tagged_ops_no_event##suffix; \ + FI_INFO(&psmx3_prov, FI_LOG_EP_DATA, \ + "tagged ops optimized for op_flags=0 " \ + "and event suppression " \ + msg_suffix \ + "\n"); \ + } else if (!send_completion) { \ + ep->ep.tagged = &psmx3_tagged_ops_no_send_event##suffix;\ + FI_INFO(&psmx3_prov, FI_LOG_EP_DATA, \ + "tagged ops optimized for op_flags=0 " \ + "and send event suppression " \ + msg_suffix \ + "\n"); \ + } else if (!recv_completion) { \ + ep->ep.tagged = &psmx3_tagged_ops_no_recv_event##suffix;\ + FI_INFO(&psmx3_prov, FI_LOG_EP_DATA, \ + "tagged ops optimized for op_flags=0 " \ + "and recv event suppression " \ + msg_suffix \ + "\n"); \ + } else { \ + ep->ep.tagged = &psmx3_tagged_ops_no_flag##suffix; \ + FI_INFO(&psmx3_prov, FI_LOG_EP_DATA, \ + "tagged ops optimized for op_flags=0 " \ + msg_suffix \ + "\n"); \ + } \ + } while (0) + +static void psmx3_ep_optimize_ops(struct psmx3_fid_ep *ep) +{ + int send_completion; + int recv_completion; + uint64_t mask; + + mask = PSMX3_OP_FLAGS & + ~(FI_INJECT_COMPLETE | FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE); + + if (ep->ep.tagged) { + if (ep->tx_flags & mask & ~FI_COMPLETION || ep->rx_flags & mask & ~FI_COMPLETION) { + ep->ep.tagged = &psmx3_tagged_ops; + FI_INFO(&psmx3_prov, FI_LOG_EP_DATA, + "generic tagged ops.\n"); + } else { + send_completion = !ep->send_selective_completion || ep->tx_flags & FI_COMPLETION; + recv_completion = !ep->recv_selective_completion || ep->rx_flags & FI_COMPLETION; + + if (ep->av && ep->av->type == FI_AV_MAP) { + if (ep->caps & FI_DIRECTED_RECV) + PSMX3_EP_SET_TAGGED_OPS(_directed_av_map, "and directed receive and av map"); + else + PSMX3_EP_SET_TAGGED_OPS(_undirected_av_map, "and av map"); + } else { + if (ep->caps & FI_DIRECTED_RECV) + PSMX3_EP_SET_TAGGED_OPS(_directed, "and directed receive"); + else + PSMX3_EP_SET_TAGGED_OPS(_undirected, ""); + } + } + } +} + +DIRECT_FN +STATIC ssize_t psmx3_ep_cancel(fid_t fid, void *context) +{ + struct psmx3_fid_ep *ep; + psm2_mq_status2_t status; + struct fi_context *fi_context = context; + uint64_t flags; + struct psmx3_cq_event *event; + int err; + + ep = container_of(fid, struct psmx3_fid_ep, ep.fid); + assert(ep->domain); + assert(fi_context); + + switch (PSMX3_CTXT_TYPE(fi_context)) { + case PSMX3_TRECV_CONTEXT: + flags = FI_RECV | FI_TAGGED; + break; + case PSMX3_RECV_CONTEXT: + case PSMX3_MULTI_RECV_CONTEXT: + flags = FI_RECV | FI_MSG; + break; + default: + return -FI_EOPNOTSUPP; + } + + err = psm2_mq_cancel((psm2_mq_req_t *)&PSMX3_CTXT_REQ(fi_context)); + if (err == PSM2_OK) { + err = psm2_mq_test2((psm2_mq_req_t *)&PSMX3_CTXT_REQ(fi_context), &status); + if (err == PSM2_OK && ep->recv_cq) { + event = psmx3_cq_create_event( + ep->recv_cq, + status.context, + NULL, /* buf */ + flags, + 0, /* len */ + 0, /* data */ + 0, /* tag */ + 0 /* olen */, + -FI_ECANCELED); + if (event) + psmx3_cq_enqueue_event(ep->recv_cq, event); + else + return -FI_ENOMEM; + } + } + + return psmx3_errno(err); +} + +DIRECT_FN +STATIC int psmx3_ep_getopt(fid_t fid, int level, int optname, + void *optval, size_t *optlen) +{ + struct psmx3_fid_ep *ep; + + ep = container_of(fid, struct psmx3_fid_ep, ep.fid); + + if (level != FI_OPT_ENDPOINT) + return -FI_ENOPROTOOPT; + + switch (optname) { + case FI_OPT_MIN_MULTI_RECV: + *(size_t *)optval = ep->min_multi_recv; + *optlen = sizeof(size_t); + break; + + default: + return 
-FI_ENOPROTOOPT; + } + + return 0; +} + +DIRECT_FN +STATIC int psmx3_ep_setopt(fid_t fid, int level, int optname, + const void *optval, size_t optlen) +{ + struct psmx3_fid_ep *ep; + + ep = container_of(fid, struct psmx3_fid_ep, ep.fid); + + if (level != FI_OPT_ENDPOINT) + return -FI_ENOPROTOOPT; + + switch (optname) { + case FI_OPT_MIN_MULTI_RECV: + ep->min_multi_recv = *(size_t *)optval; + break; + + default: + return -FI_ENOPROTOOPT; + } + + return 0; +} + +static void psmx3_ep_close_internal(struct psmx3_fid_ep *ep) +{ + psmx3_domain_release(ep->domain); + PSMX3_EP_FINI_OP_CONTEXT(ep); + free(ep); +} + +static int psmx3_ep_close(fid_t fid) +{ + struct psmx3_fid_ep *ep; + struct psmx3_ep_name ep_name; + int usage_flags = 0; + + ep = container_of(fid, struct psmx3_fid_ep, ep.fid); + + if (ep->base_ep) { + ofi_atomic_dec32(&ep->base_ep->ref); + return 0; + } + + if (ofi_atomic_get32(&ep->ref)) + return -FI_EBUSY; + + if (ep->stx) + ofi_atomic_dec32(&ep->stx->ref); + + if (ep->tx && !ep->stx) + usage_flags |= PSMX3_TX; + + if (ep->rx) { + usage_flags |= PSMX3_RX; + ep_name.epid = ep->rx->psm2_epid; + + ofi_ns_del_local_name(&ep->domain->fabric->name_server, + &ep->service, &ep_name); + } + + psmx3_trx_ctxt_free(ep->rx, usage_flags); + psmx3_ep_close_internal(ep); + return 0; +} + +static int psmx3_poll_ctxt_match(struct slist_entry *entry, const void *arg) +{ + struct psmx3_poll_ctxt *poll_ctxt; + + poll_ctxt = container_of(entry, struct psmx3_poll_ctxt, list_entry); + return (poll_ctxt->trx_ctxt == arg); +} + +static int psmx3_add_poll_ctxt(struct slist *list, struct psmx3_trx_ctxt *trx_ctxt) +{ + struct psmx3_poll_ctxt *item; + + if (!trx_ctxt) + return 0; + + if (!slist_empty(list) && + slist_find_first_match(list, psmx3_poll_ctxt_match, trx_ctxt)) + return 0; + + item = calloc(1, sizeof(*item)); + if (!item) + return -FI_ENOMEM; + + ofi_atomic_inc32(&trx_ctxt->poll_refcnt); + item->trx_ctxt = trx_ctxt; + slist_insert_tail(&item->list_entry, list); + return 0; +} + +DIRECT_FN +STATIC int psmx3_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct psmx3_fid_ep *ep; + struct psmx3_fid_av *av; + struct psmx3_fid_cq *cq; + struct psmx3_fid_cntr *cntr; + struct psmx3_fid_stx *stx; + int err; + + ep = container_of(fid, struct psmx3_fid_ep, ep.fid); + err = ofi_ep_bind_valid(&psmx3_prov, bfid, flags); + if (err) + return err; + + switch (bfid->fclass) { + case FI_CLASS_EQ: + return -FI_ENOSYS; + + case FI_CLASS_CQ: + cq = container_of(bfid, struct psmx3_fid_cq, cq.fid); + if (ep->domain != cq->domain) + return -FI_EINVAL; + if (flags & FI_SEND) { + err = psmx3_add_poll_ctxt(&cq->poll_list, ep->tx); + if (err) + return err; + ep->send_cq = cq; + if (flags & FI_SELECTIVE_COMPLETION) + ep->send_selective_completion = 1; + } + if (flags & FI_RECV) { + err = psmx3_add_poll_ctxt(&cq->poll_list, ep->rx); + if (err) + return err; + ep->recv_cq = cq; + if (flags & FI_SELECTIVE_COMPLETION) + ep->recv_selective_completion = 1; + } + psmx3_ep_optimize_ops(ep); + break; + + case FI_CLASS_CNTR: + cntr = container_of(bfid, struct psmx3_fid_cntr, cntr.fid); + if (ep->domain != cntr->domain) + return -FI_EINVAL; + if (flags & (FI_SEND | FI_WRITE | FI_READ)) { + err = psmx3_add_poll_ctxt(&cntr->poll_list, ep->tx); + if (err) + return err; + } + if (flags & (FI_RECV | FI_REMOTE_WRITE | FI_REMOTE_READ)) { + err = psmx3_add_poll_ctxt(&cntr->poll_list, ep->rx); + if (err) + return err; + } + if (flags & FI_SEND) + ep->send_cntr = cntr; + if (flags & FI_RECV) + ep->recv_cntr = cntr; + if (flags & 
FI_WRITE) + ep->write_cntr = cntr; + if (flags & FI_READ) + ep->read_cntr = cntr; + if (flags & FI_REMOTE_WRITE) + ep->remote_write_cntr = cntr; + if (flags & FI_REMOTE_READ) + ep->remote_read_cntr = cntr; + break; + + case FI_CLASS_AV: + av = container_of(bfid, + struct psmx3_fid_av, av.fid); + if (ep->domain != av->domain) + return -FI_EINVAL; + ep->av = av; + psmx3_ep_optimize_ops(ep); + if (ep->tx) + psmx3_av_add_trx_ctxt(av, ep->tx); + if (ep->rx && ep->rx != ep->tx) + psmx3_av_add_trx_ctxt(av, ep->rx); + break; + + case FI_CLASS_MR: + if (!bfid->ops || !bfid->ops->bind) + return -FI_EINVAL; + err = bfid->ops->bind(bfid, fid, flags); + if (err) + return err; + break; + + case FI_CLASS_STX_CTX: + stx = container_of(bfid, struct psmx3_fid_stx, stx.fid); + if (ep->domain != stx->domain) + return -FI_EINVAL; + if (ep->tx || ep->stx) + return -FI_EINVAL; + ep->tx = stx->tx; + ep->stx = stx; + err = psmx3_domain_enable_ep(ep->domain, ep); + if (err) + return err; + if (ep->caps & FI_TRIGGER) + stx->tx->am_progress = 1; + ofi_atomic_inc32(&stx->ref); + break; + + default: + return -FI_ENOSYS; + } + + return 0; +} + +static inline int psmx3_ep_set_flags(struct psmx3_fid_ep *ep, uint64_t flags) +{ + uint64_t real_flags = flags & ~(FI_TRANSMIT | FI_RECV); + + if ((flags & FI_TRANSMIT) && (flags & FI_RECV)) + return -EINVAL; + else if (flags & FI_TRANSMIT) + ep->tx_flags = real_flags; + else if (flags & FI_RECV) + ep->rx_flags = real_flags; + + /* otherwise ok to leave the flags intact */ + + return 0; +} + +static inline int psmx3_ep_get_flags(struct psmx3_fid_ep *ep, uint64_t *flags) +{ + uint64_t flags_in = *flags; + + if ((flags_in & FI_TRANSMIT) && (flags_in & FI_RECV)) + return -EINVAL; + else if (flags_in & FI_TRANSMIT) + *flags = ep->tx_flags; + else if (flags_in & FI_RECV) + *flags = ep->rx_flags; + else + return -EINVAL; + + return 0; +} + +DIRECT_FN +STATIC int psmx3_ep_control(fid_t fid, int command, void *arg) +{ + struct fi_alias *alias; + struct psmx3_fid_ep *ep, *new_ep; + int err; + + ep = container_of(fid, struct psmx3_fid_ep, ep.fid); + + switch (command) { + case FI_ALIAS: + new_ep = (struct psmx3_fid_ep *) calloc(1, sizeof *ep); + if (!new_ep) + return -FI_ENOMEM; + alias = arg; + *new_ep = *ep; + err = psmx3_ep_set_flags(new_ep, alias->flags); + if (err) { + free(new_ep); + return err; + } + new_ep->base_ep = ep; + ofi_atomic_inc32(&ep->ref); + psmx3_ep_optimize_ops(new_ep); + *alias->fid = &new_ep->ep.fid; + break; + + case FI_SETOPSFLAG: + err = psmx3_ep_set_flags(ep, *(uint64_t *)arg); + if (err) + return err; + psmx3_ep_optimize_ops(ep); + break; + + case FI_GETOPSFLAG: + if (!arg) + return -FI_EINVAL; + err = psmx3_ep_get_flags(ep, arg); + if (err) + return err; + break; + + case FI_ENABLE: + ep->enabled = 1; + return 0; + + default: + return -FI_ENOSYS; + } + + return 0; +} + +DIRECT_FN +STATIC ssize_t psmx3_rx_size_left(struct fid_ep *ep) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + if (ep_priv->enabled) + return 0x7fffffff; + else + return -FI_EOPBADSTATE; +} + +DIRECT_FN +STATIC ssize_t psmx3_tx_size_left(struct fid_ep *ep) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + if (ep_priv->enabled) + return 0x7fffffff; + else + return -FI_EOPBADSTATE; +} + +static struct fi_ops psmx3_fi_ops = { + .size = sizeof(struct fi_ops), + .close = psmx3_ep_close, + .bind = psmx3_ep_bind, + .control = psmx3_ep_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_ep 
psmx3_ep_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = psmx3_ep_cancel, + .getopt = psmx3_ep_getopt, + .setopt = psmx3_ep_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, + .rx_size_left = psmx3_rx_size_left, + .tx_size_left = psmx3_tx_size_left, +}; + +int psmx3_ep_open_internal(struct psmx3_fid_domain *domain_priv, + struct fi_info *info, + struct psmx3_fid_ep **ep_out, void *context, + struct psmx3_trx_ctxt *trx_ctxt, + int usage_flags) +{ + struct psmx3_fid_ep *ep_priv; + uint64_t ep_cap; + int err = -FI_EINVAL; + + if (info) + ep_cap = info->caps; + else + ep_cap = FI_TAGGED; + + ep_priv = (struct psmx3_fid_ep *) calloc(1, sizeof *ep_priv); + if (!ep_priv) { + err = -FI_ENOMEM; + goto errout; + } + + ep_priv->ep.fid.fclass = FI_CLASS_EP; + ep_priv->ep.fid.context = context; + ep_priv->ep.fid.ops = &psmx3_fi_ops; + ep_priv->ep.ops = &psmx3_ep_ops; + ep_priv->ep.cm = &psmx3_cm_ops; + ep_priv->domain = domain_priv; + if (usage_flags & PSMX3_RX) { + ep_priv->rx = trx_ctxt; + if (trx_ctxt) + trx_ctxt->ep = ep_priv; /* only used by RMA target */ + } + if (usage_flags & PSMX3_TX) + ep_priv->tx = trx_ctxt; + ofi_atomic_initialize32(&ep_priv->ref, 0); + + PSMX3_CTXT_TYPE(&ep_priv->nocomp_send_context) = PSMX3_NOCOMP_SEND_CONTEXT; + PSMX3_CTXT_EP(&ep_priv->nocomp_send_context) = ep_priv; + PSMX3_CTXT_TYPE(&ep_priv->nocomp_tsend_context) = PSMX3_NOCOMP_TSEND_CONTEXT; + PSMX3_CTXT_EP(&ep_priv->nocomp_tsend_context) = ep_priv; + + if (ep_cap & FI_TAGGED) + ep_priv->ep.tagged = &psmx3_tagged_ops; + if (ep_cap & FI_MSG) + ep_priv->ep.msg = &psmx3_msg_ops; + if (ep_cap & FI_RMA) + ep_priv->ep.rma = &psmx3_rma_ops; + if (ep_cap & FI_ATOMICS) + ep_priv->ep.atomic = &psmx3_atomic_ops; + + ep_priv->caps = ep_cap; + + err = psmx3_domain_enable_ep(domain_priv, ep_priv); + if (err) + goto errout_free_ep; + + psmx3_domain_acquire(domain_priv); + + if (info) { + if (info->tx_attr) + ep_priv->tx_flags = info->tx_attr->op_flags; + if (info->rx_attr) + ep_priv->rx_flags = info->rx_attr->op_flags; + } + + psmx3_ep_optimize_ops(ep_priv); + + PSMX3_EP_INIT_OP_CONTEXT(ep_priv); + if ((ep_cap & FI_TRIGGER) && trx_ctxt) + trx_ctxt->am_progress = 1; + + *ep_out = ep_priv; + return 0; + +errout_free_ep: + free(ep_priv); + +errout: + return err; +} + +DIRECT_FN +int psmx3_ep_open(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context) +{ + struct psmx3_fid_domain *domain_priv; + struct psmx3_fid_ep *ep_priv; + struct psmx3_ep_name ep_name; + struct psmx3_ep_name *src_addr; + struct psmx3_trx_ctxt *trx_ctxt = NULL; + int err = -FI_EINVAL; + int usage_flags = PSMX3_TX_RX; + uint8_t *uuid = NULL; + + domain_priv = container_of(domain, struct psmx3_fid_domain, + util_domain.domain_fid.fid); + if (!domain_priv) + goto errout; + + if (info && info->ep_attr && + info->ep_attr->rx_ctx_cnt == FI_SHARED_CONTEXT) + return -FI_ENOSYS; + + if (info && info->ep_attr && + info->ep_attr->tx_ctx_cnt == FI_SHARED_CONTEXT) + usage_flags &= ~PSMX3_TX; + + if (info && !ofi_send_allowed(info->caps) && + !ofi_rma_initiate_allowed(info->caps)) + usage_flags &= ~PSMX3_TX; + + if (info && !ofi_recv_allowed(info->caps) && + !ofi_rma_target_allowed(info->caps)) + usage_flags &= ~PSMX3_RX; + + src_addr = NULL; + if (info && info->src_addr) { + if (info->addr_format == FI_ADDR_STR) + src_addr = psmx3_string_to_ep_name(info->src_addr); + else + src_addr = info->src_addr; + } + if (!psmx3_override_uuid() && info && info->domain_attr && info->domain_attr->auth_key) { + if 
(info->domain_attr->auth_key_size != sizeof(psm2_uuid_t)) { + FI_WARN(&psmx3_prov, FI_LOG_EP_CTRL, + "Invalid domain auth_key_len %"PRIu64 + ", should be %"PRIu64".\n", + info->domain_attr->auth_key_size, + sizeof(psm2_uuid_t)); + goto errout; + } + uuid = info->domain_attr->auth_key; + } + + if (!psmx3_override_uuid() && info && info->ep_attr && info->ep_attr->auth_key) { + if (info->ep_attr->auth_key_size != sizeof(psm2_uuid_t)) { + FI_WARN(&psmx3_prov, FI_LOG_EP_CTRL, + "Invalid ep auth_key_len %"PRIu64 + ", should be %"PRIu64".\n", + info->ep_attr->auth_key_size, + sizeof(psm2_uuid_t)); + goto errout; + } + uuid = info->ep_attr->auth_key; + } + + /* If override is true, the FI_PSM3_UUID was set to override other uuid */ + if (psmx3_override_uuid()) { + uuid = domain_priv->fabric->uuid; + } + + if (usage_flags) { + trx_ctxt = psmx3_trx_ctxt_alloc(domain_priv, src_addr, -1, + usage_flags, uuid); + if (!trx_ctxt) + goto errout; + } else { + FI_INFO(&psmx3_prov, FI_LOG_EP_CTRL, + "Tx only endpoint with STX context.\n"); + } + + err = psmx3_ep_open_internal(domain_priv, info, &ep_priv, context, + trx_ctxt, usage_flags); + if (err) + goto errout_free_ctxt; + + ep_priv->type = PSMX3_EP_REGULAR; + ep_priv->service = PSMX3_ANY_SERVICE; + if (src_addr) { + ep_priv->service = src_addr->service; + if (info->addr_format == FI_ADDR_STR) + free(src_addr); + } + + if (ep_priv->service == PSMX3_ANY_SERVICE) + ep_priv->service = ((getpid() & 0x7FFF) << 16) + + ((uintptr_t)ep_priv & 0xFFFF); + + if (usage_flags) { + ep_name.epid = trx_ctxt->psm2_epid; + ep_name.type = ep_priv->type; + + ofi_ns_add_local_name(&domain_priv->fabric->name_server, + &ep_priv->service, &ep_name); + } + + *ep = &ep_priv->ep; + return 0; + +errout_free_ctxt: + psmx3_trx_ctxt_free(trx_ctxt, usage_flags); + +errout: + return err; +} + +/* + * Shared tx context + */ + +static int psmx3_stx_close(fid_t fid) +{ + struct psmx3_fid_stx *stx; + + stx = container_of(fid, struct psmx3_fid_stx, stx.fid); + + if (ofi_atomic_get32(&stx->ref)) + return -FI_EBUSY; + + psmx3_trx_ctxt_free(stx->tx, PSMX3_TX); + psmx3_domain_release(stx->domain); + free(stx); + return 0; +} + +static struct fi_ops psmx3_fi_ops_stx = { + .size = sizeof(struct fi_ops), + .close = psmx3_stx_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_ep psmx3_stx_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = fi_no_cancel, + .getopt = fi_no_getopt, + .setopt = fi_no_setopt, + .tx_ctx = fi_no_tx_ctx, + .rx_ctx = fi_no_rx_ctx, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, +}; + +DIRECT_FN +int psmx3_stx_ctx(struct fid_domain *domain, struct fi_tx_attr *attr, + struct fid_stx **stx, void *context) +{ + struct psmx3_fid_domain *domain_priv; + struct psmx3_trx_ctxt *trx_ctxt; + struct psmx3_fid_stx *stx_priv; + int err = -FI_EINVAL; + + domain_priv = container_of(domain, struct psmx3_fid_domain, + util_domain.domain_fid.fid); + if (!domain_priv) + goto errout; + + stx_priv = (struct psmx3_fid_stx *) calloc(1, sizeof *stx_priv); + if (!stx_priv) { + err = -FI_ENOMEM; + goto errout; + } + + /* no auth_key is provided, use NULL to pick the default uuid */ + trx_ctxt = psmx3_trx_ctxt_alloc(domain_priv, NULL, -1, PSMX3_TX, + NULL); + if (!trx_ctxt) { + err = -FI_ENOMEM; + goto errout_free_stx; + } + + psmx3_domain_acquire(domain_priv); + stx_priv->stx.fid.fclass = FI_CLASS_STX_CTX; + stx_priv->stx.fid.context = context; + stx_priv->stx.fid.ops = &psmx3_fi_ops_stx; + stx_priv->stx.ops 
= &psmx3_stx_ops; + stx_priv->domain = domain_priv; + stx_priv->tx = trx_ctxt; + ofi_atomic_initialize32(&stx_priv->ref, 0); + + *stx = &stx_priv->stx; + return 0; + +errout_free_stx: + free(stx_priv); + +errout: + return err; +} + +/* + * Scalable endpoint + */ + +static int psmx3_sep_close(fid_t fid) +{ + struct psmx3_fid_sep *sep; + struct psmx3_ep_name ep_name; + int i; + + sep = container_of(fid, struct psmx3_fid_sep, ep.fid); + + if (ofi_atomic_get32(&sep->ref)) + return -FI_EBUSY; + + for (i = 0; i < sep->ctxt_cnt; i++) { + if (sep->ctxts[i].ep && ofi_atomic_get32(&sep->ctxts[i].ep->ref)) + return -FI_EBUSY; + } + + ep_name.epid = sep->ctxts[0].trx_ctxt->psm2_epid; + ep_name.sep_id = sep->id; + ep_name.type = sep->type; + + ofi_ns_del_local_name(&sep->domain->fabric->name_server, + &sep->service, &ep_name); + + for (i = 0; i < sep->ctxt_cnt; i++) { + psmx3_trx_ctxt_free(sep->ctxts[i].trx_ctxt, PSMX3_TX_RX); + + if (sep->ctxts[i].ep) + psmx3_ep_close_internal(sep->ctxts[i].ep); + } + + sep->domain->sep_lock_fn(&sep->domain->sep_lock, 1); + dlist_remove(&sep->entry); + sep->domain->sep_unlock_fn(&sep->domain->sep_lock, 1); + + psmx3_domain_release(sep->domain); + free(sep); + return 0; +} + +static int psmx3_sep_control(fid_t fid, int command, void *arg) +{ + struct psmx3_fid_sep *sep; + + sep = container_of(fid, struct psmx3_fid_sep, ep.fid); + + switch (command) { + case FI_ENABLE: + sep->enabled = 1; + return 0; + + default: + return -FI_ENOSYS; + } + + return 0; +} + +DIRECT_FN +STATIC int psmx3_sep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct psmx3_fid_sep *sep; + int i, err = 0; + + sep = container_of(fid, struct psmx3_fid_sep, ep.fid); + + for (i = 0; i < sep->ctxt_cnt; i++) { + err = psmx3_ep_bind(&sep->ctxts[i].ep->ep.fid, bfid, flags); + if (err) + break; + } + + return err; +} + +DIRECT_FN +STATIC int psmx3_tx_context(struct fid_ep *ep, int index, struct fi_tx_attr *attr, + struct fid_ep **tx_ep, void *context) +{ + struct psmx3_fid_sep *sep; + + sep = container_of(ep, struct psmx3_fid_sep, ep); + + assert(index >= 0 && index < sep->ctxt_cnt); + + *tx_ep = &sep->ctxts[index].ep->ep; + return 0; +} + +DIRECT_FN +STATIC int psmx3_rx_context(struct fid_ep *ep, int index, struct fi_rx_attr *attr, + struct fid_ep **rx_ep, void *context) +{ + struct psmx3_fid_sep *sep; + + sep = container_of(ep, struct psmx3_fid_sep, ep); + + assert(index >= 0 && index < sep->ctxt_cnt); + + *rx_ep = &sep->ctxts[index].ep->ep; + return 0; +} + +static int psmx3_sep_ctxt_close(fid_t fid) +{ + struct psmx3_fid_ep *ep; + + ep = container_of(fid, struct psmx3_fid_ep, ep.fid); + + if (ep->base_ep) + ofi_atomic_dec32(&ep->base_ep->ref); + + return 0; +} + +static struct fi_ops psmx3_fi_ops_sep_ctxt = { + .size = sizeof(struct fi_ops), + .close = psmx3_sep_ctxt_close, + .bind = psmx3_ep_bind, + .control = psmx3_ep_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops psmx3_fi_ops_sep = { + .size = sizeof(struct fi_ops), + .close = psmx3_sep_close, + .bind = psmx3_sep_bind, + .control = psmx3_sep_control, + .ops_open = fi_no_ops_open, +}; + +static struct fi_ops_ep psmx3_sep_ops = { + .size = sizeof(struct fi_ops_ep), + .cancel = fi_no_cancel, + .getopt = fi_no_getopt, + .setopt = fi_no_setopt, + .tx_ctx = psmx3_tx_context, + .rx_ctx = psmx3_rx_context, + .rx_size_left = fi_no_rx_size_left, + .tx_size_left = fi_no_tx_size_left, +}; + +DIRECT_FN +int psmx3_sep_open(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **sep, void *context) +{ + struct 
psmx3_fid_domain *domain_priv; + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_sep *sep_priv; + struct psmx3_ep_name ep_name; + struct psmx3_ep_name *src_addr; + struct psmx3_trx_ctxt *trx_ctxt; + size_t ctxt_cnt = 1; + size_t ctxt_size; + int err = -FI_EINVAL; + uint8_t *uuid = NULL; + int i; + + domain_priv = container_of(domain, struct psmx3_fid_domain, + util_domain.domain_fid.fid); + if (!domain_priv) + goto errout; + + if (!psmx3_override_uuid() && info && info->domain_attr && info->domain_attr->auth_key) { + if (info->domain_attr->auth_key_size != sizeof(psm2_uuid_t)) { + FI_WARN(&psmx3_prov, FI_LOG_EP_CTRL, + "Invalid domain auth_key_len %"PRIu64 + ", should be %"PRIu64".\n", + info->domain_attr->auth_key_size, + sizeof(psm2_uuid_t)); + goto errout; + } + uuid = info->domain_attr->auth_key; + } + + if (info && info->ep_attr) { + if (!psmx3_override_uuid() && info->ep_attr->auth_key) { + if (info->ep_attr->auth_key_size != sizeof(psm2_uuid_t)) { + FI_WARN(&psmx3_prov, FI_LOG_EP_CTRL, + "Invalid ep auth_key_len %"PRIu64 + ", should be %"PRIu64".\n", + info->ep_attr->auth_key_size, + sizeof(psm2_uuid_t)); + goto errout; + } + uuid = info->ep_attr->auth_key; + } + + if (info->ep_attr->tx_ctx_cnt > psmx3_hfi_info.max_trx_ctxt) { + FI_WARN(&psmx3_prov, FI_LOG_EP_CTRL, + "tx_ctx_cnt %"PRIu64" exceed limit %d.\n", + info->ep_attr->tx_ctx_cnt, + psmx3_hfi_info.max_trx_ctxt); + goto errout; + } + if (info->ep_attr->rx_ctx_cnt > psmx3_hfi_info.max_trx_ctxt) { + FI_WARN(&psmx3_prov, FI_LOG_EP_CTRL, + "rx_ctx_cnt %"PRIu64" exceed limit %d.\n", + info->ep_attr->rx_ctx_cnt, + psmx3_hfi_info.max_trx_ctxt); + goto errout; + } + ctxt_cnt = info->ep_attr->tx_ctx_cnt; + if (ctxt_cnt < info->ep_attr->rx_ctx_cnt) + ctxt_cnt = info->ep_attr->rx_ctx_cnt; + if (ctxt_cnt == 0) { + FI_INFO(&psmx3_prov, FI_LOG_EP_CTRL, + "tx_ctx_cnt and rx_ctx_cnt are 0, use 1.\n"); + ctxt_cnt = 1; + } + } + + /* If override is true, the FI_PSM3_UUID was set to override other uuid */ + if (psmx3_override_uuid()) { + uuid = domain_priv->fabric->uuid; + } + + ctxt_size = ctxt_cnt * sizeof(struct psmx3_sep_ctxt); + sep_priv = (struct psmx3_fid_sep *) calloc(1, sizeof(*sep_priv) + ctxt_size); + if (!sep_priv) { + err = -FI_ENOMEM; + goto errout; + } + + sep_priv->ep.fid.fclass = FI_CLASS_SEP; + sep_priv->ep.fid.context = context; + sep_priv->ep.fid.ops = &psmx3_fi_ops_sep; + sep_priv->ep.ops = &psmx3_sep_ops; + sep_priv->ep.cm = &psmx3_cm_ops; + sep_priv->domain = domain_priv; + sep_priv->ctxt_cnt = ctxt_cnt; + ofi_atomic_initialize32(&sep_priv->ref, 0); + + src_addr = NULL; + if (info && info->src_addr) { + if (info->addr_format == FI_ADDR_STR) + src_addr = psmx3_string_to_ep_name(info->src_addr); + else + src_addr = info->src_addr; + } + + for (i = 0; i < ctxt_cnt; i++) { + trx_ctxt = psmx3_trx_ctxt_alloc(domain_priv, src_addr, + (ctxt_cnt > 1) ? 
i : -1, + PSMX3_TX_RX, uuid); + if (!trx_ctxt) { + err = -FI_ENOMEM; + goto errout_free_ctxt; + } + + sep_priv->ctxts[i].trx_ctxt = trx_ctxt; + + err = psmx3_ep_open_internal(domain_priv, info, &ep_priv, context, + trx_ctxt, PSMX3_TX_RX); + if (err) + goto errout_free_ctxt; + + /* override the ops so the fid can't be closed individually */ + ep_priv->ep.fid.ops = &psmx3_fi_ops_sep_ctxt; + + sep_priv->ctxts[i].ep = ep_priv; + } + + sep_priv->type = PSMX3_EP_SCALABLE; + sep_priv->service = PSMX3_ANY_SERVICE; + if (src_addr) { + sep_priv->service = src_addr->service; + if (info->addr_format == FI_ADDR_STR) + free(src_addr); + } + + if (sep_priv->service == PSMX3_ANY_SERVICE) + sep_priv->service = ((getpid() & 0x7FFF) << 16) + + ((uintptr_t)sep_priv & 0xFFFF); + + sep_priv->id = ofi_atomic_inc32(&domain_priv->sep_cnt); + for (i = 0; i < ctxt_cnt; i++) + sep_priv->ctxts[i].ep->sep_id = sep_priv->id; + + domain_priv->sep_lock_fn(&domain_priv->sep_lock, 1); + dlist_insert_before(&sep_priv->entry, &domain_priv->sep_list); + domain_priv->sep_unlock_fn(&domain_priv->sep_lock, 1); + + ep_name.epid = sep_priv->ctxts[0].trx_ctxt->psm2_epid; + ep_name.sep_id = sep_priv->id; + ep_name.type = sep_priv->type; + + ofi_ns_add_local_name(&domain_priv->fabric->name_server, + &sep_priv->service, &ep_name); + + psmx3_domain_acquire(domain_priv); + *sep = &sep_priv->ep; + + /* Make sure the AM handler is installed to answer SEP query */ + psmx3_am_init(sep_priv->ctxts[0].trx_ctxt); + + return 0; + +errout_free_ctxt: + while (i) { + if (sep_priv->ctxts[i].trx_ctxt) + psmx3_trx_ctxt_free(sep_priv->ctxts[i].trx_ctxt, + PSMX3_TX_RX); + + if (sep_priv->ctxts[i].ep) + psmx3_ep_close_internal(sep_priv->ctxts[i].ep); + + i--; + } + + free(sep_priv); + +errout: + return err; +} diff --git a/prov/psm3/src/psmx3_fabric.c b/prov/psm3/src/psmx3_fabric.c new file mode 100644 index 00000000000..951aedc7f69 --- /dev/null +++ b/prov/psm3/src/psmx3_fabric.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psmx3.h" + +extern int psmx3_trx_ctxt_cnt; +struct psmx3_fid_fabric *psmx3_active_fabric = NULL; + +static int psmx3_fabric_close(fid_t fid) +{ + struct psmx3_fid_fabric *fabric; + + fabric = container_of(fid, struct psmx3_fid_fabric, + util_fabric.fabric_fid.fid); + + psmx3_fabric_release(fabric); + + FI_INFO(&psmx3_prov, FI_LOG_CORE, "refcnt=%d\n", + ofi_atomic_get32(&fabric->util_fabric.ref)); + + if (ofi_fabric_close(&fabric->util_fabric)) + return 0; + + if (psmx3_env.name_server) + ofi_ns_stop_server(&fabric->name_server); + + fastlock_destroy(&fabric->domain_lock); + assert(fabric == psmx3_active_fabric); + psmx3_active_fabric = NULL; + free(fabric); + + psmx3_atomic_global_fini(); + return 0; +} + +static struct fi_ops psmx3_fabric_fi_ops = { + .size = sizeof(struct fi_ops), + .close = psmx3_fabric_close, +}; + +static struct fi_ops_fabric psmx3_fabric_ops = { + .size = sizeof(struct fi_ops_fabric), + .domain = psmx3_domain_open, + .passive_ep = fi_no_passive_ep, + .eq_open = ofi_eq_create, + .wait_open = psmx3_wait_open, + .trywait = psmx3_wait_trywait +}; + +static struct fi_fabric_attr psmx3_fabric_attr = { + .name = PSMX3_FABRIC_NAME, + .prov_version = OFI_VERSION_DEF_PROV, +}; + +int psmx3_fabric(struct fi_fabric_attr *attr, + struct fid_fabric **fabric, void *context) +{ + struct psmx3_fid_fabric *fabric_priv; + int ret; + + FI_INFO(&psmx3_prov, FI_LOG_CORE, "\n"); + + if (strcmp(attr->name, PSMX3_FABRIC_NAME)) + return -FI_ENODATA; + + if (psmx3_active_fabric) { + psmx3_fabric_acquire(psmx3_active_fabric); + *fabric = &psmx3_active_fabric->util_fabric.fabric_fid; + return 0; + } + + fabric_priv = calloc(1, sizeof(*fabric_priv)); + if (!fabric_priv) + return -FI_ENOMEM; + + fastlock_init(&fabric_priv->domain_lock); + dlist_init(&fabric_priv->domain_list); + + psmx3_get_uuid(fabric_priv->uuid); + if (psmx3_env.name_server) { + fabric_priv->name_server.port = psmx3_uuid_to_port(fabric_priv->uuid); + fabric_priv->name_server.name_len = sizeof(struct psmx3_ep_name); + fabric_priv->name_server.service_len = sizeof(int); + fabric_priv->name_server.service_cmp = psmx3_ns_service_cmp; + fabric_priv->name_server.is_service_wildcard = psmx3_ns_is_service_wildcard; + + ofi_ns_init(&fabric_priv->name_server); + ofi_ns_start_server(&fabric_priv->name_server); + } + + ret = ofi_fabric_init(&psmx3_prov, &psmx3_fabric_attr, attr, + &fabric_priv->util_fabric, context); + if (ret) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, "ofi_fabric_init returns %d\n", ret); + if (psmx3_env.name_server) + ofi_ns_stop_server(&fabric_priv->name_server); + free(fabric_priv); + return ret; + } + + /* fclass & context initialized in ofi_fabric_init */ + fabric_priv->util_fabric.fabric_fid.fid.ops = &psmx3_fabric_fi_ops; + fabric_priv->util_fabric.fabric_fid.ops = &psmx3_fabric_ops; + + psmx3_atomic_global_init(); + psmx3_query_mpi(); + + /* take the reference to count for multiple fabric open calls */ + psmx3_fabric_acquire(fabric_priv); + + *fabric = &fabric_priv->util_fabric.fabric_fid; + psmx3_active_fabric = fabric_priv; + psmx3_trx_ctxt_cnt = 0; + + return 0; +} + diff --git a/prov/psm3/src/psmx3_init.c b/prov/psm3/src/psmx3_init.c new file mode 100644 index 00000000000..b6f615435fc --- /dev/null +++ b/prov/psm3/src/psmx3_init.c @@ -0,0 +1,743 @@ +/* + * Copyright (c) 2013-2020 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ofi_prov.h" +#include "psmx3.h" +#include +#include + +static int psmx3_init_count = 0; +static int psmx3_lib_initialized = 0; +static pthread_mutex_t psmx3_lib_mutex; + +struct psmx3_hfi_info psmx3_hfi_info; + +struct psmx3_env psmx3_env = { + .name_server = 1, + .tagged_rma = 1, + .uuid = PSMX3_DEFAULT_UUID, + .uuid_override = 0, + .delay = 0, + .timeout = 10, + .conn_timeout = 10, + .prog_interval = -1, + .prog_affinity = NULL, + .multi_ep = 0, + .inject_size = 64, + .lock_level = 2, + .lazy_conn = 0, + .disconnect = 0, +#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME) + .tag_layout = "auto", +#endif +}; + +#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME) +uint64_t psmx3_tag_mask; +uint32_t psmx3_tag_upper_mask; +uint32_t psmx3_data_mask; +int psmx3_flags_idx; +int psmx3_tag_layout_locked = 0; +#endif + +static void psmx3_init_env(void) +{ + uint32_t uid = getuid(); + char *uuid = NULL; + + if (getenv("OMPI_COMM_WORLD_RANK") || getenv("PMI_RANK") || getenv("PMIX_RANK")) + psmx3_env.name_server = 0; + + fi_param_get_bool(&psmx3_prov, "name_server", &psmx3_env.name_server); + fi_param_get_bool(&psmx3_prov, "tagged_rma", &psmx3_env.tagged_rma); + + if (FI_SUCCESS != fi_param_get_str(&psmx3_prov, "uuid", &psmx3_env.uuid)) { + /* + * For OpenMPI 4.x only: + * The job key is passed via the environment variable, but the format + * is different. Perform format conversion and use it as the default + * uuid. This will be overridden if FI_PSM3_UUID is set. 
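+		 *
+		 * For illustration (hypothetical value): a job key such as
+		 * "0123456789abcdef-fedcba9876543210" is parsed by the
+		 * sscanf("%016llx-%016llx") call below into two 64-bit words
+		 * that fill the 16-byte psm2_uuid_t used as the default uuid.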
+ */ + psm2_uuid_t ompi_uuid = {}; + unsigned long long int *u = (unsigned long long int *)ompi_uuid; + char *ompi_job_key = getenv("OMPI_MCA_orte_precondition_transports"); + if (ompi_job_key) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "Open MPI job key: %s.\n", ompi_job_key); + if (sscanf(ompi_job_key, "%016llx-%016llx", &u[0], &u[1]) == 2) + uuid = strdup(psmx3_uuid_to_string(ompi_uuid)); + else { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "Invalid Open MPI job key format.\n"); + } + } + + /* Set Default UUID if none supplied through environment variable */ + if (!uuid) { /* If ompi_job_key is not set or invalid */ + uuid = strdup(PSMX3_DEFAULT_UUID); + if (uuid) { + /* fill in uid as bytes 9-11 (XXXX-XXXX) in format: + * xxxxxxxx-xxxx-XXXX-XXXX-xxxxxxxxxxxx + */ + snprintf(&uuid[14], 10, "%02hhX%02hhX-%02hhX%02hhX", + (uid >> 24) & 0xff, (uid >> 16) & 0xff, + (uid >> 8) & 0xff, uid & 0xff); + uuid[23] = '-'; /* restore */ + } + } + psmx3_env.uuid = uuid; + } else { + /* FI_PSM3_UUID has highest priority, so it can override auth_key from fi_info */ + psmx3_env.uuid_override = 1; + } + fi_param_get_int(&psmx3_prov, "delay", &psmx3_env.delay); + fi_param_get_int(&psmx3_prov, "timeout", &psmx3_env.timeout); + fi_param_get_int(&psmx3_prov, "prog_interval", &psmx3_env.prog_interval); + fi_param_get_str(&psmx3_prov, "prog_affinity", &psmx3_env.prog_affinity); + fi_param_get_int(&psmx3_prov, "inject_size", &psmx3_env.inject_size); + fi_param_get_bool(&psmx3_prov, "lock_level", &psmx3_env.lock_level); + fi_param_get_bool(&psmx3_prov, "lazy_conn", &psmx3_env.lazy_conn); + if (psmx3_env.lazy_conn) + psmx3_env.conn_timeout = 30; // more headroom since app may be busy + fi_param_get_int(&psmx3_prov, "conn_timeout", &psmx3_env.conn_timeout); + fi_param_get_bool(&psmx3_prov, "disconnect", &psmx3_env.disconnect); +#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME) + fi_param_get_str(&psmx3_prov, "tag_layout", &psmx3_env.tag_layout); +#endif +} + +void psmx3_init_tag_layout(struct fi_info *info) +{ + int use_tag64; + +#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME) + use_tag64 = (psmx3_tag_mask == PSMX3_TAG_MASK_64); + + if (psmx3_tag_layout_locked) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "tag layout already set opened domain.\n"); + goto out; + } + + if (strcasecmp(psmx3_env.tag_layout, "tag60") == 0) { + psmx3_tag_upper_mask = PSMX3_TAG_UPPER_MASK_60; + psmx3_tag_mask = PSMX3_TAG_MASK_60; + psmx3_data_mask = PSMX3_DATA_MASK_60; + psmx3_flags_idx = PSMX3_FLAGS_IDX_60; + use_tag64 = 0; + } else if (strcasecmp(psmx3_env.tag_layout, "tag64") == 0) { + psmx3_tag_upper_mask = PSMX3_TAG_UPPER_MASK_64; + psmx3_tag_mask = PSMX3_TAG_MASK_64; + psmx3_data_mask = PSMX3_DATA_MASK_64; + psmx3_flags_idx = PSMX3_FLAGS_IDX_64; + use_tag64 = 1; + } else { + if (strcasecmp(psmx3_env.tag_layout, "auto") != 0) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "Invalid tag layout '%s', using 'auto'.\n", + psmx3_env.tag_layout); + psmx3_env.tag_layout = "auto"; + } + if ((info->caps & (FI_TAGGED | FI_MSG)) && + info->domain_attr->cq_data_size) { + psmx3_tag_upper_mask = PSMX3_TAG_UPPER_MASK_60; + psmx3_tag_mask = PSMX3_TAG_MASK_60; + psmx3_data_mask = PSMX3_DATA_MASK_60; + psmx3_flags_idx = PSMX3_FLAGS_IDX_60; + use_tag64 = 0; + } else { + psmx3_tag_upper_mask = PSMX3_TAG_UPPER_MASK_64; + psmx3_tag_mask = PSMX3_TAG_MASK_64; + psmx3_data_mask = PSMX3_DATA_MASK_64; + psmx3_flags_idx = PSMX3_FLAGS_IDX_64; + use_tag64 = 1; + } + } + + psmx3_tag_layout_locked = 1; +out: +#elif (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_TAG64) + use_tag64 = 1; 
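+	/*
+	 * For reference: the 96-bit PSM3 tag is split either as 32/4/60
+	 * (data/flags/tag, "tag60") or as 4/28/64 (flags/data/tag, "tag64").
+	 * With the runtime-selectable layout above, "auto" chooses tag60 only
+	 * when the hints suggest remote CQ data may be used (cq_data_size != 0
+	 * together with FI_TAGGED or FI_MSG caps), and tag64 otherwise.
+	 */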
+#else + use_tag64 = 0; +#endif + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "use %s: tag_mask: %016" PRIX64 ", data_mask: %08" PRIX32 "\n", + use_tag64 ? "tag64" : "tag60", (uint64_t)PSMX3_TAG_MASK, + PSMX3_DATA_MASK); +} + +static int psmx3_get_yes_no(char *s, int default_value) +{ + unsigned long value; + char *end_ptr; + + if (!s || s[0] == '\0') + return default_value; + + if (s[0] == 'Y' || s[0] == 'y') + return 1; + + if (s[0] == 'N' || s[0] == 'n') + return 0; + + value = strtoul(s, &end_ptr, 0); + if (end_ptr == s) + return default_value; + + return value ? 1 : 0; +} + +static int psmx3_check_multi_ep_cap(void) +{ + uint64_t caps = PSM2_MULTI_EP_CAP; + char *s = getenv("PSM3_MULTI_EP"); + + if (psm2_get_capability_mask(caps) == caps && psmx3_get_yes_no(s, 0)) + psmx3_env.multi_ep = 1; + else + psmx3_env.multi_ep = 0; + + return psmx3_env.multi_ep; +} + +static int psmx3_init_lib(void) +{ + int major, minor; + int ret = 0, err; + + if (psmx3_lib_initialized) + return 0; + + pthread_mutex_lock(&psmx3_lib_mutex); + + if (psmx3_lib_initialized) + goto out; + + /* turn on multi-ep feature, but don't overwrite existing setting */ + /*setenv("PSM3_MULTI_EP", "1", 0); - not needed, PSM3 default=1*/ + + psm2_error_register_handler(NULL, PSM2_ERRHANDLER_NO_HANDLER); + + major = PSM2_VERNO_MAJOR; + minor = PSM2_VERNO_MINOR; + + err = psm2_init(&major, &minor); + if (err != PSM2_OK) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "psm2_init failed: %s\n", psm2_error_get_string(err)); + ret = err; + goto out; + } + + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "PSM3 header version = (%d, %d)\n", PSM2_VERNO_MAJOR, PSM2_VERNO_MINOR); + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "PSM3 library version = (%d, %d)\n", major, minor); + + if (psmx3_check_multi_ep_cap()) + FI_INFO(&psmx3_prov, FI_LOG_CORE, "PSM3 multi-ep feature enabled.\n"); + else + FI_INFO(&psmx3_prov, FI_LOG_CORE, "PSM3 multi-ep feature not available or disabled.\n"); + + psmx3_lib_initialized = 1; + +out: + pthread_mutex_unlock(&psmx3_lib_mutex); + return ret; +} + +static int psmx3_update_hfi_info(void) +{ + unsigned short i; + int nctxts = 0; + int nfreectxts = 0; + int hfi_unit = -1; + char *hfi_name = NULL; + int multirail = 0; + char *s; + char unit_name[NAME_MAX]; + uint32_t cnt = 0; + int tmp_nctxts, tmp_nfreectxts; + int unit_active; + int ret; + psm2_info_query_arg_t args[2]; + + args[1].length = sizeof(unit_name); + + if (psmx3_hfi_info.num_units > 0) + return 0; + + if (psm2_info_query(PSM2_INFO_QUERY_NUM_UNITS, &cnt, 0, NULL) || !cnt) + { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "no PSM3 device is found.\n"); + return -FI_ENODEV; + } + psmx3_hfi_info.num_units = cnt; + + assert(psmx3_hfi_info.num_units <= PSMX3_MAX_UNITS); + + s = getenv("PSM3_NIC"); + if (s && *s) { + if (0 == strcasecmp(s, "any")) { + hfi_unit = -1; + } else { + char *p; + long l = strtol(s, &p, 10); + if (p && *p == '\0') + hfi_unit = (int)l; // consumed all of string as a number + else + hfi_name = s; // name specified + } + } + + s = getenv("PSM3_MULTIRAIL"); + if (s) + multirail = atoi(s); + + psmx3_hfi_info.num_active_units = 0; + for (i = 0; i < psmx3_hfi_info.num_units; i++) { + args[0].unit = i; + ret = psm2_info_query(PSM2_INFO_QUERY_UNIT_STATUS, &unit_active, 1, args); + if (ret != PSM2_OK) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "Failed to check active state of HFI unit %d\n", + i); + continue; + } + + if (unit_active<=0) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "NIC %d STATE = INACTIVE\n", + i); + continue; + } + + if (hfi_unit >=0 && i != hfi_unit) { + 
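+			/*
+			 * PSM3_NIC selected a specific unit by number and this
+			 * is not it; selection by name is handled further below
+			 * by comparing against unit_name.
+			 */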
FI_INFO(&psmx3_prov, FI_LOG_CORE, + "NIC %d skipped: PSM3_NIC=%d\n", + i, hfi_unit); + continue; + } + + if (PSM2_OK != psm2_info_query(PSM2_INFO_QUERY_NUM_FREE_CONTEXTS, + &tmp_nfreectxts, 1, args) || (tmp_nfreectxts < 0)) + { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "Failed to read number of free contexts from HFI unit %d\n", + i); + continue; + } + + if (PSM2_OK != psm2_info_query(PSM2_INFO_QUERY_NUM_CONTEXTS, + &tmp_nctxts, 1, args) || (tmp_nctxts < 0)) + { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "Failed to read number of contexts from HFI unit %d\n", + i); + continue; + } + + if (PSM2_OK != psm2_info_query(PSM2_INFO_QUERY_UNIT_NAME, + unit_name, 2, args)) + { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "Failed to read name of HFI unit %d\n", + i); + continue; + } + if (hfi_name && 0 != strcasecmp(hfi_name, unit_name)) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "NIC %d skipped: PSM3_NIC=%s\n", + i, hfi_name); + continue; + } + + nctxts += tmp_nctxts; + nfreectxts += tmp_nfreectxts; + + psmx3_hfi_info.unit_is_active[i] = 1; + psmx3_hfi_info.unit_nctxts[i] = tmp_nctxts; + psmx3_hfi_info.unit_nfreectxts[i] = tmp_nfreectxts; + psmx3_hfi_info.active_units[psmx3_hfi_info.num_active_units++] = i; + + if (psmx3_hfi_info.num_active_units > 1) + strcat(psmx3_hfi_info.default_domain_name, ";"); + strcat(psmx3_hfi_info.default_domain_name, unit_name); + + if (multirail) + break; + } + + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "hfi1 units: total %d, active %d; " + "hfi1 contexts: total %d, free %d\n", + psmx3_hfi_info.num_units, psmx3_hfi_info.num_active_units, + nctxts, nfreectxts); + + if (psmx3_env.multi_ep) { + psmx3_hfi_info.max_trx_ctxt = nctxts; + psmx3_hfi_info.free_trx_ctxt = nfreectxts; + } else { + psmx3_hfi_info.max_trx_ctxt = 1; + psmx3_hfi_info.free_trx_ctxt = (nfreectxts == 0) ? 0 : 1; + } + + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "Tx/Rx contexts: %d in total, %d available.\n", + psmx3_hfi_info.max_trx_ctxt, psmx3_hfi_info.free_trx_ctxt); + + return 0; +} + +int psmx3_get_round_robin_unit(int idx) +{ + return psmx3_hfi_info.num_active_units ? 
+ psmx3_hfi_info.active_units[idx % psmx3_hfi_info.num_active_units] : + -1; +} + +static void psmx3_update_hfi_nic_info(struct fi_info *info) +{ + char *path; + char buffer[PATH_MAX]; + char *s; + ssize_t n; + unsigned int a, b, c, d; + int unit; + char sys_dev_path[PATH_MAX]; + psm2_info_query_arg_t args[2]; + args[1].length=sizeof(sys_dev_path); + + for ( ; info; info = info->next) { + unit = ((struct psmx3_ep_name *)info->src_addr)->unit; + + if (unit == PSMX3_DEFAULT_UNIT) + continue; + + if (!info->nic) { + info->nic = ofi_nic_dup(NULL); + if (!info->nic) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "Failed to allocate nic info for HFI unit %d\n", unit); + continue; + } + } + + args[0].unit = unit; + if ((PSM2_OK != psm2_info_query(PSM2_INFO_QUERY_UNIT_SYS_PATH, + sys_dev_path, 2, args)) || + (asprintf(&path, "%s/%s", sys_dev_path, "device") < 0)) + { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "Failed to read nic info for HFI unit %d\n", unit); + continue; + } + + n = readlink(path, buffer, sizeof(buffer)-1); + free(path); + + if (n < 0) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "Failed to read nic info for HFI unit %d\n", unit); + continue; + } + + buffer[n] = '\0'; + if ((s = strrchr(buffer, '/'))) + s++; + else + s = buffer; + + n = sscanf(s, "%x:%x:%x.%x", &a, &b, &c, &d); + if (n < 4) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "Failed to read nic info for HFI unit %d\n", unit); + continue; + } + + info->nic->bus_attr->bus_type = FI_BUS_PCI; + info->nic->bus_attr->attr.pci.domain_id = (uint16_t) a; + info->nic->bus_attr->attr.pci.bus_id = (uint8_t) b; + info->nic->bus_attr->attr.pci.device_id = (uint8_t) c; + info->nic->bus_attr->attr.pci.function_id = (uint8_t) d; + } +} + +static int psmx3_getinfo(uint32_t api_version, const char *node, + const char *service, uint64_t flags, + const struct fi_info *hints, struct fi_info **info) +{ + struct fi_info *prov_info = NULL; + struct psmx3_ep_name *dest_addr = NULL; + struct psmx3_ep_name *src_addr = NULL; + int svc0, svc = PSMX3_ANY_SERVICE; + size_t len; + void *addr; + uint32_t fmt; + + FI_INFO(&psmx3_prov, FI_LOG_CORE,"\n"); + + if (psmx3_init_prov_info(hints, &prov_info)) + goto err_out; + + if (psmx3_init_lib()) + goto err_out; + + if (psmx3_update_hfi_info()) + goto err_out; + + if (!psmx3_hfi_info.num_active_units) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "no PSM3 device is active.\n"); + goto err_out; + } + + if (hints && hints->domain_attr && hints->domain_attr->name && + NULL == strcasestr(psmx3_hfi_info.default_domain_name, hints->domain_attr->name)) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, "Unknown domain name\n"); + FI_INFO_STRING(&psmx3_prov, psmx3_hfi_info.default_domain_name, + hints->domain_attr->name, "Supported", "Requested"); + goto err_out; + } + + /* Set src or dest to used supplied address in native format */ + if (node && + !ofi_str_toaddr(node, &fmt, &addr, &len) && + fmt == FI_ADDR_PSMX3) { + if (flags & FI_SOURCE) { + src_addr = addr; + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "'%s' is taken as src_addr: \n", + node, src_addr->unit, src_addr->port, src_addr->service); + } else { + dest_addr = addr; + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "'%s' is taken as dest_addr: \n", + node, dest_addr->epid); + } + node = NULL; + } + + /* Initialize src address based on the "host:unit:port" format */ + if (!src_addr) { + src_addr = calloc(1, sizeof(*src_addr)); + if (!src_addr) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "failed to allocate src addr.\n"); + goto err_out; + } + src_addr->type = PSMX3_EP_SRC_ADDR; + src_addr->epid = PSMX3_RESERVED_EPID; 
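+		/*
+		 * The defaults above may be overridden below: with FI_SOURCE,
+		 * the node string can carry the unit and port in
+		 * "host:unit:port" form and the service string a numeric
+		 * service. For example (hypothetical values), node "myhost:0:1"
+		 * with service "3000" selects unit 0, port 1, service 3000.
+		 */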
+ src_addr->unit = PSMX3_DEFAULT_UNIT; + src_addr->port = PSMX3_DEFAULT_PORT; + src_addr->service = PSMX3_ANY_SERVICE; + + if (flags & FI_SOURCE) { + if (node) + sscanf(node, "%*[^:]:%" SCNi8 ":%" SCNu8, + &src_addr->unit, &src_addr->port); + if (service) + sscanf(service, "%" SCNu32, &src_addr->service); + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "node '%s' service '%s' converted to \n", + node, service, src_addr->unit, src_addr->port, src_addr->service); + } + } + + /* Check that the src address contains valid unit */ + if (src_addr->unit != PSMX3_DEFAULT_UNIT) { + if (src_addr->unit < 0 || src_addr->unit >= PSMX3_MAX_UNITS) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "invalid source address: unit %d out of range\n", src_addr->unit); + goto err_out; + } + if (!psmx3_hfi_info.unit_is_active[src_addr->unit]) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "invalid source address: unit %d is inactive\n", src_addr->unit); + goto err_out; + } + } + + /* Resovle dest address using "node", "service" pair */ + if (!dest_addr && node && !(flags & FI_SOURCE)) { + psm2_uuid_t uuid; + + psmx3_get_uuid(uuid); + struct util_ns ns = { + .port = psmx3_uuid_to_port(uuid), + .name_len = sizeof(*dest_addr), + .service_len = sizeof(svc), + .service_cmp = psmx3_ns_service_cmp, + .is_service_wildcard = psmx3_ns_is_service_wildcard, + }; + ofi_ns_init(&ns); + + if (service) + svc = atoi(service); + svc0 = svc; + dest_addr = (struct psmx3_ep_name *) + ofi_ns_resolve_name(&ns, node, &svc); + if (dest_addr) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "'%s:%u' resolved to :%d\n", + node, svc0, dest_addr->epid, svc); + } else { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "failed to resolve '%s:%u'.\n", node, svc); + goto err_out; + } + } + + /* Update prov info with resovled addresses and hfi info */ + psmx3_update_prov_info(prov_info, src_addr, dest_addr); + + /* Remove prov info that don't match the hints */ + if (psmx3_check_prov_info(api_version, hints, &prov_info)) + goto err_out; + + /* Apply hints to the prov info */ + psmx3_alter_prov_info(api_version, hints, prov_info); + + /* Set fi_nic struture */ + psmx3_update_hfi_nic_info(prov_info); + + *info = prov_info; + free(src_addr); + free(dest_addr); + return 0; + +err_out: + free(src_addr); + free(dest_addr); + fi_freeinfo(prov_info); + *info = NULL; + return -FI_ENODATA; +} + +static void psmx3_fini(void) +{ + FI_INFO(&psmx3_prov, FI_LOG_CORE, "\n"); + + if (! --psmx3_init_count && psmx3_lib_initialized) { + /* This function is called from a library destructor, which is called + * automatically when exit() is called. The call to psm2_finalize() + * might cause deadlock if the applicaiton is terminated with Ctrl-C + * -- the application could be inside a PSM3 call, holding a lock that + * psm2_finalize() tries to acquire. This can be avoided by only + * calling psm2_finalize() when PSM3 is guaranteed to be unused. 
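+	 * Here that means skipping psm2_finalize() whenever a fabric instance
+	 * is still open (psmx3_active_fabric != NULL), as checked below.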
+ */ + if (psmx3_active_fabric) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "psmx3_active_fabric != NULL, skip psm2_finalize\n"); + } else { + psm2_finalize(); + psmx3_lib_initialized = 0; + } + } +} + +struct fi_provider psmx3_prov = { + .name = PSMX3_PROV_NAME, + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, + .getinfo = psmx3_getinfo, + .fabric = psmx3_fabric, + .cleanup = psmx3_fini +}; + +PROVIDER_INI +{ + FI_INFO(&psmx3_prov, FI_LOG_CORE, "build options: HAVE_PSM3_SRC=%d, " + "PSMX3_USE_REQ_CONTEXT=%d\n", HAVE_PSM3_SRC, + PSMX3_USE_REQ_CONTEXT); + + fi_param_define(&psmx3_prov, "name_server", FI_PARAM_BOOL, + "Whether to turn on the name server or not " + "(default: yes)"); + + fi_param_define(&psmx3_prov, "tagged_rma", FI_PARAM_BOOL, + "Whether to use tagged messages for large size " + "RMA or not (default: yes)"); + + fi_param_define(&psmx3_prov, "uuid", FI_PARAM_STRING, + "Unique Job ID required by the fabric"); + + fi_param_define(&psmx3_prov, "delay", FI_PARAM_INT, + "Delay (seconds) before finalization (for debugging)"); + + fi_param_define(&psmx3_prov, "timeout", FI_PARAM_INT, + "Timeout (seconds) for gracefully closing the PSM3 endpoint"); + + fi_param_define(&psmx3_prov, "conn_timeout", FI_PARAM_INT, + "Timeout (seconds) for establishing connection between two PSM3 endpoints"); + + fi_param_define(&psmx3_prov, "prog_interval", FI_PARAM_INT, + "Interval (microseconds) between progress calls made in the " + "progress thread (default: 1 if affinity is set, 1000 if not)"); + + fi_param_define(&psmx3_prov, "prog_affinity", FI_PARAM_STRING, + "When set, specify the set of CPU cores to set the progress " + "thread affinity to. The format is " + "[:[:]][,[:[:]]]*, " + "where each triplet :: defines a block " + "of core_ids. Both and can be either the core_id " + "(when >=0) or core_id - num_cores (when <0). " + "(default: affinity not set)"); + + fi_param_define(&psmx3_prov, "inject_size", FI_PARAM_INT, + "Maximum message size for fi_inject and fi_tinject (default: 64)."); + + fi_param_define(&psmx3_prov, "lock_level", FI_PARAM_INT, + "How internal locking is used. 0 means no locking. (default: 2)."); + + fi_param_define(&psmx3_prov, "lazy_conn", FI_PARAM_BOOL, + "Whether to force lazy connection mode. (default: no)."); + + fi_param_define(&psmx3_prov, "disconnect", FI_PARAM_BOOL, + "Whether to issue disconnect request when process ends (default: no)."); + +#if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME) + fi_param_define(&psmx3_prov, "tag_layout", FI_PARAM_STRING, + "How the 96 bit PSM3 tag is organized: " + "tag60 means 32/4/60 for data/flags/tag;" + "tag64 means 4/28/64 for flags/data/tag (default: tag60)."); +#endif + + psmx3_init_env(); + + pthread_mutex_init(&psmx3_lib_mutex, NULL); + psmx3_init_count++; + return (&psmx3_prov); +} + diff --git a/prov/psm3/src/psmx3_mr.c b/prov/psm3/src/psmx3_mr.c new file mode 100644 index 00000000000..cc6533062ee --- /dev/null +++ b/prov/psm3/src/psmx3_mr.c @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx3.h" + +struct psmx3_fid_mr *psmx3_mr_get(struct psmx3_fid_domain *domain, + uint64_t key) +{ + RbtIterator it; + struct psmx3_fid_mr *mr = NULL; + + domain->mr_lock_fn(&domain->mr_lock, 1); + it = rbtFind(domain->mr_map, (void *)key); + if (!it) + goto exit; + + rbtKeyValue(domain->mr_map, it, (void **)&key, (void **)&mr); +exit: + domain->mr_unlock_fn(&domain->mr_lock, 1); + return mr; +} + +static inline void psmx3_mr_release_key(struct psmx3_fid_domain *domain, + uint64_t key) +{ + RbtIterator it; + + domain->mr_lock_fn(&domain->mr_lock, 1); + it = rbtFind(domain->mr_map, (void *)key); + if (it) + rbtErase(domain->mr_map, it); + domain->mr_unlock_fn(&domain->mr_lock, 1); +} + +static int psmx3_mr_reserve_key(struct psmx3_fid_domain *domain, + uint64_t requested_key, + uint64_t *assigned_key, + void *mr) +{ + uint64_t key; + int i; + int try_count; + int err = -FI_ENOKEY; + + domain->mr_lock_fn(&domain->mr_lock, 1); + + if (domain->mr_mode == FI_MR_BASIC) { + key = domain->mr_reserved_key; + try_count = 10000; /* large enough */ + } else { + key = requested_key; + try_count = 1; + } + + for (i=0; imr_map, (void *)key)) { + if (!rbtInsert(domain->mr_map, (void *)key, mr)) { + if (domain->mr_mode == FI_MR_BASIC) + domain->mr_reserved_key = key + 1; + *assigned_key = key; + err = 0; + } + break; + } + } + + domain->mr_unlock_fn(&domain->mr_lock, 1); + + return err; +} + +int psmx3_mr_validate(struct psmx3_fid_mr *mr, uint64_t addr, + size_t len, uint64_t access) +{ + int i; + + addr += mr->offset; + + if (!addr) + return -FI_EINVAL; + + if ((access & mr->access) != access) + return -FI_EACCES; + + for (i = 0; i < mr->iov_count; i++) { + if ((uint64_t)mr->iov[i].iov_base <= addr && + (uint64_t)mr->iov[i].iov_base + mr->iov[i].iov_len >= addr + len) + return 0; + } + + return -FI_EACCES; +} + +static int psmx3_mr_close(fid_t fid) +{ + struct psmx3_fid_mr *mr; + + mr = container_of(fid, struct psmx3_fid_mr, mr.fid); + psmx3_mr_release_key(mr->domain, mr->mr.key); + psmx3_domain_release(mr->domain); + free(mr); + + return 0; +} + +DIRECT_FN +STATIC int psmx3_mr_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct psmx3_fid_mr *mr; + struct psmx3_fid_ep *ep; + struct psmx3_fid_cntr *cntr; + + mr = container_of(fid, struct psmx3_fid_mr, mr.fid); + + 
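+	/*
+	 * An MR can be bound to an endpoint of the same domain or to a
+	 * counter: binding a counter with FI_REMOTE_WRITE lets remote writes
+	 * that target this MR increment the counter (see the mr->cntr update
+	 * in the RMA AM handler).
+	 */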
assert(bfid); + + switch (bfid->fclass) { + case FI_CLASS_EP: + ep = container_of(bfid, struct psmx3_fid_ep, ep.fid); + if (mr->domain != ep->domain) + return -FI_EINVAL; + break; + + case FI_CLASS_CNTR: + cntr = container_of(bfid, struct psmx3_fid_cntr, cntr.fid); + if (mr->cntr && mr->cntr != cntr) + return -FI_EBUSY; + if (mr->domain != cntr->domain) + return -FI_EINVAL; + if (flags) { + if (flags != FI_REMOTE_WRITE) + return -FI_EINVAL; + mr->cntr = cntr; + cntr->poll_all = 1; + } + break; + + default: + return -FI_ENOSYS; + } + + return 0; +} + +DIRECT_FN +STATIC int psmx3_mr_control(fid_t fid, int command, void *arg) +{ + struct psmx3_fid_mr *mr; + struct fi_mr_raw_attr *attr; + + mr = container_of(fid, struct psmx3_fid_mr, mr.fid); + + switch (command) { + case FI_GET_RAW_MR: + attr = arg; + if (!attr) + return -FI_EINVAL; + if (attr->base_addr) + *attr->base_addr = (uint64_t)(uintptr_t)mr->iov[0].iov_base; + if (attr->raw_key) + *(uint64_t *)attr->raw_key = mr->mr.key; + if (attr->key_size) + *attr->key_size = sizeof(uint64_t); + break; + + case FI_REFRESH: + case FI_ENABLE: + /* Nothing to do here */ + break; + + default: + return -FI_ENOSYS; + } + + return 0; +} + +static struct fi_ops psmx3_fi_ops = { + .size = sizeof(struct fi_ops), + .close = psmx3_mr_close, + .bind = psmx3_mr_bind, + .control = psmx3_mr_control, + .ops_open = fi_no_ops_open, +}; + +static void psmx3_mr_normalize_iov(struct iovec *iov, size_t *count) +{ + struct iovec tmp_iov; + int i, j, n, new_len; + uintptr_t iov_end_i, iov_end_j; + + n = *count; + + if (!n) + return; + + /* sort segments by base address */ + for (i = 0; i < n - 1; i++) { + for (j = i + 1; j < n; j++) { + if (iov[i].iov_base > iov[j].iov_base) { + tmp_iov = iov[i]; + iov[i] = iov[j]; + iov[j] = tmp_iov; + } + } + } + + /* merge overlapping segments */ + for (i = 0; i < n - 1; i++) { + if (iov[i].iov_len == 0) + continue; + + for (j = i + 1; j < n; j++) { + if (iov[j].iov_len == 0) + continue; + + iov_end_i = (uintptr_t)iov[i].iov_base + iov[i].iov_len; + iov_end_j = (uintptr_t)iov[j].iov_base + iov[j].iov_len; + if (iov_end_i >= (uintptr_t)iov[j].iov_base) { + new_len = iov_end_j - (uintptr_t)iov[i].iov_base; + if (new_len > iov[i].iov_len) + iov[i].iov_len = new_len; + iov[j].iov_len = 0; + } else { + break; + } + } + } + + /* remove empty segments */ + for (i = 0, j = 1; i < n; i++, j++) { + if (iov[i].iov_len) + continue; + + while (j < n && iov[j].iov_len == 0) + j++; + + if (j >= n) + break; + + iov[i] = iov[j]; + iov[j].iov_len = 0; + } + + *count = i; +} + +DIRECT_FN +STATIC int psmx3_mr_reg(struct fid *fid, const void *buf, size_t len, + uint64_t access, uint64_t offset, uint64_t requested_key, + uint64_t flags, struct fid_mr **mr, void *context) +{ + struct fid_domain *domain; + struct psmx3_fid_domain *domain_priv; + struct psmx3_fid_mr *mr_priv; + uint64_t key; + int err; + + assert(fid->fclass == FI_CLASS_DOMAIN); + + domain = container_of(fid, struct fid_domain, fid); + domain_priv = container_of(domain, struct psmx3_fid_domain, + util_domain.domain_fid); + + mr_priv = (struct psmx3_fid_mr *) calloc(1, sizeof(*mr_priv) + sizeof(struct iovec)); + if (!mr_priv) + return -FI_ENOMEM; + + err = psmx3_mr_reserve_key(domain_priv, requested_key, &key, mr_priv); + if (err) { + free(mr_priv); + return err; + } + + psmx3_domain_acquire(domain_priv); + + mr_priv->mr.fid.fclass = FI_CLASS_MR; + mr_priv->mr.fid.context = context; + mr_priv->mr.fid.ops = &psmx3_fi_ops; + mr_priv->mr.mem_desc = mr_priv; + mr_priv->mr.key = key; + 
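+	/*
+	 * The offset set below lets psmx3_mr_validate() map target addresses
+	 * onto the registered buffer: with FI_MR_BASIC the target uses virtual
+	 * addresses directly (offset 0); otherwise the provider stores
+	 * (iov_base - offset) so that, for example (hypothetical values), a
+	 * buffer at virtual address 0x1000 registered with offset 0 makes a
+	 * remote address of 0 resolve to the start of that buffer.
+	 */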
mr_priv->domain = domain_priv; + mr_priv->access = access; + mr_priv->flags = flags; + mr_priv->iov_count = 1; + mr_priv->iov[0].iov_base = (void *)buf; + mr_priv->iov[0].iov_len = len; + mr_priv->offset = (domain_priv->mr_mode == FI_MR_BASIC) ? 0 : + ((uint64_t)mr_priv->iov[0].iov_base - offset); + + *mr = &mr_priv->mr; + return 0; +} + +DIRECT_FN +STATIC int psmx3_mr_regv(struct fid *fid, + const struct iovec *iov, size_t count, + uint64_t access, uint64_t offset, + uint64_t requested_key, uint64_t flags, + struct fid_mr **mr, void *context) +{ + struct fid_domain *domain; + struct psmx3_fid_domain *domain_priv; + struct psmx3_fid_mr *mr_priv; + int i, err; + uint64_t key; + + assert(fid->fclass == FI_CLASS_DOMAIN); + + domain = container_of(fid, struct fid_domain, fid); + domain_priv = container_of(domain, struct psmx3_fid_domain, + util_domain.domain_fid); + + assert(count); + assert(iov); + + mr_priv = (struct psmx3_fid_mr *) + calloc(1, sizeof(*mr_priv) + + sizeof(struct iovec) * count); + if (!mr_priv) + return -FI_ENOMEM; + + err = psmx3_mr_reserve_key(domain_priv, requested_key, &key, mr_priv); + if (err) { + free(mr_priv); + return err; + } + + psmx3_domain_acquire(domain_priv); + + mr_priv->mr.fid.fclass = FI_CLASS_MR; + mr_priv->mr.fid.context = context; + mr_priv->mr.fid.ops = &psmx3_fi_ops; + mr_priv->mr.mem_desc = mr_priv; + mr_priv->mr.key = key; + mr_priv->domain = domain_priv; + mr_priv->access = access; + mr_priv->flags = flags; + mr_priv->iov_count = count; + for (i=0; iiov[i] = iov[i]; + psmx3_mr_normalize_iov(mr_priv->iov, &mr_priv->iov_count); + mr_priv->offset = (domain_priv->mr_mode == FI_MR_BASIC) ? 0 : + ((uint64_t)mr_priv->iov[0].iov_base - offset); + + *mr = &mr_priv->mr; + return 0; +} + +DIRECT_FN +STATIC int psmx3_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, + uint64_t flags, struct fid_mr **mr) +{ + struct fid_domain *domain; + struct psmx3_fid_domain *domain_priv; + struct psmx3_fid_mr *mr_priv; + int i, err; + uint64_t key; + + assert(fid->fclass == FI_CLASS_DOMAIN); + + domain = container_of(fid, struct fid_domain, fid); + domain_priv = container_of(domain, struct psmx3_fid_domain, + util_domain.domain_fid); + + assert(attr); + assert(attr->iov_count); + assert(attr->mr_iov); + + mr_priv = (struct psmx3_fid_mr *) + calloc(1, sizeof(*mr_priv) + + sizeof(struct iovec) * attr->iov_count); + if (!mr_priv) + return -FI_ENOMEM; + + err = psmx3_mr_reserve_key(domain_priv, attr->requested_key, &key, mr_priv); + if (err) { + free(mr_priv); + return err; + } + + psmx3_domain_acquire(domain_priv); + + mr_priv->mr.fid.fclass = FI_CLASS_MR; + mr_priv->mr.fid.context = attr->context; + mr_priv->mr.fid.ops = &psmx3_fi_ops; + mr_priv->mr.mem_desc = mr_priv; + mr_priv->mr.key = key; + mr_priv->domain = domain_priv; + mr_priv->access = attr->access; + mr_priv->flags = flags; + mr_priv->iov_count = attr->iov_count; + for (i=0; iiov_count; i++) + mr_priv->iov[i] = attr->mr_iov[i]; + psmx3_mr_normalize_iov(mr_priv->iov, &mr_priv->iov_count); + mr_priv->offset = (domain_priv->mr_mode == FI_MR_BASIC) ? 
0 : + ((uint64_t)mr_priv->iov[0].iov_base - attr->offset); + + *mr = &mr_priv->mr; + return 0; +} + +struct fi_ops_mr psmx3_mr_ops = { + .size = sizeof(struct fi_ops_mr), + .reg = psmx3_mr_reg, + .regv = psmx3_mr_regv, + .regattr = psmx3_mr_regattr, +}; + diff --git a/prov/psm3/src/psmx3_msg.c b/prov/psm3/src/psmx3_msg.c new file mode 100644 index 00000000000..dd1933c5651 --- /dev/null +++ b/prov/psm3/src/psmx3_msg.c @@ -0,0 +1,664 @@ +/* + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psmx3.h" +#include "psmx3_trigger.h" + +ssize_t psmx3_recv_generic(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, void *context, + uint64_t flags) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + psm2_epaddr_t psm2_epaddr; + psm2_mq_req_t psm2_req; + psm2_mq_tag_t psm2_tag, psm2_tagsel; + struct fi_context *fi_context; + int recv_flag = 0; + int err; + int enable_completion; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_recv(ep, buf, len, desc, src_addr, + context, flags); + + if ((ep_priv->caps & FI_DIRECTED_RECV) && src_addr != FI_ADDR_UNSPEC) { + av = ep_priv->av; + assert(av); + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->rx, src_addr, av->type); + } else { + psm2_epaddr = 0; + } + + PSMX3_SET_TAG(psm2_tag, 0ULL, 0, PSMX3_TYPE_MSG); + PSMX3_SET_MASK(psm2_tagsel, PSMX3_MATCH_NONE, PSMX3_TYPE_MASK); + + enable_completion = !ep_priv->recv_selective_completion || + (flags & FI_COMPLETION); + if (enable_completion) { + assert(context); + fi_context = context; + if (flags & FI_MULTI_RECV) { + struct psmx3_multi_recv *req; + + req = calloc(1, sizeof(*req)); + if (!req) + return -FI_ENOMEM; + + req->src_addr = psm2_epaddr; + req->tag = psm2_tag; + req->tagsel = psm2_tagsel; + req->flag = recv_flag; + req->buf = buf; + req->len = len; + req->offset = 0; + req->min_buf_size = ep_priv->min_multi_recv; + req->context = fi_context; + PSMX3_CTXT_TYPE(fi_context) = PSMX3_MULTI_RECV_CONTEXT; + PSMX3_CTXT_USER(fi_context) = req; + if (len > PSMX3_MAX_MSG_SIZE) + len = PSMX3_MAX_MSG_SIZE; + } else { + PSMX3_CTXT_TYPE(fi_context) = PSMX3_RECV_CONTEXT; + PSMX3_CTXT_USER(fi_context) = buf; + } + PSMX3_CTXT_EP(fi_context) = ep_priv; + PSMX3_CTXT_SIZE(fi_context) = len; + } else { + PSMX3_EP_GET_OP_CONTEXT(ep_priv, fi_context); + #if !PSMX3_USE_REQ_CONTEXT + PSMX3_CTXT_TYPE(fi_context) = PSMX3_NOCOMP_RECV_CONTEXT; + PSMX3_CTXT_EP(fi_context) = ep_priv; + PSMX3_CTXT_USER(fi_context) = buf; + PSMX3_CTXT_SIZE(fi_context) = len; + #endif + } + + err = psm2_mq_irecv2(ep_priv->rx->psm2_mq, psm2_epaddr, + &psm2_tag, &psm2_tagsel, recv_flag, buf, len, + (void *)fi_context, &psm2_req); + if (OFI_UNLIKELY(err != PSM2_OK)) + return psmx3_errno(err); + + if (enable_completion) { + PSMX3_CTXT_REQ(fi_context) = psm2_req; + } else { + #if PSMX3_USE_REQ_CONTEXT + PSMX3_REQ_GET_OP_CONTEXT(psm2_req, fi_context); + PSMX3_CTXT_TYPE(fi_context) = PSMX3_NOCOMP_RECV_CONTEXT; + PSMX3_CTXT_EP(fi_context) = ep_priv; + PSMX3_CTXT_USER(fi_context) = buf; + PSMX3_CTXT_SIZE(fi_context) = len; + #endif + } + + return 0; +} + +DIRECT_FN +STATIC ssize_t psmx3_recv(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_recv_generic(ep, buf, len, desc, src_addr, context, + ep_priv->rx_flags); +} + +DIRECT_FN +STATIC ssize_t psmx3_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, + uint64_t flags) +{ + void *buf; + size_t len; + + assert(msg); + assert(!msg->iov_count || msg->msg_iov); + assert(msg->iov_count <= 1); + + if (msg->iov_count) { + buf = msg->msg_iov[0].iov_base; + len = msg->msg_iov[0].iov_len; + } else { + buf = NULL; + len = 0; + } + + return psmx3_recv_generic(ep, buf, len, + msg->desc ? 
msg->desc[0] : NULL, + msg->addr, msg->context, flags); +} + +DIRECT_FN +STATIC ssize_t psmx3_recvv(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + void *context) +{ + void *buf; + size_t len; + + assert(!count || iov); + assert(count <= 1); + + if (count) { + buf = iov[0].iov_base; + len = iov[0].iov_len; + } else { + buf = NULL; + len = 0; + } + + return psmx3_recv(ep, buf, len, desc ? desc[0] : NULL, + src_addr, context); +} + +ssize_t psmx3_send_generic(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, void *context, + uint64_t flags, uint64_t data) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + psm2_epaddr_t psm2_epaddr; + psm2_mq_req_t psm2_req; + psm2_mq_tag_t psm2_tag; + struct fi_context * fi_context; + int send_flag = 0; + int err; + int no_completion = 0; + struct psmx3_cq_event *event; + int have_data = (flags & FI_REMOTE_CQ_DATA) > 0; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_send(ep, buf, len, desc, dest_addr, + context, flags, data); + + av = ep_priv->av; + assert(av); + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); + + if (have_data) + PSMX3_SET_TAG(psm2_tag, 0, data, PSMX3_TYPE_MSG | PSMX3_IMM_BIT); + else + PSMX3_SET_TAG(psm2_tag, 0, ep_priv->sep_id, PSMX3_TYPE_MSG); + + if ((flags & PSMX3_NO_COMPLETION) || + (ep_priv->send_selective_completion && !(flags & FI_COMPLETION))) + no_completion = 1; + + if (flags & FI_INJECT) { + if (len > psmx3_env.inject_size) + return -FI_EMSGSIZE; + + err = psm2_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, + send_flag, &psm2_tag, buf, len); + + if (err != PSM2_OK) + return psmx3_errno(err); + + if (ep_priv->send_cntr) + psmx3_cntr_inc(ep_priv->send_cntr, 0); + + if (ep_priv->send_cq && !no_completion) { + event = psmx3_cq_create_event( + ep_priv->send_cq, + context, (void *)buf, flags, len, + (uint64_t) data, + 0 /* tag */, + 0 /* olen */, + 0 /* err */); + + if (event) + psmx3_cq_enqueue_event(ep_priv->send_cq, event); + else + return -FI_ENOMEM; + } + + return 0; + } + + if (no_completion) { + fi_context = &ep_priv->nocomp_send_context; + } else { + assert(context); + fi_context = context; + PSMX3_CTXT_TYPE(fi_context) = PSMX3_SEND_CONTEXT; + PSMX3_CTXT_USER(fi_context) = (void *)buf; + PSMX3_CTXT_EP(fi_context) = ep_priv; + } + + err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, + send_flag, &psm2_tag, buf, len, + (void *)fi_context, &psm2_req); + + if (err != PSM2_OK) + return psmx3_errno(err); + + if (fi_context == context) + PSMX3_CTXT_REQ(fi_context) = psm2_req; + + return 0; +} + +ssize_t psmx3_sendv_generic(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, + void *context, uint64_t flags, uint64_t data) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + psm2_epaddr_t psm2_epaddr; + psm2_mq_req_t psm2_req; + psm2_mq_tag_t psm2_tag; + uint32_t msg_flags; + struct fi_context * fi_context; + int send_flag = 0; + int err; + int no_completion = 0; + struct psmx3_cq_event *event; + size_t real_count; + size_t len, total_len; + char *p; + uint32_t *q; + int i, j=0; + struct psmx3_sendv_request *req; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_sendv(ep, iov, desc, count, + dest_addr, context, flags, + data); + + total_len = 0; + real_count = 0; + for (i=0; iiov_protocol = PSMX3_IOV_PROTO_PACK; + p = req->buf; + 
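+		/*
+		 * Two send paths for iovecs: when the total payload fits in the
+		 * request's packing buffer it is gathered into one contiguous
+		 * message (PSMX3_IOV_PROTO_PACK); otherwise a small header
+		 * carrying a sequence number and the per-iov length table is
+		 * sent first (PSMX3_IOV_PROTO_MULTI), followed by one message
+		 * per iov entry tagged PSMX3_TYPE_IOV_PAYLOAD with that sequence
+		 * number, which the receiver matches in psmx3_handle_sendv_req().
+		 */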
for (i=0; iiov_protocol = PSMX3_IOV_PROTO_MULTI; + req->iov_done = 0; + req->iov_info.seq_num = (++ep_priv->iov_seq_num) % + PSMX3_IOV_MAX_SEQ_NUM + 1; + req->iov_info.count = (uint32_t)real_count; + req->iov_info.total_len = (uint32_t)total_len; + + q = req->iov_info.len; + for (i=0; iav; + assert(av); + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); + + if (flags & FI_REMOTE_CQ_DATA) { + msg_flags |= PSMX3_IMM_BIT; + PSMX3_SET_TAG(psm2_tag, 0ULL, data, msg_flags); + } else { + PSMX3_SET_TAG(psm2_tag, 0ULL, ep_priv->sep_id, msg_flags); + } + + if ((flags & PSMX3_NO_COMPLETION) || + (ep_priv->send_selective_completion && !(flags & FI_COMPLETION))) + no_completion = 1; + + if (flags & FI_INJECT) { + if (len > psmx3_env.inject_size) { + free(req); + return -FI_EMSGSIZE; + } + + err = psm2_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, + send_flag, &psm2_tag, req->buf, len); + + free(req); + + if (err != PSM2_OK) + return psmx3_errno(err); + + if (ep_priv->send_cntr) + psmx3_cntr_inc(ep_priv->send_cntr, 0); + + if (ep_priv->send_cq && !no_completion) { + event = psmx3_cq_create_event( + ep_priv->send_cq, + context, NULL, flags, len, + (uint64_t) data, + 0 /* tag */, + 0 /* olen */, + 0 /* err */); + + if (event) + psmx3_cq_enqueue_event(ep_priv->send_cq, event); + else + return -FI_ENOMEM; + } + + return 0; + } + + req->no_completion = no_completion; + req->user_context = context; + req->comp_flag = FI_MSG; + + fi_context = &req->fi_context; + PSMX3_CTXT_TYPE(fi_context) = PSMX3_SENDV_CONTEXT; + PSMX3_CTXT_USER(fi_context) = req; + PSMX3_CTXT_EP(fi_context) = ep_priv; + + err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, + send_flag, &psm2_tag, req->buf, len, + (void *)fi_context, &psm2_req); + + if (err != PSM2_OK) { + free(req); + return psmx3_errno(err); + } + + PSMX3_CTXT_REQ(fi_context) = psm2_req; + + if (req->iov_protocol == PSMX3_IOV_PROTO_MULTI) { + fi_context = &req->fi_context_iov; + PSMX3_CTXT_TYPE(fi_context) = PSMX3_IOV_SEND_CONTEXT; + PSMX3_CTXT_USER(fi_context) = req; + PSMX3_CTXT_EP(fi_context) = ep_priv; + PSMX3_SET_TAG(psm2_tag, req->iov_info.seq_num, 0, PSMX3_TYPE_IOV_PAYLOAD); + for (i=0; itx->psm2_mq, + psm2_epaddr, send_flag, &psm2_tag, + iov[i].iov_base, iov[i].iov_len, + (void *)fi_context, &psm2_req); + if (err != PSM2_OK) + return psmx3_errno(err); + } + } + } + + return 0; +} + +int psmx3_handle_sendv_req(struct psmx3_fid_ep *ep, + PSMX3_STATUS_TYPE *status, + int multi_recv) +{ + psm2_mq_req_t psm2_req; + psm2_mq_tag_t psm2_tag, psm2_tagsel; + struct psmx3_sendv_reply *rep; + struct psmx3_multi_recv *recv_req; + struct fi_context *fi_context; + struct fi_context *recv_context; + int i, err; + uint8_t *recv_buf; + size_t recv_len, len; + + if (PSMX3_STATUS_ERROR(status) != PSM2_OK) + return psmx3_errno(PSMX3_STATUS_ERROR(status)); + + rep = malloc(sizeof(*rep)); + if (!rep) { + PSMX3_STATUS_ERROR(status) = PSM2_NO_MEMORY; + return -FI_ENOMEM; + } + + recv_context = PSMX3_STATUS_CONTEXT(status); + if (multi_recv) { + recv_req = PSMX3_CTXT_USER(recv_context); + recv_buf = recv_req->buf + recv_req->offset; + recv_len = recv_req->len - recv_req->offset; + rep->multi_recv = 1; + } else { + recv_buf = PSMX3_CTXT_USER(recv_context); + recv_len = PSMX3_CTXT_SIZE(recv_context); + rep->multi_recv = 0; + } + + /* assert(PSMX3_STATUS_RCVLEN(status) <= PSMX3_IOV_BUF_SIZE); */ + + memcpy(&rep->iov_info, recv_buf, PSMX3_STATUS_RCVLEN(status)); + + rep->user_context = PSMX3_STATUS_CONTEXT(status); + rep->tag = PSMX3_STATUS_TAG(status); + rep->buf = 
recv_buf; + rep->no_completion = 0; + rep->iov_done = 0; + rep->bytes_received = 0; + rep->msg_length = 0; + rep->error_code = PSM2_OK; + + fi_context = &rep->fi_context; + PSMX3_CTXT_TYPE(fi_context) = PSMX3_IOV_RECV_CONTEXT; + PSMX3_CTXT_USER(fi_context) = rep; + PSMX3_CTXT_EP(fi_context) = ep; + + rep->comp_flag = PSMX3_IS_MSG(PSMX3_GET_FLAGS(rep->tag)) ? FI_MSG : FI_TAGGED; + if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(rep->tag))) + rep->comp_flag |= FI_REMOTE_CQ_DATA; + + /* IOV payload uses a sequence number in place of a tag. */ + PSMX3_SET_TAG(psm2_tag, rep->iov_info.seq_num, 0, PSMX3_TYPE_IOV_PAYLOAD); + PSMX3_SET_MASK(psm2_tagsel, PSMX3_MATCH_ALL, PSMX3_TYPE_MASK); + + for (i=0; iiov_info.count; i++) { + if (recv_len) { + len = MIN(recv_len, rep->iov_info.len[i]); + err = psm2_mq_irecv2(ep->rx->psm2_mq, + PSMX3_STATUS_PEER(status), + &psm2_tag, &psm2_tagsel, + 0/*flag*/, recv_buf, len, + (void *)fi_context, &psm2_req); + if (err) { + PSMX3_STATUS_ERROR(status) = err; + return psmx3_errno(err); + } + recv_buf += len; + recv_len -= len; + } else { + /* recv buffer full, post empty recvs */ + err = psm2_mq_irecv2(ep->rx->psm2_mq, + PSMX3_STATUS_PEER(status), + &psm2_tag, &psm2_tagsel, + 0/*flag*/, NULL, 0, + (void *)fi_context, &psm2_req); + if (err) { + PSMX3_STATUS_ERROR(status) = err; + return psmx3_errno(err); + } + } + } + + if (multi_recv && recv_len < recv_req->min_buf_size) + rep->comp_flag |= FI_MULTI_RECV; + + return 0; +} + +DIRECT_FN +STATIC ssize_t psmx3_send(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_send_generic(ep, buf, len, desc, dest_addr, context, + ep_priv->tx_flags, 0); +} + +DIRECT_FN +STATIC ssize_t psmx3_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, + uint64_t flags) +{ + void *buf; + size_t len; + + assert(msg); + assert(!msg->iov_count || msg->msg_iov); + assert(msg->iov_count <= PSMX3_IOV_MAX_COUNT); + + if (msg->iov_count > 1) { + return psmx3_sendv_generic(ep, msg->msg_iov, msg->desc, + msg->iov_count, msg->addr, + msg->context, flags, + msg->data); + } else if (msg->iov_count) { + buf = msg->msg_iov[0].iov_base; + len = msg->msg_iov[0].iov_len; + } else { + buf = NULL; + len = 0; + } + + return psmx3_send_generic(ep, buf, len, + msg->desc ? msg->desc[0] : NULL, + msg->addr, msg->context, flags, + msg->data); +} + +DIRECT_FN +STATIC ssize_t psmx3_sendv(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, + void *context) +{ + void *buf; + size_t len; + + assert(!count || iov); + assert(count <= PSMX3_IOV_MAX_COUNT); + + if (count > 1) { + struct psmx3_fid_ep *ep_priv; + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_sendv_generic(ep, iov, desc, count, dest_addr, + context, ep_priv->tx_flags, 0); + } else if (count) { + buf = iov[0].iov_base; + len = iov[0].iov_len; + } else { + buf = NULL; + len = 0; + } + + return psmx3_send(ep, buf, len, desc ? 
desc[0] : NULL, + dest_addr, context); +} + +DIRECT_FN +STATIC ssize_t psmx3_inject(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t dest_addr) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_send_generic(ep, buf, len, NULL, dest_addr, NULL, + ep_priv->tx_flags | FI_INJECT | PSMX3_NO_COMPLETION, + 0); +} + +DIRECT_FN +STATIC ssize_t psmx3_senddata(struct fid_ep *ep, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_send_generic(ep, buf, len, desc, dest_addr, context, + ep_priv->tx_flags | FI_REMOTE_CQ_DATA, data); +} + +DIRECT_FN +STATIC ssize_t psmx3_injectdata(struct fid_ep *ep, const void *buf, size_t len, + uint64_t data, fi_addr_t dest_addr) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_send_generic(ep, buf, len, NULL, dest_addr, NULL, + ep_priv->tx_flags | FI_INJECT | PSMX3_NO_COMPLETION | + FI_REMOTE_CQ_DATA, + data); +} + +struct fi_ops_msg psmx3_msg_ops = { + .size = sizeof(struct fi_ops_msg), + .recv = psmx3_recv, + .recvv = psmx3_recvv, + .recvmsg = psmx3_recvmsg, + .send = psmx3_send, + .sendv = psmx3_sendv, + .sendmsg = psmx3_sendmsg, + .inject = psmx3_inject, + .senddata = psmx3_senddata, + .injectdata = psmx3_injectdata, +}; + diff --git a/prov/psm3/src/psmx3_rma.c b/prov/psm3/src/psmx3_rma.c new file mode 100644 index 00000000000..d7f2c5d273b --- /dev/null +++ b/prov/psm3/src/psmx3_rma.c @@ -0,0 +1,1454 @@ +/* + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psmx3.h" +#include "psmx3_trigger.h" + +static inline void psmx3_iov_copy(struct iovec *iov, size_t count, + size_t offset, const void *src, + size_t len) +{ + int i; + size_t copy_len; + + for (i=0; i= iov[i].iov_len) { + offset -= iov[i].iov_len; + continue; + } + + copy_len = iov[i].iov_len - offset; + if (copy_len > len) + copy_len = len; + + memcpy((uint8_t *)iov[i].iov_base + offset, src, copy_len); + + src = (const uint8_t *)src + copy_len; + len -= copy_len; + + if (offset) + offset = 0; + } +} + +/* RMA protocol: + * + * Write REQ: + * args[0].u32w0 cmd, flag + * args[0].u32w1 len + * args[1].u64 req + * args[2].u64 addr + * args[3].u64 key + * args[4].u64 data (optional) + * + * Write REP: + * args[0].u32w0 cmd, flag + * args[0].u32w1 error + * args[1].u64 req + * + * Read REQ: + * args[0].u32w0 cmd, flag + * args[0].u32w1 len + * args[1].u64 req + * args[2].u64 addr + * args[3].u64 key + * args[4].u64 offset / unused for long protocol + * + * Read REP: + * args[0].u32w0 cmd, flag + * args[0].u32w1 error + * args[1].u64 req + * args[2].u64 offset + */ + +int psmx3_am_rma_handler(psm2_am_token_t token, psm2_amarg_t *args, + int nargs, void *src, uint32_t len, + void *hctx) +{ + psm2_amarg_t rep_args[8]; + uint8_t *rma_addr; + ssize_t rma_len; + uint64_t key; + int err = 0; + int op_error = 0; + int cmd, eom, has_data; + struct psmx3_am_request *req; + struct psmx3_cq_event *event; + uint64_t offset; + struct psmx3_fid_mr *mr; + psm2_epaddr_t epaddr; + struct psmx3_trx_ctxt *rx; + + psm2_mq_req_t psm2_req; + psm2_mq_tag_t psm2_tag, psm2_tagsel; + + psm2_am_get_source(token, &epaddr); + cmd = PSMX3_AM_GET_OP(args[0].u32w0); + eom = args[0].u32w0 & PSMX3_AM_EOM; + has_data = args[0].u32w0 & PSMX3_AM_DATA; + + switch (cmd) { + case PSMX3_AM_REQ_WRITE: + rx = (struct psmx3_trx_ctxt *)hctx; + rma_len = args[0].u32w1; + rma_addr = (uint8_t *)(uintptr_t)args[2].u64; + key = args[3].u64; + mr = psmx3_mr_get(rx->domain, key); + op_error = mr ? + psmx3_mr_validate(mr, (uint64_t)rma_addr, len, FI_REMOTE_WRITE) : + -FI_EINVAL; + if (!op_error) { + rma_addr += mr->offset; + memcpy(rma_addr, src, len); + if (eom) { + if (rx->ep->recv_cq && has_data) { + /* TODO: report the addr/len of the whole write */ + event = psmx3_cq_create_event( + rx->ep->recv_cq, + 0, /* context */ + rma_addr, + FI_REMOTE_WRITE | FI_RMA | FI_REMOTE_CQ_DATA, + rma_len, + args[4].u64, + 0, /* tag */ + 0, /* olen */ + 0); + + if (event) + psmx3_cq_enqueue_event(rx->ep->recv_cq, event); + else + err = -FI_ENOMEM; + } + + if (rx->ep->caps & FI_RMA_EVENT) { + if (rx->ep->remote_write_cntr) + psmx3_cntr_inc(rx->ep->remote_write_cntr, 0); + + if (mr->cntr && mr->cntr != rx->ep->remote_write_cntr) + psmx3_cntr_inc(mr->cntr, 0); + } + } + } + if (eom || op_error) { + rep_args[0].u32w0 = PSMX3_AM_REP_WRITE | eom; + rep_args[0].u32w1 = op_error; + rep_args[1].u64 = args[1].u64; + err = psm2_am_reply_short(token, PSMX3_AM_RMA_HANDLER, + rep_args, 2, NULL, 0, 0, + NULL, NULL ); + } + break; + + case PSMX3_AM_REQ_WRITE_LONG: + rx = (struct psmx3_trx_ctxt *)hctx; + rma_len = args[0].u32w1; + rma_addr = (uint8_t *)(uintptr_t)args[2].u64; + key = args[3].u64; + mr = psmx3_mr_get(rx->domain, key); + op_error = mr ? 
+ psmx3_mr_validate(mr, (uint64_t)rma_addr, rma_len, FI_REMOTE_WRITE) : + -FI_EINVAL; + if (op_error) { + rep_args[0].u32w0 = PSMX3_AM_REP_WRITE | eom; + rep_args[0].u32w1 = op_error; + rep_args[1].u64 = args[1].u64; + err = psm2_am_reply_short(token, PSMX3_AM_RMA_HANDLER, + rep_args, 2, NULL, 0, 0, + NULL, NULL ); + break; + } + + rma_addr += mr->offset; + + req = psmx3_am_request_alloc(rx); + if (!req) { + err = -FI_ENOMEM; + } else { + req->ep = rx->ep; + req->op = args[0].u32w0; + req->write.addr = (uint64_t)rma_addr; + req->write.len = rma_len; + req->write.key = key; + req->write.context = (void *)args[1].u64; + req->write.peer_addr = (void *)epaddr; + req->write.data = has_data ? args[4].u64 : 0; + req->cq_flags = FI_REMOTE_WRITE | FI_RMA | + (has_data ? FI_REMOTE_CQ_DATA : 0), + PSMX3_CTXT_TYPE(&req->fi_context) = PSMX3_REMOTE_WRITE_CONTEXT; + PSMX3_CTXT_USER(&req->fi_context) = mr; + PSMX3_SET_TAG(psm2_tag, (uint64_t)req->write.context, 0, + PSMX3_RMA_TYPE_WRITE); + PSMX3_SET_MASK(psm2_tagsel, PSMX3_MATCH_ALL, PSMX3_RMA_TYPE_MASK); + op_error = psm2_mq_fp_msg(rx->psm2_ep, rx->psm2_mq, + (psm2_epaddr_t)epaddr, + &psm2_tag, &psm2_tagsel, 0, + (void *)rma_addr, rma_len, + (void *)&req->fi_context, PSM2_MQ_IRECV_FP, &psm2_req); + if (op_error) { + rep_args[0].u32w0 = PSMX3_AM_REP_WRITE | eom; + rep_args[0].u32w1 = op_error; + rep_args[1].u64 = args[1].u64; + err = psm2_am_reply_short(token, PSMX3_AM_RMA_HANDLER, + rep_args, 2, NULL, 0, 0, + NULL, NULL ); + psmx3_am_request_free(rx, req); + break; + } + } + break; + + case PSMX3_AM_REQ_READ: + rx = (struct psmx3_trx_ctxt *)hctx; + rma_len = args[0].u32w1; + rma_addr = (uint8_t *)(uintptr_t)args[2].u64; + key = args[3].u64; + offset = args[4].u64; + mr = psmx3_mr_get(rx->domain, key); + op_error = mr ? + psmx3_mr_validate(mr, (uint64_t)rma_addr, rma_len, FI_REMOTE_READ) : + -FI_EINVAL; + if (!op_error) { + rma_addr += mr->offset; + } else { + rma_addr = NULL; + rma_len = 0; + } + + rep_args[0].u32w0 = PSMX3_AM_REP_READ | eom; + rep_args[0].u32w1 = op_error; + rep_args[1].u64 = args[1].u64; + rep_args[2].u64 = offset; + err = psm2_am_reply_short(token, PSMX3_AM_RMA_HANDLER, + rep_args, 3, rma_addr, rma_len, 0, + NULL, NULL ); + + if (eom && !op_error) { + if (rx->ep->caps & FI_RMA_EVENT) { + if (rx->ep->remote_read_cntr) + psmx3_cntr_inc(rx->ep->remote_read_cntr, 0); + } + } + break; + + case PSMX3_AM_REQ_READ_LONG: + rx = (struct psmx3_trx_ctxt *)hctx; + rma_len = args[0].u32w1; + rma_addr = (uint8_t *)(uintptr_t)args[2].u64; + key = args[3].u64; + mr = psmx3_mr_get(rx->domain, key); + op_error = mr ? 
+ psmx3_mr_validate(mr, (uint64_t)rma_addr, rma_len, FI_REMOTE_READ) : + -FI_EINVAL; + if (op_error) { + rep_args[0].u32w0 = PSMX3_AM_REP_READ | eom; + rep_args[0].u32w1 = op_error; + rep_args[1].u64 = args[1].u64; + rep_args[2].u64 = 0; + err = psm2_am_reply_short(token, PSMX3_AM_RMA_HANDLER, + rep_args, 3, NULL, 0, 0, + NULL, NULL ); + break; + } + + rma_addr += mr->offset; + + req = psmx3_am_request_alloc(rx); + if (!req) { + err = -FI_ENOMEM; + } else { + req->ep = rx->ep; + req->op = args[0].u32w0; + req->read.addr = (uint64_t)rma_addr; + req->read.len = rma_len; + req->read.key = key; + req->read.context = (void *)args[1].u64; + req->read.peer_addr = (void *)epaddr; + PSMX3_CTXT_TYPE(&req->fi_context) = PSMX3_REMOTE_READ_CONTEXT; + PSMX3_CTXT_USER(&req->fi_context) = mr; + PSMX3_SET_TAG(psm2_tag, (uint64_t)req->read.context, 0, + PSMX3_RMA_TYPE_READ); + op_error = psm2_mq_fp_msg(rx->psm2_ep, rx->psm2_mq, + (psm2_epaddr_t)req->read.peer_addr, + &psm2_tag, 0, 0, + (void *)req->read.addr, req->read.len, + (void *)&req->fi_context, PSM2_MQ_ISEND_FP, &psm2_req); + if (op_error) { + rep_args[0].u32w0 = PSMX3_AM_REP_READ | eom; + rep_args[0].u32w1 = op_error; + rep_args[1].u64 = args[1].u64; + rep_args[2].u64 = 0; + err = psm2_am_reply_short(token, PSMX3_AM_RMA_HANDLER, + rep_args, 3, NULL, 0, 0, + NULL, NULL ); + psmx3_am_request_free(rx, req); + break; + } + } + break; + + case PSMX3_AM_REP_WRITE: + req = (struct psmx3_am_request *)(uintptr_t)args[1].u64; + assert(req->op == PSMX3_AM_REQ_WRITE); + op_error = (int)args[0].u32w1; + if (!req->error) + req->error = op_error; + if (eom) { + if (req->ep->send_cq && (!req->no_event || req->error)) { + event = psmx3_cq_create_event( + req->ep->send_cq, + req->write.context, + req->write.buf, + req->cq_flags, + req->write.len, + 0, /* data */ + 0, /* tag */ + 0, /* olen */ + req->error); + if (event) + psmx3_cq_enqueue_event(req->ep->send_cq, event); + else + err = -FI_ENOMEM; + } + + if (req->ep->write_cntr) + psmx3_cntr_inc(req->ep->write_cntr, req->error); + + free(req->tmpbuf); + psmx3_am_request_free(req->ep->tx, req); + } + break; + + case PSMX3_AM_REP_READ: + req = (struct psmx3_am_request *)(uintptr_t)args[1].u64; + assert(req->op == PSMX3_AM_REQ_READ || req->op == PSMX3_AM_REQ_READV); + op_error = (int)args[0].u32w1; + offset = args[2].u64; + if (!req->error) + req->error = op_error; + if (!op_error) { + if (req->op == PSMX3_AM_REQ_READ) + memcpy(req->read.buf + offset, src, len); + else + psmx3_iov_copy(req->iov, req->read.iov_count, offset, src, len); + + req->read.len_read += len; + } + if (eom || req->read.len == req->read.len_read) { + if (!eom) + FI_INFO(&psmx3_prov, FI_LOG_EP_DATA, + "readv: short protocol finishes after long protocol.\n"); + if (req->ep->send_cq && (!req->no_event || req->error)) { + event = psmx3_cq_create_event( + req->ep->send_cq, + req->read.context, + req->read.buf, + req->cq_flags, + req->read.len_read, + 0, /* data */ + 0, /* tag */ + req->read.len - req->read.len_read, + req->error); + if (event) + psmx3_cq_enqueue_event(req->ep->send_cq, event); + else + err = -FI_ENOMEM; + } + + if (req->ep->read_cntr) + psmx3_cntr_inc(req->ep->read_cntr, req->error); + + free(req->tmpbuf); + psmx3_am_request_free(req->ep->tx, req); + } + break; + + default: + err = -FI_EINVAL; + } + return err; +} + +static ssize_t psmx3_rma_self(int am_cmd, + struct psmx3_fid_ep *ep, + void *buf, size_t len, void *desc, + uint64_t addr, uint64_t key, + void *context, uint64_t flags, uint64_t data) +{ + struct psmx3_fid_mr *mr; + struct 
psmx3_cq_event *event; + struct psmx3_fid_cntr *cntr = NULL; + struct psmx3_fid_cntr *mr_cntr = NULL; + struct psmx3_fid_cq *cq = NULL; + int no_event; + int err = 0; + int op_error = 0; + int access; + uint8_t *dst, *src; + uint64_t cq_flags; + struct iovec *iov = buf; + size_t iov_count = len; + int i; + + switch (am_cmd) { + case PSMX3_AM_REQ_WRITE: + access = FI_REMOTE_WRITE; + cq_flags = FI_WRITE | FI_RMA; + break; + case PSMX3_AM_REQ_WRITEV: + access = FI_REMOTE_WRITE; + cq_flags = FI_WRITE | FI_RMA; + len = 0; + for (i=0; idomain, key); + op_error = mr ? psmx3_mr_validate(mr, addr, len, access) : -FI_EINVAL; + + if (!op_error) { + addr += mr->offset; + switch (am_cmd) { + case PSMX3_AM_REQ_WRITE: + cntr = ep->remote_write_cntr; + if (flags & FI_REMOTE_CQ_DATA) + cq = ep->recv_cq; + if (mr->cntr != cntr) + mr_cntr = mr->cntr; + memcpy((void *)addr, buf, len); + break; + + case PSMX3_AM_REQ_WRITEV: + cntr = ep->remote_write_cntr; + if (flags & FI_REMOTE_CQ_DATA) + cq = ep->recv_cq; + if (mr->cntr != cntr) + mr_cntr = mr->cntr; + dst = (void *)addr; + for (i=0; iremote_read_cntr; + memcpy(buf, (void *)addr, len); + break; + + case PSMX3_AM_REQ_READV: + cntr = ep->remote_read_cntr; + src = (void *)addr; + for (i=0; icaps & FI_RMA_EVENT) { + if (cntr) + psmx3_cntr_inc(cntr, 0); + + if (mr_cntr) + psmx3_cntr_inc(mr_cntr, 0); + } + } + + no_event = (flags & PSMX3_NO_COMPLETION) || + (ep->send_selective_completion && !(flags & FI_COMPLETION)); + + if (ep->send_cq && (!no_event || op_error)) { + event = psmx3_cq_create_event( + ep->send_cq, + context, + (void *)buf, + cq_flags, + len, + 0, /* data */ + 0, /* tag */ + 0, /* olen */ + op_error); + if (event) + psmx3_cq_enqueue_event(ep->send_cq, event); + else + err = -FI_ENOMEM; + } + + switch (am_cmd) { + case PSMX3_AM_REQ_WRITE: + case PSMX3_AM_REQ_WRITEV: + if (ep->write_cntr) + psmx3_cntr_inc(ep->write_cntr, op_error); + break; + + case PSMX3_AM_REQ_READ: + case PSMX3_AM_REQ_READV: + if (ep->read_cntr) + psmx3_cntr_inc(ep->read_cntr, op_error); + break; + } + + return err; +} + +void psmx3_am_ack_rma(struct psmx3_am_request *req) +{ + psm2_amarg_t args[8]; + int err; + + if ((req->op & PSMX3_AM_OP_MASK) != PSMX3_AM_REQ_WRITE_LONG) + return; + + args[0].u32w0 = PSMX3_AM_REP_WRITE | PSMX3_AM_EOM; + args[0].u32w1 = req->error; + args[1].u64 = (uint64_t)(uintptr_t)req->write.context; + + err = psm2_am_request_short(req->write.peer_addr, + PSMX3_AM_RMA_HANDLER, args, 2, NULL, 0, + PSM2_AM_FLAG_NOREPLY, NULL, NULL); + if (err) + FI_INFO(&psmx3_prov, FI_LOG_EP_DATA, + "failed to send am_ack: err %d.\n", err); +} + +ssize_t psmx3_read_generic(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, + uint64_t addr, uint64_t key, void *context, + uint64_t flags) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + struct psmx3_am_request *req; + psm2_amarg_t args[8]; + int chunk_size; + size_t offset = 0; + psm2_epaddr_t psm2_epaddr; + psm2_epid_t psm2_epid; + psm2_mq_req_t psm2_req; + psm2_mq_tag_t psm2_tag, psm2_tagsel; + size_t req_refcnt = 0; + int err; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_read(ep, buf, len, desc, src_addr, + addr, key, context, flags); + + assert(buf); + + av = ep_priv->av; + assert(av); + + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, src_addr, av->type); + psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid); + + if (psm2_epid == ep_priv->tx->psm2_epid) + return psmx3_rma_self(PSMX3_AM_REQ_READ, ep_priv, + buf, len, 
desc, addr, key, + context, flags, 0); + + req = psmx3_am_request_alloc(ep_priv->tx); + if (!req) + return -FI_ENOMEM; + + req->op = PSMX3_AM_REQ_READ; + req->read.buf = buf; + req->read.len = len; + req->read.addr = addr; /* needed? */ + req->read.key = key; /* needed? */ + req->read.context = context; + req->ep = ep_priv; + req->cq_flags = FI_READ | FI_RMA; + PSMX3_CTXT_TYPE(&req->fi_context) = PSMX3_READ_CONTEXT; + PSMX3_CTXT_USER(&req->fi_context) = context; + PSMX3_CTXT_EP(&req->fi_context) = ep_priv; + + if (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)) { + PSMX3_CTXT_TYPE(&req->fi_context) = PSMX3_NOCOMP_READ_CONTEXT; + req->no_event = 1; + } + + chunk_size = ep_priv->tx->psm2_am_param.max_reply_short; + + args[0].u32w0 = 0; + + if (psmx3_env.tagged_rma && len > chunk_size) { + PSMX3_SET_TAG(psm2_tag, (uint64_t)req, 0, PSMX3_RMA_TYPE_READ); + PSMX3_SET_MASK(psm2_tagsel, PSMX3_MATCH_ALL, PSMX3_RMA_TYPE_MASK); + err = psm2_mq_irecv2(ep_priv->tx->psm2_mq, psm2_epaddr, + &psm2_tag, &psm2_tagsel, 0, buf, len, + (void *)&req->fi_context, &psm2_req); + if (err) { + psmx3_am_request_free(ep_priv->tx, req); + return psmx3_errno(err); + } + + PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_READ_LONG); + args[0].u32w1 = len; + args[1].u64 = (uint64_t)req; + args[2].u64 = addr; + args[3].u64 = key; + err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, + args, 4, NULL, 0, 0, NULL, NULL); + if (err) { + /* req in use, don't free */ + return psmx3_errno(err); + } + psmx3_am_poll(ep_priv->tx); + return 0; + } + + PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_READ); + args[1].u64 = (uint64_t)(uintptr_t)req; + args[3].u64 = key; + while (len > chunk_size) { + args[0].u32w1 = chunk_size; + args[2].u64 = addr; + args[4].u64 = offset; + err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, + args, 5, NULL, 0, 0, NULL, NULL); + if (err) { + if (!req_refcnt) + psmx3_am_request_free(ep_priv->tx, req); + return psmx3_errno(err); + } + psmx3_am_poll(ep_priv->tx); + addr += chunk_size; + len -= chunk_size; + offset += chunk_size; + req_refcnt++; + } + + PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM); + args[0].u32w1 = len; + args[2].u64 = addr; + args[4].u64 = offset; + err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, + args, 5, NULL, 0, 0, NULL, NULL); + if (err) { + if (!req_refcnt) + psmx3_am_request_free(ep_priv->tx, req); + return psmx3_errno(err); + } + psmx3_am_poll(ep_priv->tx); + return 0; +} + +ssize_t psmx3_readv_generic(struct fid_ep *ep, const struct iovec *iov, + void *desc, size_t count, fi_addr_t src_addr, + uint64_t addr, uint64_t key, void *context, + uint64_t flags) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + struct psmx3_am_request *req; + psm2_amarg_t args[8]; + int chunk_size; + size_t offset = 0; + psm2_epaddr_t psm2_epaddr; + psm2_epid_t psm2_epid; + psm2_mq_req_t psm2_req; + psm2_mq_tag_t psm2_tag, psm2_tagsel; + size_t total_len, long_len = 0, short_len; + void *long_buf = NULL; + int i; + size_t req_refcnt = 0; + int err; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_readv(ep, iov, desc, count, src_addr, + addr, key, context, flags); + + av = ep_priv->av; + assert(av); + + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, src_addr, av->type); + psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid); + + if (psm2_epid == ep_priv->tx->psm2_epid) + return psmx3_rma_self(PSMX3_AM_REQ_READV, ep_priv, + (void *)iov, count, desc, addr, + key, context, flags, 0); 
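/*
 * Note on the readv path that follows: total_len is the sum of the iov
 * entries.  When tagged RMA is enabled and the last non-empty iov segment
 * exceeds chunk_size (psm2_am_param.max_reply_short), that segment alone is
 * fetched with the long, tagged protocol; everything before it is pulled in
 * with short AM requests of at most chunk_size bytes each, and PSMX3_AM_EOM
 * is set on the final short chunk only when no long segment follows.
 *
 * Illustration only, using a hypothetical chunk_size of 8 KB and
 * iov = {4 KB, 4 KB, 64 KB}: total_len is 72 KB, long_len is 64 KB (the
 * last segment), and short_len is 8 KB, sent as a single short AM request
 * without PSMX3_AM_EOM.  The 64 KB segment is then received directly into
 * iov[2] via psm2_mq_irecv2() after a PSMX3_AM_REQ_READ_LONG request
 * targeting addr + short_len.
 */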
+ + total_len = 0; + for (i=0; itx); + if (!req) + return -FI_ENOMEM; + + req->tmpbuf = malloc(count * sizeof(struct iovec)); + if (!req->tmpbuf) { + psmx3_am_request_free(ep_priv->tx, req); + return -FI_ENOMEM; + } + + req->iov = req->tmpbuf; + memcpy(req->iov, iov, count * sizeof(struct iovec)); + + req->op = PSMX3_AM_REQ_READV; + req->read.iov_count = count; + req->read.len = total_len; + req->read.addr = addr; /* needed? */ + req->read.key = key; /* needed? */ + req->read.context = context; + req->ep = ep_priv; + req->cq_flags = FI_READ | FI_RMA; + PSMX3_CTXT_TYPE(&req->fi_context) = PSMX3_READ_CONTEXT; + PSMX3_CTXT_USER(&req->fi_context) = context; + PSMX3_CTXT_EP(&req->fi_context) = ep_priv; + + if (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)) { + PSMX3_CTXT_TYPE(&req->fi_context) = PSMX3_NOCOMP_READ_CONTEXT; + req->no_event = 1; + } + + chunk_size = ep_priv->tx->psm2_am_param.max_reply_short; + + if (psmx3_env.tagged_rma) { + for (i=count-1; i>=0; i--) { + if (iov[i].iov_len > chunk_size) { + long_buf = iov[i].iov_base; + long_len = iov[i].iov_len; + break; + } else if (iov[i].iov_len) { + break; + } + } + } + + short_len = total_len - long_len; + + /* Use short protocol for all but the last segment (long_len) */ + args[0].u32w0 = 0; + PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_READ); + args[1].u64 = (uint64_t)(uintptr_t)req; + args[3].u64 = key; + while (short_len > chunk_size) { + args[0].u32w1 = chunk_size; + args[2].u64 = addr; + args[4].u64 = offset; + err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, + args, 5, NULL, 0, 0, NULL, NULL); + if (err) { + if (!req_refcnt) { + free(req->tmpbuf); + psmx3_am_request_free(ep_priv->tx, req); + } + return psmx3_errno(err); + } + psmx3_am_poll(ep_priv->tx); + addr += chunk_size; + short_len -= chunk_size; + offset += chunk_size; + req_refcnt++; + } + + if (short_len) { + if (!long_len) + PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM); + args[0].u32w1 = short_len; + args[2].u64 = addr; + args[4].u64 = offset; + err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, + args, 5, NULL, 0, 0, NULL, NULL); + if (err) { + if (!req_refcnt) { + free(req->tmpbuf); + psmx3_am_request_free(ep_priv->tx, req); + } + return psmx3_errno(err); + } + psmx3_am_poll(ep_priv->tx); + req_refcnt++; + } + + /* Use the long protocol for the last segment */ + if (long_len) { + PSMX3_SET_TAG(psm2_tag, (uint64_t)req, 0, PSMX3_RMA_TYPE_READ); + PSMX3_SET_MASK(psm2_tagsel, PSMX3_MATCH_ALL, PSMX3_RMA_TYPE_MASK); + err = psm2_mq_irecv2(ep_priv->tx->psm2_mq, psm2_epaddr, + &psm2_tag, &psm2_tagsel, 0, + long_buf, long_len, + (void *)&req->fi_context, &psm2_req); + if (err) { + if (!req_refcnt) { + free(req->tmpbuf); + psmx3_am_request_free(ep_priv->tx, req); + } + return psmx3_errno(err); + } + + PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_READ_LONG); + args[0].u32w1 = long_len; + args[1].u64 = (uint64_t)req; + args[2].u64 = addr + short_len; + args[3].u64 = key; + err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, + args, 4, NULL, 0, 0, NULL, NULL); + if (err) { + /* req in use, don't free */ + return psmx3_errno(err); + } + psmx3_am_poll(ep_priv->tx); + } + + return 0; +} + +DIRECT_FN +STATIC ssize_t psmx3_read(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_read_generic(ep, buf, len, desc, src_addr, addr, + key, context, 
ep_priv->tx_flags); +} + +DIRECT_FN +STATIC ssize_t psmx3_readmsg(struct fid_ep *ep, + const struct fi_msg_rma *msg, + uint64_t flags) +{ + assert(msg); + assert(msg->iov_count); + assert(msg->msg_iov); + assert(msg->rma_iov); + assert(msg->rma_iov_count == 1); + + if (msg->iov_count > 1) + return psmx3_readv_generic(ep, msg->msg_iov, + msg->desc ? msg->desc[0] : NULL, + msg->iov_count, msg->addr, + msg->rma_iov[0].addr, + msg->rma_iov[0].key, + msg->context, flags); + + return psmx3_read_generic(ep, msg->msg_iov[0].iov_base, + msg->msg_iov[0].iov_len, + msg->desc ? msg->desc[0] : NULL, + msg->addr, msg->rma_iov[0].addr, + msg->rma_iov[0].key, msg->context, + flags); +} + +DIRECT_FN +STATIC ssize_t psmx3_readv(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + assert(iov); + assert(count); + + if (count > 1) + return psmx3_readv_generic(ep, iov, desc ? desc[0] : NULL, + count, src_addr, addr, key, + context, ep_priv->tx_flags); + + return psmx3_read(ep, iov->iov_base, iov->iov_len, + desc ? desc[0] : NULL, src_addr, addr, key, context); +} + +ssize_t psmx3_write_generic(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context, + uint64_t flags, uint64_t data) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + struct psmx3_am_request *req; + psm2_amarg_t args[8]; + int nargs; + int am_flags = PSM2_AM_FLAG_ASYNC; + int chunk_size; + psm2_epaddr_t psm2_epaddr; + psm2_epid_t psm2_epid; + psm2_mq_req_t psm2_req; + psm2_mq_tag_t psm2_tag; + void *psm2_context; + int no_event; + size_t req_refcnt = 0; + int err; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_write(ep, buf, len, desc, dest_addr, + addr, key, context, flags, + data); + + assert(buf); + + av = ep_priv->av; + assert(av); + + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); + psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid); + + if (psm2_epid == ep_priv->tx->psm2_epid) + return psmx3_rma_self(PSMX3_AM_REQ_WRITE, ep_priv, + (void *)buf, len, desc, addr, + key, context, flags, data); + + no_event = (flags & PSMX3_NO_COMPLETION) || + (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)); + + req = psmx3_am_request_alloc(ep_priv->tx); + if (!req) + return -FI_ENOMEM; + + if (flags & FI_INJECT) { + if (len > psmx3_env.inject_size) { + psmx3_am_request_free(ep_priv->tx, req); + return -FI_EMSGSIZE; + } + + req->tmpbuf = malloc(len); + if (!req->tmpbuf) { + psmx3_am_request_free(ep_priv->tx, req); + return -FI_ENOMEM; + } + + memcpy(req->tmpbuf, (void *)buf, len); + buf = req->tmpbuf; + } else { + PSMX3_CTXT_TYPE(&req->fi_context) = no_event ? + PSMX3_NOCOMP_WRITE_CONTEXT : + PSMX3_WRITE_CONTEXT; + } + + req->no_event = no_event; + req->op = PSMX3_AM_REQ_WRITE; + req->write.buf = (void *)buf; + req->write.len = len; + req->write.addr = addr; /* needed? */ + req->write.key = key; /* needed? 
*/ + req->write.context = context; + req->ep = ep_priv; + req->cq_flags = FI_WRITE | FI_RMA; + PSMX3_CTXT_USER(&req->fi_context) = context; + PSMX3_CTXT_EP(&req->fi_context) = ep_priv; + + chunk_size = ep_priv->tx->psm2_am_param.max_request_short; + + args[0].u32w0 = 0; + + if (psmx3_env.tagged_rma && len > chunk_size) { + PSMX3_SET_TAG(psm2_tag, (uint64_t)req, 0, PSMX3_RMA_TYPE_WRITE); + PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_WRITE_LONG); + args[0].u32w1 = len; + args[1].u64 = (uint64_t)req; + args[2].u64 = addr; + args[3].u64 = key; + nargs = 4; + if (flags & FI_REMOTE_CQ_DATA) { + PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA); + args[4].u64 = data; + nargs++; + } + + if (flags & FI_DELIVERY_COMPLETE) { + args[0].u32w0 |= PSMX3_AM_FORCE_ACK; + psm2_context = NULL; + } else { + psm2_context = (void *)&req->fi_context; + } + + err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, + args, nargs, NULL, 0, am_flags, + NULL, NULL); + if (err) { + free(req->tmpbuf); + psmx3_am_request_free(ep_priv->tx, req); + return psmx3_errno(err); + } + psmx3_am_poll(ep_priv->tx); + + err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, 0, + &psm2_tag, buf, len, psm2_context, &psm2_req); + if (err) { + /* req in use, don't free */ + return psmx3_errno(err); + } + return 0; + } + + PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_WRITE); + nargs = 4; + while (len > chunk_size) { + args[0].u32w1 = chunk_size; + args[1].u64 = (uint64_t)(uintptr_t)req; + args[2].u64 = addr; + args[3].u64 = key; + err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, + args, nargs, (void *)buf, + chunk_size, am_flags, NULL, NULL); + if (err) { + if (!req_refcnt) { + free(req->tmpbuf); + psmx3_am_request_free(ep_priv->tx, req); + } + return psmx3_errno(err); + } + psmx3_am_poll(ep_priv->tx); + buf = (const uint8_t *)buf + chunk_size; + addr += chunk_size; + len -= chunk_size; + req_refcnt++; + } + + args[0].u32w1 = len; + args[1].u64 = (uint64_t)(uintptr_t)req; + args[2].u64 = addr; + args[3].u64 = key; + if (flags & FI_REMOTE_CQ_DATA) { + PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA | PSMX3_AM_EOM); + args[4].u64 = data; + nargs++; + } else { + PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM); + } + err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, + args, nargs, (void *)buf, len, am_flags, + NULL, NULL); + if (err) { + if (!req_refcnt) { + free(req->tmpbuf); + psmx3_am_request_free(ep_priv->tx, req); + } + return psmx3_errno(err); + } + psmx3_am_poll(ep_priv->tx); + return 0; +} + +ssize_t psmx3_writev_generic(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context, + uint64_t flags, uint64_t data) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + struct psmx3_am_request *req; + psm2_amarg_t args[8]; + int nargs; + int am_flags = PSM2_AM_FLAG_ASYNC; + int chunk_size; + psm2_epaddr_t psm2_epaddr; + psm2_epid_t psm2_epid; + psm2_mq_req_t psm2_req; + psm2_mq_tag_t psm2_tag; + void *psm2_context; + int no_event; + size_t total_len, len, len_sent; + uint8_t *buf, *p; + int i; + size_t req_refcnt = 0; + int err; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_writev(ep, iov, desc, count, + dest_addr, addr, key, + context, flags, data); + + av = ep_priv->av; + assert(av); + + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); + psm2_epaddr_to_epid(psm2_epaddr, &psm2_epid); + + if (psm2_epid == 
ep_priv->tx->psm2_epid) + return psmx3_rma_self(PSMX3_AM_REQ_WRITEV, ep_priv, + (void *)iov, count, desc, addr, + key, context, flags, data); + + no_event = (flags & PSMX3_NO_COMPLETION) || + (ep_priv->send_selective_completion && !(flags & FI_COMPLETION)); + + total_len = 0; + for (i=0; itx->psm2_am_param.max_request_short; + + req = psmx3_am_request_alloc(ep_priv->tx); + if (!req) + return -FI_ENOMEM; + + /* Case 1: fit into a AM message, then pack and send */ + if (total_len <= chunk_size) { + req->tmpbuf = malloc(total_len); + if (!req->tmpbuf) { + psmx3_am_request_free(ep_priv->tx, req); + return -FI_ENOMEM; + } + + p = req->tmpbuf; + for (i=0; itmpbuf; + len = total_len; + + req->no_event = no_event; + req->op = PSMX3_AM_REQ_WRITE; + req->write.buf = (void *)buf; + req->write.len = len; + req->write.addr = addr; /* needed? */ + req->write.key = key; /* needed? */ + req->write.context = context; + req->ep = ep_priv; + req->cq_flags = FI_WRITE | FI_RMA; + PSMX3_CTXT_USER(&req->fi_context) = context; + PSMX3_CTXT_EP(&req->fi_context) = ep_priv; + + args[0].u32w0 = 0; + PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_WRITE); + args[0].u32w1 = len; + args[1].u64 = (uint64_t)(uintptr_t)req; + args[2].u64 = addr; + args[3].u64 = key; + nargs = 4; + if (flags & FI_REMOTE_CQ_DATA) { + PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA | PSMX3_AM_EOM); + args[4].u64 = data; + nargs++; + } else { + PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM); + } + err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, + args, nargs, (void *)buf, len, + am_flags, NULL, NULL); + if (err) { + free(req->tmpbuf); + psmx3_am_request_free(ep_priv->tx, req); + return psmx3_errno(err); + } + psmx3_am_poll(ep_priv->tx); + return 0; + } + + if (flags & FI_INJECT) { + psmx3_am_request_free(ep_priv->tx, req); + return -FI_EMSGSIZE; + } + + PSMX3_CTXT_TYPE(&req->fi_context) = no_event ? + PSMX3_NOCOMP_WRITE_CONTEXT : + PSMX3_WRITE_CONTEXT; + + req->no_event = no_event; + req->op = PSMX3_AM_REQ_WRITE; + req->write.buf = (void *)iov[0].iov_base; + req->write.len = total_len; + req->write.addr = addr; /* needed? */ + req->write.key = key; /* needed? 
*/ + req->write.context = context; + req->ep = ep_priv; + req->cq_flags = FI_WRITE | FI_RMA; + PSMX3_CTXT_USER(&req->fi_context) = context; + PSMX3_CTXT_EP(&req->fi_context) = ep_priv; + + /* Case 2: send iov in sequence */ + args[0].u32w0 = 0; + + len_sent = 0; + for (i=0; i chunk_size && + len_sent + iov[i].iov_len == total_len) { + PSMX3_SET_TAG(psm2_tag, (uint64_t)req, 0, PSMX3_RMA_TYPE_WRITE); + PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_WRITE_LONG); + args[0].u32w1 = iov[i].iov_len; + args[1].u64 = (uint64_t)req; + args[2].u64 = addr; + args[3].u64 = key; + nargs = 4; + if (flags & FI_REMOTE_CQ_DATA) { + PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA); + args[4].u64 = data; + nargs++; + } + + if (flags & FI_DELIVERY_COMPLETE) { + args[0].u32w0 |= PSMX3_AM_FORCE_ACK; + psm2_context = NULL; + } else { + psm2_context = (void *)&req->fi_context; + } + + err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, + args, nargs, NULL, 0, am_flags, + NULL, NULL); + if (err) { + if (!req_refcnt) + psmx3_am_request_free(ep_priv->tx, req); + return psmx3_errno(err); + } + psmx3_am_poll(ep_priv->tx); + + err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, + 0, &psm2_tag, iov[i].iov_base, + iov[i].iov_len, psm2_context, + &psm2_req); + if (err) { + /* req in use, don't free */ + return psmx3_errno(err); + } + return 0; + } + + /* Case 2.2: use short protocol all other segments */ + PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_WRITE); + nargs = 4; + buf = iov[i].iov_base; + len = iov[i].iov_len; + while (len > chunk_size) { + args[0].u32w1 = chunk_size; + args[1].u64 = (uint64_t)(uintptr_t)req; + args[2].u64 = addr; + args[3].u64 = key; + err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, + args, nargs, (void *)buf, + chunk_size, am_flags, + NULL, NULL); + if (err) { + if (!req_refcnt) + psmx3_am_request_free(ep_priv->tx, req); + return psmx3_errno(err); + } + psmx3_am_poll(ep_priv->tx); + buf += chunk_size; + addr += chunk_size; + len -= chunk_size; + len_sent += chunk_size; + req_refcnt++; + } + + args[0].u32w1 = len; + args[1].u64 = (uint64_t)(uintptr_t)req; + args[2].u64 = addr; + args[3].u64 = key; + if (len_sent + len == total_len) { + if (flags & FI_REMOTE_CQ_DATA) { + PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA | PSMX3_AM_EOM); + args[4].u64 = data; + nargs++; + } else { + PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM); + } + } + err = psm2_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, + args, nargs, (void *)buf, len, + am_flags, NULL, NULL); + if (err) { + if (!req_refcnt) + psmx3_am_request_free(ep_priv->tx, req); + return psmx3_errno(err); + } + psmx3_am_poll(ep_priv->tx); + + addr += len; + len_sent += len; + req_refcnt++; + } + + return 0; +} + +DIRECT_FN +STATIC ssize_t psmx3_write(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, uint64_t addr, + uint64_t key, void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_write_generic(ep, buf, len, desc, dest_addr, addr, + key, context, ep_priv->tx_flags, 0); +} + +DIRECT_FN +STATIC ssize_t psmx3_writemsg(struct fid_ep *ep, + const struct fi_msg_rma *msg, + uint64_t flags) +{ + assert(msg); + assert(msg->msg_iov); + assert(msg->iov_count); + assert(msg->rma_iov); + assert(msg->rma_iov_count == 1); + + if (msg->iov_count > 1) + return psmx3_writev_generic(ep, msg->msg_iov, msg->desc, + msg->iov_count, msg->addr, + msg->rma_iov[0].addr, + msg->rma_iov[0].key, + msg->context, flags, msg->data); + + return 
psmx3_write_generic(ep, msg->msg_iov[0].iov_base, + msg->msg_iov[0].iov_len, + msg->desc ? msg->desc[0] : NULL, msg->addr, + msg->rma_iov[0].addr, msg->rma_iov[0].key, + msg->context, flags, msg->data); +} + +DIRECT_FN +STATIC ssize_t psmx3_writev(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + assert(iov); + assert(count); + + if (count > 1) + return psmx3_writev_generic(ep, iov, desc, count, dest_addr, + addr, key, context, ep_priv->tx_flags, 0); + + return psmx3_write_generic(ep, iov->iov_base, iov->iov_len, + desc ? desc[0] : NULL, dest_addr, addr, key, + context, ep_priv->tx_flags, 0); +} + +DIRECT_FN +STATIC ssize_t psmx3_inject_write(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t addr, uint64_t key) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_write_generic(ep, buf, len, NULL, dest_addr, addr, key, NULL, + ep_priv->tx_flags | FI_INJECT | PSMX3_NO_COMPLETION, + 0); +} + +DIRECT_FN +STATIC ssize_t psmx3_writedata(struct fid_ep *ep, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_write_generic(ep, buf, len, desc, dest_addr, addr, key, + context, ep_priv->tx_flags | FI_REMOTE_CQ_DATA, + data); +} + +DIRECT_FN +STATIC ssize_t psmx3_inject_writedata(struct fid_ep *ep, const void *buf, size_t len, + uint64_t data, fi_addr_t dest_addr, uint64_t addr, + uint64_t key) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_write_generic(ep, buf, len, NULL, dest_addr, addr, key, NULL, + ep_priv->tx_flags | FI_INJECT | PSMX3_NO_COMPLETION, + data); +} + +struct fi_ops_rma psmx3_rma_ops = { + .size = sizeof(struct fi_ops_rma), + .read = psmx3_read, + .readv = psmx3_readv, + .readmsg = psmx3_readmsg, + .write = psmx3_write, + .writev = psmx3_writev, + .writemsg = psmx3_writemsg, + .inject = psmx3_inject_write, + .writedata = psmx3_writedata, + .injectdata = psmx3_inject_writedata, +}; + diff --git a/prov/psm3/src/psmx3_tagged.c b/prov/psm3/src/psmx3_tagged.c new file mode 100644 index 00000000000..47a9a78fe26 --- /dev/null +++ b/prov/psm3/src/psmx3_tagged.c @@ -0,0 +1,1142 @@ +/* + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx3.h" +#include "psmx3_trigger.h" + +static ssize_t psmx3_tagged_peek_generic(struct fid_ep *ep, + void *buf, size_t len, + void *desc, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, + void *context, uint64_t flags) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + struct psmx3_cq_event *event; + psm2_epaddr_t psm2_epaddr; + psm2_mq_req_t req = NULL; + psm2_mq_status2_t psm2_status; + psm2_mq_tag_t psm2_tag, psm2_tagsel; + uint64_t data; + int err; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if ((ep_priv->caps & FI_DIRECTED_RECV) && src_addr != FI_ADDR_UNSPEC) { + av = ep_priv->av; + assert(av); + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->rx, src_addr, av->type); + } else { + psm2_epaddr = 0; + } + + PSMX3_SET_TAG(psm2_tag, tag, 0, PSMX3_TYPE_TAGGED); + PSMX3_SET_MASK(psm2_tagsel, ~ignore, PSMX3_TYPE_MASK); + + if (flags & (FI_CLAIM | FI_DISCARD)) + err = psm2_mq_improbe2(ep_priv->rx->psm2_mq, + psm2_epaddr, &psm2_tag, + &psm2_tagsel, &req, &psm2_status); + else + err = psm2_mq_iprobe2(ep_priv->rx->psm2_mq, + psm2_epaddr, &psm2_tag, &psm2_tagsel, + &psm2_status); + switch (err) { + case PSM2_OK: + if (ep_priv->recv_cq) { + if (flags & FI_CLAIM) { + if (context) + PSMX3_CTXT_REQ((struct fi_context *)context) = req; + } else if (flags & FI_DISCARD) { + if (!psm2_mq_imrecv(ep_priv->rx->psm2_mq, 0, + NULL, 0, req, &req)) + psm2_mq_wait2(&req, NULL); + } + + tag = PSMX3_GET_TAG64(psm2_status.msg_tag); + if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(psm2_status.msg_tag))) { + data = PSMX3_GET_CQDATA(psm2_status.msg_tag); + flags |= FI_REMOTE_CQ_DATA; + } else { + data = 0; + } + len = psm2_status.msg_length; + event = psmx3_cq_create_event( + ep_priv->recv_cq, + context, /* op_context */ + NULL, /* buf */ + flags|FI_RECV|FI_TAGGED,/* flags */ + len, /* len */ + data, /* data */ + tag, /* tag */ + len, /* olen */ + 0); /* err */ + + if (!event) + return -FI_ENOMEM; + + event->source_is_valid = 1; + event->source = psm2_status.msg_peer; + event->source_av = ep_priv->av; + psmx3_cq_enqueue_event(ep_priv->recv_cq, event); + } + return 0; + + case PSM2_MQ_NO_COMPLETIONS: + if (ep_priv->recv_cq) { + event = psmx3_cq_create_event( + ep_priv->recv_cq, + context, /* op_context */ + NULL, /* buf */ + flags|FI_RECV|FI_TAGGED,/* flags */ + len, /* len */ + 0, /* data */ + tag, /* tag */ + len, /* olen */ + -FI_ENOMSG); /* err */ + + if (!event) + return -FI_ENOMEM; + + event->source = 0; + psmx3_cq_enqueue_event(ep_priv->recv_cq, event); + } + return 0; + + default: + return psmx3_errno(err); + } +} + +ssize_t psmx3_tagged_recv_generic(struct fid_ep *ep, void *buf, + size_t len, void *desc, + fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, + void *context, uint64_t flags) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + psm2_epaddr_t psm2_epaddr; + psm2_mq_req_t psm2_req; + psm2_mq_tag_t psm2_tag, psm2_tagsel; + struct fi_context *fi_context; + int err; + int enable_completion; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if 
(flags & FI_PEEK) + return psmx3_tagged_peek_generic(ep, buf, len, desc, + src_addr, tag, ignore, + context, flags); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_trecv(ep, buf, len, desc, + src_addr, tag, ignore, + context, flags); + + if (flags & FI_CLAIM) { + assert(context); + if (flags & FI_DISCARD) { + psm2_mq_status2_t psm2_status; + struct psmx3_cq_event *event; + + fi_context = context; + psm2_req = PSMX3_CTXT_REQ(fi_context); + err = psm2_mq_imrecv(ep_priv->rx->psm2_mq, 0, + NULL, 0, context, &psm2_req); + if (err != PSM2_OK) + return psmx3_errno(err); + + psm2_mq_wait2(&psm2_req, &psm2_status); + + if (ep_priv->recv_cq && + (!ep_priv->recv_selective_completion || (flags & FI_COMPLETION))) { + tag = PSMX3_GET_TAG64(psm2_status.msg_tag); + event = psmx3_cq_create_event( + ep_priv->recv_cq, + context, /* op_context */ + NULL, /* buf */ + flags|FI_RECV|FI_TAGGED,/* flags */ + 0, /* len */ + 0, /* data */ + tag, /* tag */ + 0, /* olen */ + 0); /* err */ + + if (!event) + return -FI_ENOMEM; + + event->source_is_valid = 1; + event->source = psm2_status.msg_peer; + event->source_av = ep_priv->av; + psmx3_cq_enqueue_event(ep_priv->recv_cq, event); + } + + if (ep_priv->recv_cntr) + psmx3_cntr_inc(ep_priv->recv_cntr, 0); + + return 0; + } + + fi_context = context; + psm2_req = PSMX3_CTXT_REQ(fi_context); + PSMX3_CTXT_TYPE(fi_context) = PSMX3_TRECV_CONTEXT; + PSMX3_CTXT_USER(fi_context) = buf; + PSMX3_CTXT_EP(fi_context) = ep_priv; + + err = psm2_mq_imrecv(ep_priv->rx->psm2_mq, 0, + buf, len, context, &psm2_req); + if (err != PSM2_OK) + return psmx3_errno(err); + + PSMX3_CTXT_REQ(fi_context) = psm2_req; + return 0; + } + + enable_completion = !ep_priv->recv_selective_completion || (flags & FI_COMPLETION); + + if (enable_completion) { + assert(context); + fi_context = context; + PSMX3_CTXT_TYPE(fi_context) = PSMX3_TRECV_CONTEXT; + PSMX3_CTXT_EP(fi_context) = ep_priv; + PSMX3_CTXT_USER(fi_context) = buf; + PSMX3_CTXT_SIZE(fi_context) = len; + } else { + PSMX3_EP_GET_OP_CONTEXT(ep_priv, fi_context); + #if !PSMX3_USE_REQ_CONTEXT + PSMX3_CTXT_TYPE(fi_context) = PSMX3_NOCOMP_TRECV_CONTEXT; + PSMX3_CTXT_EP(fi_context) = ep_priv; + PSMX3_CTXT_USER(fi_context) = buf; + PSMX3_CTXT_SIZE(fi_context) = len; + #endif + } + + if ((ep_priv->caps & FI_DIRECTED_RECV) && src_addr != FI_ADDR_UNSPEC) { + av = ep_priv->av; + assert(av); + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->rx, src_addr, av->type); + } else { + psm2_epaddr = 0; + } + + PSMX3_SET_TAG(psm2_tag, tag, 0, PSMX3_TYPE_TAGGED); + PSMX3_SET_MASK(psm2_tagsel, ~ignore, PSMX3_TYPE_MASK); + + err = psm2_mq_irecv2(ep_priv->rx->psm2_mq, psm2_epaddr, + &psm2_tag, &psm2_tagsel, 0, buf, len, + (void *)fi_context, &psm2_req); + + if (err != PSM2_OK) + return psmx3_errno(err); + + if (enable_completion) { + PSMX3_CTXT_REQ(fi_context) = psm2_req; + } else { + #if PSMX3_USE_REQ_CONTEXT + PSMX3_REQ_GET_OP_CONTEXT(psm2_req, fi_context); + PSMX3_CTXT_TYPE(fi_context) = PSMX3_NOCOMP_TRECV_CONTEXT; + PSMX3_CTXT_EP(fi_context) = ep_priv; + PSMX3_CTXT_USER(fi_context) = buf; + PSMX3_CTXT_SIZE(fi_context) = len; + #endif + } + + return 0; +} + +__attribute__((always_inline)) +static inline ssize_t +psmx3_tagged_recv_specialized(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, + void *context, + int enable_completion, + int directed_receive, + int av_map) +{ + struct psmx3_fid_ep *ep_priv; + psm2_epaddr_t psm2_epaddr; + psm2_mq_req_t psm2_req; + psm2_mq_tag_t psm2_tag, psm2_tagsel; + struct 
fi_context *fi_context; + int err; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (enable_completion) { + fi_context = context; + PSMX3_CTXT_TYPE(fi_context) = PSMX3_TRECV_CONTEXT; + PSMX3_CTXT_EP(fi_context) = ep_priv; + PSMX3_CTXT_USER(fi_context) = buf; + PSMX3_CTXT_SIZE(fi_context) = len; + } else { + PSMX3_EP_GET_OP_CONTEXT(ep_priv, fi_context); + #if !PSMX3_USE_REQ_CONTEXT + PSMX3_CTXT_TYPE(fi_context) = PSMX3_NOCOMP_TRECV_CONTEXT; + PSMX3_CTXT_EP(fi_context) = ep_priv; + PSMX3_CTXT_USER(fi_context) = buf; + PSMX3_CTXT_SIZE(fi_context) = len; + #endif + } + + if (directed_receive && src_addr != FI_ADDR_UNSPEC) { + if (av_map) { + psm2_epaddr = (psm2_epaddr_t)src_addr; + } else { + assert(ep_priv->av); + psm2_epaddr = psmx3_av_translate_addr(ep_priv->av, ep_priv->rx, src_addr, FI_AV_TABLE); + } + } else { + psm2_epaddr = 0; + } + + PSMX3_SET_TAG(psm2_tag, tag, 0, PSMX3_TYPE_TAGGED); + PSMX3_SET_MASK(psm2_tagsel, ~ignore, PSMX3_TYPE_MASK); + + err = psm2_mq_irecv2(ep_priv->rx->psm2_mq, psm2_epaddr, + &psm2_tag, &psm2_tagsel, 0, buf, len, + (void *)fi_context, &psm2_req); + + if (OFI_UNLIKELY((err != PSM2_OK))) + return psmx3_errno(err); + + if (enable_completion) { + PSMX3_CTXT_REQ(fi_context) = psm2_req; + } else { + #if PSMX3_USE_REQ_CONTEXT + PSMX3_REQ_GET_OP_CONTEXT(psm2_req, fi_context); + PSMX3_CTXT_TYPE(fi_context) = PSMX3_NOCOMP_TRECV_CONTEXT; + PSMX3_CTXT_EP(fi_context) = ep_priv; + PSMX3_CTXT_USER(fi_context) = buf; + PSMX3_CTXT_SIZE(fi_context) = len; + #endif + } + + return 0; +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION not set, FI_DIRECTED_RECEIVE not set, av table */ +static ssize_t +psmx3_tagged_recv_no_flag_undirected(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, + void *context) +{ + return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr, + tag, ignore, context, 1, 0, 0); +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION set, FI_DIRECTED_RECEIVE not set, av table */ +static ssize_t +psmx3_tagged_recv_no_event_undirected(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, + void *context) +{ + return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr, + tag, ignore, context, 0, 0, 0); +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION not set, FI_DIRECTED_RECEIVE set, av_table */ +static ssize_t +psmx3_tagged_recv_no_flag_directed(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, + void *context) +{ + return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr, + tag, ignore, context, 1, 1, 0); +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION set, FI_DIRECTED_RECEIVE set, av table */ +static ssize_t +psmx3_tagged_recv_no_event_directed(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, + void *context) +{ + return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr, + tag, ignore, context, 0, 1, 0); +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION not set, FI_DIRECTED_RECEIVE not set, av map */ +static ssize_t +psmx3_tagged_recv_no_flag_undirected_av_map(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, + void *context) +{ + return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr, + tag, ignore, context, 1, 0, 1); +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION set, FI_DIRECTED_RECEIVE not set, av map */ +static ssize_t 
+psmx3_tagged_recv_no_event_undirected_av_map(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, + void *context) +{ + return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr, + tag, ignore, context, 0, 0, 1); +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION not set, FI_DIRECTED_RECEIVE set, av_map */ +static ssize_t +psmx3_tagged_recv_no_flag_directed_av_map(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, + void *context) +{ + return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr, + tag, ignore, context, 1, 1, 1); +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION set, FI_DIRECTED_RECEIVE set, av map */ +static ssize_t +psmx3_tagged_recv_no_event_directed_av_map(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, + void *context) +{ + return psmx3_tagged_recv_specialized(ep, buf, len, desc, src_addr, + tag, ignore, context, 0, 1, 1); +} + +static ssize_t psmx3_tagged_recv(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, uint64_t tag, + uint64_t ignore, void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_tagged_recv_generic(ep, buf, len, desc, src_addr, tag, + ignore, context, ep_priv->rx_flags); +} + +static ssize_t psmx3_tagged_recvmsg(struct fid_ep *ep, + const struct fi_msg_tagged *msg, + uint64_t flags) +{ + void *buf; + size_t len; + + assert(msg); + assert(!msg->iov_count || msg->msg_iov); + assert(msg->iov_count <= 1); + + if (msg->iov_count) { + buf = msg->msg_iov[0].iov_base; + len = msg->msg_iov[0].iov_len; + } else { + buf = NULL; + len = 0; + } + + return psmx3_tagged_recv_generic(ep, buf, len, + msg->desc ? msg->desc[0] : NULL, + msg->addr, msg->tag, msg->ignore, + msg->context, flags); +} + +#define PSMX3_TAGGED_RECVV_FUNC(suffix) \ +static ssize_t \ +psmx3_tagged_recvv##suffix(struct fid_ep *ep, const struct iovec *iov, \ + void **desc, size_t count, \ + fi_addr_t src_addr, uint64_t tag, \ + uint64_t ignore, void *context) \ +{ \ + void *buf; \ + size_t len; \ + assert(!count || iov); \ + assert(count <= 1); \ + if (count) { \ + buf = iov[0].iov_base; \ + len = iov[0].iov_len; \ + } else { \ + buf = NULL; \ + len = 0; \ + } \ + return psmx3_tagged_recv##suffix(ep, buf, len, \ + desc ? 
desc[0] : NULL, \ + src_addr, tag, ignore, \ + context); \ +} + +PSMX3_TAGGED_RECVV_FUNC() +PSMX3_TAGGED_RECVV_FUNC(_no_flag_directed) +PSMX3_TAGGED_RECVV_FUNC(_no_event_directed) +PSMX3_TAGGED_RECVV_FUNC(_no_flag_undirected) +PSMX3_TAGGED_RECVV_FUNC(_no_event_undirected) +PSMX3_TAGGED_RECVV_FUNC(_no_flag_directed_av_map) +PSMX3_TAGGED_RECVV_FUNC(_no_event_directed_av_map) +PSMX3_TAGGED_RECVV_FUNC(_no_flag_undirected_av_map) +PSMX3_TAGGED_RECVV_FUNC(_no_event_undirected_av_map) + +ssize_t psmx3_tagged_send_generic(struct fid_ep *ep, + const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, + uint64_t tag, void *context, + uint64_t flags, uint64_t data) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + psm2_epaddr_t psm2_epaddr; + psm2_mq_req_t psm2_req; + psm2_mq_tag_t psm2_tag; + struct fi_context *fi_context; + int err; + int no_completion = 0; + struct psmx3_cq_event *event; + int have_data = (flags & FI_REMOTE_CQ_DATA) > 0; + + assert((tag & ~PSMX3_TAG_MASK) == 0); + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_tsend(ep, buf, len, desc, + dest_addr, tag, context, + flags, data); + + av = ep_priv->av; + assert(av); + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); + + if (have_data) + PSMX3_SET_TAG(psm2_tag, tag, (uint32_t)data, + PSMX3_TYPE_TAGGED | PSMX3_IMM_BIT); + else + PSMX3_SET_TAG(psm2_tag, tag, (uint32_t)ep_priv->sep_id, + PSMX3_TYPE_TAGGED); + + if ((flags & PSMX3_NO_COMPLETION) || + (ep_priv->send_selective_completion && !(flags & FI_COMPLETION))) + no_completion = 1; + + if (flags & FI_INJECT) { + if (len > psmx3_env.inject_size) + return -FI_EMSGSIZE; + + err = psm2_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, + 0, &psm2_tag, buf, len); + + if (err != PSM2_OK) + return psmx3_errno(err); + + if (ep_priv->send_cntr) + psmx3_cntr_inc(ep_priv->send_cntr, 0); + + if (ep_priv->send_cq && !no_completion) { + event = psmx3_cq_create_event( + ep_priv->send_cq, + context, (void *)buf, flags, len, + (uint64_t) data, tag, + 0 /* olen */, + 0 /* err */); + + if (event) + psmx3_cq_enqueue_event(ep_priv->send_cq, event); + else + return -FI_ENOMEM; + } + + return 0; + } + + if (no_completion) { + fi_context = &ep_priv->nocomp_tsend_context; + } else { + assert(context); + fi_context = context; + PSMX3_CTXT_TYPE(fi_context) = PSMX3_TSEND_CONTEXT; + PSMX3_CTXT_USER(fi_context) = (void *)buf; + PSMX3_CTXT_EP(fi_context) = ep_priv; + } + + err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, 0, + &psm2_tag, buf, len, (void*)fi_context, + &psm2_req); + + if (err != PSM2_OK) + return psmx3_errno(err); + + if (fi_context == context) + PSMX3_CTXT_REQ(fi_context) = psm2_req; + + return 0; +} + +__attribute__((always_inline)) +static inline ssize_t +psmx3_tagged_send_specialized(struct fid_ep *ep, const void *buf, + size_t len, void *desc, + fi_addr_t dest_addr, uint64_t tag, + void *context, + int enable_completion, int av_map, + int have_data, uint64_t data) +{ + struct psmx3_fid_ep *ep_priv; + psm2_epaddr_t psm2_epaddr; + psm2_mq_req_t psm2_req; + psm2_mq_tag_t psm2_tag; + struct fi_context *fi_context; + int err; + + assert((tag & ~PSMX3_TAG_MASK) == 0); + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (av_map) { + psm2_epaddr = (psm2_epaddr_t)dest_addr; + } else { + assert(ep_priv->av); + psm2_epaddr = psmx3_av_translate_addr(ep_priv->av, ep_priv->tx, dest_addr, FI_AV_TABLE); + } + + if (have_data) + PSMX3_SET_TAG(psm2_tag, tag, data, PSMX3_TYPE_TAGGED | 
PSMX3_IMM_BIT); + else + PSMX3_SET_TAG(psm2_tag, tag, ep_priv->sep_id, PSMX3_TYPE_TAGGED); + + if (enable_completion) { + fi_context = context; + PSMX3_CTXT_TYPE(fi_context) = PSMX3_TSEND_CONTEXT; + PSMX3_CTXT_USER(fi_context) = (void *)buf; + PSMX3_CTXT_EP(fi_context) = ep_priv; + } else { + fi_context = &ep_priv->nocomp_tsend_context; + } + + err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, 0, + &psm2_tag, buf, len, (void*)fi_context, + &psm2_req); + + if (err != PSM2_OK) + return psmx3_errno(err); + + if (enable_completion) + PSMX3_CTXT_REQ(fi_context) = psm2_req; + + return 0; +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION not set, av_table */ +static ssize_t +psmx3_tagged_send_no_flag(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, uint64_t tag, + void *context) +{ + return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag, + context, 1, 0, 0, 0); +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION set, av_table */ +static ssize_t +psmx3_tagged_send_no_event(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, uint64_t tag, + void *context) +{ + return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag, + context, 0, 0, 0, 0); +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION not set, av_map */ +static ssize_t +psmx3_tagged_send_no_flag_av_map(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, uint64_t tag, + void *context) +{ + return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag, + context, 1, 1, 0, 0); +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION set, av_map */ +static ssize_t +psmx3_tagged_send_no_event_av_map(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, uint64_t tag, + void *context) +{ + return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag, + context, 0, 1, 0, 0); +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION not set, av_table */ +static ssize_t +psmx3_tagged_senddata_no_flag(struct fid_ep *ep, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + uint64_t tag, void *context) +{ + return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag, + context, 1, 0, 1, data); +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION set, av_table */ +static ssize_t +psmx3_tagged_senddata_no_event(struct fid_ep *ep, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + uint64_t tag, void *context) +{ + return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag, + context, 0, 0, 1, data); +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION not set, av_map */ +static ssize_t +psmx3_tagged_senddata_no_flag_av_map(struct fid_ep *ep, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + uint64_t tag, void *context) +{ + return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag, + context, 1, 1, 1, data); +} + +/* op_flags=0, FI_SELECTIVE_COMPLETION set, av_map */ +static ssize_t +psmx3_tagged_senddata_no_event_av_map(struct fid_ep *ep, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + uint64_t tag, void *context) +{ + return psmx3_tagged_send_specialized(ep, buf, len, desc, dest_addr, tag, + context, 0, 1, 1, data); +} + +__attribute__((always_inline)) +static inline ssize_t +psmx3_tagged_inject_specialized(struct fid_ep *ep, const void *buf, + size_t len, fi_addr_t dest_addr, + uint64_t tag, int av_map, + int have_data, uint64_t data) +{ + struct psmx3_fid_ep *ep_priv; + 
psm2_epaddr_t psm2_epaddr; + psm2_mq_tag_t psm2_tag; + int err; + + assert((tag & ~PSMX3_TAG_MASK) == 0); + + if (len > psmx3_env.inject_size) + return -FI_EMSGSIZE; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (av_map) { + psm2_epaddr = (psm2_epaddr_t)dest_addr; + } else { + assert(ep_priv->av); + psm2_epaddr = psmx3_av_translate_addr(ep_priv->av, ep_priv->tx, dest_addr, FI_AV_TABLE); + } + + if (have_data) + PSMX3_SET_TAG(psm2_tag, tag, data, PSMX3_TYPE_TAGGED | PSMX3_IMM_BIT); + else + PSMX3_SET_TAG(psm2_tag, tag, ep_priv->sep_id, PSMX3_TYPE_TAGGED); + + err = psm2_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, 0, + &psm2_tag, buf, len); + + if (err != PSM2_OK) + return psmx3_errno(err); + + if (ep_priv->send_cntr) + psmx3_cntr_inc(ep_priv->send_cntr, 0); + + return 0; +} + +/* op_flags=0, av_table */ +static ssize_t +psmx3_tagged_inject_no_flag(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t tag) +{ + return psmx3_tagged_inject_specialized(ep, buf, len, dest_addr, tag, + 0, 0, 0); +} + +/* op_flags=0, av_map */ +static ssize_t +psmx3_tagged_inject_no_flag_av_map(struct fid_ep *ep, const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t tag) +{ + return psmx3_tagged_inject_specialized(ep, buf, len, dest_addr, tag, + 1, 0, 0); +} + +/* op_flags=0, av_table */ +static ssize_t +psmx3_tagged_injectdata_no_flag(struct fid_ep *ep, const void *buf, size_t len, + uint64_t data, fi_addr_t dest_addr, uint64_t tag) +{ + return psmx3_tagged_inject_specialized(ep, buf, len, dest_addr, tag, + 0, 1, data); +} + +/* op_flags=0, av_map */ +static ssize_t +psmx3_tagged_injectdata_no_flag_av_map(struct fid_ep *ep, const void *buf, size_t len, + uint64_t data, fi_addr_t dest_addr, uint64_t tag) +{ + return psmx3_tagged_inject_specialized(ep, buf, len, dest_addr, tag, + 1, 1, data); +} + +ssize_t psmx3_tagged_sendv_generic(struct fid_ep *ep, + const struct iovec *iov, void **desc, + size_t count, fi_addr_t dest_addr, + uint64_t tag, void *context, + uint64_t flags, uint64_t data) +{ + struct psmx3_fid_ep *ep_priv; + struct psmx3_fid_av *av; + psm2_epaddr_t psm2_epaddr; + psm2_mq_req_t psm2_req; + psm2_mq_tag_t psm2_tag; + struct fi_context * fi_context; + int send_flag = 0; + int err; + int no_completion = 0; + struct psmx3_cq_event *event; + size_t real_count; + size_t len, total_len; + char *p; + uint32_t *q; + int i, j=0; + struct psmx3_sendv_request *req; + int have_data = (flags & FI_REMOTE_CQ_DATA) > 0; + uint32_t msg_flags; + + assert((tag & ~PSMX3_TAG_MASK) == 0); + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + if (flags & FI_TRIGGER) + return psmx3_trigger_queue_tsendv(ep, iov, desc, count, + dest_addr, tag, context, + flags, data); + + total_len = 0; + real_count = 0; + for (i=0; iiov_protocol = PSMX3_IOV_PROTO_PACK; + p = req->buf; + for (i=0; iiov_protocol = PSMX3_IOV_PROTO_MULTI; + req->iov_done = 0; + req->iov_info.seq_num = (++ep_priv->iov_seq_num) % + PSMX3_IOV_MAX_SEQ_NUM + 1; + req->iov_info.count = (uint32_t)real_count; + req->iov_info.total_len = (uint32_t)total_len; + + q = req->iov_info.len; + for (i=0; iav; + assert(av); + psm2_epaddr = psmx3_av_translate_addr(av, ep_priv->tx, dest_addr, av->type); + + if (have_data) + PSMX3_SET_TAG(psm2_tag, tag, (uint32_t)data, msg_flags | PSMX3_IMM_BIT); + else + PSMX3_SET_TAG(psm2_tag, tag, (uint32_t)ep_priv->sep_id, msg_flags); + + if ((flags & PSMX3_NO_COMPLETION) || + (ep_priv->send_selective_completion && !(flags & FI_COMPLETION))) + no_completion = 1; + + if (flags & FI_INJECT) 
{ + if (len > psmx3_env.inject_size) { + free(req); + return -FI_EMSGSIZE; + } + + err = psm2_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, + send_flag, &psm2_tag, req->buf, len); + + free(req); + + if (err != PSM2_OK) + return psmx3_errno(err); + + if (ep_priv->send_cntr) + psmx3_cntr_inc(ep_priv->send_cntr, 0); + + if (ep_priv->send_cq && !no_completion) { + event = psmx3_cq_create_event( + ep_priv->send_cq, + context, NULL, flags, len, + (uint64_t) data, + 0 /* tag */, + 0 /* olen */, + 0 /* err */); + + if (event) + psmx3_cq_enqueue_event(ep_priv->send_cq, event); + else + return -FI_ENOMEM; + } + + return 0; + } + + req->no_completion = no_completion; + req->user_context = context; + req->comp_flag = FI_TAGGED; + + fi_context = &req->fi_context; + PSMX3_CTXT_TYPE(fi_context) = PSMX3_SENDV_CONTEXT; + PSMX3_CTXT_USER(fi_context) = req; + PSMX3_CTXT_EP(fi_context) = ep_priv; + + err = psm2_mq_isend2(ep_priv->tx->psm2_mq, psm2_epaddr, + send_flag, &psm2_tag, req->buf, len, + (void *)fi_context, &psm2_req); + + if (err != PSM2_OK) { + free(req); + return psmx3_errno(err); + } + + PSMX3_CTXT_REQ(fi_context) = psm2_req; + + if (req->iov_protocol == PSMX3_IOV_PROTO_MULTI) { + fi_context = &req->fi_context_iov; + PSMX3_CTXT_TYPE(fi_context) = PSMX3_IOV_SEND_CONTEXT; + PSMX3_CTXT_USER(fi_context) = req; + PSMX3_CTXT_EP(fi_context) = ep_priv; + PSMX3_SET_TAG(psm2_tag, req->iov_info.seq_num, 0, + PSMX3_TYPE_IOV_PAYLOAD); + for (i=0; itx->psm2_mq, + psm2_epaddr, send_flag, &psm2_tag, + iov[i].iov_base, iov[i].iov_len, + (void *)fi_context, &psm2_req); + if (err != PSM2_OK) + return psmx3_errno(err); + } + } + } + + return 0; +} + +static ssize_t psmx3_tagged_send(struct fid_ep *ep, + const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, + uint64_t tag, void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_tagged_send_generic(ep, buf, len, desc, dest_addr, + tag, context, ep_priv->tx_flags, 0); +} + +static ssize_t psmx3_tagged_sendmsg(struct fid_ep *ep, + const struct fi_msg_tagged *msg, + uint64_t flags) +{ + void *buf; + size_t len; + + assert(msg); + assert(!msg->iov_count || msg->msg_iov); + assert(msg->iov_count <= PSMX3_IOV_MAX_COUNT); + + if (msg->iov_count > 1) { + return psmx3_tagged_sendv_generic(ep, msg->msg_iov, + msg->desc, msg->iov_count, + msg->addr, msg->tag, + msg->context, flags, + msg->data); + } else if (msg->iov_count) { + buf = msg->msg_iov[0].iov_base; + len = msg->msg_iov[0].iov_len; + } else { + buf = NULL; + len = 0; + } + + return psmx3_tagged_send_generic(ep, buf, len, + msg->desc ? 
msg->desc[0] : NULL, + msg->addr, msg->tag, msg->context, + flags, msg->data); +} + +ssize_t psmx3_tagged_senddata(struct fid_ep *ep, const void *buf, size_t len, + void *desc, uint64_t data, fi_addr_t dest_addr, + uint64_t tag, void *context) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_tagged_send_generic(ep, buf, len, desc, dest_addr, + tag, context, + ep_priv->tx_flags | FI_REMOTE_CQ_DATA, + data); +} + +#define PSMX3_TAGGED_SENDV_FUNC(suffix) \ +static ssize_t \ +psmx3_tagged_sendv##suffix(struct fid_ep *ep, const struct iovec *iov, \ + void **desc,size_t count, \ + fi_addr_t dest_addr, uint64_t tag, \ + void *context) \ +{ \ + void *buf; \ + size_t len; \ + assert(!count || iov); \ + assert(count <= PSMX3_IOV_MAX_COUNT); \ + if (count > 1) { \ + struct psmx3_fid_ep *ep_priv; \ + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); \ + return psmx3_tagged_sendv_generic(ep, iov, desc, count, \ + dest_addr, tag, \ + context, \ + ep_priv->tx_flags, 0);\ + } else if (count) { \ + buf = iov[0].iov_base; \ + len = iov[0].iov_len; \ + } else { \ + buf = NULL; \ + len = 0; \ + } \ + return psmx3_tagged_send##suffix(ep, buf, len, \ + desc ? desc[0] : NULL, \ + dest_addr, tag, context); \ +} + +PSMX3_TAGGED_SENDV_FUNC() +PSMX3_TAGGED_SENDV_FUNC(_no_flag) +PSMX3_TAGGED_SENDV_FUNC(_no_event) +PSMX3_TAGGED_SENDV_FUNC(_no_flag_av_map) +PSMX3_TAGGED_SENDV_FUNC(_no_event_av_map) + +static ssize_t psmx3_tagged_inject(struct fid_ep *ep, + const void *buf, size_t len, + fi_addr_t dest_addr, uint64_t tag) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_tagged_send_generic(ep, buf, len, NULL, dest_addr, + tag, NULL, + ep_priv->tx_flags | FI_INJECT | PSMX3_NO_COMPLETION, + 0); +} + +static ssize_t psmx3_tagged_injectdata(struct fid_ep *ep, + const void *buf, size_t len, uint64_t data, + fi_addr_t dest_addr, uint64_t tag) +{ + struct psmx3_fid_ep *ep_priv; + + ep_priv = container_of(ep, struct psmx3_fid_ep, ep); + + return psmx3_tagged_send_generic(ep, buf, len, NULL, dest_addr, + tag, NULL, + ep_priv->tx_flags | FI_INJECT | FI_REMOTE_CQ_DATA | + PSMX3_NO_COMPLETION, + data); +} + +#define PSMX3_TAGGED_OPS(suffix,sendopt,recvopt,injopt) \ +struct fi_ops_tagged psmx3_tagged_ops##suffix = { \ + .size = sizeof(struct fi_ops_tagged), \ + .recv = psmx3_tagged_recv##recvopt, \ + .recvv = psmx3_tagged_recvv##recvopt, \ + .recvmsg = psmx3_tagged_recvmsg, \ + .send = psmx3_tagged_send##sendopt, \ + .sendv = psmx3_tagged_sendv##sendopt, \ + .sendmsg = psmx3_tagged_sendmsg, \ + .inject = psmx3_tagged_inject##injopt, \ + .senddata = psmx3_tagged_senddata##sendopt, \ + .injectdata = psmx3_tagged_injectdata##injopt, \ +}; + +PSMX3_TAGGED_OPS(,,,) +PSMX3_TAGGED_OPS(_no_flag_directed, _no_flag, _no_flag_directed, _no_flag) +PSMX3_TAGGED_OPS(_no_event_directed, _no_event, _no_event_directed, _no_flag) +PSMX3_TAGGED_OPS(_no_send_event_directed, _no_event, _no_flag_directed, _no_flag) +PSMX3_TAGGED_OPS(_no_recv_event_directed, _no_flag, _no_event_directed, _no_flag) +PSMX3_TAGGED_OPS(_no_flag_undirected, _no_flag, _no_flag_undirected, _no_flag) +PSMX3_TAGGED_OPS(_no_event_undirected, _no_event, _no_event_undirected, _no_flag) +PSMX3_TAGGED_OPS(_no_send_event_undirected, _no_event, _no_flag_undirected, _no_flag) +PSMX3_TAGGED_OPS(_no_recv_event_undirected, _no_flag, _no_event_undirected, _no_flag) +PSMX3_TAGGED_OPS(_no_flag_directed_av_map, _no_flag_av_map, _no_flag_directed_av_map, _no_flag_av_map) 
+PSMX3_TAGGED_OPS(_no_event_directed_av_map, _no_event_av_map, _no_event_directed_av_map, _no_flag_av_map) +PSMX3_TAGGED_OPS(_no_send_event_directed_av_map, _no_event_av_map, _no_flag_directed_av_map, _no_flag_av_map) +PSMX3_TAGGED_OPS(_no_recv_event_directed_av_map, _no_flag_av_map, _no_event_directed_av_map, _no_flag_av_map) +PSMX3_TAGGED_OPS(_no_flag_undirected_av_map, _no_flag_av_map, _no_flag_undirected_av_map, _no_flag_av_map) +PSMX3_TAGGED_OPS(_no_event_undirected_av_map, _no_event_av_map, _no_event_undirected_av_map, _no_flag_av_map) +PSMX3_TAGGED_OPS(_no_send_event_undirected_av_map, _no_event_av_map, _no_flag_undirected_av_map, _no_flag_av_map) +PSMX3_TAGGED_OPS(_no_recv_event_undirected_av_map, _no_flag_av_map, _no_event_undirected_av_map, _no_flag_av_map) + diff --git a/prov/psm3/src/psmx3_trigger.h b/prov/psm3/src/psmx3_trigger.h new file mode 100644 index 00000000000..68f152886b0 --- /dev/null +++ b/prov/psm3/src/psmx3_trigger.h @@ -0,0 +1,1113 @@ +/* + * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _FI_PSM2_TRIGGER_H +#define _FI_PSM2_TRIGGER_H + +#ifdef __cplusplus +extern "C" { +#endif + +enum psmx3_triggered_op { + PSMX3_TRIGGERED_SEND, + PSMX3_TRIGGERED_SENDV, + PSMX3_TRIGGERED_RECV, + PSMX3_TRIGGERED_TSEND, + PSMX3_TRIGGERED_TSENDV, + PSMX3_TRIGGERED_TRECV, + PSMX3_TRIGGERED_WRITE, + PSMX3_TRIGGERED_WRITEV, + PSMX3_TRIGGERED_READ, + PSMX3_TRIGGERED_READV, + PSMX3_TRIGGERED_ATOMIC_WRITE, + PSMX3_TRIGGERED_ATOMIC_WRITEV, + PSMX3_TRIGGERED_ATOMIC_READWRITE, + PSMX3_TRIGGERED_ATOMIC_READWRITEV, + PSMX3_TRIGGERED_ATOMIC_COMPWRITE, + PSMX3_TRIGGERED_ATOMIC_COMPWRITEV, +}; + +struct psmx3_trigger { + enum psmx3_triggered_op op; + struct psmx3_fid_cntr *cntr; + size_t threshold; + union { + struct { + struct fid_ep *ep; + const void *buf; + size_t len; + void *desc; + fi_addr_t dest_addr; + void *context; + uint64_t flags; + uint64_t data; + } send; + struct { + struct fid_ep *ep; + const struct iovec *iov; + size_t count; + void **desc; + fi_addr_t dest_addr; + void *context; + uint64_t flags; + uint64_t data; + } sendv; + struct { + struct fid_ep *ep; + void *buf; + size_t len; + void *desc; + fi_addr_t src_addr; + void *context; + uint64_t flags; + } recv; + struct { + struct fid_ep *ep; + const void *buf; + size_t len; + void *desc; + fi_addr_t dest_addr; + uint64_t tag; + void *context; + uint64_t flags; + uint64_t data; + } tsend; + struct { + struct fid_ep *ep; + const struct iovec *iov; + size_t count; + void **desc; + fi_addr_t dest_addr; + uint64_t tag; + void *context; + uint64_t flags; + uint64_t data; + } tsendv; + struct { + struct fid_ep *ep; + void *buf; + size_t len; + void *desc; + fi_addr_t src_addr; + uint64_t tag; + uint64_t ignore; + void *context; + uint64_t flags; + } trecv; + struct { + struct fid_ep *ep; + const void *buf; + size_t len; + void *desc; + fi_addr_t dest_addr; + uint64_t addr; + uint64_t key; + void *context; + uint64_t flags; + uint64_t data; + } write; + struct { + struct fid_ep *ep; + const struct iovec *iov; + size_t count; + void *desc; + fi_addr_t dest_addr; + uint64_t addr; + uint64_t key; + void *context; + uint64_t flags; + uint64_t data; + } writev; + struct { + struct fid_ep *ep; + void *buf; + size_t len; + void *desc; + fi_addr_t src_addr; + uint64_t addr; + uint64_t key; + void *context; + uint64_t flags; + } read; + struct { + struct fid_ep *ep; + const struct iovec *iov; + size_t count; + void *desc; + fi_addr_t src_addr; + uint64_t addr; + uint64_t key; + void *context; + uint64_t flags; + } readv; + struct { + struct fid_ep *ep; + const void *buf; + size_t count; + void *desc; + fi_addr_t dest_addr; + uint64_t addr; + uint64_t key; + enum fi_datatype datatype; + enum fi_op atomic_op; + void *context; + uint64_t flags; + } atomic_write; + struct { + struct fid_ep *ep; + const struct fi_ioc *iov; + size_t count; + void *desc; + fi_addr_t dest_addr; + uint64_t addr; + uint64_t key; + enum fi_datatype datatype; + enum fi_op atomic_op; + void *context; + uint64_t flags; + } atomic_writev; + struct { + struct fid_ep *ep; + const void *buf; + size_t count; + void *desc; + void *result; + void *result_desc; + fi_addr_t dest_addr; + uint64_t addr; + uint64_t key; + enum fi_datatype datatype; + enum fi_op atomic_op; + void *context; + uint64_t flags; + } atomic_readwrite; + struct { + struct fid_ep *ep; + const struct fi_ioc *iov; + size_t count; + void **desc; + struct fi_ioc *resultv; + void **result_desc; + size_t result_count; + fi_addr_t dest_addr; + uint64_t addr; + uint64_t key; + enum fi_datatype datatype; + enum 
fi_op atomic_op; + void *context; + uint64_t flags; + } atomic_readwritev; + struct { + struct fid_ep *ep; + const void *buf; + size_t count; + void *desc; + const void *compare; + void *compare_desc; + void *result; + void *result_desc; + fi_addr_t dest_addr; + uint64_t addr; + uint64_t key; + enum fi_datatype datatype; + enum fi_op atomic_op; + void *context; + uint64_t flags; + } atomic_compwrite; + struct { + struct fid_ep *ep; + const struct fi_ioc *iov; + size_t count; + void **desc; + const struct fi_ioc *comparev; + void **compare_desc; + size_t compare_count; + struct fi_ioc *resultv; + void **result_desc; + size_t result_count; + fi_addr_t dest_addr; + uint64_t addr; + uint64_t key; + enum fi_datatype datatype; + enum fi_op atomic_op; + void *context; + uint64_t flags; + } atomic_compwritev; + }; + struct psmx3_trigger *next; /* used for randomly accessed trigger list */ + struct slist_entry list_entry; /* used for ready-to-fire trigger queue */ +}; + +ssize_t psmx3_send_generic( + struct fid_ep *ep, + const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, + void *context, uint64_t flags, + uint64_t data); + +ssize_t psmx3_sendv_generic( + struct fid_ep *ep, + const struct iovec *iov, void **desc, + size_t count, fi_addr_t dest_addr, + void *context, uint64_t flags, + uint64_t data); + +ssize_t psmx3_recv_generic( + struct fid_ep *ep, + void *buf, size_t len, void *desc, + fi_addr_t src_addr, void *context, + uint64_t flags); + +ssize_t psmx3_tagged_send_generic( + struct fid_ep *ep, + const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, + uint64_t tag, void *context, + uint64_t flags, uint64_t data); + +ssize_t psmx3_tagged_sendv_generic( + struct fid_ep *ep, + const struct iovec *iov, void **desc, + size_t count, fi_addr_t dest_addr, + uint64_t tag, void *context, + uint64_t flags, uint64_t data); + +ssize_t psmx3_tagged_recv_generic( + struct fid_ep *ep, + void *buf, size_t len, + void *desc, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, + void *context, uint64_t flags); + +ssize_t psmx3_write_generic( + struct fid_ep *ep, + const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + void *context, uint64_t flags, + uint64_t data); + +ssize_t psmx3_writev_generic( + struct fid_ep *ep, + const struct iovec *iov, void **desc, + size_t count, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + void *context, uint64_t flags, + uint64_t data); + +ssize_t psmx3_read_generic( + struct fid_ep *ep, + void *buf, size_t len, + void *desc, fi_addr_t src_addr, + uint64_t addr, uint64_t key, + void *context, uint64_t flags); + +ssize_t psmx3_readv_generic( + struct fid_ep *ep, + const struct iovec *iov, void *desc, + size_t count, fi_addr_t src_addr, + uint64_t addr, uint64_t key, + void *context, uint64_t flags); + +ssize_t psmx3_atomic_write_generic( + struct fid_ep *ep, + const void *buf, + size_t count, void *desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags); + +ssize_t psmx3_atomic_readwrite_generic( + struct fid_ep *ep, + const void *buf, + size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags); + +ssize_t psmx3_atomic_compwrite_generic( + struct fid_ep *ep, + const void *buf, + size_t count, void *desc, + const void *compare, void *compare_desc, + void *result, void *result_desc, + fi_addr_t 
dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags); + +static inline +int psmx3_process_trigger(struct psmx3_trx_ctxt *trx_ctxt, + struct psmx3_trigger *trigger) +{ + switch (trigger->op) { + case PSMX3_TRIGGERED_SEND: + psmx3_send_generic(trigger->send.ep, + trigger->send.buf, + trigger->send.len, + trigger->send.desc, + trigger->send.dest_addr, + trigger->send.context, + trigger->send.flags, + trigger->send.data); + break; + case PSMX3_TRIGGERED_SENDV: + psmx3_sendv_generic(trigger->sendv.ep, + trigger->sendv.iov, + trigger->sendv.desc, + trigger->sendv.count, + trigger->sendv.dest_addr, + trigger->sendv.context, + trigger->sendv.flags, + trigger->sendv.data); + break; + case PSMX3_TRIGGERED_RECV: + psmx3_recv_generic(trigger->recv.ep, + trigger->recv.buf, + trigger->recv.len, + trigger->recv.desc, + trigger->recv.src_addr, + trigger->recv.context, + trigger->recv.flags); + break; + case PSMX3_TRIGGERED_TSEND: + psmx3_tagged_send_generic(trigger->tsend.ep, + trigger->tsend.buf, + trigger->tsend.len, + trigger->tsend.desc, + trigger->tsend.dest_addr, + trigger->tsend.tag, + trigger->tsend.context, + trigger->tsend.flags, + trigger->tsend.data); + break; + case PSMX3_TRIGGERED_TSENDV: + psmx3_tagged_sendv_generic(trigger->tsendv.ep, + trigger->tsendv.iov, + trigger->tsendv.desc, + trigger->tsendv.count, + trigger->tsendv.dest_addr, + trigger->tsendv.tag, + trigger->tsendv.context, + trigger->tsendv.flags, + trigger->tsendv.data); + break; + case PSMX3_TRIGGERED_TRECV: + psmx3_tagged_recv_generic(trigger->trecv.ep, + trigger->trecv.buf, + trigger->trecv.len, + trigger->trecv.desc, + trigger->trecv.src_addr, + trigger->trecv.tag, + trigger->trecv.ignore, + trigger->trecv.context, + trigger->trecv.flags); + break; + case PSMX3_TRIGGERED_WRITE: + psmx3_write_generic(trigger->write.ep, + trigger->write.buf, + trigger->write.len, + trigger->write.desc, + trigger->write.dest_addr, + trigger->write.addr, + trigger->write.key, + trigger->write.context, + trigger->write.flags, + trigger->write.data); + break; + + case PSMX3_TRIGGERED_WRITEV: + psmx3_writev_generic(trigger->writev.ep, + trigger->writev.iov, + trigger->writev.desc, + trigger->writev.count, + trigger->writev.dest_addr, + trigger->writev.addr, + trigger->writev.key, + trigger->writev.context, + trigger->writev.flags, + trigger->writev.data); + break; + + case PSMX3_TRIGGERED_READ: + psmx3_read_generic(trigger->read.ep, + trigger->read.buf, + trigger->read.len, + trigger->read.desc, + trigger->read.src_addr, + trigger->read.addr, + trigger->read.key, + trigger->read.context, + trigger->read.flags); + break; + + case PSMX3_TRIGGERED_READV: + psmx3_readv_generic(trigger->readv.ep, + trigger->readv.iov, + trigger->readv.desc, + trigger->readv.count, + trigger->readv.src_addr, + trigger->readv.addr, + trigger->readv.key, + trigger->readv.context, + trigger->readv.flags); + break; + + case PSMX3_TRIGGERED_ATOMIC_WRITE: + psmx3_atomic_write_generic( + trigger->atomic_write.ep, + trigger->atomic_write.buf, + trigger->atomic_write.count, + trigger->atomic_write.desc, + trigger->atomic_write.dest_addr, + trigger->atomic_write.addr, + trigger->atomic_write.key, + trigger->atomic_write.datatype, + trigger->atomic_write.atomic_op, + trigger->atomic_write.context, + trigger->atomic_write.flags); + break; + + case PSMX3_TRIGGERED_ATOMIC_READWRITE: + psmx3_atomic_readwrite_generic( + trigger->atomic_readwrite.ep, + trigger->atomic_readwrite.buf, + trigger->atomic_readwrite.count, + 
trigger->atomic_readwrite.desc, + trigger->atomic_readwrite.result, + trigger->atomic_readwrite.result_desc, + trigger->atomic_readwrite.dest_addr, + trigger->atomic_readwrite.addr, + trigger->atomic_readwrite.key, + trigger->atomic_readwrite.datatype, + trigger->atomic_readwrite.atomic_op, + trigger->atomic_readwrite.context, + trigger->atomic_readwrite.flags); + break; + + case PSMX3_TRIGGERED_ATOMIC_COMPWRITE: + psmx3_atomic_compwrite_generic( + trigger->atomic_compwrite.ep, + trigger->atomic_compwrite.buf, + trigger->atomic_compwrite.count, + trigger->atomic_compwrite.desc, + trigger->atomic_compwrite.compare, + trigger->atomic_compwrite.compare_desc, + trigger->atomic_compwrite.result, + trigger->atomic_compwrite.result_desc, + trigger->atomic_compwrite.dest_addr, + trigger->atomic_compwrite.addr, + trigger->atomic_compwrite.key, + trigger->atomic_compwrite.datatype, + trigger->atomic_compwrite.atomic_op, + trigger->atomic_compwrite.context, + trigger->atomic_compwrite.flags); + break; + default: + FI_INFO(&psmx3_prov, FI_LOG_CQ, + "%d unsupported op\n", trigger->op); + break; + } + + free(trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_trecv(struct fid_ep *ep, void *buf, + size_t len, void *desc, + fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, + void *context, uint64_t flags) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_TRECV; + trigger->cntr = container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->trecv.ep = ep; + trigger->trecv.buf = buf; + trigger->trecv.len = len; + trigger->trecv.desc = desc; + trigger->trecv.src_addr = src_addr; + trigger->trecv.tag = tag; + trigger->trecv.ignore = ignore; + trigger->trecv.context = context; + trigger->trecv.flags = flags & ~FI_TRIGGER; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_tsend(struct fid_ep *ep, + const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, + uint64_t tag, void *context, + uint64_t flags, uint64_t data) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_TSEND; + trigger->cntr = container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->tsend.ep = ep; + trigger->tsend.buf = buf; + trigger->tsend.len = len; + trigger->tsend.desc = desc; + trigger->tsend.dest_addr = dest_addr; + trigger->tsend.tag = tag; + trigger->tsend.context = context; + trigger->tsend.flags = flags & ~FI_TRIGGER; + trigger->tsend.data = data; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_tsendv(struct fid_ep *ep, + const struct iovec *iov, void *desc, + size_t count, fi_addr_t dest_addr, + uint64_t tag, void *context, + uint64_t flags, uint64_t data) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_TSENDV; + trigger->cntr = container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->tsendv.ep = ep; + 
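	/*
	 * Illustrative note, not from this patch: the fi_triggered_context
	 * consumed by these queue functions is supplied by the application
	 * when it arms a deferred operation with the FI_TRIGGER flag, e.g.
	 * for a tagged send (placeholder values, field names per fi_trigger(3)):
	 *
	 *     struct fi_triggered_context trig = {
	 *             .event_type = FI_TRIGGER_THRESHOLD,
	 *             .trigger.threshold = { .cntr = cntr, .threshold = 16 },
	 *     };
	 *     struct iovec iov = { .iov_base = buf, .iov_len = len };
	 *     struct fi_msg_tagged msg = { .msg_iov = &iov, .iov_count = 1,
	 *                                  .addr = dest, .tag = tag,
	 *                                  .context = &trig };
	 *     fi_tsendmsg(ep, &msg, FI_TRIGGER);
	 *
	 * The operation stays queued on the counter and is replayed through
	 * psmx3_process_trigger() once the counter value reaches the threshold.
	 */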
trigger->tsendv.iov = iov; + trigger->tsendv.desc = desc; + trigger->tsendv.count = count; + trigger->tsendv.dest_addr = dest_addr; + trigger->tsendv.tag = tag; + trigger->tsendv.context = context; + trigger->tsendv.flags = flags & ~FI_TRIGGER; + trigger->tsendv.data = data; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_recv(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, void *context, + uint64_t flags) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_RECV; + trigger->cntr = container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->recv.ep = ep; + trigger->recv.buf = buf; + trigger->recv.len = len; + trigger->recv.desc = desc; + trigger->recv.src_addr = src_addr; + trigger->recv.context = context; + trigger->recv.flags = flags & ~FI_TRIGGER; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_send(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, void *context, + uint64_t flags, uint64_t data) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_SEND; + trigger->cntr = container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->send.ep = ep; + trigger->send.buf = buf; + trigger->send.len = len; + trigger->send.desc = desc; + trigger->send.dest_addr = dest_addr; + trigger->send.context = context; + trigger->send.flags = flags & ~FI_TRIGGER; + trigger->send.data = data; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_sendv(struct fid_ep *ep, const struct iovec *iov, + void *desc, size_t count, fi_addr_t dest_addr, + void *context, uint64_t flags, uint64_t data) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_SENDV; + trigger->cntr = container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->sendv.ep = ep; + trigger->sendv.iov = iov; + trigger->sendv.desc = desc; + trigger->sendv.count = count; + trigger->sendv.dest_addr = dest_addr; + trigger->sendv.context = context; + trigger->sendv.flags = flags & ~FI_TRIGGER; + trigger->sendv.data = data; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_read(struct fid_ep *ep, void *buf, size_t len, + void *desc, fi_addr_t src_addr, + uint64_t addr, uint64_t key, void *context, + uint64_t flags) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_READ; + trigger->cntr = container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->read.ep = ep; + trigger->read.buf = buf; + trigger->read.len = len; + trigger->read.desc = desc; + trigger->read.src_addr = 
src_addr; + trigger->read.addr = addr; + trigger->read.key = key; + trigger->read.context = context; + trigger->read.flags = flags & ~FI_TRIGGER; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_readv(struct fid_ep *ep, const struct iovec *iov, + void *desc, size_t count, fi_addr_t src_addr, + uint64_t addr, uint64_t key, void *context, + uint64_t flags) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_READV; + trigger->cntr = container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->readv.ep = ep; + trigger->readv.iov = iov; + trigger->readv.count = count; + trigger->readv.desc = desc; + trigger->readv.src_addr = src_addr; + trigger->readv.addr = addr; + trigger->readv.key = key; + trigger->readv.context = context; + trigger->readv.flags = flags & ~FI_TRIGGER; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_write(struct fid_ep *ep, const void *buf, size_t len, + void *desc, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context, + uint64_t flags, uint64_t data) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_WRITE; + trigger->cntr = container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->write.ep = ep; + trigger->write.buf = buf; + trigger->write.len = len; + trigger->write.desc = desc; + trigger->write.dest_addr = dest_addr; + trigger->write.addr = addr; + trigger->write.key = key; + trigger->write.context = context; + trigger->write.flags = flags & ~FI_TRIGGER; + trigger->write.data = data; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_writev(struct fid_ep *ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t dest_addr, + uint64_t addr, uint64_t key, void *context, + uint64_t flags, uint64_t data) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_WRITEV; + trigger->cntr = container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->writev.ep = ep; + trigger->writev.iov = iov; + trigger->writev.count = count; + trigger->writev.desc = desc; + trigger->writev.dest_addr = dest_addr; + trigger->writev.addr = addr; + trigger->writev.key = key; + trigger->writev.context = context; + trigger->writev.flags = flags & ~FI_TRIGGER; + trigger->writev.data = data; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_atomic_write(struct fid_ep *ep, + const void *buf, + size_t count, void *desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_ATOMIC_WRITE; + trigger->cntr = 
container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->atomic_write.ep = ep; + trigger->atomic_write.buf = buf; + trigger->atomic_write.count = count; + trigger->atomic_write.desc = desc; + trigger->atomic_write.dest_addr = dest_addr; + trigger->atomic_write.addr = addr; + trigger->atomic_write.key = key; + trigger->atomic_write.datatype = datatype; + trigger->atomic_write.atomic_op = op; + trigger->atomic_write.context = context; + trigger->atomic_write.flags = flags & ~FI_TRIGGER; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_atomic_writev(struct fid_ep *ep, + const struct fi_ioc *iov, + void **desc, size_t count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_ATOMIC_WRITEV; + trigger->cntr = container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->atomic_writev.ep = ep; + trigger->atomic_writev.iov = iov; + trigger->atomic_writev.count = count; + trigger->atomic_writev.desc = desc; + trigger->atomic_writev.dest_addr = dest_addr; + trigger->atomic_writev.addr = addr; + trigger->atomic_writev.key = key; + trigger->atomic_writev.datatype = datatype; + trigger->atomic_writev.atomic_op = op; + trigger->atomic_writev.context = context; + trigger->atomic_writev.flags = flags & ~FI_TRIGGER; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_atomic_readwrite(struct fid_ep *ep, + const void *buf, + size_t count, void *desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_ATOMIC_READWRITE; + trigger->cntr = container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->atomic_readwrite.ep = ep; + trigger->atomic_readwrite.buf = buf; + trigger->atomic_readwrite.count = count; + trigger->atomic_readwrite.desc = desc; + trigger->atomic_readwrite.result = result; + trigger->atomic_readwrite.result_desc = result_desc; + trigger->atomic_readwrite.dest_addr = dest_addr; + trigger->atomic_readwrite.addr = addr; + trigger->atomic_readwrite.key = key; + trigger->atomic_readwrite.datatype = datatype; + trigger->atomic_readwrite.atomic_op = op; + trigger->atomic_readwrite.context = context; + trigger->atomic_readwrite.flags = flags & ~FI_TRIGGER; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_atomic_readwritev(struct fid_ep *ep, + const struct fi_ioc *iov, + void **desc, size_t count, + struct fi_ioc *resultv, + void **result_desc, + size_t result_count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, 
sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_ATOMIC_READWRITEV; + trigger->cntr = container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->atomic_readwritev.ep = ep; + trigger->atomic_readwritev.iov = iov; + trigger->atomic_readwritev.count = count; + trigger->atomic_readwritev.desc = desc; + trigger->atomic_readwritev.resultv = resultv; + trigger->atomic_readwritev.result_desc = result_desc; + trigger->atomic_readwritev.result_count = result_count; + trigger->atomic_readwritev.dest_addr = dest_addr; + trigger->atomic_readwritev.addr = addr; + trigger->atomic_readwritev.key = key; + trigger->atomic_readwritev.datatype = datatype; + trigger->atomic_readwritev.atomic_op = op; + trigger->atomic_readwritev.context = context; + trigger->atomic_readwritev.flags = flags & ~FI_TRIGGER; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_atomic_compwrite(struct fid_ep *ep, + const void *buf, + size_t count, void *desc, + const void *compare, void *compare_desc, + void *result, void *result_desc, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_ATOMIC_COMPWRITE; + trigger->cntr = container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->atomic_compwrite.ep = ep; + trigger->atomic_compwrite.buf = buf; + trigger->atomic_compwrite.count = count; + trigger->atomic_compwrite.desc = desc; + trigger->atomic_compwrite.compare = compare; + trigger->atomic_compwrite.compare_desc = compare_desc; + trigger->atomic_compwrite.result = result; + trigger->atomic_compwrite.result_desc = result_desc; + trigger->atomic_compwrite.dest_addr = dest_addr; + trigger->atomic_compwrite.addr = addr; + trigger->atomic_compwrite.key = key; + trigger->atomic_compwrite.datatype = datatype; + trigger->atomic_compwrite.atomic_op = op; + trigger->atomic_compwrite.context = context; + trigger->atomic_compwrite.flags = flags & ~FI_TRIGGER; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +static inline +int psmx3_trigger_queue_atomic_compwritev(struct fid_ep *ep, + const struct fi_ioc *iov, + void **desc, size_t count, + const struct fi_ioc *comparev, + void **compare_desc, + size_t compare_count, + struct fi_ioc *resultv, + void **result_desc, + size_t result_count, + fi_addr_t dest_addr, + uint64_t addr, uint64_t key, + enum fi_datatype datatype, + enum fi_op op, void *context, + uint64_t flags) +{ + struct psmx3_trigger *trigger; + struct fi_triggered_context *ctxt = context; + + trigger = calloc(1, sizeof(*trigger)); + if (!trigger) + return -FI_ENOMEM; + + trigger->op = PSMX3_TRIGGERED_ATOMIC_COMPWRITEV; + trigger->cntr = container_of(ctxt->trigger.threshold.cntr, + struct psmx3_fid_cntr, cntr); + trigger->threshold = ctxt->trigger.threshold.threshold; + trigger->atomic_compwritev.ep = ep; + trigger->atomic_compwritev.iov = iov; + trigger->atomic_compwritev.desc = desc; + trigger->atomic_compwritev.count = count; + trigger->atomic_compwritev.comparev = comparev; + trigger->atomic_compwritev.compare_desc = compare_desc; + trigger->atomic_compwritev.compare_count = 
compare_count; + trigger->atomic_compwritev.resultv = resultv; + trigger->atomic_compwritev.result_desc = result_desc; + trigger->atomic_compwritev.result_count = result_count; + trigger->atomic_compwritev.dest_addr = dest_addr; + trigger->atomic_compwritev.addr = addr; + trigger->atomic_compwritev.key = key; + trigger->atomic_compwritev.datatype = datatype; + trigger->atomic_compwritev.atomic_op = op; + trigger->atomic_compwritev.context = context; + trigger->atomic_compwritev.flags = flags & ~FI_TRIGGER; + + psmx3_cntr_add_trigger(trigger->cntr, trigger); + return 0; +} + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/prov/psm3/src/psmx3_trx_ctxt.c b/prov/psm3/src/psmx3_trx_ctxt.c new file mode 100644 index 00000000000..971e0dd69d7 --- /dev/null +++ b/prov/psm3/src/psmx3_trx_ctxt.c @@ -0,0 +1,373 @@ +/* + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx3.h" + +int psmx3_trx_ctxt_cnt = 0; + +/* + * Tx/Rx context disconnect protocol: + * + * TRX_CTXT disconnect REQ: + * args[0].u32w0 cmd + * + * Before a PSM2 endpoint is closed, a TRX_CTXT disconnect REQ is sent to + * all connected peers. Each peer then calls psm2_ep_disconnet() to clean + * up the local connection state. This allows a future endpoint with the + * same epid to connect to the same peers. 
+ */ + +struct disconnect_args { + struct psmx3_trx_ctxt *trx_ctxt; + psm2_epaddr_t epaddr; +}; + +static void *disconnect_func(void *args) +{ + struct disconnect_args *disconn = args; + struct psmx3_trx_ctxt *trx_ctxt = disconn->trx_ctxt; + struct psmx3_epaddr_context *epaddr_context; + psm2_error_t errors; + + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "psm2_ep: %p, epaddr: %p\n", trx_ctxt->psm2_ep, disconn->epaddr); + + trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2); + dlist_remove_first_match(&trx_ctxt->peer_list, + psmx3_peer_match, disconn->epaddr); + trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2); + if (trx_ctxt->ep && trx_ctxt->ep->av) + psmx3_av_remove_conn(trx_ctxt->ep->av, trx_ctxt, disconn->epaddr); + + epaddr_context = psm2_epaddr_getctxt(disconn->epaddr); + psm2_epaddr_setctxt(disconn->epaddr, NULL); + free(epaddr_context); + + psm2_ep_disconnect2(trx_ctxt->psm2_ep, 1, &disconn->epaddr, NULL, + &errors, PSM2_EP_DISCONNECT_FORCE, 0); + + free(args); + return NULL; +} + +int psmx3_am_trx_ctxt_handler(psm2_am_token_t token, psm2_amarg_t *args, + int nargs, void *src, uint32_t len, void *hctx) +{ + psm2_epaddr_t epaddr; + int err = 0; + int cmd; + struct disconnect_args *disconn; + pthread_t disconnect_thread; + struct psmx3_trx_ctxt *trx_ctxt; + trx_ctxt = (struct psmx3_trx_ctxt *)hctx; + + psm2_am_get_source(token, &epaddr); + cmd = PSMX3_AM_GET_OP(args[0].u32w0); + + switch(cmd) { + case PSMX3_AM_REQ_TRX_CTXT_DISCONNECT: + /* + * we can't call psm2_ep_disconnect from the AM + * handler. instead, create a thread to do the work. + * the performance of this operation is not important. + * + * also put the av cleanup operations into the thread + * to avoid deadlock because the AM handler may be + * called with the av lock held. 
+ */ + disconn = malloc(sizeof(*disconn)); + if (disconn) { + disconn->trx_ctxt = trx_ctxt; + disconn->epaddr = epaddr; + pthread_create(&disconnect_thread, NULL, + disconnect_func, disconn); + pthread_detach(disconnect_thread); + } + break; + + default: + err = -FI_EINVAL; + break; + } + + return err; +} + +void psmx3_trx_ctxt_disconnect_peers(struct psmx3_trx_ctxt *trx_ctxt) +{ + struct dlist_entry *item, *tmp; + struct psmx3_epaddr_context *peer; + struct dlist_entry peer_list; + psm2_amarg_t arg; + int err; + + arg.u32w0 = PSMX3_AM_REQ_TRX_CTXT_DISCONNECT; + + /* use local peer_list to avoid entering AM handler while holding the lock */ + dlist_init(&peer_list); + trx_ctxt->domain->peer_lock_fn(&trx_ctxt->peer_lock, 2); + dlist_foreach_safe(&trx_ctxt->peer_list, item, tmp) { + dlist_remove(item); + dlist_insert_before(item, &peer_list); + } + trx_ctxt->domain->peer_unlock_fn(&trx_ctxt->peer_lock, 2); + + dlist_foreach_safe(&peer_list, item, tmp) { + peer = container_of(item, struct psmx3_epaddr_context, entry); + if (psmx3_env.disconnect) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, "epaddr: %p\n", peer->epaddr); + err = psm2_am_request_short(peer->epaddr, + PSMX3_AM_TRX_CTXT_HANDLER, + &arg, 1, NULL, 0, 0, NULL, + NULL); + if (err) + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "failed to send disconnect, err %d\n", + err); + } + psm2_epaddr_setctxt(peer->epaddr, NULL); + free(peer); + } +} + +static const char *psmx3_usage_flags_to_string(int usage_flags) +{ + switch (usage_flags & PSMX3_TX_RX) { + case PSMX3_TX: return "tx"; + case PSMX3_RX: return "rx"; + default: return "tx+rx"; + } +} + +void psmx3_trx_ctxt_free(struct psmx3_trx_ctxt *trx_ctxt, int usage_flags) +{ + int err; + int old_flags; + + if (!trx_ctxt) + return; + + old_flags = trx_ctxt->usage_flags; + trx_ctxt->usage_flags &= ~usage_flags; + if (trx_ctxt->usage_flags) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, "epid: %016lx (%s -> %s)\n", + trx_ctxt->psm2_epid, + psmx3_usage_flags_to_string(old_flags), + psmx3_usage_flags_to_string(trx_ctxt->usage_flags)); + return; + } + + FI_INFO(&psmx3_prov, FI_LOG_CORE, "epid: %016lx (%s)\n", + trx_ctxt->psm2_epid, psmx3_usage_flags_to_string(old_flags)); + + trx_ctxt->am_progress = 0; + trx_ctxt->poll_active = 0; + + trx_ctxt->domain->trx_ctxt_lock_fn(&trx_ctxt->domain->trx_ctxt_lock, 1); + dlist_remove(&trx_ctxt->entry); + trx_ctxt->domain->trx_ctxt_unlock_fn(&trx_ctxt->domain->trx_ctxt_lock, 1); + + psmx3_trx_ctxt_disconnect_peers(trx_ctxt); + + if (trx_ctxt->am_initialized) + psmx3_am_fini(trx_ctxt); + +#if 0 + /* AM messages could arrive after MQ is finalized, causing segfault + * when trying to dereference the MQ pointer. There is no mechanism + * to properly shutdown AM. The workaround is to keep MQ valid. 
+ */ + psm2_mq_finalize(trx_ctxt->psm2_mq); +#endif + + /* workaround for: + * Assertion failure at psm2_ep.c:1059: ep->mctxt_master == ep + */ + if (psmx3_env.delay) + sleep(psmx3_env.delay); + + if (psmx3_env.timeout) + err = psm2_ep_close(trx_ctxt->psm2_ep, PSM2_EP_CLOSE_GRACEFUL, + (int64_t) psmx3_env.timeout * 1000000000LL); + else + err = PSM2_EP_CLOSE_TIMEOUT; + + if (err != PSM2_OK) + psm2_ep_close(trx_ctxt->psm2_ep, PSM2_EP_CLOSE_FORCE, 0); + + ofi_bufpool_destroy(trx_ctxt->am_req_pool); + fastlock_destroy(&trx_ctxt->am_req_pool_lock); + fastlock_destroy(&trx_ctxt->poll_lock); + fastlock_destroy(&trx_ctxt->peer_lock); + + if (!ofi_atomic_dec32(&trx_ctxt->poll_refcnt)) + free(trx_ctxt); +} + +struct psmx3_trx_ctxt *psmx3_trx_ctxt_alloc(struct psmx3_fid_domain *domain, + struct psmx3_ep_name *src_addr, + int sep_ctxt_idx, + int usage_flags, + uint8_t *uuid) +{ + struct psmx3_trx_ctxt *trx_ctxt; + struct psm2_ep_open_opts opts; + int should_retry = 0; + int err; + struct dlist_entry *item; + int asked_flags = usage_flags & PSMX3_TX_RX; + int compatible_flags = ~asked_flags & PSMX3_TX_RX; + + if (!uuid) + uuid = domain->fabric->uuid; + + /* Check existing allocations first if only Tx or Rx is needed */ + if (compatible_flags) { + domain->trx_ctxt_lock_fn(&domain->trx_ctxt_lock, 1); + dlist_foreach(&domain->trx_ctxt_list, item) { + trx_ctxt = container_of(item, struct psmx3_trx_ctxt, entry); + if (compatible_flags == trx_ctxt->usage_flags && + !memcmp(uuid, trx_ctxt->uuid, sizeof(psm2_uuid_t))) { + trx_ctxt->usage_flags |= asked_flags; + domain->trx_ctxt_unlock_fn(&domain->trx_ctxt_lock, 1); + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "use existing context. epid: %016lx " + "(%s -> tx+rx).\n", trx_ctxt->psm2_epid, + psmx3_usage_flags_to_string(compatible_flags)); + return trx_ctxt; + } + } + domain->trx_ctxt_unlock_fn(&domain->trx_ctxt_lock, 1); + } + + if (psmx3_trx_ctxt_cnt >= psmx3_hfi_info.max_trx_ctxt) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "number of Tx/Rx contexts exceeds limit (%d).\n", + psmx3_hfi_info.max_trx_ctxt); + return NULL; + } + + trx_ctxt = calloc(1, sizeof(*trx_ctxt)); + if (!trx_ctxt) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "failed to allocate trx_ctxt.\n"); + return NULL; + } + + err = ofi_bufpool_create(&trx_ctxt->am_req_pool, + sizeof(struct psmx3_am_request), + sizeof(void *), 0, 64, 0); + if (err) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "failed to allocate am_req_pool.\n"); + goto err_out; + } + + psm2_ep_open_opts_get_defaults(&opts); + memcpy(trx_ctxt->uuid, uuid, sizeof(psm2_uuid_t)); + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "uuid: %s\n", psmx3_uuid_to_string(uuid)); + + opts.unit = src_addr ? src_addr->unit : PSMX3_DEFAULT_UNIT; + opts.port = src_addr ? 
src_addr->port : PSMX3_DEFAULT_PORT; + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "ep_open_opts: unit=%d port=%u\n", opts.unit, opts.port); + + if (opts.unit < 0 && sep_ctxt_idx >= 0) { + should_retry = 1; + opts.unit = psmx3_get_round_robin_unit(sep_ctxt_idx); + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "sep %d: ep_open_opts: unit=%d\n", sep_ctxt_idx, opts.unit); + } + + err = psm2_ep_open(uuid, &opts, + &trx_ctxt->psm2_ep, &trx_ctxt->psm2_epid); + if (err != PSM2_OK) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "psm2_ep_open returns %d, errno=%d\n", err, errno); + if (!should_retry) + goto err_out_destroy_pool; + + /* When round-robin fails, retry w/o explicit assignment */ + opts.unit = -1; + err = psm2_ep_open(uuid, &opts, + &trx_ctxt->psm2_ep, &trx_ctxt->psm2_epid); + if (err != PSM2_OK) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "psm2_ep_open retry returns %d, errno=%d\n", err, errno); + goto err_out_destroy_pool; + } + } + + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "epid: %016lx (%s)\n", trx_ctxt->psm2_epid, + psmx3_usage_flags_to_string(usage_flags)); + + err = psm2_mq_init(trx_ctxt->psm2_ep, PSM2_MQ_ORDERMASK_ALL, + NULL, 0, &trx_ctxt->psm2_mq); + if (err != PSM2_OK) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "psm2_mq_init returns %d, errno=%d\n", err, errno); + goto err_out_close_ep; + } + + fastlock_init(&trx_ctxt->peer_lock); + fastlock_init(&trx_ctxt->poll_lock); + fastlock_init(&trx_ctxt->am_req_pool_lock); + fastlock_init(&trx_ctxt->trigger_queue.lock); + dlist_init(&trx_ctxt->peer_list); + slist_init(&trx_ctxt->trigger_queue.list); + trx_ctxt->id = psmx3_trx_ctxt_cnt++; + trx_ctxt->domain = domain; + trx_ctxt->usage_flags = asked_flags; + trx_ctxt->poll_active = 1; + ofi_atomic_initialize32(&trx_ctxt->poll_refcnt, 1); /* take one ref for domain->trx_ctxt_list */ + + domain->trx_ctxt_lock_fn(&domain->trx_ctxt_lock, 1); + dlist_insert_before(&trx_ctxt->entry, &domain->trx_ctxt_list); + domain->trx_ctxt_unlock_fn(&domain->trx_ctxt_lock, 1); + + return trx_ctxt; + +err_out_close_ep: + if (psm2_ep_close(trx_ctxt->psm2_ep, PSM2_EP_CLOSE_GRACEFUL, + (int64_t) psmx3_env.timeout * 1000000000LL) != PSM2_OK) + psm2_ep_close(trx_ctxt->psm2_ep, PSM2_EP_CLOSE_FORCE, 0); + +err_out_destroy_pool: + ofi_bufpool_destroy(trx_ctxt->am_req_pool); + +err_out: + free(trx_ctxt); + return NULL; +} + diff --git a/prov/psm3/src/psmx3_util.c b/prov/psm3/src/psmx3_util.c new file mode 100644 index 00000000000..fb3865124cc --- /dev/null +++ b/prov/psm3/src/psmx3_util.c @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2013-2017 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx3.h" + +static void psmx3_string_to_uuid(const char *s, psm2_uuid_t uuid) +{ + int n; + + if (!s) { + memset(uuid, 0, sizeof(psm2_uuid_t)); + return; + } + + n = sscanf(s, + "%2hhx%2hhx%2hhx%2hhx-" + "%2hhx%2hhx-%2hhx%2hhx-%2hhx%2hhx-" + "%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx", + &uuid[0], &uuid[1], &uuid[2], &uuid[3], + &uuid[4], &uuid[5], &uuid[6], &uuid[7], &uuid[8], &uuid[9], + &uuid[10], &uuid[11], &uuid[12], &uuid[13], &uuid[14], &uuid[15]); + + if (n != 16) { + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "wrong uuid format: %s\n", s); + FI_WARN(&psmx3_prov, FI_LOG_CORE, + "correct uuid format is: " + "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx\n"); + } +} + +void psmx3_get_uuid(psm2_uuid_t uuid) +{ + psmx3_string_to_uuid(psmx3_env.uuid, uuid); +} +int psmx3_override_uuid(void) +{ + return psmx3_env.uuid_override; +} + +int psmx3_uuid_to_port(psm2_uuid_t uuid) +{ + uint16_t port; + uint16_t *u = (uint16_t *)uuid; + + port = u[0] + u[1] + u[2] + u[3] + u[4] + u[5] + u[6] + u[7]; + if (port < 4096) + port += 4096; + + return (int)port; +} + +char *psmx3_uuid_to_string(psm2_uuid_t uuid) +{ + static char s[40]; + + sprintf(s, + "%02hhX%02hhX%02hhX%02hhX-" + "%02hhX%02hhX-%02hhX%02hhX-%02hhX%02hhX-" + "%02hhX%02hhX%02hhX%02hhX%02hhX%02hhX", + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7], uuid[8], uuid[9], + uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]); + + return s; +} + +void *psmx3_ep_name_to_string(const struct psmx3_ep_name *name, size_t *len) +{ + char *s; + + if (!name) + return NULL; + + *len = PSMX3_MAX_STRING_NAME_LEN; + + s = calloc(*len, 1); + if (!s) + return NULL; + + if (!ofi_straddr((void *)s, len, FI_ADDR_PSMX3, name)) { + free(s); + return NULL; + } + + return s; +} + +struct psmx3_ep_name *psmx3_string_to_ep_name(const void *s) +{ + void *name; + size_t len; + uint32_t fmt; + + if (!s) + return NULL; + + if (ofi_str_toaddr(s, &fmt, &name, &len)) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "invalid string address: %s.\n", + (const char *)s); + return NULL; + } + + if (fmt != FI_ADDR_PSMX3) { + FI_INFO(&psmx3_prov, FI_LOG_CORE, + "invalid string address format: %s.\n", + (const char *)s); + free(name); + return NULL; + } + + return name; +} + +static int psmx3_errno_table[PSM2_ERROR_LAST] = { + 0, /* PSM2_OK = 0 */ + 0, /* PSM2_OK_NO_PROGRESS = 1 */ + -FI_EOTHER, + -FI_EINVAL, /* PSM2_PARAM_ERR = 3 */ + -FI_ENOMEM, /* PSM2_NO_MEMORY = 4 */ + -FI_EBADF, /* PSM2_INIT_NOT_INIT = 5 */ + -FI_EINVAL, /* PSM2_INIT_BAD_API_VERSION = 6 */ + -FI_ENOSYS, /* PSM2_NO_AFFINITY = 7 */ + -FI_EIO, /* PSM2_INTERNAL_ERR = 8 */ + -FI_EINVAL, /* PSM2_SHMEM_SEGMENT_ERR = 9 */ + -FI_EACCES, /* PSM2_OPT_READONLY = 10 */ + -FI_ETIMEDOUT, /* PSM2_TIMEOUT = 11 */ + -FI_EMFILE, /* PSM2_TOO_MANY_ENDPOINTS = 12 */ + -FI_ESHUTDOWN, /* PSM2_IS_FINALIZED = 13 */ + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_ESHUTDOWN, /* PSM2_EP_WAS_CLOSED = 20 */ + -FI_ENODEV, /* PSM2_EP_NO_DEVICE = 21 */ + -FI_ENOENT, /* PSM2_EP_UNIT_NOT_FOUND = 22 */ + 
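	/* Illustrative note, not from this patch: the table is indexed by the raw
	 * PSM2 return code, and psmx3_errno() below performs the lookup, e.g.
	 *     err = psm2_mq_send2(mq, epaddr, flags, &tag, buf, len);
	 *     if (err != PSM2_OK)
	 *             return psmx3_errno(err);   // e.g. PSM2_NO_MEMORY -> -FI_ENOMEM
	 * Codes outside the table range map to -FI_EOTHER.
	 */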
-FI_EIO, /* PSM2_EP_DEVICE_FAILURE = 23 */ + -FI_ETIMEDOUT, /* PSM2_EP_CLOSE_TIMEOUT = 24 */ + -FI_ENOENT, /* PSM2_EP_NO_PORTS_AVAIL = 25 */ + -FI_ENETDOWN, /* PSM2_EP_NO_NETWORK = 26 */ + -FI_EINVAL, /* PSM2_EP_INVALID_UUID_KEY = 27 */ + -FI_ENOSPC, /* PSM2_EP_NO_RESOURCES = 28 */ + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_EBADF, /* PSM2_EPID_UNKNOWN = 40 */ + -FI_ENETUNREACH,/* PSM2_EPID_UNREACHABLE = 41 */ + -FI_EOTHER, + -FI_EINVAL, /* PSM2_EPID_INVALID_NODE = 43 */ + -FI_EINVAL, /* PSM2_EPID_INVALID_MTU = 44 */ + -FI_EINVAL, /* PSM2_EPID_INVALID_UUID_KEY = 45 */ + -FI_EINVAL, /* PSM2_EPID_INVALID_VERSION = 46 */ + -FI_EINVAL, /* PSM2_EPID_INVALID_CONNECT = 47 */ + -FI_EISCONN, /* PSM2_EPID_ALREADY_CONNECTED = 48 */ + -FI_EIO, /* PSM2_EPID_NETWORK_ERROR = 49 */ + -FI_EINVAL, /* PSM2_EPID_INVALID_PKEY = 50 */ + -FI_ENETUNREACH,/* PSM2_EPID_PATH_RESOLUTION = 51 */ + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_EOTHER, -FI_EOTHER, + -FI_EAGAIN, /* PSM2_MQ_NO_COMPLETIONS = 60 */ + -FI_ETRUNC, /* PSM2_MQ_TRUNCATION = 61 */ + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_EOTHER, -FI_EOTHER, + -FI_EINVAL, /* PSM2_AM_INVALID_REPLY = 70 */ + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, -FI_EOTHER, + -FI_EOTHER, -FI_EOTHER, -FI_EOTHER + /* PSM2_ERROR_LAST = 80 */ +}; + +int psmx3_errno(int err) +{ + if (err >= 0 && err < PSM2_ERROR_LAST) + return psmx3_errno_table[err]; + else + return -FI_EOTHER; +} + +/* + * PSM context sharing requires some information from the MPI process manager. + * Try to get the needed information from the environment. + */ +void psmx3_query_mpi(void) +{ + char *s; + char env[32]; + int local_size = -1; + int local_rank = -1; + + /* Check Open MPI */ + if ((s = getenv("OMPI_COMM_WORLD_LOCAL_SIZE"))) { + local_size = atoi(s); + if ((s = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) + local_rank = atoi(s); + snprintf(env, sizeof(env), "%d", local_size); + setenv("MPI_LOCALNRANKS", env, 0); + snprintf(env, sizeof(env), "%d", local_rank); + setenv("MPI_LOCALRANKID", env, 0); + return; + } + + /* TODO: check other MPI */ +} + diff --git a/prov/psm3/src/psmx3_wait.c b/prov/psm3/src/psmx3_wait.c new file mode 100644 index 00000000000..f57ac046fce --- /dev/null +++ b/prov/psm3/src/psmx3_wait.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2013-2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psmx3.h" + +/* It is necessary to have a separate thread making progress in order + * for the wait functions to succeed. This thread is only created when + * wait functions are called and. In order to minimize performance + * impact, it only goes active during te time when wait calls are + * blocked. + */ +static pthread_t psmx3_wait_thread; +static pthread_mutex_t psmx3_wait_mutex; +static pthread_cond_t psmx3_wait_cond; +static volatile int psmx3_wait_thread_ready = 0; +static volatile int psmx3_wait_thread_enabled = 0; +static volatile int psmx3_wait_thread_busy = 0; + +static void *psmx3_wait_progress(void *args) +{ + struct psmx3_fid_fabric *fabric = args; + struct psmx3_fid_domain *domain; + struct dlist_entry *item; + + psmx3_wait_thread_ready = 1; + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); + + while (1) { + pthread_mutex_lock(&psmx3_wait_mutex); + if (!psmx3_wait_thread_enabled) + pthread_cond_wait(&psmx3_wait_cond, &psmx3_wait_mutex); + pthread_mutex_unlock(&psmx3_wait_mutex); + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + + psmx3_wait_thread_busy = 1; + while (psmx3_wait_thread_enabled) { + psmx3_lock(&fabric->domain_lock, 1); + dlist_foreach(&fabric->domain_list, item) { + domain = container_of(item, struct psmx3_fid_domain, entry); + if (domain->progress_thread_enabled && + domain->progress_thread != pthread_self()) + continue; + + psmx3_progress_all(domain); + + if (!psmx3_wait_thread_enabled) + break; + } + psmx3_unlock(&fabric->domain_lock, 1); + } + + psmx3_wait_thread_busy = 0; + + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + } + + return NULL; +} + +static void psmx3_wait_start_progress(struct psmx3_fid_fabric *fabric) +{ + struct dlist_entry *item; + struct psmx3_fid_domain *domain; + int run_wait_thread = 0; + pthread_attr_t attr; + int err; + + if (!fabric) + return; + + psmx3_lock(&fabric->domain_lock, 1); + dlist_foreach(&fabric->domain_list, item) { + domain = container_of(item, struct psmx3_fid_domain, entry); + if (!domain->progress_thread_enabled || + domain->progress_thread == pthread_self()) + run_wait_thread = 1; + } + psmx3_unlock(&fabric->domain_lock, 1); + + if (!run_wait_thread) + return; + + if (!psmx3_wait_thread) { + pthread_mutex_init(&psmx3_wait_mutex, NULL); + pthread_cond_init(&psmx3_wait_cond, NULL); + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_DETACHED); + err = pthread_create(&psmx3_wait_thread, &attr, + psmx3_wait_progress, (void *)fabric); + if (err) + FI_WARN(&psmx3_prov, FI_LOG_EQ, + "cannot create wait progress thread\n"); + pthread_attr_destroy(&attr); + while (!psmx3_wait_thread_ready) + ; + } + + psmx3_wait_thread_enabled = 1; + pthread_cond_signal(&psmx3_wait_cond); +} + +static void psmx3_wait_stop_progress(void) +{ + psmx3_wait_thread_enabled = 0; + + while (psmx3_wait_thread_busy) + ; +} + +static struct fi_ops_wait *psmx3_wait_ops_save; +static struct fi_ops_wait psmx3_wait_ops; + +DIRECT_FN +STATIC int psmx3_wait_wait(struct fid_wait *wait, int timeout) +{ + struct util_wait *wait_priv; + struct psmx3_fid_fabric *fabric; + int err; + + wait_priv = container_of(wait, struct util_wait, wait_fid); + fabric = 
container_of(wait_priv->fabric, struct psmx3_fid_fabric, util_fabric); + + psmx3_wait_start_progress(fabric); + + err = psmx3_wait_ops_save->wait(wait, timeout); + + psmx3_wait_stop_progress(); + + return err; +} + +DIRECT_FN +int psmx3_wait_open(struct fid_fabric *fabric, struct fi_wait_attr *attr, + struct fid_wait **waitset) +{ + struct fid_wait *wait; + int err; + + err = ofi_wait_fd_open(fabric, attr, &wait); + if (err) + return err; + + psmx3_wait_ops_save = wait->ops; + psmx3_wait_ops = *psmx3_wait_ops_save; + psmx3_wait_ops.wait = psmx3_wait_wait; + wait->ops = &psmx3_wait_ops; + + *waitset = wait; + return 0; +} + +DIRECT_FN +int psmx3_wait_trywait(struct fid_fabric *fabric, struct fid **fids, int count) +{ + struct psmx3_fid_cq *cq_priv; + struct util_eq *eq; + struct util_wait *wait; + int i, ret; + + for (i = 0; i < count; i++) { + switch (fids[i]->fclass) { + case FI_CLASS_CQ: + cq_priv = container_of(fids[i], struct psmx3_fid_cq, cq); + wait = cq_priv->wait; + break; + case FI_CLASS_EQ: + eq = container_of(fids[i], struct util_eq, eq_fid.fid); + wait = eq->wait; + break; + case FI_CLASS_CNTR: + return -FI_ENOSYS; + case FI_CLASS_WAIT: + wait = container_of(fids[i], struct util_wait, wait_fid.fid); + break; + default: + return -FI_EINVAL; + } + + ret = wait->wait_try(wait); + if (ret) + return ret; + } + return 0; +} + diff --git a/prov/psm3/src/version.h b/prov/psm3/src/version.h new file mode 100644 index 00000000000..82f5313643b --- /dev/null +++ b/prov/psm3/src/version.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2013-2020 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _FI_PSM_VERSION_H_ +#define _FI_PSM_VERSION_H_ + +#if HAVE_PSM3_SRC +#include "psm3/psm2.h" +#include "psm3/psm2_mq.h" +#include "psm3/psm2_am.h" +#ifdef VALGRIND_MAKE_MEM_DEFINED +#undef VALGRIND_MAKE_MEM_DEFINED +#endif +#else +#include +#include +#include +#endif + +#define PSMX3_PROV_NAME "psm3" +#define PSMX3_DOMAIN_NAME "any_verbs_ud_device" +#define PSMX3_FABRIC_NAME "psm3" + +#define PSMX3_DEFAULT_UUID "00FF00FF-0000-0000-0000-00FF00FF00FF" +#define PROVIDER_INI PSM3_INI + +#ifndef PSMX3_USE_REQ_CONTEXT +#define PSMX3_USE_REQ_CONTEXT 1 +#endif + +#define PSMX3_STATUS_TYPE struct psm2_mq_req_user +#define PSMX3_STATUS_ERROR(s) ((s)->error_code) +#define PSMX3_STATUS_TAG(s) ((s)->tag) +#define PSMX3_STATUS_RCVLEN(s) ((s)->recv_msglen) +#define PSMX3_STATUS_SNDLEN(s) ((s)->send_msglen) +#define PSMX3_STATUS_PEER(s) ((s)->peer) +#define PSMX3_STATUS_CONTEXT(s) ((s)->context) + +/* + * Use reserved space within psm2_mq_req_user for fi_context instead of + * allocating from a internal queue. + * + * Only work with PSM2 that has psm2_mq_req_user defined. Can be turned off by + * passing "-DPSMX3_USE_REQ_CONTEXT=0" to the compiler. + */ + +#if PSMX3_USE_REQ_CONTEXT + +#define PSMX3_EP_DECL_OP_CONTEXT + +#define PSMX3_EP_INIT_OP_CONTEXT(ep) \ + do { \ + FI_INFO(&psmx3_prov, FI_LOG_EP_CTRL, \ + "skip initialization of op context list.\n"); \ + } while (0) + +#define PSMX3_EP_FINI_OP_CONTEXT(ep) + +#define PSMX3_EP_GET_OP_CONTEXT(ep, ctx) \ + do { \ + (ctx) = NULL; \ + } while (0) + +#define PSMX3_EP_PUT_OP_CONTEXT(ep, ctx) + +#define PSMX3_REQ_GET_OP_CONTEXT(req, ctx) \ + do { \ + struct psm2_mq_req_user *req_user = (void *)(req); \ + (ctx) = req_user->context = req_user->user_reserved; \ + } while (0) + +#else /* !PSMX3_USE_REQ_CONTEXT */ + +struct psmx3_context { + struct fi_context fi_context; + struct slist_entry list_entry; +}; + +#define PSMX3_EP_DECL_OP_CONTEXT \ + struct slist free_context_list; \ + fastlock_t context_lock; + +#define PSMX3_EP_INIT_OP_CONTEXT(ep) \ + do { \ + struct psmx3_context *item; \ + int i; \ + slist_init(&(ep)->free_context_list); \ + fastlock_init(&(ep)->context_lock); \ + for (i = 0; i < 64; i++) { \ + item = calloc(1, sizeof(*item)); \ + if (!item) { \ + FI_WARN(&psmx3_prov, FI_LOG_EP_CTRL, "out of memory.\n"); \ + break; \ + } \ + slist_insert_tail(&item->list_entry, &(ep)->free_context_list); \ + } \ + } while (0) + +#define PSMX3_EP_FINI_OP_CONTEXT(ep) \ + do { \ + struct slist_entry *entry; \ + struct psmx3_context *item; \ + while (!slist_empty(&(ep)->free_context_list)) { \ + entry = slist_remove_head(&(ep)->free_context_list); \ + item = container_of(entry, struct psmx3_context, list_entry); \ + free(item); \ + } \ + fastlock_destroy(&(ep)->context_lock); \ + } while (0) + +#define PSMX3_EP_GET_OP_CONTEXT(ep, ctx) \ + do { \ + struct psmx3_context *context; \ + ep->domain->context_lock_fn(&(ep)->context_lock, 2); \ + if (!slist_empty(&(ep)->free_context_list)) { \ + context = container_of(slist_remove_head(&(ep)->free_context_list), \ + struct psmx3_context, list_entry); \ + ep->domain->context_unlock_fn(&(ep)->context_lock, 2); \ + (ctx) = &context->fi_context; \ + break; \ + } \ + ep->domain->context_unlock_fn(&(ep)->context_lock, 2); \ + context = malloc(sizeof(*context)); \ + if (!context) { \ + FI_WARN(&psmx3_prov, FI_LOG_EP_DATA, "out of memory.\n"); \ + return -FI_ENOMEM; \ + } \ + (ctx) = &context->fi_context; \ + } while (0) + +#define PSMX3_EP_PUT_OP_CONTEXT(ep, ctx) \ + do { \ + struct psmx3_context *context; \ + context = 
container_of((ctx), struct psmx3_context, fi_context); \ + context->list_entry.next = NULL; \ + ep->domain->context_lock_fn(&(ep)->context_lock, 2); \ + slist_insert_tail(&context->list_entry, &(ep)->free_context_list); \ + ep->domain->context_unlock_fn(&(ep)->context_lock, 2); \ + } while (0) + +#endif /* !PSMX3_USE_REQ_CONTEXT */ + +#endif + diff --git a/prov/psm3/util b/prov/psm3/util new file mode 120000 index 00000000000..40c3fc5bdf8 --- /dev/null +++ b/prov/psm3/util @@ -0,0 +1 @@ +../util \ No newline at end of file diff --git a/prov/rstream/src/rstream.h b/prov/rstream/src/rstream.h index ddbc1fd5b67..0b16c0e0752 100644 --- a/prov/rstream/src/rstream.h +++ b/prov/rstream/src/rstream.h @@ -142,7 +142,7 @@ struct rstream_ctx_data { size_t len; }; -DECLARE_FREESTACK(struct rstream_ctx_data, rstream_tx_ctx_fs); +OFI_DECLARE_FREESTACK(struct rstream_ctx_data, rstream_tx_ctx_fs); struct rstream_tx_ctx { struct rstream_ctx_data *tx_ctxs; @@ -207,9 +207,9 @@ extern ssize_t rstream_post_cq_data_recv(struct rstream_ep *ep, const struct fi_cq_data_entry *cq_entry); extern int rstream_info_to_rstream(uint32_t version, const struct fi_info *core_info, - struct fi_info *info); + const struct fi_info *base_info, struct fi_info *info); extern int rstream_info_to_core(uint32_t version, const struct fi_info *rstream_info, - struct fi_info *core_info); + const struct fi_info *base_info, struct fi_info *core_info); extern void rstream_set_info(struct fi_info *info); extern struct fi_ops_cm rstream_ops_cm; extern struct fi_ops_cm rstream_ops_pep_cm; @@ -227,6 +227,6 @@ int rstream_ep_open(struct fid_domain *domain, struct fi_info *info, int rstream_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, struct fid_eq **eq, void *context); int rstream_info_to_core(uint32_t version, const struct fi_info *rstream_info, - struct fi_info *core_info); + const struct fi_info *base_info, struct fi_info *core_info); #endif /* _RSTREAM_H_ */ diff --git a/prov/rstream/src/rstream_domain.c b/prov/rstream/src/rstream_domain.c index 90df5db9dce..3b59107d771 100644 --- a/prov/rstream/src/rstream_domain.c +++ b/prov/rstream/src/rstream_domain.c @@ -79,6 +79,7 @@ static struct fi_ops_domain rstream_domain_ops = { .stx_ctx = fi_no_stx_context, .srx_ctx = fi_no_srx_context, .query_atomic = fi_no_query_atomic, + .query_collective = fi_no_query_collective, }; int rstream_domain_open(struct fid_fabric *fabric, struct fi_info *info, @@ -97,7 +98,7 @@ int rstream_domain_open(struct fid_fabric *fabric, struct fi_info *info, util_fabric.fabric_fid); ret = ofi_get_core_info(FI_VERSION(1, 8), NULL, NULL, 0, - &rstream_util_prov, info, rstream_info_to_core, &cinfo); + &rstream_util_prov, info, NULL, rstream_info_to_core, &cinfo); if (ret) goto err1; diff --git a/prov/rstream/src/rstream_ep.c b/prov/rstream/src/rstream_ep.c index 361f2a86626..27f5e21ebf5 100644 --- a/prov/rstream/src/rstream_ep.c +++ b/prov/rstream/src/rstream_ep.c @@ -250,7 +250,7 @@ int rstream_ep_open(struct fid_domain *domain, struct fi_info *info, if (ret) goto err1; - rstream_info_to_core(FI_VERSION(1, 8), NULL, info); + rstream_info_to_core(FI_VERSION(1, 8), NULL, NULL, info); if (info->handle && info->handle->fclass == FI_CLASS_PEP) { rstream_pep = container_of(info->handle, @@ -378,7 +378,7 @@ int rstream_passive_ep(struct fid_fabric *fabric, struct fi_info *info, if (!rstream_pep) return -FI_ENOMEM; - rstream_info_to_core(FI_VERSION(1, 8), NULL, info); + rstream_info_to_core(FI_VERSION(1, 8), NULL, NULL, info); ret = 
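/*
 * The PSMX3_EP_*_OP_CONTEXT macros above keep a per-endpoint free list
 * of context wrappers: 64 entries are pre-allocated, "get" falls back
 * to the heap when the list runs dry, and "put" returns the entry to
 * the list.  Below is a standalone sketch of that pattern with
 * illustrative names; the locking that the real macros take from the
 * domain is omitted for brevity.
 */
#include <stdlib.h>

struct ctx_item {
	struct ctx_item *next;
	void *payload[4];	/* stands in for struct fi_context */
};

struct ctx_pool {
	struct ctx_item *free_list;
};

static int ctx_pool_init(struct ctx_pool *pool, int prealloc)
{
	struct ctx_item *item;
	int i;

	pool->free_list = NULL;
	for (i = 0; i < prealloc; i++) {
		item = calloc(1, sizeof(*item));
		if (!item)
			return -1;
		item->next = pool->free_list;
		pool->free_list = item;
	}
	return 0;
}

static struct ctx_item *ctx_pool_get(struct ctx_pool *pool)
{
	struct ctx_item *item = pool->free_list;

	if (item)
		pool->free_list = item->next;
	else
		item = calloc(1, sizeof(*item));	/* heap fallback */
	return item;
}

static void ctx_pool_put(struct ctx_pool *pool, struct ctx_item *item)
{
	item->next = pool->free_list;
	pool->free_list = item;
}

int main(void)
{
	struct ctx_pool pool;
	struct ctx_item *ctx;

	if (ctx_pool_init(&pool, 64))
		return 1;
	ctx = ctx_pool_get(&pool);	/* take a context for an operation */
	ctx_pool_put(&pool, ctx);	/* hand it back on completion */
	return 0;
}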
fi_passive_ep(rstream_fabric->msg_fabric, info, &rstream_pep->pep_fd, NULL); diff --git a/prov/rstream/src/rstream_init.c b/prov/rstream/src/rstream_init.c index 2d20bde57cd..15d09302c14 100644 --- a/prov/rstream/src/rstream_init.c +++ b/prov/rstream/src/rstream_init.c @@ -51,7 +51,7 @@ static void rstream_default_settings(struct fi_info *core_info) } int rstream_info_to_core(uint32_t version, const struct fi_info *irstream_info, - struct fi_info *core_info) + const struct fi_info *base_info, struct fi_info *core_info) { core_info->ep_attr->type = FI_EP_MSG; core_info->ep_attr->protocol = FI_PROTO_UNSPEC; @@ -90,7 +90,7 @@ static void update_rstream_info(const struct fi_info *core_info) } int rstream_info_to_rstream(uint32_t version, const struct fi_info *core_info, - struct fi_info *info) + const struct fi_info *base_info, struct fi_info *info) { info->caps = RSTREAM_CAPS; info->mode = 0; @@ -162,8 +162,8 @@ static void rstream_fini(void) struct fi_provider rstream_prov = { .name = OFI_UTIL_PREFIX "rstream", - .version = FI_VERSION(1 ,0), - .fi_version = FI_VERSION(1, 8), + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, .getinfo = rstream_getinfo, .fabric = rstream_fabric_open, .cleanup = rstream_fini diff --git a/prov/rstream/src/rstream_msg.c b/prov/rstream/src/rstream_msg.c index 9b5451c2ec7..ef33f1c1932 100644 --- a/prov/rstream/src/rstream_msg.c +++ b/prov/rstream/src/rstream_msg.c @@ -118,7 +118,7 @@ static struct fi_context *rstream_get_rx_ctx(struct rstream_ep *ep) static struct fi_context *rstream_get_tx_ctx(struct rstream_ep *ep, int len) { struct rstream_tx_ctx_fs *fs = ep->tx_ctxs; - struct rstream_ctx_data *rtn_ctx = freestack_pop(fs); + struct rstream_ctx_data *rtn_ctx = ofi_freestack_pop(fs); if (!rtn_ctx) return NULL; @@ -135,7 +135,7 @@ static int rstream_return_tx_ctx(struct fi_context *ctx_ptr, struct rstream_ctx_data *ctx_data = (struct rstream_ctx_data *)ctx_ptr; len = ctx_data->len; - freestack_push(fs, ctx_data); + ofi_freestack_push(fs, ctx_data); return len; } diff --git a/prov/rxd/src/rxd.h b/prov/rxd/src/rxd.h index ba321d58aa2..d7ad6674266 100644 --- a/prov/rxd/src/rxd.h +++ b/prov/rxd/src/rxd.h @@ -54,26 +54,25 @@ #include #include #include +#include #include "rxd_proto.h" #ifndef _RXD_H_ #define _RXD_H_ -#define RXD_MAJOR_VERSION (1) -#define RXD_MINOR_VERSION (0) #define RXD_PROTOCOL_VERSION (2) #define RXD_MAX_MTU_SIZE 4096 #define RXD_MAX_TX_BITS 10 #define RXD_MAX_RX_BITS 10 -#define RXD_DEFAULT_AV_SIZE 1024 #define RXD_BUF_POOL_ALIGNMENT 16 #define RXD_TX_POOL_CHUNK_CNT 1024 #define RXD_RX_POOL_CHUNK_CNT 1024 #define RXD_MAX_PENDING 128 #define RXD_MAX_PKT_RETRY 50 +#define RXD_ADDR_INVALID 0 #define RXD_PKT_IN_USE (1 << 0) #define RXD_PKT_ACKED (1 << 1) @@ -86,6 +85,8 @@ #define RXD_INLINE (1 << 5) #define RXD_MULTI_RECV (1 << 6) +#define RXD_IDX_OFFSET(x) (x + 1) + struct rxd_env { int spin_count; int retry; @@ -154,14 +155,12 @@ struct rxd_av { struct util_av util_av; struct fid_av *dg_av; struct ofi_rbmap rbmap; - int fi_addr_idx; - int rxd_addr_idx; int dg_av_used; size_t dg_addrlen; - - fi_addr_t *fi_addr_table; - struct rxd_addr *rxd_addr_table; + struct indexer fi_addr_idx; + struct indexer rxdaddr_dg_idx; + struct index_map rxdaddr_fi_idm; }; struct rxd_cq; @@ -219,9 +218,14 @@ struct rxd_ep { struct dlist_entry rts_sent_list; struct dlist_entry ctrl_pkts; - struct rxd_peer peers[]; + struct index_map peers_idm; }; +/* ensure ep lock is held before this function is called */ +static inline struct rxd_peer *rxd_peer(struct 
rxd_ep *ep, fi_addr_t rxd_addr) +{ + return ofi_idm_lookup(&ep->peers_idm, rxd_addr); +} static inline struct rxd_domain *rxd_ep_domain(struct rxd_ep *ep) { return container_of(ep->util_ep.domain, struct rxd_domain, util_domain); @@ -364,6 +368,25 @@ static inline void *rxd_pkt_start(struct rxd_pkt_entry *pkt_entry) return (void *) ((char *) pkt_entry + sizeof(*pkt_entry)); } +static inline size_t rxd_pkt_size(struct rxd_ep *ep, struct rxd_base_hdr *base_hdr, + void *ptr) +{ + return ((char *) ptr - (char *) base_hdr) + ep->tx_prefix_size; +} + +static inline void rxd_remove_free_pkt_entry(struct rxd_pkt_entry *pkt_entry) +{ + dlist_remove(&pkt_entry->d_entry); + ofi_buf_free(pkt_entry); +} + +static inline void rxd_free_unexp_msg(struct rxd_unexp_msg *unexp_msg) +{ + ofi_buf_free(unexp_msg->pkt_entry); + dlist_remove(&unexp_msg->entry); + free(unexp_msg); +} + struct rxd_match_attr { fi_addr_t peer; uint64_t tag; @@ -372,7 +395,7 @@ struct rxd_match_attr { static inline int rxd_match_addr(fi_addr_t addr, fi_addr_t match_addr) { - return (addr == FI_ADDR_UNSPEC || addr == match_addr); + return (addr == RXD_ADDR_INVALID || addr == match_addr); } static inline int rxd_match_tag(uint64_t tag, uint64_t ignore, uint64_t match_tag) @@ -381,9 +404,9 @@ static inline int rxd_match_tag(uint64_t tag, uint64_t ignore, uint64_t match_ta } int rxd_info_to_core(uint32_t version, const struct fi_info *rxd_info, - struct fi_info *core_info); + const struct fi_info *base_info, struct fi_info *core_info); int rxd_info_to_rxd(uint32_t version, const struct fi_info *core_info, - struct fi_info *info); + const struct fi_info *base_info, struct fi_info *info); int rxd_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, void *context); @@ -433,7 +456,7 @@ size_t rxd_init_msg(void **ptr, const struct iovec *iov, size_t iov_count, size_t total_len, size_t avail_len); static inline void rxd_check_init_cq_data(void **ptr, struct rxd_x_entry *tx_entry, size_t *max_inline) -{ +{ if (tx_entry->flags & RXD_REMOTE_CQ_DATA) { rxd_init_data_hdr(ptr, tx_entry); *max_inline -= sizeof(tx_entry->cq_entry.data); @@ -443,7 +466,8 @@ static inline void rxd_check_init_cq_data(void **ptr, struct rxd_x_entry *tx_ent /* Tx/Rx entry sub-functions */ struct rxd_x_entry *rxd_tx_entry_init_common(struct rxd_ep *ep, fi_addr_t addr, uint32_t op, const struct iovec *iov, size_t iov_count, - uint64_t tag, uint64_t data, uint32_t flags, void *context); + uint64_t tag, uint64_t data, uint32_t flags, void *context, + struct rxd_base_hdr **base_hdr, void **ptr); struct rxd_x_entry *rxd_rx_entry_init(struct rxd_ep *ep, const struct iovec *iov, size_t iov_count, uint64_t tag, uint64_t ignore, void *context, fi_addr_t addr, @@ -488,9 +512,13 @@ struct rxd_x_entry *rxd_progress_multi_recv(struct rxd_ep *ep, struct rxd_x_entry *rx_entry, size_t total_size); void rxd_ep_progress(struct util_ep *util_ep); +void rxd_cleanup_unexp_msg(struct rxd_unexp_msg *unexp_msg); /* CQ sub-functions */ void rxd_cq_report_error(struct rxd_cq *cq, struct fi_cq_err_entry *err_entry); void rxd_cq_report_tx_comp(struct rxd_cq *cq, struct rxd_x_entry *tx_entry); + +int rxd_create_peer(struct rxd_ep *ep, uint64_t rxd_addr); + #endif diff --git a/prov/rxd/src/rxd_atomic.c b/prov/rxd/src/rxd_atomic.c index a60e15015d0..c6bc2aae0cf 100644 --- a/prov/rxd/src/rxd_atomic.c +++ b/prov/rxd/src/rxd_atomic.c @@ -54,14 +54,10 @@ static struct rxd_x_entry *rxd_tx_entry_init_atomic(struct rxd_ep *ep, fi_addr_t OFI_UNUSED(len); tx_entry = rxd_tx_entry_init_common(ep, addr, 
op, iov, iov_count, 0, - data, flags, context); + data, flags, context, &base_hdr, &ptr); if (!tx_entry) return NULL; - base_hdr = rxd_get_base_hdr(tx_entry->pkt); - ptr = (void *) base_hdr; - rxd_init_base_hdr(ep, &ptr, tx_entry); - if (res_count) { tx_entry->res_count = res_count; memcpy(&tx_entry->res_iov[0], res_iov, sizeof(*res_iov) * res_count); @@ -98,8 +94,8 @@ static struct rxd_x_entry *rxd_tx_entry_init_atomic(struct rxd_ep *ep, fi_addr_t assert(len == tx_entry->bytes_done); } } - tx_entry->pkt->pkt_size = ((char *) ptr - (char *) base_hdr) + - ep->tx_prefix_size; + + tx_entry->pkt->pkt_size = rxd_pkt_size(ep, base_hdr, ptr); return tx_entry; } @@ -137,8 +133,11 @@ static ssize_t rxd_generic_atomic(struct rxd_ep *rxd_ep, if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq)) goto out; - - rxd_addr = rxd_ep_av(rxd_ep)->fi_addr_table[addr]; + + rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx), + RXD_IDX_OFFSET(addr)); + if (!rxd_addr) + goto out; ret = rxd_send_rts_if_needed(rxd_ep, rxd_addr); if (ret) goto out; @@ -149,7 +148,7 @@ static ssize_t rxd_generic_atomic(struct rxd_ep *rxd_ep, if (!tx_entry) goto out; - if (rxd_ep->peers[rxd_addr].peer_addr != FI_ADDR_UNSPEC) + if (rxd_peer(rxd_ep, rxd_addr)->peer_addr != RXD_ADDR_INVALID) (void) rxd_start_xfer(rxd_ep, tx_entry); out: @@ -238,8 +237,11 @@ static ssize_t rxd_atomic_inject(struct fid_ep *ep_fid, const void *buf, if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq)) goto out; + rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx), + RXD_IDX_OFFSET(addr)); + if (!rxd_addr) + goto out; - rxd_addr = rxd_ep_av(rxd_ep)->fi_addr_table[addr]; ret = rxd_send_rts_if_needed(rxd_ep, rxd_addr); if (ret) goto out; @@ -250,7 +252,7 @@ static ssize_t rxd_atomic_inject(struct fid_ep *ep_fid, const void *buf, if (!tx_entry) goto out; - if (rxd_ep->peers[rxd_addr].peer_addr == FI_ADDR_UNSPEC) + if (rxd_peer(rxd_ep, rxd_addr)->peer_addr == RXD_ADDR_INVALID) goto out; (void) rxd_start_xfer(rxd_ep, tx_entry); diff --git a/prov/rxd/src/rxd_attr.c b/prov/rxd/src/rxd_attr.c index f0c6bfd4205..26b45798c47 100644 --- a/prov/rxd/src/rxd_attr.c +++ b/prov/rxd/src/rxd_attr.c @@ -32,20 +32,26 @@ #include "rxd.h" -#define RXD_EP_CAPS (FI_MSG | FI_TAGGED | FI_RMA | FI_ATOMIC | FI_SOURCE | \ - FI_DIRECTED_RECV | FI_MULTI_RECV | FI_RMA_EVENT) -#define RXD_TX_CAPS (FI_SEND | FI_WRITE | FI_READ) -#define RXD_RX_CAPS (FI_RECV | FI_REMOTE_READ | FI_REMOTE_WRITE) -#define RXD_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM) +#define RXD_TX_CAPS (OFI_TX_MSG_CAPS | FI_TAGGED | OFI_TX_RMA_CAPS | FI_ATOMICS) +#define RXD_RX_CAPS (FI_SOURCE | FI_RMA_EVENT | OFI_RX_MSG_CAPS | FI_TAGGED | \ + OFI_RX_RMA_CAPS | FI_ATOMICS | FI_DIRECTED_RECV | FI_MULTI_RECV) #define RXD_TX_OP_FLAGS (FI_INJECT | FI_INJECT_COMPLETE | FI_COMPLETION | \ FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE) #define RXD_RX_OP_FLAGS (FI_MULTI_RECV | FI_COMPLETION) +#define RXD_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM) + +#define RXD_MSG_ORDER (FI_ORDER_ATOMIC_RAR | FI_ORDER_ATOMIC_RAW | \ + FI_ORDER_ATOMIC_WAR | FI_ORDER_ATOMIC_WAW | \ + FI_ORDER_RAR | FI_ORDER_RAS | FI_ORDER_RAW | \ + FI_ORDER_RMA_RAR | FI_ORDER_RMA_RAW | \ + FI_ORDER_RMA_WAW | FI_ORDER_SAS | FI_ORDER_SAW | \ + FI_ORDER_WAS | FI_ORDER_WAW) struct fi_tx_attr rxd_tx_attr = { - .caps = RXD_EP_CAPS | RXD_TX_CAPS, + .caps = RXD_TX_CAPS, .op_flags = RXD_TX_OP_FLAGS, .comp_order = FI_ORDER_NONE, - .msg_order = FI_ORDER_SAS, + .msg_order = RXD_MSG_ORDER, .inject_size = RXD_MAX_MTU_SIZE - sizeof(struct 
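/*
 * The lookups above resolve application fi_addr values through an
 * indexer rather than the old flat table.  Since an empty slot reads
 * back as 0/NULL, 0 is reserved as the invalid value (RXD_ADDR_INVALID)
 * and user-visible addresses are stored shifted by one (RXD_IDX_OFFSET),
 * with the offset subtracted again before fi_addr values are returned
 * to the application.  The toy map below shows the same "reserve 0 as a
 * miss" convention with a plain array; all demo_* names are illustrative
 * and bounds checks are omitted.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_MAP_SIZE		64
#define DEMO_IDX_OFFSET(x)	((x) + 1)	/* keep slot 0 free */

static uintptr_t demo_map[DEMO_MAP_SIZE + 1];

static void demo_set(uint64_t fi_addr, uintptr_t rxd_addr)
{
	demo_map[DEMO_IDX_OFFSET(fi_addr)] = rxd_addr;
}

static uintptr_t demo_get(uint64_t fi_addr)
{
	/* a return of 0 means the address was never inserted */
	return demo_map[DEMO_IDX_OFFSET(fi_addr)];
}

int main(void)
{
	demo_set(0, 7);		/* fi_addr 0 is a perfectly valid address */
	printf("fi_addr 0 -> %lu\n", (unsigned long) demo_get(0));
	printf("fi_addr 5 -> %lu (0 == not found)\n",
	       (unsigned long) demo_get(5));
	return 0;
}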
rxd_base_hdr), .size = (1ULL << RXD_MAX_TX_BITS), .iov_limit = RXD_IOV_LIMIT, @@ -53,10 +59,10 @@ struct fi_tx_attr rxd_tx_attr = { }; struct fi_rx_attr rxd_rx_attr = { - .caps = RXD_EP_CAPS | RXD_RX_CAPS, + .caps = RXD_RX_CAPS, .op_flags = RXD_RX_OP_FLAGS, .comp_order = FI_ORDER_NONE, - .msg_order = FI_ORDER_SAS, + .msg_order = RXD_MSG_ORDER, .total_buffered_recv = 0, .size = (1ULL << RXD_MAX_RX_BITS), .iov_limit = RXD_IOV_LIMIT @@ -94,11 +100,11 @@ struct fi_domain_attr rxd_domain_attr = { }; struct fi_fabric_attr rxd_fabric_attr = { - .prov_version = FI_VERSION(RXD_MAJOR_VERSION, RXD_MINOR_VERSION), + .prov_version = OFI_VERSION_DEF_PROV, }; struct fi_info rxd_info = { - .caps = RXD_DOMAIN_CAPS | RXD_EP_CAPS | RXD_TX_CAPS | RXD_RX_CAPS, + .caps = RXD_DOMAIN_CAPS | RXD_TX_CAPS | RXD_RX_CAPS, .addr_format = FI_FORMAT_UNSPEC, .tx_attr = &rxd_tx_attr, .rx_attr = &rxd_rx_attr, diff --git a/prov/rxd/src/rxd_av.c b/prov/rxd/src/rxd_av.c index d817c9487a4..fb8e3b73144 100644 --- a/prov/rxd/src/rxd_av.c +++ b/prov/rxd/src/rxd_av.c @@ -40,11 +40,14 @@ static int rxd_tree_compare(struct ofi_rbmap *map, void *key, void *data) uint8_t addr[RXD_NAME_LENGTH]; size_t len = RXD_NAME_LENGTH; int ret; + fi_addr_t dg_addr; memset(addr, 0, len); av = container_of(map, struct rxd_av, rbmap); - ret = fi_av_lookup(av->dg_av, av->rxd_addr_table[(fi_addr_t) data].dg_addr, - addr, &len); + dg_addr = (intptr_t)ofi_idx_lookup(&av->rxdaddr_dg_idx, + (fi_addr_t) data); + + ret = fi_av_lookup(av->dg_av, dg_addr,addr, &len); if (ret) return -1; @@ -105,43 +108,53 @@ static int rxd_av_set_addrlen(struct rxd_av *av, const void *addr) static fi_addr_t rxd_av_dg_addr(struct rxd_av *av, fi_addr_t fi_addr) { - fi_addr_t rxd_addr = av->fi_addr_table[fi_addr]; - - return rxd_addr == FI_ADDR_UNSPEC ? 
rxd_addr : - av->rxd_addr_table[rxd_addr].dg_addr; + fi_addr_t dg_addr; + fi_addr_t rxd_addr = (intptr_t) ofi_idx_lookup(&av->fi_addr_idx, + RXD_IDX_OFFSET(fi_addr)); + if (!rxd_addr) + goto err; + dg_addr = (intptr_t) ofi_idx_lookup(&av->rxdaddr_dg_idx, rxd_addr); + if (!dg_addr) + goto err; + + return dg_addr; +err: + return FI_ADDR_UNSPEC; } -static fi_addr_t rxd_set_rxd_addr(struct rxd_av *av, fi_addr_t dg_addr) +static int rxd_set_rxd_addr(struct rxd_av *av, fi_addr_t dg_addr, fi_addr_t *addr) { - int tries = 0; - - while (av->rxd_addr_table[av->rxd_addr_idx].dg_addr != FI_ADDR_UNSPEC && - tries < av->util_av.count) { - if (++av->rxd_addr_idx == av->util_av.count) - av->rxd_addr_idx = 0; - tries++; - } - assert(av->rxd_addr_idx < av->util_av.count && tries < av->util_av.count); - av->rxd_addr_table[av->rxd_addr_idx].dg_addr = dg_addr; + int rxdaddr; + rxdaddr = ofi_idx_insert(&(av->rxdaddr_dg_idx), (void*)(uintptr_t)dg_addr); + if (rxdaddr < 0) + return -FI_ENOMEM; + *addr = rxdaddr; + return 0; - return av->rxd_addr_idx; } static fi_addr_t rxd_set_fi_addr(struct rxd_av *av, fi_addr_t rxd_addr) { - int tries = 0; + int fi_addr; + fi_addr_t dg_addr; + fi_addr = ofi_idx_insert(&(av->fi_addr_idx), (void*)(uintptr_t)rxd_addr); + if (fi_addr < 0) + goto nomem1; - while (av->fi_addr_table[av->fi_addr_idx] != FI_ADDR_UNSPEC && - tries < av->util_av.count) { - if (++av->fi_addr_idx == av->util_av.count) - av->fi_addr_idx = 0; - tries++; - } - assert(av->fi_addr_idx < av->util_av.count && tries < av->util_av.count); - av->fi_addr_table[av->fi_addr_idx] = rxd_addr; - av->rxd_addr_table[rxd_addr].fi_addr = av->fi_addr_idx; + if (ofi_idm_set(&(av->rxdaddr_fi_idm), rxd_addr, + (void*)(uintptr_t) fi_addr) < 0) + goto nomem2; + + return fi_addr; - return av->fi_addr_idx; +nomem2: + ofi_idx_remove_ordered(&(av->fi_addr_idx), fi_addr); +nomem1: + dg_addr = (intptr_t) ofi_idx_remove_ordered(&(av->rxdaddr_dg_idx), + rxd_addr); + fi_av_remove(av->dg_av, &dg_addr, 1, 0); + + return -FI_ENOMEM; } int rxd_av_insert_dg_addr(struct rxd_av *av, const void *addr, @@ -156,16 +169,24 @@ int rxd_av_insert_dg_addr(struct rxd_av *av, const void *addr, if (ret != 1) return -FI_EINVAL; - *rxd_addr = rxd_set_rxd_addr(av, dg_addr); + ret = rxd_set_rxd_addr(av, dg_addr, rxd_addr); + if (ret < 0) { + goto nomem; + } - ret = ofi_rbmap_insert(&av->rbmap, (void *) addr, (void *) (*rxd_addr), + ret = ofi_rbmap_insert(&av->rbmap, (void *)addr, (void *)(*rxd_addr), NULL); if (ret) { assert(ret != -FI_EALREADY); - fi_av_remove(av->dg_av, &dg_addr, 1, flags); + ofi_idx_remove_ordered(&(av->rxdaddr_dg_idx), *rxd_addr); + goto nomem; } return ret; +nomem: + fi_av_remove(av->dg_av, &dg_addr, 1, flags); + return ret; + } static int rxd_av_insert(struct fid_av *av_fid, const void *addr, size_t count, @@ -173,10 +194,20 @@ static int rxd_av_insert(struct fid_av *av_fid, const void *addr, size_t count, { struct rxd_av *av; int i = 0, ret = 0, success_cnt = 0; - fi_addr_t rxd_addr, util_addr; + fi_addr_t rxd_addr; + int util_addr, *sync_err = NULL; struct ofi_rbnode *node; av = container_of(av_fid, struct rxd_av, util_av.av_fid); + ret = ofi_verify_av_insert(&av->util_av, flags, context); + if (ret) + return ret; + + if (flags & FI_SYNC_ERR) { + sync_err = context; + memset(sync_err, 0, sizeof(*sync_err) * count); + } + fastlock_acquire(&av->util_av.lock); if (!av->dg_addrlen) { ret = rxd_av_set_addrlen(av, addr); @@ -190,16 +221,24 @@ static int rxd_av_insert(struct fid_av *av_fid, const void *addr, size_t count, rxd_addr = (fi_addr_t) 
node->data; } else { ret = rxd_av_insert_dg_addr(av, addr, &rxd_addr, - flags, context); + flags, sync_err ? + &sync_err[i] : context); if (ret) break; } - util_addr = av->rxd_addr_table[rxd_addr].fi_addr == FI_ADDR_UNSPEC ? - rxd_set_fi_addr(av, rxd_addr) : - av->rxd_addr_table[rxd_addr].fi_addr; + util_addr = (intptr_t)ofi_idm_lookup(&av->rxdaddr_fi_idm, + rxd_addr); + + if (!util_addr) { + util_addr = rxd_set_fi_addr(av, rxd_addr); + if (util_addr < 0) { + ret = util_addr; + break; + } + } if (fi_addr) - fi_addr[i] = util_addr; + fi_addr[i] = (util_addr - 1); success_cnt++; } @@ -208,10 +247,12 @@ static int rxd_av_insert(struct fid_av *av_fid, const void *addr, size_t count, FI_WARN(&rxd_prov, FI_LOG_AV, "failed to insert address %d: %d (%s)\n", i, -ret, fi_strerror(-ret)); - if (av->util_av.eq) - ofi_av_write_event(&av->util_av, i, -ret, context); if (fi_addr) fi_addr[i] = FI_ADDR_NOTAVAIL; + if (av->util_av.eq) + ofi_av_write_event(&av->util_av, i, -ret, context); + else if (sync_err) + sync_err[i] = -ret; i++; } out: @@ -219,10 +260,12 @@ static int rxd_av_insert(struct fid_av *av_fid, const void *addr, size_t count, fastlock_release(&av->util_av.lock); for (; i < count; i++) { - if (av->util_av.eq) - ofi_av_write_event(&av->util_av, i, FI_ECANCELED, context); if (fi_addr) fi_addr[i] = FI_ADDR_NOTAVAIL; + if (av->util_av.eq) + ofi_av_write_event(&av->util_av, i, FI_ECANCELED, context); + else if (sync_err) + sync_err[i] = FI_ECANCELED; } if (av->util_av.eq) { @@ -253,35 +296,39 @@ static int rxd_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count int ret = 0; size_t i, addrlen; fi_addr_t rxd_addr; + fi_addr_t dg_addr; struct rxd_av *av; uint8_t addr[RXD_NAME_LENGTH]; - struct ofi_rbnode *node; av = container_of(av_fid, struct rxd_av, util_av.av_fid); fastlock_acquire(&av->util_av.lock); for (i = 0; i < count; i++) { - rxd_addr = av->fi_addr_table[fi_addr[i]]; addrlen = RXD_NAME_LENGTH; - ret = fi_av_lookup(av->dg_av, av->rxd_addr_table[rxd_addr].dg_addr, - addr, &addrlen); + rxd_addr = (intptr_t)ofi_idx_lookup(&av->fi_addr_idx, + RXD_IDX_OFFSET(fi_addr[i])); + if (!rxd_addr) + goto err; + + dg_addr = (intptr_t)ofi_idx_lookup(&av->rxdaddr_dg_idx, rxd_addr); + + ret = fi_av_lookup(av->dg_av, dg_addr, addr, &addrlen); if (ret) goto err; - - node = ofi_rbmap_find(&av->rbmap, (void *) addr); - if (!node) + + ret = ofi_rbmap_find_delete(&av->rbmap, (void *) addr); + if (ret) goto err; - ofi_rbmap_delete(&av->rbmap, node); + ret = fi_av_remove(av->dg_av, &dg_addr, 1, flags); - ret = fi_av_remove(av->dg_av, &av->rxd_addr_table[rxd_addr].dg_addr, - 1, flags); if (ret) goto err; - av->fi_addr_table[fi_addr[i]] = FI_ADDR_UNSPEC; - av->rxd_addr_table[rxd_addr].fi_addr = FI_ADDR_UNSPEC; - av->rxd_addr_table[rxd_addr].dg_addr = FI_ADDR_UNSPEC; + ofi_idx_remove_ordered(&(av->fi_addr_idx), + RXD_IDX_OFFSET(fi_addr[i])); + ofi_idx_remove_ordered(&(av->rxdaddr_dg_idx), rxd_addr); + ofi_idm_clear(&(av->rxdaddr_fi_idm), rxd_addr); av->dg_av_used--; } @@ -330,6 +377,7 @@ static int rxd_av_close(struct fid *fid) struct rxd_av *av; int ret; + av = container_of(fid, struct rxd_av, util_av.av_fid); ret = fi_close(&av->dg_av->fid); if (ret) @@ -340,8 +388,10 @@ static int rxd_av_close(struct fid *fid) if (ret) return ret; - free(av->fi_addr_table); - free(av->rxd_addr_table); + ofi_idx_reset(&(av->fi_addr_idx)); + ofi_idx_reset(&(av->rxdaddr_dg_idx)); + ofi_idm_reset(&(av->rxdaddr_fi_idm), NULL); + free(av); return 0; } @@ -362,7 +412,7 @@ static struct fi_ops rxd_av_fi_ops = { int 
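/*
 * The insert path above now honors FI_SYNC_ERR: when the flag is set,
 * the context argument is treated as an array of per-address error
 * codes instead of reporting failures through a bound EQ.  The helper
 * below sketches how a caller might use that path; it assumes a
 * synchronously operating AV (no EQ bound) and addresses already packed
 * in a buffer.  Only fi_av_insert() and the flag itself are standard
 * API, the rest of the names are illustrative.
 */
#include <stdlib.h>
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_errno.h>

static int demo_av_insert_sync_err(struct fid_av *av, const void *addrs,
				   size_t count, fi_addr_t *fi_addrs)
{
	int *errs;
	int ret;
	size_t i;

	errs = calloc(count, sizeof(*errs));
	if (!errs)
		return -FI_ENOMEM;

	/* with FI_SYNC_ERR, context points at an int array that receives
	 * one error code per address; 0 means the insertion succeeded */
	ret = fi_av_insert(av, addrs, count, fi_addrs, FI_SYNC_ERR, errs);

	for (i = 0; i < count; i++) {
		if (errs[i])
			fi_addrs[i] = FI_ADDR_NOTAVAIL;
	}

	free(errs);
	return ret;	/* number of addresses successfully inserted */
}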
rxd_av_create(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **av_fid, void *context) { - int ret, i; + int ret; struct rxd_av *av; struct rxd_domain *domain; struct util_av_attr util_attr; @@ -374,37 +424,29 @@ int rxd_av_create(struct fid_domain *domain_fid, struct fi_av_attr *attr, if (attr->name) return -FI_ENOSYS; + //TODO implement dynamic AV sizing attr->count = roundup_power_of_two(attr->count ? - attr->count : RXD_DEFAULT_AV_SIZE); + attr->count : rxd_env.max_peers); domain = container_of(domain_fid, struct rxd_domain, util_domain.domain_fid); av = calloc(1, sizeof(*av)); if (!av) return -FI_ENOMEM; - av->fi_addr_table = calloc(1, attr->count * sizeof(fi_addr_t)); - av->rxd_addr_table = calloc(1, rxd_env.max_peers * sizeof(struct rxd_addr)); - if (!av->fi_addr_table || !av->rxd_addr_table) { - ret = -FI_ENOMEM; - goto err1; - } - + memset(&(av->fi_addr_idx), 0, sizeof(av->fi_addr_idx)); + memset(&(av->rxdaddr_dg_idx), 0, sizeof(av->rxdaddr_dg_idx)); + memset(&(av->rxdaddr_fi_idm), 0, sizeof(av->rxdaddr_fi_idm)); util_attr.addrlen = sizeof(fi_addr_t); + util_attr.context_len = 0; util_attr.flags = 0; attr->type = domain->util_domain.av_type != FI_AV_UNSPEC ? domain->util_domain.av_type : FI_AV_TABLE; ret = ofi_av_init(&domain->util_domain, attr, &util_attr, - &av->util_av, context); + &av->util_av, context); if (ret) goto err1; ofi_rbmap_init(&av->rbmap, rxd_tree_compare); - for (i = 0; i < attr->count; av->fi_addr_table[i++] = FI_ADDR_UNSPEC) - ; - for (i = 0; i < rxd_env.max_peers; i++) { - av->rxd_addr_table[i].fi_addr = FI_ADDR_UNSPEC; - av->rxd_addr_table[i].dg_addr = FI_ADDR_UNSPEC; - } av_attr = *attr; av_attr.count = 0; @@ -421,8 +463,6 @@ int rxd_av_create(struct fid_domain *domain_fid, struct fi_av_attr *attr, err2: ofi_av_close(&av->util_av); err1: - free(av->fi_addr_table); - free(av->rxd_addr_table); free(av); return ret; } diff --git a/prov/rxd/src/rxd_cq.c b/prov/rxd/src/rxd_cq.c index 8005531c194..244dd5e86c9 100644 --- a/prov/rxd/src/rxd_cq.c +++ b/prov/rxd/src/rxd_cq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2020 Intel Corporation. All rights reserved. * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -93,7 +93,7 @@ static int rxd_match_pkt_entry(struct slist_entry *item, const void *arg) { return ((struct rxd_pkt_entry *) arg == container_of(item, struct rxd_pkt_entry, s_entry)); -} +} static void rxd_remove_rx_pkt(struct rxd_ep *ep, struct rxd_pkt_entry *pkt_entry) { @@ -190,8 +190,8 @@ void rxd_ep_recv_data(struct rxd_ep *ep, struct rxd_x_entry *x_entry, x_entry->next_seg_no++; if (x_entry->next_seg_no < x_entry->num_segs) { - if (!(ep->peers[pkt->base_hdr.peer].rx_seq_no % - ep->peers[pkt->base_hdr.peer].rx_window)) + if (!(rxd_peer(ep, pkt->base_hdr.peer)->rx_seq_no % + rxd_peer(ep, pkt->base_hdr.peer)->rx_window)) rxd_ep_send_ack(ep, pkt->base_hdr.peer); return; } @@ -207,32 +207,33 @@ static void rxd_verify_active(struct rxd_ep *ep, fi_addr_t addr, fi_addr_t peer_ { struct rxd_pkt_entry *pkt_entry; - if (ep->peers[addr].peer_addr != FI_ADDR_UNSPEC && - ep->peers[addr].peer_addr != peer_addr) + if (rxd_peer(ep, addr)->peer_addr != RXD_ADDR_INVALID && + rxd_peer(ep, addr)->peer_addr != peer_addr) FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "overwriting active peer - unexpected behavior\n"); - ep->peers[addr].peer_addr = peer_addr; + rxd_peer(ep, addr)->peer_addr = peer_addr; - if (!dlist_empty(&ep->peers[addr].unacked) && - rxd_get_base_hdr(container_of((&ep->peers[addr].unacked)->next, + if (!dlist_empty(&(rxd_peer(ep, addr)->unacked)) && + rxd_get_base_hdr(container_of((&(rxd_peer(ep, addr)->unacked))->next, struct rxd_pkt_entry, d_entry))->type == RXD_RTS) { - dlist_pop_front(&ep->peers[addr].unacked, + dlist_pop_front(&(rxd_peer(ep, addr)->unacked), struct rxd_pkt_entry, pkt_entry, d_entry); if (pkt_entry->flags & RXD_PKT_IN_USE) { dlist_insert_tail(&pkt_entry->d_entry, &ep->ctrl_pkts); pkt_entry->flags |= RXD_PKT_ACKED; } else { ofi_buf_free(pkt_entry); - ep->peers[addr].unacked_cnt--; + rxd_peer(ep, addr)->unacked_cnt--; } - dlist_remove(&ep->peers[addr].entry); + dlist_remove(&(rxd_peer(ep, addr)->entry)); } - if (!ep->peers[addr].active) { - dlist_insert_tail(&ep->peers[addr].entry, &ep->active_peers); - ep->peers[addr].retry_cnt = 0; - ep->peers[addr].active = 1; + if (!rxd_peer(ep, addr)->active) { + dlist_insert_tail(&(rxd_peer(ep, addr)->entry), + &ep->active_peers); + rxd_peer(ep, addr)->retry_cnt = 0; + rxd_peer(ep, addr)->active = 1; } } @@ -240,17 +241,17 @@ int rxd_start_xfer(struct rxd_ep *ep, struct rxd_x_entry *tx_entry) { struct rxd_base_hdr *hdr = rxd_get_base_hdr(tx_entry->pkt); - if (ep->peers[tx_entry->peer].unacked_cnt >= - ep->peers[tx_entry->peer].tx_window) + if (rxd_peer(ep, tx_entry->peer)->unacked_cnt >= + rxd_peer(ep, tx_entry->peer)->tx_window) return 0; - tx_entry->start_seq = rxd_set_pkt_seq(&ep->peers[tx_entry->peer], + tx_entry->start_seq = rxd_set_pkt_seq(rxd_peer(ep, tx_entry->peer), tx_entry->pkt); if (tx_entry->op != RXD_READ_REQ && tx_entry->num_segs > 1) { - ep->peers[tx_entry->peer].tx_seq_no = tx_entry->start_seq + + rxd_peer(ep, tx_entry->peer)->tx_seq_no = tx_entry->start_seq + tx_entry->num_segs; } - hdr->peer = ep->peers[tx_entry->peer].peer_addr; + hdr->peer = rxd_peer(ep, tx_entry->peer)->peer_addr; rxd_ep_send_pkt(ep, tx_entry->pkt); rxd_insert_unacked(ep, tx_entry->peer, tx_entry->pkt); tx_entry->pkt = NULL; @@ -259,11 +260,11 @@ int rxd_start_xfer(struct rxd_ep *ep, struct rxd_x_entry *tx_entry) tx_entry->op == RXD_ATOMIC_COMPARE) { dlist_remove(&tx_entry->entry); dlist_insert_tail(&tx_entry->entry, - &ep->peers[tx_entry->peer].rma_rx_list); + &(rxd_peer(ep, 
tx_entry->peer)->rma_rx_list)); } - return ep->peers[tx_entry->peer].unacked_cnt < - ep->peers[tx_entry->peer].tx_window; + return rxd_peer(ep, tx_entry->peer)->unacked_cnt < + rxd_peer(ep,tx_entry->peer)->tx_window; } void rxd_progress_tx_list(struct rxd_ep *ep, struct rxd_peer *peer) @@ -279,7 +280,7 @@ void rxd_progress_tx_list(struct rxd_ep *ep, struct rxd_peer *peer) struct rxd_pkt_entry, d_entry))->seq_no; } - if (peer->peer_addr == FI_ADDR_UNSPEC) + if (peer->peer_addr == RXD_ADDR_INVALID) return; dlist_foreach_container_safe(&peer->tx_list, struct rxd_x_entry, @@ -302,14 +303,14 @@ void rxd_progress_tx_list(struct rxd_ep *ep, struct rxd_peer *peer) } continue; } - + if (tx_entry->op == RXD_DATA_READ && !tx_entry->bytes_done) { - if (ep->peers[tx_entry->peer].unacked_cnt >= - ep->peers[tx_entry->peer].tx_window) { + if (rxd_peer(ep, tx_entry->peer)->unacked_cnt >= + rxd_peer(ep, tx_entry->peer)->tx_window) { break; - } - tx_entry->start_seq = ep->peers[tx_entry->peer].tx_seq_no; - ep->peers[tx_entry->peer].tx_seq_no = tx_entry->start_seq + + } + tx_entry->start_seq = rxd_peer(ep,tx_entry->peer)->tx_seq_no; + rxd_peer(ep, tx_entry->peer)->tx_seq_no = tx_entry->start_seq + tx_entry->num_segs; inc = 1; } @@ -317,7 +318,7 @@ void rxd_progress_tx_list(struct rxd_ep *ep, struct rxd_peer *peer) ret = rxd_ep_post_data_pkts(ep, tx_entry); if (ret) { if (ret == -FI_ENOMEM && inc) - ep->peers[tx_entry->peer].tx_seq_no -= + rxd_peer(ep, tx_entry->peer)->tx_seq_no -= tx_entry->num_segs; break; } @@ -330,7 +331,7 @@ void rxd_progress_tx_list(struct rxd_ep *ep, struct rxd_peer *peer) static void rxd_update_peer(struct rxd_ep *ep, fi_addr_t peer, fi_addr_t peer_addr) { rxd_verify_active(ep, peer, peer_addr); - rxd_progress_tx_list(ep, &ep->peers[peer]); + rxd_progress_tx_list(ep, rxd_peer(ep, peer)); } static int rxd_send_cts(struct rxd_ep *rxd_ep, struct rxd_rts_pkt *rts_pkt, @@ -357,10 +358,8 @@ static int rxd_send_cts(struct rxd_ep *rxd_ep, struct rxd_rts_pkt *rts_pkt, dlist_insert_tail(&pkt_entry->d_entry, &rxd_ep->ctrl_pkts); ret = rxd_ep_send_pkt(rxd_ep, pkt_entry); - if (ret) { - dlist_remove(&pkt_entry->d_entry); - ofi_buf_free(pkt_entry); - } + if (ret) + rxd_remove_free_pkt_entry(pkt_entry); return ret; } @@ -440,6 +439,11 @@ static void rxd_handle_rts(struct rxd_ep *ep, struct rxd_pkt_entry *pkt_entry) return; } + if (!rxd_peer(ep, rxd_addr)) { + if (rxd_create_peer(ep, rxd_addr) < 0) + return; + } + if (rxd_send_cts(ep, pkt, rxd_addr)) { FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "error posting CTS\n"); @@ -514,18 +518,17 @@ static struct rxd_x_entry *rxd_match_rx(struct rxd_ep *ep, } if (!match) { - assert(!ep->peers[base->peer].curr_unexp); + assert(!rxd_peer(ep, base->peer)->curr_unexp); unexp_msg = rxd_init_unexp(ep, pkt_entry, base, op, tag, data, msg, msg_size); if (unexp_msg) { dlist_insert_tail(&unexp_msg->entry, unexp_list); - ep->peers[base->peer].curr_unexp = unexp_msg; + rxd_peer(ep, base->peer)->curr_unexp = unexp_msg; } return NULL; } rx_entry = container_of(match, struct rxd_x_entry, entry); - total_size = op ? 
op->size : msg_size; if (rx_entry->flags & RXD_MULTI_RECV) { @@ -558,7 +561,7 @@ static int rxd_verify_iov(struct rxd_ep *ep, struct ofi_rma_iov *rma, iov[i].iov_len = rma[i].len; if (ret) { FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "could not verify MR\n"); - return -FI_EACCES; + return -FI_EACCES; } } return 0; @@ -596,9 +599,9 @@ static struct rxd_x_entry *rxd_rma_read_entry_init(struct rxd_ep *ep, rx_entry->cq_entry.flags = ofi_rx_cq_flags(RXD_READ_REQ); rx_entry->cq_entry.len = sar_hdr->size; - dlist_insert_tail(&rx_entry->entry, &ep->peers[rx_entry->peer].tx_list); + dlist_insert_tail(&rx_entry->entry, &(rxd_peer(ep, rx_entry->peer)->tx_list)); - rxd_progress_tx_list(ep, &ep->peers[rx_entry->peer]); + rxd_progress_tx_list(ep, rxd_peer(ep, rx_entry->peer)); return rx_entry; } @@ -671,11 +674,11 @@ static struct rxd_x_entry *rxd_rx_atomic_fetch(struct rxd_ep *ep, if (rx_entry->bytes_done != rx_entry->cq_entry.len) FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "fetch data length mismatch\n"); - dlist_insert_tail(&rx_entry->entry, &ep->peers[rx_entry->peer].tx_list); + dlist_insert_tail(&rx_entry->entry, &(rxd_peer(ep, rx_entry->peer)->tx_list)); rxd_ep_send_ack(ep, base_hdr->peer); - rxd_progress_tx_list(ep, &ep->peers[rx_entry->peer]); + rxd_progress_tx_list(ep, rxd_peer(ep, rx_entry->peer)); return rx_entry; } @@ -783,11 +786,11 @@ void rxd_do_atomic(void *src, void *dst, void *cmp, enum fi_datatype datatype, { char tmp_result[RXD_MAX_MTU_SIZE]; - if (atomic_op >= OFI_SWAP_OP_START) { - ofi_atomic_swap_handlers[atomic_op - OFI_SWAP_OP_START][datatype](dst, - src, cmp, tmp_result, cnt); - } else if (atomic_op != FI_ATOMIC_READ) { - ofi_atomic_write_handlers[atomic_op][datatype](dst, src, cnt); + if (ofi_atomic_isswap_op(atomic_op)) { + ofi_atomic_swap_handler(atomic_op, datatype, dst, src, cmp, + tmp_result, cnt); + } else if (ofi_atomic_iswrite_op(atomic_op)) { + ofi_atomic_write_handler(atomic_op, datatype, dst, src, cnt); } } @@ -804,22 +807,30 @@ void rxd_progress_atom_op(struct rxd_ep *ep, struct rxd_x_entry *rx_entry, void **msg, size_t msg_size) { char *src, *cmp; - size_t len; + size_t data_size, len; int i, iov_count; src = (char *) (*msg); - cmp = base_hdr->type == RXD_ATOMIC_COMPARE ? (char *) (*msg) + - (msg_size / 2) : NULL; - + cmp = base_hdr->type == RXD_ATOMIC_COMPARE ? src + (msg_size / 2) : NULL; iov_count = sar_hdr ? sar_hdr->iov_count : 1; - for (i = len = 0; i < iov_count; i++) { + + data_size = ofi_datatype_size(atom_hdr->datatype); + if (!data_size) { + FI_WARN(&rxd_prov, FI_LOG_EP_DATA, + "Invalid atomic datatype received\n"); + len = ofi_total_iov_len(rx_entry->iov, iov_count); + goto out; + } + + for (i = 0, len = 0; i < iov_count; i++) { rxd_do_atomic(&src[len], rx_entry->iov[i].iov_base, - cmp ? &cmp[len] : NULL, atom_hdr->datatype, - atom_hdr->atomic_op, rx_entry->iov[i].iov_len / - ofi_datatype_size(atom_hdr->datatype)); + cmp ? 
&cmp[len] : NULL, + atom_hdr->datatype, atom_hdr->atomic_op, + rx_entry->iov[i].iov_len / data_size); len += rx_entry->iov[i].iov_len; } +out: if (base_hdr->type == RXD_ATOMIC) rx_entry->bytes_done = len; } @@ -835,9 +846,9 @@ void rxd_progress_op(struct rxd_ep *ep, struct rxd_x_entry *rx_entry, void **msg, size_t size) { if (sar_hdr) - ep->peers[base_hdr->peer].curr_tx_id = sar_hdr->tx_id; + rxd_peer(ep, base_hdr->peer)->curr_tx_id = sar_hdr->tx_id; - ep->peers[base_hdr->peer].curr_rx_id = rx_entry->rx_id; + rxd_peer(ep, base_hdr->peer)->curr_rx_id = rx_entry->rx_id; if (base_hdr->type == RXD_READ_REQ) return; @@ -871,7 +882,7 @@ void rxd_progress_op(struct rxd_ep *ep, struct rxd_x_entry *rx_entry, rx_entry->next_seg_no++; rx_entry->start_seq = base_hdr->seq_no; - dlist_insert_tail(&rx_entry->entry, &ep->peers[base_hdr->peer].rx_list); + dlist_insert_tail(&rx_entry->entry, &(rxd_peer(ep, base_hdr->peer)->rx_list)); } static struct rxd_x_entry *rxd_get_data_x_entry(struct rxd_ep *ep, @@ -879,7 +890,7 @@ static struct rxd_x_entry *rxd_get_data_x_entry(struct rxd_ep *ep, { if (data_pkt->base_hdr.type == RXD_DATA) return ofi_bufpool_get_ibuf(ep->rx_entry_pool.pool, - ep->peers[data_pkt->base_hdr.peer].curr_rx_id); + rxd_peer(ep, data_pkt->base_hdr.peer)->curr_rx_id); return ofi_bufpool_get_ibuf(ep->tx_entry_pool.pool, data_pkt->ext_hdr.tx_id); } @@ -899,14 +910,15 @@ static void rxd_progress_buf_pkts(struct rxd_ep *ep, fi_addr_t peer) size_t msg_size; struct rxd_x_entry *rx_entry = NULL; struct rxd_data_pkt *data_pkt; + struct dlist_entry *bufpkts; - while (!dlist_empty(&ep->peers[peer].buf_pkts)) { - pkt_entry = container_of((&ep->peers[peer].buf_pkts)->next, - struct rxd_pkt_entry, d_entry); + bufpkts = &(rxd_peer(ep, peer)->buf_pkts); + while (!dlist_empty(bufpkts)) { + pkt_entry = container_of(bufpkts->next, struct rxd_pkt_entry, + d_entry); base_hdr = rxd_get_base_hdr(pkt_entry); - if (base_hdr->seq_no != ep->peers[peer].rx_seq_no) + if (base_hdr->seq_no != rxd_peer(ep, peer)->rx_seq_no) return; - if (base_hdr->type == RXD_DATA || base_hdr->type == RXD_DATA_READ) { data_pkt = (struct rxd_data_pkt *) pkt_entry->pkt; rx_entry = rxd_get_data_x_entry(ep, data_pkt); @@ -924,15 +936,14 @@ static void rxd_progress_buf_pkts(struct rxd_ep *ep, fi_addr_t peer) if (ret) FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "could not write error entry\n"); - ep->peers[base_hdr->peer].rx_seq_no++; - dlist_remove(&pkt_entry->d_entry); - ofi_buf_free(pkt_entry); + rxd_peer(ep, base_hdr->peer)->rx_seq_no++; + rxd_remove_free_pkt_entry(pkt_entry); continue; } if (!rx_entry) { if (base_hdr->type == RXD_MSG || base_hdr->type == RXD_TAGGED) { - ep->peers[base_hdr->peer].rx_seq_no++; + rxd_peer(ep, base_hdr->peer)->rx_seq_no++; continue; } break; @@ -943,9 +954,8 @@ static void rxd_progress_buf_pkts(struct rxd_ep *ep, fi_addr_t peer) atom_hdr, &msg, msg_size); } - ep->peers[base_hdr->peer].rx_seq_no++; - dlist_remove(&pkt_entry->d_entry); - ofi_buf_free(pkt_entry); + rxd_peer(ep,base_hdr->peer)->rx_seq_no++; + rxd_remove_free_pkt_entry(pkt_entry); } } @@ -961,33 +971,34 @@ static void rxd_handle_data(struct rxd_ep *ep, struct rxd_pkt_entry *pkt_entry) goto free; } - if (pkt->base_hdr.seq_no == ep->peers[pkt->base_hdr.peer].rx_seq_no) { - ep->peers[pkt->base_hdr.peer].rx_seq_no++; + if (pkt->base_hdr.seq_no == rxd_peer(ep, + pkt->base_hdr.peer)->rx_seq_no) { + rxd_peer(ep, pkt->base_hdr.peer)->rx_seq_no++; if (pkt->base_hdr.type == RXD_DATA && - ep->peers[pkt->base_hdr.peer].curr_unexp) { - unexp_msg = 
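/*
 * rxd_progress_atom_op() above now validates ofi_datatype_size() before
 * dividing by it, so a packet carrying an unknown datatype can no
 * longer cause a divide-by-zero.  A standalone sketch of the same
 * guard; the datatype table here is a stand-in, not the provider's.
 */
#include <stdint.h>
#include <stdio.h>

/* stand-in for ofi_datatype_size(): 0 signals an unknown datatype */
static size_t demo_datatype_size(int datatype)
{
	switch (datatype) {
	case 0:  return sizeof(int32_t);
	case 1:  return sizeof(int64_t);
	default: return 0;
	}
}

static int demo_apply_atomic(size_t iov_len, int datatype)
{
	size_t dtsize, count;

	dtsize = demo_datatype_size(datatype);
	if (!dtsize) {
		/* reject the operation instead of dividing by zero */
		fprintf(stderr, "invalid atomic datatype %d\n", datatype);
		return -1;
	}

	count = iov_len / dtsize;
	printf("applying atomic op to %zu elements\n", count);
	return 0;
}

int main(void)
{
	demo_apply_atomic(64, 0);	/* valid: 16 x int32 */
	demo_apply_atomic(64, 9);	/* rejected: unknown datatype */
	return 0;
}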
ep->peers[pkt->base_hdr.peer].curr_unexp; + rxd_peer(ep, pkt->base_hdr.peer)->curr_unexp) { + unexp_msg = rxd_peer(ep, pkt->base_hdr.peer)->curr_unexp; dlist_insert_tail(&pkt_entry->d_entry, &unexp_msg->pkt_list); if (pkt->ext_hdr.seg_no + 1 == unexp_msg->sar_hdr->num_segs - 1) { - ep->peers[pkt->base_hdr.peer].curr_unexp = NULL; + rxd_peer(ep, pkt->base_hdr.peer)->curr_unexp = NULL; rxd_ep_send_ack(ep, pkt->base_hdr.peer); } - rxd_remove_rx_pkt(ep, pkt_entry); return; } x_entry = rxd_get_data_x_entry(ep, pkt); rxd_ep_recv_data(ep, x_entry, pkt, pkt_entry->pkt_size); - if (!dlist_empty(&ep->peers[pkt->base_hdr.peer].buf_pkts)) + if (!dlist_empty(&(rxd_peer(ep, + pkt->base_hdr.peer)->buf_pkts))) rxd_progress_buf_pkts(ep, pkt->base_hdr.peer); } else if (!rxd_env.retry) { - rxd_remove_rx_pkt(ep, pkt_entry); - dlist_insert_order(&ep->peers[pkt->base_hdr.peer].buf_pkts, + dlist_insert_order(&(rxd_peer(ep, + pkt->base_hdr.peer)->buf_pkts), &rxd_comp_pkt_seq_no, &pkt_entry->d_entry); return; - } else if (ep->peers[pkt->base_hdr.peer].peer_addr != FI_ADDR_UNSPEC) { + } else if (rxd_peer(ep, pkt->base_hdr.peer)->peer_addr != + RXD_ADDR_INVALID) { rxd_ep_send_ack(ep, pkt->base_hdr.peer); } free: - rxd_remove_rx_pkt(ep, pkt_entry); ofi_buf_free(pkt_entry); } @@ -1004,20 +1015,19 @@ static void rxd_handle_op(struct rxd_ep *ep, struct rxd_pkt_entry *pkt_entry) size_t msg_size; int ret; - if (base_hdr->seq_no != ep->peers[base_hdr->peer].rx_seq_no) { + if (base_hdr->seq_no != rxd_peer(ep, base_hdr->peer)->rx_seq_no) { if (!rxd_env.retry) { - rxd_remove_rx_pkt(ep, pkt_entry); - dlist_insert_order(&ep->peers[base_hdr->peer].buf_pkts, + dlist_insert_order(&(rxd_peer(ep, base_hdr->peer)->buf_pkts), &rxd_comp_pkt_seq_no, &pkt_entry->d_entry); return; } - if (ep->peers[base_hdr->peer].peer_addr != FI_ADDR_UNSPEC) + if (rxd_peer(ep, base_hdr->peer)->peer_addr != RXD_ADDR_INVALID) goto ack; goto release; } - if (ep->peers[base_hdr->peer].peer_addr == FI_ADDR_UNSPEC) + if (rxd_peer(ep, base_hdr->peer)->peer_addr == RXD_ADDR_INVALID) goto release; ret = rxd_unpack_init_rx(ep, &rx_entry, pkt_entry, base_hdr, &sar_hdr, @@ -1028,34 +1038,32 @@ static void rxd_handle_op(struct rxd_ep *ep, struct rxd_pkt_entry *pkt_entry) if (!rx_entry) { if (base_hdr->type == RXD_MSG || base_hdr->type == RXD_TAGGED) { - if (!ep->peers[base_hdr->peer].curr_unexp) + if (!rxd_peer(ep, base_hdr->peer)->curr_unexp) goto ack; - ep->peers[base_hdr->peer].rx_seq_no++; - rxd_remove_rx_pkt(ep, pkt_entry); + rxd_peer(ep, base_hdr->peer)->rx_seq_no++; if (!sar_hdr) - ep->peers[base_hdr->peer].curr_unexp = NULL; + rxd_peer(ep, base_hdr->peer)->curr_unexp = NULL; rxd_ep_send_ack(ep, base_hdr->peer); return; } - ep->peers[base_hdr->peer].rx_window = 0; + rxd_peer(ep, base_hdr->peer)->rx_window = 0; goto ack; } - ep->peers[base_hdr->peer].rx_seq_no++; - ep->peers[base_hdr->peer].rx_window = rxd_env.max_unacked; + rxd_peer(ep, base_hdr->peer)->rx_seq_no++; + rxd_peer(ep, base_hdr->peer)->rx_window = rxd_env.max_unacked; rxd_progress_op(ep, rx_entry, pkt_entry, base_hdr, sar_hdr, tag_hdr, data_hdr, rma_hdr, atom_hdr, &msg, msg_size); - if (!dlist_empty(&ep->peers[base_hdr->peer].buf_pkts)) + if (!dlist_empty(&(rxd_peer(ep, base_hdr->peer)->buf_pkts))) rxd_progress_buf_pkts(ep, base_hdr->peer); ack: rxd_ep_send_ack(ep, base_hdr->peer); release: - rxd_remove_rx_pkt(ep, pkt_entry); ofi_buf_free(pkt_entry); } @@ -1079,20 +1087,22 @@ static void rxd_handle_ack(struct rxd_ep *ep, struct rxd_pkt_entry *ack_entry) fi_addr_t peer = ack->base_hdr.peer; struct 
rxd_base_hdr *hdr; - ep->peers[peer].tx_window = ack->ext_hdr.rx_id; + rxd_peer(ep, peer)->tx_window = ack->ext_hdr.rx_id; - if (ep->peers[peer].last_rx_ack == ack->base_hdr.seq_no) + if (rxd_peer(ep, peer)->last_rx_ack == ack->base_hdr.seq_no) return; - ep->peers[peer].last_rx_ack = ack->base_hdr.seq_no; + rxd_peer(ep, peer)->last_rx_ack = ack->base_hdr.seq_no; - if (dlist_empty(&ep->peers[peer].unacked)) + if (dlist_empty(&(rxd_peer(ep, peer)->unacked))) return; - pkt_entry = container_of((&ep->peers[peer].unacked)->next, - struct rxd_pkt_entry, d_entry); + pkt_entry = container_of((&(rxd_peer(ep, + peer)->unacked))->next, + struct rxd_pkt_entry, d_entry); - while (&pkt_entry->d_entry != &ep->peers[peer].unacked) { + while (&pkt_entry->d_entry != &(rxd_peer(ep, + peer)->unacked)) { hdr = rxd_get_base_hdr(pkt_entry); if (ofi_after_eq(hdr->seq_no, ack->base_hdr.seq_no)) break; @@ -1103,17 +1113,16 @@ static void rxd_handle_ack(struct rxd_ep *ep, struct rxd_pkt_entry *ack_entry) struct rxd_pkt_entry, d_entry); continue; } - dlist_remove(&pkt_entry->d_entry); - ofi_buf_free(pkt_entry); - ep->peers[peer].unacked_cnt--; - ep->peers[peer].retry_cnt = 0; + rxd_remove_free_pkt_entry(pkt_entry); + rxd_peer(ep, peer)->unacked_cnt--; + rxd_peer(ep, peer)->retry_cnt = 0; - pkt_entry = container_of((&ep->peers[peer].unacked)->next, + pkt_entry = container_of((&(rxd_peer(ep, peer)->unacked))->next, struct rxd_pkt_entry, d_entry); } - rxd_progress_tx_list(ep, &ep->peers[ack->base_hdr.peer]); -} + rxd_progress_tx_list(ep, rxd_peer(ep, ack->base_hdr.peer)); +} void rxd_handle_send_comp(struct rxd_ep *ep, struct fi_cq_msg_entry *comp) { @@ -1128,16 +1137,14 @@ void rxd_handle_send_comp(struct rxd_ep *ep, struct fi_cq_msg_entry *comp) switch (rxd_pkt_type(pkt_entry)) { case RXD_CTS: case RXD_ACK: - dlist_remove(&pkt_entry->d_entry); - ofi_buf_free(pkt_entry); + rxd_remove_free_pkt_entry(pkt_entry); break; default: if (pkt_entry->flags & RXD_PKT_ACKED) { peer = pkt_entry->peer; - dlist_remove(&pkt_entry->d_entry); - ofi_buf_free(pkt_entry); - ep->peers[peer].unacked_cnt--; - rxd_progress_tx_list(ep, &ep->peers[peer]); + rxd_remove_free_pkt_entry(pkt_entry); + rxd_peer(ep, peer)->unacked_cnt--; + rxd_progress_tx_list(ep, rxd_peer(ep, peer)); } else { pkt_entry->flags &= ~RXD_PKT_IN_USE; } @@ -1154,6 +1161,7 @@ void rxd_handle_recv_comp(struct rxd_ep *ep, struct fi_cq_msg_entry *comp) rxd_pkt_type_str[(rxd_pkt_type(pkt_entry))]); rxd_ep_post_buf(ep); + rxd_remove_rx_pkt(ep, pkt_entry); pkt_entry->pkt_size = comp->len; switch (rxd_pkt_type(pkt_entry)) { @@ -1170,18 +1178,15 @@ void rxd_handle_recv_comp(struct rxd_ep *ep, struct fi_cq_msg_entry *comp) case RXD_DATA_READ: rxd_handle_data(ep, pkt_entry); /* don't need to perform action below: - * - remove RX packet * - release/repost RX packet */ return; default: rxd_handle_op(ep, pkt_entry); /* don't need to perform action below: - * - remove RX packet * - release/repost RX packet */ return; } - rxd_remove_rx_pkt(ep, pkt_entry); ofi_buf_free(pkt_entry); } @@ -1197,7 +1202,7 @@ void rxd_handle_error(struct rxd_ep *ep) } else { FI_WARN(&rxd_prov, FI_LOG_CQ, "Received %s error from core provider: %s\n", - err.flags & FI_SEND ? "tx" : "rx", fi_strerror(-err.err)); + err.flags & FI_SEND ? 
"tx" : "rx", fi_strerror(-err.err)); } } diff --git a/prov/rxd/src/rxd_domain.c b/prov/rxd/src/rxd_domain.c index a5c63ec7d29..96c1e7982ca 100644 --- a/prov/rxd/src/rxd_domain.c +++ b/prov/rxd/src/rxd_domain.c @@ -48,6 +48,7 @@ static struct fi_ops_domain rxd_domain_ops = { .stx_ctx = fi_no_stx_context, .srx_ctx = fi_no_srx_context, .query_atomic = rxd_query_atomic, + .query_collective = fi_no_query_collective, }; static int rxd_domain_close(fid_t fid) @@ -113,7 +114,7 @@ int rxd_domain_open(struct fid_fabric *fabric, struct fi_info *info, return -FI_ENOMEM; ret = ofi_get_core_info(fabric->api_version, NULL, NULL, - 0, &rxd_util_prov, info, + 0, &rxd_util_prov, info, NULL, rxd_info_to_core, &dg_info); if (ret) goto err1; diff --git a/prov/rxd/src/rxd_ep.c b/prov/rxd/src/rxd_ep.c index e195e56573d..548e264a8d0 100644 --- a/prov/rxd/src/rxd_ep.c +++ b/prov/rxd/src/rxd_ep.c @@ -305,7 +305,7 @@ void rxd_init_data_pkt(struct rxd_ep *ep, struct rxd_x_entry *tx_entry, data_pkt->ext_hdr.rx_id = tx_entry->rx_id; data_pkt->ext_hdr.tx_id = tx_entry->tx_id; data_pkt->ext_hdr.seg_no = tx_entry->next_seg_no++; - data_pkt->base_hdr.peer = ep->peers[tx_entry->peer].peer_addr; + data_pkt->base_hdr.peer = rxd_peer(ep, tx_entry->peer)->peer_addr; pkt_entry->pkt_size = ofi_copy_from_iov(data_pkt->msg, seg_size, tx_entry->iov, @@ -320,7 +320,8 @@ void rxd_init_data_pkt(struct rxd_ep *ep, struct rxd_x_entry *tx_entry, struct rxd_x_entry *rxd_tx_entry_init_common(struct rxd_ep *ep, fi_addr_t addr, uint32_t op, const struct iovec *iov, size_t iov_count, - uint64_t tag, uint64_t data, uint32_t flags, void *context) + uint64_t tag, uint64_t data, uint32_t flags, void *context, + struct rxd_base_hdr **base_hdr, void **ptr) { struct rxd_x_entry *tx_entry; @@ -354,8 +355,12 @@ struct rxd_x_entry *rxd_tx_entry_init_common(struct rxd_ep *ep, fi_addr_t addr, tx_entry->pkt->peer = tx_entry->peer; + *base_hdr = rxd_get_base_hdr(tx_entry->pkt); + *ptr = (void *) *base_hdr; + rxd_init_base_hdr(ep, &(*ptr), tx_entry); + dlist_insert_tail(&tx_entry->entry, - &ep->peers[tx_entry->peer].tx_list); + &(rxd_peer(ep, tx_entry->peer)->tx_list)); return tx_entry; } @@ -372,8 +377,8 @@ void rxd_insert_unacked(struct rxd_ep *ep, fi_addr_t peer, struct rxd_pkt_entry *pkt_entry) { dlist_insert_tail(&pkt_entry->d_entry, - &ep->peers[peer].unacked); - ep->peers[peer].unacked_cnt++; + &(rxd_peer(ep, peer)->unacked)); + rxd_peer(ep, peer)->unacked_cnt++; } ssize_t rxd_ep_post_data_pkts(struct rxd_ep *ep, struct rxd_x_entry *tx_entry) @@ -382,8 +387,8 @@ ssize_t rxd_ep_post_data_pkts(struct rxd_ep *ep, struct rxd_x_entry *tx_entry) struct rxd_data_pkt *data; while (tx_entry->bytes_done != tx_entry->cq_entry.len) { - if (ep->peers[tx_entry->peer].unacked_cnt >= - ep->peers[tx_entry->peer].tx_window) + if (rxd_peer(ep, tx_entry->peer)->unacked_cnt >= + rxd_peer(ep, tx_entry->peer)->tx_window) return 0; pkt_entry = rxd_get_tx_pkt(ep); @@ -402,19 +407,20 @@ ssize_t rxd_ep_post_data_pkts(struct rxd_ep *ep, struct rxd_x_entry *tx_entry) rxd_insert_unacked(ep, tx_entry->peer, pkt_entry); } - return ep->peers[tx_entry->peer].unacked_cnt >= - ep->peers[tx_entry->peer].tx_window; + return rxd_peer(ep, tx_entry->peer)->unacked_cnt >= + rxd_peer(ep, tx_entry->peer)->tx_window; } int rxd_ep_send_pkt(struct rxd_ep *ep, struct rxd_pkt_entry *pkt_entry) { int ret; + fi_addr_t dg_addr; + pkt_entry->timestamp = ofi_gettime_ms(); - pkt_entry->timestamp = fi_gettime_ms(); - + dg_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(ep)->rxdaddr_dg_idx), + 
pkt_entry->peer); ret = fi_send(ep->dg_ep, (const void *) rxd_pkt_start(pkt_entry), - pkt_entry->pkt_size, pkt_entry->desc, - rxd_ep_av(ep)->rxd_addr_table[pkt_entry->peer].dg_addr, + pkt_entry->pkt_size, pkt_entry->desc, dg_addr, &pkt_entry->context); if (ret) { FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, "error sending packet: %d (%s)\n", @@ -456,15 +462,21 @@ static ssize_t rxd_ep_send_rts(struct rxd_ep *rxd_ep, fi_addr_t rxd_addr) rxd_ep_send_pkt(rxd_ep, pkt_entry); rxd_insert_unacked(rxd_ep, rxd_addr, pkt_entry); - dlist_insert_tail(&rxd_ep->peers[rxd_addr].entry, &rxd_ep->rts_sent_list); + dlist_insert_tail(&(rxd_peer(rxd_ep, rxd_addr)->entry), + &rxd_ep->rts_sent_list); return 0; } ssize_t rxd_send_rts_if_needed(struct rxd_ep *ep, fi_addr_t addr) { - if (ep->peers[addr].peer_addr == FI_ADDR_UNSPEC && - dlist_empty(&ep->peers[addr].unacked)) + if (!rxd_peer(ep, addr)) { + if (rxd_create_peer(ep, addr) < 0) + return -FI_ENOMEM; + } + + if (rxd_peer(ep, addr)->peer_addr == RXD_ADDR_INVALID && + dlist_empty(&(rxd_peer(ep, addr)->unacked))) return rxd_ep_send_rts(ep, addr); return 0; } @@ -477,7 +489,7 @@ void rxd_init_base_hdr(struct rxd_ep *rxd_ep, void **ptr, hdr->version = RXD_PROTOCOL_VERSION; hdr->type = tx_entry->op; hdr->seq_no = 0; - hdr->peer = rxd_ep->peers[tx_entry->peer].peer_addr; + hdr->peer = rxd_peer(rxd_ep, tx_entry->peer)->peer_addr; hdr->flags = tx_entry->flags; *ptr = (char *) (*ptr) + sizeof(*hdr); @@ -564,16 +576,14 @@ void rxd_ep_send_ack(struct rxd_ep *rxd_ep, fi_addr_t peer) ack->base_hdr.version = RXD_PROTOCOL_VERSION; ack->base_hdr.type = RXD_ACK; - ack->base_hdr.peer = rxd_ep->peers[peer].peer_addr; - ack->base_hdr.seq_no = rxd_ep->peers[peer].rx_seq_no; - ack->ext_hdr.rx_id = rxd_ep->peers[peer].rx_window; - rxd_ep->peers[peer].last_tx_ack = ack->base_hdr.seq_no; + ack->base_hdr.peer = rxd_peer(rxd_ep, peer)->peer_addr; + ack->base_hdr.seq_no = rxd_peer(rxd_ep, peer)->rx_seq_no; + ack->ext_hdr.rx_id = rxd_peer(rxd_ep, peer)->rx_window; + rxd_peer(rxd_ep, peer)->last_tx_ack = ack->base_hdr.seq_no; dlist_insert_tail(&pkt_entry->d_entry, &rxd_ep->ctrl_pkts); - if (rxd_ep_send_pkt(rxd_ep, pkt_entry)) { - dlist_remove(&pkt_entry->d_entry); - ofi_buf_free(pkt_entry); - } + if (rxd_ep_send_pkt(rxd_ep, pkt_entry)) + rxd_remove_free_pkt_entry(pkt_entry); } static void rxd_ep_free_res(struct rxd_ep *ep) @@ -625,21 +635,26 @@ static void rxd_close_peer(struct rxd_ep *ep, struct rxd_peer *peer) peer->active = 0; } -static void rxd_cleanup_unexp_msg(struct dlist_entry *list) +void rxd_cleanup_unexp_msg(struct rxd_unexp_msg *unexp_msg) { - struct rxd_unexp_msg *unexp_msg; struct rxd_pkt_entry *pkt_entry; + while (!dlist_empty(&unexp_msg->pkt_list)) { + dlist_pop_front(&unexp_msg->pkt_list, struct rxd_pkt_entry, + pkt_entry, d_entry); + ofi_buf_free(pkt_entry); + } + + rxd_free_unexp_msg(unexp_msg); +} + +static void rxd_cleanup_unexp_msg_list(struct dlist_entry *list) +{ + struct rxd_unexp_msg *unexp_msg; while (!dlist_empty(list)) { dlist_pop_front(list, struct rxd_unexp_msg, unexp_msg, entry); - while (!dlist_empty(&unexp_msg->pkt_list)) { - dlist_pop_front(&unexp_msg->pkt_list, struct rxd_pkt_entry, - pkt_entry, d_entry); - ofi_buf_free(pkt_entry); - } - ofi_buf_free(unexp_msg->pkt_entry); - free(unexp_msg); + rxd_cleanup_unexp_msg(unexp_msg); } } @@ -672,8 +687,8 @@ static int rxd_ep_close(struct fid *fid) ofi_buf_free(pkt_entry); } - rxd_cleanup_unexp_msg(&ep->unexp_list); - rxd_cleanup_unexp_msg(&ep->unexp_tag_list); + rxd_cleanup_unexp_msg_list(&ep->unexp_list); + 
rxd_cleanup_unexp_msg_list(&ep->unexp_tag_list); while (!dlist_empty(&ep->ctrl_pkts)) { dlist_pop_front(&ep->ctrl_pkts, struct rxd_pkt_entry, @@ -681,6 +696,7 @@ static int rxd_ep_close(struct fid *fid) ofi_buf_free(pkt_entry); } + ofi_idm_reset(&(ep->peers_idm), free); rxd_ep_free_res(ep); ofi_endpoint_close(&ep->util_ep); free(ep); @@ -699,13 +715,6 @@ static int rxd_ep_trywait(void *arg) return fi_trywait(rxd_fabric->dg_fabric, fids, 1); } -static int rxd_ep_wait_fd_add(struct rxd_ep *rxd_ep, struct util_wait *wait) -{ - return ofi_wait_fd_add(wait, rxd_ep->dg_cq_fd, FI_EPOLL_IN, - rxd_ep_trywait, rxd_ep, - &rxd_ep->util_ep.ep_fid.fid); -} - static int rxd_dg_cq_open(struct rxd_ep *rxd_ep, enum fi_wait_obj wait_obj) { struct rxd_domain *rxd_domain; @@ -776,7 +785,9 @@ static int rxd_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) } if (cq->wait) - ret = rxd_ep_wait_fd_add(ep, cq->wait); + ret = ofi_wait_add_fd(cq->wait, ep->dg_cq_fd, POLLIN, + rxd_ep_trywait, ep, + &ep->util_ep.ep_fid.fid); break; case FI_CLASS_EQ: break; @@ -806,7 +817,9 @@ static int rxd_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) return ret; if (cntr->wait) - ret = rxd_ep_wait_fd_add(ep, cntr->wait); + ret = ofi_wait_add_fd(cntr->wait, ep->dg_cq_fd, + POLLIN, rxd_ep_trywait, ep, + &ep->util_ep.ep_fid.fid); break; default: FI_WARN(&rxd_prov, FI_LOG_EP_CTRL, @@ -907,7 +920,7 @@ static void rxd_progress_pkt_list(struct rxd_ep *ep, struct rxd_peer *peer) uint64_t current; int ret, retry = 0; - current = fi_gettime_ms(); + current = ofi_gettime_ms(); if (peer->retry_cnt > RXD_MAX_PKT_RETRY) { rxd_peer_timeout(ep, peer); return; @@ -993,7 +1006,7 @@ static int rxd_buf_region_alloc_fn(struct ofi_bufpool_region *region) ret = fi_mr_reg(rxd_ep_domain(pool->rxd_ep)->dg_domain, region->mem_region, region->pool->region_size, - FI_SEND | FI_RECV, 0, 0, 0, &mr, NULL); + FI_SEND | FI_RECV, 0, 0, OFI_MR_NOCACHE, &mr, NULL); region->context = mr; return ret; @@ -1131,23 +1144,38 @@ int rxd_ep_init_res(struct rxd_ep *ep, struct fi_info *fi_info) return ret; } -static void rxd_init_peer(struct rxd_ep *ep, uint64_t rxd_addr) +int rxd_create_peer(struct rxd_ep *ep, uint64_t rxd_addr) { - ep->peers[rxd_addr].peer_addr = FI_ADDR_UNSPEC; - ep->peers[rxd_addr].tx_seq_no = 0; - ep->peers[rxd_addr].rx_seq_no = 0; - ep->peers[rxd_addr].last_rx_ack = 0; - ep->peers[rxd_addr].last_tx_ack = 0; - ep->peers[rxd_addr].rx_window = rxd_env.max_unacked; - ep->peers[rxd_addr].tx_window = rxd_env.max_unacked; - ep->peers[rxd_addr].unacked_cnt = 0; - ep->peers[rxd_addr].retry_cnt = 0; - ep->peers[rxd_addr].active = 0; - dlist_init(&ep->peers[rxd_addr].unacked); - dlist_init(&ep->peers[rxd_addr].tx_list); - dlist_init(&ep->peers[rxd_addr].rx_list); - dlist_init(&ep->peers[rxd_addr].rma_rx_list); - dlist_init(&ep->peers[rxd_addr].buf_pkts); + + struct rxd_peer *peer; + + peer = calloc(1, sizeof(struct rxd_peer)); + if (!peer) + return -FI_ENOMEM; + + peer->peer_addr = RXD_ADDR_INVALID; + peer->tx_seq_no = 0; + peer->rx_seq_no = 0; + peer->last_rx_ack = 0; + peer->last_tx_ack = 0; + peer->rx_window = rxd_env.max_unacked; + peer->tx_window = rxd_env.max_unacked; + peer->unacked_cnt = 0; + peer->retry_cnt = 0; + peer->active = 0; + dlist_init(&(peer->unacked)); + dlist_init(&(peer->tx_list)); + dlist_init(&(peer->rx_list)); + dlist_init(&(peer->rma_rx_list)); + dlist_init(&(peer->buf_pkts)); + + if (ofi_idm_set(&(ep->peers_idm), rxd_addr, peer) < 0) + goto err; + + return 0; +err: + free(peer); + return -FI_ENOMEM; } int 
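/*
 * rxd_create_peer() above replaces the fixed peers[] array that used to
 * be sized at endpoint allocation: per-peer state now sits behind an
 * index map, is allocated the first time a peer is addressed, and is
 * released in one pass at endpoint close (ofi_idm_reset with free).
 * The toy map below shows the same create-on-first-use idea; the real
 * ofi_idm_* API is a two-level structure and is not reproduced here.
 * All demo_* names are illustrative, and the map is assumed to start
 * out zero-initialized.
 */
#include <stdlib.h>
#include <string.h>

#define DEMO_CHUNK	256

struct demo_peer {
	int active;
	unsigned tx_seq_no;
	/* ... remaining per-peer protocol state ... */
};

struct demo_peer_map {
	struct demo_peer **slots;	/* NULL slot == peer not created */
	size_t nslots;
};

static struct demo_peer *demo_peer_lookup(struct demo_peer_map *map,
					  size_t addr)
{
	return addr < map->nslots ? map->slots[addr] : NULL;
}

static struct demo_peer *demo_peer_get(struct demo_peer_map *map, size_t addr)
{
	struct demo_peer **tmp;
	struct demo_peer *peer;
	size_t n;

	peer = demo_peer_lookup(map, addr);
	if (peer)
		return peer;

	if (addr >= map->nslots) {
		n = (addr / DEMO_CHUNK + 1) * DEMO_CHUNK;
		tmp = realloc(map->slots, n * sizeof(*tmp));
		if (!tmp)
			return NULL;
		memset(tmp + map->nslots, 0,
		       (n - map->nslots) * sizeof(*tmp));
		map->slots = tmp;
		map->nslots = n;
	}

	peer = calloc(1, sizeof(*peer));	/* created on first use */
	if (peer)
		map->slots[addr] = peer;
	return peer;
}

static void demo_peer_map_free(struct demo_peer_map *map)
{
	size_t i;

	for (i = 0; i < map->nslots; i++)
		free(map->slots[i]);
	free(map->slots);
	map->slots = NULL;
	map->nslots = 0;
}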
rxd_endpoint(struct fid_domain *domain, struct fi_info *info, @@ -1156,10 +1184,9 @@ int rxd_endpoint(struct fid_domain *domain, struct fi_info *info, struct fi_info *dg_info; struct rxd_domain *rxd_domain; struct rxd_ep *rxd_ep; - int ret, i; + int ret; - rxd_ep = calloc(1, sizeof(*rxd_ep) + sizeof(struct rxd_peer) * - rxd_env.max_peers); + rxd_ep = calloc(1, sizeof(*rxd_ep)); if (!rxd_ep) return -FI_ENOMEM; @@ -1172,7 +1199,7 @@ int rxd_endpoint(struct fid_domain *domain, struct fi_info *info, goto err1; ret = ofi_get_core_info(rxd_domain->util_domain.fabric->fabric_fid.api_version, - NULL, NULL, 0, &rxd_util_prov, info, + NULL, NULL, 0, &rxd_util_prov, info, NULL, rxd_info_to_core, &dg_info); if (ret) goto err2; @@ -1201,8 +1228,7 @@ int rxd_endpoint(struct fid_domain *domain, struct fi_info *info, if (ret) goto err3; - for (i = 0; i < rxd_env.max_peers; rxd_init_peer(rxd_ep, i++)) - ; + memset(&(rxd_ep->peers_idm), 0, sizeof(rxd_ep->peers_idm)); rxd_ep->util_ep.ep_fid.fid.ops = &rxd_ep_fi_ops; rxd_ep->util_ep.ep_fid.cm = &rxd_ep_cm; diff --git a/prov/rxd/src/rxd_init.c b/prov/rxd/src/rxd_init.c index 2662a856276..0969bff862d 100644 --- a/prov/rxd/src/rxd_init.c +++ b/prov/rxd/src/rxd_init.c @@ -77,7 +77,7 @@ void rxd_info_to_core_mr_modes(uint32_t version, const struct fi_info *hints, } int rxd_info_to_core(uint32_t version, const struct fi_info *rxd_info, - struct fi_info *core_info) + const struct fi_info *base_info, struct fi_info *core_info) { rxd_info_to_core_mr_modes(version, rxd_info, core_info); core_info->caps = FI_MSG; @@ -88,9 +88,10 @@ int rxd_info_to_core(uint32_t version, const struct fi_info *rxd_info, } int rxd_info_to_rxd(uint32_t version, const struct fi_info *core_info, - struct fi_info *info) + const struct fi_info *base_info, struct fi_info *info) { - info->caps = rxd_info.caps; + info->caps = ofi_pick_core_flags(rxd_info.caps, core_info->caps, + FI_LOCAL_COMM | FI_REMOTE_COMM); info->mode = rxd_info.mode; *info->tx_attr = *rxd_info.tx_attr; @@ -103,6 +104,9 @@ int rxd_info_to_rxd(uint32_t version, const struct fi_info *core_info, *info->rx_attr = *rxd_info.rx_attr; *info->ep_attr = *rxd_info.ep_attr; *info->domain_attr = *rxd_info.domain_attr; + info->domain_attr->caps = ofi_pick_core_flags(rxd_info.domain_attr->caps, + core_info->domain_attr->caps, + FI_LOCAL_COMM | FI_REMOTE_COMM); if (core_info->nic) { info->nic = ofi_nic_dup(core_info->nic); if (!info->nic) @@ -126,8 +130,8 @@ static void rxd_fini(void) struct fi_provider rxd_prov = { .name = OFI_UTIL_PREFIX "rxd", - .version = FI_VERSION(RXD_MAJOR_VERSION, RXD_MINOR_VERSION), - .fi_version = FI_VERSION(1, 8), + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, .getinfo = rxd_getinfo, .fabric = rxd_fabric, .cleanup = rxd_fini diff --git a/prov/rxd/src/rxd_msg.c b/prov/rxd/src/rxd_msg.c index 53edbbaeef7..58239d7e829 100644 --- a/prov/rxd/src/rxd_msg.c +++ b/prov/rxd/src/rxd_msg.c @@ -77,7 +77,7 @@ static void rxd_progress_unexp_msg(struct rxd_ep *ep, struct rxd_x_entry *rx_ent { struct rxd_pkt_entry *pkt_entry; uint64_t num_segs = 0; - uint16_t curr_id = ep->peers[unexp_msg->base_hdr->peer].curr_rx_id; + uint16_t curr_id = rxd_peer(ep, unexp_msg->base_hdr->peer)->curr_rx_id; rxd_progress_op(ep, rx_entry, unexp_msg->pkt_entry, unexp_msg->base_hdr, unexp_msg->sar_hdr, unexp_msg->tag_hdr, @@ -93,16 +93,14 @@ static void rxd_progress_unexp_msg(struct rxd_ep *ep, struct rxd_x_entry *rx_ent num_segs++; } - if (ep->peers[unexp_msg->base_hdr->peer].curr_unexp) { + if (rxd_peer(ep, 
unexp_msg->base_hdr->peer)->curr_unexp) { if (!unexp_msg->sar_hdr || num_segs == unexp_msg->sar_hdr->num_segs - 1) - ep->peers[unexp_msg->base_hdr->peer].curr_rx_id = curr_id; + rxd_peer(ep, unexp_msg->base_hdr->peer)->curr_rx_id = curr_id; else - ep->peers[unexp_msg->base_hdr->peer].curr_unexp = NULL; + rxd_peer(ep, unexp_msg->base_hdr->peer)->curr_unexp = NULL; } - ofi_buf_free(unexp_msg->pkt_entry); - dlist_remove(&unexp_msg->entry); - free(unexp_msg); + rxd_free_unexp_msg(unexp_msg); } static int rxd_progress_unexp_list(struct rxd_ep *ep, @@ -139,15 +137,15 @@ static int rxd_progress_unexp_list(struct rxd_ep *ep, static int rxd_ep_discard_recv(struct rxd_ep *rxd_ep, void *context, struct rxd_unexp_msg *unexp_msg) { - struct rxd_pkt_entry *pkt_entry; uint64_t seq = unexp_msg->base_hdr->seq_no; int ret; assert(unexp_msg->tag_hdr); seq += unexp_msg->sar_hdr ? unexp_msg->sar_hdr->num_segs : 1; - rxd_ep->peers[unexp_msg->base_hdr->peer].rx_seq_no = - MAX(seq, rxd_ep->peers[unexp_msg->base_hdr->peer].rx_seq_no); + rxd_peer(rxd_ep, unexp_msg->base_hdr->peer)->rx_seq_no = + MAX(seq, rxd_peer(rxd_ep, + unexp_msg->base_hdr->peer)->rx_seq_no); rxd_ep_send_ack(rxd_ep, unexp_msg->base_hdr->peer); ret = ofi_cq_write(rxd_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, @@ -155,15 +153,7 @@ static int rxd_ep_discard_recv(struct rxd_ep *rxd_ep, void *context, unexp_msg->data_hdr->cq_data : 0, unexp_msg->tag_hdr->tag); - while (!dlist_empty(&unexp_msg->pkt_list)) { - dlist_pop_front(&unexp_msg->pkt_list, struct rxd_pkt_entry, - pkt_entry, d_entry); - ofi_buf_free(pkt_entry); - } - - ofi_buf_free(unexp_msg->pkt_entry); - dlist_remove(&unexp_msg->entry); - free(unexp_msg); + rxd_cleanup_unexp_msg(unexp_msg); return ret; } @@ -212,11 +202,14 @@ ssize_t rxd_ep_generic_recvmsg(struct rxd_ep *rxd_ep, const struct iovec *iov, struct rxd_x_entry *rx_entry; struct dlist_entry *unexp_list, *rx_list; struct rxd_unexp_msg *unexp_msg; + fi_addr_t rxd_addr = RXD_ADDR_INVALID; + assert(iov_count <= RXD_IOV_LIMIT); assert(!(rxd_flags & RXD_MULTI_RECV) || iov_count == 1); assert(!(flags & FI_PEEK) || op == RXD_TAGGED); + fastlock_acquire(&rxd_ep->util_ep.lock); if (ofi_cirque_isfull(rxd_ep->util_ep.rx_cq->cirq)) { @@ -231,19 +224,22 @@ ssize_t rxd_ep_generic_recvmsg(struct rxd_ep *rxd_ep, const struct iovec *iov, unexp_list = &rxd_ep->unexp_list; rx_list = &rxd_ep->rx_list; } + + if (rxd_ep->util_ep.caps & FI_DIRECTED_RECV && + addr != FI_ADDR_UNSPEC) { + rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx), + RXD_IDX_OFFSET(addr)); + } if (flags & FI_PEEK) { - ret = rxd_peek_recv(rxd_ep, addr, tag, ignore, context, flags, + ret = rxd_peek_recv(rxd_ep, rxd_addr, tag, ignore, context, flags, unexp_list); goto out; } - if (!(flags & FI_DISCARD)) { - rx_entry = rxd_rx_entry_init(rxd_ep, iov, iov_count, tag, ignore, context, - (rxd_ep->util_ep.caps & FI_DIRECTED_RECV && - addr != FI_ADDR_UNSPEC) ? 
- rxd_ep_av(rxd_ep)->fi_addr_table[addr] : - FI_ADDR_UNSPEC, op, rxd_flags); + + rx_entry = rxd_rx_entry_init(rxd_ep, iov, iov_count, tag, ignore, + context, rxd_addr, op, rxd_flags); if (!rx_entry) { ret = -FI_EAGAIN; } else if (flags & FI_CLAIM) { @@ -320,14 +316,10 @@ static struct rxd_x_entry *rxd_tx_entry_init_msg(struct rxd_ep *ep, fi_addr_t ad void *ptr; tx_entry = rxd_tx_entry_init_common(ep, addr, op, iov, iov_count, - tag, data, flags, context); + tag, data, flags, context, &base_hdr, &ptr); if (!tx_entry) return NULL; - base_hdr = rxd_get_base_hdr(tx_entry->pkt); - ptr = (void *) base_hdr; - rxd_init_base_hdr(ep, &ptr, tx_entry); - max_inline = rxd_domain->max_inline_msg; if (tx_entry->flags & RXD_TAG_HDR) { @@ -352,8 +344,7 @@ static struct rxd_x_entry *rxd_tx_entry_init_msg(struct rxd_ep *ep, fi_addr_t ad tx_entry->cq_entry.len, max_inline); - tx_entry->pkt->pkt_size = ((char *) ptr - (char *) base_hdr) + - ep->tx_prefix_size; + tx_entry->pkt->pkt_size = rxd_pkt_size(ep, base_hdr, ptr); return tx_entry; } @@ -374,8 +365,12 @@ ssize_t rxd_ep_generic_inject(struct rxd_ep *rxd_ep, const struct iovec *iov, if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq)) goto out; + + rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx), + RXD_IDX_OFFSET(addr)); + if (!rxd_addr) + goto out; - rxd_addr = rxd_ep_av(rxd_ep)->fi_addr_table[addr]; ret = rxd_send_rts_if_needed(rxd_ep, rxd_addr); if (ret) goto out; @@ -387,7 +382,7 @@ ssize_t rxd_ep_generic_inject(struct rxd_ep *rxd_ep, const struct iovec *iov, goto out; } - if (rxd_ep->peers[rxd_addr].peer_addr != FI_ADDR_UNSPEC) + if (rxd_peer(rxd_ep, rxd_addr)->peer_addr != RXD_ADDR_INVALID) (void) rxd_start_xfer(rxd_ep, tx_entry); out: @@ -414,8 +409,12 @@ ssize_t rxd_ep_generic_sendmsg(struct rxd_ep *rxd_ep, const struct iovec *iov, if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq)) goto out; - - rxd_addr = rxd_ep_av(rxd_ep)->fi_addr_table[addr]; + + rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx), + RXD_IDX_OFFSET(addr)); + if (!rxd_addr) + goto out; + ret = rxd_send_rts_if_needed(rxd_ep, rxd_addr); if (ret) goto out; @@ -425,7 +424,7 @@ ssize_t rxd_ep_generic_sendmsg(struct rxd_ep *rxd_ep, const struct iovec *iov, if (!tx_entry) goto out; - if (rxd_ep->peers[rxd_addr].peer_addr == FI_ADDR_UNSPEC) + if (rxd_peer(rxd_ep, rxd_addr)->peer_addr == RXD_ADDR_INVALID) goto out; ret = rxd_start_xfer(rxd_ep, tx_entry); diff --git a/prov/rxd/src/rxd_rma.c b/prov/rxd/src/rxd_rma.c index ea4794d73f1..3b3cb1172fe 100644 --- a/prov/rxd/src/rxd_rma.c +++ b/prov/rxd/src/rxd_rma.c @@ -48,14 +48,10 @@ static struct rxd_x_entry *rxd_tx_entry_init_rma(struct rxd_ep *ep, fi_addr_t ad void *ptr; tx_entry = rxd_tx_entry_init_common(ep, addr, op, iov, iov_count, 0, - data, flags, context); + data, flags, context, &base_hdr, &ptr); if (!tx_entry) return NULL; - base_hdr = rxd_get_base_hdr(tx_entry->pkt); - ptr = (void *) base_hdr; - rxd_init_base_hdr(ep, &ptr, tx_entry); - if (tx_entry->cq_entry.flags & FI_READ) { tx_entry->num_segs = ofi_div_ceil(tx_entry->cq_entry.len, rxd_domain->max_seg_sz); @@ -81,8 +77,8 @@ static struct rxd_x_entry *rxd_tx_entry_init_rma(struct rxd_ep *ep, fi_addr_t ad tx_entry->iov_count, tx_entry->cq_entry.len, max_inline); } - tx_entry->pkt->pkt_size = ((char *) ptr - (char *) base_hdr) + - ep->tx_prefix_size; + + tx_entry->pkt->pkt_size = rxd_pkt_size(ep, base_hdr, ptr); return tx_entry; } @@ -105,7 +101,10 @@ static ssize_t rxd_generic_write_inject(struct rxd_ep *rxd_ep, if 
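
/*
 * Hypothetical body of the rxd_pkt_size() helper that the message and RMA
 * hunks above switch to; the real definition is in rxd.h, outside this
 * patch.  It simply factors out the expression being removed: the header
 * bytes written so far plus the datagram endpoint's tx prefix.
 */
static inline size_t rxd_pkt_size(struct rxd_ep *ep,
				  struct rxd_base_hdr *base_hdr, void *ptr)
{
	return ((char *) ptr - (char *) base_hdr) + ep->tx_prefix_size;
}
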
(ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq)) goto out; - rxd_addr = rxd_ep_av(rxd_ep)->fi_addr_table[addr]; + rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx), + RXD_IDX_OFFSET(addr)); + if (!rxd_addr) + goto out; ret = rxd_send_rts_if_needed(rxd_ep, rxd_addr); if (ret) goto out; @@ -118,7 +117,7 @@ static ssize_t rxd_generic_write_inject(struct rxd_ep *rxd_ep, goto out; } - if (rxd_ep->peers[rxd_addr].peer_addr == FI_ADDR_UNSPEC) + if (rxd_peer(rxd_ep, rxd_addr)->peer_addr == RXD_ADDR_INVALID) goto out; ret = rxd_start_xfer(rxd_ep, tx_entry); @@ -131,7 +130,8 @@ static ssize_t rxd_generic_write_inject(struct rxd_ep *rxd_ep, return ret; } -ssize_t rxd_generic_rma(struct rxd_ep *rxd_ep, const struct iovec *iov, +static ssize_t +rxd_generic_rma(struct rxd_ep *rxd_ep, const struct iovec *iov, size_t iov_count, const struct fi_rma_iov *rma_iov, size_t rma_count, void **desc, fi_addr_t addr, void *context, uint32_t op, uint64_t data, uint32_t rxd_flags) @@ -151,8 +151,11 @@ ssize_t rxd_generic_rma(struct rxd_ep *rxd_ep, const struct iovec *iov, if (ofi_cirque_isfull(rxd_ep->util_ep.tx_cq->cirq)) goto out; + rxd_addr = (intptr_t) ofi_idx_lookup(&(rxd_ep_av(rxd_ep)->fi_addr_idx), + RXD_IDX_OFFSET(addr)); + if (!rxd_addr) + goto out; - rxd_addr = rxd_ep_av(rxd_ep)->fi_addr_table[addr]; ret = rxd_send_rts_if_needed(rxd_ep, rxd_addr); if (ret) goto out; @@ -165,7 +168,7 @@ ssize_t rxd_generic_rma(struct rxd_ep *rxd_ep, const struct iovec *iov, goto out; } - if (rxd_ep->peers[rxd_addr].peer_addr == FI_ADDR_UNSPEC) + if (rxd_peer(rxd_ep, rxd_addr)->peer_addr == RXD_ADDR_INVALID) goto out; ret = rxd_start_xfer(rxd_ep, tx_entry); @@ -178,7 +181,8 @@ ssize_t rxd_generic_rma(struct rxd_ep *rxd_ep, const struct iovec *iov, return ret; } -ssize_t rxd_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, +static ssize_t +rxd_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { struct rxd_ep *ep; @@ -193,12 +197,13 @@ ssize_t rxd_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, rma_iov.len = len; rma_iov.key = key; - return rxd_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc, + return rxd_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc, src_addr, context, RXD_READ_REQ, 0, ep->tx_flags); } -ssize_t rxd_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, +static ssize_t +rxd_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { @@ -216,7 +221,8 @@ ssize_t rxd_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, ep->tx_flags); } -ssize_t rxd_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, +static ssize_t +rxd_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) { struct rxd_ep *ep; @@ -230,7 +236,8 @@ ssize_t rxd_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, ep->util_ep.tx_msg_flags)); } -ssize_t rxd_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, +static ssize_t +rxd_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { struct rxd_ep *ep; @@ -245,12 +252,13 @@ ssize_t rxd_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc rma_iov.len = len; rma_iov.key = key; - return rxd_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc, + return rxd_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc, dest_addr, 
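
/*
 * A sketch (with an invented helper name) of the address-translation step
 * the send, inject, and RMA paths above now share.  The AV keeps
 * fi_addr -> rxd_addr mappings in an indexer (fi_addr_idx) rather than a
 * flat fi_addr_table, so a lookup can miss; callers treat a zero result as
 * "address was never inserted into this AV" and fail the operation instead
 * of indexing past the end of a table.  rxd_ep_av() and RXD_IDX_OFFSET()
 * come from rxd.h.
 */
static fi_addr_t example_lookup_rxd_addr(struct rxd_ep *ep, fi_addr_t fi_addr)
{
	return (intptr_t) ofi_idx_lookup(&rxd_ep_av(ep)->fi_addr_idx,
					 RXD_IDX_OFFSET(fi_addr));
}
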
context, RXD_WRITE, 0, ep->tx_flags); } -ssize_t rxd_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, +static ssize_t +rxd_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { @@ -268,8 +276,8 @@ ssize_t rxd_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, ep->tx_flags); } - -ssize_t rxd_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, +static ssize_t +rxd_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) { struct rxd_ep *ep; @@ -283,7 +291,8 @@ ssize_t rxd_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, ep->util_ep.tx_msg_flags)); } -ssize_t rxd_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, +static ssize_t +rxd_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { @@ -304,7 +313,8 @@ ssize_t rxd_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, ep->tx_flags | RXD_REMOTE_CQ_DATA); } -ssize_t rxd_inject_write(struct fid_ep *ep_fid, const void *buf, +static ssize_t +rxd_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { struct rxd_ep *rxd_ep; @@ -324,9 +334,10 @@ ssize_t rxd_inject_write(struct fid_ep *ep_fid, const void *buf, RXD_NO_TX_COMP | RXD_INJECT); } -ssize_t rxd_inject_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, - uint64_t data, fi_addr_t dest_addr, uint64_t addr, - uint64_t key) +static ssize_t +rxd_inject_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, + uint64_t data, fi_addr_t dest_addr, uint64_t addr, + uint64_t key) { struct rxd_ep *rxd_ep; struct iovec iov; diff --git a/prov/rxm/src/rxm.h b/prov/rxm/src/rxm.h index ab1898f7fc9..8a20882568a 100644 --- a/prov/rxm/src/rxm.h +++ b/prov/rxm/src/rxm.h @@ -2,6 +2,7 @@ /* * Copyright (c) 2016 Intel Corporation, Inc. All rights reserved. * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -53,23 +54,19 @@ #include #include #include +#include #ifndef _RXM_H_ #define _RXM_H_ -#endif - -#define RXM_MAJOR_VERSION 1 -#define RXM_MINOR_VERSION 0 #define RXM_CM_DATA_VERSION 1 #define RXM_OP_VERSION 3 #define RXM_CTRL_VERSION 4 -#define RXM_BUF_SIZE 16384 extern size_t rxm_eager_limit; +extern size_t rxm_buffer_size; -#define RXM_SAR_LIMIT 131072 #define RXM_SAR_TX_ERROR UINT64_MAX #define RXM_SAR_RX_INIT UINT64_MAX @@ -130,8 +127,14 @@ extern struct fi_ops_atomic rxm_ops_atomic; extern size_t rxm_msg_tx_size; extern size_t rxm_msg_rx_size; -extern size_t rxm_def_univ_size; extern size_t rxm_cm_progress_interval; +extern size_t rxm_cq_eq_fairness; +extern int force_auto_progress; +extern int rxm_use_write_rndv; +extern enum fi_wait_obj def_wait_obj, def_tcp_wait_obj; + +struct rxm_ep; + /* * Connection Map @@ -149,7 +152,6 @@ enum rxm_cmap_signal { FUNC(RXM_CMAP_IDLE), \ FUNC(RXM_CMAP_CONNREQ_SENT), \ FUNC(RXM_CMAP_CONNREQ_RECV), \ - FUNC(RXM_CMAP_CONNECTED_NOTIFY),\ FUNC(RXM_CMAP_CONNECTED), \ FUNC(RXM_CMAP_SHUTDOWN), \ @@ -189,12 +191,10 @@ struct rxm_cmap_peer { struct rxm_cmap_attr { void *name; - /* user guarantee for serializing access to cmap objects */ - uint8_t serial_access; }; struct rxm_cmap { - struct util_ep *ep; + struct rxm_ep *ep; struct util_av *av; /* cmap handles that correspond to addresses in AV */ @@ -215,8 +215,6 @@ struct rxm_cmap { fastlock_t lock; }; -struct rxm_ep; - enum rxm_cmap_reject_reason { RXM_CMAP_REJECT_UNSPEC, RXM_CMAP_REJECT_GENUINE, @@ -247,11 +245,12 @@ union rxm_cm_data { } reject; }; +int rxm_cmap_alloc_handle(struct rxm_cmap *cmap, fi_addr_t fi_addr, + enum rxm_cmap_state state, + struct rxm_cmap_handle **handle); struct rxm_cmap_handle *rxm_cmap_key2handle(struct rxm_cmap *cmap, uint64_t key); int rxm_cmap_update(struct rxm_cmap *cmap, const void *addr, fi_addr_t fi_addr); -void rxm_cmap_process_conn_notify(struct rxm_cmap *cmap, - struct rxm_cmap_handle *handle); void rxm_cmap_process_reject(struct rxm_cmap *cmap, struct rxm_cmap_handle *handle, enum rxm_cmap_reject_reason cm_reject_reason); @@ -259,7 +258,6 @@ void rxm_cmap_process_shutdown(struct rxm_cmap *cmap, struct rxm_cmap_handle *handle); int rxm_cmap_connect(struct rxm_ep *rxm_ep, fi_addr_t fi_addr, struct rxm_cmap_handle *handle); -void rxm_cmap_del_handle_ts(struct rxm_cmap_handle *handle); void rxm_cmap_free(struct rxm_cmap *cmap); int rxm_cmap_alloc(struct rxm_ep *rxm_ep, struct rxm_cmap_attr *attr); int rxm_cmap_remove(struct rxm_cmap *cmap, int index); @@ -281,7 +279,12 @@ struct rxm_domain { struct util_domain util_domain; struct fid_domain *msg_domain; size_t max_atomic_size; - uint8_t mr_local; + size_t rx_post_size; + uint64_t mr_key; + bool dyn_rbuf; + struct ofi_ops_flow_ctrl *flow_ctrl_ops; + struct ofi_bufpool *amo_bufpool; + fastlock_t amo_bufpool_lock; }; int rxm_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, @@ -291,8 +294,23 @@ struct rxm_mr { struct fid_mr mr_fid; struct fid_mr *msg_mr; struct rxm_domain *domain; + enum fi_hmem_iface iface; + uint64_t device; + fastlock_t amo_lock; }; +static inline enum fi_hmem_iface +rxm_mr_desc_to_hmem_iface_dev(void **desc, size_t count, uint64_t *device) +{ + if (!count || !desc || !desc[0]) { + *device = 0; + return FI_HMEM_SYSTEM; + } + + *device = ((struct rxm_mr *) desc[0])->device; + return ((struct rxm_mr *) desc[0])->iface; +} + struct rxm_rndv_hdr { struct ofi_rma_iov iov[RXM_IOV_LIMIT]; uint8_t count; @@ -337,11 +355,19 @@ struct 
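
/*
 * A usage sketch for rxm_mr_desc_to_hmem_iface_dev() defined above: the
 * registration descriptors the application passes down are struct rxm_mr
 * pointers, so the first descriptor (if any) identifies the HMEM iface and
 * device, and payload copies then go through ofi_copy_from_hmem_iov()
 * instead of ofi_copy_from_iov(), as the atomic path later in this patch
 * does.  Only the wrapper name example_pack_payload() is invented.
 */
static ssize_t example_pack_payload(void *dst, size_t len, void **desc,
				    const struct iovec *iov, size_t count)
{
	enum fi_hmem_iface iface;
	uint64_t device;

	iface = rxm_mr_desc_to_hmem_iface_dev(desc, count, &device);
	return ofi_copy_from_hmem_iov(dst, len, iface, device, iov, count, 0);
}
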
rxm_atomic_resp_hdr { FUNC(RXM_RMA), \ FUNC(RXM_RX), \ FUNC(RXM_SAR_TX), \ + FUNC(RXM_CREDIT_TX), \ FUNC(RXM_RNDV_TX), \ - FUNC(RXM_RNDV_ACK_WAIT), \ + FUNC(RXM_RNDV_READ_DONE_WAIT), \ + FUNC(RXM_RNDV_WRITE_DATA_WAIT), \ + FUNC(RXM_RNDV_WRITE_DONE_WAIT), \ FUNC(RXM_RNDV_READ), \ - FUNC(RXM_RNDV_ACK_SENT), \ - FUNC(RXM_RNDV_ACK_RECVD), \ + FUNC(RXM_RNDV_WRITE), \ + FUNC(RXM_RNDV_READ_DONE_SENT), \ + FUNC(RXM_RNDV_READ_DONE_RECVD), \ + FUNC(RXM_RNDV_WRITE_DATA_SENT), \ + FUNC(RXM_RNDV_WRITE_DATA_RECVD),\ + FUNC(RXM_RNDV_WRITE_DONE_SENT), \ + FUNC(RXM_RNDV_WRITE_DONE_RECVD),\ FUNC(RXM_RNDV_FINISH), \ FUNC(RXM_ATOMIC_RESP_WAIT), \ FUNC(RXM_ATOMIC_RESP_SENT) @@ -355,10 +381,13 @@ extern char *rxm_proto_state_str[]; enum { rxm_ctrl_eager, rxm_ctrl_seg, - rxm_ctrl_rndv, - rxm_ctrl_rndv_ack, + rxm_ctrl_rndv_req, + rxm_ctrl_rndv_rd_done, rxm_ctrl_atomic, rxm_ctrl_atomic_resp, + rxm_ctrl_credit, + rxm_ctrl_rndv_wr_data, + rxm_ctrl_rndv_wr_done }; struct rxm_pkt { @@ -372,7 +401,7 @@ union rxm_sar_ctrl_data { enum rxm_sar_seg_type { RXM_SAR_SEG_FIRST = 1, RXM_SAR_SEG_MIDDLE = 2, - RXM_SAR_SEG_LAST = 3, + RXM_SAR_SEG_LAST = 3, } seg_type : 2; uint32_t offset; }; @@ -415,9 +444,12 @@ enum rxm_buf_pool_type { RXM_BUF_POOL_TX, RXM_BUF_POOL_TX_START = RXM_BUF_POOL_TX, RXM_BUF_POOL_TX_INJECT, - RXM_BUF_POOL_TX_ACK, - RXM_BUF_POOL_TX_RNDV, + RXM_BUF_POOL_TX_RNDV_RD_DONE, + RXM_BUF_POOL_TX_RNDV_WR_DONE, + RXM_BUF_POOL_TX_RNDV_REQ, + RXM_BUF_POOL_TX_RNDV_WR_DATA, RXM_BUF_POOL_TX_ATOMIC, + RXM_BUF_POOL_TX_CREDIT, RXM_BUF_POOL_TX_SAR, RXM_BUF_POOL_TX_END = RXM_BUF_POOL_TX_SAR, RXM_BUF_POOL_RMA, @@ -439,19 +471,19 @@ struct rxm_rx_buf { struct rxm_ep *ep; /* MSG EP / shared context to which bufs would be posted to */ - struct fid_ep *msg_ep; + struct fid_ep *rx_ep; struct dlist_entry repost_entry; - struct rxm_conn *conn; + struct rxm_conn *conn; /* msg ep data was received on */ + /* if recv_entry is set, then we matched dyn rbuf */ struct rxm_recv_entry *recv_entry; struct rxm_unexp_msg unexp_msg; uint64_t comp_flags; struct fi_recv_context recv_context; - // TODO remove this and modify unexp msg handling path to not repost - // rx_buf - uint8_t repost; + bool repost; /* Used for large messages */ - struct rxm_rndv_hdr *rndv_hdr; + struct dlist_entry rndv_wait_entry; + struct rxm_rndv_hdr *remote_rndv_hdr; size_t rndv_rma_index; struct fid_mr *mr[RXM_IOV_LIMIT]; @@ -498,6 +530,16 @@ struct rxm_tx_rndv_buf { struct fid_mr *mr[RXM_IOV_LIMIT]; uint8_t count; + struct { + struct iovec iov[RXM_IOV_LIMIT]; + void *desc[RXM_IOV_LIMIT]; + struct rxm_conn *conn; + size_t rndv_rma_index; + size_t rndv_rma_count; + struct rxm_tx_base_buf *done_buf; + struct rxm_rndv_hdr remote_hdr; + } write_rndv; + /* Must stay at bottom */ struct rxm_pkt pkt; }; @@ -523,8 +565,7 @@ struct rxm_tx_atomic_buf { void *app_context; uint64_t flags; - struct iovec result_iov[RXM_IOV_LIMIT]; - uint8_t result_iov_count; + struct rxm_iov result_iov; /* Must stay at bottom */ struct rxm_pkt pkt; @@ -532,9 +573,12 @@ struct rxm_tx_atomic_buf { enum rxm_deferred_tx_entry_type { RXM_DEFERRED_TX_RNDV_ACK, + RXM_DEFERRED_TX_RNDV_DONE, RXM_DEFERRED_TX_RNDV_READ, + RXM_DEFERRED_TX_RNDV_WRITE, RXM_DEFERRED_TX_SAR_SEG, RXM_DEFERRED_TX_ATOMIC_RESP, + RXM_DEFERRED_TX_CREDIT_SEND, }; struct rxm_deferred_tx_entry { @@ -546,12 +590,21 @@ struct rxm_deferred_tx_entry { union { struct { struct rxm_rx_buf *rx_buf; + size_t pkt_size; } rndv_ack; + struct { + struct rxm_tx_rndv_buf *tx_buf; + } rndv_done; struct { struct rxm_rx_buf *rx_buf; struct fi_rma_iov 
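
/*
 * The RXM_PROTO_STATES(FUNC) list above is an X-macro: a single list expands
 * both into the protocol-state enum and into rxm_proto_state_str[], so the
 * rendezvous-write and credit states added here only have to be listed once.
 * A generic, self-contained illustration of the pattern with made-up names
 * (the real expansion macros live in the rxm/ofi headers):
 */
#define EXAMPLE_STATES(FUNC)	\
	FUNC(EX_IDLE),		\
	FUNC(EX_TX),		\
	FUNC(EX_RX)

#define EXAMPLE_ENUM_VAL(x)	x
#define EXAMPLE_STR(x)		#x

enum example_state { EXAMPLE_STATES(EXAMPLE_ENUM_VAL) };
static const char *example_state_str[] = { EXAMPLE_STATES(EXAMPLE_STR) };
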
rma_iov; struct rxm_iov rxm_iov; } rndv_read; + struct { + struct rxm_tx_rndv_buf *tx_buf; + struct fi_rma_iov rma_iov; + struct rxm_iov rxm_iov; + } rndv_write; struct { struct rxm_tx_sar_buf *cur_seg_tx_buf; struct { @@ -569,11 +622,16 @@ struct rxm_deferred_tx_entry { uint64_t msg_id; void *app_context; uint64_t flags; + enum fi_hmem_iface iface; + uint64_t device; } sar_seg; struct { struct rxm_tx_atomic_buf *tx_buf; ssize_t len; } atomic_resp; + struct { + struct rxm_tx_base_buf *tx_buf; + } credit_msg; }; }; @@ -588,10 +646,6 @@ struct rxm_recv_entry { uint64_t comp_flags; size_t total_len; struct rxm_recv_queue *recv_queue; - struct { - void *buf; - size_t len; - } multi_recv; /* Used for SAR protocol */ struct { @@ -606,7 +660,7 @@ struct rxm_recv_entry { struct rxm_tx_base_buf *tx_buf; } rndv; }; -DECLARE_FREESTACK(struct rxm_recv_entry, rxm_recv_fs); +OFI_DECLARE_FREESTACK(struct rxm_recv_entry, rxm_recv_fs); enum rxm_recv_queue_type { RXM_RECV_QUEUE_UNSPEC, @@ -615,13 +669,14 @@ enum rxm_recv_queue_type { }; struct rxm_recv_queue { - struct rxm_ep *rxm_ep; + struct rxm_ep *rxm_ep; enum rxm_recv_queue_type type; - struct rxm_recv_fs *fs; - struct dlist_entry recv_list; - struct dlist_entry unexp_msg_list; - dlist_func_t *match_recv; - dlist_func_t *match_unexp; + struct rxm_recv_fs *fs; + struct dlist_entry recv_list; + struct dlist_entry unexp_msg_list; + size_t dyn_rbuf_unexp_cnt; + dlist_func_t *match_recv; + dlist_func_t *match_unexp; }; struct rxm_buf_pool { @@ -645,6 +700,28 @@ struct rxm_msg_eq_entry { #define RXM_CM_ENTRY_SZ (sizeof(struct fi_eq_cm_entry) + \ sizeof(union rxm_cm_data)) +ssize_t rxm_get_dyn_rbuf(struct fi_cq_data_entry *entry, struct iovec *iov, + size_t *count); + +struct rxm_eager_ops { + void (*comp_tx)(struct rxm_ep *rxm_ep, + struct rxm_tx_eager_buf *tx_eager_buf); + void (*handle_rx)(struct rxm_rx_buf *rx_buf); +}; + +struct rxm_rndv_ops { + int rx_mr_access; + int tx_mr_access; + ssize_t (*handle_rx)(struct rxm_rx_buf *rx_buf); + ssize_t (*xfer)(struct fid_ep *ep, const struct iovec *iov, void **desc, + size_t count, fi_addr_t remote_addr, uint64_t addr, + uint64_t key, void *context); + ssize_t (*defer_xfer)(struct rxm_deferred_tx_entry **def_tx_entry, + size_t index, struct iovec *iov, + void *desc[RXM_IOV_LIMIT], size_t count, + void *buf); +}; + struct rxm_ep { struct util_ep util_ep; struct fi_info *rxm_info; @@ -656,23 +733,32 @@ struct rxm_ep { uint64_t msg_cq_last_poll; struct fid_ep *srx_ctx; size_t comp_per_progress; - int msg_mr_local; - int rxm_mr_local; + ofi_atomic32_t atomic_tx_credits; + int cq_eq_fairness; + + bool msg_mr_local; + bool rdm_mr_local; + bool do_progress; + bool enable_direct_send; + size_t min_multi_recv_size; size_t buffered_min; size_t buffered_limit; - size_t inject_limit; - size_t eager_limit; size_t sar_limit; struct rxm_buf_pool *buf_pools; struct dlist_entry repost_ready_list; struct dlist_entry deferred_tx_conn_queue; + struct dlist_entry rndv_wait_list; struct rxm_recv_queue recv_queue; struct rxm_recv_queue trecv_queue; + struct ofi_bufpool *multi_recv_pool; + + struct rxm_eager_ops *eager_ops; + struct rxm_rndv_ops *rndv_ops; }; struct rxm_conn { @@ -692,33 +778,28 @@ struct rxm_conn { struct dlist_entry sar_rx_msg_list; struct dlist_entry sar_deferred_rx_msg_list; - /* This is saved MSG EP fid, that hasn't been closed during - * handling of CONN_RECV in RXM_CMAP_CONNREQ_SENT for passive side */ - struct fid_ep *saved_msg_ep; uint32_t rndv_tx_credits; }; extern struct fi_provider rxm_prov; -extern struct 
fi_info rxm_info; extern struct fi_fabric_attr rxm_fabric_attr; extern struct fi_domain_attr rxm_domain_attr; extern struct fi_tx_attr rxm_tx_attr; extern struct fi_rx_attr rxm_rx_attr; - -#define rxm_ep_rx_flags(rxm_ep) ((rxm_ep)->util_ep.rx_op_flags) -#define rxm_ep_tx_flags(rxm_ep) ((rxm_ep)->util_ep.tx_op_flags) +extern struct rxm_rndv_ops rxm_rndv_ops_read; +extern struct rxm_rndv_ops rxm_rndv_ops_write; int rxm_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, void *context); int rxm_info_to_core(uint32_t version, const struct fi_info *rxm_info, - struct fi_info *core_info); + const struct fi_info *base_info, struct fi_info *core_info); int rxm_info_to_rxm(uint32_t version, const struct fi_info *core_info, - struct fi_info *info); + const struct fi_info *base_info, struct fi_info *info); int rxm_domain_open(struct fid_fabric *fabric, struct fi_info *info, struct fid_domain **dom, void *context); int rxm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq_fid, void *context); -ssize_t rxm_cq_handle_rx_buf(struct rxm_rx_buf *rx_buf); +ssize_t rxm_handle_rx_buf(struct rxm_rx_buf *rx_buf); int rxm_endpoint(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); @@ -726,23 +807,36 @@ int rxm_endpoint(struct fid_domain *domain, struct fi_info *info, int rxm_conn_cmap_alloc(struct rxm_ep *rxm_ep); void rxm_cq_write_error(struct util_cq *cq, struct util_cntr *cntr, void *op_context, int err); +void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err); +void rxm_handle_comp_error(struct rxm_ep *rxm_ep); +ssize_t rxm_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp); void rxm_ep_progress(struct util_ep *util_ep); +void rxm_ep_progress_coll(struct util_ep *util_ep); void rxm_ep_do_progress(struct util_ep *util_ep); -int rxm_msg_ep_prepost_recv(struct rxm_ep *rxm_ep, struct fid_ep *msg_ep); +void rxm_handle_eager(struct rxm_rx_buf *rx_buf); +void rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf); +void rxm_finish_eager_send(struct rxm_ep *rxm_ep, + struct rxm_tx_eager_buf *tx_eager_buf); +void rxm_finish_coll_eager_send(struct rxm_ep *rxm_ep, + struct rxm_tx_eager_buf *tx_eager_buf); + +int rxm_prepost_recv(struct rxm_ep *rxm_ep, struct fid_ep *rx_ep); int rxm_ep_query_atomic(struct fid_domain *domain, enum fi_datatype datatype, enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags); +ssize_t rxm_rndv_read(struct rxm_rx_buf *rx_buf); +ssize_t rxm_rndv_send_wr_data(struct rxm_rx_buf *rx_buf); +void rxm_rndv_hdr_init(struct rxm_ep *rxm_ep, void *buf, + const struct iovec *iov, size_t count, + struct fid_mr **mr); + static inline size_t rxm_ep_max_atomic_size(struct fi_info *info) { - size_t overhead = sizeof(struct rxm_atomic_hdr) + - sizeof(struct rxm_pkt); - - /* Must be set to eager size or less */ - return (info->tx_attr && info->tx_attr->inject_size > overhead) ? 
- info->tx_attr->inject_size - overhead : 0; + assert(rxm_eager_limit >= sizeof(struct rxm_atomic_hdr)); + return rxm_eager_limit - sizeof(struct rxm_atomic_hdr); } static inline ssize_t @@ -790,6 +884,15 @@ rxm_ep_enqueue_deferred_tx_queue(struct rxm_deferred_tx_entry *tx_entry) dlist_insert_tail(&tx_entry->entry, &tx_entry->rxm_conn->deferred_tx_queue); } +static inline void +rxm_ep_enqueue_deferred_tx_queue_priority(struct rxm_deferred_tx_entry *tx_entry) +{ + if (dlist_empty(&tx_entry->rxm_conn->deferred_tx_queue)) + dlist_insert_head(&tx_entry->rxm_conn->deferred_conn_entry, + &tx_entry->rxm_ep->deferred_tx_conn_queue); + dlist_insert_head(&tx_entry->entry, &tx_entry->rxm_conn->deferred_tx_queue); +} + static inline void rxm_ep_dequeue_deferred_tx_queue(struct rxm_deferred_tx_entry *tx_entry) { @@ -800,65 +903,13 @@ rxm_ep_dequeue_deferred_tx_queue(struct rxm_deferred_tx_entry *tx_entry) int rxm_conn_process_eq_events(struct rxm_ep *rxm_ep); -static inline void rxm_ep_msg_mr_closev(struct fid_mr **mr, size_t count) -{ - int ret; - size_t i; - - for (i = 0; i < count; i++) { - if (mr[i]) { - ret = fi_close(&mr[i]->fid); - if (ret) - FI_WARN(&rxm_prov, FI_LOG_EP_DATA, - "Unable to close msg mr: %zu\n", i); - mr[i] = NULL; - } - } -} - -static inline int -rxm_ep_msg_mr_regv(struct rxm_ep *rxm_ep, const struct iovec *iov, size_t count, - uint64_t access, struct fid_mr **mr) -{ - int ret; - size_t i; - struct rxm_domain *rxm_domain = - container_of(rxm_ep->util_ep.domain, struct rxm_domain, util_domain); - - for (i = 0; i < count; i++) { - ret = fi_mr_reg(rxm_domain->msg_domain, iov[i].iov_base, - iov[i].iov_len, access, 0, 0, 0, &mr[i], NULL); - if (ret) - goto err; - } - return 0; -err: - rxm_ep_msg_mr_closev(mr, count); - return ret; -} - -static inline int -rxm_ep_msg_mr_regv_lim(struct rxm_ep *rxm_ep, const struct iovec *iov, size_t count, - size_t total_reg_len, uint64_t access, struct fid_mr **mr) -{ - int ret; - size_t i; - struct rxm_domain *rxm_domain = - container_of(rxm_ep->util_ep.domain, struct rxm_domain, util_domain); - - for (i = 0; i < count && total_reg_len; i++) { - size_t len = MIN(iov[i].iov_len, total_reg_len); - ret = fi_mr_reg(rxm_domain->msg_domain, iov[i].iov_base, - len, access, 0, 0, 0, &mr[i], NULL); - if (ret) - goto err; - total_reg_len -= len; - } - return 0; -err: - rxm_ep_msg_mr_closev(mr, count); - return ret; -} +void rxm_msg_mr_closev(struct fid_mr **mr, size_t count); +int rxm_msg_mr_regv(struct rxm_ep *rxm_ep, const struct iovec *iov, + size_t count, size_t reg_limit, uint64_t access, + struct fid_mr **mr); +int rxm_msg_mr_reg_internal(struct rxm_domain *rxm_domain, const void *buf, + size_t len, uint64_t acs, uint64_t flags, + struct fid_mr **mr); static inline void rxm_cntr_incerr(struct util_cntr *cntr) { @@ -866,138 +917,43 @@ static inline void rxm_cntr_incerr(struct util_cntr *cntr) cntr->cntr_fid.ops->adderr(&cntr->cntr_fid, 1); } - - -static inline void rxm_cq_log_comp(uint64_t flags) -{ -#if ENABLE_DEBUG - FI_DBG(&rxm_prov, FI_LOG_CQ, "Reporting %s completion\n", - fi_tostr((void *)&flags, FI_TYPE_CQ_EVENT_FLAGS)); -#else - /* NOP */ -#endif -} - -/* Caller must hold recv_queue->lock */ -static inline struct rxm_rx_buf * -rxm_check_unexp_msg_list(struct rxm_recv_queue *recv_queue, fi_addr_t addr, - uint64_t tag, uint64_t ignore) +static inline void +rxm_cq_write(struct util_cq *cq, void *context, uint64_t flags, size_t len, + void *buf, uint64_t data, uint64_t tag) { - struct rxm_recv_match_attr match_attr; - struct dlist_entry *entry; - - 
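
/*
 * Note on the ceiling computed by rxm_ep_max_atomic_size() above: the
 * maximum atomic payload is now tied to the rxm eager limit rather than the
 * core provider's inject size,
 *
 *     max_atomic_size = rxm_eager_limit - sizeof(struct rxm_atomic_hdr);
 *
 * and rxm_ep_query_atomic() later in this patch divides its total size by
 * ofi_datatype_size(datatype) to get attr->count, returning -FI_EOPNOTSUPP
 * when not even one element of the requested datatype fits.
 */
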
if (dlist_empty(&recv_queue->unexp_msg_list)) - return NULL; - - match_attr.addr = addr; - match_attr.tag = tag; - match_attr.ignore = ignore; - - entry = dlist_find_first_match(&recv_queue->unexp_msg_list, - recv_queue->match_unexp, &match_attr); - if (!entry) - return NULL; - - RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Match for posted recv found in unexp" - " msg list\n", match_attr.addr, match_attr.tag); + int ret; - return container_of(entry, struct rxm_rx_buf, unexp_msg.entry); -} + FI_DBG(&rxm_prov, FI_LOG_CQ, "Reporting %s completion\n", + fi_tostr((void *) &flags, FI_TYPE_CQ_EVENT_FLAGS)); -static inline int -rxm_process_recv_entry(struct rxm_recv_queue *recv_queue, - struct rxm_recv_entry *recv_entry) -{ - struct rxm_rx_buf *rx_buf; - - rx_buf = rxm_check_unexp_msg_list(recv_queue, recv_entry->addr, - recv_entry->tag, recv_entry->ignore); - if (rx_buf) { - assert((recv_queue->type == RXM_RECV_QUEUE_MSG && - rx_buf->pkt.hdr.op == ofi_op_msg) || - (recv_queue->type == RXM_RECV_QUEUE_TAGGED && - rx_buf->pkt.hdr.op == ofi_op_tagged)); - dlist_remove(&rx_buf->unexp_msg.entry); - rx_buf->recv_entry = recv_entry; - - if (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg) { - return rxm_cq_handle_rx_buf(rx_buf); - } else { - struct dlist_entry *entry; - enum rxm_sar_seg_type last = - (rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) - == RXM_SAR_SEG_LAST); - ssize_t ret = rxm_cq_handle_rx_buf(rx_buf); - struct rxm_recv_match_attr match_attr; - - if (ret || last) - return ret; - - match_attr.addr = recv_entry->addr; - match_attr.tag = recv_entry->tag; - match_attr.ignore = recv_entry->ignore; - - dlist_foreach_container_safe(&recv_queue->unexp_msg_list, - struct rxm_rx_buf, rx_buf, - unexp_msg.entry, entry) { - if (!recv_queue->match_unexp(&rx_buf->unexp_msg.entry, - &match_attr)) - continue; - /* Handle unordered completions from MSG provider */ - if ((rx_buf->pkt.ctrl_hdr.msg_id != recv_entry->sar.msg_id) || - ((rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg))) - continue; - - if (!rx_buf->conn) { - rx_buf->conn = rxm_key2conn(rx_buf->ep, - rx_buf->pkt.ctrl_hdr.conn_id); - } - if (recv_entry->sar.conn != rx_buf->conn) - continue; - rx_buf->recv_entry = recv_entry; - dlist_remove(&rx_buf->unexp_msg.entry); - last = (rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) - == RXM_SAR_SEG_LAST); - ret = rxm_cq_handle_rx_buf(rx_buf); - if (ret || last) - break; - } - return ret; - } + ret = ofi_cq_write(cq, context, flags, len, buf, data, tag); + if (ret) { + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to report completion\n"); + assert(0); } - - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Enqueuing recv\n"); - dlist_insert_tail(&recv_entry->entry, &recv_queue->recv_list); - - return FI_SUCCESS; } -static inline ssize_t -rxm_ep_prepare_tx(struct rxm_ep *rxm_ep, fi_addr_t dest_addr, - struct rxm_conn **rxm_conn) +static inline void +rxm_cq_write_src(struct util_cq *cq, void *context, uint64_t flags, size_t len, + void *buf, uint64_t data, uint64_t tag, fi_addr_t addr) { - ssize_t ret; - - assert(rxm_ep->util_ep.tx_cq); - *rxm_conn = (struct rxm_conn *)rxm_cmap_acquire_handle(rxm_ep->cmap, - dest_addr); - if (OFI_UNLIKELY(!*rxm_conn)) - return -FI_EHOSTUNREACH; - - if (OFI_UNLIKELY((*rxm_conn)->handle.state != RXM_CMAP_CONNECTED)) { - ret = rxm_cmap_connect(rxm_ep, dest_addr, &(*rxm_conn)->handle); - if (ret) - return ret; - } + int ret; - if (OFI_UNLIKELY(!dlist_empty(&(*rxm_conn)->deferred_tx_queue))) { - rxm_ep_do_progress(&rxm_ep->util_ep); - if (!dlist_empty(&(*rxm_conn)->deferred_tx_queue)) - return -FI_EAGAIN; + FI_DBG(&rxm_prov, 
FI_LOG_CQ, "Reporting %s completion\n", + fi_tostr((void *) &flags, FI_TYPE_CQ_EVENT_FLAGS)); + + ret = ofi_cq_write_src(cq, context, flags, len, buf, data, tag, addr); + if (ret) { + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unable to report completion\n"); + assert(0); } - return 0; } +ssize_t rxm_get_conn(struct rxm_ep *rxm_ep, fi_addr_t addr, + struct rxm_conn **rxm_conn); + static inline void rxm_ep_format_tx_buf_pkt(struct rxm_conn *rxm_conn, size_t len, uint8_t op, uint64_t data, uint64_t tag, uint64_t flags, @@ -1011,40 +967,23 @@ rxm_ep_format_tx_buf_pkt(struct rxm_conn *rxm_conn, size_t len, uint8_t op, pkt->hdr.data = data; } - -static inline struct rxm_buf * +static inline void * rxm_tx_buf_alloc(struct rxm_ep *rxm_ep, enum rxm_buf_pool_type type) { assert((type == RXM_BUF_POOL_TX) || (type == RXM_BUF_POOL_TX_INJECT) || - (type == RXM_BUF_POOL_TX_ACK) || - (type == RXM_BUF_POOL_TX_RNDV) || + (type == RXM_BUF_POOL_TX_RNDV_RD_DONE) || + (type == RXM_BUF_POOL_TX_RNDV_WR_DATA) || + (type == RXM_BUF_POOL_TX_RNDV_WR_DONE) || + (type == RXM_BUF_POOL_TX_RNDV_REQ) || (type == RXM_BUF_POOL_TX_ATOMIC) || + (type == RXM_BUF_POOL_TX_CREDIT) || (type == RXM_BUF_POOL_TX_SAR)); return ofi_buf_alloc(rxm_ep->buf_pools[type].pool); } - -static inline struct rxm_rx_buf * -rxm_rx_buf_alloc(struct rxm_ep *rxm_ep, struct fid_ep *msg_ep, uint8_t repost) -{ - struct rxm_rx_buf *rx_buf = - ofi_buf_alloc(rxm_ep->buf_pools[RXM_BUF_POOL_RX].pool); - if (OFI_LIKELY((long int)rx_buf)) { - assert(rx_buf->ep == rxm_ep); - rx_buf->hdr.state = RXM_RX; - rx_buf->msg_ep = msg_ep; - rx_buf->repost = repost; - - if (!rxm_ep->srx_ctx) - rx_buf->conn = container_of(msg_ep->fid.context, - struct rxm_conn, handle); - } - return rx_buf; -} - static inline void -rxm_rx_buf_finish(struct rxm_rx_buf *rx_buf) +rxm_rx_buf_free(struct rxm_rx_buf *rx_buf) { if (rx_buf->repost) { dlist_insert_tail(&rx_buf->repost_entry, @@ -1054,57 +993,35 @@ rxm_rx_buf_finish(struct rxm_rx_buf *rx_buf) } } -static inline struct rxm_rma_buf *rxm_rma_buf_alloc(struct rxm_ep *rxm_ep) -{ - return (struct rxm_rma_buf *) - ofi_buf_alloc(rxm_ep->buf_pools[RXM_BUF_POOL_RMA].pool); -} - -static inline -struct rxm_tx_atomic_buf *rxm_tx_atomic_buf_alloc(struct rxm_ep *rxm_ep) -{ - return (struct rxm_tx_atomic_buf *) - rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_ATOMIC); -} - -static inline struct rxm_recv_entry *rxm_recv_entry_get(struct rxm_recv_queue *queue) -{ - return (freestack_isempty(queue->fs) ? 
- NULL : freestack_pop(queue->fs)); -} - static inline void -rxm_recv_entry_release(struct rxm_recv_queue *queue, struct rxm_recv_entry *entry) +rxm_recv_entry_release(struct rxm_recv_entry *entry) { - entry->total_len = 0; - freestack_push(queue->fs, entry); + if (entry->recv_queue) + ofi_freestack_push(entry->recv_queue->fs, entry); + else + ofi_buf_free(entry); } -static inline int rxm_cq_write_recv_comp(struct rxm_rx_buf *rx_buf, - void *context, uint64_t flags, - size_t len, char *buf) +static inline void +rxm_cq_write_recv_comp(struct rxm_rx_buf *rx_buf, void *context, uint64_t flags, + size_t len, char *buf) { if (rx_buf->ep->rxm_info->caps & FI_SOURCE) - return ofi_cq_write_src(rx_buf->ep->util_ep.rx_cq, context, - flags, len, buf, rx_buf->pkt.hdr.data, - rx_buf->pkt.hdr.tag, - rx_buf->conn->handle.fi_addr); + rxm_cq_write_src(rx_buf->ep->util_ep.rx_cq, context, + flags, len, buf, rx_buf->pkt.hdr.data, + rx_buf->pkt.hdr.tag, + rx_buf->conn->handle.fi_addr); else - return ofi_cq_write(rx_buf->ep->util_ep.rx_cq, context, - flags, len, buf, rx_buf->pkt.hdr.data, - rx_buf->pkt.hdr.tag); + rxm_cq_write(rx_buf->ep->util_ep.rx_cq, context, + flags, len, buf, rx_buf->pkt.hdr.data, + rx_buf->pkt.hdr.tag); } -static inline int -rxm_cq_write_multi_recv_comp(struct rxm_ep *rxm_ep, struct rxm_recv_entry *recv_entry) -{ - if (rxm_ep->rxm_info->caps & FI_SOURCE) - return ofi_cq_write_src(rxm_ep->util_ep.rx_cq, recv_entry->context, - FI_MULTI_RECV, recv_entry->multi_recv.len, - recv_entry->multi_recv.buf, 0, 0, - recv_entry->addr); - else - return ofi_cq_write(rxm_ep->util_ep.rx_cq, recv_entry->context, - FI_MULTI_RECV, recv_entry->multi_recv.len, - recv_entry->multi_recv.buf, 0, 0); -} +struct rxm_mr *rxm_mr_get_map_entry(struct rxm_domain *domain, uint64_t key); + +struct rxm_recv_entry * +rxm_multi_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, void *context, + uint64_t flags); +#endif diff --git a/prov/rxm/src/rxm_atomic.c b/prov/rxm/src/rxm_atomic.c index 1f0df302793..113acdbc2a9 100644 --- a/prov/rxm/src/rxm_atomic.c +++ b/prov/rxm/src/rxm_atomic.c @@ -1,5 +1,7 @@ /* * Copyright (c) 2018 Cray Inc. All rights reserved. + * Copyright (c) 2018 System Fabric Works, Inc. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -96,11 +98,16 @@ rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, struct rxm_atomic_hdr *atomic_hdr; struct iovec buf_iov[RXM_IOV_LIMIT]; struct iovec cmp_iov[RXM_IOV_LIMIT]; + enum fi_hmem_iface buf_iface = FI_HMEM_SYSTEM; + enum fi_hmem_iface cmp_iface; + uint64_t buf_device = 0; + uint64_t cmp_device; size_t datatype_sz = ofi_datatype_size(msg->datatype); size_t buf_len = 0; size_t cmp_len = 0; size_t tot_len; ssize_t ret; + int i; assert(msg->iov_count <= RXM_IOV_LIMIT && msg->rma_iov_count <= RXM_IOV_LIMIT); @@ -116,6 +123,10 @@ rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, ofi_ioc_to_iov(msg->msg_iov, buf_iov, msg->iov_count, datatype_sz); buf_len = ofi_total_iov_len(buf_iov, msg->iov_count); + + buf_iface = rxm_mr_desc_to_hmem_iface_dev(msg->desc, + msg->iov_count, + &buf_device); } if (op == ofi_op_atomic_compare) { @@ -124,23 +135,32 @@ rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, datatype_sz); cmp_len = ofi_total_iov_len(cmp_iov, compare_iov_count); assert(buf_len == cmp_len); + + cmp_iface = rxm_mr_desc_to_hmem_iface_dev(compare_desc, + compare_iov_count, + &cmp_device); } tot_len = buf_len + cmp_len + sizeof(struct rxm_atomic_hdr) + sizeof(struct rxm_pkt); - if (tot_len > rxm_eager_limit) { + if (tot_len > rxm_buffer_size) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "atomic data too large %zu\n", tot_len); return -FI_EINVAL; } - tx_buf = (struct rxm_tx_atomic_buf *) - rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_ATOMIC); + if (ofi_atomic_dec32(&rxm_ep->atomic_tx_credits) < 0) { + ret = -FI_EAGAIN; + goto restore_credit; + } + + tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_ATOMIC); if (OFI_UNLIKELY(!tx_buf)) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "Ran out of buffers from Atomic buffer pool\n"); - return -FI_EAGAIN; + ret = -FI_EAGAIN; + goto restore_credit; } rxm_ep_format_atomic_pkt_hdr(rxm_conn, tx_buf, tot_len, op, @@ -151,20 +171,35 @@ rxm_ep_atomic_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, atomic_hdr = (struct rxm_atomic_hdr *) tx_buf->pkt.data; - ofi_copy_from_iov(atomic_hdr->data, buf_len, buf_iov, - msg->iov_count, 0); - if (cmp_len) - ofi_copy_from_iov(atomic_hdr->data + buf_len, cmp_len, - cmp_iov, compare_iov_count, 0); + ret = ofi_copy_from_hmem_iov(atomic_hdr->data, buf_len, buf_iface, + buf_device, buf_iov, msg->iov_count, 0); + assert(ret == buf_len); - tx_buf->result_iov_count = result_iov_count; - if (resultv) - ofi_ioc_to_iov(resultv, tx_buf->result_iov, result_iov_count, - datatype_sz); + if (cmp_len) { + ret = ofi_copy_from_hmem_iov(atomic_hdr->data + buf_len, + cmp_len, cmp_iface, cmp_device, + cmp_iov, compare_iov_count, 0); + assert(ret == cmp_len); + } + + tx_buf->result_iov.count = result_iov_count; + if (resultv) { + ofi_ioc_to_iov(resultv, tx_buf->result_iov.iov, + result_iov_count, datatype_sz); + + if (result_desc) { + for (i = 0; i < result_iov_count; i++) + tx_buf->result_iov.desc[i] = result_desc[i]; + } + } ret = rxm_ep_send_atomic_req(rxm_ep, rxm_conn, tx_buf, tot_len); - if (ret) - ofi_buf_free(tx_buf); + if (OFI_LIKELY(!ret)) + return ret; + + ofi_buf_free(tx_buf); +restore_credit: + ofi_atomic_inc32(&rxm_ep->atomic_tx_credits); return ret; } @@ -172,13 +207,12 @@ static ssize_t rxm_ep_generic_atomic_writemsg(struct rxm_ep *rxm_ep, const struct fi_msg_atomic *msg, uint64_t flags) { - int ret; struct rxm_conn *rxm_conn; + ssize_t ret; ofi_ep_lock_acquire(&rxm_ep->util_ep); - - ret = 
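
/*
 * The credit discipline used by rxm_ep_atomic_common() above, pulled out as
 * a standalone sketch (the function name is hypothetical).  A credit is
 * consumed before the atomic tx buffer is allocated and returned on every
 * failure path, so the number of outstanding atomic requests stays bounded
 * by the initial value of ep->atomic_tx_credits; the matching increment on
 * successful completion is outside the hunks shown here.
 */
static ssize_t example_send_with_credit(struct rxm_ep *ep)
{
	ssize_t ret;

	if (ofi_atomic_dec32(&ep->atomic_tx_credits) < 0) {
		ret = -FI_EAGAIN;
		goto restore_credit;
	}

	/* ... allocate the atomic tx buffer and post the request here;
	 * any failure falls through to give the credit back ... */
	ret = -FI_EAGAIN;
	if (!ret)
		return 0;

restore_credit:
	ofi_atomic_inc32(&ep->atomic_tx_credits);
	return ret;
}
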
rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + ret = rxm_get_conn(rxm_ep, msg->addr, &rxm_conn); + if (ret) goto unlock; ret = rxm_ep_atomic_common(rxm_ep, rxm_conn, msg, NULL, NULL, 0, @@ -225,7 +259,8 @@ rxm_ep_atomic_writev(struct fid_ep *ep_fid, const struct fi_ioc *iov, .data = 0, }; - return rxm_ep_generic_atomic_writemsg(rxm_ep, &msg, rxm_ep_tx_flags(rxm_ep)); + return rxm_ep_generic_atomic_writemsg(rxm_ep, &msg, + rxm_ep->util_ep.tx_op_flags); } static ssize_t @@ -282,13 +317,12 @@ rxm_ep_generic_atomic_readwritemsg(struct rxm_ep *rxm_ep, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags) { - int ret; struct rxm_conn *rxm_conn; + ssize_t ret; ofi_ep_lock_acquire(&rxm_ep->util_ep); - - ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + ret = rxm_get_conn(rxm_ep, msg->addr, &rxm_conn); + if (ret) goto unlock; ret = rxm_ep_atomic_common(rxm_ep, rxm_conn, msg, NULL, NULL, 0, @@ -342,7 +376,7 @@ rxm_ep_atomic_readwritev(struct fid_ep *ep_fid, const struct fi_ioc *iov, }; return rxm_ep_generic_atomic_readwritemsg(rxm_ep, &msg, resultv, - result_desc, result_count, rxm_ep_tx_flags(rxm_ep)); + result_desc, result_count, rxm_ep->util_ep.tx_op_flags); } static ssize_t @@ -376,13 +410,12 @@ rxm_ep_generic_atomic_compwritemsg(struct rxm_ep *rxm_ep, void **result_desc, size_t result_count, uint64_t flags) { - int ret; struct rxm_conn *rxm_conn; + ssize_t ret; ofi_ep_lock_acquire(&rxm_ep->util_ep); - - ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + ret = rxm_get_conn(rxm_ep, msg->addr, &rxm_conn); + if (ret) goto unlock; ret = rxm_ep_atomic_common(rxm_ep, rxm_conn, msg, comparev, @@ -442,7 +475,7 @@ rxm_ep_atomic_compwritev(struct fid_ep *ep_fid, const struct fi_ioc *iov, return rxm_ep_generic_atomic_compwritemsg(rxm_ep, &msg, comparev, compare_desc, compare_count, resultv, result_desc, - result_count, rxm_ep_tx_flags(rxm_ep)); + result_count, rxm_ep->util_ep.tx_op_flags); } static ssize_t @@ -497,6 +530,9 @@ int rxm_ep_query_atomic(struct fid_domain *domain, enum fi_datatype datatype, attr->size = ofi_datatype_size(datatype); attr->count = tot_size / attr->size; + if (attr->count == 0) + return -FI_EOPNOTSUPP; + return FI_SUCCESS; } diff --git a/prov/rxm/src/rxm_attr.c b/prov/rxm/src/rxm_attr.c index 0fe7606159b..cb620d4b112 100644 --- a/prov/rxm/src/rxm_attr.c +++ b/prov/rxm/src/rxm_attr.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2016 Intel Corporation. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -32,13 +33,15 @@ #include "rxm.h" -#define RXM_EP_CAPS (FI_MSG | FI_RMA | FI_TAGGED | FI_ATOMIC | \ - FI_DIRECTED_RECV | FI_READ | FI_WRITE | FI_RECV | \ - FI_SEND | FI_REMOTE_READ | FI_REMOTE_WRITE | FI_SOURCE) +#define RXM_TX_CAPS (OFI_TX_MSG_CAPS | FI_TAGGED | OFI_TX_RMA_CAPS | \ + FI_ATOMICS) + +#define RXM_RX_CAPS (FI_SOURCE | OFI_RX_MSG_CAPS | FI_TAGGED | \ + OFI_RX_RMA_CAPS | FI_ATOMICS | FI_DIRECTED_RECV | \ + FI_MULTI_RECV) #define RXM_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM) -// TODO have a separate "check info" against which app hints would be checked. /* Since we are a layering provider, the attributes for which we rely on the * core provider are set to full capability. This ensures that ofix_getinfo @@ -46,21 +49,40 @@ * requested by the app. 
*/ struct fi_tx_attr rxm_tx_attr = { - .caps = RXM_EP_CAPS, + .caps = RXM_TX_CAPS | FI_HMEM, .op_flags = RXM_PASSTHRU_TX_OP_FLAGS | RXM_TX_OP_FLAGS, .msg_order = ~0x0ULL, .comp_order = FI_ORDER_NONE, - .size = 1024, + .size = 65536, .iov_limit = RXM_IOV_LIMIT, .rma_iov_limit = RXM_IOV_LIMIT, }; struct fi_rx_attr rxm_rx_attr = { - .caps = RXM_EP_CAPS | FI_MULTI_RECV, + .caps = RXM_RX_CAPS | FI_HMEM, .op_flags = RXM_PASSTHRU_RX_OP_FLAGS | RXM_RX_OP_FLAGS, .msg_order = ~0x0ULL, .comp_order = FI_ORDER_NONE, - .size = 1024, + .size = 65536, + .iov_limit= RXM_IOV_LIMIT, +}; + +struct fi_tx_attr rxm_tx_attr_coll = { + .caps = RXM_TX_CAPS | FI_COLLECTIVE, + .op_flags = RXM_PASSTHRU_TX_OP_FLAGS | RXM_TX_OP_FLAGS, + .msg_order = ~0x0ULL, + .comp_order = FI_ORDER_NONE, + .size = 65536, + .iov_limit = RXM_IOV_LIMIT, + .rma_iov_limit = RXM_IOV_LIMIT, +}; + +struct fi_rx_attr rxm_rx_attr_coll = { + .caps = RXM_RX_CAPS | FI_COLLECTIVE, + .op_flags = RXM_PASSTHRU_RX_OP_FLAGS | RXM_RX_OP_FLAGS, + .msg_order = ~0x0ULL, + .comp_order = FI_ORDER_NONE, + .size = 65536, .iov_limit= RXM_IOV_LIMIT, }; @@ -77,6 +99,19 @@ struct fi_ep_attr rxm_ep_attr = { .mem_tag_format = FI_TAG_GENERIC, }; +struct fi_ep_attr rxm_ep_attr_coll = { + .type = FI_EP_RDM, + .protocol = FI_PROTO_RXM, + .protocol_version = 1, + .max_msg_size = SIZE_MAX, + .tx_ctx_cnt = 1, + .rx_ctx_cnt = 1, + .max_order_raw_size = SIZE_MAX, + .max_order_war_size = SIZE_MAX, + .max_order_waw_size = SIZE_MAX, + .mem_tag_format = FI_TAG_GENERIC >> 1, +}; + struct fi_domain_attr rxm_domain_attr = { .caps = RXM_DOMAIN_CAPS, .threading = FI_THREAD_SAFE, @@ -86,7 +121,8 @@ struct fi_domain_attr rxm_domain_attr = { .av_type = FI_AV_UNSPEC, /* Advertise support for FI_MR_BASIC so that ofi_check_info call * doesn't fail at RxM level. If an app requires FI_MR_BASIC, it - * would be passed down to core provider. */ + * would be passed down to core provider. 
+ */ .mr_mode = FI_MR_BASIC | FI_MR_SCALABLE, .cq_data_size = sizeof_field(struct ofi_op_hdr, data), .cq_cnt = (1 << 16), @@ -99,20 +135,64 @@ struct fi_domain_attr rxm_domain_attr = { }; struct fi_fabric_attr rxm_fabric_attr = { - .prov_version = FI_VERSION(RXM_MAJOR_VERSION, RXM_MINOR_VERSION), + .prov_version = OFI_VERSION_DEF_PROV, }; -struct fi_info rxm_info = { - .caps = RXM_EP_CAPS | RXM_DOMAIN_CAPS | FI_MULTI_RECV, +struct fi_fabric_attr rxm_verbs_fabric_attr = { + .prov_version = OFI_VERSION_DEF_PROV, + .prov_name = "verbs", +}; + +struct fi_fabric_attr rxm_tcp_fabric_attr = { + .prov_version = OFI_VERSION_DEF_PROV, + .prov_name = "tcp", +}; + +struct fi_info rxm_coll_info = { + .caps = RXM_TX_CAPS | RXM_RX_CAPS | RXM_DOMAIN_CAPS | FI_COLLECTIVE, + .addr_format = FI_SOCKADDR, + .tx_attr = &rxm_tx_attr_coll, + .rx_attr = &rxm_rx_attr_coll, + .ep_attr = &rxm_ep_attr_coll, + .domain_attr = &rxm_domain_attr, + .fabric_attr = &rxm_fabric_attr +}; + +struct fi_info rxm_base_info = { + .caps = RXM_TX_CAPS | RXM_RX_CAPS | RXM_DOMAIN_CAPS | FI_HMEM, .addr_format = FI_SOCKADDR, .tx_attr = &rxm_tx_attr, .rx_attr = &rxm_rx_attr, .ep_attr = &rxm_ep_attr, .domain_attr = &rxm_domain_attr, - .fabric_attr = &rxm_fabric_attr + .fabric_attr = &rxm_fabric_attr, + .next = &rxm_coll_info, +}; + +struct fi_info rxm_tcp_info = { + .caps = RXM_TX_CAPS | RXM_RX_CAPS | RXM_DOMAIN_CAPS, + .addr_format = FI_SOCKADDR, + .tx_attr = &rxm_tx_attr, + .rx_attr = &rxm_rx_attr, + .ep_attr = &rxm_ep_attr, + .domain_attr = &rxm_domain_attr, + .fabric_attr = &rxm_tcp_fabric_attr, + .next = &rxm_base_info, +}; + +struct fi_info rxm_verbs_info = { + .caps = RXM_TX_CAPS | RXM_RX_CAPS | RXM_DOMAIN_CAPS | FI_HMEM, + .addr_format = FI_SOCKADDR, + .tx_attr = &rxm_tx_attr, + .rx_attr = &rxm_rx_attr, + .ep_attr = &rxm_ep_attr, + .domain_attr = &rxm_domain_attr, + .fabric_attr = &rxm_verbs_fabric_attr, + .next = &rxm_tcp_info, }; struct util_prov rxm_util_prov = { .prov = &rxm_prov, + .info = &rxm_verbs_info, .flags = 0, }; diff --git a/prov/rxm/src/rxm_av.c b/prov/rxm/src/rxm_av.c index 18c12c950bd..b278dcb0551 100644 --- a/prov/rxm/src/rxm_av.c +++ b/prov/rxm/src/rxm_av.c @@ -30,6 +30,8 @@ * SOFTWARE. */ +#include + #include "rxm.h" static int rxm_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, @@ -59,6 +61,10 @@ static int rxm_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, return ofi_ip_av_remove(av_fid, fi_addr, count, flags); } +/* TODO: Determine if it's cleaner to insert an address into the cmap only + * when we need to send to that address, rather than inserting the address + * into the cmap when adding it to the AV. 
+ */ static int rxm_av_insert_cmap(struct fid_av *av_fid, const void *addr, size_t count, fi_addr_t *fi_addr, uint64_t flags) @@ -131,7 +137,7 @@ static int rxm_av_insertsym(struct fid_av *av_fid, const char *node, size_t addrlen, count = nodecnt * svccnt; int ret, retv; - ret = ofi_verify_av_insert(av, flags); + ret = ofi_verify_av_insert(av, flags, context); if (ret) return ret; @@ -142,13 +148,17 @@ static int rxm_av_insertsym(struct fid_av *av_fid, const char *node, assert(ret == count); - ret = ofi_ip_av_insertv(av, addr, addrlen, count, fi_addr, context); - if (ret < 0) - goto out; - - if (!av->eq && !ret) - goto out; + ret = ofi_ip_av_insertv(av, addr, addrlen, count, fi_addr, flags, + context); + if (!av->eq && ret < count) { + count = ret; + } + /* If the AV is bound to an EQ, we can't determine which entries were + * added successfully to the AV until we process the insertion events + * later when reading the EQ. Add all addresses to the cmap + * optimistically. + */ retv = rxm_av_insert_cmap(av_fid, addr, count, fi_addr, flags); if (retv) { ret = rxm_av_remove(av_fid, fi_addr, count, flags); @@ -157,10 +167,9 @@ static int rxm_av_insertsym(struct fid_av *av_fid, const char *node, "from AV during error handling\n"); ret = retv; } -out: + free(addr); return ret; - } int rxm_av_insertsvc(struct fid_av *av, const char *node, const char *service, @@ -190,6 +199,7 @@ static struct fi_ops_av rxm_av_ops = { .remove = rxm_av_remove, .lookup = rxm_av_lookup, .straddr = rxm_av_straddr, + .av_set = ofi_av_set }; int rxm_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, diff --git a/prov/rxm/src/rxm_conn.c b/prov/rxm/src/rxm_conn.c index 1cbaec8c303..7250d6f1da1 100644 --- a/prov/rxm/src/rxm_conn.c +++ b/prov/rxm/src/rxm_conn.c @@ -40,21 +40,15 @@ #include "rxm.h" static struct rxm_cmap_handle *rxm_conn_alloc(struct rxm_cmap *cmap); -static void rxm_conn_connected_handler(struct rxm_cmap_handle *handle); -static void rxm_conn_close_saved(struct rxm_cmap_handle *handle); -static void rxm_conn_close(struct rxm_cmap_handle *handle); -static void rxm_conn_save(struct rxm_cmap_handle *handle); -static int -rxm_conn_connect(struct util_ep *util_ep, struct rxm_cmap_handle *handle, - const void *addr); -static int rxm_conn_signal(struct util_ep *util_ep, void *context, +static int rxm_conn_connect(struct rxm_ep *ep, + struct rxm_cmap_handle *handle, const void *addr); +static int rxm_conn_signal(struct rxm_ep *ep, void *context, enum rxm_cmap_signal signal); -static void -rxm_conn_av_updated_handler(struct rxm_cmap_handle *handle); +static void rxm_conn_av_updated_handler(struct rxm_cmap_handle *handle); static void *rxm_conn_progress(void *arg); static void *rxm_conn_atomic_progress(void *arg); -static int -rxm_conn_handle_event(struct rxm_ep *rxm_ep, struct rxm_msg_eq_entry *entry); +static int rxm_conn_handle_event(struct rxm_ep *rxm_ep, + struct rxm_msg_eq_entry *entry); /* @@ -75,8 +69,9 @@ static inline ssize_t rxm_eq_readerr(struct rxm_ep *rxm_ep, ret = fi_eq_readerr(rxm_ep->msg_eq, &entry->err_entry, 0); if (ret != sizeof(entry->err_entry)) { - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, - "unable to fi_eq_readerr: %zd\n", ret); + if (ret != -FI_EAGAIN) + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, + "unable to fi_eq_readerr: %zd\n", ret); return ret < 0 ? 
ret : -FI_EINVAL; } @@ -90,20 +85,16 @@ static inline ssize_t rxm_eq_readerr(struct rxm_ep *rxm_ep, return -entry->err_entry.err; } -static ssize_t rxm_eq_read(struct rxm_ep *rxm_ep, size_t len, +static ssize_t rxm_eq_read(struct rxm_ep *ep, size_t len, struct rxm_msg_eq_entry *entry) { - ssize_t rd; - - rd = fi_eq_read(rxm_ep->msg_eq, &entry->event, &entry->cm_entry, - len, 0); - if (OFI_LIKELY(rd >= 0)) - return rd; + ssize_t ret; - if (rd != -FI_EAVAIL) - return rd; + ret = fi_eq_read(ep->msg_eq, &entry->event, &entry->cm_entry, len, 0); + if (ret == -FI_EAVAIL) + ret = rxm_eq_readerr(ep, entry); - return rxm_eq_readerr(rxm_ep, entry); + return ret; } static void rxm_cmap_set_key(struct rxm_cmap_handle *handle) @@ -184,6 +175,37 @@ static int rxm_cmap_del_handle(struct rxm_cmap_handle *handle) return 0; } +ssize_t rxm_get_conn(struct rxm_ep *rxm_ep, fi_addr_t addr, + struct rxm_conn **rxm_conn) +{ + struct rxm_cmap_handle *handle; + ssize_t ret; + + assert(rxm_ep->util_ep.tx_cq); + handle = rxm_cmap_acquire_handle(rxm_ep->cmap, addr); + if (!handle) { + ret = rxm_cmap_alloc_handle(rxm_ep->cmap, addr, + RXM_CMAP_IDLE, &handle); + if (ret) + return ret; + } + + *rxm_conn = container_of(handle, struct rxm_conn, handle); + + if (handle->state != RXM_CMAP_CONNECTED) { + ret = rxm_cmap_connect(rxm_ep, addr, handle); + if (ret) + return ret; + } + + if (!dlist_empty(&(*rxm_conn)->deferred_tx_queue)) { + rxm_ep_do_progress(&rxm_ep->util_ep); + if (!dlist_empty(&(*rxm_conn)->deferred_tx_queue)) + return -FI_EAGAIN; + } + return 0; +} + static inline int rxm_cmap_check_and_realloc_handles_table(struct rxm_cmap *cmap, fi_addr_t fi_addr) @@ -194,7 +216,7 @@ rxm_cmap_check_and_realloc_handles_table(struct rxm_cmap *cmap, if (OFI_LIKELY(fi_addr < cmap->num_allocated)) return 0; - grow_size = MAX(cmap->av->count, fi_addr - cmap->num_allocated + 1); + grow_size = MAX(ofi_av_size(cmap->av), fi_addr - cmap->num_allocated + 1); new_handles = realloc(cmap->handles_av, (grow_size + cmap->num_allocated) * @@ -229,6 +251,7 @@ rxm_conn_inject_pkt_alloc(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, return inject_pkt; } + static void rxm_conn_res_free(struct rxm_conn *rxm_conn) { ofi_freealign(rxm_conn->inject_pkt); @@ -240,6 +263,7 @@ static void rxm_conn_res_free(struct rxm_conn *rxm_conn) ofi_freealign(rxm_conn->tinject_data_pkt); rxm_conn->tinject_data_pkt = NULL; } + static int rxm_conn_res_alloc(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn) { dlist_init(&rxm_conn->deferred_conn_entry); @@ -272,54 +296,70 @@ static int rxm_conn_res_alloc(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn) return 0; } -static void rxm_conn_free(struct rxm_cmap_handle *handle) +static void rxm_conn_close(struct rxm_cmap_handle *handle) { - struct rxm_conn *rxm_conn = - container_of(handle, struct rxm_conn, handle); + struct rxm_conn *rxm_conn = container_of(handle, struct rxm_conn, handle); + struct rxm_conn *rxm_conn_tmp; + struct rxm_deferred_tx_entry *def_tx_entry; + struct dlist_entry *conn_entry_tmp; + + dlist_foreach_container_safe(&handle->cmap->ep->deferred_tx_conn_queue, + struct rxm_conn, rxm_conn_tmp, + deferred_conn_entry, conn_entry_tmp) + { + if (rxm_conn_tmp->handle.key != handle->key) + continue; - /* This handles case when saved_msg_ep wasn't closed */ - if (rxm_conn->saved_msg_ep) { - if (fi_close(&rxm_conn->saved_msg_ep->fid)) { - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, - "Unable to close saved msg_ep\n"); - } else { + while (!dlist_empty(&rxm_conn_tmp->deferred_tx_queue)) { + def_tx_entry = + 
container_of(rxm_conn_tmp->deferred_tx_queue.next, + struct rxm_deferred_tx_entry, entry); FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, - "Closed saved msg_ep\n"); + "cancelled deferred message\n"); + rxm_ep_dequeue_deferred_tx_queue(def_tx_entry); + free(def_tx_entry); } - rxm_conn->saved_msg_ep = NULL; } - if (rxm_conn->msg_ep) { - if (fi_close(&rxm_conn->msg_ep->fid)) { - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, - "unable to close msg_ep\n"); - } else { - FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, - "closed msg_ep\n"); - } - rxm_conn->msg_ep = NULL; - } + FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "closing msg ep\n"); + if (!rxm_conn->msg_ep) + return; + + if (fi_close(&rxm_conn->msg_ep->fid)) + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "unable to close msg_ep\n"); + + rxm_conn->msg_ep = NULL; +} + +static void rxm_conn_free(struct rxm_cmap_handle *handle) +{ + struct rxm_conn *rxm_conn = container_of(handle, struct rxm_conn, handle); + + rxm_conn_close(handle); rxm_conn_res_free(rxm_conn); free(rxm_conn); } -static int rxm_cmap_alloc_handle(struct rxm_cmap *cmap, fi_addr_t fi_addr, - enum rxm_cmap_state state, - struct rxm_cmap_handle **handle) +int rxm_cmap_alloc_handle(struct rxm_cmap *cmap, fi_addr_t fi_addr, + enum rxm_cmap_state state, + struct rxm_cmap_handle **handle) { int ret; *handle = rxm_conn_alloc(cmap); - if (OFI_UNLIKELY(!*handle)) + if (!*handle) return -FI_ENOMEM; + FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL, "Allocated handle: %p for fi_addr: %" PRIu64 "\n", *handle, fi_addr); + ret = rxm_cmap_check_and_realloc_handles_table(cmap, fi_addr); - if (OFI_UNLIKELY(ret)) { + if (ret) { rxm_conn_free(*handle); return ret; } + rxm_cmap_init_handle(*handle, cmap, state, fi_addr, NULL); cmap->handles_av[fi_addr] = *handle; return 0; @@ -334,14 +374,17 @@ static int rxm_cmap_alloc_handle_peer(struct rxm_cmap *cmap, void *addr, peer = calloc(1, sizeof(*peer) + cmap->av->addrlen); if (!peer) return -FI_ENOMEM; + *handle = rxm_conn_alloc(cmap); if (!*handle) { free(peer); return -FI_ENOMEM; } - ofi_straddr_dbg(cmap->av->prov, FI_LOG_AV, "Allocated handle for addr", - addr); + + ofi_straddr_dbg(cmap->av->prov, FI_LOG_AV, + "Allocated handle for addr", addr); FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL, "handle: %p\n", *handle); + rxm_cmap_init_handle(*handle, cmap, state, FI_ADDR_NOTAVAIL, peer); FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL, "Adding handle to peer list\n"); peer->handle = *handle; @@ -360,6 +403,7 @@ rxm_cmap_get_handle_peer(struct rxm_cmap *cmap, const void *addr) addr); if (!entry) return NULL; + ofi_straddr_dbg(cmap->av->prov, FI_LOG_AV, "handle found in peer list for addr", addr); peer = container_of(entry, struct rxm_cmap_peer, entry); @@ -454,15 +498,6 @@ void rxm_cmap_process_shutdown(struct rxm_cmap *cmap, } } -void rxm_cmap_process_conn_notify(struct rxm_cmap *cmap, - struct rxm_cmap_handle *handle) -{ - FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL, - "Processing connection notification for handle: %p.\n", handle); - RXM_CM_UPDATE_STATE(handle, RXM_CMAP_CONNECTED); - rxm_conn_connected_handler(handle); -} - void rxm_cmap_process_connect(struct rxm_cmap *cmap, struct rxm_cmap_handle *handle, union rxm_cm_data *cm_data) @@ -470,7 +505,7 @@ void rxm_cmap_process_connect(struct rxm_cmap *cmap, struct rxm_conn *rxm_conn = container_of(handle, struct rxm_conn, handle); FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL, - "Processing connect for handle: %p\n", handle); + "processing FI_CONNECTED event for handle: %p\n", handle); if (cm_data) { assert(handle->state == RXM_CMAP_CONNREQ_SENT); handle->remote_key = 
cm_data->accept.server_conn_id; @@ -478,10 +513,10 @@ void rxm_cmap_process_connect(struct rxm_cmap *cmap, } else { assert(handle->state == RXM_CMAP_CONNREQ_RECV); } - RXM_CM_UPDATE_STATE(handle, RXM_CMAP_CONNECTED_NOTIFY); + RXM_CM_UPDATE_STATE(handle, RXM_CMAP_CONNECTED); /* Set the remote key to the inject packets */ - if (cmap->ep->domain->threading != FI_THREAD_SAFE) { + if (cmap->ep->util_ep.domain->threading != FI_THREAD_SAFE) { rxm_conn->inject_pkt->ctrl_hdr.conn_id = rxm_conn->handle.remote_key; rxm_conn->inject_data_pkt->ctrl_hdr.conn_id = rxm_conn->handle.remote_key; rxm_conn->tinject_pkt->ctrl_hdr.conn_id = rxm_conn->handle.remote_key; @@ -498,11 +533,7 @@ void rxm_cmap_process_reject(struct rxm_cmap *cmap, switch (handle->state) { case RXM_CMAP_CONNREQ_RECV: case RXM_CMAP_CONNECTED: - case RXM_CMAP_CONNECTED_NOTIFY: /* Handle is being re-used for incoming connection request */ - FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL, - "Connection handle is being re-used. Close saved connection\n"); - rxm_conn_close_saved(handle); break; case RXM_CMAP_CONNREQ_SENT: if (reject_reason == RXM_CMAP_REJECT_GENUINE) { @@ -556,7 +587,6 @@ int rxm_cmap_process_connreq(struct rxm_cmap *cmap, void *addr, } switch (handle->state) { - case RXM_CMAP_CONNECTED_NOTIFY: case RXM_CMAP_CONNECTED: FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL, "Connection already present.\n"); @@ -581,7 +611,7 @@ int rxm_cmap_process_connreq(struct rxm_cmap *cmap, void *addr, "Re-using handle: %p to accept remote " "connection\n", handle); *reject_reason = RXM_CMAP_REJECT_GENUINE; - rxm_conn_save(handle); + rxm_conn_close(handle); } else { FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL, "Endpoint connects to itself\n"); @@ -620,7 +650,7 @@ int rxm_cmap_process_connreq(struct rxm_cmap *cmap, void *addr, int rxm_msg_eq_progress(struct rxm_ep *rxm_ep) { struct rxm_msg_eq_entry *entry; - int ret; + int ret; entry = alloca(RXM_MSG_EQ_ENTRY_SZ); if (!entry) { @@ -632,12 +662,15 @@ int rxm_msg_eq_progress(struct rxm_ep *rxm_ep) while (1) { entry->rd = rxm_eq_read(rxm_ep, RXM_MSG_EQ_ENTRY_SZ, entry); if (entry->rd < 0 && entry->rd != -FI_ECONNREFUSED) { - ret = (int)entry->rd; + ret = (int) entry->rd; break; } ret = rxm_conn_handle_event(rxm_ep, entry); - if (ret) + if (ret) { + FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, + "invalid connection handle event: %d\n", ret); break; + } } return ret; } @@ -648,16 +681,15 @@ int rxm_cmap_connect(struct rxm_ep *rxm_ep, fi_addr_t fi_addr, int ret = FI_SUCCESS; switch (handle->state) { - case RXM_CMAP_CONNECTED_NOTIFY: - rxm_cmap_process_conn_notify(rxm_ep->cmap, handle); - break; case RXM_CMAP_IDLE: FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "initiating MSG_EP connect " "for fi_addr: %" PRIu64 "\n", fi_addr); - ret = rxm_conn_connect(rxm_ep->cmap->ep, handle, - ofi_av_get_addr(rxm_ep->cmap->av, - fi_addr)); + ret = rxm_conn_connect(rxm_ep, handle, + ofi_av_get_addr(rxm_ep->cmap->av, fi_addr)); if (ret) { + if (ret == -FI_ECONNREFUSED) + return -FI_EAGAIN; + rxm_cmap_del_handle(handle); } else { RXM_CM_UPDATE_STATE(handle, RXM_CMAP_CONNREQ_SENT); @@ -685,9 +717,13 @@ static int rxm_cmap_cm_thread_close(struct rxm_cmap *cmap) { int ret; - if (cmap->ep->domain->data_progress != FI_PROGRESS_AUTO) + FI_INFO(&rxm_prov, FI_LOG_EP_CTRL, "stopping CM thread\n"); + if (!cmap->cm_thread) return 0; + ofi_ep_lock_acquire(&cmap->ep->util_ep); + cmap->ep->do_progress = false; + ofi_ep_lock_release(&cmap->ep->util_ep); ret = rxm_conn_signal(cmap->ep, NULL, RXM_CMAP_EXIT); if (ret) { FI_WARN(cmap->av->prov, FI_LOG_EP_CTRL, @@ -709,17 +745,17 @@ 
void rxm_cmap_free(struct rxm_cmap *cmap) struct dlist_entry *entry; size_t i; + FI_INFO(cmap->av->prov, FI_LOG_EP_CTRL, "Closing cmap\n"); rxm_cmap_cm_thread_close(cmap); - FI_DBG(cmap->av->prov, FI_LOG_EP_CTRL, "Closing cmap\n"); for (i = 0; i < cmap->num_allocated; i++) { if (cmap->handles_av[i]) { rxm_cmap_clear_key(cmap->handles_av[i]); rxm_conn_free(cmap->handles_av[i]); - cmap->handles_av[i] = 0; } } - while(!dlist_empty(&cmap->peer_list)) { + + while (!dlist_empty(&cmap->peer_list)) { entry = cmap->peer_list.next; peer = container_of(entry, struct rxm_cmap_peer, entry); dlist_remove(&peer->entry); @@ -757,15 +793,15 @@ int rxm_cmap_alloc(struct rxm_ep *rxm_ep, struct rxm_cmap_attr *attr) if (!cmap) return -FI_ENOMEM; - cmap->ep = ep; + cmap->ep = rxm_ep; cmap->av = ep->av; - cmap->handles_av = calloc(cmap->av->count, sizeof(*cmap->handles_av)); + cmap->handles_av = calloc(ofi_av_size(ep->av), sizeof(*cmap->handles_av)); if (!cmap->handles_av) { ret = -FI_ENOMEM; goto err1; } - cmap->num_allocated = ep->av->count; + cmap->num_allocated = ofi_av_size(ep->av); cmap->attr = *attr; cmap->attr.name = mem_dup(attr->name, ep->av->addrlen); @@ -781,7 +817,10 @@ int rxm_cmap_alloc(struct rxm_ep *rxm_ep, struct rxm_cmap_attr *attr) rxm_ep->cmap = cmap; - if (ep->domain->data_progress == FI_PROGRESS_AUTO) { + if (ep->domain->data_progress == FI_PROGRESS_AUTO || force_auto_progress) { + + assert(ep->domain->threading == FI_THREAD_SAFE); + rxm_ep->do_progress = true; if (pthread_create(&cmap->cm_thread, 0, rxm_ep->rxm_info->caps & FI_ATOMIC ? rxm_conn_atomic_progress : @@ -820,6 +859,7 @@ static int rxm_msg_ep_open(struct rxm_ep *rxm_ep, struct fi_info *msg_info, rxm_domain = container_of(rxm_ep->util_ep.domain, struct rxm_domain, util_domain); + ret = fi_endpoint(rxm_domain->msg_domain, msg_info, &msg_ep, context); if (ret) { FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, @@ -858,8 +898,14 @@ static int rxm_msg_ep_open(struct rxm_ep *rxm_ep, struct fi_info *msg_info, goto err; } + ret = rxm_domain->flow_ctrl_ops->enable(msg_ep); + if (!ret) { + rxm_domain->flow_ctrl_ops->set_threshold( + msg_ep, rxm_ep->msg_info->rx_attr->size / 2); + } + if (!rxm_ep->srx_ctx) { - ret = rxm_msg_ep_prepost_recv(rxm_ep, msg_ep); + ret = rxm_prepost_recv(rxm_ep, msg_ep); if (ret) goto err; } @@ -871,81 +917,6 @@ static int rxm_msg_ep_open(struct rxm_ep *rxm_ep, struct fi_info *msg_info, return ret; } -static void rxm_conn_close(struct rxm_cmap_handle *handle) -{ - struct rxm_conn *rxm_conn = - container_of(handle, struct rxm_conn, handle); - - if (!rxm_conn->msg_ep) - return; - - if (handle->cmap->attr.serial_access) { - if (fi_close(&rxm_conn->msg_ep->fid)) { - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, - "Unable to close msg_ep\n"); - } else { - FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, - "Closed msg_ep\n"); - } - } else { - rxm_conn->saved_msg_ep = rxm_conn->msg_ep; - FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, - "Saved MSG EP fid for further deletion in main thread\n"); - } - rxm_conn->msg_ep = NULL; -} - -static void rxm_conn_save(struct rxm_cmap_handle *handle) -{ - struct rxm_conn *rxm_conn = - container_of(handle, struct rxm_conn, handle); - - if (!rxm_conn->msg_ep) - return; - - rxm_conn->saved_msg_ep = rxm_conn->msg_ep; - FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, - "Saved MSG EP fid for further deletion\n"); - rxm_conn->msg_ep = NULL; -} - -static void rxm_conn_close_saved(struct rxm_cmap_handle *handle) -{ - struct rxm_conn *rxm_conn = - container_of(handle, struct rxm_conn, handle); - - if (!rxm_conn->saved_msg_ep) - return; - - /* If user doesn't 
guarantee for serializing access to cmap - * objects, postpone the closing of the saved MSG EP for - * further deletion in main thread */ - if (handle->cmap->attr.serial_access) { - if (fi_close(&rxm_conn->saved_msg_ep->fid)) { - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, - "Unable to close saved msg_ep\n"); - } else { - FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, - "Closed saved msg_ep\n"); - } - rxm_conn->saved_msg_ep = NULL; - } -} - -static void rxm_conn_connected_handler(struct rxm_cmap_handle *handle) -{ - struct rxm_conn *rxm_conn = container_of(handle, struct rxm_conn, handle); - - if (!rxm_conn->saved_msg_ep) - return; - /* Assuming fi_close also shuts down the connection gracefully if the - * endpoint is in connected state */ - if (fi_close(&rxm_conn->saved_msg_ep->fid)) - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "Unable to close saved msg_ep\n"); - FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "Closed saved msg_ep\n"); - rxm_conn->saved_msg_ep = NULL; -} - static int rxm_conn_reprocess_directed_recvs(struct rxm_recv_queue *recv_queue) { struct rxm_rx_buf *rx_buf; @@ -976,7 +947,7 @@ static int rxm_conn_reprocess_directed_recvs(struct rxm_recv_queue *recv_queue) rx_buf->recv_entry = container_of(entry, struct rxm_recv_entry, entry); - ret = rxm_cq_handle_rx_buf(rx_buf); + ret = rxm_handle_rx_buf(rx_buf); if (ret) { err_entry.op_context = rx_buf; err_entry.flags = rx_buf->recv_entry->comp_flags; @@ -990,11 +961,10 @@ static int rxm_conn_reprocess_directed_recvs(struct rxm_recv_queue *recv_queue) if (rx_buf->ep->util_ep.flags & OFI_CNTR_ENABLED) rxm_cntr_incerr(rx_buf->ep->util_ep.rx_cntr); - rxm_rx_buf_finish(rx_buf); + rxm_rx_buf_free(rx_buf); if (!(rx_buf->recv_entry->flags & FI_MULTI_RECV)) - rxm_recv_entry_release(recv_queue, - rx_buf->recv_entry); + rxm_recv_entry_release(rx_buf->recv_entry); } count++; } @@ -1004,12 +974,12 @@ static int rxm_conn_reprocess_directed_recvs(struct rxm_recv_queue *recv_queue) static void rxm_conn_av_updated_handler(struct rxm_cmap_handle *handle) { - struct rxm_ep *rxm_ep = container_of(handle->cmap->ep, struct rxm_ep, util_ep); + struct rxm_ep *ep = handle->cmap->ep; int count = 0; - if (rxm_ep->rxm_info->caps & FI_DIRECTED_RECV) { - count += rxm_conn_reprocess_directed_recvs(&rxm_ep->recv_queue); - count += rxm_conn_reprocess_directed_recvs(&rxm_ep->trecv_queue); + if (ep->rxm_info->caps & FI_DIRECTED_RECV) { + count += rxm_conn_reprocess_directed_recvs(&ep->recv_queue); + count += rxm_conn_reprocess_directed_recvs(&ep->trecv_queue); FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "Reprocessed directed recvs - %d\n", count); @@ -1018,13 +988,13 @@ rxm_conn_av_updated_handler(struct rxm_cmap_handle *handle) static struct rxm_cmap_handle *rxm_conn_alloc(struct rxm_cmap *cmap) { - struct rxm_ep *rxm_ep = container_of(cmap->ep, struct rxm_ep, util_ep); - struct rxm_conn *rxm_conn = calloc(1, sizeof(*rxm_conn)); + struct rxm_conn *rxm_conn; - if (OFI_UNLIKELY(!rxm_conn)) + rxm_conn = calloc(1, sizeof(*rxm_conn)); + if (!rxm_conn) return NULL; - if (rxm_conn_res_alloc(rxm_ep, rxm_conn)) { + if (rxm_conn_res_alloc(cmap->ep, rxm_conn)) { free(rxm_conn); return NULL; } @@ -1081,11 +1051,10 @@ rxm_conn_verify_cm_data(union rxm_cm_data *remote_cm_data, static size_t rxm_conn_get_rx_size(struct rxm_ep *rxm_ep, struct fi_info *msg_info) { - /* TODO add env variable to tune the value for shared context case */ if (msg_info->ep_attr->rx_ctx_cnt == FI_SHARED_CONTEXT) return MAX(MIN(16, msg_info->rx_attr->size), (msg_info->rx_attr->size / - rxm_ep->util_ep.av->count)); + ofi_av_size(rxm_ep->util_ep.av))); 
else return msg_info->rx_attr->size; } @@ -1101,7 +1070,7 @@ rxm_msg_process_connreq(struct rxm_ep *rxm_ep, struct fi_info *msg_info, .endianness = ofi_detect_endianness(), .ctrl_version = RXM_CTRL_VERSION, .op_version = RXM_OP_VERSION, - .eager_size = rxm_ep->rxm_info->tx_attr->inject_size, + .eager_size = rxm_eager_limit, }, }; union rxm_cm_data reject_cm_data = { @@ -1112,7 +1081,7 @@ rxm_msg_process_connreq(struct rxm_ep *rxm_ep, struct fi_info *msg_info, }; struct rxm_cmap_handle *handle; struct sockaddr_storage remote_pep_addr; - int ret, rv; + int ret; assert(sizeof(uint32_t) == sizeof(cm_data.accept.rx_size)); assert(msg_info->rx_attr->size <= (uint32_t)-1); @@ -1159,43 +1128,68 @@ rxm_msg_process_connreq(struct rxm_ep *rxm_ep, struct fi_info *msg_info, rxm_cmap_del_handle(&rxm_conn->handle); err1: FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, - "Rejecting incoming connection request (reject reason: %d)\n", + "rejecting incoming connection request (reject reason: %d)\n", (enum rxm_cmap_reject_reason)reject_cm_data.reject.reason); - rv = fi_reject(rxm_ep->msg_pep, msg_info->handle, - &reject_cm_data.reject, sizeof(reject_cm_data.reject)); - if (rv) - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, - "Unable to reject incoming connection: %s (%d)\n", - fi_strerror(-rv), -rv); + fi_reject(rxm_ep->msg_pep, msg_info->handle, + &reject_cm_data.reject, sizeof(reject_cm_data.reject)); return ret; } +static void rxm_flush_msg_cq(struct rxm_ep *rxm_ep) +{ + struct fi_cq_data_entry comp; + int ret; + do { + ret = fi_cq_read(rxm_ep->msg_cq, &comp, 1); + if (ret > 0) { + ret = rxm_handle_comp(rxm_ep, &comp); + if (OFI_UNLIKELY(ret)) { + rxm_cq_write_error_all(rxm_ep, ret); + } else { + ret = 1; + } + } else if (ret == -FI_EAVAIL) { + rxm_handle_comp_error(rxm_ep); + ret = 1; + } else if (ret < 0 && ret != -FI_EAGAIN) { + rxm_cq_write_error_all(rxm_ep, ret); + } + } while (ret > 0); +} + static int rxm_conn_handle_notify(struct fi_eq_entry *eq_entry) { struct rxm_cmap *cmap; struct rxm_cmap_handle *handle; - assert((enum rxm_cmap_signal)eq_entry->data); - - if ((enum rxm_cmap_signal)eq_entry->data == RXM_CMAP_FREE) { - handle = eq_entry->context; - assert(handle->state == RXM_CMAP_SHUTDOWN); - FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "freeing handle: %p\n", handle); - cmap = handle->cmap; - if (handle->peer) { - dlist_remove(&handle->peer->entry); - free(handle->peer); - handle->peer = NULL; - } else { - cmap->handles_av[handle->fi_addr] = 0; - } - rxm_conn_free(handle); - return 0; - } else { - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "unknown cmap signal\n"); - assert(0); + FI_INFO(&rxm_prov, FI_LOG_EP_CTRL, "notify event %" PRIu64 "\n", + eq_entry->data); + + if ((enum rxm_cmap_signal) eq_entry->data != RXM_CMAP_FREE) return -FI_EOTHER; + + handle = eq_entry->context; + assert(handle->state == RXM_CMAP_SHUTDOWN); + FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "freeing handle: %p\n", handle); + cmap = handle->cmap; + + rxm_conn_close(handle); + + // after closing the connection, we need to flush any dangling references to the + // handle from msg_cq entries that have not been cleaned up yet, otherwise we + // could run into problems during CQ cleanup. these entries will be errored so + // keep reading through EAVAIL. 
+ rxm_flush_msg_cq(cmap->ep); + + if (handle->peer) { + dlist_remove(&handle->peer->entry); + free(handle->peer); + handle->peer = NULL; + } else { + cmap->handles_av[handle->fi_addr] = NULL; } + rxm_conn_free(handle); + return 0; } static void rxm_conn_wake_up_wait_obj(struct rxm_ep *rxm_ep) @@ -1207,58 +1201,58 @@ static void rxm_conn_wake_up_wait_obj(struct rxm_ep *rxm_ep) } static int -rxm_conn_handle_event(struct rxm_ep *rxm_ep, struct rxm_msg_eq_entry *entry) +rxm_conn_handle_reject(struct rxm_ep *rxm_ep, struct rxm_msg_eq_entry *entry) { union rxm_cm_data *cm_data = entry->err_entry.err_data; - enum rxm_cmap_reject_reason reject_reason; - - if (entry->rd == -FI_ECONNREFUSED) { - if (OFI_UNLIKELY(entry->err_entry.err_data_size != - sizeof(cm_data->reject))) { - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: " - "no reject error data (cm_data) was found " - "(data length expected: %zu found: %zu)\n", - sizeof(cm_data->reject), - entry->err_entry.err_data_size); - goto err; - } - assert(cm_data); - if (cm_data->reject.version != RXM_CM_DATA_VERSION) { - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: " - "cm data version mismatch (local: %" PRIu8 - ", remote: %" PRIu8 ")\n", - (uint8_t) RXM_CM_DATA_VERSION, - cm_data->reject.version); - goto err; - } - reject_reason = cm_data->reject.reason; + if (!cm_data || entry->err_entry.err_data_size != sizeof(cm_data->reject)) { + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: " + "no reject error data (cm_data) was found " + "(data length expected: %zu found: %zu)\n", + sizeof(cm_data->reject), + entry->err_entry.err_data_size); + return -FI_EOTHER; + } - if (reject_reason == RXM_CMAP_REJECT_GENUINE) { - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: " - "remote peer didn't accept the connection\n"); - FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: " - "(reason: RXM_CMAP_REJECT_GENUINE)\n"); - OFI_EQ_STRERROR(&rxm_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, - rxm_ep->msg_eq, &entry->err_entry); - } else if (reject_reason == RXM_CMAP_REJECT_SIMULT_CONN) { - FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: " - "(reason: RXM_CMAP_REJECT_SIMULT_CONN)\n"); - } else { - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: " - "received unknown reject reason: %d\n", - reject_reason); - } - rxm_cmap_process_reject(rxm_ep->cmap, entry->context, - reject_reason); - return 0; + if (cm_data->reject.version != RXM_CM_DATA_VERSION) { + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: " + "cm data version mismatch (local: %" PRIu8 + ", remote: %" PRIu8 ")\n", + (uint8_t) RXM_CM_DATA_VERSION, + cm_data->reject.version); + return -FI_EOTHER; } - switch(entry->event) { + if (cm_data->reject.reason == RXM_CMAP_REJECT_GENUINE) { + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: " + "remote peer didn't accept the connection\n"); + FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: " + "(reason: RXM_CMAP_REJECT_GENUINE)\n"); + OFI_EQ_STRERROR(&rxm_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, + rxm_ep->msg_eq, &entry->err_entry); + } else if (cm_data->reject.reason == RXM_CMAP_REJECT_SIMULT_CONN) { + FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: " + "(reason: RXM_CMAP_REJECT_SIMULT_CONN)\n"); + } else { + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "connection reject: " + "received unknown reject reason: %d\n", + cm_data->reject.reason); + } + rxm_cmap_process_reject(rxm_ep->cmap, entry->context, + cm_data->reject.reason); + return 0; +} + +static int +rxm_conn_handle_event(struct rxm_ep *rxm_ep, struct rxm_msg_eq_entry 
*entry) +{ + if (entry->rd == -FI_ECONNREFUSED) + return rxm_conn_handle_reject(rxm_ep, entry); + + switch (entry->event) { case FI_NOTIFY: - if (rxm_conn_handle_notify((struct fi_eq_entry *)&entry->cm_entry)) - goto err; - break; + return rxm_conn_handle_notify((struct fi_eq_entry *) + &entry->cm_entry); case FI_CONNREQ: FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "Got new connection\n"); if ((size_t)entry->rd != RXM_CM_ENTRY_SZ) { @@ -1268,21 +1262,20 @@ rxm_conn_handle_event(struct rxm_ep *rxm_ep, struct rxm_msg_eq_entry *entry) FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "Received CM entry " "size (%zd) not matching expected (%zu)\n", entry->rd, RXM_CM_ENTRY_SZ); - goto err; + return -FI_EOTHER; } rxm_msg_process_connreq(rxm_ep, entry->cm_entry.info, - (union rxm_cm_data *)entry->cm_entry.data); + (union rxm_cm_data *) entry->cm_entry.data); fi_freeinfo(entry->cm_entry.info); break; case FI_CONNECTED: assert(entry->cm_entry.fid->context); FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, - "Connection successful\n"); - cm_data = (void *)entry->cm_entry.data; + "connection successful\n"); rxm_cmap_process_connect(rxm_ep->cmap, - entry->cm_entry.fid->context, - ((entry->rd - sizeof(entry->cm_entry)) ? - cm_data : NULL)); + entry->cm_entry.fid->context, + entry->rd - sizeof(entry->cm_entry) > 0 ? + (union rxm_cm_data *) entry->cm_entry.data : NULL); rxm_conn_wake_up_wait_obj(rxm_ep); break; case FI_SHUTDOWN: @@ -1294,11 +1287,9 @@ rxm_conn_handle_event(struct rxm_ep *rxm_ep, struct rxm_msg_eq_entry *entry) default: FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "Unknown event: %u\n", entry->event); - goto err; + return -FI_EOTHER; } return 0; -err: - return -FI_EOTHER; } static ssize_t rxm_eq_sread(struct rxm_ep *rxm_ep, size_t len, @@ -1308,6 +1299,11 @@ static ssize_t rxm_eq_sread(struct rxm_ep *rxm_ep, size_t len, int once = 1; do { + /* TODO convert this to poll + fi_eq_read so that we can grab + * rxm_ep lock before reading the EQ. This is needed to avoid + * processing events / error entries from closed MSG EPs. This + * can be done only for non-Windows OSes as Windows doesn't + * have poll for a generic file descriptor. */ rd = fi_eq_sread(rxm_ep->msg_eq, &entry->event, &entry->cm_entry, len, -1, 0); if (rd >= 0) @@ -1320,11 +1316,15 @@ static ssize_t rxm_eq_sread(struct rxm_ep *rxm_ep, size_t len, if (rd != -FI_EAVAIL) { FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, - "Unable to fi_eq_sread: %zu\n", rd); + "unable to fi_eq_sread: %s (%zd)\n", + fi_strerror(-rd), -rd); return rd; } - return rxm_eq_readerr(rxm_ep, entry); + ofi_ep_lock_acquire(&rxm_ep->util_ep); + rd = rxm_eq_readerr(rxm_ep, entry); + ofi_ep_lock_release(&rxm_ep->util_ep); + return rd; } static inline int rxm_conn_eq_event(struct rxm_ep *rxm_ep, @@ -1332,11 +1332,6 @@ static inline int rxm_conn_eq_event(struct rxm_ep *rxm_ep, { int ret; - if (entry->event == FI_NOTIFY && (enum rxm_cmap_signal) - ((struct fi_eq_entry *) &entry->cm_entry)->data == RXM_CMAP_EXIT) { - FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "Closing CM thread\n"); - return -1; - } ofi_ep_lock_acquire(&rxm_ep->util_ep); ret = rxm_conn_handle_event(rxm_ep, entry) ? 
-1 : 0; ofi_ep_lock_release(&rxm_ep->util_ep); @@ -1346,133 +1341,110 @@ static inline int rxm_conn_eq_event(struct rxm_ep *rxm_ep, static void *rxm_conn_progress(void *arg) { - struct rxm_ep *rxm_ep = container_of(arg, struct rxm_ep, util_ep); + struct rxm_ep *ep = container_of(arg, struct rxm_ep, util_ep); struct rxm_msg_eq_entry *entry; entry = alloca(RXM_MSG_EQ_ENTRY_SZ); - if (!entry) { - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, - "Unable to allocate memory!\n"); + if (!entry) return NULL; - } - FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "Starting conn event handler\n"); - while (1) { + FI_INFO(&rxm_prov, FI_LOG_EP_CTRL, "Starting auto-progress thread\n"); + + ofi_ep_lock_acquire(&ep->util_ep); + while (ep->do_progress) { + ofi_ep_lock_release(&ep->util_ep); memset(entry, 0, RXM_MSG_EQ_ENTRY_SZ); - entry->rd = rxm_eq_sread(rxm_ep, RXM_CM_ENTRY_SZ, entry); - if (entry->rd < 0 && entry->rd != -FI_ECONNREFUSED) - break; - if (rxm_conn_eq_event(rxm_ep, entry)) - break; + entry->rd = rxm_eq_sread(ep, RXM_CM_ENTRY_SZ, entry); + if (entry->rd >= 0 || entry->rd == -FI_ECONNREFUSED) + rxm_conn_eq_event(ep, entry); + + ofi_ep_lock_acquire(&ep->util_ep); } + ofi_ep_lock_release(&ep->util_ep); - FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, "Stoping conn event handler\n"); + FI_INFO(&rxm_prov, FI_LOG_EP_CTRL, "Stopping auto-progress thread\n"); return NULL; } static inline int rxm_conn_auto_progress_eq(struct rxm_ep *rxm_ep, struct rxm_msg_eq_entry *entry) { - while (1) { - memset(entry, 0, RXM_MSG_EQ_ENTRY_SZ); - entry->rd = rxm_eq_read(rxm_ep, RXM_CM_ENTRY_SZ, entry); - if (OFI_UNLIKELY(!entry->rd || entry->rd == -FI_EAGAIN)) - return FI_SUCCESS; - if (entry->rd < 0 && - entry->rd != -FI_ECONNREFUSED) - break; - if (rxm_conn_eq_event(rxm_ep, entry)) - break; - } - return -1; + memset(entry, 0, RXM_MSG_EQ_ENTRY_SZ); + + ofi_ep_lock_acquire(&rxm_ep->util_ep); + entry->rd = rxm_eq_read(rxm_ep, RXM_CM_ENTRY_SZ, entry); + ofi_ep_lock_release(&rxm_ep->util_ep); + + if (!entry->rd || entry->rd == -FI_EAGAIN) + return FI_SUCCESS; + if (entry->rd < 0 && entry->rd != -FI_ECONNREFUSED) + return entry->rd; + + return rxm_conn_eq_event(rxm_ep, entry); } -/* Atomic auto progress of EQ and CQ */ -static int rxm_conn_atomic_progress_eq_cq(struct rxm_ep *rxm_ep, - struct rxm_msg_eq_entry *entry) +static void *rxm_conn_atomic_progress(void *arg) { - struct rxm_fabric *rxm_fabric; + struct rxm_ep *ep = container_of(arg, struct rxm_ep, util_ep); + struct rxm_msg_eq_entry *entry; + struct rxm_fabric *fabric; struct fid *fids[2] = { - &rxm_ep->msg_eq->fid, - &rxm_ep->msg_cq->fid, + &ep->msg_eq->fid, + &ep->msg_cq->fid, }; struct pollfd fds[2] = { {.events = POLLIN}, {.events = POLLIN}, }; - int again; int ret; - rxm_fabric = container_of(rxm_ep->util_ep.domain->fabric, - struct rxm_fabric, util_fabric); + entry = alloca(RXM_MSG_EQ_ENTRY_SZ); + if (!entry) + return NULL; + + fabric = container_of(ep->util_ep.domain->fabric, + struct rxm_fabric, util_fabric); - ret = fi_control(&rxm_ep->msg_eq->fid, FI_GETWAIT, &fds[0].fd); + ret = fi_control(&ep->msg_eq->fid, FI_GETWAIT, &fds[0].fd); if (ret) { FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, - "unable to get MSG EQ wait fd: %d\n", ret); - goto exit; + "unable to get msg EQ fd: %s\n", fi_strerror(ret)); + return NULL; } - ret = fi_control(&rxm_ep->msg_cq->fid, FI_GETWAIT, &fds[1].fd); + ret = fi_control(&ep->msg_cq->fid, FI_GETWAIT, &fds[1].fd); if (ret) { FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, - "unable to get MSG CQ wait fd: %d\n", ret); - goto exit; + "unable to get msg CQ fd: %s\n", fi_strerror(ret)); + 
return NULL; } - memset(entry, 0, RXM_MSG_EQ_ENTRY_SZ); - - while(1) { - ofi_ep_lock_acquire(&rxm_ep->util_ep); - again = fi_trywait(rxm_fabric->msg_fabric, fids, 2); - ofi_ep_lock_release(&rxm_ep->util_ep); + FI_INFO(&rxm_prov, FI_LOG_EP_CTRL, "Starting auto-progress thread\n"); + ofi_ep_lock_acquire(&ep->util_ep); + while (ep->do_progress) { + ofi_ep_lock_release(&ep->util_ep); + ret = fi_trywait(fabric->msg_fabric, fids, 2); - if (!again) { + if (!ret) { fds[0].revents = 0; fds[1].revents = 0; ret = poll(fds, 2, -1); - if (OFI_UNLIKELY(ret == -1)) { - if (errno == EINTR) - continue; + if (ret == -1 && errno != EINTR) { FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, - "Select error %d, closing CM thread\n", - errno); - goto exit; + "Select error %s, closing CM thread\n", + strerror(errno)); + goto out; } } - if (again || fds[0].revents & POLLIN) { - if (rxm_conn_auto_progress_eq(rxm_ep, entry)) - goto exit; - } - if (again || fds[1].revents & POLLIN) - rxm_ep_progress(&rxm_ep->util_ep); - } -exit: - return -1; -} - -static void *rxm_conn_atomic_progress(void *arg) -{ - struct rxm_ep *rxm_ep = container_of(arg, struct rxm_ep, util_ep); - struct rxm_msg_eq_entry *entry; - - assert(rxm_ep->msg_eq); - entry = alloca(RXM_MSG_EQ_ENTRY_SZ); - if (!entry) { - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, - "Unable to allocate memory!\n"); - return NULL; + rxm_conn_auto_progress_eq(ep, entry); + ep->util_ep.progress(&ep->util_ep); + ofi_ep_lock_acquire(&ep->util_ep); } + ofi_ep_lock_release(&ep->util_ep); - FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, - "Starting CM conn thread with atomic AUTO_PROGRESS\n"); - - rxm_conn_atomic_progress_eq_cq(rxm_ep, entry); - - FI_DBG(&rxm_prov, FI_LOG_EP_CTRL, - "Stoping CM conn thread with atomic AUTO_PROGRESS\n"); - +out: + FI_INFO(&rxm_prov, FI_LOG_EP_CTRL, "Stopping auto progress thread\n"); return NULL; } @@ -1509,73 +1481,72 @@ static int rxm_prepare_cm_data(struct fid_pep *pep, struct rxm_cmap_handle *hand } static int -rxm_conn_connect(struct util_ep *util_ep, struct rxm_cmap_handle *handle, +rxm_conn_connect(struct rxm_ep *ep, struct rxm_cmap_handle *handle, const void *addr) { int ret; - struct rxm_ep *rxm_ep = - container_of(util_ep, struct rxm_ep, util_ep); - struct rxm_conn *rxm_conn = - container_of(handle, struct rxm_conn, handle); + struct rxm_conn *rxm_conn = container_of(handle, struct rxm_conn, handle); union rxm_cm_data cm_data = { .connect = { .version = RXM_CM_DATA_VERSION, .ctrl_version = RXM_CTRL_VERSION, .op_version = RXM_OP_VERSION, .endianness = ofi_detect_endianness(), - .eager_size = rxm_ep->rxm_info->tx_attr->inject_size, + .eager_size = rxm_eager_limit, }, }; assert(sizeof(uint32_t) == sizeof(cm_data.connect.eager_size)); assert(sizeof(uint32_t) == sizeof(cm_data.connect.rx_size)); - assert(rxm_ep->rxm_info->tx_attr->inject_size <= (uint32_t)-1); - assert(rxm_ep->msg_info->rx_attr->size <= (uint32_t)-1); + assert(ep->msg_info->rx_attr->size <= (uint32_t) -1); - free(rxm_ep->msg_info->dest_addr); - rxm_ep->msg_info->dest_addrlen = rxm_ep->msg_info->src_addrlen; + free(ep->msg_info->dest_addr); + ep->msg_info->dest_addrlen = ep->msg_info->src_addrlen; - rxm_ep->msg_info->dest_addr = mem_dup(addr, rxm_ep->msg_info->dest_addrlen); - if (!rxm_ep->msg_info->dest_addr) + ep->msg_info->dest_addr = mem_dup(addr, ep->msg_info->dest_addrlen); + if (!ep->msg_info->dest_addr) { + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "mem_dup failed, len %zu\n", + ep->msg_info->dest_addrlen); return -FI_ENOMEM; + } - ret = rxm_msg_ep_open(rxm_ep, rxm_ep->msg_info, rxm_conn, &rxm_conn->handle); + 
ret = rxm_msg_ep_open(ep, ep->msg_info, rxm_conn, &rxm_conn->handle); if (ret) return ret; /* We have to send passive endpoint's address to the server since the * address from which connection request would be sent would have a * different port. */ - ret = rxm_prepare_cm_data(rxm_ep->msg_pep, &rxm_conn->handle, &cm_data); + ret = rxm_prepare_cm_data(ep->msg_pep, &rxm_conn->handle, &cm_data); if (ret) goto err; - cm_data.connect.rx_size = rxm_conn_get_rx_size(rxm_ep, rxm_ep->msg_info); + cm_data.connect.rx_size = rxm_conn_get_rx_size(ep, ep->msg_info); - ret = fi_connect(rxm_conn->msg_ep, rxm_ep->msg_info->dest_addr, + ret = fi_connect(rxm_conn->msg_ep, ep->msg_info->dest_addr, &cm_data, sizeof(cm_data)); if (ret) { FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "unable to connect msg_ep\n"); goto err; } return 0; + err: fi_close(&rxm_conn->msg_ep->fid); rxm_conn->msg_ep = NULL; return ret; } -static int rxm_conn_signal(struct util_ep *util_ep, void *context, +static int rxm_conn_signal(struct rxm_ep *ep, void *context, enum rxm_cmap_signal signal) { - struct rxm_ep *rxm_ep = container_of(util_ep, struct rxm_ep, util_ep); struct fi_eq_entry entry = {0}; ssize_t rd; entry.context = context; - entry.data = (uint64_t)signal; + entry.data = (uint64_t) signal; - rd = fi_eq_write(rxm_ep->msg_eq, FI_NOTIFY, &entry, sizeof(entry), 0); + rd = fi_eq_write(ep->msg_eq, FI_NOTIFY, &entry, sizeof(entry), 0); if (rd != sizeof(entry)) { FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "Unable to signal\n"); return (int)rd; @@ -1607,12 +1578,6 @@ int rxm_conn_cmap_alloc(struct rxm_ep *rxm_ep) attr.name = name; - if (rxm_ep->util_ep.domain->threading == FI_THREAD_DOMAIN && - rxm_ep->util_ep.domain->data_progress == FI_PROGRESS_MANUAL) - attr.serial_access = 1; - else - attr.serial_access = 0; - ret = rxm_cmap_alloc(rxm_ep, &attr); if (ret) FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, diff --git a/prov/rxm/src/rxm_cq.c b/prov/rxm/src/rxm_cq.c index 0c38f6f1236..6cdf7bba1fe 100644 --- a/prov/rxm/src/rxm_cq.c +++ b/prov/rxm/src/rxm_cq.c @@ -1,7 +1,9 @@ /* - * Copyright (c) 2013-2016 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2020 Intel Corporation. All rights reserved. * Copyright (c) 2018 Cray Inc. All rights reserved. + * Copyright (c) 2018 System Fabric Works, Inc. All rights reserved. * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -39,12 +41,15 @@ #include "ofi.h" #include "ofi_iov.h" #include "ofi_atomic.h" +#include #include "rxm.h" size_t rxm_cm_progress_interval; +size_t rxm_cq_eq_fairness; -static const char *rxm_cq_strerror(struct fid_cq *cq_fid, int prov_errno, +static const char * +rxm_cq_strerror(struct fid_cq *cq_fid, int prov_errno, const void *err_data, char *buf, size_t len) { struct util_cq *cq; @@ -58,82 +63,63 @@ static const char *rxm_cq_strerror(struct fid_cq *cq_fid, int prov_errno, return fi_cq_strerror(rxm_ep->msg_cq, prov_errno, err_data, buf, len); } -/* Get a match_iov derived from iov whose size matches given length */ -static int rxm_match_iov(const struct iovec *iov, void **desc, - uint8_t count, uint64_t offset, size_t match_len, - struct rxm_iov *match_iov) +static struct rxm_rx_buf * +rxm_rx_buf_alloc(struct rxm_ep *rxm_ep, struct fid_ep *rx_ep, bool repost) { - uint8_t i; - - assert(count <= RXM_IOV_LIMIT); - - for (i = 0; i < count; i++) { - if (offset >= iov[i].iov_len) { - offset -= iov[i].iov_len; - continue; - } + struct rxm_rx_buf *rx_buf; - match_iov->iov[i].iov_base = (char *)iov[i].iov_base + offset; - match_iov->iov[i].iov_len = MIN(iov[i].iov_len - offset, match_len); - if (desc) - match_iov->desc[i] = desc[i]; + rx_buf = ofi_buf_alloc(rxm_ep->buf_pools[RXM_BUF_POOL_RX].pool); + if (!rx_buf) + return NULL; - match_len -= match_iov->iov[i].iov_len; - if (!match_len) - break; - offset = 0; - } + assert(rx_buf->ep == rxm_ep); + rx_buf->hdr.state = RXM_RX; + rx_buf->rx_ep = rx_ep; + rx_buf->repost = repost; - if (match_len) { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "Given iov size (%zu) < match_len (remained match_len = %zu)!\n", - ofi_total_iov_len(iov, count), match_len); - return -FI_ETOOSMALL; + if (!rxm_ep->srx_ctx) { + rx_buf->conn = container_of(rx_ep->fid.context, + struct rxm_conn, handle); } - match_iov->count = i + 1; - return FI_SUCCESS; + return rx_buf; } -static inline uint64_t -rxm_cq_get_rx_comp_and_op_flags(struct rxm_rx_buf *rx_buf) +static void rxm_repost_new_rx(struct rxm_rx_buf *rx_buf) { - return (rx_buf->pkt.hdr.flags | ofi_rx_flags[rx_buf->pkt.hdr.op]); -} + struct rxm_rx_buf *new_rx_buf; -static inline uint64_t -rxm_cq_get_rx_comp_flags(struct rxm_rx_buf *rx_buf) -{ - return (rx_buf->pkt.hdr.flags); + if (!rx_buf->repost) + return; + + new_rx_buf = rxm_rx_buf_alloc(rx_buf->ep, rx_buf->rx_ep, true); + if (!new_rx_buf) + return; + + rx_buf->repost = false; + dlist_insert_tail(&new_rx_buf->repost_entry, + &new_rx_buf->ep->repost_ready_list); } -static int rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf) +static void rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf) { uint64_t flags; char *data; - if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg && + if ((rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_seg) && rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) != RXM_SAR_SEG_FIRST) { dlist_insert_tail(&rx_buf->unexp_msg.entry, &rx_buf->conn->sar_deferred_rx_msg_list); - rx_buf = rxm_rx_buf_alloc(rx_buf->ep, rx_buf->msg_ep, 1); - if (OFI_UNLIKELY(!rx_buf)) { - FI_WARN(&rxm_prov, FI_LOG_EP_DATA, - "ran out of buffers from RX buffer pool\n"); - return -FI_ENOMEM; - } - dlist_insert_tail(&rx_buf->repost_entry, - &rx_buf->ep->repost_ready_list); - - return 0; + /* repost a new buffer since SAR takes some time to complete */ + rxm_repost_new_rx(rx_buf); } - flags = rxm_cq_get_rx_comp_and_op_flags(rx_buf); + flags = (rx_buf->pkt.hdr.flags | ofi_rx_flags[rx_buf->pkt.hdr.op]); if (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_eager) flags |= 
FI_MORE; - if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_rndv) + if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_rndv_req) data = rxm_pkt_rndv_data(&rx_buf->pkt); else data = rx_buf->pkt.data; @@ -142,11 +128,11 @@ static int rxm_finish_buf_recv(struct rxm_rx_buf *rx_buf) "length: %" PRIu64 "\n", rx_buf->pkt.hdr.size); rx_buf->recv_context.ep = &rx_buf->ep->util_ep.ep_fid; - return rxm_cq_write_recv_comp(rx_buf, &rx_buf->recv_context, flags, - rx_buf->pkt.hdr.size, data); + rxm_cq_write_recv_comp(rx_buf, &rx_buf->recv_context, flags, + rx_buf->pkt.hdr.size, data); } -static int rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len) +static void rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len) { int ret; @@ -159,147 +145,90 @@ static int rxm_cq_write_error_trunc(struct rxm_rx_buf *rx_buf, size_t done_len) ret = ofi_cq_write_error_trunc(rx_buf->ep->util_ep.rx_cq, rx_buf->recv_entry->context, rx_buf->recv_entry->comp_flags | - rxm_cq_get_rx_comp_flags(rx_buf), + rx_buf->pkt.hdr.flags, rx_buf->pkt.hdr.size, rx_buf->recv_entry->rxm_iov.iov[0].iov_base, rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag, rx_buf->pkt.hdr.size - done_len); - if (OFI_UNLIKELY(ret)) { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to write recv error CQ\n"); - return ret; + if (ret) { + FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to write recv error CQ\n"); + assert(0); } - return 0; } -static int rxm_finish_recv(struct rxm_rx_buf *rx_buf, size_t done_len) +static void rxm_finish_recv(struct rxm_rx_buf *rx_buf, size_t done_len) { - int ret; struct rxm_recv_entry *recv_entry = rx_buf->recv_entry; - if (OFI_UNLIKELY(done_len < rx_buf->pkt.hdr.size)) { - ret = rxm_cq_write_error_trunc(rx_buf, done_len); - if (ret) - return ret; - } else { - if (rx_buf->recv_entry->flags & FI_COMPLETION || - rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV) { - ret = rxm_cq_write_recv_comp( - rx_buf, rx_buf->recv_entry->context, - rx_buf->recv_entry->comp_flags | - rxm_cq_get_rx_comp_flags(rx_buf), - rx_buf->pkt.hdr.size, - rx_buf->recv_entry->rxm_iov.iov[0].iov_base); - if (ret) - return ret; - } - ofi_ep_rx_cntr_inc(&rx_buf->ep->util_ep); + if (done_len < rx_buf->pkt.hdr.size) { + rxm_cq_write_error_trunc(rx_buf, done_len); + goto release; } - if (rx_buf->recv_entry->flags & FI_MULTI_RECV) { - struct rxm_iov rxm_iov; - size_t recv_size = rx_buf->pkt.hdr.size; - struct rxm_ep *rxm_ep = rx_buf->ep; - - rxm_rx_buf_finish(rx_buf); - - recv_entry->total_len -= recv_size; - - if (recv_entry->total_len <= rxm_ep->min_multi_recv_size) { - FI_DBG(&rxm_prov, FI_LOG_CQ, - "Buffer %p has been completely consumed. 
" - "Reporting Multi-Recv completion\n", - recv_entry->multi_recv.buf); - ret = rxm_cq_write_multi_recv_comp(rxm_ep, recv_entry); - if (OFI_UNLIKELY(ret)) { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to write FI_MULTI_RECV completion\n"); - return ret; - } - /* Since buffer is elapsed, release recv_entry */ - rxm_recv_entry_release(recv_entry->recv_queue, - recv_entry); - return ret; - } - - FI_DBG(&rxm_prov, FI_LOG_CQ, - "Repost Multi-Recv entry: %p " - "consumed len = %zu, remain len = %zu\n", - recv_entry, recv_size, recv_entry->total_len); - - rxm_iov = recv_entry->rxm_iov; - ret = rxm_match_iov(/* prev iovecs */ - rxm_iov.iov, rxm_iov.desc, rxm_iov.count, - recv_size, /* offset */ - recv_entry->total_len, /* match_len */ - &recv_entry->rxm_iov); /* match_iov */ - if (OFI_UNLIKELY(ret)) - return ret; - - return rxm_process_recv_entry(recv_entry->recv_queue, recv_entry); - } else { - rxm_rx_buf_finish(rx_buf); - rxm_recv_entry_release(recv_entry->recv_queue, recv_entry); + if (rx_buf->recv_entry->flags & FI_COMPLETION || + rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV) { + rxm_cq_write_recv_comp(rx_buf, rx_buf->recv_entry->context, + rx_buf->recv_entry->comp_flags | + rx_buf->pkt.hdr.flags | + (rx_buf->recv_entry->flags & FI_MULTI_RECV), + rx_buf->pkt.hdr.size, + rx_buf->recv_entry->rxm_iov. + iov[0].iov_base); } + ofi_ep_rx_cntr_inc(&rx_buf->ep->util_ep); - return FI_SUCCESS; +release: + rxm_recv_entry_release(recv_entry); + rxm_rx_buf_free(rx_buf); } -static inline int -rxm_cq_tx_comp_write(struct rxm_ep *rxm_ep, uint64_t comp_flags, +static void +rxm_cq_write_tx_comp(struct rxm_ep *rxm_ep, uint64_t comp_flags, void *app_context, uint64_t flags) { if (flags & FI_COMPLETION) { - int ret = ofi_cq_write(rxm_ep->util_ep.tx_cq, app_context, - comp_flags, 0, NULL, 0, 0); - if (OFI_UNLIKELY(ret)) { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to report completion\n"); - return ret; - } - rxm_cq_log_comp(comp_flags); + rxm_cq_write(rxm_ep->util_ep.tx_cq, app_context, + comp_flags, 0, NULL, 0, 0); } - return 0; } -static inline int rxm_finish_rma(struct rxm_ep *rxm_ep, struct rxm_rma_buf *rma_buf, - uint64_t comp_flags) +static void rxm_finish_rma(struct rxm_ep *rxm_ep, struct rxm_rma_buf *rma_buf, + uint64_t comp_flags) { - int ret = rxm_cq_tx_comp_write(rxm_ep, comp_flags, - rma_buf->app_context, rma_buf->flags); - assert(((comp_flags & FI_WRITE) && !(comp_flags & FI_READ)) || ((comp_flags & FI_READ) && !(comp_flags & FI_WRITE))); + rxm_cq_write_tx_comp(rxm_ep, comp_flags, rma_buf->app_context, + rma_buf->flags); + if (comp_flags & FI_WRITE) ofi_ep_wr_cntr_inc(&rxm_ep->util_ep); else ofi_ep_rd_cntr_inc(&rxm_ep->util_ep); - if (!(rma_buf->flags & FI_INJECT) && !rxm_ep->rxm_mr_local && rxm_ep->msg_mr_local) { - rxm_ep_msg_mr_closev(rma_buf->mr.mr, rma_buf->mr.count); + if (!(rma_buf->flags & FI_INJECT) && !rxm_ep->rdm_mr_local && + rxm_ep->msg_mr_local) { + rxm_msg_mr_closev(rma_buf->mr.mr, rma_buf->mr.count); } ofi_buf_free(rma_buf); - return ret; } -static inline int rxm_finish_eager_send(struct rxm_ep *rxm_ep, struct rxm_tx_eager_buf *tx_buf) +void rxm_finish_eager_send(struct rxm_ep *rxm_ep, struct rxm_tx_eager_buf *tx_buf) { - int ret = rxm_cq_tx_comp_write(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op), - tx_buf->app_context, tx_buf->flags); - assert(ofi_tx_cq_flags(tx_buf->pkt.hdr.op) & FI_SEND); - ofi_ep_tx_cntr_inc(&rxm_ep->util_ep); - return ret; + rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op), + tx_buf->app_context, tx_buf->flags); + ofi_ep_tx_cntr_inc(&rxm_ep->util_ep); 
} -static inline int rxm_finish_sar_segment_send(struct rxm_ep *rxm_ep, struct rxm_tx_sar_buf *tx_buf) +static bool rxm_complete_sar(struct rxm_ep *rxm_ep, + struct rxm_tx_sar_buf *tx_buf) { - int ret = FI_SUCCESS; struct rxm_tx_sar_buf *first_tx_buf; + assert(ofi_tx_cq_flags(tx_buf->pkt.hdr.op) & FI_SEND); switch (rxm_sar_get_seg_type(&tx_buf->pkt.ctrl_hdr)) { case RXM_SAR_SEG_FIRST: break; @@ -307,23 +236,35 @@ static inline int rxm_finish_sar_segment_send(struct rxm_ep *rxm_ep, struct rxm_ ofi_buf_free(tx_buf); break; case RXM_SAR_SEG_LAST: - ret = rxm_cq_tx_comp_write(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op), - tx_buf->app_context, tx_buf->flags); - - assert(ofi_tx_cq_flags(tx_buf->pkt.hdr.op) & FI_SEND); - ofi_ep_tx_cntr_inc(&rxm_ep->util_ep); first_tx_buf = ofi_bufpool_get_ibuf(rxm_ep-> buf_pools[RXM_BUF_POOL_TX_SAR].pool, tx_buf->pkt.ctrl_hdr.msg_id); ofi_buf_free(first_tx_buf); ofi_buf_free(tx_buf); - break; + return true; } - return ret; + return false; } -static inline int rxm_finish_send_rndv_ack(struct rxm_rx_buf *rx_buf) +static void rxm_handle_sar_comp(struct rxm_ep *rxm_ep, + struct rxm_tx_sar_buf *tx_buf) +{ + void *app_context; + uint64_t comp_flags, tx_flags; + + app_context = tx_buf->app_context; + comp_flags = ofi_tx_cq_flags(tx_buf->pkt.hdr.op); + tx_flags = tx_buf->flags; + + if (!rxm_complete_sar(rxm_ep, tx_buf)) + return; + + rxm_cq_write_tx_comp(rxm_ep, comp_flags, app_context, tx_flags); + ofi_ep_tx_cntr_inc(&rxm_ep->util_ep); +} + +static void rxm_rndv_rx_finish(struct rxm_rx_buf *rx_buf) { RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_FINISH); @@ -332,75 +273,130 @@ static inline int rxm_finish_send_rndv_ack(struct rxm_rx_buf *rx_buf) rx_buf->recv_entry->rndv.tx_buf = NULL; } - if (!rx_buf->ep->rxm_mr_local) - rxm_ep_msg_mr_closev(rx_buf->mr, rx_buf->recv_entry->rxm_iov.count); + if (!rx_buf->ep->rdm_mr_local) + rxm_msg_mr_closev(rx_buf->mr, + rx_buf->recv_entry->rxm_iov.count); - return rxm_finish_recv(rx_buf, rx_buf->recv_entry->total_len); + rxm_finish_recv(rx_buf, rx_buf->recv_entry->total_len); } -static int rxm_rndv_tx_finish(struct rxm_ep *rxm_ep, struct rxm_tx_rndv_buf *tx_buf) +static void rxm_rndv_tx_finish(struct rxm_ep *rxm_ep, + struct rxm_tx_rndv_buf *tx_buf) { - int ret; + assert(ofi_tx_cq_flags(tx_buf->pkt.hdr.op) & FI_SEND); RXM_UPDATE_STATE(FI_LOG_CQ, tx_buf, RXM_RNDV_FINISH); + if (!rxm_ep->rdm_mr_local) + rxm_msg_mr_closev(tx_buf->mr, tx_buf->count); - if (!rxm_ep->rxm_mr_local) - rxm_ep_msg_mr_closev(tx_buf->mr, tx_buf->count); + rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op), + tx_buf->app_context, tx_buf->flags); - ret = rxm_cq_tx_comp_write(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op), - tx_buf->app_context, tx_buf->flags); - - assert(ofi_tx_cq_flags(tx_buf->pkt.hdr.op) & FI_SEND); + if (rxm_ep->rndv_ops == &rxm_rndv_ops_write && + tx_buf->write_rndv.done_buf) { + ofi_buf_free(tx_buf->write_rndv.done_buf); + tx_buf->write_rndv.done_buf = NULL; + } ofi_ep_tx_cntr_inc(&rxm_ep->util_ep); - ofi_buf_free(tx_buf); - - return ret; } -static int rxm_rndv_handle_ack(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf) +static void rxm_rndv_handle_rd_done(struct rxm_ep *rxm_ep, + struct rxm_rx_buf *rx_buf) { struct rxm_tx_rndv_buf *tx_buf; - tx_buf = ofi_bufpool_get_ibuf(rxm_ep->buf_pools[RXM_BUF_POOL_TX_RNDV].pool, - rx_buf->pkt.ctrl_hdr.msg_id); - FI_DBG(&rxm_prov, FI_LOG_CQ, "Got ACK for msg_id: 0x%" PRIx64 "\n", rx_buf->pkt.ctrl_hdr.msg_id); + tx_buf = ofi_bufpool_get_ibuf(rxm_ep->buf_pools[RXM_BUF_POOL_TX_RNDV_REQ].pool, + 
rx_buf->pkt.ctrl_hdr.msg_id); assert(tx_buf->pkt.ctrl_hdr.msg_id == rx_buf->pkt.ctrl_hdr.msg_id); - rxm_rx_buf_finish(rx_buf); + rxm_rx_buf_free(rx_buf); - if (tx_buf->hdr.state == RXM_RNDV_ACK_WAIT) { - return rxm_rndv_tx_finish(rxm_ep, tx_buf); + if (tx_buf->hdr.state == RXM_RNDV_READ_DONE_WAIT) { + rxm_rndv_tx_finish(rxm_ep, tx_buf); } else { assert(tx_buf->hdr.state == RXM_RNDV_TX); - RXM_UPDATE_STATE(FI_LOG_CQ, tx_buf, RXM_RNDV_ACK_RECVD); - return 0; + RXM_UPDATE_STATE(FI_LOG_CQ, tx_buf, RXM_RNDV_READ_DONE_RECVD); + } +} + +static int rxm_rndv_rx_match(struct dlist_entry *item, const void *arg) +{ + uint64_t msg_id = *((uint64_t *) arg); + struct rxm_rx_buf *rx_buf; + + rx_buf = container_of(item, struct rxm_rx_buf, rndv_wait_entry); + return (msg_id == rx_buf->pkt.ctrl_hdr.msg_id); +} + +static int rxm_rndv_handle_wr_done(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf) +{ + struct dlist_entry *rx_buf_entry; + struct rxm_rx_buf *rndv_rx_buf; + int ret = 0; + + FI_DBG(&rxm_prov, FI_LOG_CQ, "Got DONE for msg_id: 0x%" PRIx64 "\n", + rx_buf->pkt.ctrl_hdr.msg_id); + + rx_buf_entry = dlist_remove_first_match(&rx_buf->ep->rndv_wait_list, + rxm_rndv_rx_match, + &rx_buf->pkt.ctrl_hdr.msg_id); + if (!rx_buf_entry) { + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Failed to find rndv wait entry for msg_id: 0x%" PRIx64 "\n", + rx_buf->pkt.ctrl_hdr.msg_id); + ret = -FI_EINVAL; + goto out; } + rndv_rx_buf = container_of(rx_buf_entry, struct rxm_rx_buf, + rndv_wait_entry); + + if (rndv_rx_buf->hdr.state == RXM_RNDV_WRITE_DONE_WAIT) { + rxm_rndv_rx_finish(rndv_rx_buf); + } else { + assert(rndv_rx_buf->hdr.state == RXM_RNDV_WRITE_DATA_SENT); + RXM_UPDATE_STATE(FI_LOG_CQ, rndv_rx_buf, RXM_RNDV_WRITE_DONE_RECVD); + } +out: + rxm_rx_buf_free(rx_buf); + return ret; } static int rxm_rx_buf_match_msg_id(struct dlist_entry *item, const void *arg) { - uint64_t msg_id = *((uint64_t *)arg); - struct rxm_rx_buf *rx_buf = - container_of(item, struct rxm_rx_buf, unexp_msg.entry); + uint64_t msg_id = *((uint64_t *) arg); + struct rxm_rx_buf *rx_buf; + + rx_buf = container_of(item, struct rxm_rx_buf, unexp_msg.entry); return (msg_id == rx_buf->pkt.ctrl_hdr.msg_id); } -static inline -ssize_t rxm_cq_copy_seg_data(struct rxm_rx_buf *rx_buf, int *done) +static void rxm_process_seg_data(struct rxm_rx_buf *rx_buf, int *done) { - uint64_t done_len = ofi_copy_to_iov(rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, - rx_buf->recv_entry->sar.total_recv_len, - rx_buf->pkt.data, - rx_buf->pkt.ctrl_hdr.seg_size); + enum fi_hmem_iface iface; + uint64_t device; + ssize_t done_len; + + iface = rxm_mr_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.desc, + rx_buf->recv_entry->rxm_iov.count, + &device); + + done_len = ofi_copy_to_hmem_iov(iface, device, + rx_buf->recv_entry->rxm_iov.iov, + rx_buf->recv_entry->rxm_iov.count, + rx_buf->recv_entry->sar.total_recv_len, + rx_buf->pkt.data, + rx_buf->pkt.ctrl_hdr.seg_size); + assert(done_len == rx_buf->pkt.ctrl_hdr.seg_size); + rx_buf->recv_entry->sar.total_recv_len += done_len; if ((rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == RXM_SAR_SEG_LAST) || (done_len != rx_buf->pkt.ctrl_hdr.seg_size)) { + dlist_remove(&rx_buf->recv_entry->sar.entry); /* Mark rxm_recv_entry::msg_id as unknown for futher re-use */ @@ -410,7 +406,7 @@ ssize_t rxm_cq_copy_seg_data(struct rxm_rx_buf *rx_buf, int *done) rx_buf->recv_entry->sar.total_recv_len = 0; *done = 1; - return rxm_finish_recv(rx_buf, done_len); + rxm_finish_recv(rx_buf, done_len); } else { if (rx_buf->recv_entry->sar.msg_id == 
RXM_SAR_RX_INIT) { if (!rx_buf->conn) { @@ -427,195 +423,262 @@ ssize_t rxm_cq_copy_seg_data(struct rxm_rx_buf *rx_buf, int *done) /* The RX buffer can be reposted for further re-use */ rx_buf->recv_entry = NULL; - rxm_rx_buf_finish(rx_buf); + rxm_rx_buf_free(rx_buf); *done = 0; - return FI_SUCCESS; } } -static inline -ssize_t rxm_cq_handle_seg_data(struct rxm_rx_buf *rx_buf) +static void rxm_handle_seg_data(struct rxm_rx_buf *rx_buf) { + struct rxm_recv_entry *recv_entry; + struct rxm_conn *conn; + uint64_t msg_id; + struct dlist_entry *entry; int done; - if (rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV) { - struct rxm_recv_entry *recv_entry = rx_buf->recv_entry; - struct rxm_conn *conn = rx_buf->conn; - uint64_t msg_id = rx_buf->pkt.ctrl_hdr.msg_id; - struct dlist_entry *entry; - ssize_t ret; + rxm_process_seg_data(rx_buf, &done); + if (done || !(rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV)) + return; + + recv_entry = rx_buf->recv_entry; + conn = rx_buf->conn; + msg_id = rx_buf->pkt.ctrl_hdr.msg_id; - ret = rxm_cq_copy_seg_data(rx_buf, &done); + dlist_foreach_container_safe(&conn->sar_deferred_rx_msg_list, + struct rxm_rx_buf, rx_buf, + unexp_msg.entry, entry) { + if (!rxm_rx_buf_match_msg_id(&rx_buf->unexp_msg.entry, &msg_id)) + continue; + + dlist_remove(&rx_buf->unexp_msg.entry); + rx_buf->recv_entry = recv_entry; + rxm_process_seg_data(rx_buf, &done); if (done) + break; + } +} + +static ssize_t rxm_rndv_xfer(struct rxm_ep *rxm_ep, struct fid_ep *msg_ep, + struct rxm_rndv_hdr *remote_hdr, struct iovec *local_iov, + void **local_desc, size_t local_count, size_t total_len, + void *context) +{ + size_t i, index = 0, offset = 0, count, copy_len; + struct iovec iov[RXM_IOV_LIMIT]; + void *desc[RXM_IOV_LIMIT]; + ssize_t ret = FI_SUCCESS; + + for (i = 0; i < remote_hdr->count && total_len > 0; i++) { + copy_len = MIN(remote_hdr->iov[i].len, total_len); + + ret = ofi_copy_iov_desc(&iov[0], &desc[0], &count, + &local_iov[0], + &local_desc[0], + local_count, + &index, &offset, copy_len); + if (ret) return ret; + total_len -= copy_len; + ret = rxm_ep->rndv_ops->xfer(msg_ep, iov, desc, count, 0, + remote_hdr->iov[i].addr, remote_hdr->iov[i].key, + context); + + if (ret) { + if (ret == -FI_EAGAIN) { + struct rxm_deferred_tx_entry *def_tx_entry; - dlist_foreach_container_safe(&conn->sar_deferred_rx_msg_list, - struct rxm_rx_buf, rx_buf, - unexp_msg.entry, entry) { - if (!rxm_rx_buf_match_msg_id(&rx_buf->unexp_msg.entry, &msg_id)) + ret = rxm_ep->rndv_ops->defer_xfer( + &def_tx_entry, i, iov, desc, count, + context); + + if (ret) + break; + rxm_ep_enqueue_deferred_tx_queue(def_tx_entry); continue; - dlist_remove(&rx_buf->unexp_msg.entry); - rx_buf->recv_entry = recv_entry; - ret = rxm_cq_copy_seg_data(rx_buf, &done); - if (done) - break; + } + break; } - return ret; - } else { - return rxm_cq_copy_seg_data(rx_buf, &done); } + assert(!total_len); + return ret; } -static inline ssize_t -rxm_cq_rndv_read_prepare_deferred(struct rxm_deferred_tx_entry **def_tx_entry, size_t index, - struct iovec *iov, void *desc[RXM_IOV_LIMIT], - size_t count, struct rxm_rx_buf *rx_buf) +ssize_t rxm_rndv_read(struct rxm_rx_buf *rx_buf) { - uint8_t i; + ssize_t ret; + size_t total_len = + MIN(rx_buf->recv_entry->total_len, rx_buf->pkt.hdr.size); - *def_tx_entry = rxm_ep_alloc_deferred_tx_entry(rx_buf->ep, rx_buf->conn, - RXM_DEFERRED_TX_RNDV_READ); - if (OFI_UNLIKELY(!*def_tx_entry)) - return -FI_ENOMEM; + RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_READ); - (*def_tx_entry)->rndv_read.rx_buf = rx_buf; - 
(*def_tx_entry)->rndv_read.rma_iov.addr = - rx_buf->rndv_hdr->iov[index].addr; - (*def_tx_entry)->rndv_read.rma_iov.key = - rx_buf->rndv_hdr->iov[index].key; - for (i = 0; i < count; i++) { - (*def_tx_entry)->rndv_read.rxm_iov.iov[i] = iov[i]; - (*def_tx_entry)->rndv_read.rxm_iov.desc[i] = desc[i]; - } - (*def_tx_entry)->rndv_read.rxm_iov.count = count; + ret = rxm_rndv_xfer(rx_buf->ep, rx_buf->conn->msg_ep, rx_buf->remote_rndv_hdr, + rx_buf->recv_entry->rxm_iov.iov, + rx_buf->recv_entry->rxm_iov.desc, + rx_buf->recv_entry->rxm_iov.count, total_len, + rx_buf); - return 0; + if (ret) + rxm_cq_write_error(rx_buf->ep->util_ep.rx_cq, + rx_buf->ep->util_ep.rx_cntr, + rx_buf, ret); + return ret; } -static inline -ssize_t rxm_cq_handle_rndv(struct rxm_rx_buf *rx_buf) +static ssize_t rxm_rndv_handle_wr_data(struct rxm_rx_buf *rx_buf) { - size_t i, index = 0, offset = 0, count, total_recv_len; - struct iovec iov[RXM_IOV_LIMIT]; - void *desc[RXM_IOV_LIMIT]; - struct rxm_rx_buf *new_rx_buf; - int ret = 0; + int i; + ssize_t ret; + struct rxm_tx_rndv_buf *tx_buf; + size_t total_len, rma_len = 0; + struct rxm_rndv_hdr *rx_hdr = (struct rxm_rndv_hdr *) rx_buf->pkt.data; + + tx_buf = ofi_bufpool_get_ibuf( + rx_buf->ep->buf_pools[RXM_BUF_POOL_TX_RNDV_REQ].pool, + rx_buf->pkt.ctrl_hdr.msg_id); + total_len = tx_buf->pkt.hdr.size; + + tx_buf->write_rndv.remote_hdr.count = rx_hdr->count; + memcpy(tx_buf->write_rndv.remote_hdr.iov, rx_hdr->iov, + rx_hdr->count * sizeof(rx_hdr->iov[0])); + // calculate the number of RMA writes required to complete the transfer. + // there may be fewer than iov count RMA writes required, + // depending on differences between remote and local IOV sizes. + for (i = 0; i < tx_buf->write_rndv.remote_hdr.count; i++) { + if (total_len > rma_len) { + tx_buf->write_rndv.rndv_rma_count++; + rma_len += tx_buf->write_rndv.remote_hdr.iov[i].len; + } + } + + RXM_UPDATE_STATE(FI_LOG_CQ, tx_buf, RXM_RNDV_WRITE); + + ret = rxm_rndv_xfer(rx_buf->ep, tx_buf->write_rndv.conn->msg_ep, rx_hdr, + tx_buf->write_rndv.iov, tx_buf->write_rndv.desc, + tx_buf->count, total_len, tx_buf); - rx_buf->repost = 0; + if (ret) + rxm_cq_write_error(rx_buf->ep->util_ep.rx_cq, + rx_buf->ep->util_ep.rx_cntr, + tx_buf, ret); + rxm_rx_buf_free(rx_buf); + return ret; +} + +static ssize_t rxm_handle_rndv(struct rxm_rx_buf *rx_buf) +{ + int ret = 0, i; + size_t total_recv_len; /* En-queue new rx buf to be posted ASAP so that we don't block any - * incoming messages. RNDV processing can take a while.
+ */ + rxm_repost_new_rx(rx_buf); if (!rx_buf->conn) { assert(rx_buf->ep->srx_ctx); rx_buf->conn = rxm_key2conn(rx_buf->ep, rx_buf->pkt.ctrl_hdr.conn_id); - if (OFI_UNLIKELY(!rx_buf->conn)) + if (!rx_buf->conn) return -FI_EOTHER; } assert(rx_buf->conn); FI_DBG(&rxm_prov, FI_LOG_CQ, - "Got incoming recv with msg_id: 0x%" PRIx64 "\n", + "Got incoming rndv req with msg_id: 0x%" PRIx64 "\n", rx_buf->pkt.ctrl_hdr.msg_id); - rx_buf->rndv_hdr = (struct rxm_rndv_hdr *)rx_buf->pkt.data; + rx_buf->remote_rndv_hdr = (struct rxm_rndv_hdr *) rx_buf->pkt.data; rx_buf->rndv_rma_index = 0; - if (!rx_buf->ep->rxm_mr_local) { + if (!rx_buf->ep->rdm_mr_local) { total_recv_len = MIN(rx_buf->recv_entry->total_len, rx_buf->pkt.hdr.size); - ret = rxm_ep_msg_mr_regv_lim(rx_buf->ep, - rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, - total_recv_len, - FI_READ, rx_buf->mr); - if (OFI_UNLIKELY(ret)) + ret = rxm_msg_mr_regv(rx_buf->ep, rx_buf->recv_entry->rxm_iov.iov, + rx_buf->recv_entry->rxm_iov.count, + total_recv_len, + rx_buf->ep->rndv_ops->rx_mr_access, + rx_buf->mr); + if (ret) return ret; for (i = 0; (i < rx_buf->recv_entry->rxm_iov.count && - rx_buf->mr[i]); i++) + rx_buf->mr[i]); i++) { rx_buf->recv_entry->rxm_iov.desc[i] = fi_mr_desc(rx_buf->mr[i]); + } } else { - for (i = 0; i < rx_buf->recv_entry->rxm_iov.count; i++) + struct rxm_mr *mr; + + for (i = 0; i < rx_buf->recv_entry->rxm_iov.count; i++) { + mr = rx_buf->recv_entry->rxm_iov.desc[i]; rx_buf->recv_entry->rxm_iov.desc[i] = - fi_mr_desc(rx_buf->recv_entry->rxm_iov.desc[i]); - total_recv_len = MIN(rx_buf->recv_entry->total_len, - rx_buf->pkt.hdr.size); + fi_mr_desc(mr->msg_mr); + } } - assert(rx_buf->rndv_hdr->count && - (rx_buf->rndv_hdr->count <= RXM_IOV_LIMIT)); + assert(rx_buf->remote_rndv_hdr->count && + (rx_buf->remote_rndv_hdr->count <= RXM_IOV_LIMIT)); - RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_READ); + return rx_buf->ep->rndv_ops->handle_rx(rx_buf); +} - for (i = 0; i < rx_buf->rndv_hdr->count; i++) { - size_t copy_len = MIN(rx_buf->rndv_hdr->iov[i].len, - total_recv_len); +void rxm_handle_eager(struct rxm_rx_buf *rx_buf) +{ + enum fi_hmem_iface iface; + uint64_t device; + ssize_t done_len; - ret = ofi_copy_iov_desc(&iov[0], &desc[0], &count, - &rx_buf->recv_entry->rxm_iov.iov[0], - &rx_buf->recv_entry->rxm_iov.desc[0], - rx_buf->recv_entry->rxm_iov.count, - &index, &offset, copy_len); - if (ret) { - assert(ret == -FI_ETOOSMALL); - return rxm_cq_write_error_trunc( - rx_buf, rx_buf->recv_entry->total_len); - } - total_recv_len -= copy_len; - ret = fi_readv(rx_buf->conn->msg_ep, iov, desc, count, 0, - rx_buf->rndv_hdr->iov[i].addr, - rx_buf->rndv_hdr->iov[i].key, rx_buf); - if (OFI_UNLIKELY(ret)) { - if (OFI_LIKELY(ret == -FI_EAGAIN)) { - struct rxm_deferred_tx_entry *def_tx_entry; + iface = rxm_mr_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.desc, + rx_buf->recv_entry->rxm_iov.count, + &device); - ret = rxm_cq_rndv_read_prepare_deferred( - &def_tx_entry, i, iov, desc, - count, rx_buf); - if (ret) - goto readv_err; - rxm_ep_enqueue_deferred_tx_queue(def_tx_entry); - continue; - } -readv_err: - rxm_cq_write_error(rx_buf->ep->util_ep.rx_cq, - rx_buf->ep->util_ep.rx_cntr, - rx_buf->recv_entry->context, ret); - break; - } - } - assert(!total_recv_len); - return ret; + done_len = ofi_copy_to_hmem_iov(iface, device, + rx_buf->recv_entry->rxm_iov.iov, + rx_buf->recv_entry->rxm_iov.count, 0, + rx_buf->pkt.data, rx_buf->pkt.hdr.size); + assert(done_len == rx_buf->pkt.hdr.size); + + rxm_finish_recv(rx_buf, done_len); } 
-static inline -ssize_t rxm_cq_handle_eager(struct rxm_rx_buf *rx_buf) +void rxm_handle_coll_eager(struct rxm_rx_buf *rx_buf) { - uint64_t done_len = ofi_copy_to_iov(rx_buf->recv_entry->rxm_iov.iov, - rx_buf->recv_entry->rxm_iov.count, - 0, rx_buf->pkt.data, - rx_buf->pkt.hdr.size); - return rxm_finish_recv(rx_buf, done_len); + enum fi_hmem_iface iface; + uint64_t device; + ssize_t done_len; + + iface = rxm_mr_desc_to_hmem_iface_dev(rx_buf->recv_entry->rxm_iov.desc, + rx_buf->recv_entry->rxm_iov.count, + &device); + + done_len = ofi_copy_to_hmem_iov(iface, device, + rx_buf->recv_entry->rxm_iov.iov, + rx_buf->recv_entry->rxm_iov.count, 0, + rx_buf->pkt.data, rx_buf->pkt.hdr.size); + assert(done_len == rx_buf->pkt.hdr.size); + + if (rx_buf->pkt.hdr.tag & OFI_COLL_TAG_FLAG) { + ofi_coll_handle_xfer_comp(rx_buf->pkt.hdr.tag, + rx_buf->recv_entry->context); + rxm_rx_buf_free(rx_buf); + rxm_recv_entry_release(rx_buf->recv_entry); + } else { + rxm_finish_recv(rx_buf, done_len); + } } -ssize_t rxm_cq_handle_rx_buf(struct rxm_rx_buf *rx_buf) +ssize_t rxm_handle_rx_buf(struct rxm_rx_buf *rx_buf) { switch (rx_buf->pkt.ctrl_hdr.type) { case rxm_ctrl_eager: - return rxm_cq_handle_eager(rx_buf); - case rxm_ctrl_rndv: - return rxm_cq_handle_rndv(rx_buf); + rx_buf->ep->eager_ops->handle_rx(rx_buf); + return 0; + case rxm_ctrl_rndv_req: + return rxm_handle_rndv(rx_buf); case rxm_ctrl_seg: - return rxm_cq_handle_seg_data(rx_buf); + rxm_handle_seg_data(rx_buf); + return 0; default: FI_WARN(&rxm_prov, FI_LOG_CQ, "Unknown message type\n"); assert(0); @@ -623,50 +686,85 @@ ssize_t rxm_cq_handle_rx_buf(struct rxm_rx_buf *rx_buf) } } -static inline ssize_t -rxm_cq_match_rx_buf(struct rxm_rx_buf *rx_buf, - struct rxm_recv_queue *recv_queue, - struct rxm_recv_match_attr *match_attr) +static void rxm_adjust_multi_recv(struct rxm_rx_buf *rx_buf) +{ + struct rxm_recv_entry *recv_entry; + struct iovec new_iov; + size_t recv_size; + + recv_size = rx_buf->pkt.hdr.size; + + if (rx_buf->recv_entry->rxm_iov.iov[0].iov_len < recv_size || + rx_buf->recv_entry->rxm_iov.iov[0].iov_len - recv_size < + rx_buf->ep->min_multi_recv_size) + return; + + new_iov.iov_base = (uint8_t *) + rx_buf->recv_entry->rxm_iov.iov[0].iov_base + recv_size; + new_iov.iov_len = rx_buf->recv_entry->rxm_iov.iov[0].iov_len - recv_size;; + + rx_buf->recv_entry->rxm_iov.iov[0].iov_len = recv_size; + + recv_entry = rxm_multi_recv_entry_get(rx_buf->ep, &new_iov, + rx_buf->recv_entry->rxm_iov.desc, 1, + rx_buf->recv_entry->addr, + rx_buf->recv_entry->tag, + rx_buf->recv_entry->ignore, + rx_buf->recv_entry->context, + rx_buf->recv_entry->flags); + + rx_buf->recv_entry->flags &= ~FI_MULTI_RECV; + + dlist_insert_head(&recv_entry->entry, &rx_buf->ep->recv_queue.recv_list); +} + +static ssize_t +rxm_match_rx_buf(struct rxm_rx_buf *rx_buf, + struct rxm_recv_queue *recv_queue, + struct rxm_recv_match_attr *match_attr) { struct dlist_entry *entry; - struct rxm_ep *rxm_ep; - struct fid_ep *msg_ep; + + /* Dynamic receive buffers may have already matched */ + if (rx_buf->recv_entry) { + if (rx_buf->pkt.ctrl_hdr.type == rxm_ctrl_rndv_req) + return rxm_handle_rndv(rx_buf); + + rxm_finish_recv(rx_buf, rx_buf->pkt.hdr.size); + return 0; + } + + if (recv_queue->dyn_rbuf_unexp_cnt) + recv_queue->dyn_rbuf_unexp_cnt--; entry = dlist_remove_first_match(&recv_queue->recv_list, recv_queue->match_recv, match_attr); - if (!entry) { - RXM_DBG_ADDR_TAG(FI_LOG_CQ, "No matching recv found for " - "incoming msg", match_attr->addr, - match_attr->tag); - FI_DBG(&rxm_prov, FI_LOG_CQ, 
"Enqueueing msg to unexpected msg" - "queue\n"); - rx_buf->unexp_msg.addr = match_attr->addr; - rx_buf->unexp_msg.tag = match_attr->tag; - rx_buf->repost = 0; + if (entry) { + rx_buf->recv_entry = container_of(entry, struct rxm_recv_entry, entry); - dlist_insert_tail(&rx_buf->unexp_msg.entry, - &recv_queue->unexp_msg_list); + if (rx_buf->recv_entry->flags & FI_MULTI_RECV) + rxm_adjust_multi_recv(rx_buf); - msg_ep = rx_buf->msg_ep; - rxm_ep = rx_buf->ep; + return rxm_handle_rx_buf(rx_buf); + } - rx_buf = rxm_rx_buf_alloc(rxm_ep, msg_ep, 1); - if (OFI_UNLIKELY(!rx_buf)) { - FI_WARN(&rxm_prov, FI_LOG_EP_DATA, - "ran out of buffers from RX buffer pool\n"); - return -FI_ENOMEM; - } + RXM_DBG_ADDR_TAG(FI_LOG_CQ, "No matching recv found for incoming msg", + match_attr->addr, match_attr->tag); + FI_DBG(&rxm_prov, FI_LOG_CQ, "Enqueueing msg to unexpected msg queue\n"); + rx_buf->unexp_msg.addr = match_attr->addr; + rx_buf->unexp_msg.tag = match_attr->tag; - dlist_insert_tail(&rx_buf->repost_entry, - &rxm_ep->repost_ready_list); - return 0; - } + dlist_insert_tail(&rx_buf->unexp_msg.entry, + &recv_queue->unexp_msg_list); - rx_buf->recv_entry = container_of(entry, struct rxm_recv_entry, entry); - return rxm_cq_handle_rx_buf(rx_buf); + /* post a new buffer since we don't know when the unexpected buffer + * will be consumed + */ + rxm_repost_new_rx(rx_buf); + return 0; } -static inline ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf) +static ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf) { struct rxm_recv_match_attr match_attr = { .addr = FI_ADDR_UNSPEC, @@ -674,26 +772,28 @@ static inline ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf) if (rx_buf->ep->rxm_info->caps & (FI_SOURCE | FI_DIRECTED_RECV)) { if (rx_buf->ep->srx_ctx) - rx_buf->conn = - rxm_key2conn(rx_buf->ep, rx_buf->pkt.ctrl_hdr.conn_id); - if (OFI_UNLIKELY(!rx_buf->conn)) + rx_buf->conn = rxm_key2conn(rx_buf->ep, rx_buf-> + pkt.ctrl_hdr.conn_id); + if (!rx_buf->conn) return -FI_EOTHER; match_attr.addr = rx_buf->conn->handle.fi_addr; } - if (rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV) - return rxm_finish_buf_recv(rx_buf); + if (rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV) { + rxm_finish_buf_recv(rx_buf); + return 0; + } switch(rx_buf->pkt.hdr.op) { case ofi_op_msg: FI_DBG(&rxm_prov, FI_LOG_CQ, "Got MSG op\n"); - return rxm_cq_match_rx_buf(rx_buf, &rx_buf->ep->recv_queue, - &match_attr); + return rxm_match_rx_buf(rx_buf, &rx_buf->ep->recv_queue, + &match_attr); case ofi_op_tagged: FI_DBG(&rxm_prov, FI_LOG_CQ, "Got TAGGED op\n"); match_attr.tag = rx_buf->pkt.hdr.tag; - return rxm_cq_match_rx_buf(rx_buf, &rx_buf->ep->trecv_queue, - &match_attr); + return rxm_match_rx_buf(rx_buf, &rx_buf->ep->trecv_queue, + &match_attr); default: FI_WARN(&rxm_prov, FI_LOG_CQ, "Unknown op!\n"); assert(0); @@ -703,151 +803,217 @@ static inline ssize_t rxm_handle_recv_comp(struct rxm_rx_buf *rx_buf) static int rxm_sar_match_msg_id(struct dlist_entry *item, const void *arg) { - uint64_t msg_id = *((uint64_t *)arg); - struct rxm_recv_entry *recv_entry = - container_of(item, struct rxm_recv_entry, sar.entry); + uint64_t msg_id = *((uint64_t *) arg); + struct rxm_recv_entry *recv_entry; + + recv_entry = container_of(item, struct rxm_recv_entry, sar.entry); return (msg_id == recv_entry->sar.msg_id); } -static inline -ssize_t rxm_sar_handle_segment(struct rxm_rx_buf *rx_buf) +static ssize_t rxm_sar_handle_segment(struct rxm_rx_buf *rx_buf) { struct dlist_entry *sar_entry; rx_buf->conn = rxm_key2conn(rx_buf->ep, rx_buf->pkt.ctrl_hdr.conn_id); 
- if (OFI_UNLIKELY(!rx_buf->conn)) + if (!rx_buf->conn) return -FI_EOTHER; + FI_DBG(&rxm_prov, FI_LOG_CQ, - "Got incoming recv with msg_id: 0x%" PRIx64 "for conn - %p\n", + "Got incoming recv with msg_id: 0x%" PRIx64 " for conn - %p\n", rx_buf->pkt.ctrl_hdr.msg_id, rx_buf->conn); sar_entry = dlist_find_first_match(&rx_buf->conn->sar_rx_msg_list, rxm_sar_match_msg_id, &rx_buf->pkt.ctrl_hdr.msg_id); if (!sar_entry) return rxm_handle_recv_comp(rx_buf); - rx_buf->recv_entry = - container_of(sar_entry, struct rxm_recv_entry, sar.entry); - return rxm_cq_handle_seg_data(rx_buf); + + rx_buf->recv_entry = container_of(sar_entry, struct rxm_recv_entry, + sar.entry); + rxm_handle_seg_data(rx_buf); + return 0; } -static ssize_t rxm_rndv_send_ack_inject(struct rxm_rx_buf *rx_buf) +static void rxm_rndv_send_rd_done(struct rxm_rx_buf *rx_buf) { - struct rxm_pkt pkt; - struct iovec iov = { - .iov_base = &pkt, - .iov_len = sizeof(pkt), - }; - struct fi_msg msg = { - .msg_iov = &iov, - .iov_count = 1, - .context = rx_buf, - }; + struct rxm_deferred_tx_entry *def_entry; + struct rxm_tx_base_buf *buf; + ssize_t ret; assert(rx_buf->conn); + assert(rx_buf->hdr.state == RXM_RNDV_READ); + buf = rxm_tx_buf_alloc(rx_buf->ep, RXM_BUF_POOL_TX_RNDV_RD_DONE); + if (!buf) { + ret = -FI_ENOMEM; + goto err; + } + + rx_buf->recv_entry->rndv.tx_buf = buf; + assert(buf->pkt.ctrl_hdr.type == rxm_ctrl_rndv_rd_done); + + buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->handle.remote_key; + buf->pkt.ctrl_hdr.msg_id = rx_buf->pkt.ctrl_hdr.msg_id; - pkt.hdr.op = ofi_op_msg; - pkt.hdr.version = OFI_OP_VERSION; - pkt.ctrl_hdr.version = RXM_CTRL_VERSION; - pkt.ctrl_hdr.type = rxm_ctrl_rndv_ack; - pkt.ctrl_hdr.conn_id = rx_buf->conn->handle.remote_key; - pkt.ctrl_hdr.msg_id = rx_buf->pkt.ctrl_hdr.msg_id; + ret = fi_send(rx_buf->conn->msg_ep, &buf->pkt, sizeof(buf->pkt), + buf->hdr.desc, 0, rx_buf); + if (ret) { + if (ret == -FI_EAGAIN) { + def_entry = rxm_ep_alloc_deferred_tx_entry(rx_buf->ep, + rx_buf->conn, + RXM_DEFERRED_TX_RNDV_ACK); + if (def_entry) { + def_entry->rndv_ack.rx_buf = rx_buf; + def_entry->rndv_ack.pkt_size = sizeof(rx_buf->pkt); + rxm_ep_enqueue_deferred_tx_queue(def_entry); + return; + } + } + goto free; + } + + RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_READ_DONE_SENT); + return; - return fi_sendmsg(rx_buf->conn->msg_ep, &msg, FI_INJECT); +free: + ofi_buf_free(buf); + rx_buf->recv_entry->rndv.tx_buf = NULL; +err: + FI_WARN(&rxm_prov, FI_LOG_CQ, + "unable to allocate/send rd rndv ack: %s\n", + fi_strerror((int) ret)); + assert(0); + /* TODO: Allocate all resources needed on receiving + * original message receive request, to avoid allocation failures. + * On other failures, we need to fail the receive. 
+ */ } -static ssize_t rxm_rndv_send_ack(struct rxm_rx_buf *rx_buf) +static void +rxm_rndv_send_wr_done(struct rxm_ep *rxm_ep, struct rxm_tx_rndv_buf *tx_buf) { + struct rxm_deferred_tx_entry *def_entry; + struct rxm_tx_base_buf *buf; ssize_t ret; - assert(rx_buf->conn); + assert(tx_buf->hdr.state == RXM_RNDV_WRITE); + buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_RNDV_WR_DONE); + if (!buf) { + ret = -FI_ENOMEM; + goto err; + } - if (sizeof(rx_buf->pkt) <= rx_buf->ep->inject_limit) { - ret = rxm_rndv_send_ack_inject(rx_buf); - if (!ret) - goto out; + tx_buf->write_rndv.done_buf = buf; + assert(buf->pkt.ctrl_hdr.type == rxm_ctrl_rndv_wr_done); - if (OFI_UNLIKELY(ret != -FI_EAGAIN)) { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "send ack via inject failed for MSG provider\n"); - return ret; + buf->pkt.ctrl_hdr.conn_id = tx_buf->pkt.ctrl_hdr.conn_id; + buf->pkt.ctrl_hdr.msg_id = tx_buf->pkt.ctrl_hdr.msg_id; + + ret = fi_send(tx_buf->write_rndv.conn->msg_ep, &buf->pkt, + sizeof(buf->pkt), buf->hdr.desc, 0, tx_buf); + if (ret) { + if (ret == -FI_EAGAIN) { + def_entry = rxm_ep_alloc_deferred_tx_entry(rxm_ep, + tx_buf->write_rndv.conn, + RXM_DEFERRED_TX_RNDV_DONE); + if (def_entry) { + def_entry->rndv_done.tx_buf = tx_buf; + rxm_ep_enqueue_deferred_tx_queue(def_entry); + return; + } } + goto free; } - rx_buf->recv_entry->rndv.tx_buf = (struct rxm_tx_base_buf *) - rxm_tx_buf_alloc(rx_buf->ep, RXM_BUF_POOL_TX_ACK); - if (OFI_UNLIKELY(!rx_buf->recv_entry->rndv.tx_buf)) { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "ran out of buffers from ACK buffer pool\n"); - return -FI_EAGAIN; + RXM_UPDATE_STATE(FI_LOG_CQ, tx_buf, RXM_RNDV_WRITE_DONE_SENT); + return; + +free: + ofi_buf_free(buf); + tx_buf->write_rndv.done_buf = NULL; +err: + FI_WARN(&rxm_prov, FI_LOG_CQ, + "unable to allocate/send wr rndv ack: %s\n", + fi_strerror((int) ret)); + assert(0); + /* TODO: Allocate all resources needed prior to initiating the + * original message send request, to avoid allocation failures. + * On other failures, we need to fail the original message. 
+ */ +} + +ssize_t rxm_rndv_send_wr_data(struct rxm_rx_buf *rx_buf) +{ + struct rxm_deferred_tx_entry *def_entry; + struct rxm_tx_base_buf *buf; + ssize_t ret; + + assert(rx_buf->conn); + + buf = rxm_tx_buf_alloc(rx_buf->ep, RXM_BUF_POOL_TX_RNDV_WR_DATA); + if (!buf) { + ret = -FI_ENOMEM; + goto err; } - assert(rx_buf->recv_entry->rndv.tx_buf->pkt.ctrl_hdr.type == rxm_ctrl_rndv_ack); - assert(rx_buf->hdr.state == RXM_RNDV_READ); + assert(buf->pkt.ctrl_hdr.type == rxm_ctrl_rndv_wr_data); + rx_buf->recv_entry->rndv.tx_buf = buf; - rx_buf->recv_entry->rndv.tx_buf->pkt.ctrl_hdr.conn_id = - rx_buf->conn->handle.remote_key; - rx_buf->recv_entry->rndv.tx_buf->pkt.ctrl_hdr.msg_id = - rx_buf->pkt.ctrl_hdr.msg_id; - - ret = fi_send(rx_buf->conn->msg_ep, &rx_buf->recv_entry->rndv.tx_buf->pkt, - sizeof(rx_buf->recv_entry->rndv.tx_buf->pkt), - rx_buf->recv_entry->rndv.tx_buf->hdr.desc, 0, rx_buf); - if (OFI_UNLIKELY(ret)) { - if (OFI_LIKELY(ret == -FI_EAGAIN)) { - struct rxm_deferred_tx_entry *def_tx_entry = - rxm_ep_alloc_deferred_tx_entry( - rx_buf->ep, rx_buf->conn, - RXM_DEFERRED_TX_RNDV_ACK); - if (OFI_UNLIKELY(!def_tx_entry)) { - FI_WARN(&rxm_prov, FI_LOG_CQ, "unable to " - "allocate TX entry for deferred ACK\n"); - ret = -FI_EAGAIN; - goto err; - } + buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->handle.remote_key; + buf->pkt.ctrl_hdr.msg_id = rx_buf->pkt.ctrl_hdr.msg_id; + rxm_rndv_hdr_init(rx_buf->ep, buf->pkt.data, + rx_buf->recv_entry->rxm_iov.iov, + rx_buf->recv_entry->rxm_iov.count, rx_buf->mr); - def_tx_entry->rndv_ack.rx_buf = rx_buf; - rxm_ep_enqueue_deferred_tx_queue(def_tx_entry); - return 0; - } else { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "unable to send ACK: %zd\n", ret); + ret = fi_send(rx_buf->conn->msg_ep, &buf->pkt, sizeof(buf->pkt) + + sizeof(struct rxm_rndv_hdr), buf->hdr.desc, 0, rx_buf); + if (ret) { + if (ret == -FI_EAGAIN) { + def_entry = rxm_ep_alloc_deferred_tx_entry(rx_buf->ep, + rx_buf->conn, + RXM_DEFERRED_TX_RNDV_ACK); + if (def_entry) { + def_entry->rndv_ack.rx_buf = rx_buf; + def_entry->rndv_ack.pkt_size = + sizeof(buf->pkt) + + sizeof(struct rxm_rndv_hdr); + rxm_ep_enqueue_deferred_tx_queue(def_entry); + return 0; + } } - goto err; + goto free; } -out: - RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_ACK_SENT); + RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_WRITE_DATA_SENT); return 0; + +free: + ofi_buf_free(buf); + rx_buf->recv_entry->rndv.tx_buf = NULL; err: - ofi_buf_free(rx_buf->recv_entry->rndv.tx_buf); - return ret; + FI_WARN(&rxm_prov, FI_LOG_CQ, + "unable to allocate/send wr rndv ready: %s\n", + fi_strerror((int) ret)); + assert(0); + /* TODO: Sender will be blocked forever waiting for a response + * that will not come. Need to tear down communication. 
+ */ + return 0; } - - -static int rxm_handle_remote_write(struct rxm_ep *rxm_ep, +static void rxm_handle_remote_write(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp) { - int ret; - - FI_DBG(&rxm_prov, FI_LOG_CQ, "writing remote write completion\n"); - ret = ofi_cq_write(rxm_ep->util_ep.rx_cq, NULL, comp->flags, 0, NULL, - comp->data, 0); - if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "Unable to write remote write completion\n"); - return ret; - } + rxm_cq_write(rxm_ep->util_ep.rx_cq, NULL, comp->flags, 0, NULL, + comp->data, 0); ofi_ep_rem_wr_cntr_inc(&rxm_ep->util_ep); if (comp->op_context) - rxm_rx_buf_finish(comp->op_context); - return 0; + rxm_rx_buf_free(comp->op_context); } -static inline void rxm_ep_format_atomic_resp_pkt_hdr(struct rxm_conn *rxm_conn, - struct rxm_tx_atomic_buf *tx_buf, - size_t data_len, uint32_t pkt_op, - enum fi_datatype datatype, uint8_t atomic_op) +static void rxm_format_atomic_resp_pkt_hdr(struct rxm_conn *rxm_conn, + struct rxm_tx_atomic_buf *tx_buf, + size_t data_len, uint32_t pkt_op, + enum fi_datatype datatype, + uint8_t atomic_op) { rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, pkt_op, 0, 0, 0, &tx_buf->pkt); @@ -866,16 +1032,16 @@ static ssize_t rxm_atomic_send_resp(struct rxm_ep *rxm_ep, struct rxm_deferred_tx_entry *def_tx_entry; struct rxm_atomic_resp_hdr *atomic_hdr; ssize_t ret; - ssize_t resp_len = result_len + sizeof(struct rxm_atomic_resp_hdr) + - sizeof(struct rxm_pkt); + ssize_t resp_len; + + resp_len = result_len + sizeof(struct rxm_atomic_resp_hdr) + + sizeof(struct rxm_pkt); resp_buf->hdr.state = RXM_ATOMIC_RESP_SENT; - rxm_ep_format_atomic_resp_pkt_hdr(rx_buf->conn, - resp_buf, - resp_len, - rx_buf->pkt.hdr.op, - rx_buf->pkt.hdr.atomic.datatype, - rx_buf->pkt.hdr.atomic.op); + rxm_format_atomic_resp_pkt_hdr(rx_buf->conn, resp_buf, resp_len, + rx_buf->pkt.hdr.op, + rx_buf->pkt.hdr.atomic.datatype, + rx_buf->pkt.hdr.atomic.op); resp_buf->pkt.ctrl_hdr.conn_id = rx_buf->conn->handle.remote_key; resp_buf->pkt.ctrl_hdr.msg_id = rx_buf->pkt.ctrl_hdr.msg_id; atomic_hdr = (struct rxm_atomic_resp_hdr *) resp_buf->pkt.data; @@ -885,21 +1051,20 @@ static ssize_t rxm_atomic_send_resp(struct rxm_ep *rxm_ep, if (resp_len < rxm_ep->inject_limit) { ret = fi_inject(rx_buf->conn->msg_ep, &resp_buf->pkt, resp_len, 0); - if (OFI_LIKELY(!ret)) + if (!ret) ofi_buf_free(resp_buf); } else { ret = rxm_atomic_send_respmsg(rxm_ep, rx_buf->conn, resp_buf, resp_len); } - if (OFI_UNLIKELY(ret)) { + if (ret) { FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to send Atomic Response\n"); - if (OFI_LIKELY(ret == -FI_EAGAIN)) { - def_tx_entry = - rxm_ep_alloc_deferred_tx_entry(rxm_ep, + if (ret == -FI_EAGAIN) { + def_tx_entry = rxm_ep_alloc_deferred_tx_entry(rxm_ep, rx_buf->conn, RXM_DEFERRED_TX_ATOMIC_RESP); - if (OFI_UNLIKELY(!def_tx_entry)) { + if (!def_tx_entry) { FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to allocate deferred Atomic " "Response\n"); @@ -912,26 +1077,29 @@ static ssize_t rxm_atomic_send_resp(struct rxm_ep *rxm_ep, ret = 0; } } - rxm_rx_buf_finish(rx_buf); + rxm_rx_buf_free(rx_buf); return ret; } -static inline void rxm_do_atomic(struct rxm_pkt *pkt, void *dst, void *src, - void *cmp, void *res, size_t count, - enum fi_datatype datatype, enum fi_op op) +static void rxm_do_atomic(uint8_t op, void *dst, void *src, void *cmp, + void *res, size_t count, enum fi_datatype datatype, + enum fi_op amo_op) { - switch (pkt->hdr.op) { + switch (op) { case ofi_op_atomic: - ofi_atomic_write_handlers[op][datatype](dst, src, count); + assert(ofi_atomic_iswrite_op(amo_op)); + 
ofi_atomic_write_handler(amo_op, datatype, dst, src, count); break; case ofi_op_atomic_fetch: - ofi_atomic_readwrite_handlers[op][datatype](dst, src, res, - count); + assert(ofi_atomic_isreadwrite_op(amo_op)); + ofi_atomic_readwrite_handler(amo_op, datatype, dst, src, res, + count); break; case ofi_op_atomic_compare: - ofi_atomic_swap_handlers[op - OFI_SWAP_OP_START][datatype](dst, - src, cmp, res, count); + assert(ofi_atomic_isswap_op(amo_op)); + ofi_atomic_swap_handler(amo_op, datatype, dst, src, cmp, res, + count); break; default: /* Validated prior to calling function */ @@ -939,8 +1107,50 @@ static inline void rxm_do_atomic(struct rxm_pkt *pkt, void *dst, void *src, } } -static inline ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep, - struct rxm_rx_buf *rx_buf) +static int rxm_do_device_mem_atomic(struct rxm_mr *dev_mr, uint8_t op, + void *dev_dst, void *src, void *cmp, + void *res, size_t amo_count, + enum fi_datatype datatype, + enum fi_op amo_op, size_t amo_op_size) +{ + struct rxm_domain *dom = dev_mr->domain; + void *bounce_buf; + ssize_t ret __attribute__((unused)); + struct iovec iov = { + .iov_base = dev_dst, + .iov_len = amo_op_size, + }; + + fastlock_acquire(&dom->amo_bufpool_lock); + bounce_buf = ofi_buf_alloc(dom->amo_bufpool); + fastlock_release(&dom->amo_bufpool_lock); + + if (!bounce_buf) + return -FI_ENOMEM; + + fastlock_acquire(&dev_mr->amo_lock); + ret = ofi_copy_from_hmem_iov(bounce_buf, amo_op_size, dev_mr->iface, 0, + &iov, 1, 0); + assert(ret == amo_op_size); + + rxm_do_atomic(op, bounce_buf, src, cmp, res, amo_count, datatype, + amo_op); + + ret = ofi_copy_to_hmem_iov(dev_mr->iface, 0, &iov, 1, 0, bounce_buf, + amo_op_size); + assert(ret == amo_op_size); + + fastlock_release(&dev_mr->amo_lock); + + fastlock_acquire(&dom->amo_bufpool_lock); + ofi_buf_free(bounce_buf); + fastlock_release(&dom->amo_bufpool_lock); + + return FI_SUCCESS; +} + +static ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep, + struct rxm_rx_buf *rx_buf) { struct rxm_atomic_hdr *req_hdr = (struct rxm_atomic_hdr *) rx_buf->pkt.data; @@ -951,27 +1161,26 @@ static inline ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep, ssize_t result_len; uint64_t offset; int i; - int ret = 0; + ssize_t ret = 0; struct rxm_tx_atomic_buf *resp_buf; struct rxm_atomic_resp_hdr *resp_hdr; struct rxm_domain *domain = container_of(rxm_ep->util_ep.domain, struct rxm_domain, util_domain); + uint8_t op = rx_buf->pkt.hdr.op; assert(!(rx_buf->comp_flags & ~(FI_RECV | FI_RECV | FI_REMOTE_CQ_DATA))); - assert(rx_buf->pkt.hdr.op == ofi_op_atomic || - rx_buf->pkt.hdr.op == ofi_op_atomic_fetch || - rx_buf->pkt.hdr.op == ofi_op_atomic_compare); + assert(op == ofi_op_atomic || op == ofi_op_atomic_fetch || + op == ofi_op_atomic_compare); if (rx_buf->ep->srx_ctx) rx_buf->conn = rxm_key2conn(rx_buf->ep, rx_buf->pkt.ctrl_hdr.conn_id); - if (OFI_UNLIKELY(!rx_buf->conn)) + if (!rx_buf->conn) return -FI_EOTHER; - resp_buf = (struct rxm_tx_atomic_buf *) - rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_ATOMIC); - if (OFI_UNLIKELY(!resp_buf)) { + resp_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_ATOMIC); + if (!resp_buf) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "Unable to allocate from Atomic buffer pool\n"); /* TODO: Should this be -FI_ENOMEM - how does it get @@ -988,9 +1197,9 @@ static inline ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep, atomic_op)); if (ret) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, - "Atomic RMA MR verify error %d\n", ret); - ret = -FI_EACCES; - goto send_nak; + "Atomic RMA MR verify error %ld\n", ret); + return 
rxm_atomic_send_resp(rxm_ep, rx_buf, resp_buf, 0, + -FI_EACCES); } } @@ -999,74 +1208,98 @@ static inline ssize_t rxm_handle_atomic_req(struct rxm_ep *rxm_ep, resp_hdr = (struct rxm_atomic_resp_hdr *) resp_buf->pkt.data; for (i = 0, offset = 0; i < rx_buf->pkt.hdr.atomic.ioc_count; i++) { - rxm_do_atomic(&rx_buf->pkt, - (uintptr_t *) req_hdr->rma_ioc[i].addr, - req_hdr->data + offset, - req_hdr->data + len + offset, - resp_hdr->data + offset, - req_hdr->rma_ioc[i].count, datatype, atomic_op); - offset += req_hdr->rma_ioc[i].count * datatype_sz; + struct rxm_mr *mr = + rxm_mr_get_map_entry(domain, req_hdr->rma_ioc[i].key); + size_t amo_count = req_hdr->rma_ioc[i].count; + size_t amo_op_size = amo_count * datatype_sz; + void *src_buf = req_hdr->data + offset; + void *cmp_buf = req_hdr->data + len + offset; + void *res_buf = resp_hdr->data + offset; + void *dst_buf = (void *) req_hdr->rma_ioc[i].addr; + + if (mr->iface != FI_HMEM_SYSTEM) { + ret = rxm_do_device_mem_atomic(mr, op, dst_buf, src_buf, + cmp_buf, res_buf, + amo_count, datatype, + atomic_op, amo_op_size); + if (ret) { + FI_WARN(&rxm_prov, FI_LOG_EP_DATA, + "Atomic operation failed %ld\n", ret); + + return rxm_atomic_send_resp(rxm_ep, rx_buf, + resp_buf, 0, ret); + } + } else { + rxm_do_atomic(op, dst_buf, src_buf, cmp_buf, res_buf, + amo_count, datatype, atomic_op); + } + + offset += amo_op_size; } - result_len = rx_buf->pkt.hdr.op == ofi_op_atomic ? 0 : offset; + result_len = op == ofi_op_atomic ? 0 : offset; - if (rx_buf->pkt.hdr.op == ofi_op_atomic) + if (op == ofi_op_atomic) ofi_ep_rem_wr_cntr_inc(&rxm_ep->util_ep); else ofi_ep_rem_rd_cntr_inc(&rxm_ep->util_ep); return rxm_atomic_send_resp(rxm_ep, rx_buf, resp_buf, result_len, FI_SUCCESS); -send_nak: - return rxm_atomic_send_resp(rxm_ep, rx_buf, resp_buf, 0, ret); } - -static inline ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep, - struct rxm_rx_buf *rx_buf) +static ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep, + struct rxm_rx_buf *rx_buf) { struct rxm_tx_atomic_buf *tx_buf; - struct rxm_atomic_resp_hdr *resp_hdr = - (struct rxm_atomic_resp_hdr *) rx_buf->pkt.data; + struct rxm_atomic_resp_hdr *resp_hdr; + struct util_cntr *cntr = NULL; uint64_t len; - int ret = 0; + ssize_t copy_len; + ssize_t ret = 0; + enum fi_hmem_iface iface; + uint64_t device; + resp_hdr = (struct rxm_atomic_resp_hdr *) rx_buf->pkt.data; tx_buf = ofi_bufpool_get_ibuf(rxm_ep->buf_pools[RXM_BUF_POOL_TX_ATOMIC].pool, rx_buf->pkt.ctrl_hdr.msg_id); FI_DBG(&rxm_prov, FI_LOG_CQ, "received atomic response: op: %" PRIu8 " msg_id: 0x%" PRIx64 "\n", rx_buf->pkt.hdr.op, rx_buf->pkt.ctrl_hdr.msg_id); + iface = rxm_mr_desc_to_hmem_iface_dev(tx_buf->result_iov.desc, + tx_buf->result_iov.count, + &device); + assert(!(rx_buf->comp_flags & ~(FI_RECV | FI_REMOTE_CQ_DATA))); - if (OFI_UNLIKELY(resp_hdr->status)) { - struct util_cntr *cntr = NULL; + if (resp_hdr->status) { + ret = ntohl(resp_hdr->status); FI_WARN(&rxm_prov, FI_LOG_CQ, - "bad atomic response status %d\n", ntohl(resp_hdr->status)); + "bad atomic response status %d\n", + ntohl(resp_hdr->status)); + goto write_err; + } - if (tx_buf->pkt.hdr.op == ofi_op_atomic) { - cntr = rxm_ep->util_ep.wr_cntr; - } else if (tx_buf->pkt.hdr.op == ofi_op_atomic_compare || - tx_buf->pkt.hdr.op == ofi_op_atomic_fetch) { - cntr = rxm_ep->util_ep.rd_cntr; - } else { - FI_WARN(&rxm_prov, FI_LOG_CQ, - "unknown atomic request op!\n"); - assert(0); - } - rxm_cq_write_error(rxm_ep->util_ep.tx_cq, cntr, - tx_buf->app_context, ntohl(resp_hdr->status)); - goto err; + len = 
ofi_total_iov_len(tx_buf->result_iov.iov, + tx_buf->result_iov.count); + if (ntohl(resp_hdr->result_len) != len) { + ret = -FI_EIO; + FI_WARN(&rxm_prov, FI_LOG_CQ, "result size mismatch\n"); + goto write_err; } - len = ofi_total_iov_len(tx_buf->result_iov, tx_buf->result_iov_count); - assert(ntohl(resp_hdr->result_len) == len); - ofi_copy_to_iov(tx_buf->result_iov, tx_buf->result_iov_count, 0, - resp_hdr->data, len); + copy_len = ofi_copy_to_hmem_iov(iface, device, tx_buf->result_iov.iov, + tx_buf->result_iov.count, 0, resp_hdr->data, + len); + if (copy_len != len) { + ret = -FI_EIO; + FI_WARN(&rxm_prov, FI_LOG_CQ, "copy length error\n"); + goto write_err; + } if (!(tx_buf->flags & FI_INJECT)) - ret = rxm_cq_tx_comp_write(rxm_ep, - ofi_tx_cq_flags(tx_buf->pkt.hdr.op), - tx_buf->app_context, tx_buf->flags); + rxm_cq_write_tx_comp(rxm_ep, ofi_tx_cq_flags(tx_buf->pkt.hdr.op), + tx_buf->app_context, tx_buf->flags); if (tx_buf->pkt.hdr.op == ofi_op_atomic) { ofi_ep_wr_cntr_inc(&rxm_ep->util_ep); @@ -1074,23 +1307,61 @@ static inline ssize_t rxm_handle_atomic_resp(struct rxm_ep *rxm_ep, tx_buf->pkt.hdr.op == ofi_op_atomic_fetch) { ofi_ep_rd_cntr_inc(&rxm_ep->util_ep); } else { - FI_WARN(&rxm_prov, FI_LOG_CQ, "unknown atomic request op!\n"); - rxm_cq_write_error(rxm_ep->util_ep.tx_cq, NULL, - tx_buf->app_context, ntohl(resp_hdr->status)); - assert(0); + ret = -FI_EOPNOTSUPP; + goto write_err; } -err: - rxm_rx_buf_finish(rx_buf); +free: + rxm_rx_buf_free(rx_buf); ofi_buf_free(tx_buf); - + ofi_atomic_inc32(&rxm_ep->atomic_tx_credits); + assert(ofi_atomic_get32(&rxm_ep->atomic_tx_credits) <= + rxm_ep->rxm_info->tx_attr->size); return ret; + +write_err: + if (tx_buf->pkt.hdr.op == ofi_op_atomic) { + cntr = rxm_ep->util_ep.wr_cntr; + } else if (tx_buf->pkt.hdr.op == ofi_op_atomic_compare || + tx_buf->pkt.hdr.op == ofi_op_atomic_fetch) { + cntr = rxm_ep->util_ep.rd_cntr; + } else { + FI_WARN(&rxm_prov, FI_LOG_CQ, + "unknown atomic request op!\n"); + assert(0); + } + rxm_cq_write_error(rxm_ep->util_ep.tx_cq, cntr, + tx_buf->app_context, (int) ret); + goto free; } -static ssize_t rxm_cq_handle_comp(struct rxm_ep *rxm_ep, - struct fi_cq_data_entry *comp) +static ssize_t rxm_handle_credit(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf) +{ + struct rxm_domain *domain; + + assert(rx_buf->rx_ep->fid.fclass == FI_CLASS_EP); + domain = container_of(rxm_ep->util_ep.domain, struct rxm_domain, + util_domain); + domain->flow_ctrl_ops->add_credits(rx_buf->rx_ep, + rx_buf->pkt.ctrl_hdr.ctrl_data); + rxm_rx_buf_free(rx_buf); + return FI_SUCCESS; +} + +void rxm_finish_coll_eager_send(struct rxm_ep *rxm_ep, + struct rxm_tx_eager_buf *tx_eager_buf) +{ + if (tx_eager_buf->pkt.hdr.tag & OFI_COLL_TAG_FLAG) { + ofi_coll_handle_xfer_comp(tx_eager_buf->pkt.hdr.tag, + tx_eager_buf->app_context); + } else { + rxm_finish_eager_send(rxm_ep, tx_eager_buf); + } +}; + +ssize_t rxm_handle_comp(struct rxm_ep *rxm_ep, struct fi_cq_data_entry *comp) { - ssize_t ret; struct rxm_rx_buf *rx_buf; + struct rxm_tx_base_buf *tx_buf; struct rxm_tx_sar_buf *tx_sar_buf; struct rxm_tx_eager_buf *tx_eager_buf; struct rxm_tx_rndv_buf *tx_rndv_buf; @@ -1099,27 +1370,31 @@ static ssize_t rxm_cq_handle_comp(struct rxm_ep *rxm_ep, /* Remote write events may not consume a posted recv so op context * and hence state would be NULL */ - if (comp->flags & FI_REMOTE_WRITE) - return rxm_handle_remote_write(rxm_ep, comp); - - assert(RXM_GET_PROTO_STATE(comp->op_context) != RXM_INJECT_TX); + if (comp->flags & FI_REMOTE_WRITE) { + rxm_handle_remote_write(rxm_ep, 
comp); + return 0; + } switch (RXM_GET_PROTO_STATE(comp->op_context)) { case RXM_TX: tx_eager_buf = comp->op_context; - assert(comp->flags & FI_SEND); - ret = rxm_finish_eager_send(rxm_ep, tx_eager_buf); + rxm_ep->eager_ops->comp_tx(rxm_ep, tx_eager_buf); ofi_buf_free(tx_eager_buf); - return ret; - case RXM_SAR_TX: - tx_sar_buf = comp->op_context; + return 0; + case RXM_CREDIT_TX: + tx_buf = comp->op_context; assert(comp->flags & FI_SEND); - return rxm_finish_sar_segment_send(rxm_ep, tx_sar_buf); + ofi_buf_free(tx_buf); + return 0; + case RXM_INJECT_TX: + assert(0); + return 0; case RXM_RMA: rma_buf = comp->op_context; assert((comp->flags & (FI_WRITE | FI_RMA)) || (comp->flags & (FI_READ | FI_RMA))); - return rxm_finish_rma(rxm_ep, rma_buf, comp->flags); + rxm_finish_rma(rxm_ep, rma_buf, comp->flags); + return 0; case RXM_RX: rx_buf = comp->op_context; assert(!(comp->flags & FI_REMOTE_READ)); @@ -1128,55 +1403,240 @@ static ssize_t rxm_cq_handle_comp(struct rxm_ep *rxm_ep, switch (rx_buf->pkt.ctrl_hdr.type) { case rxm_ctrl_eager: - case rxm_ctrl_rndv: + case rxm_ctrl_rndv_req: return rxm_handle_recv_comp(rx_buf); - case rxm_ctrl_rndv_ack: - return rxm_rndv_handle_ack(rxm_ep, rx_buf); + case rxm_ctrl_rndv_rd_done: + rxm_rndv_handle_rd_done(rxm_ep, rx_buf); + return 0; + case rxm_ctrl_rndv_wr_done: + return rxm_rndv_handle_wr_done(rxm_ep, rx_buf); + case rxm_ctrl_rndv_wr_data: + return rxm_rndv_handle_wr_data(rx_buf); case rxm_ctrl_seg: return rxm_sar_handle_segment(rx_buf); case rxm_ctrl_atomic: return rxm_handle_atomic_req(rxm_ep, rx_buf); case rxm_ctrl_atomic_resp: return rxm_handle_atomic_resp(rxm_ep, rx_buf); + case rxm_ctrl_credit: + return rxm_handle_credit(rxm_ep, rx_buf); default: FI_WARN(&rxm_prov, FI_LOG_CQ, "Unknown message type\n"); assert(0); return -FI_EINVAL; } - case RXM_RNDV_TX: - tx_rndv_buf = comp->op_context; + case RXM_SAR_TX: + tx_sar_buf = comp->op_context; assert(comp->flags & FI_SEND); - RXM_UPDATE_STATE(FI_LOG_CQ, tx_rndv_buf, RXM_RNDV_ACK_WAIT); + rxm_handle_sar_comp(rxm_ep, tx_sar_buf); return 0; - case RXM_RNDV_ACK_RECVD: + case RXM_RNDV_TX: tx_rndv_buf = comp->op_context; assert(comp->flags & FI_SEND); - return rxm_rndv_tx_finish(rxm_ep, tx_rndv_buf); + if (rxm_ep->rndv_ops == &rxm_rndv_ops_write) + RXM_UPDATE_STATE(FI_LOG_CQ, tx_rndv_buf, + RXM_RNDV_WRITE_DATA_WAIT); + else + RXM_UPDATE_STATE(FI_LOG_CQ, tx_rndv_buf, + RXM_RNDV_READ_DONE_WAIT); + return 0; + case RXM_RNDV_READ_DONE_WAIT: + case RXM_RNDV_WRITE_DATA_WAIT: + assert(0); + return 0; case RXM_RNDV_READ: rx_buf = comp->op_context; assert(comp->flags & FI_READ); - if (++rx_buf->rndv_rma_index < rx_buf->rndv_hdr->count) + if (++rx_buf->rndv_rma_index < rx_buf->remote_rndv_hdr->count) return 0; - else - return rxm_rndv_send_ack(rx_buf); - case RXM_RNDV_ACK_SENT: + + rxm_rndv_send_rd_done(rx_buf); + return 0; + case RXM_RNDV_WRITE: + tx_rndv_buf = comp->op_context; + assert(comp->flags & FI_WRITE); + if (++tx_rndv_buf->write_rndv.rndv_rma_index < + tx_rndv_buf->write_rndv.rndv_rma_count) + return 0; + + rxm_rndv_send_wr_done(rxm_ep, tx_rndv_buf); + return 0; + case RXM_RNDV_READ_DONE_SENT: assert(comp->flags & FI_SEND); - return rxm_finish_send_rndv_ack(comp->op_context); - case RXM_ATOMIC_RESP_SENT: - tx_atomic_buf = comp->op_context; + rxm_rndv_rx_finish(comp->op_context); + return 0; + case RXM_RNDV_WRITE_DATA_SENT: + rx_buf = comp->op_context; assert(comp->flags & FI_SEND); - ofi_buf_free(tx_atomic_buf); + dlist_insert_tail(&rx_buf->rndv_wait_entry, &rx_buf->ep->rndv_wait_list); + 
RXM_UPDATE_STATE(FI_LOG_CQ, rx_buf, RXM_RNDV_WRITE_DONE_WAIT); + return 0; + case RXM_RNDV_WRITE_DONE_SENT: + case RXM_RNDV_READ_DONE_RECVD: + assert(comp->flags & FI_SEND || comp->flags & FI_WRITE); + rxm_rndv_tx_finish(rxm_ep, comp->op_context); + return 0; + case RXM_RNDV_WRITE_DONE_RECVD: + assert(comp->flags & FI_SEND); + rxm_rndv_rx_finish(comp->op_context); + return 0; + case RXM_RNDV_FINISH: + assert(0); return 0; case RXM_ATOMIC_RESP_WAIT: /* Optional atomic request completion; TX completion * processing is performed when atomic response is received */ assert(comp->flags & FI_SEND); return 0; + case RXM_ATOMIC_RESP_SENT: + tx_atomic_buf = comp->op_context; + assert(comp->flags & FI_SEND); + ofi_buf_free(tx_atomic_buf); + return 0; default: - FI_WARN(&rxm_prov, FI_LOG_CQ, "Invalid state!\n"); assert(0); - return -FI_EOPBADSTATE; + return 0; + } +} + +static int rxm_get_recv_entry(struct rxm_rx_buf *rx_buf) +{ + struct rxm_recv_match_attr match_attr; + struct rxm_recv_queue *recv_queue; + struct dlist_entry *entry; + + assert(!rx_buf->recv_entry); + if (rx_buf->ep->rxm_info->caps & (FI_SOURCE | FI_DIRECTED_RECV)) { + if (rx_buf->ep->srx_ctx) + rx_buf->conn = rxm_key2conn(rx_buf->ep, rx_buf-> + pkt.ctrl_hdr.conn_id); + if (!rx_buf->conn) + return -FI_EOTHER; + match_attr.addr = rx_buf->conn->handle.fi_addr; + } else { + match_attr.addr = FI_ADDR_UNSPEC; + } + + if (rx_buf->pkt.hdr.op == ofi_op_msg) { + match_attr.tag = 0; + recv_queue = &rx_buf->ep->recv_queue; + } else { + assert(rx_buf->pkt.hdr.op == ofi_op_tagged); + match_attr.tag = rx_buf->pkt.hdr.tag; + recv_queue = &rx_buf->ep->trecv_queue; + } + + /* See comment with rxm_get_dyn_rbuf */ + if (recv_queue->dyn_rbuf_unexp_cnt == 0) { + entry = dlist_remove_first_match(&recv_queue->recv_list, + recv_queue->match_recv, + &match_attr); + if (entry) { + rx_buf->recv_entry = container_of(entry, + struct rxm_recv_entry, entry); + } else { + recv_queue->dyn_rbuf_unexp_cnt++; + } + } else { + recv_queue->dyn_rbuf_unexp_cnt++; } + + return 0; +} + +/* + * Dynamic receive buffer callback from fi_cq_read(msg cq). + * We're holding the ep lock. + * + * There's a subtle race condition handling unexpected messages. If we cannot + * find a matching receive, the message will be marked as unexpected. + * However, we can't queue it on the unexpected list until is has been fully + * received and returned through fi_cq_read(). It's possible for the + * application to post the matching buffer prior to that occurring. That is, + * the matching buffer is posted after we checked for a match, but before the + * message endpoint is finishes receiving the unexpected data. + * + * Once the unexpected message has been received, it's completion may be + * written to the CQ. If the message provider continues processing messages + * it could invoke a callback for a second message. If we allow the second + * message to match the posted receive buffer, then the second message would + * match out of order from the first message. + * + * To handle this, we need to track the number of unexpected messages queued + * within the message provider, so that they can check for matching + * receives in order. If there are any unexpected messages outstanding, we + * need to fail all matches until they have been read from the CQ. 
+ */ +ssize_t rxm_get_dyn_rbuf(struct fi_cq_data_entry *entry, struct iovec *iov, + size_t *count) +{ + struct rxm_rx_buf *rx_buf; + int ret; + + rx_buf = entry->op_context; + assert((rx_buf->pkt.hdr.version == OFI_OP_VERSION) && + (rx_buf->pkt.ctrl_hdr.version == RXM_CTRL_VERSION)); + assert(!(rx_buf->ep->rxm_info->mode & FI_BUFFERED_RECV)); + + switch (rx_buf->pkt.ctrl_hdr.type) { + case rxm_ctrl_eager: + ret = rxm_get_recv_entry(rx_buf); + if (ret) + return ret; + + if (rx_buf->recv_entry) { + *count = rx_buf->recv_entry->rxm_iov.count; + memcpy(iov, rx_buf->recv_entry->rxm_iov.iov, *count * + sizeof(*iov)); + } else { + *count = 1; + iov[0].iov_base = &rx_buf->pkt + 1; + iov[0].iov_len = rxm_eager_limit; + } + break; + case rxm_ctrl_rndv_req: + /* find matching receive to maintain message ordering, but we + * only need to receive rendezvous header to complete message + */ + ret = rxm_get_recv_entry(rx_buf); + if (ret) + return ret; + + *count = 1; + iov[0].iov_base = &rx_buf->pkt + 1; + iov[0].iov_len = sizeof(struct rxm_rndv_hdr); + break; + case rxm_ctrl_atomic: + *count = 1; + iov[0].iov_base = &rx_buf->pkt + 1; + iov[0].iov_len = sizeof(struct rxm_atomic_hdr); + break; + case rxm_ctrl_atomic_resp: + *count = 1; + iov[0].iov_base = &rx_buf->pkt + 1; + iov[0].iov_len = sizeof(struct rxm_atomic_resp_hdr); + break; + case rxm_ctrl_rndv_wr_data: + *count = 1; + iov[0].iov_base = &rx_buf->pkt + 1; + iov[0].iov_len = sizeof(struct rxm_rndv_hdr); + break; + case rxm_ctrl_rndv_wr_done: + case rxm_ctrl_rndv_rd_done: + case rxm_ctrl_credit: + *count = 0; + iov[0].iov_base = NULL; + iov[0].iov_len = 0; + break; + case rxm_ctrl_seg: + default: + FI_WARN(&rxm_prov, FI_LOG_CQ, + "Unexpected request for dynamic rbuf\n"); + *count = 0; + break; + } + + return 0; } void rxm_cq_write_error(struct util_cq *cq, struct util_cntr *cntr, @@ -1189,13 +1649,14 @@ void rxm_cq_write_error(struct util_cq *cq, struct util_cntr *cntr, if (cntr) rxm_cntr_incerr(cntr); + if (ofi_cq_write_error(cq, &err_entry)) { FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to ofi_cq_write_error\n"); assert(0); } } -static void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err) +void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err) { struct fi_cq_err_entry err_entry = {0}; ssize_t ret = 0; @@ -1231,21 +1692,17 @@ static void rxm_cq_write_error_all(struct rxm_ep *rxm_ep, int err) rxm_cntr_incerr(rxm_ep->util_ep.rd_cntr); } -#define RXM_IS_PROTO_STATE_TX(state) \ - ((state == RXM_SAR_TX) || \ - (state == RXM_TX) || \ - (state == RXM_RNDV_TX)) - -static void rxm_cq_read_write_error(struct rxm_ep *rxm_ep) +void rxm_handle_comp_error(struct rxm_ep *rxm_ep) { + struct rxm_tx_base_buf *base_buf; struct rxm_tx_eager_buf *eager_buf; struct rxm_tx_sar_buf *sar_buf; struct rxm_tx_rndv_buf *rndv_buf; struct rxm_rx_buf *rx_buf; + struct rxm_rma_buf *rma_buf; + struct util_cq *cq; + struct util_cntr *cntr; struct fi_cq_err_entry err_entry = {0}; - struct util_cq *util_cq = NULL; - struct util_cntr *util_cntr = NULL; - enum rxm_proto_state state; ssize_t ret; ret = fi_cq_readerr(rxm_ep->msg_cq, &err_entry, 0); @@ -1256,111 +1713,137 @@ static void rxm_cq_read_write_error(struct rxm_ep *rxm_ep) return; } - if (err_entry.err == FI_ECANCELED) - OFI_CQ_STRERROR(&rxm_prov, FI_LOG_DEBUG, FI_LOG_CQ, - rxm_ep->msg_cq, &err_entry); - else + if (err_entry.err != FI_ECANCELED) OFI_CQ_STRERROR(&rxm_prov, FI_LOG_WARN, FI_LOG_CQ, rxm_ep->msg_cq, &err_entry); - state = RXM_GET_PROTO_STATE(err_entry.op_context); - if (RXM_IS_PROTO_STATE_TX(state)) { - util_cq 
= rxm_ep->util_ep.tx_cq; - util_cntr = rxm_ep->util_ep.tx_cntr; - } + cq = rxm_ep->util_ep.tx_cq; + cntr = rxm_ep->util_ep.tx_cntr; - switch (state) { + switch (RXM_GET_PROTO_STATE(err_entry.op_context)) { + case RXM_TX: + eager_buf = err_entry.op_context; + err_entry.op_context = eager_buf->app_context; + err_entry.flags = ofi_tx_cq_flags(eager_buf->pkt.hdr.op); + ofi_buf_free(eager_buf); + break; + case RXM_INJECT_TX: + assert(0); + return; + case RXM_RMA: + rma_buf = err_entry.op_context; + err_entry.op_context = rma_buf->app_context; + /* err_entry.flags pass through from msg ep */ + if (!(rma_buf->flags & FI_INJECT) && !rxm_ep->rdm_mr_local && + rxm_ep->msg_mr_local) { + rxm_msg_mr_closev(rma_buf->mr.mr, rma_buf->mr.count); + } + ofi_buf_free(rma_buf); + break; case RXM_SAR_TX: sar_buf = err_entry.op_context; err_entry.op_context = sar_buf->app_context; err_entry.flags = ofi_tx_cq_flags(sar_buf->pkt.hdr.op); + if (!rxm_complete_sar(rxm_ep, sar_buf)) + return; break; - case RXM_TX: - eager_buf = err_entry.op_context; - err_entry.op_context = eager_buf->app_context; - err_entry.flags = ofi_tx_cq_flags(eager_buf->pkt.hdr.op); + case RXM_CREDIT_TX: + base_buf = err_entry.op_context; + err_entry.op_context = 0; + err_entry.flags = ofi_tx_cq_flags(base_buf->pkt.hdr.op); break; + case RXM_RNDV_WRITE: + /* fall through */ case RXM_RNDV_TX: rndv_buf = err_entry.op_context; err_entry.op_context = rndv_buf->app_context; err_entry.flags = ofi_tx_cq_flags(rndv_buf->pkt.hdr.op); break; + + /* Incoming application data error */ case RXM_RX: - /* Silently drop any MSG CQ error entries for canceled receive - * operations as these are internal to RxM. This situation can - * happen when the MSG EP receives a reject / shutdown and CM - * thread hasn't handled the event yet. */ - if (err_entry.err == FI_ECANCELED) { - /* No need to re-post these buffers. Free directly */ + /* Silently drop MSG CQ error entries for internal receive + * operations not associated with an application posted + * receive. This situation can happen when the MSG EP + * receives a reject / shutdown and CM thread hasn't handled + * the event yet. 
+ */ + rx_buf = (struct rxm_rx_buf *) err_entry.op_context; + if (!rx_buf->recv_entry) { ofi_buf_free((struct rxm_rx_buf *)err_entry.op_context); return; } /* fall through */ - case RXM_RNDV_ACK_SENT: - /* fall through */ + case RXM_RNDV_READ_DONE_SENT: + case RXM_RNDV_WRITE_DATA_SENT: case RXM_RNDV_READ: - rx_buf = (struct rxm_rx_buf *)err_entry.op_context; - util_cq = rx_buf->ep->util_ep.rx_cq; - util_cntr = rx_buf->ep->util_ep.rx_cntr; + rx_buf = (struct rxm_rx_buf *) err_entry.op_context; assert(rx_buf->recv_entry); err_entry.op_context = rx_buf->recv_entry->context; err_entry.flags = rx_buf->recv_entry->comp_flags; + + cq = rx_buf->ep->util_ep.rx_cq; + cntr = rx_buf->ep->util_ep.rx_cntr; break; default: - FI_WARN(&rxm_prov, FI_LOG_CQ, "Invalid state!\n"); - FI_WARN(&rxm_prov, FI_LOG_CQ, "msg cq error info: %s\n", + FI_WARN(&rxm_prov, FI_LOG_CQ, "Invalid state!\nmsg cq error info: %s\n", fi_cq_strerror(rxm_ep->msg_cq, err_entry.prov_errno, err_entry.err_data, NULL, 0)); rxm_cq_write_error_all(rxm_ep, -FI_EOPBADSTATE); + return; } - if (util_cntr) - rxm_cntr_incerr(util_cntr); - if (util_cq) { - ret = ofi_cq_write_error(util_cq, &err_entry); - if (ret) { - FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to ofi_cq_write_error\n"); - assert(0); - } + + if (cntr) + rxm_cntr_incerr(cntr); + + assert(cq); + ret = ofi_cq_write_error(cq, &err_entry); + if (ret) { + FI_WARN(&rxm_prov, FI_LOG_CQ, "Unable to ofi_cq_write_error\n"); + assert(0); } } -static inline int rxm_msg_ep_recv(struct rxm_rx_buf *rx_buf) +static int rxm_post_recv(struct rxm_rx_buf *rx_buf) { - int ret; + struct rxm_domain *domain; + int ret, level; if (rx_buf->ep->srx_ctx) rx_buf->conn = NULL; rx_buf->hdr.state = RXM_RX; - - ret = (int)fi_recv(rx_buf->msg_ep, &rx_buf->pkt, - rxm_eager_limit + sizeof(struct rxm_pkt), - rx_buf->hdr.desc, FI_ADDR_UNSPEC, rx_buf); - if (OFI_LIKELY(!ret)) + rx_buf->recv_entry = NULL; + + domain = container_of(rx_buf->ep->util_ep.domain, + struct rxm_domain, util_domain); + ret = (int) fi_recv(rx_buf->rx_ep, &rx_buf->pkt, + domain->rx_post_size, rx_buf->hdr.desc, + FI_ADDR_UNSPEC, rx_buf); + if (!ret) return 0; if (ret != -FI_EAGAIN) { - int level = FI_LOG_WARN; - if (rx_buf->conn->handle.state == RXM_CMAP_SHUTDOWN) - level = FI_LOG_DEBUG; + level = (rx_buf->conn->handle.state == RXM_CMAP_SHUTDOWN) ? 
+ FI_LOG_DEBUG : FI_LOG_WARN; FI_LOG(&rxm_prov, level, FI_LOG_EP_CTRL, "unable to post recv buf: %d\n", ret); } return ret; } -int rxm_msg_ep_prepost_recv(struct rxm_ep *rxm_ep, struct fid_ep *msg_ep) +int rxm_prepost_recv(struct rxm_ep *rxm_ep, struct fid_ep *rx_ep) { struct rxm_rx_buf *rx_buf; int ret; size_t i; for (i = 0; i < rxm_ep->msg_info->rx_attr->size; i++) { - rx_buf = rxm_rx_buf_alloc(rxm_ep, msg_ep, 1); - if (OFI_UNLIKELY(!rx_buf)) + rx_buf = rxm_rx_buf_alloc(rxm_ep, rx_ep, true); + if (!rx_buf) return -FI_ENOMEM; - ret = rxm_msg_ep_recv(rx_buf); - if (OFI_UNLIKELY(ret)) { + ret = rxm_post_recv(rx_buf); + if (ret) { ofi_buf_free(&rx_buf->hdr); return ret; } @@ -1389,32 +1872,34 @@ void rxm_ep_do_progress(struct util_ep *util_ep) continue; } - ret = rxm_msg_ep_recv(buf); + ret = rxm_post_recv(buf); if (ret) { - if (OFI_LIKELY(ret == -FI_EAGAIN)) + if (ret == -FI_EAGAIN) ofi_buf_free(&buf->hdr); } } do { - ret = fi_cq_read(rxm_ep->msg_cq, &comp, 1); if (ret > 0) { - // We don't have enough info to write a good - // error entry to the CQ at this point - ret = rxm_cq_handle_comp(rxm_ep, &comp); - if (OFI_UNLIKELY(ret)) { + ret = rxm_handle_comp(rxm_ep, &comp); + if (ret) { + // We don't have enough info to write a good + // error entry to the CQ at this point rxm_cq_write_error_all(rxm_ep, ret); } else { ret = 1; } } else if (ret < 0 && (ret != -FI_EAGAIN)) { if (ret == -FI_EAVAIL) - rxm_cq_read_write_error(rxm_ep); + rxm_handle_comp_error(rxm_ep); else rxm_cq_write_error_all(rxm_ep, ret); - } else { - timestamp = fi_gettime_us(); + } + + if (ret == -FI_EAGAIN || --rxm_ep->cq_eq_fairness <= 0) { + rxm_ep->cq_eq_fairness = rxm_cq_eq_fairness; + timestamp = ofi_gettime_us(); if (timestamp - rxm_ep->msg_cq_last_poll > rxm_cm_progress_interval) { rxm_ep->msg_cq_last_poll = timestamp; @@ -1423,11 +1908,12 @@ void rxm_ep_do_progress(struct util_ep *util_ep) } } while ((ret > 0) && (++comp_read < rxm_ep->comp_per_progress)); - if (OFI_UNLIKELY(!dlist_empty(&rxm_ep->deferred_tx_conn_queue))) { + if (!dlist_empty(&rxm_ep->deferred_tx_conn_queue)) { dlist_foreach_container_safe(&rxm_ep->deferred_tx_conn_queue, struct rxm_conn, rxm_conn, - deferred_conn_entry, conn_entry_tmp) + deferred_conn_entry, conn_entry_tmp) { rxm_ep_progress_deferred_queue(rxm_ep, rxm_conn); + } } } @@ -1438,6 +1924,15 @@ void rxm_ep_progress(struct util_ep *util_ep) ofi_ep_lock_release(util_ep); } +void rxm_ep_progress_coll(struct util_ep *util_ep) +{ + ofi_ep_lock_acquire(util_ep); + rxm_ep_do_progress(util_ep); + ofi_ep_lock_release(util_ep); + + ofi_coll_ep_progress(&util_ep->ep_fid); +} + static int rxm_cq_close(struct fid *fid) { struct util_cq *util_cq; @@ -1483,7 +1978,7 @@ int rxm_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, return -FI_ENOMEM; ret = ofi_cq_init(&rxm_prov, domain, attr, util_cq, &ofi_cq_progress, - context); + context); if (ret) goto err1; diff --git a/prov/rxm/src/rxm_domain.c b/prov/rxm/src/rxm_domain.c index 6c9162c3b9c..0bcd8e3850f 100644 --- a/prov/rxm/src/rxm_domain.c +++ b/prov/rxm/src/rxm_domain.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2016 Intel Corporation, Inc. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -35,6 +36,7 @@ #include #include +#include #include "rxm.h" int rxm_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, @@ -71,6 +73,7 @@ static struct fi_ops_domain rxm_domain_ops = { .stx_ctx = fi_no_stx_context, .srx_ctx = fi_no_srx_context, .query_atomic = rxm_ep_query_atomic, + .query_collective = ofi_query_collective, }; static void rxm_mr_remove_map_entry(struct rxm_mr *mr) @@ -104,6 +107,17 @@ static int rxm_mr_add_map_entry(struct util_domain *domain, return ret; } +struct rxm_mr *rxm_mr_get_map_entry(struct rxm_domain *domain, uint64_t key) +{ + struct rxm_mr *mr; + + fastlock_acquire(&domain->util_domain.lock); + mr = ofi_mr_map_get(&domain->util_domain.mr_map, key); + fastlock_release(&domain->util_domain.lock); + + return mr; +} + static int rxm_domain_close(fid_t fid) { struct rxm_domain *rxm_domain; @@ -111,6 +125,9 @@ static int rxm_domain_close(fid_t fid) rxm_domain = container_of(fid, struct rxm_domain, util_domain.domain_fid.fid); + fastlock_destroy(&rxm_domain->amo_bufpool_lock); + ofi_bufpool_destroy(rxm_domain->amo_bufpool); + ret = fi_close(&rxm_domain->msg_domain->fid); if (ret) return ret; @@ -158,14 +175,74 @@ static struct fi_ops rxm_mr_ops = { .ops_open = fi_no_ops_open, }; +int rxm_msg_mr_reg_internal(struct rxm_domain *rxm_domain, const void *buf, + size_t len, uint64_t acs, uint64_t flags, struct fid_mr **mr) +{ + int ret, tries = 0; + + /* If we can't get a key within 1024 tries, give up */ + do { + ret = fi_mr_reg(rxm_domain->msg_domain, buf, len, acs, 0, + rxm_domain->mr_key++ | FI_PROV_SPECIFIC, + flags, mr, NULL); + } while (ret == -FI_ENOKEY && tries++ < 1024); + + return ret; +} + +void rxm_msg_mr_closev(struct fid_mr **mr, size_t count) +{ + int ret; + size_t i; + + for (i = 0; i < count; i++) { + if (mr[i]) { + ret = fi_close(&mr[i]->fid); + if (ret) + FI_WARN(&rxm_prov, FI_LOG_EP_DATA, + "Unable to close msg mr: %zu\n", i); + mr[i] = NULL; + } + } +} + +int rxm_msg_mr_regv(struct rxm_ep *rxm_ep, const struct iovec *iov, + size_t count, size_t reg_limit, uint64_t access, + struct fid_mr **mr) +{ + struct rxm_domain *rxm_domain; + size_t i; + int ret; + + rxm_domain = container_of(rxm_ep->util_ep.domain, struct rxm_domain, + util_domain); + + for (i = 0; i < count && reg_limit; i++) { + size_t len = MIN(iov[i].iov_len, reg_limit); + ret = rxm_msg_mr_reg_internal(rxm_domain, iov[i].iov_base, + len, access, 0, &mr[i]); + if (ret) + goto err; + reg_limit -= len; + } + return 0; +err: + rxm_msg_mr_closev(mr, i); + return ret; +} + +/* Large send/recv transfers use RMA rendezvous protocol */ static uint64_t rxm_mr_get_msg_access(struct rxm_domain *rxm_domain, uint64_t access) { - /* Additional flags to use RMA read for large message transfers */ - access |= FI_READ | FI_REMOTE_READ; + if (access & FI_SEND) { + access |= rxm_use_write_rndv ? FI_WRITE : FI_REMOTE_READ; + } + + if (access & FI_RECV) { + access |= rxm_use_write_rndv ? FI_REMOTE_WRITE : FI_READ; + } - if (rxm_domain->mr_local) - access |= FI_WRITE; return access; } @@ -175,10 +252,7 @@ static void rxm_mr_init(struct rxm_mr *rxm_mr, struct rxm_domain *domain, rxm_mr->mr_fid.fid.fclass = FI_CLASS_MR; rxm_mr->mr_fid.fid.context = context; rxm_mr->mr_fid.fid.ops = &rxm_mr_ops; - /* Store msg_mr as rxm_mr descriptor so that we can get its key when - * the app passes msg_mr as the descriptor in fi_send and friends. - * The key would be used in large message transfer protocol and RMA. 
*/ - rxm_mr->mr_fid.mem_desc = rxm_mr->msg_mr; + rxm_mr->mr_fid.mem_desc = rxm_mr; rxm_mr->mr_fid.key = fi_mr_key(rxm_mr->msg_mr); rxm_mr->domain = domain; ofi_atomic_inc32(&domain->util_domain.ref); @@ -199,6 +273,10 @@ static int rxm_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, if (!rxm_mr) return -FI_ENOMEM; + ofi_mr_update_attr(rxm_domain->util_domain.fabric->fabric_fid.api_version, + rxm_domain->util_domain.info_domain_caps, attr, + &msg_attr); + msg_attr.access = rxm_mr_get_msg_access(rxm_domain, attr->access); ret = fi_mr_regattr(rxm_domain->msg_domain, &msg_attr, @@ -208,6 +286,9 @@ static int rxm_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, goto err; } rxm_mr_init(rxm_mr, rxm_domain, attr->context); + fastlock_init(&rxm_mr->amo_lock); + rxm_mr->iface = msg_attr.iface; + rxm_mr->device = msg_attr.device.reserved; *mr = &rxm_mr->mr_fid; if (rxm_domain->util_domain.info_domain_caps & FI_ATOMIC) { @@ -297,13 +378,146 @@ static struct fi_ops_mr rxm_domain_mr_ops = { .regattr = rxm_mr_regattr, }; +static ssize_t rxm_send_credits(struct fid_ep *ep, size_t credits) +{ + struct rxm_conn *rxm_conn = + container_of(ep->fid.context, struct rxm_conn, handle); + struct rxm_ep *rxm_ep = rxm_conn->handle.cmap->ep; + struct rxm_deferred_tx_entry *def_tx_entry; + struct rxm_tx_base_buf *tx_buf; + struct iovec iov; + struct fi_msg msg; + ssize_t ret; + + tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_CREDIT); + if (!tx_buf) { + FI_WARN(&rxm_prov, FI_LOG_EP_DATA, + "Ran out of buffers from TX credit buffer pool.\n"); + return -FI_ENOMEM; + } + + rxm_ep_format_tx_buf_pkt(rxm_conn, 0, rxm_ctrl_credit, 0, 0, FI_SEND, + &tx_buf->pkt); + tx_buf->pkt.ctrl_hdr.type = rxm_ctrl_credit; + tx_buf->pkt.ctrl_hdr.msg_id = ofi_buf_index(tx_buf); + tx_buf->pkt.ctrl_hdr.ctrl_data = credits; + + if (rxm_conn->handle.state != RXM_CMAP_CONNECTED) + goto defer; + + iov.iov_base = &tx_buf->pkt; + iov.iov_len = sizeof(struct rxm_pkt); + msg.msg_iov = &iov; + msg.iov_count = 1; + msg.context = tx_buf; + msg.desc = &tx_buf->hdr.desc; + + ret = fi_sendmsg(ep, &msg, FI_PRIORITY); + if (!ret) + return FI_SUCCESS; + +defer: + def_tx_entry = rxm_ep_alloc_deferred_tx_entry( + rxm_ep, rxm_conn, RXM_DEFERRED_TX_CREDIT_SEND); + if (!def_tx_entry) { + FI_WARN(&rxm_prov, FI_LOG_CQ, + "unable to allocate TX entry for deferred CREDIT mxg\n"); + ofi_buf_free(tx_buf); + return -FI_ENOMEM; + } + + def_tx_entry->credit_msg.tx_buf = tx_buf; + rxm_ep_enqueue_deferred_tx_queue_priority(def_tx_entry); + return FI_SUCCESS; +} + +static void rxm_no_set_threshold(struct fid_ep *ep_fid, size_t threshold) +{ } + +static void rxm_no_add_credits(struct fid_ep *ep_fid, size_t credits) +{ } + +static void rxm_no_credit_handler(struct fid_domain *domain_fid, + ssize_t (*credit_handler)(struct fid_ep *ep, size_t credits)) +{ } + +static int rxm_no_enable_flow_ctrl(struct fid_ep *ep_fid) +{ + return -FI_ENOSYS; +} + +struct ofi_ops_flow_ctrl rxm_no_ops_flow_ctrl = { + .size = sizeof(struct ofi_ops_flow_ctrl), + .set_threshold = rxm_no_set_threshold, + .add_credits = rxm_no_add_credits, + .enable = rxm_no_enable_flow_ctrl, + .set_send_handler = rxm_no_credit_handler, +}; + +static int rxm_config_flow_ctrl(struct rxm_domain *domain) +{ + struct ofi_ops_flow_ctrl *flow_ctrl_ops; + int ret; + + ret = fi_open_ops(&domain->msg_domain->fid, OFI_OPS_FLOW_CTRL, 0, + (void **) &flow_ctrl_ops, NULL); + if (ret) { + if (ret == -FI_ENOSYS) { + domain->flow_ctrl_ops = &rxm_no_ops_flow_ctrl; + return 0; + } + return ret; + } + + 
assert(flow_ctrl_ops); + domain->flow_ctrl_ops = flow_ctrl_ops; + domain->flow_ctrl_ops->set_send_handler(domain->msg_domain, + rxm_send_credits); + return 0; +} + +struct ofi_ops_dynamic_rbuf rxm_dynamic_rbuf = { + .size = sizeof(struct ofi_ops_dynamic_rbuf), + .get_rbuf = rxm_get_dyn_rbuf, +}; + +static void rxm_config_dyn_rbuf(struct rxm_domain *domain, struct fi_info *info, + struct fi_info *msg_info) +{ + int ret = 0; + + /* Collective support requires rxm generated and consumed messages. + * Although we could update the code to handle receiving collective + * messages, collective support is mostly for development purposes. + * So, fallback to bounce buffers when enabled. + * We also can't pass through HMEM buffers, unless the lower layer + * can handle them. + */ + if ((info->caps & FI_COLLECTIVE) || + ((info->caps & FI_HMEM) && !(msg_info->caps & FI_HMEM))) + return; + + fi_param_get_bool(&rxm_prov, "enable_dyn_rbuf", &ret); + domain->dyn_rbuf = (ret != 0); + if (!domain->dyn_rbuf) + return; + + ret = fi_set_ops(&domain->msg_domain->fid, OFI_OPS_DYNAMIC_RBUF, 0, + (void *) &rxm_dynamic_rbuf, NULL); + domain->dyn_rbuf = (ret == FI_SUCCESS); + + if (domain->dyn_rbuf) { + domain->rx_post_size = sizeof(struct rxm_pkt); + } +} + int rxm_domain_open(struct fid_fabric *fabric, struct fi_info *info, struct fid_domain **domain, void *context) { - int ret; struct rxm_domain *rxm_domain; struct rxm_fabric *rxm_fabric; struct fi_info *msg_info; + int ret; rxm_domain = calloc(1, sizeof(*rxm_domain)); if (!rxm_domain) @@ -312,17 +526,10 @@ int rxm_domain_open(struct fid_fabric *fabric, struct fi_info *info, rxm_fabric = container_of(fabric, struct rxm_fabric, util_fabric.fabric_fid); ret = ofi_get_core_info(fabric->api_version, NULL, NULL, 0, &rxm_util_prov, - info, rxm_info_to_core, &msg_info); + info, NULL, rxm_info_to_core, &msg_info); if (ret) goto err1; - /* Force core provider to supply MR key */ - if (FI_VERSION_LT(fabric->api_version, FI_VERSION(1, 5)) || - (msg_info->domain_attr->mr_mode & (FI_MR_BASIC | FI_MR_SCALABLE))) - msg_info->domain_attr->mr_mode = FI_MR_BASIC; - else - msg_info->domain_attr->mr_mode |= FI_MR_PROV_KEY; - ret = fi_domain(rxm_fabric->msg_fabric, msg_info, &rxm_domain->msg_domain, context); if (ret) @@ -333,22 +540,39 @@ int rxm_domain_open(struct fid_fabric *fabric, struct fi_info *info, goto err3; } - /* We maintain an RMA key to MR map used for emulated atomic access - * and bounds validation. We turn off the map mode bit FI_MR_PROV_KEY - * since we specify the key used by MSG_EP provider. */ + /* We turn off the mr map mode bit FI_MR_PROV_KEY. We always use the + * key returned by the MSG provider. That key may be generated by the + * MSG provider, or will be provided as input by the rxm provider. 
+ */ rxm_domain->util_domain.mr_map.mode &= ~FI_MR_PROV_KEY; rxm_domain->max_atomic_size = rxm_ep_max_atomic_size(info); + rxm_domain->rx_post_size = rxm_buffer_size; + *domain = &rxm_domain->util_domain.domain_fid; (*domain)->fid.ops = &rxm_domain_fi_ops; /* Replace MR ops set by ofi_domain_init() */ (*domain)->mr = &rxm_domain_mr_ops; (*domain)->ops = &rxm_domain_ops; - rxm_domain->mr_local = ofi_mr_local(msg_info) && !ofi_mr_local(info); + ret = ofi_bufpool_create(&rxm_domain->amo_bufpool, + rxm_domain->max_atomic_size, 64, 0, 0, 0); + if (ret) + goto err3; + + fastlock_init(&rxm_domain->amo_bufpool_lock); + + ret = rxm_config_flow_ctrl(rxm_domain); + if (ret) + goto err4; + + rxm_config_dyn_rbuf(rxm_domain, info, msg_info); fi_freeinfo(msg_info); return 0; +err4: + fastlock_destroy(&rxm_domain->amo_bufpool_lock); + ofi_bufpool_destroy(rxm_domain->amo_bufpool); err3: fi_close(&rxm_domain->msg_domain->fid); err2: diff --git a/prov/rxm/src/rxm_ep.c b/prov/rxm/src/rxm_ep.c index 0ac88fe3069..61ae89f0213 100644 --- a/prov/rxm/src/rxm_ep.c +++ b/prov/rxm/src/rxm_ep.c @@ -1,5 +1,7 @@ /* - * Copyright (c) 2013-2016 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2020 Intel Corporation. All rights reserved. + * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -34,8 +36,10 @@ #include #include +#include #include "ofi.h" #include +#include #include "rxm.h" @@ -56,7 +60,7 @@ static int rxm_match_recv_entry(struct dlist_entry *item, const void *arg) static int rxm_match_recv_entry_tag(struct dlist_entry *item, const void *arg) { - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *)arg; + struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; struct rxm_recv_entry *recv_entry = container_of(item, struct rxm_recv_entry, entry); return ofi_match_tag(recv_entry->tag, recv_entry->ignore, attr->tag); @@ -64,7 +68,7 @@ static int rxm_match_recv_entry_tag(struct dlist_entry *item, const void *arg) static int rxm_match_recv_entry_tag_addr(struct dlist_entry *item, const void *arg) { - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *)arg; + struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; struct rxm_recv_entry *recv_entry = container_of(item, struct rxm_recv_entry, entry); return ofi_match_addr(recv_entry->addr, attr->addr) && @@ -88,7 +92,7 @@ static int rxm_match_unexp_msg(struct dlist_entry *item, const void *arg) static int rxm_match_unexp_msg_tag(struct dlist_entry *item, const void *arg) { - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *)arg; + struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; struct rxm_unexp_msg *unexp_msg = container_of(item, struct rxm_unexp_msg, entry); return ofi_match_tag(attr->tag, attr->ignore, unexp_msg->tag); @@ -96,47 +100,44 @@ static int rxm_match_unexp_msg_tag(struct dlist_entry *item, const void *arg) static int rxm_match_unexp_msg_tag_addr(struct dlist_entry *item, const void *arg) { - struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *)arg; + struct rxm_recv_match_attr *attr = (struct rxm_recv_match_attr *) arg; struct rxm_unexp_msg *unexp_msg = container_of(item, struct rxm_unexp_msg, entry); return ofi_match_addr(attr->addr, unexp_msg->addr) && ofi_match_tag(attr->tag, attr->ignore, unexp_msg->tag); } -static inline int 
-rxm_mr_buf_reg(struct rxm_ep *rxm_ep, void *addr, size_t len, void **context) +static int rxm_buf_reg(struct ofi_bufpool_region *region) { - int ret = FI_SUCCESS; - struct fid_mr *mr; - struct rxm_domain *rxm_domain = container_of(rxm_ep->util_ep.domain, - struct rxm_domain, util_domain); - - *context = NULL; - if (rxm_ep->msg_mr_local) { - struct fid_domain *msg_domain = - (struct fid_domain *)rxm_domain->msg_domain; + struct rxm_buf_pool *pool = region->pool->attr.context; + struct rxm_ep *rxm_ep = pool->rxm_ep; + struct rxm_domain *rxm_domain; + int ret; + bool hmem_enabled = !!(rxm_ep->util_ep.caps & FI_HMEM); - ret = fi_mr_reg(msg_domain, addr, len, - FI_SEND | FI_RECV | FI_READ | FI_WRITE, - 0, 0, 0, &mr, NULL); - *context = mr; + if (hmem_enabled) { + ret = ofi_hmem_host_register(region->mem_region, + region->pool->region_size); + if (ret != FI_SUCCESS) + return ret; } - return ret; -} + if ((pool->type == RXM_BUF_POOL_TX_INJECT) || + !pool->rxm_ep->msg_mr_local) + return 0; -static int rxm_buf_reg(struct ofi_bufpool_region *region) -{ - struct rxm_buf_pool *pool = region->pool->attr.context; - int ret; + rxm_domain = container_of(pool->rxm_ep->util_ep.domain, + struct rxm_domain, util_domain); - if ((pool->type != RXM_BUF_POOL_TX_INJECT) && - pool->rxm_ep->msg_mr_local) { - ret = rxm_mr_buf_reg(pool->rxm_ep, region->mem_region, - region->pool->region_size, - ®ion->context); - } else { - ret = 0; + ret = rxm_msg_mr_reg_internal(rxm_domain, region->mem_region, + region->pool->region_size, + FI_SEND | FI_RECV | FI_READ | FI_WRITE, + OFI_MR_NOCACHE, + (struct fid_mr **) ®ion->context); + + if (ret != FI_SUCCESS) { + if (hmem_enabled) + ofi_hmem_host_unregister(region->mem_region); } return ret; @@ -156,7 +157,8 @@ static void rxm_buf_init(struct ofi_bufpool_region *region, void *buf) void *mr_desc; uint8_t type; - if ((pool->type != RXM_BUF_POOL_TX_INJECT) && pool->rxm_ep->msg_mr_local) { + if ((pool->type != RXM_BUF_POOL_TX_INJECT) && + pool->rxm_ep->msg_mr_local) { mr_desc = fi_mr_desc((struct fid_mr *) region->context); } else { mr_desc = NULL; @@ -194,12 +196,20 @@ static void rxm_buf_init(struct ofi_bufpool_region *region, void *buf) pkt = &tx_sar_buf->pkt; type = rxm_ctrl_seg; break; - case RXM_BUF_POOL_TX_RNDV: + case RXM_BUF_POOL_TX_CREDIT: + tx_base_buf = buf; + tx_base_buf->hdr.state = RXM_CREDIT_TX; + + tx_base_buf->hdr.desc = mr_desc; + pkt = &tx_base_buf->pkt; + type = rxm_ctrl_credit; + break; + case RXM_BUF_POOL_TX_RNDV_REQ: tx_rndv_buf = buf; tx_rndv_buf->hdr.desc = mr_desc; pkt = &tx_rndv_buf->pkt; - type = rxm_ctrl_rndv; + type = rxm_ctrl_rndv_req; break; case RXM_BUF_POOL_TX_ATOMIC: tx_atomic_buf = buf; @@ -208,13 +218,29 @@ static void rxm_buf_init(struct ofi_bufpool_region *region, void *buf) pkt = &tx_atomic_buf->pkt; type = rxm_ctrl_atomic; break; - case RXM_BUF_POOL_TX_ACK: + case RXM_BUF_POOL_TX_RNDV_RD_DONE: + tx_base_buf = buf; + tx_base_buf->pkt.hdr.op = ofi_op_msg; + + tx_base_buf->hdr.desc = mr_desc; + pkt = &tx_base_buf->pkt; + type = rxm_ctrl_rndv_rd_done; + break; + case RXM_BUF_POOL_TX_RNDV_WR_DONE: tx_base_buf = buf; tx_base_buf->pkt.hdr.op = ofi_op_msg; tx_base_buf->hdr.desc = mr_desc; pkt = &tx_base_buf->pkt; - type = rxm_ctrl_rndv_ack; + type = rxm_ctrl_rndv_wr_done; + break; + case RXM_BUF_POOL_TX_RNDV_WR_DATA: + tx_base_buf = buf; + tx_base_buf->pkt.hdr.op = ofi_op_msg; + + tx_base_buf->hdr.desc = mr_desc; + pkt = &tx_base_buf->pkt; + type = rxm_ctrl_rndv_wr_data; break; case RXM_BUF_POOL_RMA: rma_buf = buf; @@ -238,10 +264,14 @@ static void 
rxm_buf_init(struct ofi_bufpool_region *region, void *buf) } } -static inline void rxm_buf_close(struct ofi_bufpool_region *region) +static void rxm_buf_close(struct ofi_bufpool_region *region) { struct rxm_buf_pool *pool = region->pool->attr.context; struct rxm_ep *rxm_ep = pool->rxm_ep; + bool hmem_enabled = !!(rxm_ep->util_ep.caps & FI_HMEM); + + if (hmem_enabled) + ofi_hmem_host_unregister(region->mem_region); if ((rxm_ep->msg_mr_local) && (pool->type != RXM_BUF_POOL_TX_INJECT)) { /* We would get a (fid_mr *) in context but @@ -280,7 +310,8 @@ static int rxm_buf_pool_create(struct rxm_ep *rxm_ep, size_t size, pool->type = type; ret = ofi_bufpool_create_attr(&attr, &pool->pool); if (ret) - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "Unable to create buf pool\n"); + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, + "Unable to create buf pool\n"); return ret; } @@ -310,7 +341,8 @@ static int rxm_recv_queue_init(struct rxm_ep *rxm_ep, struct rxm_recv_queue *re { recv_queue->rxm_ep = rxm_ep; recv_queue->type = type; - recv_queue->fs = rxm_recv_fs_create(size, rxm_recv_entry_init, recv_queue); + recv_queue->fs = rxm_recv_fs_create(size, rxm_recv_entry_init, + recv_queue); if (!recv_queue->fs) return -FI_ENOMEM; @@ -349,38 +381,33 @@ static void rxm_recv_queue_close(struct rxm_recv_queue *recv_queue) static int rxm_ep_txrx_pool_create(struct rxm_ep *rxm_ep) { int ret, i; - size_t queue_sizes[] = { - [RXM_BUF_POOL_RX] = rxm_ep->msg_info->rx_attr->size, - [RXM_BUF_POOL_TX] = rxm_ep->msg_info->tx_attr->size, - [RXM_BUF_POOL_TX_INJECT] = rxm_ep->msg_info->tx_attr->size, - [RXM_BUF_POOL_TX_ACK] = rxm_ep->msg_info->tx_attr->size, - [RXM_BUF_POOL_TX_RNDV] = rxm_ep->msg_info->tx_attr->size, - [RXM_BUF_POOL_TX_ATOMIC] = rxm_ep->msg_info->tx_attr->size, - [RXM_BUF_POOL_TX_SAR] = rxm_ep->msg_info->tx_attr->size, - [RXM_BUF_POOL_RMA] = rxm_ep->msg_info->tx_attr->size, - }; - size_t entry_sizes[] = { + size_t entry_sizes[] = { [RXM_BUF_POOL_RX] = rxm_eager_limit + sizeof(struct rxm_rx_buf), [RXM_BUF_POOL_TX] = rxm_eager_limit + sizeof(struct rxm_tx_eager_buf), [RXM_BUF_POOL_TX_INJECT] = rxm_ep->inject_limit + sizeof(struct rxm_tx_base_buf), - [RXM_BUF_POOL_TX_ACK] = sizeof(struct rxm_tx_base_buf), - [RXM_BUF_POOL_TX_RNDV] = sizeof(struct rxm_rndv_hdr) + + [RXM_BUF_POOL_TX_RNDV_RD_DONE] = sizeof(struct rxm_tx_base_buf), + [RXM_BUF_POOL_TX_RNDV_WR_DONE] = sizeof(struct rxm_tx_base_buf), + [RXM_BUF_POOL_TX_RNDV_REQ] = sizeof(struct rxm_rndv_hdr) + rxm_ep->buffered_min + sizeof(struct rxm_tx_rndv_buf), + [RXM_BUF_POOL_TX_RNDV_WR_DATA] = sizeof(struct rxm_rndv_hdr) + + sizeof(struct rxm_tx_base_buf), [RXM_BUF_POOL_TX_ATOMIC] = rxm_eager_limit + sizeof(struct rxm_tx_atomic_buf), [RXM_BUF_POOL_TX_SAR] = rxm_eager_limit + sizeof(struct rxm_tx_sar_buf), + [RXM_BUF_POOL_TX_CREDIT] = sizeof(struct rxm_tx_base_buf), [RXM_BUF_POOL_RMA] = rxm_eager_limit + sizeof(struct rxm_rma_buf), }; dlist_init(&rxm_ep->repost_ready_list); - rxm_ep->buf_pools = calloc(1, RXM_BUF_POOL_MAX * sizeof(*rxm_ep->buf_pools)); + rxm_ep->buf_pools = calloc(1, RXM_BUF_POOL_MAX * + sizeof(*rxm_ep->buf_pools)); if (!rxm_ep->buf_pools) return -FI_ENOMEM; @@ -390,15 +417,17 @@ static int rxm_ep_txrx_pool_create(struct rxm_ep *rxm_ep) continue; ret = rxm_buf_pool_create(rxm_ep, entry_sizes[i], - (i == RXM_BUF_POOL_RX ? 0 : - rxm_ep->rxm_info->tx_attr->size), - queue_sizes[i], + (i == RXM_BUF_POOL_RX || + i == RXM_BUF_POOL_TX_ATOMIC) ? 
0 : + rxm_ep->rxm_info->tx_attr->size, + 1024, &rxm_ep->buf_pools[i], i); if (ret) goto err; } return FI_SUCCESS; + err: while (--i >= RXM_BUF_POOL_START) rxm_buf_pool_destroy(&rxm_ep->buf_pools[i]); @@ -415,6 +444,22 @@ static void rxm_ep_txrx_pool_destroy(struct rxm_ep *rxm_ep) free(rxm_ep->buf_pools); } +static int rxm_multi_recv_pool_init(struct rxm_ep *rxm_ep) +{ + struct ofi_bufpool_attr attr = { + .size = sizeof(struct rxm_recv_entry), + .alignment = 16, + .max_cnt = 0, + .chunk_cnt = 16, + .alloc_fn = NULL, + .init_fn = NULL, + .context = rxm_ep, + .flags = OFI_BUFPOOL_NO_TRACK, + }; + + return ofi_bufpool_create_attr(&attr, &rxm_ep->multi_recv_pool); +} + static int rxm_ep_rx_queue_init(struct rxm_ep *rxm_ep) { int ret; @@ -431,7 +476,14 @@ static int rxm_ep_rx_queue_init(struct rxm_ep *rxm_ep) if (ret) goto err_recv_tag; + ret = rxm_multi_recv_pool_init(rxm_ep); + if (ret) + goto err_multi; + return FI_SUCCESS; + +err_multi: + rxm_recv_queue_close(&rxm_ep->trecv_queue); err_recv_tag: rxm_recv_queue_close(&rxm_ep->recv_queue); return ret; @@ -448,6 +500,8 @@ static void rxm_ep_rx_queue_close(struct rxm_ep *rxm_ep) static void rxm_ep_txrx_res_close(struct rxm_ep *rxm_ep) { rxm_ep_rx_queue_close(rxm_ep); + if (rxm_ep->multi_recv_pool) + ofi_bufpool_destroy(rxm_ep->multi_recv_pool); if (rxm_ep->buf_pools) rxm_ep_txrx_pool_destroy(rxm_ep); } @@ -468,6 +522,19 @@ static int rxm_getname(fid_t fid, void *addr, size_t *addrlen) return fi_getname(&rxm_ep->msg_pep->fid, addr, addrlen); } +static int rxm_join_coll(struct fid_ep *ep, const void *addr, uint64_t flags, + struct fid_mc **mc, void *context) +{ + struct fi_collective_addr *c_addr; + + if (!(flags & FI_COLLECTIVE)) + return -FI_ENOSYS; + + c_addr = (struct fi_collective_addr *) addr; + return ofi_join_collective(ep, c_addr->coll_addr, c_addr->set, flags, + mc, context); +} + static struct fi_ops_cm rxm_ops_cm = { .size = sizeof(struct fi_ops_cm), .setname = rxm_setname, @@ -478,11 +545,21 @@ static struct fi_ops_cm rxm_ops_cm = { .accept = fi_no_accept, .reject = fi_no_reject, .shutdown = fi_no_shutdown, - .join = fi_no_join, + .join = rxm_join_coll, +}; + +static struct rxm_eager_ops def_eager_ops = { + .comp_tx = rxm_finish_eager_send, + .handle_rx = rxm_handle_eager, +}; + +static struct rxm_eager_ops coll_eager_ops = { + .comp_tx = rxm_finish_coll_eager_send, + .handle_rx = rxm_handle_coll_eager, }; -static int rxm_ep_cancel_recv(struct rxm_ep *rxm_ep, - struct rxm_recv_queue *recv_queue, void *context) +static bool rxm_ep_cancel_recv(struct rxm_ep *rxm_ep, + struct rxm_recv_queue *recv_queue, void *context) { struct fi_cq_err_entry err_entry; struct rxm_recv_entry *recv_entry; @@ -493,35 +570,35 @@ static int rxm_ep_cancel_recv(struct rxm_ep *rxm_ep, entry = dlist_remove_first_match(&recv_queue->recv_list, rxm_match_recv_entry_context, context); - if (entry) { - recv_entry = container_of(entry, struct rxm_recv_entry, entry); - memset(&err_entry, 0, sizeof(err_entry)); - err_entry.op_context = recv_entry->context; - err_entry.flags |= recv_entry->comp_flags; - err_entry.tag = recv_entry->tag; - err_entry.err = FI_ECANCELED; - err_entry.prov_errno = -FI_ECANCELED; - rxm_recv_entry_release(recv_queue, recv_entry); - ret = ofi_cq_write_error(rxm_ep->util_ep.rx_cq, &err_entry); - } else { - ret = 0; + if (!entry) + goto unlock; + + recv_entry = container_of(entry, struct rxm_recv_entry, entry); + memset(&err_entry, 0, sizeof(err_entry)); + err_entry.op_context = recv_entry->context; + err_entry.flags |= recv_entry->comp_flags; + 
err_entry.tag = recv_entry->tag; + err_entry.err = FI_ECANCELED; + err_entry.prov_errno = -FI_ECANCELED; + rxm_recv_entry_release(recv_entry); + ret = ofi_cq_write_error(rxm_ep->util_ep.rx_cq, &err_entry); + if (ret) { + FI_WARN(&rxm_prov, FI_LOG_CQ, "Error writing to CQ\n"); + assert(0); } + +unlock: ofi_ep_lock_release(&rxm_ep->util_ep); - return ret; + return entry != NULL; } static ssize_t rxm_ep_cancel(fid_t fid_ep, void *context) { - struct rxm_ep *rxm_ep = container_of(fid_ep, struct rxm_ep, util_ep.ep_fid); - int ret; - - ret = rxm_ep_cancel_recv(rxm_ep, &rxm_ep->recv_queue, context); - if (ret) - return ret; + struct rxm_ep *rxm_ep; - ret = rxm_ep_cancel_recv(rxm_ep, &rxm_ep->trecv_queue, context); - if (ret) - return ret; + rxm_ep = container_of(fid_ep, struct rxm_ep, util_ep.ep_fid); + if (!rxm_ep_cancel_recv(rxm_ep, &rxm_ep->recv_queue, context)) + rxm_ep_cancel_recv(rxm_ep, &rxm_ep->trecv_queue, context); return 0; } @@ -628,40 +705,118 @@ static struct fi_ops_ep rxm_ops_ep = { .tx_size_left = fi_no_tx_size_left, }; -static int rxm_ep_discard_recv(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf, +/* Caller must hold recv_queue->lock */ +static struct rxm_rx_buf * +rxm_get_unexp_msg(struct rxm_recv_queue *recv_queue, fi_addr_t addr, + uint64_t tag, uint64_t ignore) +{ + struct rxm_recv_match_attr match_attr; + struct dlist_entry *entry; + + if (dlist_empty(&recv_queue->unexp_msg_list)) + return NULL; + + match_attr.addr = addr; + match_attr.tag = tag; + match_attr.ignore = ignore; + + entry = dlist_find_first_match(&recv_queue->unexp_msg_list, + recv_queue->match_unexp, &match_attr); + if (!entry) + return NULL; + + RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Match for posted recv found in unexp" + " msg list\n", match_attr.addr, match_attr.tag); + + return container_of(entry, struct rxm_rx_buf, unexp_msg.entry); +} + +static int rxm_handle_unexp_sar(struct rxm_recv_queue *recv_queue, + struct rxm_recv_entry *recv_entry, + struct rxm_rx_buf *rx_buf) +{ + struct rxm_recv_match_attr match_attr; + struct dlist_entry *entry; + bool last; + ssize_t ret; + + ret = rxm_handle_rx_buf(rx_buf); + last = rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == RXM_SAR_SEG_LAST; + if (ret || last) + return ret; + + match_attr.addr = recv_entry->addr; + match_attr.tag = recv_entry->tag; + match_attr.ignore = recv_entry->ignore; + + dlist_foreach_container_safe(&recv_queue->unexp_msg_list, + struct rxm_rx_buf, rx_buf, + unexp_msg.entry, entry) { + if (!recv_queue->match_unexp(&rx_buf->unexp_msg.entry, + &match_attr)) + continue; + /* Handle unordered completions from MSG provider */ + if ((rx_buf->pkt.ctrl_hdr.msg_id != recv_entry->sar.msg_id) || + ((rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg))) + continue; + + if (!rx_buf->conn) { + rx_buf->conn = rxm_key2conn(rx_buf->ep, + rx_buf->pkt.ctrl_hdr.conn_id); + } + if (recv_entry->sar.conn != rx_buf->conn) + continue; + rx_buf->recv_entry = recv_entry; + dlist_remove(&rx_buf->unexp_msg.entry); + last = rxm_sar_get_seg_type(&rx_buf->pkt.ctrl_hdr) == + RXM_SAR_SEG_LAST; + ret = rxm_handle_rx_buf(rx_buf); + if (ret || last) + break; + } + return ret; + +} + +static void rxm_ep_discard_recv(struct rxm_ep *rxm_ep, struct rxm_rx_buf *rx_buf, void *context) { RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Discarding message", rx_buf->unexp_msg.addr, rx_buf->unexp_msg.tag); - dlist_insert_tail(&rx_buf->repost_entry, - &rx_buf->ep->repost_ready_list); - return ofi_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, - 0, NULL, rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag); + 
rxm_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, + 0, NULL, rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag); + rxm_rx_buf_free(rx_buf); } -static int rxm_ep_peek_recv(struct rxm_ep *rxm_ep, fi_addr_t addr, uint64_t tag, - uint64_t ignore, void *context, uint64_t flags, - struct rxm_recv_queue *recv_queue) +static void +rxm_ep_peek_recv(struct rxm_ep *rxm_ep, fi_addr_t addr, uint64_t tag, + uint64_t ignore, void *context, uint64_t flags, + struct rxm_recv_queue *recv_queue) { struct rxm_rx_buf *rx_buf; + int ret; RXM_DBG_ADDR_TAG(FI_LOG_EP_DATA, "Peeking message", addr, tag); rxm_ep_do_progress(&rxm_ep->util_ep); - rx_buf = rxm_check_unexp_msg_list(recv_queue, addr, tag, ignore); + rx_buf = rxm_get_unexp_msg(recv_queue, addr, tag, ignore); if (!rx_buf) { FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Message not found\n"); - return ofi_cq_write_error_peek(rxm_ep->util_ep.rx_cq, tag, - context); + ret = ofi_cq_write_error_peek(rxm_ep->util_ep.rx_cq, tag, + context); + if (ret) + FI_WARN(&rxm_prov, FI_LOG_CQ, "Error writing to CQ\n"); + return; } FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Message found\n"); if (flags & FI_DISCARD) { dlist_remove(&rx_buf->unexp_msg.entry); - return rxm_ep_discard_recv(rxm_ep, rx_buf, context); + rxm_ep_discard_recv(rxm_ep, rx_buf, context); + return; } if (flags & FI_CLAIM) { @@ -670,189 +825,246 @@ static int rxm_ep_peek_recv(struct rxm_ep *rxm_ep, fi_addr_t addr, uint64_t tag, dlist_remove(&rx_buf->unexp_msg.entry); } - return ofi_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, - rx_buf->pkt.hdr.size, NULL, - rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag); + rxm_cq_write(rxm_ep->util_ep.rx_cq, context, FI_TAGGED | FI_RECV, + rx_buf->pkt.hdr.size, NULL, + rx_buf->pkt.hdr.data, rx_buf->pkt.hdr.tag); } -static inline ssize_t -rxm_ep_format_rx_res(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t flags, struct rxm_recv_queue *recv_queue, - struct rxm_recv_entry **recv_entry) +static void rxm_recv_entry_init_common(struct rxm_recv_entry *recv_entry, + const struct iovec *iov, void **desc, size_t count, + fi_addr_t src_addr, uint64_t tag, uint64_t ignore, + void *context, uint64_t flags, + struct rxm_recv_queue *recv_queue) { size_t i; - *recv_entry = rxm_recv_entry_get(recv_queue); - if (OFI_UNLIKELY(!*recv_entry)) - return -FI_EAGAIN; - - assert(!(*recv_entry)->rndv.tx_buf); + assert(!recv_entry->rndv.tx_buf); + recv_entry->rxm_iov.count = (uint8_t) count; + recv_entry->addr = src_addr; + recv_entry->context = context; + recv_entry->flags = flags; + recv_entry->ignore = ignore; + recv_entry->tag = tag; - (*recv_entry)->rxm_iov.count = (uint8_t)count; - (*recv_entry)->addr = src_addr; - (*recv_entry)->context = context; - (*recv_entry)->flags = flags; - (*recv_entry)->ignore = ignore; - (*recv_entry)->tag = tag; + recv_entry->sar.msg_id = RXM_SAR_RX_INIT; + recv_entry->sar.total_recv_len = 0; + recv_entry->total_len = 0; for (i = 0; i < count; i++) { - (*recv_entry)->rxm_iov.iov[i].iov_base = iov[i].iov_base; - (*recv_entry)->total_len += - (*recv_entry)->rxm_iov.iov[i].iov_len = iov[i].iov_len; - if (desc) - (*recv_entry)->rxm_iov.desc[i] = desc[i]; + recv_entry->rxm_iov.iov[i] = iov[i]; + recv_entry->total_len += iov[i].iov_len; + if (desc && desc[i]) + recv_entry->rxm_iov.desc[i] = desc[i]; + else + recv_entry->rxm_iov.desc[i] = NULL; } +} - (*recv_entry)->multi_recv.len = (*recv_entry)->total_len; - (*recv_entry)->multi_recv.buf = iov[0].iov_base; 
+static struct rxm_recv_entry * +rxm_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, void *context, + uint64_t flags, struct rxm_recv_queue *recv_queue) +{ + struct rxm_recv_entry *recv_entry; - return FI_SUCCESS; + if (ofi_freestack_isempty(recv_queue->fs)) + return NULL; + + recv_entry = ofi_freestack_pop(recv_queue->fs); + + rxm_recv_entry_init_common(recv_entry, iov, desc, count, src_addr, tag, + ignore, context, flags, recv_queue); + + return recv_entry; +} + +struct rxm_recv_entry * +rxm_multi_recv_entry_get(struct rxm_ep *rxm_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, void *context, + uint64_t flags) +{ + struct rxm_recv_entry *recv_entry; + + recv_entry = ofi_buf_alloc(rxm_ep->multi_recv_pool); + + rxm_recv_entry_init_common(recv_entry, iov, desc, count, src_addr, tag, + ignore, context, flags, NULL); + + recv_entry->comp_flags = FI_MSG | FI_RECV; + return recv_entry; +} + +/* + * We don't expect to have unexpected messages when the app is using + * multi-recv buffers. Optimize for that case. + * + * If there are unexpected messages waiting when we post a mult-recv buffer, + * we trim off the start of the buffer, treat it as a normal buffer, and pair + * it with an unexpected message. We continue doing this until either no + * unexpected messages are left or the multi-recv buffer has been consumed. + */ +static ssize_t +rxm_ep_post_mrecv(struct rxm_ep *ep, const struct iovec *iov, + void **desc, void *context, uint64_t op_flags) +{ + struct rxm_recv_entry *recv_entry; + struct rxm_rx_buf *rx_buf; + struct iovec cur_iov = *iov; + int ret; + + do { + recv_entry = rxm_recv_entry_get(ep, &cur_iov, desc, 1, + FI_ADDR_UNSPEC, 0, 0, context, + op_flags, &ep->recv_queue); + if (!recv_entry) { + ret = -FI_ENOMEM; + break; + } + + rx_buf = rxm_get_unexp_msg(&ep->recv_queue, recv_entry->addr, 0, 0); + if (!rx_buf) { + dlist_insert_tail(&recv_entry->entry, + &ep->recv_queue.recv_list); + return 0; + } + + dlist_remove(&rx_buf->unexp_msg.entry); + rx_buf->recv_entry = recv_entry; + recv_entry->flags &= ~FI_MULTI_RECV; + recv_entry->total_len = MIN(cur_iov.iov_len, rx_buf->pkt.hdr.size); + recv_entry->rxm_iov.iov[0].iov_len = recv_entry->total_len; + + cur_iov.iov_base = (uint8_t *) cur_iov.iov_base + recv_entry->total_len; + cur_iov.iov_len -= recv_entry->total_len; + + if (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg) + ret = rxm_handle_rx_buf(rx_buf); + else + ret = rxm_handle_unexp_sar(&ep->recv_queue, recv_entry, + rx_buf); + + } while (!ret && cur_iov.iov_len >= ep->min_multi_recv_size); + + if ((cur_iov.iov_len < ep->min_multi_recv_size) || + (ret && cur_iov.iov_len != iov->iov_len)) { + rxm_cq_write(ep->util_ep.rx_cq, context, FI_MULTI_RECV, + 0, NULL, 0, 0); + } + + return ret; } -static inline ssize_t +static ssize_t rxm_ep_post_recv(struct rxm_ep *rxm_ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t op_flags, struct rxm_recv_queue *recv_queue) + void *context, uint64_t op_flags) { struct rxm_recv_entry *recv_entry; - ssize_t ret; + struct rxm_rx_buf *rx_buf; assert(count <= rxm_ep->rxm_info->rx_attr->iov_limit); + if (op_flags & FI_MULTI_RECV) + return rxm_ep_post_mrecv(rxm_ep, iov, desc, context, op_flags); - ret = rxm_ep_format_rx_res(rxm_ep, iov, desc, count, src_addr, - tag, ignore, context, op_flags, - recv_queue, &recv_entry); - if 
(OFI_UNLIKELY(ret)) - return ret; + recv_entry = rxm_recv_entry_get(rxm_ep, iov, desc, count, src_addr, + 0, 0, context, op_flags, + &rxm_ep->recv_queue); + if (!recv_entry) + return -FI_EAGAIN; - if (recv_queue->type == RXM_RECV_QUEUE_MSG) - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Posting recv with length: %zu " - "addr: 0x%" PRIx64 "\n", recv_entry->total_len, - recv_entry->addr); - else - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Posting trecv with " - "length: %zu addr: 0x%" PRIx64 " tag: 0x%" PRIx64 - " ignore: 0x%" PRIx64 "\n", recv_entry->total_len, - recv_entry->addr, recv_entry->tag, recv_entry->ignore); + rx_buf = rxm_get_unexp_msg(&rxm_ep->recv_queue, recv_entry->addr, 0, 0); + if (!rx_buf) { + dlist_insert_tail(&recv_entry->entry, + &rxm_ep->recv_queue.recv_list); + return FI_SUCCESS; + } - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "recv op_flags: %s\n", - fi_tostr(&recv_entry->flags, FI_TYPE_OP_FLAGS)); - ret = rxm_process_recv_entry(recv_queue, recv_entry); + dlist_remove(&rx_buf->unexp_msg.entry); + rx_buf->recv_entry = recv_entry; - return ret; + if (rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg) + return rxm_handle_rx_buf(rx_buf); + else + return rxm_handle_unexp_sar(&rxm_ep->recv_queue, recv_entry, + rx_buf); } -static inline ssize_t +static ssize_t rxm_ep_recv_common(struct rxm_ep *rxm_ep, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t op_flags, struct rxm_recv_queue *recv_queue) + void *context, uint64_t op_flags) { ssize_t ret; assert(rxm_ep->util_ep.rx_cq); ofi_ep_lock_acquire(&rxm_ep->util_ep); ret = rxm_ep_post_recv(rxm_ep, iov, desc, count, src_addr, - tag, ignore, context, op_flags, recv_queue); + context, op_flags); ofi_ep_lock_release(&rxm_ep->util_ep); return ret; } static ssize_t -rxm_ep_recv_common_flags(struct rxm_ep *rxm_ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, - uint64_t tag, uint64_t ignore, void *context, - uint64_t flags, struct rxm_recv_queue *recv_queue) +rxm_ep_buf_recv(struct rxm_ep *rxm_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + void *context, uint64_t flags) { struct rxm_recv_entry *recv_entry; - struct fi_recv_context *recv_ctx; + struct fi_recv_context *recv_ctx = context; struct rxm_rx_buf *rx_buf; ssize_t ret = 0; - assert(rxm_ep->util_ep.rx_cq); - assert(count <= rxm_ep->rxm_info->rx_attr->iov_limit); - assert(!(flags & FI_PEEK) || - (recv_queue->type == RXM_RECV_QUEUE_TAGGED)); - assert(!(flags & (FI_MULTI_RECV)) || - (recv_queue->type == RXM_RECV_QUEUE_MSG)); + context = recv_ctx->context; + rx_buf = container_of(recv_ctx, struct rxm_rx_buf, recv_context); ofi_ep_lock_acquire(&rxm_ep->util_ep); - if (rxm_ep->rxm_info->mode & FI_BUFFERED_RECV) { - assert(!(flags & FI_PEEK)); - recv_ctx = context; - context = recv_ctx->context; - rx_buf = container_of(recv_ctx, struct rxm_rx_buf, recv_context); + if (flags & FI_CLAIM) { + FI_DBG(&rxm_prov, FI_LOG_EP_DATA, + "Claiming buffered receive\n"); - if (flags & FI_CLAIM) { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, - "Claiming buffered receive\n"); - goto claim; + recv_entry = rxm_recv_entry_get(rxm_ep, iov, desc, count, + src_addr, 0, 0, context, + flags, &rxm_ep->recv_queue); + if (!recv_entry) { + ret = -FI_EAGAIN; + goto unlock; } + recv_entry->comp_flags |= FI_CLAIM; + + rx_buf->recv_entry = recv_entry; + ret = rxm_handle_rx_buf(rx_buf); + } else { assert(flags & FI_DISCARD); FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Discarding buffered receive\n"); - 
dlist_insert_tail(&rx_buf->repost_entry, - &rx_buf->ep->repost_ready_list); - goto unlock; - } - - if (flags & FI_PEEK) { - ret = rxm_ep_peek_recv(rxm_ep, src_addr, tag, ignore, - context, flags, recv_queue); - goto unlock; - } - - if (!(flags & FI_CLAIM)) { - ret = rxm_ep_post_recv(rxm_ep, iov, desc, count, src_addr, - tag, ignore, context, flags, - recv_queue); - goto unlock; - } - - rx_buf = ((struct fi_context *)context)->internal[0]; - assert(rx_buf); - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Claim message\n"); - - if (flags & FI_DISCARD) { - ret = rxm_ep_discard_recv(rxm_ep, rx_buf, context); - goto unlock; + rxm_rx_buf_free(rx_buf); } - -claim: - ret = rxm_ep_format_rx_res(rxm_ep, iov, desc, count, src_addr, - tag, ignore, context, flags, - recv_queue, &recv_entry); - if (OFI_UNLIKELY(ret)) - goto unlock; - - if (rxm_ep->rxm_info->mode & FI_BUFFERED_RECV) - recv_entry->comp_flags |= FI_CLAIM; - - rx_buf->recv_entry = recv_entry; - ret = rxm_cq_handle_rx_buf(rx_buf); - unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; } -static ssize_t rxm_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, - uint64_t flags) +static ssize_t +rxm_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - return rxm_ep_recv_common_flags(rxm_ep, msg->msg_iov, msg->desc, msg->iov_count, - msg->addr, 0, 0, msg->context, - flags | rxm_ep->util_ep.rx_msg_flags, - &rxm_ep->recv_queue); + if (rxm_ep->rxm_info->mode & FI_BUFFERED_RECV) + return rxm_ep_buf_recv(rxm_ep, msg->msg_iov, msg->desc, + msg->iov_count, msg->addr, msg->context, + flags | rxm_ep->util_ep.rx_msg_flags); + + return rxm_ep_recv_common(rxm_ep, msg->msg_iov, msg->desc, + msg->iov_count, msg->addr, msg->context, + flags | rxm_ep->util_ep.rx_msg_flags); + } -static ssize_t rxm_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, - fi_addr_t src_addr, void *context) +static ssize_t rxm_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, + void *desc, fi_addr_t src_addr, void *context) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); @@ -861,30 +1073,29 @@ static ssize_t rxm_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, void *d .iov_len = len, }; - return rxm_ep_recv_common(rxm_ep, &iov, &desc, 1, src_addr, 0, 0, - context, rxm_ep_rx_flags(rxm_ep), - &rxm_ep->recv_queue); + return rxm_ep_recv_common(rxm_ep, &iov, &desc, 1, src_addr, + context, rxm_ep->util_ep.rx_op_flags); } static ssize_t rxm_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, void *context) + void **desc, size_t count, fi_addr_t src_addr, + void *context) { struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); - return rxm_ep_recv_common(rxm_ep, iov, desc, count, src_addr, 0, 0, - context, rxm_ep_rx_flags(rxm_ep), - &rxm_ep->recv_queue); + return rxm_ep_recv_common(rxm_ep, iov, desc, count, src_addr, + context, rxm_ep->util_ep.rx_op_flags); } -static void rxm_rndv_hdr_init(struct rxm_ep *rxm_ep, void *buf, +void rxm_rndv_hdr_init(struct rxm_ep *rxm_ep, void *buf, const struct iovec *iov, size_t count, struct fid_mr **mr) { struct rxm_rndv_hdr *rndv_hdr = (struct rxm_rndv_hdr *)buf; size_t i; - for (i = 0; i < count; i++) { + for (i = 0; i < count && mr[i]; i++) { rndv_hdr->iov[i].addr = RXM_MR_VIRT_ADDR(rxm_ep->msg_info) ? 
(uintptr_t)iov[i].iov_base : 0; rndv_hdr->iov[i].len = (uint64_t)iov[i].iov_len; @@ -893,35 +1104,28 @@ static void rxm_rndv_hdr_init(struct rxm_ep *rxm_ep, void *buf, rndv_hdr->count = (uint8_t)count; } -static inline ssize_t +static ssize_t rxm_ep_msg_inject_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, struct rxm_pkt *tx_pkt, size_t pkt_size, ofi_cntr_inc_func cntr_inc_func) { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Posting inject with length: %" PRIu64 + FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Posting inject with length: %zu" " tag: 0x%" PRIx64 "\n", pkt_size, tx_pkt->hdr.tag); assert((tx_pkt->hdr.flags & FI_REMOTE_CQ_DATA) || !tx_pkt->hdr.flags); assert(pkt_size <= rxm_ep->inject_limit); ssize_t ret = fi_inject(rxm_conn->msg_ep, tx_pkt, pkt_size, 0); - if (OFI_LIKELY(!ret)) { - cntr_inc_func(rxm_ep->util_ep.tx_cntr); - } else { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, - "fi_inject for MSG provider failed with ret - %" PRId64"\n", - ret); - if (OFI_LIKELY(ret == -FI_EAGAIN)) - rxm_ep_do_progress(&rxm_ep->util_ep); - } + if (ret == -FI_EAGAIN) + rxm_ep_do_progress(&rxm_ep->util_ep); return ret; } -static inline ssize_t +static ssize_t rxm_ep_msg_normal_send(struct rxm_conn *rxm_conn, struct rxm_pkt *tx_pkt, size_t pkt_size, void *desc, void *context) { - FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Posting send with length: %" PRIu64 + FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Posting send with length: %zu" " tag: 0x%" PRIx64 "\n", pkt_size, tx_pkt->hdr.tag); assert((tx_pkt->hdr.flags & FI_REMOTE_CQ_DATA) || !tx_pkt->hdr.flags); @@ -929,59 +1133,77 @@ rxm_ep_msg_normal_send(struct rxm_conn *rxm_conn, struct rxm_pkt *tx_pkt, return fi_send(rxm_conn->msg_ep, tx_pkt, pkt_size, desc, 0, context); } -static inline ssize_t -rxm_ep_alloc_rndv_tx_res(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, void *context, - uint8_t count, const struct iovec *iov, void **desc, size_t data_len, - uint64_t data, uint64_t flags, uint64_t tag, uint8_t op, - struct rxm_tx_rndv_buf **tx_rndv_buf) +static ssize_t +rxm_alloc_rndv_buf(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, + void *context, uint8_t count, const struct iovec *iov, + void **desc, size_t data_len, uint64_t data, + uint64_t flags, uint64_t tag, uint8_t op, + enum fi_hmem_iface iface, uint64_t device, + struct rxm_tx_rndv_buf **rndv_buf) { + struct fid_mr *rxm_mr_msg_mr[RXM_IOV_LIMIT]; struct fid_mr **mr_iov; + size_t len, i; ssize_t ret; - struct rxm_tx_rndv_buf *tx_buf = (struct rxm_tx_rndv_buf *) - rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_RNDV); - if (OFI_UNLIKELY(!tx_buf)) { + *rndv_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_RNDV_REQ); + if (!*rndv_buf) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "Ran out of buffers from RNDV buffer pool\n"); return -FI_EAGAIN; } - rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag, flags, &(tx_buf)->pkt); - tx_buf->pkt.ctrl_hdr.msg_id = ofi_buf_index(tx_buf); - tx_buf->app_context = context; - tx_buf->flags = flags; - tx_buf->count = count; - - if (!rxm_ep->rxm_mr_local) { - ret = rxm_ep_msg_mr_regv(rxm_ep, iov, tx_buf->count, - FI_REMOTE_READ, tx_buf->mr); + rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag, + flags, &(*rndv_buf)->pkt); + (*rndv_buf)->pkt.ctrl_hdr.msg_id = ofi_buf_index(*rndv_buf); + (*rndv_buf)->app_context = context; + (*rndv_buf)->flags = flags; + (*rndv_buf)->count = count; + + if (!rxm_ep->rdm_mr_local) { + ret = rxm_msg_mr_regv(rxm_ep, iov, (*rndv_buf)->count, data_len, + rxm_ep->rndv_ops->tx_mr_access, + (*rndv_buf)->mr); if (ret) goto err; - mr_iov = tx_buf->mr; + mr_iov = 
(*rndv_buf)->mr; } else { - /* desc is msg fid_mr * array */ - mr_iov = (struct fid_mr **)desc; + for (i = 0; i < count; i++) + rxm_mr_msg_mr[i] = ((struct rxm_mr *) desc[i])->msg_mr; + + mr_iov = rxm_mr_msg_mr; + } + + if (rxm_ep->rndv_ops == &rxm_rndv_ops_write) { + (*rndv_buf)->write_rndv.conn = rxm_conn; + for (i = 0; i < count; i++) { + (*rndv_buf)->write_rndv.iov[i] = iov[i]; + (*rndv_buf)->write_rndv.desc[i] = fi_mr_desc(mr_iov[i]); + } } - rxm_rndv_hdr_init(rxm_ep, &tx_buf->pkt.data, iov, tx_buf->count, mr_iov); + rxm_rndv_hdr_init(rxm_ep, &(*rndv_buf)->pkt.data, iov, + (*rndv_buf)->count, mr_iov); - ret = sizeof(struct rxm_pkt) + sizeof(struct rxm_rndv_hdr); + len = sizeof(struct rxm_pkt) + sizeof(struct rxm_rndv_hdr); if (rxm_ep->rxm_info->mode & FI_BUFFERED_RECV) { - ofi_copy_from_iov(rxm_pkt_rndv_data(&tx_buf->pkt), - rxm_ep->buffered_min, iov, count, 0); - ret += rxm_ep->buffered_min; + ret = ofi_copy_from_hmem_iov(rxm_pkt_rndv_data(&(*rndv_buf)->pkt), + rxm_ep->buffered_min, iface, + device, iov, count, 0); + assert(ret == rxm_ep->buffered_min); + + len += rxm_ep->buffered_min; } - *tx_rndv_buf = tx_buf; - return ret; + return len; + err: - *tx_rndv_buf = NULL; - ofi_buf_free(tx_buf); + ofi_buf_free(*rndv_buf); return ret; } -static inline ssize_t +static ssize_t rxm_ep_rndv_tx_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, struct rxm_tx_rndv_buf *tx_buf, size_t pkt_size) { @@ -989,7 +1211,10 @@ rxm_ep_rndv_tx_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, RXM_UPDATE_STATE(FI_LOG_EP_DATA, tx_buf, RXM_RNDV_TX); if (pkt_size <= rxm_ep->inject_limit) { - RXM_UPDATE_STATE(FI_LOG_EP_DATA, tx_buf, RXM_RNDV_ACK_WAIT); + if (rxm_ep->rndv_ops == &rxm_rndv_ops_write) + RXM_UPDATE_STATE(FI_LOG_EP_DATA, tx_buf, RXM_RNDV_WRITE_DATA_WAIT); + else + RXM_UPDATE_STATE(FI_LOG_EP_DATA, tx_buf, RXM_RNDV_READ_DONE_WAIT); ret = rxm_ep_msg_inject_send(rxm_ep, rxm_conn, &tx_buf->pkt, pkt_size, ofi_cntr_inc_noop); } else { @@ -998,41 +1223,42 @@ rxm_ep_rndv_tx_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, ret = rxm_ep_msg_normal_send(rxm_conn, &tx_buf->pkt, pkt_size, tx_buf->hdr.desc, tx_buf); } - if (OFI_UNLIKELY(ret)) + if (ret) goto err; return FI_SUCCESS; err: FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Transmit for MSG provider failed\n"); - if (!rxm_ep->rxm_mr_local) - rxm_ep_msg_mr_closev(tx_buf->mr, tx_buf->count); + if (!rxm_ep->rdm_mr_local) + rxm_msg_mr_closev(tx_buf->mr, tx_buf->count); ofi_buf_free(tx_buf); return ret; } -static inline size_t +static size_t rxm_ep_sar_calc_segs_cnt(struct rxm_ep *rxm_ep, size_t data_len) { - return (data_len + rxm_eager_limit - 1) / - rxm_eager_limit; + return (data_len + rxm_eager_limit - 1) / rxm_eager_limit; } -static inline struct rxm_tx_sar_buf * +static struct rxm_tx_sar_buf * rxm_ep_sar_tx_prepare_segment(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, - void *app_context, size_t total_len, size_t seg_len, - size_t seg_no, uint64_t data, uint64_t flags, uint64_t tag, - uint8_t op, enum rxm_sar_seg_type seg_type, uint64_t *msg_id) + void *app_context, size_t total_len, + size_t seg_len, size_t seg_no, uint64_t data, + uint64_t flags, uint64_t tag, uint8_t op, + enum rxm_sar_seg_type seg_type, uint64_t *msg_id) { - struct rxm_tx_sar_buf *tx_buf = (struct rxm_tx_sar_buf *) - rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_SAR); + struct rxm_tx_sar_buf *tx_buf; - if (OFI_UNLIKELY(!tx_buf)) { + tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_SAR); + if (!tx_buf) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "Ran out of buffers from SAR buffer 
pool\n"); return NULL; }; - rxm_ep_format_tx_buf_pkt(rxm_conn, total_len, op, data, tag, flags, &tx_buf->pkt); + rxm_ep_format_tx_buf_pkt(rxm_conn, total_len, op, data, tag, flags, + &tx_buf->pkt); if (seg_type == RXM_SAR_SEG_FIRST) { *msg_id = tx_buf->pkt.ctrl_hdr.msg_id = ofi_buf_index(tx_buf); } else { @@ -1054,22 +1280,25 @@ rxm_ep_sar_tx_cleanup(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, struct rxm_tx_sar_buf *first_tx_buf; first_tx_buf = ofi_bufpool_get_ibuf(rxm_ep-> - buf_pools[RXM_BUF_POOL_TX_SAR].pool, - tx_buf->pkt.ctrl_hdr.msg_id); + buf_pools[RXM_BUF_POOL_TX_SAR].pool, + tx_buf->pkt.ctrl_hdr.msg_id); ofi_buf_free(first_tx_buf); ofi_buf_free(tx_buf); } -static inline ssize_t -rxm_ep_sar_tx_prepare_and_send_segment(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, - void *app_context, size_t data_len, size_t remain_len, - uint64_t msg_id, size_t seg_len, size_t seg_no, size_t segs_cnt, - uint64_t data, uint64_t flags, uint64_t tag, uint8_t op, - const struct iovec *iov, uint8_t count, size_t *iov_offset, - struct rxm_tx_sar_buf **out_tx_buf) +static ssize_t +rxm_ep_sar_tx_prepare_and_send_segment(struct rxm_ep *rxm_ep, + struct rxm_conn *rxm_conn, void *app_context, size_t data_len, + size_t remain_len, uint64_t msg_id, size_t seg_len, + size_t seg_no, size_t segs_cnt, uint64_t data, uint64_t flags, + uint64_t tag, uint8_t op, const struct iovec *iov, + uint8_t count, size_t *iov_offset, + struct rxm_tx_sar_buf **out_tx_buf, + enum fi_hmem_iface iface, uint64_t device) { struct rxm_tx_sar_buf *tx_buf; enum rxm_sar_seg_type seg_type = RXM_SAR_SEG_MIDDLE; + ssize_t ret __attribute__((unused)); if (seg_no == (segs_cnt - 1)) { seg_type = RXM_SAR_SEG_LAST; @@ -1077,14 +1306,18 @@ rxm_ep_sar_tx_prepare_and_send_segment(struct rxm_ep *rxm_ep, struct rxm_conn *r seg_len = remain_len; } - tx_buf = rxm_ep_sar_tx_prepare_segment(rxm_ep, rxm_conn, app_context, data_len, seg_len, - seg_no, data, flags, tag, op, seg_type, &msg_id); - if (OFI_UNLIKELY(!tx_buf)) { + tx_buf = rxm_ep_sar_tx_prepare_segment(rxm_ep, rxm_conn, app_context, + data_len, seg_len, seg_no, data, + flags, tag, op, seg_type, &msg_id); + if (!tx_buf) { *out_tx_buf = NULL; return -FI_EAGAIN; } - ofi_copy_from_iov(tx_buf->pkt.data, seg_len, iov, count, *iov_offset); + ret = ofi_copy_from_hmem_iov(tx_buf->pkt.data, seg_len, iface, device, + iov, count, *iov_offset); + assert(ret == seg_len); + *iov_offset += seg_len; *out_tx_buf = tx_buf; @@ -1093,34 +1326,39 @@ rxm_ep_sar_tx_prepare_and_send_segment(struct rxm_ep *rxm_ep, struct rxm_conn *r tx_buf->pkt.ctrl_hdr.seg_size, tx_buf->hdr.desc, 0, tx_buf); } -static inline ssize_t +static ssize_t rxm_ep_sar_tx_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, void *context, uint8_t count, const struct iovec *iov, size_t data_len, size_t segs_cnt, uint64_t data, - uint64_t flags, uint64_t tag, uint8_t op) + uint64_t flags, uint64_t tag, uint8_t op, + enum fi_hmem_iface iface, uint64_t device) { struct rxm_tx_sar_buf *tx_buf, *first_tx_buf; size_t i, iov_offset = 0, remain_len = data_len; ssize_t ret; - struct rxm_deferred_tx_entry *def_tx_entry; + struct rxm_deferred_tx_entry *def_tx; uint64_t msg_id = 0; - assert(segs_cnt >= 2); + assert(segs_cnt >= 2); - first_tx_buf = rxm_ep_sar_tx_prepare_segment(rxm_ep, rxm_conn, context, data_len, - rxm_eager_limit, 0, data, flags, - tag, op, RXM_SAR_SEG_FIRST, &msg_id); - if (OFI_UNLIKELY(!first_tx_buf)) + first_tx_buf = rxm_ep_sar_tx_prepare_segment(rxm_ep, rxm_conn, context, + data_len, rxm_eager_limit, + 0, data, flags, tag, op, 
+ RXM_SAR_SEG_FIRST, &msg_id); + if (!first_tx_buf) return -FI_EAGAIN; - ofi_copy_from_iov(first_tx_buf->pkt.data, rxm_eager_limit, - iov, count, iov_offset); + ret = ofi_copy_from_hmem_iov(first_tx_buf->pkt.data, rxm_eager_limit, + iface, device, iov, count, iov_offset); + assert(ret == rxm_eager_limit); + iov_offset += rxm_eager_limit; - ret = fi_send(rxm_conn->msg_ep, &first_tx_buf->pkt, sizeof(struct rxm_pkt) + - first_tx_buf->pkt.ctrl_hdr.seg_size, first_tx_buf->hdr.desc, 0, first_tx_buf); - if (OFI_UNLIKELY(ret)) { - if (OFI_LIKELY(ret == -FI_EAGAIN)) + ret = fi_send(rxm_conn->msg_ep, &first_tx_buf->pkt, + sizeof(struct rxm_pkt) + first_tx_buf->pkt.ctrl_hdr.seg_size, + first_tx_buf->hdr.desc, 0, first_tx_buf); + if (ret) { + if (ret == -FI_EAGAIN) rxm_ep_do_progress(&rxm_ep->util_ep); ofi_buf_free(first_tx_buf); return ret; @@ -1132,44 +1370,51 @@ rxm_ep_sar_tx_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, ret = rxm_ep_sar_tx_prepare_and_send_segment( rxm_ep, rxm_conn, context, data_len, remain_len, msg_id, rxm_eager_limit, i, segs_cnt, data, - flags, tag, op, iov, count, &iov_offset, &tx_buf); - if (OFI_UNLIKELY(ret)) { - if (OFI_LIKELY(ret == -FI_EAGAIN)) { - def_tx_entry = rxm_ep_alloc_deferred_tx_entry(rxm_ep, rxm_conn, - RXM_DEFERRED_TX_SAR_SEG); - if (OFI_UNLIKELY(!def_tx_entry)) { - if (tx_buf) - ofi_buf_free(tx_buf); - return -FI_ENOMEM; - } - memcpy(def_tx_entry->sar_seg.payload.iov, iov, sizeof(*iov) * count); - def_tx_entry->sar_seg.payload.count = count; - def_tx_entry->sar_seg.payload.cur_iov_offset = iov_offset; - def_tx_entry->sar_seg.payload.tag = tag; - def_tx_entry->sar_seg.payload.data = data; - def_tx_entry->sar_seg.cur_seg_tx_buf = tx_buf; - def_tx_entry->sar_seg.app_context = context; - def_tx_entry->sar_seg.flags = flags; - def_tx_entry->sar_seg.op = op; - def_tx_entry->sar_seg.next_seg_no = i; - def_tx_entry->sar_seg.segs_cnt = segs_cnt; - def_tx_entry->sar_seg.total_len = data_len; - def_tx_entry->sar_seg.remain_len = remain_len; - def_tx_entry->sar_seg.msg_id = msg_id; - rxm_ep_enqueue_deferred_tx_queue(def_tx_entry); - return 0; - } - - ofi_buf_free(first_tx_buf); - return ret; + flags, tag, op, iov, count, &iov_offset, &tx_buf, + iface, device); + if (ret) { + if (ret == -FI_EAGAIN) + goto defer; + goto free; } remain_len -= rxm_eager_limit; } return 0; + +free: + ofi_buf_free(first_tx_buf); + return ret; +defer: + def_tx = rxm_ep_alloc_deferred_tx_entry(rxm_ep, + rxm_conn, RXM_DEFERRED_TX_SAR_SEG); + if (!def_tx) { + if (tx_buf) + ofi_buf_free(tx_buf); + return -FI_ENOMEM; + } + memcpy(def_tx->sar_seg.payload.iov, + iov, sizeof(*iov) * count); + def_tx->sar_seg.payload.count = count; + def_tx->sar_seg.payload.cur_iov_offset = iov_offset; + def_tx->sar_seg.payload.tag = tag; + def_tx->sar_seg.payload.data = data; + def_tx->sar_seg.cur_seg_tx_buf = tx_buf; + def_tx->sar_seg.app_context = context; + def_tx->sar_seg.flags = flags; + def_tx->sar_seg.op = op; + def_tx->sar_seg.next_seg_no = i; + def_tx->sar_seg.segs_cnt = segs_cnt; + def_tx->sar_seg.total_len = data_len; + def_tx->sar_seg.remain_len = remain_len; + def_tx->sar_seg.msg_id = msg_id; + def_tx->sar_seg.iface = iface; + def_tx->sar_seg.device = device; + rxm_ep_enqueue_deferred_tx_queue(def_tx); + return 0; } -static inline ssize_t +static ssize_t rxm_ep_emulate_inject(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, const void *buf, size_t len, size_t pkt_size, uint64_t data, uint64_t flags, uint64_t tag, @@ -1177,10 +1422,14 @@ rxm_ep_emulate_inject(struct rxm_ep *rxm_ep, struct rxm_conn 
*rxm_conn, { struct rxm_tx_eager_buf *tx_buf; ssize_t ret; + enum fi_hmem_iface iface = FI_HMEM_SYSTEM; + const struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = len, + }; - tx_buf = (struct rxm_tx_eager_buf *) - rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX); - if (OFI_UNLIKELY(!tx_buf)) { + tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX); + if (!tx_buf) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "Ran out of buffers from Eager buffer pool\n"); return -FI_EAGAIN; @@ -1189,20 +1438,24 @@ rxm_ep_emulate_inject(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, tx_buf->app_context = NULL; rxm_ep_format_tx_buf_pkt(rxm_conn, len, op, data, tag, flags, &tx_buf->pkt); - memcpy(tx_buf->pkt.data, buf, len); + + ret = ofi_copy_from_hmem_iov(tx_buf->pkt.data, len, iface, 0, &iov, 1, + 0); + assert(ret == len); + tx_buf->flags = flags; ret = rxm_ep_msg_normal_send(rxm_conn, &tx_buf->pkt, pkt_size, tx_buf->hdr.desc, tx_buf); - if (OFI_UNLIKELY(ret)) { - if (OFI_LIKELY(ret == -FI_EAGAIN)) + if (ret) { + if (ret == -FI_EAGAIN) rxm_ep_do_progress(&rxm_ep->util_ep); ofi_buf_free(tx_buf); } return ret; } -static inline ssize_t +static ssize_t rxm_ep_inject_send_fast(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, const void *buf, size_t len, struct rxm_pkt *inject_pkt) { @@ -1211,35 +1464,39 @@ rxm_ep_inject_send_fast(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, assert(len <= rxm_ep->rxm_info->tx_attr->inject_size); - if (pkt_size <= rxm_ep->inject_limit) { + if (pkt_size <= rxm_ep->inject_limit && !rxm_ep->util_ep.tx_cntr) { inject_pkt->hdr.size = len; memcpy(inject_pkt->data, buf, len); ret = rxm_ep_msg_inject_send(rxm_ep, rxm_conn, inject_pkt, - pkt_size, rxm_ep->util_ep.tx_cntr_inc); + pkt_size, + rxm_ep->util_ep.tx_cntr_inc); } else { - ret = rxm_ep_emulate_inject(rxm_ep, rxm_conn, buf, len, pkt_size, - inject_pkt->hdr.data, inject_pkt->hdr.flags, - inject_pkt->hdr.tag, inject_pkt->hdr.op); + ret = rxm_ep_emulate_inject(rxm_ep, rxm_conn, buf, len, + pkt_size, inject_pkt->hdr.data, + inject_pkt->hdr.flags, + inject_pkt->hdr.tag, + inject_pkt->hdr.op); } return ret; } -static inline ssize_t +static ssize_t rxm_ep_inject_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, const void *buf, size_t len, uint64_t data, uint64_t flags, uint64_t tag, uint8_t op) { + struct rxm_tx_base_buf *tx_buf; size_t pkt_size = sizeof(struct rxm_pkt) + len; ssize_t ret; assert(len <= rxm_ep->rxm_info->tx_attr->inject_size); - if (pkt_size <= rxm_ep->inject_limit) { - struct rxm_tx_base_buf *tx_buf = (struct rxm_tx_base_buf *) - rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_INJECT); - if (OFI_UNLIKELY(!tx_buf)) { + if (pkt_size <= rxm_ep->inject_limit && + !rxm_ep->util_ep.tx_cntr) { + tx_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX_INJECT); + if (!tx_buf) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, - "Ran out of buffers from Eager Inject buffer pool\n"); + "Ran out of eager inject buffers\n"); ret = -FI_EAGAIN; goto unlock; } @@ -1248,7 +1505,8 @@ rxm_ep_inject_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, memcpy(tx_buf->pkt.data, buf, len); ret = rxm_ep_msg_inject_send(rxm_ep, rxm_conn, &tx_buf->pkt, - pkt_size, rxm_ep->util_ep.tx_cntr_inc); + pkt_size, + rxm_ep->util_ep.tx_cntr_inc); ofi_buf_free(tx_buf); } else { ret = rxm_ep_emulate_inject(rxm_ep, rxm_conn, buf, len, @@ -1259,64 +1517,117 @@ rxm_ep_inject_send(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, } +static bool +rxm_use_direct_send(struct rxm_ep *ep, size_t iov_count, uint64_t flags) +{ + return ep->enable_direct_send && !(flags & FI_INJECT) && + 
(iov_count < ep->msg_info->tx_attr->iov_limit); +} + +static ssize_t +rxm_direct_send(struct rxm_ep *ep, struct rxm_conn *rxm_conn, + struct rxm_tx_eager_buf *tx_buf, + const struct iovec *iov, void **desc, size_t count) +{ + struct iovec send_iov[RXM_IOV_LIMIT]; + void *send_desc[RXM_IOV_LIMIT]; + struct rxm_mr *mr; + ssize_t ret; + int i; + + send_iov[0].iov_base = &tx_buf->pkt; + send_iov[0].iov_len = sizeof(tx_buf->pkt); + memcpy(send_iov + 1, iov, sizeof(*iov) * count); + + if (ep->msg_mr_local) { + send_desc[0] = tx_buf->hdr.desc; + + for (i = 0; i < count; i++) { + assert(desc[i]); + mr = desc[i]; + send_desc[i + 1] = fi_mr_desc(mr->msg_mr); + } + + ret = fi_sendv(rxm_conn->msg_ep, send_iov, send_desc, + count + 1, 0, tx_buf); + } else { + ret = fi_sendv(rxm_conn->msg_ep, send_iov, NULL, + count + 1, 0, tx_buf); + } + return ret; +} + static ssize_t rxm_ep_send_common(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, const struct iovec *iov, void **desc, size_t count, void *context, uint64_t data, uint64_t flags, uint64_t tag, - uint8_t op, struct rxm_pkt *inject_pkt) + uint8_t op) { - size_t data_len = ofi_total_iov_len(iov, count); - size_t total_len = sizeof(struct rxm_pkt) + data_len; + struct rxm_tx_eager_buf *eager_buf; + struct rxm_tx_rndv_buf *rndv_buf; + size_t data_len, total_len; ssize_t ret; + enum fi_hmem_iface iface; + uint64_t device; + + data_len = ofi_total_iov_len(iov, count); + total_len = sizeof(struct rxm_pkt) + data_len; assert(count <= rxm_ep->rxm_info->tx_attr->iov_limit); assert((!(flags & FI_INJECT) && (data_len > rxm_ep->rxm_info->tx_attr->inject_size)) || (data_len <= rxm_ep->rxm_info->tx_attr->inject_size)); - if (data_len <= rxm_eager_limit) { - struct rxm_tx_eager_buf *tx_buf = (struct rxm_tx_eager_buf *) - rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX); + iface = rxm_mr_desc_to_hmem_iface_dev(desc, count, &device); - if (OFI_UNLIKELY(!tx_buf)) { + if (data_len <= rxm_eager_limit) { + eager_buf = rxm_tx_buf_alloc(rxm_ep, RXM_BUF_POOL_TX); + if (!eager_buf) { FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "Ran out of buffers from Eager buffer pool\n"); - ret = -FI_EAGAIN; - goto unlock; + return -FI_EAGAIN; } + eager_buf->app_context = context; + eager_buf->flags = flags; rxm_ep_format_tx_buf_pkt(rxm_conn, data_len, op, data, tag, - flags, &tx_buf->pkt); - ofi_copy_from_iov(tx_buf->pkt.data, tx_buf->pkt.hdr.size, - iov, count, 0); - tx_buf->app_context = context; - tx_buf->flags = flags; + flags, &eager_buf->pkt); - ret = rxm_ep_msg_normal_send(rxm_conn, &tx_buf->pkt, total_len, - tx_buf->hdr.desc, tx_buf); - if (OFI_UNLIKELY(ret)) { + if (rxm_use_direct_send(rxm_ep, count, flags)) { + ret = rxm_direct_send(rxm_ep, rxm_conn, eager_buf, + iov, desc, count); + } else { + ret = ofi_copy_from_hmem_iov(eager_buf->pkt.data, + eager_buf->pkt.hdr.size, + iface, device, iov, + count, 0); + assert(ret == eager_buf->pkt.hdr.size); + + ret = rxm_ep_msg_normal_send(rxm_conn, &eager_buf->pkt, + total_len, + eager_buf->hdr.desc, + eager_buf); + } + if (ret) { if (ret == -FI_EAGAIN) rxm_ep_do_progress(&rxm_ep->util_ep); - ofi_buf_free(tx_buf); + ofi_buf_free(eager_buf); } - } else if (data_len <= rxm_ep->sar_limit && - /* SAR uses eager_limit as segment size */ - (rxm_eager_limit < - (1ULL << (8 * sizeof_field(struct ofi_ctrl_hdr, seg_size))))) { + } else if (data_len <= rxm_ep->sar_limit) { ret = rxm_ep_sar_tx_send(rxm_ep, rxm_conn, context, count, iov, data_len, rxm_ep_sar_calc_segs_cnt(rxm_ep, data_len), - data, flags, tag, op); + data, flags, tag, op, iface, device); } else { - 
struct rxm_tx_rndv_buf *tx_buf; - - ret = rxm_ep_alloc_rndv_tx_res(rxm_ep, rxm_conn, context, (uint8_t)count, - iov, desc, data_len, data, flags, tag, op, - &tx_buf); - if (OFI_LIKELY(ret >= 0)) - ret = rxm_ep_rndv_tx_send(rxm_ep, rxm_conn, tx_buf, ret); + ret = rxm_alloc_rndv_buf(rxm_ep, rxm_conn, context, + (uint8_t) count, iov, desc, + data_len, data, flags, tag, op, + iface, device, &rndv_buf); + if (ret >= 0) + ret = rxm_ep_rndv_tx_send(rxm_ep, rxm_conn, + rndv_buf, ret); } -unlock: + return ret; } @@ -1324,9 +1635,10 @@ struct rxm_deferred_tx_entry * rxm_ep_alloc_deferred_tx_entry(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, enum rxm_deferred_tx_entry_type type) { - struct rxm_deferred_tx_entry *def_tx_entry = - calloc(1, sizeof(*def_tx_entry)); - if (OFI_UNLIKELY(!def_tx_entry)) + struct rxm_deferred_tx_entry *def_tx_entry; + + def_tx_entry = calloc(1, sizeof(*def_tx_entry)); + if (!def_tx_entry) return NULL; def_tx_entry->rxm_ep = rxm_ep; @@ -1337,8 +1649,9 @@ rxm_ep_alloc_deferred_tx_entry(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, return def_tx_entry; } -static inline void -rxm_ep_sar_handle_segment_failure(struct rxm_deferred_tx_entry *def_tx_entry, ssize_t ret) +static void +rxm_ep_sar_handle_segment_failure(struct rxm_deferred_tx_entry *def_tx_entry, + ssize_t ret) { rxm_ep_sar_tx_cleanup(def_tx_entry->rxm_ep, def_tx_entry->rxm_conn, def_tx_entry->sar_seg.cur_seg_tx_buf); @@ -1356,12 +1669,13 @@ rxm_ep_progress_sar_deferred_segments(struct rxm_deferred_tx_entry *def_tx_entry struct rxm_tx_sar_buf *tx_buf = def_tx_entry->sar_seg.cur_seg_tx_buf; if (tx_buf) { - ret = fi_send(def_tx_entry->rxm_conn->msg_ep, &tx_buf->pkt, sizeof(tx_buf->pkt) + - tx_buf->pkt.ctrl_hdr.seg_size, tx_buf->hdr.desc, 0, tx_buf); - if (OFI_UNLIKELY(ret)) { - if (OFI_LIKELY(ret != -FI_EAGAIN)) { - rxm_ep_sar_handle_segment_failure(def_tx_entry, ret); - goto sar_finish; + ret = fi_send(def_tx_entry->rxm_conn->msg_ep, &tx_buf->pkt, + sizeof(tx_buf->pkt) + tx_buf->pkt.ctrl_hdr.seg_size, + tx_buf->hdr.desc, 0, tx_buf); + if (ret) { + if (ret != -FI_EAGAIN) { + rxm_ep_sar_handle_segment_failure(def_tx_entry, + ret); } return ret; } @@ -1369,29 +1683,38 @@ rxm_ep_progress_sar_deferred_segments(struct rxm_deferred_tx_entry *def_tx_entry def_tx_entry->sar_seg.next_seg_no++; def_tx_entry->sar_seg.remain_len -= rxm_eager_limit; - if (def_tx_entry->sar_seg.next_seg_no == def_tx_entry->sar_seg.segs_cnt) { - assert(rxm_sar_get_seg_type(&tx_buf->pkt.ctrl_hdr) == RXM_SAR_SEG_LAST); - goto sar_finish; + if (def_tx_entry->sar_seg.next_seg_no == + def_tx_entry->sar_seg.segs_cnt) { + assert(rxm_sar_get_seg_type(&tx_buf->pkt.ctrl_hdr) == + RXM_SAR_SEG_LAST); + return 0; } } - while (def_tx_entry->sar_seg.next_seg_no != def_tx_entry->sar_seg.segs_cnt) { + while (def_tx_entry->sar_seg.next_seg_no != + def_tx_entry->sar_seg.segs_cnt) { ret = rxm_ep_sar_tx_prepare_and_send_segment( def_tx_entry->rxm_ep, def_tx_entry->rxm_conn, def_tx_entry->sar_seg.app_context, - def_tx_entry->sar_seg.total_len, def_tx_entry->sar_seg.remain_len, + def_tx_entry->sar_seg.total_len, + def_tx_entry->sar_seg.remain_len, def_tx_entry->sar_seg.msg_id, rxm_eager_limit, - def_tx_entry->sar_seg.next_seg_no, def_tx_entry->sar_seg.segs_cnt, - def_tx_entry->sar_seg.payload.data, def_tx_entry->sar_seg.flags, - def_tx_entry->sar_seg.payload.tag, def_tx_entry->sar_seg.op, + def_tx_entry->sar_seg.next_seg_no, + def_tx_entry->sar_seg.segs_cnt, + def_tx_entry->sar_seg.payload.data, + def_tx_entry->sar_seg.flags, + def_tx_entry->sar_seg.payload.tag, + 
def_tx_entry->sar_seg.op, def_tx_entry->sar_seg.payload.iov, def_tx_entry->sar_seg.payload.count, &def_tx_entry->sar_seg.payload.cur_iov_offset, - &def_tx_entry->sar_seg.cur_seg_tx_buf); - if (OFI_UNLIKELY(ret)) { - if (OFI_LIKELY(ret != -FI_EAGAIN)) { - rxm_ep_sar_handle_segment_failure(def_tx_entry, ret); - goto sar_finish; + &def_tx_entry->sar_seg.cur_seg_tx_buf, + def_tx_entry->sar_seg.iface, + def_tx_entry->sar_seg.device); + if (ret) { + if (ret != -FI_EAGAIN) { + rxm_ep_sar_handle_segment_failure(def_tx_entry, + ret); } return ret; @@ -1400,19 +1723,20 @@ rxm_ep_progress_sar_deferred_segments(struct rxm_deferred_tx_entry *def_tx_entry def_tx_entry->sar_seg.remain_len -= rxm_eager_limit; } -sar_finish: - rxm_ep_dequeue_deferred_tx_queue(def_tx_entry); - free(def_tx_entry); - - return ret; + return 0; } void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn) { struct rxm_deferred_tx_entry *def_tx_entry; + struct iovec iov; + struct fi_msg msg; ssize_t ret = 0; + if (rxm_conn->handle.state != RXM_CMAP_CONNECTED) + return; + while (!dlist_empty(&rxm_conn->deferred_tx_queue) && !ret) { def_tx_entry = container_of(rxm_conn->deferred_tx_queue.next, struct rxm_deferred_tx_entry, entry); @@ -1421,81 +1745,141 @@ void rxm_ep_progress_deferred_queue(struct rxm_ep *rxm_ep, ret = fi_send(def_tx_entry->rxm_conn->msg_ep, &def_tx_entry->rndv_ack.rx_buf-> recv_entry->rndv.tx_buf->pkt, - sizeof(def_tx_entry->rndv_ack.rx_buf-> - recv_entry->rndv.tx_buf->pkt), + def_tx_entry->rndv_ack.pkt_size, def_tx_entry->rndv_ack.rx_buf->recv_entry-> rndv.tx_buf->hdr.desc, 0, def_tx_entry->rndv_ack.rx_buf); - if (OFI_UNLIKELY(ret)) { - if (OFI_LIKELY(ret == -FI_EAGAIN)) - break; + if (ret) { + if (ret == -FI_EAGAIN) + return; rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq, def_tx_entry->rxm_ep->util_ep.rx_cntr, - def_tx_entry->rndv_read.rx_buf-> + def_tx_entry->rndv_ack.rx_buf-> recv_entry->context, ret); } + if (def_tx_entry->rndv_ack.rx_buf->recv_entry->rndv + .tx_buf->pkt.ctrl_hdr + .type == rxm_ctrl_rndv_rd_done) + RXM_UPDATE_STATE(FI_LOG_EP_DATA, + def_tx_entry->rndv_ack.rx_buf, + RXM_RNDV_READ_DONE_SENT); + else + RXM_UPDATE_STATE(FI_LOG_EP_DATA, + def_tx_entry->rndv_ack.rx_buf, + RXM_RNDV_WRITE_DATA_SENT); + break; + case RXM_DEFERRED_TX_RNDV_DONE: + ret = fi_send(def_tx_entry->rxm_conn->msg_ep, + &def_tx_entry->rndv_done.tx_buf->write_rndv.done_buf->pkt, + sizeof(struct rxm_pkt), + def_tx_entry->rndv_done.tx_buf->write_rndv.done_buf->hdr.desc, + 0, def_tx_entry->rndv_done.tx_buf); + if (ret) { + if (ret == -FI_EAGAIN) + return; + rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.tx_cq, + def_tx_entry->rxm_ep->util_ep.tx_cntr, + def_tx_entry->rndv_done.tx_buf, ret); + } RXM_UPDATE_STATE(FI_LOG_EP_DATA, - def_tx_entry->rndv_ack.rx_buf, - RXM_RNDV_ACK_SENT); - rxm_ep_dequeue_deferred_tx_queue(def_tx_entry); - free(def_tx_entry); + def_tx_entry->rndv_done.tx_buf, + RXM_RNDV_WRITE_DONE_SENT); break; case RXM_DEFERRED_TX_RNDV_READ: - ret = fi_readv(def_tx_entry->rxm_conn->msg_ep, - def_tx_entry->rndv_read.rxm_iov.iov, - def_tx_entry->rndv_read.rxm_iov.desc, - def_tx_entry->rndv_read.rxm_iov.count, 0, - def_tx_entry->rndv_read.rma_iov.addr, - def_tx_entry->rndv_read.rma_iov.key, - def_tx_entry->rndv_read.rx_buf); - if (OFI_UNLIKELY(ret)) { - if (OFI_LIKELY(ret == -FI_EAGAIN)) - break; + ret = rxm_ep->rndv_ops->xfer( + def_tx_entry->rxm_conn->msg_ep, + def_tx_entry->rndv_read.rxm_iov.iov, + def_tx_entry->rndv_read.rxm_iov.desc, + def_tx_entry->rndv_read.rxm_iov.count, 0, + 
def_tx_entry->rndv_read.rma_iov.addr, + def_tx_entry->rndv_read.rma_iov.key, + def_tx_entry->rndv_read.rx_buf); + if (ret) { + if (ret == -FI_EAGAIN) + return; rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq, def_tx_entry->rxm_ep->util_ep.rx_cntr, def_tx_entry->rndv_read.rx_buf-> recv_entry->context, ret); - break; } - rxm_ep_dequeue_deferred_tx_queue(def_tx_entry); - free(def_tx_entry); + break; + case RXM_DEFERRED_TX_RNDV_WRITE: + ret = rxm_ep->rndv_ops->xfer( + def_tx_entry->rxm_conn->msg_ep, + def_tx_entry->rndv_write.rxm_iov.iov, + def_tx_entry->rndv_write.rxm_iov.desc, + def_tx_entry->rndv_write.rxm_iov.count, 0, + def_tx_entry->rndv_write.rma_iov.addr, + def_tx_entry->rndv_write.rma_iov.key, + def_tx_entry->rndv_write.tx_buf); + if (ret) { + if (ret == -FI_EAGAIN) + return; + rxm_cq_write_error(def_tx_entry->rxm_ep->util_ep.rx_cq, + def_tx_entry->rxm_ep->util_ep.rx_cntr, + def_tx_entry->rndv_write.tx_buf, ret); + } break; case RXM_DEFERRED_TX_SAR_SEG: ret = rxm_ep_progress_sar_deferred_segments(def_tx_entry); + if (ret == -FI_EAGAIN) + return; break; case RXM_DEFERRED_TX_ATOMIC_RESP: ret = rxm_atomic_send_respmsg(rxm_ep, def_tx_entry->rxm_conn, def_tx_entry->atomic_resp.tx_buf, def_tx_entry->atomic_resp.len); - if (OFI_UNLIKELY(ret)) - if (OFI_LIKELY(ret == -FI_EAGAIN)) - break; - rxm_ep_dequeue_deferred_tx_queue(def_tx_entry); - free(def_tx_entry); + if (ret == -FI_EAGAIN) + return; + break; + case RXM_DEFERRED_TX_CREDIT_SEND: + iov.iov_base = &def_tx_entry->credit_msg.tx_buf->pkt; + iov.iov_len = sizeof(def_tx_entry->credit_msg.tx_buf->pkt); + + msg.addr = 0; + msg.context = def_tx_entry->credit_msg.tx_buf; + msg.data = 0; + msg.desc = &def_tx_entry->credit_msg.tx_buf->hdr.desc; + msg.iov_count = 1; + msg.msg_iov = &iov; + + ret = fi_sendmsg(def_tx_entry->rxm_conn->msg_ep, &msg, + FI_PRIORITY); + if (ret) { + if (ret != -FI_EAGAIN) { + rxm_cq_write_error( + def_tx_entry->rxm_ep->util_ep.rx_cq, + def_tx_entry->rxm_ep->util_ep.rx_cntr, + def_tx_entry->rndv_read.rx_buf-> + recv_entry->context, ret); + } + return; + } break; } + + rxm_ep_dequeue_deferred_tx_queue(def_tx_entry); + free(def_tx_entry); } } -static ssize_t rxm_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, - uint64_t flags) +static ssize_t +rxm_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) { - int ret; struct rxm_conn *rxm_conn; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + ssize_t ret; + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + ret = rxm_get_conn(rxm_ep, msg->addr, &rxm_conn); + if (ret) goto unlock; ret = rxm_ep_send_common(rxm_ep, rxm_conn, msg->msg_iov, msg->desc, - msg->iov_count, msg->context, msg->data, - flags | rxm_ep->util_ep.tx_msg_flags, 0, ofi_op_msg, - ((flags & FI_REMOTE_CQ_DATA) ? 
- rxm_conn->inject_data_pkt : rxm_conn->inject_pkt)); + msg->iov_count, msg->context, msg->data, + flags | rxm_ep->util_ep.tx_msg_flags, 0, ofi_op_msg); unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; @@ -1504,23 +1888,22 @@ static ssize_t rxm_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, static ssize_t rxm_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, void *context) { - int ret; struct rxm_conn *rxm_conn; + struct rxm_ep *rxm_ep; struct iovec iov = { - .iov_base = (void *)buf, + .iov_base = (void *) buf, .iov_len = len, }; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + ssize_t ret; + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); + if (ret) goto unlock; ret = rxm_ep_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, - 0, rxm_ep_tx_flags(rxm_ep), 0, ofi_op_msg, - rxm_conn->inject_pkt); + 0, rxm_ep->util_ep.tx_op_flags, 0, ofi_op_msg); unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; @@ -1530,19 +1913,18 @@ static ssize_t rxm_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, void *context) { - int ret; struct rxm_conn *rxm_conn; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + ssize_t ret; + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); + if (ret) goto unlock; ret = rxm_ep_send_common(rxm_ep, rxm_conn, iov, desc, count, context, - 0, rxm_ep_tx_flags(rxm_ep), 0, ofi_op_msg, - rxm_conn->inject_pkt); + 0, rxm_ep->util_ep.tx_op_flags, 0, ofi_op_msg); unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; @@ -1551,14 +1933,14 @@ static ssize_t rxm_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, static ssize_t rxm_ep_inject(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr) { - int ret; struct rxm_conn *rxm_conn; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + ssize_t ret; + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); + if (ret) goto unlock; ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len, 0, @@ -1569,16 +1951,16 @@ static ssize_t rxm_ep_inject(struct fid_ep *ep_fid, const void *buf, size_t len, return ret; } -static ssize_t rxm_ep_inject_fast(struct fid_ep *ep_fid, const void *buf, size_t len, - fi_addr_t dest_addr) +static ssize_t rxm_ep_inject_fast(struct fid_ep *ep_fid, const void *buf, + size_t len, fi_addr_t dest_addr) { - int ret; struct rxm_conn *rxm_conn; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + ssize_t ret; - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); + if (ret) return ret; return rxm_ep_inject_send_fast(rxm_ep, rxm_conn, buf, len, @@ -1589,23 +1971,23 
@@ static ssize_t rxm_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t le void *desc, uint64_t data, fi_addr_t dest_addr, void *context) { - int ret; struct rxm_conn *rxm_conn; + struct rxm_ep *rxm_ep; struct iovec iov = { - .iov_base = (void *)buf, + .iov_base = (void *) buf, .iov_len = len, }; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + ssize_t ret; + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); + if (ret) goto unlock; - ret = rxm_ep_send_common(rxm_ep, rxm_conn, &iov, desc, 1, context, data, - rxm_ep_tx_flags(rxm_ep) | FI_REMOTE_CQ_DATA, - 0, ofi_op_msg, rxm_conn->inject_data_pkt); + ret = rxm_ep_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, data, + rxm_ep->util_ep.tx_op_flags | FI_REMOTE_CQ_DATA, + 0, ofi_op_msg); unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; @@ -1614,14 +1996,14 @@ static ssize_t rxm_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t le static ssize_t rxm_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr) { - int ret; struct rxm_conn *rxm_conn; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + ssize_t ret; + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); + if (ret) goto unlock; ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len, data, @@ -1635,13 +2017,13 @@ static ssize_t rxm_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t static ssize_t rxm_ep_injectdata_fast(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr) { - int ret; struct rxm_conn *rxm_conn; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + ssize_t ret; - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); + if (ret) return ret; rxm_conn->inject_data_pkt->hdr.data = data; @@ -1676,64 +2058,177 @@ static struct fi_ops_msg rxm_ops_msg_thread_unsafe = { .injectdata = rxm_ep_injectdata_fast, }; -static ssize_t rxm_ep_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg, - uint64_t flags) +static ssize_t +rxm_ep_post_trecv(struct rxm_ep *rxm_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, void *context, + uint64_t op_flags) { - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_recv_entry *recv_entry; + struct rxm_rx_buf *rx_buf; + + assert(count <= rxm_ep->rxm_info->rx_attr->iov_limit); + + recv_entry = rxm_recv_entry_get(rxm_ep, iov, desc, count, src_addr, + tag, ignore, context, op_flags, + &rxm_ep->trecv_queue); + if (!recv_entry) + return -FI_EAGAIN; + + rx_buf = rxm_get_unexp_msg(&rxm_ep->trecv_queue, recv_entry->addr, + recv_entry->tag, recv_entry->ignore); + if (!rx_buf) { + dlist_insert_tail(&recv_entry->entry, + &rxm_ep->trecv_queue.recv_list); + return FI_SUCCESS; + } + + dlist_remove(&rx_buf->unexp_msg.entry); + rx_buf->recv_entry = recv_entry; + + if 
(rx_buf->pkt.ctrl_hdr.type != rxm_ctrl_seg) + return rxm_handle_rx_buf(rx_buf); + else + return rxm_handle_unexp_sar(&rxm_ep->trecv_queue, recv_entry, + rx_buf); +} + +static ssize_t +rxm_ep_trecv_common(struct rxm_ep *rxm_ep, const struct iovec *iov, + void **desc, size_t count, fi_addr_t src_addr, + uint64_t tag, uint64_t ignore, void *context, + uint64_t op_flags) +{ + ssize_t ret; + + ofi_ep_lock_acquire(&rxm_ep->util_ep); + ret = rxm_ep_post_trecv(rxm_ep, iov, desc, count, src_addr, + tag, ignore, context, op_flags); + ofi_ep_lock_release(&rxm_ep->util_ep); + return ret; +} + +static ssize_t +rxm_ep_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg, + uint64_t flags) +{ + struct rxm_ep *rxm_ep; + struct rxm_recv_entry *recv_entry; + struct fi_recv_context *recv_ctx; + struct rxm_rx_buf *rx_buf; + void *context = msg->context; + ssize_t ret = 0; + + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + flags |= rxm_ep->util_ep.rx_msg_flags; + + if (!(flags & (FI_CLAIM | FI_PEEK)) && + !(rxm_ep->rxm_info->mode & FI_BUFFERED_RECV)) { + return rxm_ep_trecv_common(rxm_ep, msg->msg_iov, msg->desc, + msg->iov_count, msg->addr, + msg->tag, msg->ignore, context, + flags); + } + + ofi_ep_lock_acquire(&rxm_ep->util_ep); + if (rxm_ep->rxm_info->mode & FI_BUFFERED_RECV) { + recv_ctx = msg->context; + context = recv_ctx->context; + rx_buf = container_of(recv_ctx, struct rxm_rx_buf, recv_context); - return rxm_ep_recv_common_flags(rxm_ep, msg->msg_iov, msg->desc, msg->iov_count, - msg->addr, msg->tag, msg->ignore, msg->context, - flags | rxm_ep->util_ep.rx_msg_flags, + if (flags & FI_CLAIM) { + FI_DBG(&rxm_prov, FI_LOG_EP_DATA, + "Claiming buffered receive\n"); + goto claim; + } + + assert(flags & FI_DISCARD); + FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Discarding buffered receive\n"); + rxm_rx_buf_free(rx_buf); + goto unlock; + } + + if (flags & FI_PEEK) { + rxm_ep_peek_recv(rxm_ep, msg->addr, msg->tag, msg->ignore, + context, flags, &rxm_ep->trecv_queue); + goto unlock; + } + + rx_buf = ((struct fi_context *) context)->internal[0]; + assert(rx_buf); + FI_DBG(&rxm_prov, FI_LOG_EP_DATA, "Claim message\n"); + + if (flags & FI_DISCARD) { + rxm_ep_discard_recv(rxm_ep, rx_buf, context); + goto unlock; + } + +claim: + assert (flags & FI_CLAIM); + recv_entry = rxm_recv_entry_get(rxm_ep, msg->msg_iov, msg->desc, + msg->iov_count, msg->addr, + msg->tag, msg->ignore, context, flags, &rxm_ep->trecv_queue); + if (!recv_entry) { + ret = -FI_EAGAIN; + goto unlock; + } + + if (rxm_ep->rxm_info->mode & FI_BUFFERED_RECV) + recv_entry->comp_flags |= FI_CLAIM; + + rx_buf->recv_entry = recv_entry; + ret = rxm_handle_rx_buf(rx_buf); + +unlock: + ofi_ep_lock_release(&rxm_ep->util_ep); + return ret; } static ssize_t rxm_ep_trecv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context) { - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; struct iovec iov = { .iov_base = buf, .iov_len = len, }; - return rxm_ep_recv_common(rxm_ep, &iov, &desc, 1, src_addr, tag, ignore, - context, rxm_ep_rx_flags(rxm_ep), - &rxm_ep->trecv_queue); + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + return rxm_ep_trecv_common(rxm_ep, &iov, &desc, 1, src_addr, tag, ignore, + context, rxm_ep->util_ep.rx_op_flags); } static ssize_t rxm_ep_trecvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context) 
{ - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; - return rxm_ep_recv_common(rxm_ep, iov, desc, count, src_addr, tag, ignore, - context, rxm_ep_rx_flags(rxm_ep), - &rxm_ep->trecv_queue); + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + return rxm_ep_trecv_common(rxm_ep, iov, desc, count, src_addr, tag, + ignore, context, rxm_ep->util_ep.rx_op_flags); } -static ssize_t rxm_ep_tsendmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg, - uint64_t flags) +static ssize_t +rxm_ep_tsendmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg, + uint64_t flags) { - int ret; struct rxm_conn *rxm_conn; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + ssize_t ret; + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + ret = rxm_get_conn(rxm_ep, msg->addr, &rxm_conn); + if (ret) goto unlock; ret = rxm_ep_send_common(rxm_ep, rxm_conn, msg->msg_iov, msg->desc, msg->iov_count, msg->context, msg->data, flags | rxm_ep->util_ep.tx_msg_flags, msg->tag, - ofi_op_tagged, ((flags & FI_REMOTE_CQ_DATA) ? - rxm_conn->tinject_data_pkt : rxm_conn->tinject_pkt)); + ofi_op_tagged); unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; @@ -1743,23 +2238,22 @@ static ssize_t rxm_ep_tsend(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t tag, void *context) { - int ret; struct rxm_conn *rxm_conn; + struct rxm_ep *rxm_ep; struct iovec iov = { - .iov_base = (void *)buf, + .iov_base = (void *) buf, .iov_len = len, }; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + ssize_t ret; + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); + if (ret) goto unlock; ret = rxm_ep_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, 0, - rxm_ep_tx_flags(rxm_ep), tag, ofi_op_tagged, - rxm_conn->tinject_pkt); + rxm_ep->util_ep.tx_op_flags, tag, ofi_op_tagged); unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; @@ -1769,19 +2263,18 @@ static ssize_t rxm_ep_tsendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t tag, void *context) { - int ret; struct rxm_conn *rxm_conn; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + ssize_t ret; + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); + if (ret) goto unlock; ret = rxm_ep_send_common(rxm_ep, rxm_conn, iov, desc, count, context, 0, - rxm_ep_tx_flags(rxm_ep), tag, ofi_op_tagged, - rxm_conn->tinject_pkt); + rxm_ep->util_ep.tx_op_flags, tag, ofi_op_tagged); unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; @@ -1790,14 +2283,14 @@ static ssize_t rxm_ep_tsendv(struct fid_ep *ep_fid, const struct iovec *iov, static ssize_t rxm_ep_tinject(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag) { - int ret; struct rxm_conn *rxm_conn; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - 
util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + ssize_t ret; + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); + if (ret) goto unlock; ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len, 0, @@ -1811,13 +2304,13 @@ static ssize_t rxm_ep_tinject(struct fid_ep *ep_fid, const void *buf, size_t len static ssize_t rxm_ep_tinject_fast(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t tag) { - int ret; struct rxm_conn *rxm_conn; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + ssize_t ret; - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); + if (ret) return ret; rxm_conn->tinject_pkt->hdr.tag = tag; @@ -1830,23 +2323,23 @@ static ssize_t rxm_ep_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t l void *desc, uint64_t data, fi_addr_t dest_addr, uint64_t tag, void *context) { - int ret; struct rxm_conn *rxm_conn; struct iovec iov = { .iov_base = (void *)buf, .iov_len = len, }; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + ssize_t ret; + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); + if (ret) goto unlock; - ret = rxm_ep_send_common(rxm_ep, rxm_conn, &iov, desc, 1, context, data, - rxm_ep_tx_flags(rxm_ep) | FI_REMOTE_CQ_DATA, - tag, ofi_op_tagged, rxm_conn->tinject_data_pkt); + ret = rxm_ep_send_common(rxm_ep, rxm_conn, &iov, &desc, 1, context, data, + rxm_ep->util_ep.tx_op_flags | FI_REMOTE_CQ_DATA, + tag, ofi_op_tagged); unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; @@ -1855,14 +2348,14 @@ static ssize_t rxm_ep_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t l static ssize_t rxm_ep_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t tag) { - int ret; struct rxm_conn *rxm_conn; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + ssize_t ret; + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); + if (ret) goto unlock; ret = rxm_ep_inject_send(rxm_ep, rxm_conn, buf, len, data, @@ -1876,13 +2369,13 @@ static ssize_t rxm_ep_tinjectdata(struct fid_ep *ep_fid, const void *buf, size_t static ssize_t rxm_ep_tinjectdata_fast(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t tag) { - int ret; struct rxm_conn *rxm_conn; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + ssize_t ret; - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); - if (OFI_UNLIKELY(ret)) + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); + if (ret) return ret; rxm_conn->tinject_data_pkt->hdr.tag = tag; @@ -1918,21 +2411,48 @@ static struct 
fi_ops_tagged rxm_ops_tagged_thread_unsafe = { .injectdata = rxm_ep_tinjectdata_fast, }; +static struct fi_ops_collective rxm_ops_collective = { + .size = sizeof(struct fi_ops_collective), + .barrier = ofi_ep_barrier, + .broadcast = ofi_ep_broadcast, + .alltoall = fi_coll_no_alltoall, + .allreduce = ofi_ep_allreduce, + .allgather = ofi_ep_allgather, + .reduce_scatter = fi_coll_no_reduce_scatter, + .reduce = fi_coll_no_reduce, + .scatter = ofi_ep_scatter, + .gather = fi_coll_no_gather, + .msg = fi_coll_no_msg, +}; + +static struct fi_ops_collective rxm_ops_collective_none = { + .size = sizeof(struct fi_ops_collective), + .barrier = fi_coll_no_barrier, + .broadcast = fi_coll_no_broadcast, + .alltoall = fi_coll_no_alltoall, + .allreduce = fi_coll_no_allreduce, + .allgather = fi_coll_no_allgather, + .reduce_scatter = fi_coll_no_reduce_scatter, + .reduce = fi_coll_no_reduce, + .scatter = fi_coll_no_scatter, + .gather = fi_coll_no_gather, + .msg = fi_coll_no_msg, +}; + static int rxm_ep_msg_res_close(struct rxm_ep *rxm_ep) { - int ret, retv = 0; + int ret = 0; if (rxm_ep->srx_ctx) { ret = fi_close(&rxm_ep->srx_ctx->fid); if (ret) { FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, \ "Unable to close msg shared ctx\n"); - retv = ret; } } fi_freeinfo(rxm_ep->msg_info); - return retv; + return ret; } static int rxm_listener_close(struct rxm_ep *rxm_ep) @@ -1961,9 +2481,9 @@ static int rxm_listener_close(struct rxm_ep *rxm_ep) static int rxm_ep_close(struct fid *fid) { int ret, retv = 0; - struct rxm_ep *rxm_ep = - container_of(fid, struct rxm_ep, util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + rxm_ep = container_of(fid, struct rxm_ep, util_ep.ep_fid.fid); if (rxm_ep->cmap) rxm_cmap_free(rxm_ep->cmap); @@ -1972,6 +2492,9 @@ static int rxm_ep_close(struct fid *fid) retv = ret; rxm_ep_txrx_res_close(rxm_ep); + ret = rxm_ep_msg_res_close(rxm_ep); + if (ret) + retv = ret; if (rxm_ep->msg_cq) { ret = fi_close(&rxm_ep->msg_cq->fid); @@ -1981,10 +2504,6 @@ static int rxm_ep_close(struct fid *fid) } } - ret = rxm_ep_msg_res_close(rxm_ep); - if (ret) - retv = ret; - ofi_endpoint_close(&rxm_ep->util_ep); fi_freeinfo(rxm_ep->rxm_info); free(rxm_ep); @@ -1994,8 +2513,9 @@ static int rxm_ep_close(struct fid *fid) static int rxm_ep_trywait_cq(void *arg) { struct rxm_fabric *rxm_fabric; - struct rxm_ep *rxm_ep = (struct rxm_ep *)arg; - struct fid *fids[1] = {&rxm_ep->msg_cq->fid}; + fid_t fid = arg; + struct rxm_ep *rxm_ep = fid->context; + struct fid *fids[1] = { &rxm_ep->msg_cq->fid }; int ret; rxm_fabric = container_of(rxm_ep->util_ep.domain->fabric, @@ -2009,8 +2529,9 @@ static int rxm_ep_trywait_cq(void *arg) static int rxm_ep_trywait_eq(void *arg) { struct rxm_fabric *rxm_fabric; - struct rxm_ep *rxm_ep = (struct rxm_ep *)arg; - struct fid *fids[1] = {&rxm_ep->msg_eq->fid}; + fid_t fid = arg; + struct rxm_ep *rxm_ep = fid->context; + struct fid *fids[1] = { &rxm_ep->msg_eq->fid }; rxm_fabric = container_of(rxm_ep->util_ep.domain->fabric, struct rxm_fabric, util_fabric); @@ -2019,34 +2540,17 @@ static int rxm_ep_trywait_eq(void *arg) static int rxm_ep_wait_fd_add(struct rxm_ep *rxm_ep, struct util_wait *wait) { - int msg_eq_fd, msg_cq_fd, ret; - - ret = fi_control(&rxm_ep->msg_cq->fid, FI_GETWAIT, &msg_cq_fd); - if (ret) { - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, - "unable to get MSG CQ wait fd %d\n", ret); - return ret; - } - - ret = ofi_wait_fd_add(wait, msg_cq_fd, FI_EPOLL_IN, - rxm_ep_trywait_cq, rxm_ep, - &rxm_ep->util_ep.ep_fid.fid); - if (ret) - return ret; + int ret; - if (rxm_ep->util_ep.domain->data_progress == 
FI_PROGRESS_AUTO && - !(rxm_ep->util_ep.caps & FI_ATOMIC)) - return 0; + ret = ofi_wait_add_fid(wait, &rxm_ep->msg_cq->fid, POLLIN, + rxm_ep_trywait_cq); - ret = fi_control(&rxm_ep->msg_eq->fid, FI_GETWAIT, &msg_eq_fd); - if (ret) { - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, - "unable to get MSG EQ wait fd %d\n", ret); + if (ret || (rxm_ep->util_ep.domain->data_progress == FI_PROGRESS_AUTO && + !(rxm_ep->util_ep.caps & FI_ATOMIC))) return ret; - } - return ofi_wait_fd_add(wait, msg_eq_fd, FI_EPOLL_IN, rxm_ep_trywait_eq, - rxm_ep, &rxm_ep->util_ep.ep_fid.fid); + return ofi_wait_add_fid(wait, &rxm_ep->msg_eq->fid, POLLIN, + rxm_ep_trywait_eq); } static int rxm_msg_cq_fd_needed(struct rxm_ep *rxm_ep) @@ -2062,6 +2566,19 @@ static int rxm_msg_cq_fd_needed(struct rxm_ep *rxm_ep) (rxm_ep->util_ep.rem_rd_cntr && rxm_ep->util_ep.rem_rd_cntr->wait)); } +static enum fi_wait_obj rxm_get_wait_obj(struct rxm_ep *ep) +{ + if (!rxm_msg_cq_fd_needed(ep)) + return FI_WAIT_NONE; + + if ((def_tcp_wait_obj != FI_WAIT_UNSPEC) && + !strncasecmp(ep->msg_info->fabric_attr->prov_name, "tcp", + strlen("tcp"))) { + return def_tcp_wait_obj; + } + return def_wait_obj; +} + static int rxm_ep_msg_cq_open(struct rxm_ep *rxm_ep) { struct rxm_domain *rxm_domain; @@ -2080,15 +2597,18 @@ static int rxm_ep_msg_cq_open(struct rxm_ep *rxm_ep) }; int i, ret; - cq_attr.size = (rxm_ep->msg_info->tx_attr->size + - rxm_ep->msg_info->rx_attr->size) * rxm_def_univ_size; + cq_attr.size = rxm_ep->msg_info->rx_attr->size; + if (rxm_ep->msg_info->ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT) + cq_attr.size *= ofi_universe_size; + cq_attr.size += rxm_ep->msg_info->tx_attr->size * ofi_universe_size; cq_attr.format = FI_CQ_FORMAT_DATA; - cq_attr.wait_obj = (rxm_msg_cq_fd_needed(rxm_ep) ? - FI_WAIT_FD : FI_WAIT_NONE); + cq_attr.wait_obj = rxm_get_wait_obj(rxm_ep); - rxm_domain = container_of(rxm_ep->util_ep.domain, struct rxm_domain, util_domain); + rxm_domain = container_of(rxm_ep->util_ep.domain, struct rxm_domain, + util_domain); - ret = fi_cq_open(rxm_domain->msg_domain, &cq_attr, &rxm_ep->msg_cq, NULL); + ret = fi_cq_open(rxm_domain->msg_domain, &cq_attr, &rxm_ep->msg_cq, + rxm_ep); if (ret) { FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "unable to open MSG CQ\n"); return ret; @@ -2115,36 +2635,58 @@ static int rxm_ep_msg_cq_open(struct rxm_ep *rxm_ep) return 0; err: fi_close(&rxm_ep->msg_cq->fid); + rxm_ep->msg_cq = NULL; return ret; } static void rxm_ep_sar_init(struct rxm_ep *rxm_ep) { + struct rxm_domain *domain; size_t param; + /* SAR segment size is capped at 64k. */ + if (rxm_eager_limit > UINT16_MAX) + goto disable_sar; + + domain = container_of(rxm_ep->util_ep.domain, struct rxm_domain, + util_domain); + if (domain->dyn_rbuf) { + FI_INFO(&rxm_prov, FI_LOG_CORE, "Dynamic receive buffer " + "enabled, disabling SAR protocol\n"); + goto disable_sar; + } + if (!fi_param_get_size_t(&rxm_prov, "sar_limit", ¶m)) { if (param <= rxm_eager_limit) { FI_WARN(&rxm_prov, FI_LOG_CORE, - "Requsted SAR limit (%zd) less or equal " - "Eager limit (%zd). SAR limit won't be used. " - "Messages of size <= SAR limit would be " - "transmitted via Inject/Eager protocol. " - "Messages of size > SAR limit would be " - "transmitted via Rendezvous protocol\n", + "Requested SAR limit (%zd) less or equal to " + "eager limit (%zd) - disabling.", param, rxm_eager_limit); - param = rxm_eager_limit; + goto disable_sar; } rxm_ep->sar_limit = param; } else { - size_t sar_limit = rxm_ep->msg_info->tx_attr->size * - rxm_eager_limit; - - rxm_ep->sar_limit = (sar_limit > RXM_SAR_LIMIT) ? 
- RXM_SAR_LIMIT : sar_limit; + rxm_ep->sar_limit = rxm_eager_limit * 8; } + + return; + +disable_sar: + rxm_ep->sar_limit = rxm_eager_limit; } +static void rxm_config_direct_send(struct rxm_ep *ep) +{ + int ret = 0; + + if (ep->msg_mr_local == ep->rdm_mr_local) + fi_param_get_bool(&rxm_prov, "enable_direct_send", &ret); + + ep->enable_direct_send = (ret != 0); +} + + static void rxm_ep_settings_init(struct rxm_ep *rxm_ep) { size_t max_prog_val; @@ -2155,9 +2697,11 @@ static void rxm_ep_settings_init(struct rxm_ep *rxm_ep) rxm_ep->msg_info->rx_attr->size) / 2; rxm_ep->comp_per_progress = (rxm_ep->comp_per_progress > max_prog_val) ? max_prog_val : rxm_ep->comp_per_progress; + ofi_atomic_initialize32(&rxm_ep->atomic_tx_credits, + rxm_ep->rxm_info->tx_attr->size); rxm_ep->msg_mr_local = ofi_mr_local(rxm_ep->msg_info); - rxm_ep->rxm_mr_local = ofi_mr_local(rxm_ep->rxm_info); + rxm_ep->rdm_mr_local = ofi_mr_local(rxm_ep->rxm_info); rxm_ep->inject_limit = rxm_ep->msg_info->tx_attr->inject_size; @@ -2167,8 +2711,8 @@ static void rxm_ep_settings_init(struct rxm_ep *rxm_ep) if (rxm_ep->inject_limit > (sizeof(struct rxm_pkt) + sizeof(struct rxm_rndv_hdr))) rxm_ep->buffered_min = MIN((rxm_ep->inject_limit - - (sizeof(struct rxm_pkt) + - sizeof(struct rxm_rndv_hdr))), + (sizeof(struct rxm_pkt) + + sizeof(struct rxm_rndv_hdr))), rxm_eager_limit); assert(!rxm_ep->min_multi_recv_size); @@ -2178,6 +2722,7 @@ static void rxm_ep_settings_init(struct rxm_ep *rxm_ep) rxm_ep->buffered_limit = rxm_eager_limit; rxm_ep_sar_init(rxm_ep); + rxm_config_direct_send(rxm_ep); FI_INFO(&rxm_prov, FI_LOG_CORE, "Settings:\n" @@ -2189,7 +2734,7 @@ static void rxm_ep_settings_init(struct rxm_ep *rxm_ep) "\t\t rxm inject size: %zu\n" "\t\t Protocol limits: Eager: %zu, " "SAR: %zu\n", - rxm_ep->msg_mr_local, rxm_ep->rxm_mr_local, + rxm_ep->msg_mr_local, rxm_ep->rdm_mr_local, rxm_ep->comp_per_progress, rxm_ep->buffered_min, rxm_ep->min_multi_recv_size, rxm_ep->inject_limit, rxm_ep->rxm_info->tx_attr->inject_size, @@ -2216,23 +2761,21 @@ static int rxm_ep_txrx_res_open(struct rxm_ep *rxm_ep) return ret; } -#define RXM_NEED_RX_CQ_PROGRESS(info) \ - ((info->rx_attr->caps & (FI_MSG | FI_TAGGED)) || \ - (info->rx_attr->caps & FI_ATOMIC)) - static int rxm_ep_enable_check(struct rxm_ep *rxm_ep) { if (!rxm_ep->util_ep.av) return -FI_EOPBADSTATE; + if (ofi_needs_tx(rxm_ep->rxm_info->caps) && !rxm_ep->util_ep.tx_cq) { + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "missing Tx CQ\n"); + return -FI_ENOCQ; + } + if (rxm_ep->util_ep.rx_cq) return 0; - if (RXM_NEED_RX_CQ_PROGRESS(rxm_ep->rxm_info)) { - FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "endpoint missing recv CQ" - "needed for progress of operations enabled by one " - "or more of requested capabilities: %s\n", - fi_tostr(&rxm_ep->rxm_info->rx_attr->caps, FI_TYPE_CAPS)); + if (ofi_needs_rx(rxm_ep->rxm_info->caps)) { + FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "missing Rx CQ\n"); return -FI_ENOCQ; } @@ -2241,6 +2784,7 @@ static int rxm_ep_enable_check(struct rxm_ep *rxm_ep) "may be used but endpoint is missing recv CQ\n"); return -FI_ENOCQ; } + return 0; } @@ -2291,7 +2835,7 @@ static int rxm_ep_ctrl(struct fid *fid, int command, void *arg) return ret; if (rxm_ep->srx_ctx) { - ret = rxm_msg_ep_prepost_recv(rxm_ep, rxm_ep->srx_ctx); + ret = rxm_prepost_recv(rxm_ep, rxm_ep->srx_ctx); if (ret) { rxm_cmap_free(rxm_ep->cmap); FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, @@ -2323,12 +2867,13 @@ static int rxm_listener_open(struct rxm_ep *rxm_ep) .wait_obj = FI_WAIT_UNSPEC, .flags = FI_WRITE, }; + struct rxm_fabric *rxm_fabric; 
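The enable-check above refuses to bring up the endpoint without an AV and the CQs its capabilities need. A minimal application-side sketch of that ordering (ep, av, txcq, rxcq are assumed to have been created already; this is illustrative, not part of the patch):

#include <rdma/fabric.h>
#include <rdma/fi_domain.h>
#include <rdma/fi_endpoint.h>

/* Bind-before-enable ordering that rxm_ep_enable_check() verifies:
 * without an AV and the required CQs, fi_enable() fails with
 * -FI_EOPBADSTATE or -FI_ENOCQ. */
static int bind_and_enable(struct fid_ep *ep, struct fid_av *av,
                           struct fid_cq *txcq, struct fid_cq *rxcq)
{
	int ret;

	ret = fi_ep_bind(ep, &av->fid, 0);
	if (ret)
		return ret;

	ret = fi_ep_bind(ep, &txcq->fid, FI_TRANSMIT);
	if (ret)
		return ret;

	ret = fi_ep_bind(ep, &rxcq->fid, FI_RECV);
	if (ret)
		return ret;

	return fi_enable(ep);
}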
int ret; - struct rxm_fabric *rxm_fabric = - container_of(rxm_ep->util_ep.domain->fabric, - struct rxm_fabric, util_fabric); - ret = fi_eq_open(rxm_fabric->msg_fabric, &eq_attr, &rxm_ep->msg_eq, NULL); + rxm_fabric = container_of(rxm_ep->util_ep.domain->fabric, + struct rxm_fabric, util_fabric); + ret = fi_eq_open(rxm_fabric->msg_fabric, &eq_attr, &rxm_ep->msg_eq, + rxm_ep); if (ret) { FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "Unable to open msg EQ\n"); return ret; @@ -2354,21 +2899,21 @@ static int rxm_listener_open(struct rxm_ep *rxm_ep) return ret; } -static int rxm_ep_msg_res_open(struct rxm_ep *rxm_ep) +static int rxm_open_core_res(struct rxm_ep *ep) { + struct rxm_domain *domain; int ret; - struct rxm_domain *rxm_domain = - container_of(rxm_ep->util_ep.domain, struct rxm_domain, util_domain); - ret = ofi_get_core_info(rxm_ep->util_ep.domain->fabric->fabric_fid.api_version, - NULL, NULL, 0, &rxm_util_prov, rxm_ep->rxm_info, - rxm_info_to_core, &rxm_ep->msg_info); + domain = container_of(ep->util_ep.domain, struct rxm_domain, util_domain); + ret = ofi_get_core_info(domain->util_domain.fabric->fabric_fid.api_version, + NULL, NULL, 0, &rxm_util_prov, ep->rxm_info, + NULL, rxm_info_to_core, &ep->msg_info); if (ret) return ret; - if (rxm_ep->msg_info->ep_attr->rx_ctx_cnt == FI_SHARED_CONTEXT) { - ret = fi_srx_context(rxm_domain->msg_domain, rxm_ep->msg_info->rx_attr, - &rxm_ep->srx_ctx, NULL); + if (ep->msg_info->ep_attr->rx_ctx_cnt == FI_SHARED_CONTEXT) { + ret = fi_srx_context(domain->msg_domain, ep->msg_info->rx_attr, + &ep->srx_ctx, NULL); if (ret) { FI_WARN(&rxm_prov, FI_LOG_EP_CTRL, "Unable to open shared receive context\n"); @@ -2376,28 +2921,103 @@ static int rxm_ep_msg_res_open(struct rxm_ep *rxm_ep) } } - ret = rxm_listener_open(rxm_ep); + ret = rxm_listener_open(ep); if (ret) goto err2; - /* Zero out the port as we would be creating multiple MSG EPs for a single - * RXM EP and we don't want address conflicts. */ - if (rxm_ep->msg_info->src_addr) { - if (((struct sockaddr *)rxm_ep->msg_info->src_addr)->sa_family == AF_INET) - ((struct sockaddr_in *)(rxm_ep->msg_info->src_addr))->sin_port = 0; - else - ((struct sockaddr_in6 *)(rxm_ep->msg_info->src_addr))->sin6_port = 0; - } + /* Zero out the port as we will create multiple MSG EPs for a + * single RXM EP, and we don't want address conflicts. 
+ */ + if (ep->msg_info->src_addr) + ofi_addr_set_port(ep->msg_info->src_addr, 0); return 0; err2: - if (rxm_ep->srx_ctx) - fi_close(&rxm_ep->srx_ctx->fid); + if (ep->srx_ctx) { + fi_close(&ep->srx_ctx->fid); + ep->srx_ctx = NULL; + } err1: - fi_freeinfo(rxm_ep->msg_info); + fi_freeinfo(ep->msg_info); + ep->msg_info = NULL; return ret; } +static ssize_t +rxm_prepare_deferred_rndv_read(struct rxm_deferred_tx_entry **def_tx_entry, + size_t index, struct iovec *iov, + void *desc[RXM_IOV_LIMIT], size_t count, + void *buf) +{ + uint8_t i; + struct rxm_rx_buf *rx_buf = buf; + + *def_tx_entry = rxm_ep_alloc_deferred_tx_entry(rx_buf->ep, rx_buf->conn, + RXM_DEFERRED_TX_RNDV_READ); + if (!*def_tx_entry) + return -FI_ENOMEM; + + (*def_tx_entry)->rndv_read.rx_buf = rx_buf; + (*def_tx_entry)->rndv_read.rma_iov.addr = + rx_buf->remote_rndv_hdr->iov[index].addr; + (*def_tx_entry)->rndv_read.rma_iov.key = + rx_buf->remote_rndv_hdr->iov[index].key; + + for (i = 0; i < count; i++) { + (*def_tx_entry)->rndv_read.rxm_iov.iov[i] = iov[i]; + (*def_tx_entry)->rndv_read.rxm_iov.desc[i] = desc[i]; + } + (*def_tx_entry)->rndv_read.rxm_iov.count = count; + + return 0; +} + +static ssize_t +rxm_prepare_deferred_rndv_write(struct rxm_deferred_tx_entry **def_tx_entry, + size_t index, struct iovec *iov, + void *desc[RXM_IOV_LIMIT], size_t count, + void *buf) +{ + uint8_t i; + struct rxm_tx_rndv_buf *tx_buf = buf; + struct rxm_ep *rxm_ep = tx_buf->write_rndv.conn->handle.cmap->ep; + + *def_tx_entry = rxm_ep_alloc_deferred_tx_entry(rxm_ep, tx_buf->write_rndv.conn, + RXM_DEFERRED_TX_RNDV_WRITE); + if (!*def_tx_entry) + return -FI_ENOMEM; + + (*def_tx_entry)->rndv_write.tx_buf = tx_buf; + (*def_tx_entry)->rndv_write.rma_iov.addr = + tx_buf->write_rndv.remote_hdr.iov[index].addr; + (*def_tx_entry)->rndv_write.rma_iov.key = + tx_buf->write_rndv.remote_hdr.iov[index].key; + + for (i = 0; i < count; i++) { + (*def_tx_entry)->rndv_write.rxm_iov.iov[i] = iov[i]; + (*def_tx_entry)->rndv_write.rxm_iov.desc[i] = desc[i]; + } + (*def_tx_entry)->rndv_write.rxm_iov.count = count; + + return 0; +} + +struct rxm_rndv_ops rxm_rndv_ops_read = { + .rx_mr_access = FI_READ, + .tx_mr_access = FI_REMOTE_READ, + .handle_rx = rxm_rndv_read, + .xfer = fi_readv, + .defer_xfer = rxm_prepare_deferred_rndv_read +}; + +struct rxm_rndv_ops rxm_rndv_ops_write = { + .rx_mr_access = FI_REMOTE_WRITE, + .tx_mr_access = FI_WRITE, + .handle_rx = rxm_rndv_send_wr_data, + .xfer = fi_writev, + .defer_xfer = rxm_prepare_deferred_rndv_write +}; + int rxm_endpoint(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep_fid, void *context) { @@ -2418,12 +3038,20 @@ int rxm_endpoint(struct fid_domain *domain, struct fi_info *info, (int *)&rxm_ep->comp_per_progress)) rxm_ep->comp_per_progress = 1; - ret = ofi_endpoint_init(domain, &rxm_util_prov, info, &rxm_ep->util_ep, - context, &rxm_ep_progress); + if (rxm_ep->rxm_info->caps & FI_COLLECTIVE) { + ret = ofi_endpoint_init(domain, &rxm_util_prov, info, + &rxm_ep->util_ep, context, + &rxm_ep_progress_coll); + } else { + ret = ofi_endpoint_init(domain, &rxm_util_prov, info, + &rxm_ep->util_ep, context, + &rxm_ep_progress); + } + if (ret) goto err1; - ret = rxm_ep_msg_res_open(rxm_ep); + ret = rxm_open_core_res(rxm_ep); if (ret) goto err2; @@ -2433,6 +3061,21 @@ int rxm_endpoint(struct fid_domain *domain, struct fi_info *info, (*ep_fid)->fid.ops = &rxm_ep_fi_ops; (*ep_fid)->ops = &rxm_ops_ep; (*ep_fid)->cm = &rxm_ops_cm; + + if(rxm_ep->rxm_info->caps & FI_COLLECTIVE) { + (*ep_fid)->collective = 
&rxm_ops_collective; + rxm_ep->eager_ops = &coll_eager_ops; + } else { + (*ep_fid)->collective = &rxm_ops_collective_none; + rxm_ep->eager_ops = &def_eager_ops; + } + + if (rxm_use_write_rndv) + rxm_ep->rndv_ops = &rxm_rndv_ops_write; + else + rxm_ep->rndv_ops = &rxm_rndv_ops_read; + dlist_init(&rxm_ep->rndv_wait_list); + if (rxm_ep->util_ep.domain->threading != FI_THREAD_SAFE) { (*ep_fid)->msg = &rxm_ops_msg_thread_unsafe; (*ep_fid)->tagged = &rxm_ops_tagged_thread_unsafe; diff --git a/prov/rxm/src/rxm_init.c b/prov/rxm/src/rxm_init.c index bb3f0aa6bd9..1d0fb101e6b 100644 --- a/prov/rxm/src/rxm_init.c +++ b/prov/rxm/src/rxm_init.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2016 Intel Corporation. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -37,19 +38,24 @@ #include #include "rxm.h" +#include "ofi_coll.h" -#define RXM_ATOMIC_UNSUPPORTED_MSG_ORDER (OFI_ORDER_RAR_SET | OFI_ORDER_RAW_SET | \ - OFI_ORDER_WAR_SET | OFI_ORDER_WAW_SET | \ +#define RXM_ATOMIC_UNSUPPORTED_MSG_ORDER (FI_ORDER_RAW | FI_ORDER_RAR | \ + FI_ORDER_WAW | FI_ORDER_WAR | \ FI_ORDER_SAR | FI_ORDER_SAW) #define RXM_PASSTHRU_CAPS (FI_MSG | FI_RMA | FI_SEND | FI_RECV | \ FI_READ | FI_WRITE | FI_REMOTE_READ | \ - FI_REMOTE_WRITE) + FI_REMOTE_WRITE | FI_HMEM) size_t rxm_msg_tx_size = 128; size_t rxm_msg_rx_size = 128; -size_t rxm_def_univ_size = 256; -size_t rxm_eager_limit = RXM_BUF_SIZE - sizeof(struct rxm_pkt); +size_t rxm_eager_limit = 16384; +size_t rxm_buffer_size = 16384 + sizeof(struct rxm_pkt); + +int force_auto_progress = 0; +int rxm_use_write_rndv = 0; +enum fi_wait_obj def_wait_obj = FI_WAIT_FD, def_tcp_wait_obj = FI_WAIT_UNSPEC; char *rxm_proto_state_str[] = { RXM_PROTO_STATES(OFI_STR) @@ -59,38 +65,56 @@ char *rxm_proto_state_str[] = { * - Support FI_MR_LOCAL/FI_LOCAL_MR as ofi_rxm can handle it. * - The RxM FI_RMA implementation is pass-through but the provider can handle * FI_MR_PROV_KEY and FI_MR_VIRT_ADDR in its large message transfer rendezvous - * protocol. + * protocol. We can set FI_MR_PROV_KEY and FI_MR_VIRT_ADDR only if the app + * is not using RMA. * - fi_alter_domain_attr should correctly set the mr_mode in return fi_info * based on hints. */ void rxm_info_to_core_mr_modes(uint32_t version, const struct fi_info *hints, struct fi_info *core_info) { - /* We handle FI_MR_BASIC and FI_MR_SCALABLE irrespective of version */ if (hints && hints->domain_attr && (hints->domain_attr->mr_mode & (FI_MR_SCALABLE | FI_MR_BASIC))) { - core_info->mode = FI_LOCAL_MR; + core_info->mode |= FI_LOCAL_MR; core_info->domain_attr->mr_mode = hints->domain_attr->mr_mode; } else if (FI_VERSION_LT(version, FI_VERSION(1, 5))) { core_info->mode |= FI_LOCAL_MR; - /* Specify FI_MR_UNSPEC (instead of FI_MR_BASIC) so that - * providers that support only FI_MR_SCALABLE aren't dropped */ core_info->domain_attr->mr_mode = FI_MR_UNSPEC; } else { core_info->domain_attr->mr_mode |= FI_MR_LOCAL; - if (!hints || !ofi_rma_target_allowed(hints->caps)) + if (!hints || !hints->domain_attr || + !ofi_rma_target_allowed(hints->caps)) core_info->domain_attr->mr_mode |= OFI_MR_BASIC_MAP; - else if (hints->domain_attr) + else core_info->domain_attr->mr_mode |= - hints->domain_attr->mr_mode & OFI_MR_BASIC_MAP; + hints->domain_attr->mr_mode; + + /* RxM is setup to support FI_HMEM with the core provider requiring + * FI_MR_HMEM. Always set this MR mode bit. 
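Since FI_MR_HMEM is now forced onto the core provider whenever the app asks for FI_HMEM, an application requests the capability through hints as usual. A hedged sketch (the provider-name filter and API version are assumptions for illustration only):

#include <string.h>
#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

/* Request FI_HMEM through RxM; on success the returned info carries
 * FI_MR_HMEM in domain_attr->mr_mode when the core provider needs it. */
static struct fi_info *get_hmem_info(void)
{
	struct fi_info *hints, *info = NULL;

	hints = fi_allocinfo();
	if (!hints)
		return NULL;

	hints->caps = FI_MSG | FI_TAGGED | FI_RMA | FI_HMEM;
	hints->domain_attr->mr_mode = FI_MR_LOCAL | FI_MR_HMEM |
				      FI_MR_ALLOCATED | FI_MR_PROV_KEY |
				      FI_MR_VIRT_ADDR;
	hints->fabric_attr->prov_name = strdup("ofi_rxm");

	if (fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, hints, &info))
		info = NULL;

	fi_freeinfo(hints);
	return info;
}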
+ */ + if (hints && hints->caps & FI_HMEM) + core_info->domain_attr->mr_mode |= FI_MR_HMEM; } } +static bool rxm_use_srx(const struct fi_info *hints, + const struct fi_info *base_info) +{ + const struct fi_info *info; + int ret, use_srx = 0; + + ret = fi_param_get_bool(&rxm_prov, "use_srx", &use_srx); + if (ret != -FI_ENODATA) + return use_srx; + + info = base_info ? base_info : hints; + + return info && info->fabric_attr && info->fabric_attr->prov_name && + !strncasecmp(info->fabric_attr->prov_name, "tcp", 3); +} int rxm_info_to_core(uint32_t version, const struct fi_info *hints, - struct fi_info *core_info) + const struct fi_info *base_info, struct fi_info *core_info) { - int use_srx = 0; - rxm_info_to_core_mr_modes(version, hints, core_info); core_info->mode |= FI_RX_CQ_DATA | FI_CONTEXT; @@ -101,12 +125,15 @@ int rxm_info_to_core(uint32_t version, const struct fi_info *hints, core_info->caps |= FI_MSG | FI_SEND | FI_RECV; /* FI_RMA cap is needed for large message transfer protocol */ - if (core_info->caps & FI_MSG) - core_info->caps |= FI_RMA | FI_READ | FI_REMOTE_READ; + if (core_info->caps & FI_MSG) { + core_info->caps |= FI_RMA | FI_READ | + FI_REMOTE_READ | FI_REMOTE_WRITE; + } if (hints->domain_attr) { core_info->domain_attr->caps |= hints->domain_attr->caps; - core_info->domain_attr->threading = hints->domain_attr->threading; + core_info->domain_attr->threading = + hints->domain_attr->threading; } if (hints->tx_attr) { core_info->tx_attr->op_flags = @@ -121,8 +148,10 @@ int rxm_info_to_core(uint32_t version, const struct fi_info *hints, core_info->rx_attr->comp_order = hints->rx_attr->comp_order; } } + core_info->ep_attr->type = FI_EP_MSG; - if (!fi_param_get_bool(&rxm_prov, "use_srx", &use_srx) && use_srx) { + + if (rxm_use_srx(hints, base_info)) { FI_DBG(&rxm_prov, FI_LOG_FABRIC, "Requesting shared receive context from core provider\n"); core_info->ep_attr->rx_ctx_cnt = FI_SHARED_CONTEXT; @@ -138,41 +167,59 @@ int rxm_info_to_core(uint32_t version, const struct fi_info *hints, } int rxm_info_to_rxm(uint32_t version, const struct fi_info *core_info, - struct fi_info *info) + const struct fi_info *base_info, struct fi_info *info) { - info->caps = rxm_info.caps; - // TODO find which other modes should be filtered - info->mode = (core_info->mode & ~FI_RX_CQ_DATA) | rxm_info.mode; + info->caps = base_info->caps; + info->mode = (core_info->mode & ~FI_RX_CQ_DATA) | base_info->mode; - info->tx_attr->caps = rxm_info.tx_attr->caps; + info->tx_attr->caps = base_info->tx_attr->caps; info->tx_attr->mode = info->mode; info->tx_attr->msg_order = core_info->tx_attr->msg_order; - info->tx_attr->comp_order = rxm_info.tx_attr->comp_order; - info->tx_attr->inject_size = rxm_info.tx_attr->inject_size; - info->tx_attr->size = rxm_info.tx_attr->size; - info->tx_attr->iov_limit = MIN(rxm_info.tx_attr->iov_limit, + info->tx_attr->comp_order = base_info->tx_attr->comp_order; + + /* If the core provider requires registering send buffers, it's + * usually faster to copy small transfer through bounce buffers + * than requiring the user to register the buffers. Bump the + * inject size up to the rxm limit (eager buffer size) in this + * case. If registration is not required, use the core provider's + * limit, which avoids potential extra data copies. + * + * If we report the size of the bounce buffer, apps may call inject + * rather than send, which hampers our ability to use the direct + * send feature that avoids data copies. 
+ */ + if (ofi_mr_local(info) || + (core_info->tx_attr->inject_size <= sizeof(struct rxm_pkt))) { + info->tx_attr->inject_size = base_info->tx_attr->inject_size; + } else { + info->tx_attr->inject_size = core_info->tx_attr->inject_size - + sizeof(struct rxm_pkt); + } + + info->tx_attr->size = base_info->tx_attr->size; + info->tx_attr->iov_limit = MIN(base_info->tx_attr->iov_limit, core_info->tx_attr->iov_limit); - info->tx_attr->rma_iov_limit = MIN(rxm_info.tx_attr->rma_iov_limit, + info->tx_attr->rma_iov_limit = MIN(base_info->tx_attr->rma_iov_limit, core_info->tx_attr->rma_iov_limit); - info->rx_attr->caps = rxm_info.rx_attr->caps; + info->rx_attr->caps = base_info->rx_attr->caps; info->rx_attr->mode = info->rx_attr->mode & ~FI_RX_CQ_DATA; info->rx_attr->msg_order = core_info->rx_attr->msg_order; - info->rx_attr->comp_order = rxm_info.rx_attr->comp_order; - info->rx_attr->size = rxm_info.rx_attr->size; - info->rx_attr->iov_limit = MIN(rxm_info.rx_attr->iov_limit, + info->rx_attr->comp_order = base_info->rx_attr->comp_order; + info->rx_attr->size = base_info->rx_attr->size; + info->rx_attr->iov_limit = MIN(base_info->rx_attr->iov_limit, core_info->rx_attr->iov_limit); - *info->ep_attr = *rxm_info.ep_attr; + *info->ep_attr = *base_info->ep_attr; info->ep_attr->max_msg_size = core_info->ep_attr->max_msg_size; info->ep_attr->max_order_raw_size = core_info->ep_attr->max_order_raw_size; info->ep_attr->max_order_war_size = core_info->ep_attr->max_order_war_size; info->ep_attr->max_order_waw_size = core_info->ep_attr->max_order_waw_size; - *info->domain_attr = *rxm_info.domain_attr; + *info->domain_attr = *base_info->domain_attr; info->domain_attr->mr_mode |= core_info->domain_attr->mr_mode; info->domain_attr->cq_data_size = MIN(core_info->domain_attr->cq_data_size, - rxm_info.domain_attr->cq_data_size); + base_info->domain_attr->cq_data_size); info->domain_attr->mr_key_size = core_info->domain_attr->mr_key_size; if (core_info->nic) { @@ -181,25 +228,51 @@ int rxm_info_to_rxm(uint32_t version, const struct fi_info *core_info, return -FI_ENOMEM; } + /* FI_HMEM is only supported if core provider supports it. */ + if (!(core_info->caps & FI_HMEM)) { + info->caps &= ~FI_HMEM; + info->tx_attr->caps &= ~FI_HMEM; + info->rx_attr->caps &= ~FI_HMEM; + } + return 0; } -static int rxm_init_info(void) +static void rxm_init_infos(void) { - size_t param; - - if (!fi_param_get_size_t(&rxm_prov, "buffer_size", ¶m)) { - if (param > sizeof(struct rxm_pkt)) { - rxm_eager_limit = param - sizeof(struct rxm_pkt); - } else { + struct fi_info *cur; + size_t eager_size, tx_size = 0, rx_size = 0; + + /* Historically, 'buffer_size' was the name given for the eager message + * size. Maintain the name for backwards compatability. 
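Because the historical 'buffer_size' name is kept, applications keep tuning the eager limit through the same environment variable. A small illustrative sketch (the values are examples only, not defaults):

#include <stdlib.h>
#include <rdma/fabric.h>

/* FI_OFI_RXM_BUFFER_SIZE now sets the eager limit directly, and
 * FI_OFI_RXM_SAR_LIMIT must exceed it for SAR to stay enabled. Set both
 * before the provider is loaded by fi_getinfo(). */
static int tune_rxm_eager(void)
{
	struct fi_info *info = NULL;
	int ret;

	setenv("FI_OFI_RXM_BUFFER_SIZE", "32768", 1);	/* eager limit, bytes */
	setenv("FI_OFI_RXM_SAR_LIMIT", "262144", 1);	/* > eager limit */

	ret = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, NULL, &info);
	if (!ret)
		fi_freeinfo(info);
	return ret;
}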
+ */ + if (!fi_param_get_size_t(&rxm_prov, "buffer_size", &eager_size)) { + /* We need enough space to carry extra headers */ + if (eager_size < sizeof(struct rxm_rndv_hdr) || + eager_size < sizeof(struct rxm_atomic_hdr)) { FI_WARN(&rxm_prov, FI_LOG_CORE, "Requested buffer size too small\n"); - return -FI_EINVAL; + eager_size = MAX(sizeof(struct rxm_rndv_hdr), + sizeof(struct rxm_atomic_hdr)); } + + rxm_eager_limit = eager_size; + if (rxm_eager_limit > INT32_MAX) + rxm_eager_limit = INT32_MAX; + + rxm_buffer_size = rxm_eager_limit + sizeof(struct rxm_pkt); + } + + fi_param_get_size_t(&rxm_prov, "tx_size", &tx_size); + fi_param_get_size_t(&rxm_prov, "rx_size", &rx_size); + + for (cur = (struct fi_info *) rxm_util_prov.info; cur; cur = cur->next) { + cur->tx_attr->inject_size = rxm_eager_limit; + if (tx_size) + cur->tx_attr->size = tx_size; + if (rx_size) + cur->rx_attr->size = rx_size; } - rxm_info.tx_attr->inject_size = rxm_eager_limit; - rxm_util_prov.info = &rxm_info; - return 0; } static void rxm_alter_info(const struct fi_info *hints, struct fi_info *info) @@ -207,25 +280,24 @@ static void rxm_alter_info(const struct fi_info *hints, struct fi_info *info) struct fi_info *cur; for (cur = info; cur; cur = cur->next) { - /* RxM can support higher inject size without any big - * performance penalty even if app had requested lower value - * in hints. App is still free to reduce this when opening an - * endpoint. This overrides setting by ofi_alter_info */ - cur->tx_attr->inject_size = rxm_eager_limit; - /* Remove the following caps if they are not requested as they * may affect performance in fast-path */ if (!hints) { cur->caps &= ~(FI_DIRECTED_RECV | FI_SOURCE | FI_ATOMIC); - cur->tx_attr->caps &= ~FI_ATOMIC; - cur->rx_attr->caps &= ~FI_ATOMIC; + cur->tx_attr->caps &= ~(FI_ATOMIC); + cur->rx_attr->caps &= ~(FI_DIRECTED_RECV | FI_ATOMIC | + FI_SOURCE); cur->domain_attr->data_progress = FI_PROGRESS_MANUAL; } else { - if (!(hints->caps & FI_DIRECTED_RECV)) + if (!(hints->caps & FI_DIRECTED_RECV)) { cur->caps &= ~FI_DIRECTED_RECV; - if (!(hints->caps & FI_SOURCE)) + cur->rx_attr->caps &= ~FI_DIRECTED_RECV; + } + if (!(hints->caps & FI_SOURCE)) { cur->caps &= ~FI_SOURCE; + cur->rx_attr->caps &= ~FI_SOURCE; + } if (hints->mode & FI_BUFFERED_RECV) cur->mode |= FI_BUFFERED_RECV; @@ -256,7 +328,7 @@ static void rxm_alter_info(const struct fi_info *hints, struct fi_info *info) cur->domain_attr->data_progress = FI_PROGRESS_MANUAL; if (hints->ep_attr && hints->ep_attr->mem_tag_format && - (info->caps & FI_TAGGED)) { + (info->caps & (FI_TAGGED | FI_COLLECTIVE))) { FI_INFO(&rxm_prov, FI_LOG_CORE, "mem_tag_format requested: 0x%" PRIx64 " (note: provider doesn't optimize " @@ -266,7 +338,9 @@ static void rxm_alter_info(const struct fi_info *hints, struct fi_info *info) hints->ep_attr->mem_tag_format; } } - if (cur->domain_attr->data_progress == FI_PROGRESS_AUTO) + + if (cur->domain_attr->data_progress == FI_PROGRESS_AUTO || + force_auto_progress) cur->domain_attr->threading = FI_THREAD_SAFE; } } @@ -340,18 +414,46 @@ static int rxm_getinfo(uint32_t version, const char *node, const char *service, static void rxm_fini(void) { - /* yawn */ +#if HAVE_RXM_DL + ofi_mem_fini(); +#endif } struct fi_provider rxm_prov = { .name = OFI_UTIL_PREFIX "rxm", - .version = FI_VERSION(RXM_MAJOR_VERSION, RXM_MINOR_VERSION), - .fi_version = FI_VERSION(1, 8), + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, .getinfo = rxm_getinfo, .fabric = rxm_fabric, .cleanup = rxm_fini }; +static void 
rxm_get_def_wait(void) +{ + char *wait_str = NULL; + + fi_param_define(&rxm_prov, "def_wait_obj", FI_PARAM_STRING, + "Specifies the default wait object used for blocking " + "operations (e.g. fi_cq_sread). Supported values " + "are: fd and pollfd (default: fd)."); + + fi_param_define(&rxm_prov, "def_tcp_wait_obj", FI_PARAM_STRING, + "See def_wait_obj for description. If set, this " + "overrides the def_wait_obj when running over the " + "tcp provider."); + + fi_param_get_str(&rxm_prov, "def_wait_obj", &wait_str); + if (wait_str && !strcasecmp(wait_str, "pollfd")) + def_wait_obj = FI_WAIT_POLLFD; + + wait_str = NULL; + fi_param_get_str(&rxm_prov, "def_tcp_wait_obj", &wait_str); + if (wait_str) { + def_tcp_wait_obj = (!strcasecmp(wait_str, "pollfd")) ? + FI_WAIT_POLLFD : FI_WAIT_FD; + } +} + RXM_INI { fi_param_define(&rxm_prov, "buffer_size", FI_PARAM_SIZE_T, @@ -374,7 +476,7 @@ RXM_INI fi_param_define(&rxm_prov, "sar_limit", FI_PARAM_SIZE_T, "Set this environment variable to enable and control " "RxM SAR (Segmentation And Reassembly) protocol " - "(default: 256 KB). This value should be set greater than " + "(default: 128 KB). This value should be set greater than " " eager limit (FI_OFI_RXM_BUFFER_SIZE - RxM protocol " "header size (%zu B)) for SAR to take effect. Messages " "of size greater than this would be transmitted via " @@ -388,10 +490,10 @@ RXM_INI "latency as a side-effect."); fi_param_define(&rxm_prov, "tx_size", FI_PARAM_SIZE_T, - "Defines default tx context size (default: 1024)."); + "Defines default tx context size (default: 65536)."); fi_param_define(&rxm_prov, "rx_size", FI_PARAM_SIZE_T, - "Defines default rx context size (default: 1024)."); + "Defines default rx context size (default: 65536)."); fi_param_define(&rxm_prov, "msg_tx_size", FI_PARAM_SIZE_T, "Defines FI_EP_MSG tx size that would be requested " @@ -410,19 +512,60 @@ RXM_INI "decrease noise during cq polling, but may result in " "longer connection establishment times. (default: 10000)."); - fi_param_get_size_t(&rxm_prov, "tx_size", &rxm_info.tx_attr->size); - fi_param_get_size_t(&rxm_prov, "rx_size", &rxm_info.rx_attr->size); + fi_param_define(&rxm_prov, "cq_eq_fairness", FI_PARAM_INT, + "Defines the maximum number of message provider CQ entries " + "that can be consecutively read across progress calls " + "without checking to see if the CM progress interval has " + "been reached. (default: 128)."); + + fi_param_define(&rxm_prov, "data_auto_progress", FI_PARAM_BOOL, + "Force auto-progress for data transfers even if app " + "requested manual progress (default: false/no)."); + + fi_param_define(&rxm_prov, "use_rndv_write", FI_PARAM_BOOL, + "Set this environment variable to control the " + "RxM Rendezvous protocol. If set (1), RxM will use " + "RMA writes rather than RMA reads during Rendezvous " + "transactions. (default: false/no)."); + + fi_param_define(&rxm_prov, "enable_dyn_rbuf", FI_PARAM_BOOL, + "Enable support for dynamic receive buffering, if " + "available by the message endpoint provider. " + "This allows direct placement of received messages " + "into application buffers, bypassing RxM bounce " + "buffers. This feature targets using tcp sockets " + "for the message transport. (default: false)"); + + fi_param_define(&rxm_prov, "enable_direct_send", FI_PARAM_BOOL, + "Enable support to pass application buffers directly " + "to the core provider when possible. This avoids " + "copying application buffers through bounce buffers " + "before passing them to the core provider. 
This " + "feature targets small to medium size message " + "transfers over the tcp provider. (default: false)"); + + rxm_init_infos(); fi_param_get_size_t(&rxm_prov, "msg_tx_size", &rxm_msg_tx_size); fi_param_get_size_t(&rxm_prov, "msg_rx_size", &rxm_msg_rx_size); - fi_param_get_size_t(NULL, "universe_size", &rxm_def_univ_size); if (fi_param_get_int(&rxm_prov, "cm_progress_interval", (int *) &rxm_cm_progress_interval)) rxm_cm_progress_interval = 10000; - - if (rxm_init_info()) { - FI_WARN(&rxm_prov, FI_LOG_CORE, "Unable to initialize rxm_info\n"); - return NULL; - } + if (fi_param_get_int(&rxm_prov, "cq_eq_fairness", + (int *) &rxm_cq_eq_fairness)) + rxm_cq_eq_fairness = 128; + fi_param_get_bool(&rxm_prov, "data_auto_progress", &force_auto_progress); + fi_param_get_bool(&rxm_prov, "use_rndv_write", &rxm_use_write_rndv); + + rxm_get_def_wait(); + + if (force_auto_progress) + FI_INFO(&rxm_prov, FI_LOG_CORE, "auto-progress for data requested " + "(FI_OFI_RXM_DATA_AUTO_PROGRESS = 1), domain threading " + "level would be set to FI_THREAD_SAFE\n"); + +#if HAVE_RXM_DL + ofi_mem_init(); +#endif return &rxm_prov; } diff --git a/prov/rxm/src/rxm_rma.c b/prov/rxm/src/rxm_rma.c index 45e70dcc963..15c69af4048 100644 --- a/prov/rxm/src/rxm_rma.c +++ b/prov/rxm/src/rxm_rma.c @@ -1,5 +1,6 @@ /* - * Copyright (c) 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2017-2020 Intel Corporation. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -32,24 +33,20 @@ #include "rxm.h" -typedef ssize_t rxm_rma_msg_fn(struct fid_ep *ep_fid, - const struct fi_msg_rma *msg, uint64_t flags); -static inline ssize_t +static ssize_t rxm_ep_rma_reg_iov(struct rxm_ep *rxm_ep, const struct iovec *msg_iov, void **desc, void **desc_storage, size_t iov_count, - uint64_t comp_flags, struct rxm_rma_buf *rma_buf) + uint64_t access, struct rxm_rma_buf *rma_buf) { - size_t i; + size_t i, ret; if (!rxm_ep->msg_mr_local) return FI_SUCCESS; - if (!rxm_ep->rxm_mr_local) { - ssize_t ret = - rxm_ep_msg_mr_regv(rxm_ep, msg_iov, iov_count, - comp_flags & (FI_WRITE | FI_READ), - rma_buf->mr.mr); + if (!rxm_ep->rdm_mr_local) { + ret = rxm_msg_mr_regv(rxm_ep, msg_iov, iov_count, SIZE_MAX, + access, rma_buf->mr.mr); if (OFI_UNLIKELY(ret)) return ret; @@ -58,31 +55,34 @@ rxm_ep_rma_reg_iov(struct rxm_ep *rxm_ep, const struct iovec *msg_iov, rma_buf->mr.count = iov_count; } else { for (i = 0; i < iov_count; i++) - desc_storage[i] = fi_mr_desc(desc[i]); + desc_storage[i] = + fi_mr_desc(((struct rxm_mr *) desc[i])->msg_mr); } return FI_SUCCESS; } -static inline ssize_t -rxm_ep_rma_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, uint64_t flags, - rxm_rma_msg_fn rma_msg, uint64_t comp_flags) +static ssize_t +rxm_ep_rma_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, + uint64_t flags, ssize_t (*rma_msg)(struct fid_ep *ep_fid, + const struct fi_msg_rma *msg, uint64_t flags), + uint64_t comp_flags) { struct rxm_rma_buf *rma_buf; struct fi_msg_rma msg_rma = *msg; struct rxm_conn *rxm_conn; void *mr_desc[RXM_IOV_LIMIT] = { 0 }; - int ret; + ssize_t ret; assert(msg->rma_iov_count <= rxm_ep->rxm_info->tx_attr->rma_iov_limit); ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn); + ret = rxm_get_conn(rxm_ep, msg->addr, &rxm_conn); if (OFI_UNLIKELY(ret)) goto unlock; - rma_buf = rxm_rma_buf_alloc(rxm_ep); - if 
(OFI_UNLIKELY(!rma_buf)) { + rma_buf = ofi_buf_alloc(rxm_ep->buf_pools[RXM_BUF_POOL_RMA].pool); + if (!rma_buf) { ret = -FI_EAGAIN; goto unlock; } @@ -91,8 +91,8 @@ rxm_ep_rma_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, uint64_t rma_buf->flags = flags; ret = rxm_ep_rma_reg_iov(rxm_ep, msg_rma.msg_iov, msg_rma.desc, mr_desc, - msg_rma.iov_count, comp_flags & (FI_WRITE | FI_READ), - rma_buf); + msg_rma.iov_count, + comp_flags & (FI_WRITE | FI_READ), rma_buf); if (OFI_UNLIKELY(ret)) goto release; @@ -103,8 +103,8 @@ rxm_ep_rma_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, uint64_t if (OFI_LIKELY(!ret)) goto unlock; - if ((rxm_ep->msg_mr_local) && (!rxm_ep->rxm_mr_local)) - rxm_ep_msg_mr_closev(rma_buf->mr.mr, rma_buf->mr.count); + if ((rxm_ep->msg_mr_local) && (!rxm_ep->rdm_mr_local)) + rxm_msg_mr_closev(rma_buf->mr.mr, rma_buf->mr.count); release: ofi_buf_free(rma_buf); unlock: @@ -112,12 +112,13 @@ rxm_ep_rma_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, uint64_t return ret; } -static inline ssize_t -rxm_ep_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) +static ssize_t +rxm_ep_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, + uint64_t flags) { - struct rxm_ep *rxm_ep = - container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); return rxm_ep_rma_common(rxm_ep, msg, flags | rxm_ep->util_ep.tx_msg_flags, fi_readmsg, FI_READ); } @@ -144,7 +145,8 @@ static ssize_t rxm_ep_readv(struct fid_ep *ep_fid, const struct iovec *iov, .data = 0, }; - return rxm_ep_rma_common(rxm_ep, &msg, rxm_ep_tx_flags(rxm_ep), fi_readmsg, FI_READ); + return rxm_ep_rma_common(rxm_ep, &msg, rxm_ep->util_ep.tx_op_flags, + fi_readmsg, FI_READ); } static ssize_t rxm_ep_read(struct fid_ep *ep_fid, void *buf, size_t len, @@ -170,22 +172,33 @@ static ssize_t rxm_ep_read(struct fid_ep *ep_fid, void *buf, size_t len, .context = context, .data = 0, }; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; - return rxm_ep_rma_common(rxm_ep, &msg, rxm_ep_tx_flags(rxm_ep), fi_readmsg, FI_READ); + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + return rxm_ep_rma_common(rxm_ep, &msg, rxm_ep->util_ep.tx_op_flags, + fi_readmsg, FI_READ); } -static inline void +static void rxm_ep_format_rma_msg(struct rxm_rma_buf *rma_buf, const struct fi_msg_rma *orig_msg, struct iovec *rxm_iov, struct fi_msg_rma *rxm_msg) { + ssize_t ret __attribute__((unused)); + enum fi_hmem_iface iface; + uint64_t device; + + iface = rxm_mr_desc_to_hmem_iface_dev(orig_msg->desc, + orig_msg->iov_count, &device); + rxm_msg->context = rma_buf; rxm_msg->addr = orig_msg->addr; rxm_msg->data = orig_msg->data; - ofi_copy_from_iov(rma_buf->pkt.data, rma_buf->pkt.hdr.size, - orig_msg->msg_iov, orig_msg->iov_count, 0); + ret = ofi_copy_from_hmem_iov(rma_buf->pkt.data, rma_buf->pkt.hdr.size, + iface, device, orig_msg->msg_iov, + orig_msg->iov_count, 0); + assert(ret == rma_buf->pkt.hdr.size); + rxm_iov->iov_base = &rma_buf->pkt.data; rxm_iov->iov_len = rma_buf->pkt.hdr.size; rxm_msg->msg_iov = rxm_iov; @@ -196,9 +209,10 @@ rxm_ep_format_rma_msg(struct rxm_rma_buf *rma_buf, const struct fi_msg_rma *orig rxm_msg->rma_iov_count = orig_msg->rma_iov_count; } -static inline ssize_t -rxm_ep_rma_emulate_inject_msg(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, size_t total_size, - const struct fi_msg_rma *msg, uint64_t flags) 
+static ssize_t +rxm_ep_rma_emulate_inject_msg(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, + size_t total_size, const struct fi_msg_rma *msg, + uint64_t flags) { struct rxm_rma_buf *rma_buf; ssize_t ret; @@ -207,8 +221,8 @@ rxm_ep_rma_emulate_inject_msg(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, assert(msg->rma_iov_count <= rxm_ep->rxm_info->tx_attr->rma_iov_limit); - rma_buf = rxm_rma_buf_alloc(rxm_ep); - if (OFI_UNLIKELY(!rma_buf)) + rma_buf = ofi_buf_alloc(rxm_ep->buf_pools[RXM_BUF_POOL_RMA].pool); + if (!rma_buf) return -FI_EAGAIN; rma_buf->pkt.hdr.size = total_size; @@ -227,7 +241,7 @@ rxm_ep_rma_emulate_inject_msg(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, return ret; } -static inline ssize_t +static ssize_t rxm_ep_rma_emulate_inject(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key, @@ -256,8 +270,9 @@ rxm_ep_rma_emulate_inject(struct rxm_ep *rxm_ep, struct rxm_conn *rxm_conn, return rxm_ep_rma_emulate_inject_msg(rxm_ep, rxm_conn, len, &msg, flags); } -static inline ssize_t -rxm_ep_rma_inject_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, uint64_t flags) +static ssize_t +rxm_ep_rma_inject_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, + uint64_t flags) { struct rxm_conn *rxm_conn; size_t total_size = ofi_total_iov_len(msg->msg_iov, msg->iov_count); @@ -267,15 +282,16 @@ rxm_ep_rma_inject_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, ui ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, msg->addr, &rxm_conn); + ret = rxm_get_conn(rxm_ep, msg->addr, &rxm_conn); if (OFI_UNLIKELY(ret)) goto unlock; if ((total_size > rxm_ep->msg_info->tx_attr->inject_size) || + rxm_ep->util_ep.wr_cntr || (flags & FI_COMPLETION) || (msg->iov_count > 1) || (msg->rma_iov_count > 1)) { - ret = rxm_ep_rma_emulate_inject_msg(rxm_ep, rxm_conn, total_size, - msg, flags); + ret = rxm_ep_rma_emulate_inject_msg(rxm_ep, rxm_conn, + total_size, msg, flags); goto unlock; } @@ -292,27 +308,23 @@ rxm_ep_rma_inject_common(struct rxm_ep *rxm_ep, const struct fi_msg_rma *msg, ui msg->rma_iov->addr, msg->rma_iov->key); } - if (OFI_LIKELY(!ret)) { - ofi_ep_wr_cntr_inc(&rxm_ep->util_ep); - } else { - if (OFI_LIKELY(ret == -FI_EAGAIN)) - rxm_ep_do_progress(&rxm_ep->util_ep); - else - FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "fi_inject_write* for" - "MSG provider failed: %zd\n", ret); - } + if (ret == -FI_EAGAIN) + rxm_ep_do_progress(&rxm_ep->util_ep); + else if (ret) + FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "fi_inject_write* for" + "MSG provider failed: %zd\n", ret); unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; } -static inline ssize_t +static ssize_t rxm_ep_generic_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) { - struct rxm_ep *rxm_ep = - container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); if (flags & FI_INJECT) return rxm_ep_rma_inject_common(rxm_ep, msg, flags); else @@ -320,13 +332,15 @@ rxm_ep_generic_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, fi_writemsg, FI_WRITE); } -static inline ssize_t -rxm_ep_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) +static ssize_t +rxm_ep_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, + uint64_t flags) { - struct rxm_ep *rxm_ep = - container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; - return 
rxm_ep_generic_writemsg(ep_fid, msg, flags | rxm_ep->util_ep.tx_msg_flags); + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + return rxm_ep_generic_writemsg(ep_fid, msg, flags | + rxm_ep->util_ep.tx_msg_flags); } static ssize_t rxm_ep_writev(struct fid_ep *ep_fid, const struct iovec *iov, @@ -348,10 +362,11 @@ static ssize_t rxm_ep_writev(struct fid_ep *ep_fid, const struct iovec *iov, .context = context, .data = 0, }; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; - return rxm_ep_generic_writemsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep)); + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + return rxm_ep_generic_writemsg(ep_fid, &msg, + rxm_ep->util_ep.tx_op_flags); } static ssize_t rxm_ep_writedata(struct fid_ep *ep_fid, const void *buf, @@ -378,10 +393,10 @@ static ssize_t rxm_ep_writedata(struct fid_ep *ep_fid, const void *buf, .context = context, .data = data, }; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; - return rxm_ep_generic_writemsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep) | + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + return rxm_ep_generic_writemsg(ep_fid, &msg, rxm_ep->util_ep.tx_op_flags | FI_REMOTE_CQ_DATA); } @@ -408,44 +423,41 @@ static ssize_t rxm_ep_write(struct fid_ep *ep_fid, const void *buf, .context = context, .data = 0, }; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; - return rxm_ep_generic_writemsg(ep_fid, &msg, rxm_ep_tx_flags(rxm_ep)); + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); + return rxm_ep_generic_writemsg(ep_fid, &msg, rxm_ep->util_ep.tx_op_flags); } static ssize_t rxm_ep_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { - ssize_t ret; struct rxm_conn *rxm_conn; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + struct rxm_ep *rxm_ep; + ssize_t ret; + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); if (OFI_UNLIKELY(ret)) goto unlock; - if (len > rxm_ep->msg_info->tx_attr->inject_size) { - ret = rxm_ep_rma_emulate_inject( - rxm_ep, rxm_conn, buf, len, 0, - dest_addr, addr, key, FI_INJECT); + if (len > rxm_ep->msg_info->tx_attr->inject_size || + rxm_ep->util_ep.wr_cntr) { + ret = rxm_ep_rma_emulate_inject(rxm_ep, rxm_conn, buf, len, 0, + dest_addr, addr, key, + FI_INJECT); goto unlock; } ret = fi_inject_write(rxm_conn->msg_ep, buf, len, dest_addr, addr, key); - if (OFI_LIKELY(!ret)) { - ofi_ep_wr_cntr_inc(&rxm_ep->util_ep); - } else { - if (OFI_LIKELY(ret == -FI_EAGAIN)) - rxm_ep_do_progress(&rxm_ep->util_ep); - else - FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "fi_inject_write for" - " MSG provider failed: %zd\n", ret); - } + if (ret == -FI_EAGAIN) + rxm_ep_do_progress(&rxm_ep->util_ep); + else if (ret) + FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "fi_inject_write for" + " MSG provider failed: %zd\n", ret); unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; @@ -456,17 +468,19 @@ static ssize_t rxm_ep_inject_writedata(struct fid_ep *ep_fid, const void *buf, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { - ssize_t ret; struct rxm_conn *rxm_conn; - struct rxm_ep *rxm_ep = container_of(ep_fid, struct rxm_ep, - util_ep.ep_fid.fid); + 
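/*
 * Illustrative sketch, not part of this patch: the inject paths above and
 * below share one decision. The write goes straight to the MSG provider's
 * inject only when it fits the provider's inject_size and no write counter
 * is bound (with a counter bound, the emulated path is used, apparently so
 * the counter is updated through the normal completion path now that the
 * explicit ofi_ep_wr_cntr_inc() call has been removed). Roughly:
 *
 *	if (len > rxm_ep->msg_info->tx_attr->inject_size ||
 *	    rxm_ep->util_ep.wr_cntr)
 *		return rxm_ep_rma_emulate_inject(rxm_ep, rxm_conn, buf, len,
 *						 0, dest_addr, addr, key,
 *						 FI_INJECT);
 *	ret = fi_inject_write(rxm_conn->msg_ep, buf, len, dest_addr, addr, key);
 *	if (ret == -FI_EAGAIN)
 *		rxm_ep_do_progress(&rxm_ep->util_ep);
 *
 * All names here come from the surrounding hunks; nothing new is introduced.
 */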
struct rxm_ep *rxm_ep; + ssize_t ret; + + rxm_ep = container_of(ep_fid, struct rxm_ep, util_ep.ep_fid.fid); ofi_ep_lock_acquire(&rxm_ep->util_ep); - ret = rxm_ep_prepare_tx(rxm_ep, dest_addr, &rxm_conn); + ret = rxm_get_conn(rxm_ep, dest_addr, &rxm_conn); if (OFI_UNLIKELY(ret)) goto unlock; - if (len > rxm_ep->msg_info->tx_attr->inject_size) { + if (len > rxm_ep->msg_info->tx_attr->inject_size || + rxm_ep->util_ep.wr_cntr) { ret = rxm_ep_rma_emulate_inject( rxm_ep, rxm_conn, buf, len, data, dest_addr, addr, key, FI_REMOTE_CQ_DATA | FI_INJECT); @@ -475,15 +489,11 @@ static ssize_t rxm_ep_inject_writedata(struct fid_ep *ep_fid, const void *buf, ret = fi_inject_writedata(rxm_conn->msg_ep, buf, len, data, dest_addr, addr, key); - if (OFI_LIKELY(!ret)) { - ofi_ep_wr_cntr_inc(&rxm_ep->util_ep); - } else { - if (OFI_LIKELY(ret == -FI_EAGAIN)) - rxm_ep_do_progress(&rxm_ep->util_ep); - else - FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "fi_inject_writedata" - " for MSG provider failed: %zd\n", ret); - } + if (ret == -FI_EAGAIN) + rxm_ep_do_progress(&rxm_ep->util_ep); + else if (ret) + FI_WARN(&rxm_prov, FI_LOG_EP_DATA, "fi_inject_writedata" + " for MSG provider failed: %zd\n", ret); unlock: ofi_ep_lock_release(&rxm_ep->util_ep); return ret; diff --git a/prov/shm/Makefile.include b/prov/shm/Makefile.include index 84285be4bac..fbd5ac57d0c 100644 --- a/prov/shm/Makefile.include +++ b/prov/shm/Makefile.include @@ -13,6 +13,7 @@ _shm_files = \ prov/shm/src/smr_fabric.c \ prov/shm/src/smr_init.c \ prov/shm/src/smr_av.c \ + prov/shm/src/smr_signal.h \ prov/shm/src/smr.h if HAVE_SHM_DL diff --git a/prov/shm/configure.m4 b/prov/shm/configure.m4 index e5cace3e22d..61b8d46d5a2 100644 --- a/prov/shm/configure.m4 +++ b/prov/shm/configure.m4 @@ -14,9 +14,12 @@ AC_DEFUN([FI_SHM_CONFIGURE],[ AS_IF([test x"$enable_shm" != x"no"], [ # check if CMA support are present - AC_CHECK_FUNC([process_vm_readv], - [cma_happy=1], - [cma_happy=0]) + AS_IF([test x$linux = x1 && test x$host_cpu = xx86_64], + [cma_happy=1], + [AC_CHECK_FUNC([process_vm_readv], + [cma_happy=1], + [cma_happy=0])] + ) # check if SHM support are present AC_CHECK_FUNC([shm_open], diff --git a/prov/shm/src/smr.h b/prov/shm/src/smr.h index 8b865413276..1800a81c025 100644 --- a/prov/shm/src/smr.h +++ b/prov/shm/src/smr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2015-2021 Intel Corporation, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -35,6 +35,7 @@ #endif /* HAVE_CONFIG_H */ #include +#include #include #include #include @@ -57,16 +58,19 @@ #include #include #include +#include #include #include +#include #ifndef _SMR_H_ #define _SMR_H_ +struct smr_env { + size_t sar_threshold; +}; -#define SMR_MAJOR_VERSION 1 -#define SMR_MINOR_VERSION 1 - +extern struct smr_env smr_env; extern struct fi_provider smr_prov; extern struct fi_info smr_info; extern struct util_prov smr_util_prov; @@ -77,8 +81,14 @@ int smr_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, struct smr_av { struct util_av util_av; struct smr_map *smr_map; + size_t used; }; +static inline int64_t smr_addr_lookup(struct util_av *av, fi_addr_t fiaddr) +{ + return *((int64_t *) ofi_av_get_addr(av, fiaddr)); +} + int smr_domain_open(struct fid_fabric *fabric, struct fi_info *info, struct fid_domain **dom, void *context); @@ -93,36 +103,64 @@ int smr_query_atomic(struct fid_domain *domain, enum fi_datatype datatype, #define SMR_IOV_LIMIT 4 -struct smr_ep_entry { +struct smr_rx_entry { struct dlist_entry entry; void *context; - fi_addr_t addr; + int64_t peer_id; uint64_t tag; uint64_t ignore; struct iovec iov[SMR_IOV_LIMIT]; uint32_t iov_count; uint16_t flags; uint64_t err; + enum fi_hmem_iface iface; + uint64_t device; +}; + +struct smr_tx_entry { + struct smr_cmd cmd; + int64_t peer_id; + void *context; + struct iovec iov[SMR_IOV_LIMIT]; + uint32_t iov_count; + size_t bytes_done; + int next; + void *map_ptr; + struct smr_ep_name *map_name; + enum fi_hmem_iface iface; + uint64_t device; + int fd; +}; + +struct smr_sar_entry { + struct dlist_entry entry; + struct smr_cmd cmd; + struct smr_rx_entry rx_entry; + size_t bytes_done; + int next; + struct iovec iov[SMR_IOV_LIMIT]; + size_t iov_count; + enum fi_hmem_iface iface; + uint64_t device; }; struct smr_ep; typedef int (*smr_rx_comp_func)(struct smr_ep *ep, void *context, uint32_t op, - uint16_t flags, size_t len, void *buf, void *addr, + uint16_t flags, size_t len, void *buf, fi_addr_t addr, uint64_t tag, uint64_t data, uint64_t err); typedef int (*smr_tx_comp_func)(struct smr_ep *ep, void *context, uint32_t op, uint16_t flags, uint64_t err); struct smr_match_attr { - fi_addr_t addr; + int64_t id; uint64_t tag; uint64_t ignore; }; -static inline int smr_match_addr(fi_addr_t addr, fi_addr_t match_addr) +static inline int smr_match_id(int64_t id, int64_t match_id) { - return (addr == FI_ADDR_UNSPEC) || (match_addr == FI_ADDR_UNSPEC) || - (addr == match_addr); + return (id == -1) || (match_id == -1) || (id == match_id); } static inline int smr_match_tag(uint64_t tag, uint64_t ignore, uint64_t match_tag) @@ -130,14 +168,27 @@ static inline int smr_match_tag(uint64_t tag, uint64_t ignore, uint64_t match_ta return ((tag | ignore) == (match_tag | ignore)); } +static inline enum fi_hmem_iface smr_get_mr_hmem_iface(struct util_domain *domain, + void **desc, uint64_t *device) +{ + if (!(domain->mr_mode & FI_MR_HMEM) || !desc || !*desc) { + *device = 0; + return FI_HMEM_SYSTEM; + } + + *device = ((struct ofi_mr *) *desc)->device; + return ((struct ofi_mr *) *desc)->iface; +} + struct smr_unexp_msg { struct dlist_entry entry; struct smr_cmd cmd; }; -DECLARE_FREESTACK(struct smr_ep_entry, smr_recv_fs); -DECLARE_FREESTACK(struct smr_unexp_msg, smr_unexp_fs); -DECLARE_FREESTACK(struct smr_cmd, smr_pend_fs); +OFI_DECLARE_FREESTACK(struct smr_rx_entry, smr_recv_fs); +OFI_DECLARE_FREESTACK(struct smr_unexp_msg, smr_unexp_fs); +OFI_DECLARE_FREESTACK(struct 
smr_tx_entry, smr_pend_fs); +OFI_DECLARE_FREESTACK(struct smr_sar_entry, smr_sar_fs); struct smr_queue { struct dlist_entry list; @@ -159,6 +210,8 @@ struct smr_domain { #define SMR_PREFIX "fi_shm://" #define SMR_PREFIX_NS "fi_ns://" +#define SMR_ZE_SOCK_PATH "/dev/shm/ze_" + static inline const char *smr_no_prefix(const char *addr) { char *start; @@ -172,6 +225,46 @@ static inline const char *smr_no_prefix(const char *addr) #define smr_fast_rma_enabled(mode, order) ((mode & FI_MR_VIRT_ADDR) && \ !(order & SMR_RMA_ORDER)) +static inline uint64_t smr_get_offset(void *base, void *addr) +{ + return (uintptr_t) ((char *) addr - (char *) base); +} + +static inline void *smr_get_ptr(void *base, uint64_t offset) +{ + return (char *) base + (uintptr_t) offset; +} + +extern struct dlist_entry sock_name_list; +extern pthread_mutex_t sock_list_lock; + +struct smr_sock_name { + char name[SMR_SOCK_NAME_MAX]; + struct dlist_entry entry; +}; + +enum smr_cmap_state { + SMR_CMAP_INIT = 0, + SMR_CMAP_SUCCESS, + SMR_CMAP_FAILED, +}; + +struct smr_cmap_entry { + enum smr_cmap_state state; + int device_fds[ZE_MAX_DEVICES]; +}; + +struct smr_sock_info { + char name[SMR_SOCK_NAME_MAX]; + int listen_sock; + ofi_epoll_t epollfd; + struct fd_signal signal; + pthread_t listener_thread; + int *my_fds; + int nfds; + struct smr_cmap_entry peers[SMR_MAX_PEERS]; +}; + struct smr_ep { struct util_ep util_ep; smr_rx_comp_func rx_comp; @@ -180,45 +273,77 @@ struct smr_ep { size_t rx_size; size_t min_multi_recv_size; const char *name; - struct smr_region *region; + uint64_t msg_id; + struct smr_region *volatile region; struct smr_recv_fs *recv_fs; /* protected by rx_cq lock */ struct smr_queue recv_queue; struct smr_queue trecv_queue; struct smr_unexp_fs *unexp_fs; struct smr_pend_fs *pend_fs; - struct smr_queue unexp_queue; + struct smr_sar_fs *sar_fs; + struct smr_queue unexp_msg_queue; + struct smr_queue unexp_tagged_queue; + struct dlist_entry sar_list; + + int ep_idx; + struct smr_sock_info *sock_info; }; #define smr_ep_rx_flags(smr_ep) ((smr_ep)->util_ep.rx_op_flags) #define smr_ep_tx_flags(smr_ep) ((smr_ep)->util_ep.tx_op_flags) +static inline int smr_mmap_name(char *shm_name, const char *ep_name, + uint64_t msg_id) +{ + return snprintf(shm_name, SMR_NAME_MAX - 1, "%s_%ld", + ep_name, msg_id); +} + int smr_endpoint(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); +void smr_ep_exchange_fds(struct smr_ep *ep, int64_t id); int smr_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq_fid, void *context); int smr_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, struct fid_cntr **cntr_fid, void *context); -int smr_verify_peer(struct smr_ep *ep, int peer_id); - -void smr_post_pend_resp(struct smr_cmd *cmd, struct smr_cmd *pend, - struct smr_resp *resp); -void smr_generic_format(struct smr_cmd *cmd, fi_addr_t peer_id, - uint32_t op, uint64_t tag, uint8_t datatype, uint8_t atomic_op, - uint64_t data, uint64_t op_flags); -void smr_format_inline(struct smr_cmd *cmd, fi_addr_t peer_id, - const struct iovec *iov, size_t count, - uint32_t op, uint64_t tag, uint64_t data, uint64_t op_flags); -void smr_format_inject(struct smr_cmd *cmd, fi_addr_t peer_id, - const struct iovec *iov, size_t count, - uint32_t op, uint64_t tag, uint64_t data, uint64_t op_flags, - struct smr_region *smr, struct smr_inject_buf *tx_buf); -void smr_format_iov(struct smr_cmd *cmd, fi_addr_t peer_id, - const struct iovec *iov, size_t count, size_t total_len, - uint32_t op, uint64_t 
tag, uint64_t data, uint64_t op_flags, - void *context, struct smr_region *smr, struct smr_resp *resp, - struct smr_cmd *pend); +int64_t smr_verify_peer(struct smr_ep *ep, fi_addr_t fi_addr); + +void smr_format_pend_resp(struct smr_tx_entry *pend, struct smr_cmd *cmd, + void *context, enum fi_hmem_iface iface, uint64_t device, + const struct iovec *iov, uint32_t iov_count, + int64_t id, struct smr_resp *resp); +void smr_generic_format(struct smr_cmd *cmd, int64_t peer_id, uint32_t op, + uint64_t tag, uint64_t data, uint64_t op_flags); +void smr_format_inline(struct smr_cmd *cmd, enum fi_hmem_iface iface, + uint64_t device, const struct iovec *iov, size_t count); +void smr_format_inject(struct smr_cmd *cmd, enum fi_hmem_iface iface, uint64_t device, + const struct iovec *iov, size_t count, + struct smr_region *smr, struct smr_inject_buf *tx_buf); +void smr_format_iov(struct smr_cmd *cmd, const struct iovec *iov, size_t count, + size_t total_len, struct smr_region *smr, + struct smr_resp *resp); +int smr_format_ze_ipc(struct smr_ep *ep, int64_t id, struct smr_cmd *cmd, + const struct iovec *iov, uint64_t device, + size_t total_len, struct smr_region *smr, + struct smr_resp *resp, struct smr_tx_entry *pend); +int smr_format_mmap(struct smr_ep *ep, struct smr_cmd *cmd, + const struct iovec *iov, size_t count, size_t total_len, + struct smr_tx_entry *pend, struct smr_resp *resp); +void smr_format_sar(struct smr_cmd *cmd, enum fi_hmem_iface iface, uint64_t device, + const struct iovec *iov, size_t count, + size_t total_len, struct smr_region *smr, + struct smr_region *peer_smr, struct smr_sar_msg *sar_msg, + struct smr_tx_entry *pending, struct smr_resp *resp); +size_t smr_copy_to_sar(struct smr_sar_msg *sar_msg, struct smr_resp *resp, + struct smr_cmd *cmd, enum fi_hmem_iface iface, + uint64_t device, const struct iovec *iov, size_t count, + size_t *bytes_done, int *next); +size_t smr_copy_from_sar(struct smr_sar_msg *sar_msg, struct smr_resp *resp, + struct smr_cmd *cmd, enum fi_hmem_iface iface, + uint64_t device, const struct iovec *iov, size_t count, + size_t *bytes_done, int *next); int smr_complete_tx(struct smr_ep *ep, void *context, uint32_t op, uint16_t flags, uint64_t err); @@ -227,24 +352,71 @@ int smr_tx_comp(struct smr_ep *ep, void *context, uint32_t op, int smr_tx_comp_signal(struct smr_ep *ep, void *context, uint32_t op, uint16_t flags, uint64_t err); int smr_complete_rx(struct smr_ep *ep, void *context, uint32_t op, - uint16_t flags, size_t len, void *buf, void *addr, + uint16_t flags, size_t len, void *buf, int64_t id, uint64_t tag, uint64_t data, uint64_t err); int smr_rx_comp(struct smr_ep *ep, void *context, uint32_t op, - uint16_t flags, size_t len, void *buf, void *addr, + uint16_t flags, size_t len, void *buf, fi_addr_t addr, uint64_t tag, uint64_t data, uint64_t err); int smr_rx_src_comp(struct smr_ep *ep, void *context, uint32_t op, - uint16_t flags, size_t len, void *buf, void *addr, + uint16_t flags, size_t len, void *buf, fi_addr_t addr, uint64_t tag, uint64_t data, uint64_t err); int smr_rx_comp_signal(struct smr_ep *ep, void *context, uint32_t op, - uint16_t flags, size_t len, void *buf, void *addr, + uint16_t flags, size_t len, void *buf, fi_addr_t addr, uint64_t tag, uint64_t data, uint64_t err); int smr_rx_src_comp_signal(struct smr_ep *ep, void *context, uint32_t op, - uint16_t flags, size_t len, void *buf, void *addr, + uint16_t flags, size_t len, void *buf, fi_addr_t addr, uint64_t tag, uint64_t data, uint64_t err); uint64_t smr_rx_cq_flags(uint32_t op, uint16_t
op_flags); void smr_ep_progress(struct util_ep *util_ep); -int smr_progress_unexp(struct smr_ep *ep, struct smr_ep_entry *entry); + +static inline bool smr_cma_enabled(struct smr_ep *ep, + struct smr_region *peer_smr) +{ + if (ep->region == peer_smr) + return ep->region->cma_cap_self == SMR_CMA_CAP_ON; + else + return ep->region->cma_cap_peer == SMR_CMA_CAP_ON; +} + +static inline bool smr_ze_ipc_enabled(struct smr_region *smr, + struct smr_region *peer_smr) +{ + return (smr->flags & SMR_FLAG_IPC_SOCK) && + (peer_smr->flags & SMR_FLAG_IPC_SOCK); +} + +static inline int smr_cma_loop(pid_t pid, struct iovec *local, + unsigned long local_cnt, struct iovec *remote, + unsigned long remote_cnt, unsigned long flags, + size_t total, bool write) +{ + ssize_t ret; + + while (1) { + if (write) + ret = ofi_process_vm_writev(pid, local, local_cnt, remote, + remote_cnt, flags); + else + ret = ofi_process_vm_readv(pid, local, local_cnt, remote, + remote_cnt, flags); + if (ret < 0) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "CMA error %d\n", errno); + return -FI_EIO; + } + + total -= ret; + if (!total) + return FI_SUCCESS; + + ofi_consume_iov(local, &local_cnt, (size_t) ret); + ofi_consume_iov(remote, &remote_cnt, (size_t) ret); + } +} + +int smr_progress_unexp_queue(struct smr_ep *ep, struct smr_rx_entry *entry, + struct smr_queue *unexp_queue); #endif diff --git a/prov/shm/src/smr_atomic.c b/prov/shm/src/smr_atomic.c index 2d1de2db4c6..14ff545f430 100644 --- a/prov/shm/src/smr_atomic.c +++ b/prov/shm/src/smr_atomic.c @@ -35,6 +35,7 @@ #include #include "ofi_iov.h" +#include "ofi_hmem.h" #include "smr.h" @@ -45,131 +46,83 @@ static void smr_format_rma_ioc(struct smr_cmd *cmd, const struct fi_rma_ioc *rma memcpy(cmd->rma.rma_ioc, rma_ioc, sizeof(*rma_ioc) * ioc_count); } -static void smr_format_inline_atomic(struct smr_cmd *cmd, fi_addr_t peer_id, +static void smr_generic_atomic_format(struct smr_cmd *cmd, uint8_t datatype, + uint8_t atomic_op) +{ + cmd->msg.hdr.datatype = datatype; + cmd->msg.hdr.atomic_op = atomic_op; +} + +static void smr_format_inline_atomic(struct smr_cmd *cmd, + enum fi_hmem_iface iface, uint64_t device, const struct iovec *iov, size_t count, const struct iovec *compv, - size_t comp_count, uint32_t op, - enum fi_datatype datatype, - enum fi_op atomic_op, uint64_t op_flags) + size_t comp_count) { size_t comp_size; - smr_generic_format(cmd, peer_id, op, 0, datatype, - atomic_op, 0, op_flags); cmd->msg.hdr.op_src = smr_src_inline; - switch (op) { + + switch (cmd->msg.hdr.op) { case ofi_op_atomic: case ofi_op_atomic_fetch: - cmd->msg.hdr.size = ofi_copy_from_iov(cmd->msg.data.msg, - SMR_MSG_DATA_LEN, iov, count, 0); + cmd->msg.hdr.size = ofi_copy_from_hmem_iov(cmd->msg.data.msg, + SMR_MSG_DATA_LEN, iface, device, + iov, count, 0); break; case ofi_op_atomic_compare: - cmd->msg.hdr.size = ofi_copy_from_iov(cmd->msg.data.buf, - SMR_MSG_DATA_LEN, iov, count, 0); - comp_size = ofi_copy_from_iov(cmd->msg.data.comp, - SMR_MSG_DATA_LEN, compv, - comp_count, 0); + cmd->msg.hdr.size = ofi_copy_from_hmem_iov(cmd->msg.data.buf, + SMR_MSG_DATA_LEN, iface, device, + iov, count, 0); + comp_size = ofi_copy_from_hmem_iov(cmd->msg.data.comp, + SMR_MSG_DATA_LEN, iface, device, + compv, comp_count, 0); if (comp_size != cmd->msg.hdr.size) FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "atomic and compare buffer size mimatch\n"); + "atomic and compare buffer size mismatch\n"); break; default: break; } } -static void smr_format_inject_atomic(struct smr_cmd *cmd, fi_addr_t peer_id, - const struct iovec *iov, size_t count, 
- const struct iovec *resultv, - size_t result_count, - const struct iovec *compv, - size_t comp_count, - uint32_t op, enum fi_datatype datatype, - enum fi_op atomic_op, - struct smr_region *smr, - struct smr_inject_buf *tx_buf, uint64_t op_flags) +static void smr_format_inject_atomic(struct smr_cmd *cmd, + enum fi_hmem_iface iface, uint64_t device, + const struct iovec *iov, size_t count, + const struct iovec *resultv, size_t result_count, + const struct iovec *compv, size_t comp_count, + struct smr_region *smr, struct smr_inject_buf *tx_buf) { size_t comp_size; - smr_generic_format(cmd, peer_id, op, 0, datatype, - atomic_op, 0, op_flags); cmd->msg.hdr.op_src = smr_src_inject; - cmd->msg.hdr.src_data = (char **) tx_buf - (char **) smr; + cmd->msg.hdr.src_data = smr_get_offset(smr, tx_buf); - switch (op) { + switch (cmd->msg.hdr.op) { case ofi_op_atomic: case ofi_op_atomic_fetch: - if (atomic_op == FI_ATOMIC_READ) + if (cmd->msg.hdr.atomic_op == FI_ATOMIC_READ) cmd->msg.hdr.size = ofi_total_iov_len(resultv, result_count); else - cmd->msg.hdr.size = ofi_copy_from_iov(tx_buf->data, - SMR_INJECT_SIZE, iov, count, 0); + cmd->msg.hdr.size = ofi_copy_from_hmem_iov(tx_buf->data, + SMR_INJECT_SIZE, iface, device, + iov, count, 0); break; case ofi_op_atomic_compare: - cmd->msg.hdr.size = ofi_copy_from_iov(tx_buf->buf, - SMR_COMP_INJECT_SIZE, iov, count, 0); - comp_size = ofi_copy_from_iov(tx_buf->comp, SMR_COMP_INJECT_SIZE, - compv, comp_count, 0); + cmd->msg.hdr.size = ofi_copy_from_hmem_iov(tx_buf->buf, + SMR_COMP_INJECT_SIZE, iface, device, + iov, count, 0); + comp_size = ofi_copy_from_hmem_iov(tx_buf->comp, SMR_COMP_INJECT_SIZE, + iface, device, compv, comp_count, 0); if (comp_size != cmd->msg.hdr.size) FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "atomic and compare buffer size mimatch\n"); + "atomic and compare buffer size mismatch\n"); break; default: break; } } -static int smr_fetch_result(struct smr_ep *ep, struct smr_region *peer_smr, - struct iovec *iov, size_t iov_count, - const struct fi_rma_ioc *rma_ioc, size_t rma_count, - enum fi_datatype datatype, size_t total_len) -{ - int ret, i; - struct iovec rma_iov[SMR_IOV_LIMIT]; - - for (i = 0; i < rma_count; i++) { - rma_iov[i].iov_base = (void *) rma_ioc[i].addr; - rma_iov[i].iov_len = rma_ioc[i].count * ofi_datatype_size(datatype); - } - - ret = process_vm_readv(peer_smr->pid, iov, iov_count, - rma_iov, rma_count, 0); - if (ret != total_len) { - if (ret < 0) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "CMA write error\n"); - return -errno; - } else { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "partial read occurred\n"); - return -FI_EIO; - } - } - - return 0; -} - -static void smr_post_fetch_resp(struct smr_ep *ep, struct smr_cmd *cmd, - const struct iovec *result_iov, size_t count) -{ - struct smr_cmd *pend; - struct smr_resp *resp; - - assert(!ofi_cirque_isfull(smr_resp_queue(ep->region))); - resp = ofi_cirque_tail(smr_resp_queue(ep->region)); - - cmd->msg.hdr.data = (uint64_t) ((char **) resp - - (char **) ep->region); - - pend = freestack_pop(ep->pend_fs); - smr_post_pend_resp(cmd, pend, resp); - memcpy(pend->msg.data.iov, result_iov, - sizeof(*result_iov) * count); - pend->msg.data.iov_count = count; - - ofi_cirque_commit(smr_resp_queue(ep->region)); -} - static ssize_t smr_generic_atomic(struct smr_ep *ep, const struct fi_ioc *ioc, void **desc, size_t count, const struct fi_ioc *compare_ioc, void **compare_desc, @@ -180,33 +133,36 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep, enum fi_op atomic_op, void *context, uint32_t op, uint64_t 
op_flags) { - struct smr_domain *domain; struct smr_region *peer_smr; struct smr_inject_buf *tx_buf; + struct smr_tx_entry *pend; + struct smr_resp *resp = NULL; struct smr_cmd *cmd; struct iovec iov[SMR_IOV_LIMIT]; struct iovec compare_iov[SMR_IOV_LIMIT]; struct iovec result_iov[SMR_IOV_LIMIT]; - int peer_id, err = 0; + enum fi_hmem_iface iface; + uint64_t device; + int64_t id, peer_id; + int err = 0; uint16_t flags = 0; ssize_t ret = 0; - size_t msg_len, total_len; + size_t total_len; assert(count <= SMR_IOV_LIMIT); assert(result_count <= SMR_IOV_LIMIT); assert(compare_count <= SMR_IOV_LIMIT); assert(rma_count <= SMR_IOV_LIMIT); - domain = container_of(ep->util_ep.domain, struct smr_domain, util_domain); + id = smr_verify_peer(ep, addr); + if (id < 0) + return -FI_EAGAIN; - peer_id = (int) addr; - ret = smr_verify_peer(ep, peer_id); - if(ret) - return ret; + peer_id = smr_peer_data(ep->region)[id].addr.id; + peer_smr = smr_peer_region(ep->region, id); - peer_smr = smr_peer_region(ep->region, peer_id); fastlock_acquire(&peer_smr->lock); - if (peer_smr->cmd_cnt < 2) { + if (peer_smr->cmd_cnt < 2 || smr_peer_data(ep->region)[id].sar_status) { ret = -FI_EAGAIN; goto unlock_region; } @@ -217,10 +173,9 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep, goto unlock_cq; } - cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr)); - msg_len = total_len = ofi_datatype_size(datatype) * - ofi_total_ioc_cnt(ioc, count); - + cmd = ofi_cirque_next(smr_cmd_queue(peer_smr)); + total_len = ofi_datatype_size(datatype) * ofi_total_ioc_cnt(ioc, count); + switch (op) { case ofi_op_atomic_compare: assert(compare_ioc); @@ -232,8 +187,7 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep, assert(result_ioc); ofi_ioc_to_iov(result_ioc, result_iov, result_count, ofi_datatype_size(datatype)); - if (!domain->fast_rma) - flags |= SMR_RMA_REQ; + flags |= SMR_RMA_REQ; /* fall through */ case ofi_op_atomic: if (atomic_op != FI_ATOMIC_READ) { @@ -247,16 +201,33 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep, break; } - if (total_len <= SMR_MSG_DATA_LEN && !(flags & SMR_RMA_REQ)) { - smr_format_inline_atomic(cmd, smr_peer_addr(ep->region)[peer_id].addr, - iov, count, compare_iov, compare_count, - op, datatype, atomic_op, op_flags); + iface = smr_get_mr_hmem_iface(ep->util_ep.domain, desc, &device); + + smr_generic_format(cmd, peer_id, op, 0, 0, op_flags); + smr_generic_atomic_format(cmd, datatype, atomic_op); + + if (total_len <= SMR_MSG_DATA_LEN && !(flags & SMR_RMA_REQ) && + !(op_flags & FI_DELIVERY_COMPLETE)) { + smr_format_inline_atomic(cmd, iface, device, iov, count, compare_iov, + compare_count); } else if (total_len <= SMR_INJECT_SIZE) { tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr)); - smr_format_inject_atomic(cmd, smr_peer_addr(ep->region)[peer_id].addr, - iov, count, result_iov, result_count, - compare_iov, compare_count, op, datatype, - atomic_op, peer_smr, tx_buf, op_flags); + smr_format_inject_atomic(cmd, iface, device, iov, count, result_iov, + result_count, compare_iov, compare_count, + peer_smr, tx_buf); + if (flags & SMR_RMA_REQ || op_flags & FI_DELIVERY_COMPLETE) { + if (ofi_cirque_isfull(smr_resp_queue(ep->region))) { + smr_freestack_push(smr_inject_pool(peer_smr), tx_buf); + ret = -FI_EAGAIN; + goto unlock_cq; + } + resp = ofi_cirque_next(smr_resp_queue(ep->region)); + pend = ofi_freestack_pop(ep->pend_fs); + smr_format_pend_resp(pend, cmd, context, iface, device, result_iov, + result_count, id, resp); + cmd->msg.hdr.data = smr_get_offset(ep->region, resp); + 
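/*
 * Illustrative sketch, not part of this patch: the response slot is handed
 * to the peer as an offset into the initiator's region (smr_get_offset()
 * above), never as a raw pointer, because each process maps the region at a
 * different virtual address. A target would recover it along the lines of:
 *
 *	struct smr_region *peer_smr;
 *	struct smr_resp *resp;
 *
 *	peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id);
 *	resp = smr_get_ptr(peer_smr, cmd->msg.hdr.data);
 *	resp->status = 0;	updating the status is what releases the
 *				pending tx entry set up by smr_format_pend_resp()
 *
 * The receive-side handling lives outside this hunk; only smr_get_offset(),
 * smr_get_ptr(), smr_peer_region() and cmd->msg.hdr.data/id are taken from
 * this patch, the rest is assumed.
 */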
ofi_cirque_commit(smr_resp_queue(ep->region)); + } } else { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "message too large\n"); @@ -268,28 +239,16 @@ static ssize_t smr_generic_atomic(struct smr_ep *ep, ofi_cirque_commit(smr_cmd_queue(peer_smr)); peer_smr->cmd_cnt--; - if (op != ofi_op_atomic) { - if (flags & SMR_RMA_REQ) { - smr_post_fetch_resp(ep, cmd, - (const struct iovec *) result_iov, - result_count); - goto format_rma; - } - err = smr_fetch_result(ep, peer_smr, result_iov, result_count, - rma_ioc, rma_count, datatype, msg_len); - if (err) + if (!resp) { + ret = smr_complete_tx(ep, context, op, cmd->msg.hdr.op_flags, + err); + if (ret) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "unable to fetch results"); - } - - ret = smr_complete_tx(ep, context, op, cmd->msg.hdr.op_flags, err); - if (ret) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "unable to process tx completion\n"); + "unable to process tx completion\n"); + } } -format_rma: - cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr)); + cmd = ofi_cirque_next(smr_cmd_queue(peer_smr)); smr_format_rma_ioc(cmd, rma_ioc, rma_count); ofi_cirque_commit(smr_cmd_queue(peer_smr)); peer_smr->cmd_cnt--; @@ -366,7 +325,7 @@ static ssize_t smr_atomic_inject(struct fid_ep *ep_fid, const void *buf, struct smr_cmd *cmd; struct iovec iov; struct fi_rma_ioc rma_ioc; - int peer_id; + int64_t id, peer_id; ssize_t ret = 0; size_t total_len; @@ -374,21 +333,22 @@ static ssize_t smr_atomic_inject(struct fid_ep *ep_fid, const void *buf, ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - peer_id = (int) dest_addr; - ret = smr_verify_peer(ep, peer_id); - if(ret) - return ret; + id = smr_verify_peer(ep, dest_addr); + if (id < 0) + return -FI_EAGAIN; + + peer_id = smr_peer_data(ep->region)[id].addr.id; + peer_smr = smr_peer_region(ep->region, id); - peer_smr = smr_peer_region(ep->region, peer_id); fastlock_acquire(&peer_smr->lock); - if (peer_smr->cmd_cnt < 2) { + if (peer_smr->cmd_cnt < 2 || smr_peer_data(ep->region)[id].sar_status) { ret = -FI_EAGAIN; goto unlock_region; } - cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr)); + cmd = ofi_cirque_next(smr_cmd_queue(peer_smr)); total_len = count * ofi_datatype_size(datatype); - + iov.iov_base = (void *) buf; iov.iov_len = total_len; @@ -396,20 +356,20 @@ static ssize_t smr_atomic_inject(struct fid_ep *ep_fid, const void *buf, rma_ioc.count = count; rma_ioc.key = key; + smr_generic_format(cmd, peer_id, ofi_op_atomic, 0, 0, 0); + smr_generic_atomic_format(cmd, datatype, op); + if (total_len <= SMR_MSG_DATA_LEN) { - smr_format_inline_atomic(cmd, smr_peer_addr(ep->region)[peer_id].addr, - &iov, 1, NULL, 0, ofi_op_atomic, - datatype, op, 0); + smr_format_inline_atomic(cmd, FI_HMEM_SYSTEM, 0, &iov, 1, NULL, 0); } else if (total_len <= SMR_INJECT_SIZE) { tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr)); - smr_format_inject_atomic(cmd, smr_peer_addr(ep->region)[peer_id].addr, - &iov, 1, NULL, 0, NULL, 0, ofi_op_atomic, - datatype, op, peer_smr, tx_buf, 0); + smr_format_inject_atomic(cmd, FI_HMEM_SYSTEM, 0, &iov, 1, NULL, + 0, NULL, 0, peer_smr, tx_buf); } ofi_cirque_commit(smr_cmd_queue(peer_smr)); peer_smr->cmd_cnt--; - cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr)); + cmd = ofi_cirque_next(smr_cmd_queue(peer_smr)); smr_format_rma_ioc(cmd, &rma_ioc, 1); ofi_cirque_commit(smr_cmd_queue(peer_smr)); peer_smr->cmd_cnt--; diff --git a/prov/shm/src/smr_attr.c b/prov/shm/src/smr_attr.c index 917de5b4d74..34026d5964f 100644 --- a/prov/shm/src/smr_attr.c +++ b/prov/shm/src/smr_attr.c @@ -34,11 +34,10 @@ #define SMR_TX_CAPS 
(OFI_TX_MSG_CAPS | FI_TAGGED | OFI_TX_RMA_CAPS | FI_ATOMICS) #define SMR_RX_CAPS (FI_SOURCE | FI_RMA_EVENT | OFI_RX_MSG_CAPS | FI_TAGGED | \ - OFI_RX_RMA_CAPS | FI_ATOMICS) -#define SMR_TX_OP_FLAGS (FI_REMOTE_CQ_DATA | FI_COMPLETION | \ - FI_INJECT_COMPLETE | FI_TRANSMIT_COMPLETE | \ - /* TODO: support for delivery complete */ \ - FI_DELIVERY_COMPLETE) + OFI_RX_RMA_CAPS | FI_ATOMICS | FI_DIRECTED_RECV | \ + FI_MULTI_RECV) +#define SMR_TX_OP_FLAGS (FI_COMPLETION | FI_INJECT_COMPLETE | \ + FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE) #define SMR_RX_OP_FLAGS (FI_COMPLETION | FI_MULTI_RECV) struct fi_tx_attr smr_tx_attr = { @@ -53,7 +52,27 @@ struct fi_tx_attr smr_tx_attr = { }; struct fi_rx_attr smr_rx_attr = { - .caps = SMR_RX_CAPS | FI_MULTI_RECV, + .caps = SMR_RX_CAPS, + .op_flags = SMR_RX_OP_FLAGS, + .comp_order = FI_ORDER_STRICT, + .msg_order = SMR_RMA_ORDER | FI_ORDER_SAS, + .size = 1024, + .iov_limit = SMR_IOV_LIMIT +}; + +struct fi_tx_attr smr_hmem_tx_attr = { + .caps = SMR_TX_CAPS | FI_HMEM, + .op_flags = SMR_TX_OP_FLAGS, + .comp_order = FI_ORDER_NONE, + .msg_order = SMR_RMA_ORDER | FI_ORDER_SAS, + .inject_size = 0, + .size = 1024, + .iov_limit = SMR_IOV_LIMIT, + .rma_iov_limit = SMR_IOV_LIMIT +}; + +struct fi_rx_attr smr_hmem_rx_attr = { + .caps = SMR_RX_CAPS | FI_HMEM, .op_flags = SMR_RX_OP_FLAGS, .comp_order = FI_ORDER_STRICT, .msg_order = SMR_RMA_ORDER | FI_ORDER_SAS, @@ -81,11 +100,11 @@ struct fi_domain_attr smr_domain_attr = { .data_progress = FI_PROGRESS_MANUAL, .resource_mgmt = FI_RM_ENABLED, .av_type = FI_AV_UNSPEC, - .mr_mode = FI_MR_SCALABLE, + .mr_mode = FI_MR_BASIC | FI_MR_SCALABLE, .mr_key_size = sizeof_field(struct fi_rma_iov, key), .cq_data_size = sizeof_field(struct smr_msg_hdr, data), .cq_cnt = (1 << 10), - .ep_cnt = (1 << 10), + .ep_cnt = SMR_MAX_PEERS, .tx_ctx_cnt = (1 << 10), .rx_ctx_cnt = (1 << 10), .max_ep_tx_ctx = 1, @@ -96,7 +115,17 @@ struct fi_domain_attr smr_domain_attr = { struct fi_fabric_attr smr_fabric_attr = { .name = "shm", - .prov_version = FI_VERSION(SMR_MAJOR_VERSION, SMR_MINOR_VERSION) + .prov_version = OFI_VERSION_DEF_PROV +}; + +struct fi_info smr_hmem_info = { + .caps = SMR_TX_CAPS | SMR_RX_CAPS | FI_HMEM | FI_MULTI_RECV, + .addr_format = FI_ADDR_STR, + .tx_attr = &smr_hmem_tx_attr, + .rx_attr = &smr_hmem_rx_attr, + .ep_attr = &smr_ep_attr, + .domain_attr = &smr_domain_attr, + .fabric_attr = &smr_fabric_attr }; struct fi_info smr_info = { @@ -106,5 +135,6 @@ struct fi_info smr_info = { .rx_attr = &smr_rx_attr, .ep_attr = &smr_ep_attr, .domain_attr = &smr_domain_attr, - .fabric_attr = &smr_fabric_attr + .fabric_attr = &smr_fabric_attr, + .next = &smr_hmem_info, }; diff --git a/prov/shm/src/smr_av.c b/prov/shm/src/smr_av.c index b046542d175..7ce79167af1 100644 --- a/prov/shm/src/smr_av.c +++ b/prov/shm/src/smr_av.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017 Intel Corporation. All rights reserved. + * Copyright (c) 2015-2020 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -52,49 +52,61 @@ static int smr_av_close(struct fid *fid) /* * Input address: smr name (string) - * output address: index (integer), the output from util_av and peer index in map + * output address: index (fi_addr_t), the output from util_av */ static int smr_av_insert(struct fid_av *av_fid, const void *addr, size_t count, fi_addr_t *fi_addr, uint64_t flags, void *context) { - struct smr_addr *smr_names = (void *)addr; struct util_av *util_av; struct util_ep *util_ep; struct smr_av *smr_av; struct smr_ep *smr_ep; struct dlist_entry *av_entry; const char *ep_name; - fi_addr_t index; + fi_addr_t util_addr; + int64_t shm_id = -1; int i, ret; int succ_count = 0; util_av = container_of(av_fid, struct util_av, av_fid); smr_av = container_of(util_av, struct smr_av, util_av); - for (i = 0; i < count; i++) { - ep_name = smr_no_prefix((const char *) smr_names[i].name); - ret = ofi_av_insert_addr(util_av, ep_name, &index); + for (i = 0; i < count; i++, addr = (char *) addr + strlen(addr) + 1) { + if (smr_av->used < SMR_MAX_PEERS) { + ep_name = smr_no_prefix(addr); + ret = smr_map_add(&smr_prov, smr_av->smr_map, + ep_name, &shm_id); + if (!ret) + ret = ofi_av_insert_addr(util_av, &shm_id, + &util_addr); + } else { + FI_WARN(&smr_prov, FI_LOG_AV, + "AV insert failed. The maximum number of AV " + "entries shm supported has been reached.\n"); + util_addr = FI_ADDR_NOTAVAIL; + ret = -FI_ENOMEM; + } + + if (fi_addr) + fi_addr[i] = util_addr; + if (ret) { if (util_av->eq) ofi_av_write_event(util_av, i, -ret, context); + if (shm_id >= 0) + smr_map_del(smr_av->smr_map, shm_id); + continue; } else { - ret = smr_map_add(&smr_prov, smr_av->smr_map, - ep_name, index); - if (ret) { - if (util_av->eq) - ofi_av_write_event(util_av, i, -ret, context); - } else { - succ_count++; - } + assert(shm_id >= 0 && shm_id < SMR_MAX_PEERS); + smr_av->smr_map->peers[shm_id].fiaddr = util_addr; + succ_count++; + smr_av->used++; } - if (fi_addr) - fi_addr[i] = (ret == 0) ? 
index : FI_ADDR_NOTAVAIL; - dlist_foreach(&util_av->ep_list, av_entry) { util_ep = container_of(av_entry, struct util_ep, av_entry); smr_ep = container_of(util_ep, struct smr_ep, util_ep); - smr_map_to_endpoint(smr_ep->region, index); + smr_map_to_endpoint(smr_ep->region, shm_id); } } @@ -114,12 +126,14 @@ static int smr_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count struct smr_ep *smr_ep; struct dlist_entry *av_entry; int i, ret = 0; + int64_t id; util_av = container_of(av_fid, struct util_av, av_fid); smr_av = container_of(util_av, struct smr_av, util_av); fastlock_acquire(&util_av->lock); for (i = 0; i < count; i++) { + id = smr_addr_lookup(util_av, fi_addr[i]); ret = ofi_av_remove_addr(util_av, fi_addr[i]); if (ret) { FI_WARN(&smr_prov, FI_LOG_AV, @@ -127,12 +141,13 @@ static int smr_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count break; } - smr_map_del(smr_av->smr_map, fi_addr[i]); + smr_map_del(smr_av->smr_map, id); dlist_foreach(&util_av->ep_list, av_entry) { util_ep = container_of(av_entry, struct util_ep, av_entry); smr_ep = container_of(util_ep, struct smr_ep, util_ep); - smr_unmap_from_endpoint(smr_ep->region, fi_addr[i]); + smr_unmap_from_endpoint(smr_ep->region, id); } + smr_av->used--; } fastlock_release(&util_av->lock); @@ -145,18 +160,20 @@ static int smr_av_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, struct util_av *util_av; struct smr_av *smr_av; struct smr_region *peer_smr; - int peer_id = (int)fi_addr; + int64_t id; util_av = container_of(av, struct util_av, av_fid); smr_av = container_of(util_av, struct smr_av, util_av); - peer_smr = smr_map_get(smr_av->smr_map, peer_id); + + id = smr_addr_lookup(util_av, fi_addr); + peer_smr = smr_map_get(smr_av->smr_map, id); if (!peer_smr) - return -FI_ADDR_NOTAVAIL; + return -FI_ENODATA; strncpy((char *)addr, smr_name(peer_smr), *addrlen); - ((char *) addr)[*addrlen] = '\0'; - *addrlen = sizeof(struct smr_addr); + ((char *) addr)[MIN(*addrlen - 1, strlen(smr_name(peer_smr)))] = '\0'; + *addrlen = strlen(smr_name(peer_smr)) + 1; return 0; } @@ -212,7 +229,8 @@ int smr_av_open(struct fid_domain *domain, struct fi_av_attr *attr, if (!smr_av) return -FI_ENOMEM; - util_attr.addrlen = SMR_NAME_SIZE; + util_attr.addrlen = sizeof(int64_t); + util_attr.context_len = 0; util_attr.flags = 0; if (attr->count > SMR_MAX_PEERS) { ret = -FI_ENOSYS; @@ -223,6 +241,7 @@ int smr_av_open(struct fid_domain *domain, struct fi_av_attr *attr, if (ret) goto out; + smr_av->used = 0; *av = &smr_av->util_av.av_fid; (*av)->fid.ops = &smr_av_fi_ops; (*av)->ops = &smr_av_ops; diff --git a/prov/shm/src/smr_cntr.c b/prov/shm/src/smr_cntr.c index db57df2738b..a499d0c2531 100644 --- a/prov/shm/src/smr_cntr.c +++ b/prov/shm/src/smr_cntr.c @@ -38,8 +38,15 @@ int smr_cntr_open(struct fid_domain *domain, struct fi_cntr_attr *attr, int ret; struct util_cntr *cntr; - if (attr->wait_obj != FI_WAIT_NONE) { - FI_INFO(&smr_prov, FI_LOG_CNTR, "cntr wait not yet supported\n"); + switch (attr->wait_obj) { + case FI_WAIT_UNSPEC: + attr->wait_obj = FI_WAIT_YIELD; + /* fall through */ + case FI_WAIT_NONE: + case FI_WAIT_YIELD: + break; + default: + FI_INFO(&smr_prov, FI_LOG_CNTR, "cntr wait not yet supported\n"); return -FI_ENOSYS; } diff --git a/prov/shm/src/smr_comp.c b/prov/shm/src/smr_comp.c index e3367358877..e883ec0dbf8 100644 --- a/prov/shm/src/smr_comp.c +++ b/prov/shm/src/smr_comp.c @@ -48,36 +48,67 @@ int smr_complete_tx(struct smr_ep *ep, void *context, uint32_t op, return ep->tx_comp(ep, context, op, flags, err); } -int
smr_tx_comp(struct smr_ep *ep, void *context, uint32_t op, - uint16_t flags, uint64_t err) +static int +smr_write_err_comp(struct util_cq *cq, void *context, + uint64_t flags, uint64_t tag, uint64_t err) { - struct fi_cq_tagged_entry *comp; - struct util_cq_oflow_err_entry *entry; - - comp = ofi_cirque_tail(ep->util_ep.tx_cq->cirq); - if (err) { - if (!(entry = calloc(1, sizeof(*entry)))) - return -FI_ENOMEM; - entry->comp.op_context = context; - entry->comp.flags = ofi_tx_cq_flags(op); - entry->comp.err = err; - entry->comp.prov_errno = -err; - slist_insert_tail(&entry->list_entry, - &ep->util_ep.tx_cq->oflow_err_list); - comp->flags = UTIL_FLAG_ERROR; + struct fi_cq_err_entry err_entry; + + memset(&err_entry, 0, sizeof err_entry); + err_entry.op_context = context; + err_entry.flags = flags; + err_entry.tag = tag; + err_entry.err = err; + err_entry.prov_errno = -err; + return ofi_cq_insert_error(cq, &err_entry); +} + +static int +smr_write_comp(struct util_cq *cq, void *context, + uint64_t flags, size_t len, void *buf, + uint64_t tag, uint64_t data, uint64_t err) +{ + if (err) + return smr_write_err_comp(cq, context, flags, tag, err); + + if (ofi_cirque_freecnt(cq->cirq) > 1) { + ofi_cq_write_entry(cq, context, flags, len, + buf, data, tag); + return 0; } else { - comp->op_context = context; - comp->flags = ofi_tx_cq_flags(op); - comp->len = 0; - comp->buf = NULL; - comp->data = 0; + return ofi_cq_write_overflow(cq, context, flags, + len, buf, data, tag, + FI_ADDR_NOTAVAIL); } - ofi_cirque_commit(ep->util_ep.tx_cq->cirq); - return 0; } -int smr_tx_comp_signal(struct smr_ep *ep, void *context, uint32_t op, +static int +smr_write_src_comp(struct util_cq *cq, void *context, + uint64_t flags, size_t len, void *buf, fi_addr_t addr, + uint64_t tag, uint64_t data, uint64_t err) +{ + if (err) + return smr_write_err_comp(cq, context, flags, tag, err); + + if (ofi_cirque_freecnt(cq->cirq) > 1) { + ofi_cq_write_src_entry(cq, context, flags, len, + buf, data, tag, addr); + return 0; + } else { + return ofi_cq_write_overflow(cq, context, flags, + len, buf, data, tag, addr); + } +} + +int smr_tx_comp(struct smr_ep *ep, void *context, uint32_t op, uint16_t flags, uint64_t err) +{ + return smr_write_comp(ep->util_ep.tx_cq, context, + ofi_tx_cq_flags(op), 0, NULL, 0, 0, err); +} + +int smr_tx_comp_signal(struct smr_ep *ep, void *context, uint32_t op, + uint16_t flags, uint64_t err) { int ret; @@ -88,67 +119,50 @@ int smr_tx_comp_signal(struct smr_ep *ep, void *context, uint32_t op, return 0; } -int smr_complete_rx(struct smr_ep *ep, void *context, uint32_t op, uint16_t flags, - size_t len, void *buf, void *addr, uint64_t tag, uint64_t data, - uint64_t err) +int smr_complete_rx(struct smr_ep *ep, void *context, uint32_t op, + uint16_t flags, size_t len, void *buf, int64_t id, + uint64_t tag, uint64_t data, uint64_t err) { + fi_addr_t fiaddr = FI_ADDR_UNSPEC; + ofi_ep_rx_cntr_inc_func(&ep->util_ep, op); if (!err && !(flags & (SMR_REMOTE_CQ_DATA | SMR_RX_COMPLETION))) return 0; + if (ep->util_ep.domain->info_domain_caps & FI_SOURCE) + fiaddr = ep->region->map->peers[id].fiaddr; + return ep->rx_comp(ep, context, op, flags, len, buf, - addr, tag, data, err); + fiaddr, tag, data, err); } int smr_rx_comp(struct smr_ep *ep, void *context, uint32_t op, - uint16_t flags, size_t len, void *buf, void *addr, + uint16_t flags, size_t len, void *buf, fi_addr_t addr, uint64_t tag, uint64_t data, uint64_t err) { - struct fi_cq_tagged_entry *comp; - struct util_cq_oflow_err_entry *entry; - - comp = 
ofi_cirque_tail(ep->util_ep.rx_cq->cirq); - if (err) { - if (!(entry = calloc(1, sizeof(*entry)))) - return -FI_ENOMEM; - entry->comp.op_context = context; - entry->comp.flags = smr_rx_cq_flags(op, flags); - entry->comp.tag = tag; - entry->comp.err = err; - entry->comp.prov_errno = -err; - slist_insert_tail(&entry->list_entry, - &ep->util_ep.rx_cq->oflow_err_list); - comp->flags = UTIL_FLAG_ERROR; - } else { - comp->op_context = context; - comp->flags = smr_rx_cq_flags(op, flags); - comp->len = len; - comp->buf = buf; - comp->data = data; - comp->tag = tag; - } - ofi_cirque_commit(ep->util_ep.rx_cq->cirq); - return 0; + return smr_write_comp(ep->util_ep.rx_cq, context, + smr_rx_cq_flags(op, flags), len, buf, + tag, data, err); } int smr_rx_src_comp(struct smr_ep *ep, void *context, uint32_t op, - uint16_t flags, size_t len, void *buf, void *addr, + uint16_t flags, size_t len, void *buf, fi_addr_t addr, uint64_t tag, uint64_t data, uint64_t err) { - ep->util_ep.rx_cq->src[ofi_cirque_windex(ep->util_ep.rx_cq->cirq)] = - (uint32_t) (uintptr_t) addr; - return smr_rx_comp(ep, context, op, flags, len, buf, addr, tag, - data, err); + return smr_write_src_comp(ep->util_ep.rx_cq, context, + smr_rx_cq_flags(op, flags), len, buf, addr, + tag, data, err); } int smr_rx_comp_signal(struct smr_ep *ep, void *context, uint32_t op, - uint16_t flags, size_t len, void *buf, void *addr, + uint16_t flags, size_t len, void *buf, fi_addr_t addr, uint64_t tag, uint64_t data, uint64_t err) { int ret; - ret = smr_rx_comp(ep, context, op, flags, len, buf, addr, tag, data, err); + ret = smr_rx_comp(ep, context, op, flags, len, buf, addr, tag, + data, err); if (ret) return ret; ep->util_ep.rx_cq->wait->signal(ep->util_ep.rx_cq->wait); @@ -156,7 +170,7 @@ int smr_rx_comp_signal(struct smr_ep *ep, void *context, uint32_t op, } int smr_rx_src_comp_signal(struct smr_ep *ep, void *context, uint32_t op, - uint16_t flags, size_t len, void *buf, void *addr, + uint16_t flags, size_t len, void *buf, fi_addr_t addr, uint64_t tag, uint64_t data, uint64_t err) { int ret; diff --git a/prov/shm/src/smr_cq.c b/prov/shm/src/smr_cq.c index 29ac1b100f0..908629d3215 100644 --- a/prov/shm/src/smr_cq.c +++ b/prov/shm/src/smr_cq.c @@ -41,7 +41,14 @@ int smr_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct util_cq *util_cq; int ret; - if (attr->wait_obj != FI_WAIT_NONE) { + switch (attr->wait_obj) { + case FI_WAIT_UNSPEC: + attr->wait_obj = FI_WAIT_YIELD; + /* fall through */ + case FI_WAIT_NONE: + case FI_WAIT_YIELD: + break; + default: FI_INFO(&smr_prov, FI_LOG_CQ, "CQ wait not yet supported\n"); return -FI_ENOSYS; } @@ -50,7 +57,8 @@ int smr_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, if (!util_cq) return -FI_ENOMEM; - ret = ofi_cq_init(&smr_prov, domain, attr, util_cq, ofi_cq_progress, context); + ret = ofi_cq_init(&smr_prov, domain, attr, util_cq, + &ofi_cq_progress, context); if (ret) goto free; diff --git a/prov/shm/src/smr_domain.c b/prov/shm/src/smr_domain.c index 1ff6ee8ccee..59700024290 100644 --- a/prov/shm/src/smr_domain.c +++ b/prov/shm/src/smr_domain.c @@ -46,6 +46,7 @@ static struct fi_ops_domain smr_domain_ops = { .stx_ctx = fi_no_stx_context, .srx_ctx = fi_no_srx_context, .query_atomic = smr_query_atomic, + .query_collective = fi_no_query_collective, }; static int smr_domain_close(fid_t fid) diff --git a/prov/shm/src/smr_ep.c b/prov/shm/src/smr_ep.c index 5614977ffb2..cf55b3d4514 100644 --- a/prov/shm/src/smr_ep.c +++ b/prov/shm/src/smr_ep.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 
Intel Corporation. All rights reserved + * Copyright (c) 2013-2021 Intel Corporation. All rights reserved * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -33,14 +33,18 @@ #include #include #include +#include #include "ofi_iov.h" +#include "ofi_hmem.h" #include "smr.h" extern struct fi_ops_msg smr_msg_ops; extern struct fi_ops_tagged smr_tagged_ops; extern struct fi_ops_rma smr_rma_ops; extern struct fi_ops_atomic smr_atomic_ops; +DEFINE_LIST(sock_name_list); +pthread_mutex_t sock_list_lock = PTHREAD_MUTEX_INITIALIZER; int smr_setname(fid_t fid, void *addr, size_t addrlen) { @@ -70,7 +74,12 @@ int smr_getname(fid_t fid, void *addr, size_t *addrlen) if (!addr || *addrlen == 0 || snprintf(addr, *addrlen, "%s", ep->name) >= *addrlen) ret = -FI_ETOOSMALL; - *addrlen = sizeof(struct smr_addr); + + *addrlen = strlen(ep->name) + 1; + + if (!ret) + ((char *) addr)[*addrlen - 1] = '\0'; + return ret; } @@ -115,19 +124,18 @@ int smr_setopt(fid_t fid, int level, int optname, return FI_SUCCESS; } - static int smr_match_recv_ctx(struct dlist_entry *item, const void *args) { - struct smr_ep_entry *pending_recv; + struct smr_rx_entry *pending_recv; - pending_recv = container_of(item, struct smr_ep_entry, entry); + pending_recv = container_of(item, struct smr_rx_entry, entry); return pending_recv->context == args; } static int smr_ep_cancel_recv(struct smr_ep *ep, struct smr_queue *queue, void *context) { - struct smr_ep_entry *recv_entry; + struct smr_rx_entry *recv_entry; struct dlist_entry *entry; int ret = 0; @@ -135,12 +143,12 @@ static int smr_ep_cancel_recv(struct smr_ep *ep, struct smr_queue *queue, entry = dlist_remove_first_match(&queue->list, smr_match_recv_ctx, context); if (entry) { - recv_entry = container_of(entry, struct smr_ep_entry, entry); + recv_entry = container_of(entry, struct smr_rx_entry, entry); ret = smr_complete_rx(ep, (void *) recv_entry->context, ofi_op_msg, recv_entry->flags, 0, - NULL, (void *) recv_entry->addr, + NULL, recv_entry->peer_id, recv_entry->tag, 0, FI_ECANCELED); - freestack_push(ep->recv_fs, recv_entry); + ofi_freestack_push(ep->recv_fs, recv_entry); ret = ret ? 
ret : 1; } @@ -174,44 +182,101 @@ static struct fi_ops_ep smr_ep_ops = { .tx_size_left = fi_no_tx_size_left, }; -int smr_verify_peer(struct smr_ep *ep, int peer_id) +static void smr_send_name(struct smr_ep *ep, int64_t id) +{ + struct smr_region *peer_smr; + struct smr_cmd *cmd; + struct smr_inject_buf *tx_buf; + + peer_smr = smr_peer_region(ep->region, id); + + fastlock_acquire(&peer_smr->lock); + + if (smr_peer_data(ep->region)[id].name_sent || !peer_smr->cmd_cnt) + goto out; + + cmd = ofi_cirque_next(smr_cmd_queue(peer_smr)); + + cmd->msg.hdr.op = SMR_OP_MAX + ofi_ctrl_connreq; + cmd->msg.hdr.id = id; + + tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr)); + cmd->msg.hdr.src_data = smr_get_offset(peer_smr, tx_buf); + + cmd->msg.hdr.size = strlen(smr_name(ep->region)) + 1; + memcpy(tx_buf->data, smr_name(ep->region), cmd->msg.hdr.size); + + smr_peer_data(ep->region)[id].name_sent = 1; + ofi_cirque_commit(smr_cmd_queue(peer_smr)); + peer_smr->cmd_cnt--; + +out: + fastlock_release(&peer_smr->lock); +} + +int64_t smr_verify_peer(struct smr_ep *ep, fi_addr_t fi_addr) { + int64_t id; int ret; - if (ep->region->map->peers[peer_id].peer.addr != FI_ADDR_UNSPEC) - return 0; + id = smr_addr_lookup(ep->util_ep.av, fi_addr); + assert(id < SMR_MAX_PEERS); + + if (smr_peer_data(ep->region)[id].addr.id >= 0) + return id; + + if (ep->region->map->peers[id].peer.id < 0) { + ret = smr_map_to_region(&smr_prov, &ep->region->map->peers[id]); + if (ret == -ENOENT) + return -1; + + } - ret = smr_map_to_region(&smr_prov, &ep->region->map->peers[peer_id]); + smr_send_name(ep, id); - return (ret == -ENOENT) ? -FI_EAGAIN : ret; + return -1; } static int smr_match_msg(struct dlist_entry *item, const void *args) { struct smr_match_attr *attr = (struct smr_match_attr *)args; - struct smr_ep_entry *recv_entry; + struct smr_rx_entry *recv_entry; - recv_entry = container_of(item, struct smr_ep_entry, entry); - return smr_match_addr(recv_entry->addr, attr->addr); + recv_entry = container_of(item, struct smr_rx_entry, entry); + return smr_match_id(recv_entry->peer_id, attr->id); } static int smr_match_tagged(struct dlist_entry *item, const void *args) { struct smr_match_attr *attr = (struct smr_match_attr *)args; - struct smr_ep_entry *recv_entry; + struct smr_rx_entry *recv_entry; + + recv_entry = container_of(item, struct smr_rx_entry, entry); + return smr_match_id(recv_entry->peer_id, attr->id) && + smr_match_tag(recv_entry->tag, recv_entry->ignore, attr->tag); +} + +static int smr_match_unexp_msg(struct dlist_entry *item, const void *args) +{ + struct smr_match_attr *attr = (struct smr_match_attr *)args; + struct smr_unexp_msg *unexp_msg; - recv_entry = container_of(item, struct smr_ep_entry, entry); - return smr_match_addr(recv_entry->addr, attr->addr) && - smr_match_tag(recv_entry->tag, recv_entry->ignore, attr->tag); -} + unexp_msg = container_of(item, struct smr_unexp_msg, entry); + assert(unexp_msg->cmd.msg.hdr.op == ofi_op_msg); + return smr_match_id(unexp_msg->cmd.msg.hdr.id, attr->id); +} -static int smr_match_unexp(struct dlist_entry *item, const void *args) +static int smr_match_unexp_tagged(struct dlist_entry *item, const void *args) { struct smr_match_attr *attr = (struct smr_match_attr *)args; struct smr_unexp_msg *unexp_msg; unexp_msg = container_of(item, struct smr_unexp_msg, entry); - return smr_match_addr(unexp_msg->cmd.msg.hdr.addr, attr->addr) && + if (unexp_msg->cmd.msg.hdr.op == ofi_op_msg) + return smr_match_id(unexp_msg->cmd.msg.hdr.id, attr->id); + + assert(unexp_msg->cmd.msg.hdr.op == 
ofi_op_tagged); + return smr_match_id(unexp_msg->cmd.msg.hdr.id, attr->id) && smr_match_tag(unexp_msg->cmd.msg.hdr.tag, attr->ignore, attr->tag); } @@ -223,78 +288,269 @@ static void smr_init_queue(struct smr_queue *queue, queue->match_func = match_func; } -void smr_post_pend_resp(struct smr_cmd *cmd, struct smr_cmd *pend, - struct smr_resp *resp) +void smr_format_pend_resp(struct smr_tx_entry *pend, struct smr_cmd *cmd, + void *context, enum fi_hmem_iface iface, uint64_t device, + const struct iovec *iov, uint32_t iov_count, + int64_t id, struct smr_resp *resp) { - *pend = *cmd; + pend->cmd = *cmd; + pend->context = context; + memcpy(pend->iov, iov, sizeof(*iov) * iov_count); + pend->iov_count = iov_count; + pend->peer_id = id; + if (cmd->msg.hdr.op_src != smr_src_sar) + pend->bytes_done = 0; + + pend->iface = iface; + pend->device = device; + resp->msg_id = (uint64_t) (uintptr_t) pend; resp->status = FI_EBUSY; } -void smr_generic_format(struct smr_cmd *cmd, fi_addr_t peer_id, - uint32_t op, uint64_t tag, uint8_t datatype, - uint8_t atomic_op, uint64_t data, - uint64_t op_flags) +void smr_generic_format(struct smr_cmd *cmd, int64_t peer_id, uint32_t op, + uint64_t tag, uint64_t data, uint64_t op_flags) { cmd->msg.hdr.op = op; cmd->msg.hdr.op_flags = 0; + cmd->msg.hdr.tag = tag; + cmd->msg.hdr.id = peer_id; + cmd->msg.hdr.data = data; if (op_flags & FI_REMOTE_CQ_DATA) cmd->msg.hdr.op_flags |= SMR_REMOTE_CQ_DATA; if (op_flags & FI_COMPLETION) cmd->msg.hdr.op_flags |= SMR_TX_COMPLETION; - - if (op == ofi_op_tagged) { - cmd->msg.hdr.tag = tag; - } else if (op == ofi_op_atomic || - op == ofi_op_atomic_fetch || - op == ofi_op_atomic_compare) { - cmd->msg.hdr.datatype = datatype; - cmd->msg.hdr.atomic_op = atomic_op; - } - cmd->msg.hdr.addr = peer_id; - cmd->msg.hdr.data = data; } -void smr_format_inline(struct smr_cmd *cmd, fi_addr_t peer_id, - const struct iovec *iov, size_t count, - uint32_t op, uint64_t tag, uint64_t data, - uint64_t op_flags) +void smr_format_inline(struct smr_cmd *cmd, enum fi_hmem_iface iface, + uint64_t device, const struct iovec *iov, size_t count) { - smr_generic_format(cmd, peer_id, op, tag, 0, 0, data, op_flags); cmd->msg.hdr.op_src = smr_src_inline; - cmd->msg.hdr.size = ofi_copy_from_iov(cmd->msg.data.msg, - SMR_MSG_DATA_LEN, iov, count, 0); + cmd->msg.hdr.size = ofi_copy_from_hmem_iov(cmd->msg.data.msg, + SMR_MSG_DATA_LEN, iface, device, + iov, count, 0); } -void smr_format_inject(struct smr_cmd *cmd, fi_addr_t peer_id, +void smr_format_inject(struct smr_cmd *cmd, enum fi_hmem_iface iface, uint64_t device, const struct iovec *iov, size_t count, - uint32_t op, uint64_t tag, uint64_t data, - uint64_t op_flags, struct smr_region *smr, - struct smr_inject_buf *tx_buf) + struct smr_region *smr, struct smr_inject_buf *tx_buf) { - smr_generic_format(cmd, peer_id, op, tag, 0, 0, data, op_flags); cmd->msg.hdr.op_src = smr_src_inject; - cmd->msg.hdr.src_data = (char **) tx_buf - (char **) smr; - cmd->msg.hdr.size = ofi_copy_from_iov(tx_buf->data, SMR_INJECT_SIZE, - iov, count, 0); + cmd->msg.hdr.src_data = smr_get_offset(smr, tx_buf); + cmd->msg.hdr.size = ofi_copy_from_hmem_iov(tx_buf->data, SMR_INJECT_SIZE, + iface, device, iov, count, 0); } -void smr_format_iov(struct smr_cmd *cmd, fi_addr_t peer_id, - const struct iovec *iov, size_t count, size_t total_len, - uint32_t op, uint64_t tag, uint64_t data, uint64_t op_flags, - void *context, struct smr_region *smr, - struct smr_resp *resp, struct smr_cmd *pend_cmd) +void smr_format_iov(struct smr_cmd *cmd, const struct iovec 
*iov, size_t count, + size_t total_len, struct smr_region *smr, + struct smr_resp *resp) { - smr_generic_format(cmd, peer_id, op, tag, 0, 0, data, op_flags); cmd->msg.hdr.op_src = smr_src_iov; - cmd->msg.hdr.src_data = (uint64_t) ((char **) resp - (char **) smr); + cmd->msg.hdr.src_data = smr_get_offset(smr, resp); cmd->msg.data.iov_count = count; cmd->msg.hdr.size = total_len; - cmd->msg.hdr.msg_id = (uint64_t) (uintptr_t) context; memcpy(cmd->msg.data.iov, iov, sizeof(*iov) * count); +} + +int smr_format_ze_ipc(struct smr_ep *ep, int64_t id, struct smr_cmd *cmd, + const struct iovec *iov, uint64_t device, + size_t total_len, struct smr_region *smr, + struct smr_resp *resp, struct smr_tx_entry *pend) +{ + int ret; + void *base; + + cmd->msg.hdr.op_src = smr_src_ipc; + cmd->msg.hdr.src_data = smr_get_offset(smr, resp); + cmd->msg.hdr.size = total_len; + cmd->msg.data.ipc_info.iface = FI_HMEM_ZE; + + if (ep->sock_info->peers[id].state == SMR_CMAP_INIT) + smr_ep_exchange_fds(ep, id); + if (ep->sock_info->peers[id].state != SMR_CMAP_SUCCESS) + return -FI_EAGAIN; + + ret = ze_hmem_get_base_addr(iov[0].iov_base, &base); + if (ret) + return ret; + + ret = ze_hmem_get_shared_handle(ep->sock_info->my_fds[device], + base, &pend->fd, + (void **) &cmd->msg.data.ipc_info.fd_handle); + if (ret) + return ret; + + cmd->msg.data.ipc_info.device = device; + cmd->msg.data.ipc_info.offset = (char *) iov[0].iov_base - + (char *) base; + + return FI_SUCCESS; +} + +int smr_format_mmap(struct smr_ep *ep, struct smr_cmd *cmd, + const struct iovec *iov, size_t count, size_t total_len, + struct smr_tx_entry *pend, struct smr_resp *resp) +{ + void *mapped_ptr; + int fd, ret, num; + uint64_t msg_id; + struct smr_ep_name *map_name; + + msg_id = ep->msg_id++; + map_name = calloc(1, sizeof(*map_name)); + if (!map_name) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "calloc error\n"); + return -FI_ENOMEM; + } + + pthread_mutex_lock(&ep_list_lock); + dlist_insert_tail(&map_name->entry, &ep_name_list); + pthread_mutex_unlock(&ep_list_lock); + num = smr_mmap_name(map_name->name, ep->name, msg_id); + if (num < 0) { + FI_WARN(&smr_prov, FI_LOG_AV, "generating shm file name failed\n"); + ret = -errno; + goto remove_entry; + } + + fd = shm_open(map_name->name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (fd < 0) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "shm_open error\n"); + ret = -errno; + goto remove_entry; + } + + ret = ftruncate(fd, total_len); + if (ret < 0) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "ftruncate error\n"); + goto unlink_close; + } + + mapped_ptr = mmap(NULL, total_len, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (mapped_ptr == MAP_FAILED) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "mmap error\n"); + ret = -errno; + goto unlink_close; + } + + if (cmd->msg.hdr.op != ofi_op_read_req) { + if (ofi_copy_from_iov(mapped_ptr, total_len, iov, count, 0) + != total_len) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "copy from iov error\n"); + ret = -FI_EIO; + goto munmap; + } + munmap(mapped_ptr, total_len); + } else { + pend->map_ptr = mapped_ptr; + } + + cmd->msg.hdr.op_src = smr_src_mmap; + cmd->msg.hdr.msg_id = msg_id; + cmd->msg.hdr.src_data = smr_get_offset(ep->region, resp); + cmd->msg.hdr.size = total_len; + pend->map_name = map_name; + + close(fd); + return 0; + +munmap: + munmap(mapped_ptr, total_len); +unlink_close: + shm_unlink(map_name->name); + close(fd); +remove_entry: + dlist_remove(&map_name->entry); + free(map_name); + return ret; +} + +size_t smr_copy_to_sar(struct smr_sar_msg *sar_msg, struct smr_resp *resp, + struct 
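The mmap fallback above stages an oversized payload through a named shared-memory object: create it with shm_open(), size it with ftruncate(), map it, copy the source iov in, and let the peer map the same name to copy it back out. A rough standalone sketch of that POSIX pattern, with an illustrative object name and simplified error handling rather than the provider's naming and cleanup logic:

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

/* Stage 'len' bytes from 'src' into a named shm object so that another
 * process can shm_open() and mmap() the same name to read them back. */
static int stage_via_shm(const char *name, const void *src, size_t len)
{
	void *ptr;
	int fd;

	fd = shm_open(name, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
	if (fd < 0)
		return -1;

	if (ftruncate(fd, len) < 0)
		goto err;

	ptr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ptr == MAP_FAILED)
		goto err;

	memcpy(ptr, src, len);	/* the peer maps the same name and copies out */
	munmap(ptr, len);
	close(fd);
	return 0;

err:
	shm_unlink(name);
	close(fd);
	return -1;
}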
smr_cmd *cmd, enum fi_hmem_iface iface, + uint64_t device, const struct iovec *iov, size_t count, + size_t *bytes_done, int *next) +{ + size_t start = *bytes_done; + + if (sar_msg->sar[0].status == SMR_SAR_FREE && !*next) { + *bytes_done += ofi_copy_from_hmem_iov(sar_msg->sar[0].buf, + SMR_SAR_SIZE, iface, device, + iov, count, *bytes_done); + sar_msg->sar[0].status = SMR_SAR_READY; + if (cmd->msg.hdr.op == ofi_op_read_req) + resp->status = FI_SUCCESS; + *next = 1; + } + + if (*bytes_done < cmd->msg.hdr.size && + sar_msg->sar[1].status == SMR_SAR_FREE && *next) { + *bytes_done += ofi_copy_from_hmem_iov(sar_msg->sar[1].buf, + SMR_SAR_SIZE, iface, device, + iov, count, *bytes_done); + sar_msg->sar[1].status = SMR_SAR_READY; + if (cmd->msg.hdr.op == ofi_op_read_req) + resp->status = FI_SUCCESS; + *next = 0; + } + return *bytes_done - start; +} + +size_t smr_copy_from_sar(struct smr_sar_msg *sar_msg, struct smr_resp *resp, + struct smr_cmd *cmd, enum fi_hmem_iface iface, + uint64_t device, const struct iovec *iov, size_t count, + size_t *bytes_done, int *next) +{ + size_t start = *bytes_done; + + if (sar_msg->sar[0].status == SMR_SAR_READY && !*next) { + *bytes_done += ofi_copy_to_hmem_iov(iface, device, iov, count, + *bytes_done, sar_msg->sar[0].buf, + SMR_SAR_SIZE); + sar_msg->sar[0].status = SMR_SAR_FREE; + if (cmd->msg.hdr.op != ofi_op_read_req) + resp->status = FI_SUCCESS; + *next = 1; + } + + if (*bytes_done < cmd->msg.hdr.size && + sar_msg->sar[1].status == SMR_SAR_READY && *next) { + *bytes_done += ofi_copy_to_hmem_iov(iface, device, iov, count, + *bytes_done, sar_msg->sar[1].buf, + SMR_SAR_SIZE); + sar_msg->sar[1].status = SMR_SAR_FREE; + if (cmd->msg.hdr.op != ofi_op_read_req) + resp->status = FI_SUCCESS; + *next = 0; + } + return *bytes_done - start; +} + +void smr_format_sar(struct smr_cmd *cmd, enum fi_hmem_iface iface, uint64_t device, + const struct iovec *iov, size_t count, + size_t total_len, struct smr_region *smr, + struct smr_region *peer_smr, struct smr_sar_msg *sar_msg, + struct smr_tx_entry *pending, struct smr_resp *resp) +{ + cmd->msg.hdr.op_src = smr_src_sar; + cmd->msg.hdr.src_data = smr_get_offset(smr, resp); + cmd->msg.data.sar = smr_get_offset(peer_smr, sar_msg); + cmd->msg.hdr.size = total_len; + + pending->bytes_done = 0; + pending->next = 0; + sar_msg->sar[0].status = SMR_SAR_FREE; + sar_msg->sar[1].status = SMR_SAR_FREE; + if (cmd->msg.hdr.op != ofi_op_read_req) + smr_copy_to_sar(sar_msg, NULL, cmd, iface, device ,iov, count, + &pending->bytes_done, &pending->next); +} - smr_post_pend_resp(cmd, pend_cmd, resp); +static void smr_cleanup_epoll(struct smr_sock_info *sock_info) +{ + fd_signal_free(&sock_info->signal); + ofi_epoll_close(sock_info->epollfd); } static int smr_ep_close(struct fid *fid) @@ -303,6 +559,15 @@ static int smr_ep_close(struct fid *fid) ep = container_of(fid, struct smr_ep, util_ep.ep_fid.fid); + if (ep->sock_info) { + fd_signal_set(&ep->sock_info->signal); + pthread_join(ep->sock_info->listener_thread, NULL); + close(ep->sock_info->listen_sock); + unlink(ep->sock_info->name); + smr_cleanup_epoll(ep->sock_info); + free(ep->sock_info); + } + ofi_endpoint_close(&ep->util_ep); if (ep->region) @@ -311,10 +576,23 @@ static int smr_ep_close(struct fid *fid) smr_recv_fs_free(ep->recv_fs); smr_unexp_fs_free(ep->unexp_fs); smr_pend_fs_free(ep->pend_fs); + smr_sar_fs_free(ep->sar_fs); + free((void *)ep->name); free(ep); return 0; } +static int smr_ep_trywait(void *arg) +{ + struct smr_ep *ep; + + ep = container_of(arg, struct smr_ep, 
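The SAR helpers above run a two-buffer ping-pong: the producer fills whichever of the two segments is FREE, flips it to READY, and alternates through the *next index, while the consumer drains READY segments in the same order and marks them FREE again. A simplified single-process sketch of that status/next handshake, using stand-in types and sizes rather than the provider's smr_sar_msg definitions; in the provider the segments live in shared memory and the status fields act as synchronization flags, so a faithful cross-process version would also need the memory ordering that is elided here:

#include <stddef.h>
#include <string.h>

#define SEG_FREE  0
#define SEG_READY 1
#define SEG_SIZE  4096			/* stand-in for the real segment size */

struct pingpong_seg {
	int  status;			/* SEG_FREE or SEG_READY */
	char data[SEG_SIZE];
};

/* Producer: copy the next chunks of 'src' into free segments, alternating
 * between seg[0] and seg[1] via *next. Returns the bytes queued this call. */
static size_t pingpong_send(struct pingpong_seg seg[2], int *next,
			    const char *src, size_t total, size_t *done)
{
	size_t chunk, start = *done;

	while (*done < total && seg[*next].status == SEG_FREE) {
		chunk = total - *done < SEG_SIZE ? total - *done : SEG_SIZE;
		memcpy(seg[*next].data, src + *done, chunk);
		seg[*next].status = SEG_READY;	/* hand the segment to the peer */
		*done += chunk;
		*next = !*next;			/* alternate 0 -> 1 -> 0 ... */
	}
	return *done - start;
}

/* Consumer: drain ready segments in the same alternating order. */
static size_t pingpong_recv(struct pingpong_seg seg[2], int *next,
			    char *dst, size_t total, size_t *done)
{
	size_t chunk, start = *done;

	while (*done < total && seg[*next].status == SEG_READY) {
		chunk = total - *done < SEG_SIZE ? total - *done : SEG_SIZE;
		memcpy(dst + *done, seg[*next].data, chunk);
		seg[*next].status = SEG_FREE;	/* return the segment to the producer */
		*done += chunk;
		*next = !*next;
	}
	return *done - start;
}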
util_ep.ep_fid.fid); + + smr_ep_progress(&ep->util_ep); + + return FI_SUCCESS; +} + static int smr_ep_bind_cq(struct smr_ep *ep, struct util_cq *cq, uint64_t flags) { int ret; @@ -338,6 +616,13 @@ static int smr_ep_bind_cq(struct smr_ep *ep, struct util_cq *cq, uint64_t flags) } } + if (cq->wait) { + ret = ofi_wait_add_fid(cq->wait, &ep->util_ep.ep_fid.fid, 0, + smr_ep_trywait); + if (ret) + return ret; + } + ret = fid_list_insert(&cq->ep_list, &cq->ep_list_lock, &ep->util_ep.ep_fid.fid); @@ -345,6 +630,24 @@ static int smr_ep_bind_cq(struct smr_ep *ep, struct util_cq *cq, uint64_t flags) return ret; } +static int smr_ep_bind_cntr(struct smr_ep *ep, struct util_cntr *cntr, uint64_t flags) +{ + int ret; + + ret = ofi_ep_bind_cntr(&ep->util_ep, cntr, flags); + if (ret) + return ret; + + if (cntr->wait) { + ret = ofi_wait_add_fid(cntr->wait, &ep->util_ep.ep_fid.fid, 0, + smr_ep_trywait); + if (ret) + return ret; + } + + return FI_SUCCESS; +} + static int smr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) { struct smr_ep *ep; @@ -369,7 +672,7 @@ static int smr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) case FI_CLASS_EQ: break; case FI_CLASS_CNTR: - ret = ofi_ep_bind_cntr(&ep->util_ep, container_of(bfid, + ret = smr_ep_bind_cntr(ep, container_of(bfid, struct util_cntr, cntr_fid.fid), flags); break; default: @@ -381,6 +684,335 @@ static int smr_ep_bind(struct fid *ep_fid, struct fid *bfid, uint64_t flags) return ret; } +static int smr_sendmsg_fd(int sock, int64_t id, int64_t peer_id, + int *fds, int nfds) +{ + struct msghdr msg; + struct cmsghdr *cmsg; + struct iovec iov; + char *ctrl_buf; + size_t ctrl_size; + int ret; + + ctrl_size = sizeof(*fds) * nfds; + ctrl_buf = calloc(CMSG_SPACE(ctrl_size), 1); + if (!ctrl_buf) + return -FI_ENOMEM; + + iov.iov_base = &peer_id; + iov.iov_len = sizeof(peer_id); + + memset(&msg, 0, sizeof(msg)); + msg.msg_control = ctrl_buf; + msg.msg_controllen = CMSG_SPACE(ctrl_size); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(ctrl_size); + memcpy(CMSG_DATA(cmsg), fds, ctrl_size); + + ret = sendmsg(sock, &msg, 0); + if (ret == sizeof(peer_id)) { + ret = FI_SUCCESS; + } else { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "sendmsg error\n"); + ret = -FI_EIO; + } + + free(ctrl_buf); + return ret; +} + +static int smr_recvmsg_fd(int sock, int64_t *peer_id, int *fds, int nfds) +{ + struct msghdr msg; + struct cmsghdr *cmsg; + struct iovec iov; + char *ctrl_buf; + size_t ctrl_size; + int ret; + + ctrl_size = sizeof(*fds) * nfds; + ctrl_buf = calloc(CMSG_SPACE(ctrl_size), 1); + if (!ctrl_buf) + return -FI_ENOMEM; + + iov.iov_base = peer_id; + iov.iov_len = sizeof(*peer_id); + + memset(&msg, 0, sizeof(msg)); + msg.msg_control = ctrl_buf; + msg.msg_controllen = CMSG_SPACE(ctrl_size); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ret = recvmsg(sock, &msg, 0); + if (ret == sizeof(*peer_id)) { + ret = FI_SUCCESS; + } else { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "recvmsg error\n"); + ret = -FI_EIO; + goto out; + } + + assert(!(msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))); + cmsg = CMSG_FIRSTHDR(&msg); + assert(cmsg && cmsg->cmsg_len == CMSG_LEN(ctrl_size) && + cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS); + memcpy(fds, CMSG_DATA(cmsg), ctrl_size); +out: + free(ctrl_buf); + return ret; +} + +static void *smr_start_listener(void *args) +{ + struct smr_ep *ep = (struct smr_ep *) args; + struct sockaddr_un sockaddr; + 
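smr_sendmsg_fd() and smr_recvmsg_fd() above exchange device file descriptors between processes by attaching them as SCM_RIGHTS ancillary data on a connected AF_UNIX socket, so the kernel installs duplicates of the descriptors in the receiving process. A minimal sketch of passing a single descriptor that way; the connected socket is assumed to exist already and error handling is trimmed:

#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Send one open file descriptor, plus one byte of ordinary payload,
 * to the process at the other end of a connected AF_UNIX socket. */
static int send_one_fd(int sock, int fd_to_pass)
{
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;	/* keeps the control buffer aligned */
	} ctrl = {0};
	char payload = 0;
	struct iovec iov = { .iov_base = &payload, .iov_len = sizeof(payload) };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = ctrl.buf,
		.msg_controllen = sizeof(ctrl.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;	/* kernel dups the fd into the receiver */
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

	return sendmsg(sock, &msg, 0) == (ssize_t) sizeof(payload) ? 0 : -1;
}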
void *ctx[SMR_MAX_PEERS + 1]; + int i, ret, poll_fds, sock = -1; + int peer_fds[ZE_MAX_DEVICES]; + socklen_t len; + int64_t id, peer_id; + + ep->region->flags |= SMR_FLAG_IPC_SOCK; + while (1) { + poll_fds = ofi_epoll_wait(ep->sock_info->epollfd, ctx, + SMR_MAX_PEERS + 1, -1); + + if (poll_fds < 0) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "epoll error\n"); + continue; + } + + for (i = 0; i < poll_fds; i++) { + if (!ctx[i]) + goto out; + + sock = accept(ep->sock_info->listen_sock, + (struct sockaddr *) &sockaddr, &len); + if (sock < 0) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "accept error\n"); + continue; + } + + FI_DBG(&smr_prov, FI_LOG_EP_CTRL, + "EP accepted connection request from %s\n", + sockaddr.sun_path); + + ret = smr_recvmsg_fd(sock, &id, peer_fds, + ep->sock_info->nfds); + if (!ret) { + memcpy(ep->sock_info->peers[id].device_fds, + peer_fds, sizeof(*peer_fds) * + ep->sock_info->nfds); + + peer_id = smr_peer_data(ep->region)[id].addr.id; + ret = smr_sendmsg_fd(sock, id, peer_id, + ep->sock_info->my_fds, + ep->sock_info->nfds); + ep->sock_info->peers[id].state = + ret ? SMR_CMAP_FAILED : + SMR_CMAP_SUCCESS; + } + + close(sock); + unlink(sockaddr.sun_path); + } + } +out: + close(ep->sock_info->listen_sock); + unlink(ep->sock_info->name); + return NULL; +} + +static int smr_init_epoll(struct smr_sock_info *sock_info) +{ + int ret; + + ret = ofi_epoll_create(&sock_info->epollfd); + if (ret < 0) + return ret; + + ret = fd_signal_init(&sock_info->signal); + if (ret < 0) + goto err2; + + ret = ofi_epoll_add(sock_info->epollfd, + sock_info->signal.fd[FI_READ_FD], + OFI_EPOLL_IN, NULL); + if (ret != 0) + goto err1; + + ret = ofi_epoll_add(sock_info->epollfd, sock_info->listen_sock, + OFI_EPOLL_IN, sock_info); + if (ret != 0) + goto err1; + + return FI_SUCCESS; +err1: + ofi_epoll_close(sock_info->epollfd); +err2: + fd_signal_free(&sock_info->signal); + return ret; +} + +void smr_ep_exchange_fds(struct smr_ep *ep, int64_t id) +{ + struct smr_region *peer_smr = smr_peer_region(ep->region, id); + struct sockaddr_un server_sockaddr = {0}, client_sockaddr = {0}; + char *name1, *name2; + int ret = -1, sock = -1; + int64_t peer_id; + int peer_fds[ZE_MAX_DEVICES]; + + if (peer_smr->pid == ep->region->pid || + !(peer_smr->flags & SMR_FLAG_IPC_SOCK)) + goto out; + + sock = socket(AF_UNIX, SOCK_STREAM, 0); + if (sock < 0) + goto out; + + if (strcmp(smr_sock_name(ep->region), smr_sock_name(peer_smr)) < 1) { + name1 = smr_sock_name(ep->region); + name2 = smr_sock_name(peer_smr); + } else { + name1 = smr_sock_name(peer_smr); + name2 = smr_sock_name(ep->region); + } + client_sockaddr.sun_family = AF_UNIX; + snprintf(client_sockaddr.sun_path, SMR_SOCK_NAME_MAX, "%s%s:%s", + SMR_ZE_SOCK_PATH, name1, name2); + + ret = bind(sock, (struct sockaddr *) &client_sockaddr, + (socklen_t) sizeof(client_sockaddr)); + if (ret == -1) { + if (errno != EADDRINUSE) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "bind error\n"); + ep->sock_info->peers[id].state = SMR_CMAP_FAILED; + } + close(sock); + return; + } + + server_sockaddr.sun_family = AF_UNIX; + snprintf(server_sockaddr.sun_path, SMR_SOCK_NAME_MAX, "%s%s", + SMR_ZE_SOCK_PATH, smr_sock_name(peer_smr)); + + ret = connect(sock, (struct sockaddr *) &server_sockaddr, + sizeof(server_sockaddr)); + if (ret == -1) + goto cleanup; + + FI_DBG(&smr_prov, FI_LOG_EP_CTRL, "EP connected to UNIX socket %s\n", + server_sockaddr.sun_path); + + peer_id = smr_peer_data(ep->region)[id].addr.id; + ret = smr_sendmsg_fd(sock, id, peer_id, ep->sock_info->my_fds, + ep->sock_info->nfds); + if 
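The listener thread above blocks in ofi_epoll_wait() on both the listening socket and a signal fd, and smr_ep_close() pokes that signal fd so the thread falls out of its loop before pthread_join(). The provider tells the two events apart by a NULL context pointer; the rough self-pipe sketch below uses plain epoll and distinguishes them by fd instead, with illustrative names throughout:

#include <sys/epoll.h>
#include <unistd.h>

/* Build an epoll set that watches 'listen_fd' plus the read end of a
 * self-pipe; writing to the pipe from another thread unblocks epoll_wait(). */
static int setup_wakeable_epoll(int listen_fd, int wake_pipe[2], int *epfd_out)
{
	struct epoll_event ev = {0};
	int epfd;

	if (pipe(wake_pipe) < 0)
		return -1;

	epfd = epoll_create1(0);
	if (epfd < 0)
		return -1;

	ev.events = EPOLLIN;
	ev.data.fd = wake_pipe[0];	/* seeing this fd means: shut down */
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, wake_pipe[0], &ev) < 0)
		return -1;

	ev.data.fd = listen_fd;		/* seeing this fd means: accept a connection */
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev) < 0)
		return -1;

	*epfd_out = epfd;
	return 0;
}

/* Called by the thread that wants the listener to exit. */
static void wake_listener(int wake_pipe_wr)
{
	char byte = 0;
	(void) write(wake_pipe_wr, &byte, 1);
}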
(ret) + goto cleanup; + + ret = smr_recvmsg_fd(sock, &id, peer_fds, ep->sock_info->nfds); + if (ret) + goto cleanup; + + memcpy(ep->sock_info->peers[id].device_fds, peer_fds, + sizeof(*peer_fds) * ep->sock_info->nfds); + +cleanup: + close(sock); + unlink(client_sockaddr.sun_path); +out: + ep->sock_info->peers[id].state = ret ? + SMR_CMAP_FAILED : SMR_CMAP_SUCCESS; +} + +static void smr_init_ipc_socket(struct smr_ep *ep) +{ + struct smr_domain *domain; + struct smr_sock_name *sock_name; + struct sockaddr_un sockaddr = {0}; + int ret; + + ep->sock_info = calloc(1, sizeof(*ep->sock_info)); + if (!ep->sock_info) + goto err_out; + + ep->sock_info->listen_sock = socket(AF_UNIX, SOCK_STREAM, 0); + if (ep->sock_info->listen_sock < 0) + goto free; + + domain = container_of(ep->util_ep.domain, struct smr_domain, util_domain); + snprintf(smr_sock_name(ep->region), SMR_SOCK_NAME_MAX, + "%ld:%d:%d", (long) ep->region->pid, domain->dom_idx, + ep->ep_idx); + + sockaddr.sun_family = AF_UNIX; + snprintf(sockaddr.sun_path, SMR_SOCK_NAME_MAX, + "%s%s", SMR_ZE_SOCK_PATH, smr_sock_name(ep->region)); + + ret = bind(ep->sock_info->listen_sock, (struct sockaddr *) &sockaddr, + (socklen_t) sizeof(sockaddr)); + if (ret) + goto close; + + ret = listen(ep->sock_info->listen_sock, SMR_MAX_PEERS); + if (ret) + goto close; + + FI_DBG(&smr_prov, FI_LOG_EP_CTRL, "EP listening on UNIX socket %s\n", + sockaddr.sun_path); + + ret = smr_init_epoll(ep->sock_info); + if (ret) + goto close; + + sock_name = calloc(1, sizeof(*sock_name)); + if (!sock_name) + goto cleanup; + + memcpy(sock_name->name, sockaddr.sun_path, strlen(sockaddr.sun_path)); + memcpy(ep->sock_info->name, sockaddr.sun_path, + strlen(sockaddr.sun_path)); + + pthread_mutex_lock(&sock_list_lock); + dlist_insert_tail(&sock_name->entry, &sock_name_list); + pthread_mutex_unlock(&sock_list_lock); + + ep->sock_info->my_fds = ze_hmem_get_dev_fds(&ep->sock_info->nfds); + ret = pthread_create(&ep->sock_info->listener_thread, NULL, + &smr_start_listener, ep); + if (ret) + goto remove; + + return; + +remove: + pthread_mutex_lock(&sock_list_lock); + dlist_remove(&sock_name->entry); + pthread_mutex_unlock(&sock_list_lock); + free(sock_name); +cleanup: + smr_cleanup_epoll(ep->sock_info); +close: + close(ep->sock_info->listen_sock); + unlink(sockaddr.sun_path); +free: + free(ep->sock_info); + ep->sock_info = NULL; +err_out: + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "Unable to initialize IPC socket." 
+ "Defaulting to SAR for device transfers\n"); +} + static int smr_ep_ctrl(struct fid *fid, int command, void *arg) { struct smr_attr attr; @@ -393,7 +1025,8 @@ static int smr_ep_ctrl(struct fid *fid, int command, void *arg) switch (command) { case FI_ENABLE: - if (!ep->util_ep.rx_cq || !ep->util_ep.tx_cq) + if ((ofi_needs_rx(ep->util_ep.caps) && !ep->util_ep.rx_cq) || + (ofi_needs_tx(ep->util_ep.caps) && !ep->util_ep.tx_cq)) return -FI_ENOCQ; if (!ep->util_ep.av) return -FI_ENOAV; @@ -404,6 +1037,14 @@ static int smr_ep_ctrl(struct fid *fid, int command, void *arg) ret = smr_create(&smr_prov, av->smr_map, &attr, &ep->region); if (ret) return ret; + + if (ep->util_ep.caps & FI_HMEM) { + ep->region->cma_cap_peer = SMR_CMA_CAP_OFF; + ep->region->cma_cap_self = SMR_CMA_CAP_OFF; + if (ze_hmem_p2p_enabled()) + smr_init_ipc_socket(ep); + } + smr_exchange_all_peers(ep->region); break; default: @@ -424,16 +1065,16 @@ static int smr_endpoint_name(char *name, char *addr, size_t addrlen, int dom_idx, int ep_idx) { const char *start; - memset(name, 0, SMR_NAME_SIZE); - if (!addr || addrlen > SMR_NAME_SIZE) + memset(name, 0, SMR_NAME_MAX); + if (!addr || addrlen > SMR_NAME_MAX) return -FI_EINVAL; start = smr_no_prefix((const char *) addr); if (strstr(addr, SMR_PREFIX) || dom_idx || ep_idx) - snprintf(name, SMR_NAME_SIZE, "%s:%d:%d", start, dom_idx, - ep_idx); + snprintf(name, SMR_NAME_MAX - 1, "%s:%d:%d:%d", start, getuid(), + dom_idx, ep_idx); else - snprintf(name, SMR_NAME_SIZE, "%s", start); + snprintf(name, SMR_NAME_MAX - 1, "%s", start); return 0; } @@ -443,8 +1084,8 @@ int smr_endpoint(struct fid_domain *domain, struct fi_info *info, { struct smr_ep *ep; struct smr_domain *smr_domain; - int ret, ep_idx; - char name[SMR_NAME_SIZE]; + int ret; + char name[SMR_NAME_MAX]; ep = calloc(1, sizeof(*ep)); if (!ep) @@ -453,14 +1094,13 @@ int smr_endpoint(struct fid_domain *domain, struct fi_info *info, smr_domain = container_of(domain, struct smr_domain, util_domain.domain_fid); fastlock_acquire(&smr_domain->util_domain.lock); - ep_idx = smr_domain->ep_idx++; + ep->ep_idx = smr_domain->ep_idx++; fastlock_release(&smr_domain->util_domain.lock); ret = smr_endpoint_name(name, info->src_addr, info->src_addrlen, - smr_domain->dom_idx, ep_idx); + smr_domain->dom_idx, ep->ep_idx); if (ret) goto err2; - - ret = smr_setname(&ep->util_ep.ep_fid.fid, name, SMR_NAME_SIZE); + ret = smr_setname(&ep->util_ep.ep_fid.fid, name, SMR_NAME_MAX); if (ret) goto err2; @@ -474,9 +1114,12 @@ int smr_endpoint(struct fid_domain *domain, struct fi_info *info, ep->recv_fs = smr_recv_fs_create(info->rx_attr->size, NULL, NULL); ep->unexp_fs = smr_unexp_fs_create(info->rx_attr->size, NULL, NULL); ep->pend_fs = smr_pend_fs_create(info->tx_attr->size, NULL, NULL); + ep->sar_fs = smr_sar_fs_create(info->rx_attr->size, NULL, NULL); smr_init_queue(&ep->recv_queue, smr_match_msg); smr_init_queue(&ep->trecv_queue, smr_match_tagged); - smr_init_queue(&ep->unexp_queue, smr_match_unexp); + smr_init_queue(&ep->unexp_msg_queue, smr_match_unexp_msg); + smr_init_queue(&ep->unexp_tagged_queue, smr_match_unexp_tagged); + dlist_init(&ep->sar_list); ep->min_multi_recv_size = SMR_INJECT_SIZE; diff --git a/prov/shm/src/smr_fabric.c b/prov/shm/src/smr_fabric.c index fb2dc3a7729..74fe97cd909 100644 --- a/prov/shm/src/smr_fabric.c +++ b/prov/shm/src/smr_fabric.c @@ -35,13 +35,27 @@ #include "smr.h" +static int smr_wait_open(struct fid_fabric *fabric_fid, + struct fi_wait_attr *attr, + struct fid_wait **waitset) +{ + switch (attr->wait_obj) { + case 
FI_WAIT_UNSPEC: + case FI_WAIT_YIELD: + return ofi_wait_yield_open(fabric_fid, attr, waitset); + case FI_WAIT_FD: + return ofi_wait_fd_open(fabric_fid, attr, waitset); + default: + return -FI_ENOSYS; + } +} static struct fi_ops_fabric smr_fabric_ops = { .size = sizeof(struct fi_ops_fabric), .domain = smr_domain_open, .passive_ep = fi_no_passive_ep, .eq_open = ofi_eq_create, - .wait_open = ofi_wait_fd_open, + .wait_open = smr_wait_open, .trywait = ofi_trywait }; diff --git a/prov/shm/src/smr_init.c b/prov/shm/src/smr_init.c index a90d2423c27..035d0898ed9 100644 --- a/prov/shm/src/smr_init.c +++ b/prov/shm/src/smr_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017 Intel Corporation. All rights reserved. + * Copyright (c) 2015-2021 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -34,31 +34,87 @@ #include #include "smr.h" +#include "smr_signal.h" +#include +extern struct sigaction *old_action; +struct smr_env smr_env = { + .sar_threshold = SIZE_MAX, +}; + +static void smr_init_env(void) +{ + fi_param_get_size_t(&smr_prov, "sar_threshold", &smr_env.sar_threshold); + fi_param_get_size_t(&smr_prov, "tx_size", &smr_info.tx_attr->size); + fi_param_get_size_t(&smr_prov, "rx_size", &smr_info.rx_attr->size); +} static void smr_resolve_addr(const char *node, const char *service, char **addr, size_t *addrlen) { - char temp_name[SMR_NAME_SIZE]; + char temp_name[SMR_NAME_MAX]; if (service) { if (node) - snprintf(temp_name, SMR_NAME_SIZE, "%s%s:%s", + snprintf(temp_name, SMR_NAME_MAX - 1, "%s%s:%s", SMR_PREFIX_NS, node, service); else - snprintf(temp_name, SMR_NAME_SIZE, "%s%s", + snprintf(temp_name, SMR_NAME_MAX - 1, "%s%s", SMR_PREFIX_NS, service); } else { if (node) - snprintf(temp_name, SMR_NAME_SIZE, "%s%s", + snprintf(temp_name, SMR_NAME_MAX - 1, "%s%s", SMR_PREFIX, node); else - snprintf(temp_name, SMR_NAME_SIZE, "%s%d", + snprintf(temp_name, SMR_NAME_MAX - 1, "%s%d", SMR_PREFIX, getpid()); } *addr = strdup(temp_name); - *addrlen = strlen(*addr); + *addrlen = strlen(*addr) + 1; + (*addr)[*addrlen - 1] = '\0'; +} + +/* + * The smr_shm_space_check is to check if there's enough shm space we + * need under /dev/shm. 
+ * Here we use #core instead of SMR_MAX_PEERS, as it is the most likely + * value and has less possibility of failing fi_getinfo calls that are + * currently passing, and breaking currently working app + */ +static int smr_shm_space_check(size_t tx_count, size_t rx_count) +{ + struct statvfs stat; + char shm_fs[] = "/dev/shm"; + uint64_t available_size, shm_size_needed; + int num_of_core, err; + + num_of_core = ofi_sysconf(_SC_NPROCESSORS_ONLN); + if (num_of_core < 0) { + FI_WARN(&smr_prov, FI_LOG_CORE, + "Get number of processor failed (%s)\n", + strerror(errno)); + return -errno; + } + shm_size_needed = num_of_core * + smr_calculate_size_offsets(tx_count, rx_count, + NULL, NULL, NULL, + NULL, NULL, NULL, + NULL); + err = statvfs(shm_fs, &stat); + if (err) { + FI_WARN(&smr_prov, FI_LOG_CORE, + "Get filesystem %s statistics failed (%s)\n", + shm_fs, strerror(errno)); + } else { + available_size = stat.f_bsize * stat.f_bavail; + if (available_size < shm_size_needed) { + FI_WARN(&smr_prov, FI_LOG_CORE, + "Not enough available space in %s.\n", shm_fs); + return -FI_ENOSPC; + } + } + return 0; } static int smr_getinfo(uint32_t version, const char *node, const char *service, @@ -71,7 +127,7 @@ static int smr_getinfo(uint32_t version, const char *node, const char *service, int ret; mr_mode = hints && hints->domain_attr ? hints->domain_attr->mr_mode : - FI_MR_VIRT_ADDR; + FI_MR_VIRT_ADDR | FI_MR_HMEM; msg_order = hints && hints->tx_attr ? hints->tx_attr->msg_order : 0; fast_rma = smr_fast_rma_enabled(mr_mode, msg_order); @@ -80,6 +136,12 @@ static int smr_getinfo(uint32_t version, const char *node, const char *service, if (ret) return ret; + ret = smr_shm_space_check((*info)->tx_attr->size, (*info)->rx_attr->size); + if (ret) { + fi_freeinfo(*info); + return ret; + } + for (cur = *info; cur; cur = cur->next) { if (!(flags & FI_SOURCE) && !cur->dest_addr) smr_resolve_addr(node, service, (char **) &cur->dest_addr, @@ -100,19 +162,32 @@ static int smr_getinfo(uint32_t version, const char *node, const char *service, cur->ep_attr->max_order_waw_size = 0; cur->ep_attr->max_order_war_size = 0; } + if (cur->caps & FI_HMEM) { + if (!(mr_mode & FI_MR_HMEM)) { + fi_freeinfo(cur); + return -FI_ENODATA; + } + cur->domain_attr->mr_mode |= FI_MR_HMEM; + } else { + cur->domain_attr->mr_mode &= ~FI_MR_HMEM; + } } return 0; } static void smr_fini(void) { - /* yawn */ +#if HAVE_SHM_DL + ofi_hmem_cleanup(); +#endif + smr_cleanup(); + free(old_action); } struct fi_provider smr_prov = { .name = "shm", - .version = FI_VERSION(SMR_MAJOR_VERSION, SMR_MINOR_VERSION), - .fi_version = FI_VERSION(1, 8), + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, .getinfo = smr_getinfo, .fabric = smr_fabric, .cleanup = smr_fini @@ -126,5 +201,32 @@ struct util_prov smr_util_prov = { SHM_INI { +#if HAVE_SHM_DL + ofi_hmem_init(); +#endif + fi_param_define(&smr_prov, "sar_threshold", FI_PARAM_SIZE_T, + "Max size to use for alternate SAR protocol if CMA \ + is not available before switching to mmap protocol \ + Default: SIZE_MAX (18446744073709551615)"); + fi_param_define(&smr_prov, "tx_size", FI_PARAM_SIZE_T, + "Max number of outstanding tx operations \ + Default: 1024"); + fi_param_define(&smr_prov, "rx_size", FI_PARAM_SIZE_T, + "Max number of outstanding rx operations \ + Default: 1024"); + + smr_init_env(); + + old_action = calloc(SIGRTMIN, sizeof(*old_action)); + if (!old_action) + return NULL; + /* Signal handlers to cleanup tmpfs files on an unclean shutdown */ + assert(SIGBUS < SIGRTMIN && SIGSEGV < SIGRTMIN + 
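smr_shm_space_check() above estimates a worst-case tmpfs footprint (per-region size times the online core count) and fails fi_getinfo when statvfs() reports less free space than that under /dev/shm. A standalone sketch of the free-space side of that check, with the required byte count left as an illustrative parameter rather than the provider's size formula:

#include <errno.h>
#include <stdint.h>
#include <sys/statvfs.h>

/* Return 0 if /dev/shm has at least 'needed' bytes available, -ENOSPC if it
 * does not, or a negative errno if statvfs() itself fails. */
static int shm_space_ok(uint64_t needed)
{
	struct statvfs st;
	uint64_t avail;

	if (statvfs("/dev/shm", &st) != 0)
		return -errno;

	avail = (uint64_t) st.f_bsize * st.f_bavail;	/* block size * free blocks */
	return avail >= needed ? 0 : -ENOSPC;
}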
&& SIGTERM < SIGRTMIN && SIGINT < SIGRTMIN); + smr_reg_sig_hander(SIGBUS); + smr_reg_sig_hander(SIGSEGV); + smr_reg_sig_hander(SIGTERM); + smr_reg_sig_hander(SIGINT); + return &smr_prov; } diff --git a/prov/shm/src/smr_msg.c b/prov/shm/src/smr_msg.c index f2f4e5c8e76..515ffac4b65 100644 --- a/prov/shm/src/smr_msg.c +++ b/prov/shm/src/smr_msg.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Intel Corporation. All rights reserved + * Copyright (c) 2013-2021 Intel Corporation. All rights reserved * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -50,135 +50,135 @@ static inline uint16_t smr_convert_rx_flags(uint64_t fi_flags) return flags; } -static inline struct smr_ep_entry *smr_get_recv_entry(struct smr_ep *ep, uint64_t flags) +static struct smr_rx_entry *smr_get_recv_entry(struct smr_ep *ep, + const struct iovec *iov, void **desc, size_t count, fi_addr_t addr, + void *context, uint64_t tag, uint64_t ignore, uint64_t flags) { - struct smr_ep_entry *entry; + struct smr_rx_entry *entry; - if (freestack_isempty(ep->recv_fs)) + if (ofi_cirque_isfull(ep->util_ep.rx_cq->cirq) || + ofi_freestack_isempty(ep->recv_fs)) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "not enough space to post recv\n"); return NULL; + } - entry = freestack_pop(ep->recv_fs); + entry = ofi_freestack_pop(ep->recv_fs); - entry->tag = 0; /* does this need to be set? */ - entry->ignore = 0; /* does this need to be set? */ + memcpy(&entry->iov, iov, sizeof(*iov) * count); + entry->iov_count = count; + entry->context = context; entry->err = 0; entry->flags = smr_convert_rx_flags(flags); + entry->peer_id = ep->util_ep.caps & FI_DIRECTED_RECV && + addr != FI_ADDR_UNSPEC ? + smr_addr_lookup(ep->util_ep.av, addr) : -1; + entry->tag = tag; + entry->ignore = ignore; + + entry->iface = smr_get_mr_hmem_iface(ep->util_ep.domain, desc, + &entry->device); return entry; } -ssize_t smr_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, - uint64_t flags) +ssize_t smr_generic_recv(struct smr_ep *ep, const struct iovec *iov, void **desc, + size_t iov_count, fi_addr_t addr, void *context, + uint64_t tag, uint64_t ignore, uint64_t flags, + struct smr_queue *recv_queue, + struct smr_queue *unexp_queue) { - struct smr_ep_entry *entry; - struct smr_ep *ep; - ssize_t ret = 0; + struct smr_rx_entry *entry; + ssize_t ret = -FI_EAGAIN; - assert(msg->iov_count <= SMR_IOV_LIMIT); - assert(!(flags & FI_MULTI_RECV) || msg->iov_count == 1); + assert(iov_count <= SMR_IOV_LIMIT); + assert(!(flags & FI_MULTI_RECV) || iov_count == 1); - ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); + fastlock_acquire(&ep->region->lock); fastlock_acquire(&ep->util_ep.rx_cq->cq_lock); - entry = smr_get_recv_entry(ep, flags | ep->util_ep.rx_msg_flags); - if (!entry) { - ret = -FI_EAGAIN; - goto out; - } - - entry->iov_count = msg->iov_count; - memcpy(&entry->iov, msg->msg_iov, sizeof(*msg->msg_iov) * msg->iov_count); - entry->context = msg->context; - entry->addr = msg->addr; + entry = smr_get_recv_entry(ep, iov, desc, iov_count, addr, context, tag, + ignore, flags); + if (!entry) + goto out; - dlist_insert_tail(&entry->entry, &ep->recv_queue.list); + dlist_insert_tail(&entry->entry, &recv_queue->list); + ret = smr_progress_unexp_queue(ep, entry, unexp_queue); out: fastlock_release(&ep->util_ep.rx_cq->cq_lock); + fastlock_release(&ep->region->lock); return ret; } -ssize_t smr_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, - size_t count, fi_addr_t 
src_addr, void *context) +ssize_t smr_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, + uint64_t flags) { - struct smr_ep_entry *entry; struct smr_ep *ep; - ssize_t ret = 0; ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - assert(count <= SMR_IOV_LIMIT); - assert(!(smr_ep_rx_flags(ep) & FI_MULTI_RECV) || count == 1); - fastlock_acquire(&ep->util_ep.rx_cq->cq_lock); - entry = smr_get_recv_entry(ep, smr_ep_rx_flags(ep)); - if (!entry) { - ret = -FI_EAGAIN; - goto out; - } + return smr_generic_recv(ep, msg->msg_iov, msg->desc, msg->iov_count, + msg->addr, msg->context, 0, 0, + flags | ep->util_ep.rx_msg_flags, + &ep->recv_queue, &ep->unexp_msg_queue); +} - entry->iov_count = count; - memcpy(&entry->iov, iov, sizeof(*iov) * count); +ssize_t smr_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, + size_t count, fi_addr_t src_addr, void *context) +{ + struct smr_ep *ep; - entry->context = context; - entry->addr = src_addr; + ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - dlist_insert_tail(&entry->entry, &ep->recv_queue.list); -out: - fastlock_release(&ep->util_ep.rx_cq->cq_lock); - return ret; + return smr_generic_recv(ep, iov, desc, count, src_addr, context, 0, 0, + smr_ep_rx_flags(ep), &ep->recv_queue, + &ep->unexp_msg_queue); } ssize_t smr_recv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context) { - struct smr_ep_entry *entry; + struct iovec iov; struct smr_ep *ep; - ssize_t ret = 0; ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - fastlock_acquire(&ep->util_ep.rx_cq->cq_lock); - entry = smr_get_recv_entry(ep, smr_ep_rx_flags(ep)); - if (!entry) { - ret = -FI_EAGAIN; - goto out; - } - entry->iov_count = 1; - entry->iov[0].iov_base = buf; - entry->iov[0].iov_len = len; - - entry->context = context; - entry->addr = src_addr; + iov.iov_base = buf; + iov.iov_len = len; - dlist_insert_tail(&entry->entry, &ep->recv_queue.list); -out: - fastlock_release(&ep->util_ep.rx_cq->cq_lock); - return ret; + return smr_generic_recv(ep, &iov, &desc, 1, src_addr, context, 0, 0, + smr_ep_rx_flags(ep), &ep->recv_queue, + &ep->unexp_msg_queue); } static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov, - size_t iov_count, fi_addr_t addr, uint64_t tag, - uint64_t data, void *context, uint32_t op, - uint64_t op_flags) + void **desc, size_t iov_count, fi_addr_t addr, + uint64_t tag, uint64_t data, void *context, + uint32_t op, uint64_t op_flags) { struct smr_region *peer_smr; struct smr_inject_buf *tx_buf; + struct smr_sar_msg *sar; struct smr_resp *resp; - struct smr_cmd *cmd, *pend; - int peer_id; + struct smr_cmd *cmd; + struct smr_tx_entry *pend; + enum fi_hmem_iface iface; + uint64_t device; + int64_t id, peer_id; ssize_t ret = 0; size_t total_len; assert(iov_count <= SMR_IOV_LIMIT); - peer_id = (int) addr; + id = smr_verify_peer(ep, addr); + if (id < 0) + return -FI_EAGAIN; - ret = smr_verify_peer(ep, peer_id); - if (ret) - return ret; + peer_id = smr_peer_data(ep->region)[id].addr.id; + peer_smr = smr_peer_region(ep->region, id); - peer_smr = smr_peer_region(ep->region, peer_id); fastlock_acquire(&peer_smr->lock); - if (!peer_smr->cmd_cnt) { + if (!peer_smr->cmd_cnt || smr_peer_data(ep->region)[peer_id].sar_status) { ret = -FI_EAGAIN; goto unlock_region; } @@ -189,28 +189,60 @@ static ssize_t smr_generic_sendmsg(struct smr_ep *ep, const struct iovec *iov, goto unlock_cq; } + iface = smr_get_mr_hmem_iface(ep->util_ep.domain, desc, &device); + total_len = ofi_total_iov_len(iov, 
iov_count); - cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr)); + cmd = ofi_cirque_next(smr_cmd_queue(peer_smr)); + smr_generic_format(cmd, peer_id, op, tag, data, op_flags); - if (total_len <= SMR_MSG_DATA_LEN) { - smr_format_inline(cmd, smr_peer_addr(ep->region)[peer_id].addr, iov, - iov_count, op, tag, data, op_flags); - } else if (total_len <= SMR_INJECT_SIZE) { + if (total_len <= SMR_MSG_DATA_LEN && !(op_flags & FI_DELIVERY_COMPLETE)) { + smr_format_inline(cmd, iface, device, iov, iov_count); + } else if (total_len <= SMR_INJECT_SIZE && + !(op_flags & FI_DELIVERY_COMPLETE)) { tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr)); - smr_format_inject(cmd, smr_peer_addr(ep->region)[peer_id].addr, - iov, iov_count, op, tag, data, op_flags, - peer_smr, tx_buf); + smr_format_inject(cmd, iface, device, iov, iov_count, peer_smr, tx_buf); } else { if (ofi_cirque_isfull(smr_resp_queue(ep->region))) { ret = -FI_EAGAIN; goto unlock_cq; } - resp = ofi_cirque_tail(smr_resp_queue(ep->region)); - pend = freestack_pop(ep->pend_fs); - smr_format_iov(cmd, smr_peer_addr(ep->region)[peer_id].addr, iov, - iov_count, total_len, op, tag, data, op_flags, - context, ep->region, resp, pend); + resp = ofi_cirque_next(smr_resp_queue(ep->region)); + pend = ofi_freestack_pop(ep->pend_fs); + if (smr_cma_enabled(ep, peer_smr) && iface == FI_HMEM_SYSTEM) { + smr_format_iov(cmd, iov, iov_count, total_len, ep->region, + resp); + } else { + if (iface == FI_HMEM_ZE && iov_count == 1 && + smr_ze_ipc_enabled(ep->region, peer_smr)) { + ret = smr_format_ze_ipc(ep, id, cmd, iov, + device, total_len, ep->region, + resp, pend); + } else if (total_len <= smr_env.sar_threshold || + iface != FI_HMEM_SYSTEM) { + if (!peer_smr->sar_cnt) { + ret = -FI_EAGAIN; + } else { + sar = smr_freestack_pop(smr_sar_pool(peer_smr)); + smr_format_sar(cmd, iface, device, iov, + iov_count, total_len, + ep->region, peer_smr, sar, + pend, resp); + peer_smr->sar_cnt--; + smr_peer_data(ep->region)[id].sar_status = 1; + } + } else { + ret = smr_format_mmap(ep, cmd, iov, iov_count, + total_len, pend, resp); + } + if (ret) { + ofi_freestack_push(ep->pend_fs, pend); + ret = -FI_EAGAIN; + goto unlock_cq; + } + } + smr_format_pend_resp(pend, cmd, context, iface, device, iov, + iov_count, id, resp); ofi_cirque_commit(smr_resp_queue(ep->region)); goto commit; } @@ -242,7 +274,7 @@ ssize_t smr_send(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, msg_iov.iov_base = (void *) buf; msg_iov.iov_len = len; - return smr_generic_sendmsg(ep, &msg_iov, 1, dest_addr, 0, + return smr_generic_sendmsg(ep, &msg_iov, &desc, 1, dest_addr, 0, 0, context, ofi_op_msg, smr_ep_tx_flags(ep)); } @@ -254,7 +286,7 @@ ssize_t smr_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - return smr_generic_sendmsg(ep, iov, count, dest_addr, 0, + return smr_generic_sendmsg(ep, iov, desc, count, dest_addr, 0, 0, context, ofi_op_msg, smr_ep_tx_flags(ep)); } @@ -265,7 +297,7 @@ ssize_t smr_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - return smr_generic_sendmsg(ep, msg->msg_iov, msg->iov_count, + return smr_generic_sendmsg(ep, msg->msg_iov, msg->desc, msg->iov_count, msg->addr, 0, msg->data, msg->context, ofi_op_msg, flags | ep->util_ep.tx_msg_flags); } @@ -278,7 +310,7 @@ static ssize_t smr_generic_inject(struct fid_ep *ep_fid, const void *buf, struct smr_region *peer_smr; struct smr_inject_buf *tx_buf; struct smr_cmd *cmd; - int 
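The send path above now selects a protocol by payload size and memory type: inline data rides in the command itself, slightly larger sends bounce through an inject buffer, CMA-capable host memory uses direct iov copies, and device memory or very large payloads fall back to SAR below a threshold and mmap above it (the Level Zero IPC branch is omitted here). A hedged sketch of that decision order, using stand-in constants for SMR_MSG_DATA_LEN, SMR_INJECT_SIZE, and the sar_threshold environment value:

#include <stdbool.h>
#include <stddef.h>

enum xfer_proto { PROTO_INLINE, PROTO_INJECT, PROTO_IOV, PROTO_SAR, PROTO_MMAP };

#define INLINE_MAX 64UL		/* stand-in for SMR_MSG_DATA_LEN */
#define INJECT_MAX 4096UL	/* stand-in for SMR_INJECT_SIZE */

/* Mirror of the selection order used above: small sends skip the response
 * queue entirely (unless delivery-complete semantics force an ack), host
 * memory with CMA uses direct iov copies, and everything else uses SAR
 * below the threshold or the mmap fallback above it. */
static enum xfer_proto pick_proto(size_t len, bool host_mem, bool cma_ok,
				  bool delivery_complete, size_t sar_threshold)
{
	if (len <= INLINE_MAX && !delivery_complete)
		return PROTO_INLINE;
	if (len <= INJECT_MAX && !delivery_complete)
		return PROTO_INJECT;
	if (cma_ok && host_mem)
		return PROTO_IOV;
	if (len <= sar_threshold || !host_mem)
		return PROTO_SAR;
	return PROTO_MMAP;
}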
peer_id; + int64_t id, peer_id; ssize_t ret = 0; struct iovec msg_iov; @@ -288,28 +320,28 @@ static ssize_t smr_generic_inject(struct fid_ep *ep_fid, const void *buf, msg_iov.iov_len = len; ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - peer_id = (int) dest_addr; - ret = smr_verify_peer(ep, peer_id); - if (ret) - return ret; + id = smr_verify_peer(ep, dest_addr); + if (id < 0) + return -FI_EAGAIN; + + peer_id = smr_peer_data(ep->region)[id].addr.id; + peer_smr = smr_peer_region(ep->region, id); - peer_smr = smr_peer_region(ep->region, peer_id); fastlock_acquire(&peer_smr->lock); - if (!peer_smr->cmd_cnt) { + if (!peer_smr->cmd_cnt || smr_peer_data(ep->region)[id].sar_status) { ret = -FI_EAGAIN; goto unlock; } - cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr)); + cmd = ofi_cirque_next(smr_cmd_queue(peer_smr)); + smr_generic_format(cmd, peer_id, op, tag, data, op_flags); if (len <= SMR_MSG_DATA_LEN) { - smr_format_inline(cmd, smr_peer_addr(ep->region)[peer_id].addr, - &msg_iov, 1, op, tag, data, op_flags); + smr_format_inline(cmd, FI_HMEM_SYSTEM, 0, &msg_iov, 1); } else { tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr)); - smr_format_inject(cmd, smr_peer_addr(ep->region)[peer_id].addr, - &msg_iov, 1, op, tag, data, op_flags, + smr_format_inject(cmd, FI_HMEM_SYSTEM, 0, &msg_iov, 1, peer_smr, tx_buf); } ofi_ep_tx_cntr_inc_func(&ep->util_ep, op); @@ -340,8 +372,8 @@ ssize_t smr_senddata(struct fid_ep *ep_fid, const void *buf, size_t len, iov.iov_base = (void *) buf; iov.iov_len = len; - return smr_generic_sendmsg(ep, &iov, 1, dest_addr, 0, data, context, - ofi_op_msg, + return smr_generic_sendmsg(ep, &iov, &desc, 1, dest_addr, 0, data, + context, ofi_op_msg, FI_REMOTE_CQ_DATA | smr_ep_tx_flags(ep)); } @@ -365,126 +397,46 @@ struct fi_ops_msg smr_msg_ops = { .injectdata = smr_injectdata, }; -static inline struct smr_ep_entry *smr_get_trecv_entry(struct smr_ep *ep, uint64_t flags) -{ - struct smr_ep_entry *entry; - - if (freestack_isempty(ep->recv_fs)) - return NULL; - - entry = freestack_pop(ep->recv_fs); - entry->err = 0; - entry->flags = smr_convert_rx_flags(flags); - - return entry; -} - -static inline ssize_t -smr_proccess_trecv_post(struct smr_ep *ep, struct smr_ep_entry *entry) -{ - ssize_t ret; - - ret = smr_progress_unexp(ep, entry); - if (!ret || ret == -FI_EAGAIN) - return ret; - - dlist_insert_tail(&entry->entry, &ep->trecv_queue.list); - return 0; -} - ssize_t smr_trecv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context) { - struct smr_ep_entry *entry; + struct iovec iov; struct smr_ep *ep; - ssize_t ret; ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - fastlock_acquire(&ep->util_ep.rx_cq->cq_lock); - entry = smr_get_trecv_entry(ep, smr_ep_rx_flags(ep)); - if (!entry) { - ret = -FI_EAGAIN; - goto out; - } - - entry->iov_count = 1; - entry->iov[0].iov_base = buf; - entry->iov[0].iov_len = len; - entry->context = context; - entry->addr = src_addr; - entry->tag = tag; - entry->ignore = ignore; + iov.iov_base = buf; + iov.iov_len = len; - ret = smr_proccess_trecv_post(ep, entry); -out: - fastlock_release(&ep->util_ep.rx_cq->cq_lock); - return ret; + return smr_generic_recv(ep, &iov, &desc, 1, src_addr, context, tag, ignore, + smr_ep_rx_flags(ep), &ep->trecv_queue, + &ep->unexp_tagged_queue); } ssize_t smr_trecvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t tag, uint64_t ignore, void *context) { - struct smr_ep_entry *entry; 
struct smr_ep *ep; - ssize_t ret; ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - assert(count <= SMR_IOV_LIMIT); - assert(!(smr_ep_rx_flags(ep) & FI_MULTI_RECV) || count == 1); - - fastlock_acquire(&ep->util_ep.rx_cq->cq_lock); - entry = smr_get_trecv_entry(ep, smr_ep_rx_flags(ep)); - if (!entry) { - ret = -FI_EAGAIN; - goto out; - } - - entry->iov_count = count; - memcpy(&entry->iov, iov, sizeof(*iov) * count); - - entry->context = context; - entry->addr = src_addr; - entry->tag = tag; - entry->ignore = ignore; - ret = smr_proccess_trecv_post(ep, entry); -out: - fastlock_release(&ep->util_ep.rx_cq->cq_lock); - return ret; + return smr_generic_recv(ep, iov, desc, count, src_addr, context, tag, ignore, + smr_ep_rx_flags(ep), &ep->trecv_queue, + &ep->unexp_tagged_queue); } ssize_t smr_trecvmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg, uint64_t flags) { - struct smr_ep_entry *entry; struct smr_ep *ep; - ssize_t ret; - - assert(msg->iov_count <= SMR_IOV_LIMIT); - assert(!(flags & FI_MULTI_RECV) || msg->iov_count == 1); ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - fastlock_acquire(&ep->util_ep.rx_cq->cq_lock); - entry = smr_get_trecv_entry(ep, flags | ep->util_ep.rx_msg_flags); - if (!entry) { - ret = -FI_EAGAIN; - goto out; - } - - entry->iov_count = msg->iov_count; - memcpy(&entry->iov, msg->msg_iov, sizeof(*msg->msg_iov) * msg->iov_count); - - entry->context = msg->context; - entry->addr = msg->addr; - entry->tag = msg->tag; - entry->ignore = msg->ignore; - ret = smr_proccess_trecv_post(ep, entry); -out: - fastlock_release(&ep->util_ep.rx_cq->cq_lock); - return ret; + return smr_generic_recv(ep, msg->msg_iov, msg->desc, msg->iov_count, + msg->addr, msg->context, msg->tag, msg->ignore, + flags | ep->util_ep.rx_msg_flags, + &ep->trecv_queue, &ep->unexp_tagged_queue); } ssize_t smr_tsend(struct fid_ep *ep_fid, const void *buf, size_t len, @@ -498,7 +450,7 @@ ssize_t smr_tsend(struct fid_ep *ep_fid, const void *buf, size_t len, msg_iov.iov_base = (void *) buf; msg_iov.iov_len = len; - return smr_generic_sendmsg(ep, &msg_iov, 1, dest_addr, tag, + return smr_generic_sendmsg(ep, &msg_iov, &desc, 1, dest_addr, tag, 0, context, ofi_op_tagged, smr_ep_tx_flags(ep)); } @@ -511,7 +463,7 @@ ssize_t smr_tsendv(struct fid_ep *ep_fid, const struct iovec *iov, ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - return smr_generic_sendmsg(ep, iov, count, dest_addr, tag, + return smr_generic_sendmsg(ep, iov, desc, count, dest_addr, tag, 0, context, ofi_op_tagged, smr_ep_tx_flags(ep)); } @@ -523,7 +475,7 @@ ssize_t smr_tsendmsg(struct fid_ep *ep_fid, const struct fi_msg_tagged *msg, ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); - return smr_generic_sendmsg(ep, msg->msg_iov, msg->iov_count, + return smr_generic_sendmsg(ep, msg->msg_iov, msg->desc, msg->iov_count, msg->addr, msg->tag, msg->data, msg->context, ofi_op_tagged, flags | ep->util_ep.tx_msg_flags); } @@ -547,8 +499,8 @@ ssize_t smr_tsenddata(struct fid_ep *ep_fid, const void *buf, size_t len, iov.iov_base = (void *) buf; iov.iov_len = len; - return smr_generic_sendmsg(ep, &iov, 1, dest_addr, tag, data, context, - ofi_op_tagged, + return smr_generic_sendmsg(ep, &iov, &desc, 1, dest_addr, tag, data, + context, ofi_op_tagged, FI_REMOTE_CQ_DATA | smr_ep_tx_flags(ep)); } diff --git a/prov/shm/src/smr_progress.c b/prov/shm/src/smr_progress.c index af302d9543b..9636ce56b34 100644 --- a/prov/shm/src/smr_progress.c +++ b/prov/shm/src/smr_progress.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 
2013-2018 Intel Corporation. All rights reserved + * Copyright (c) 2013-2020 Intel Corporation. All rights reserved * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -35,50 +35,144 @@ #include #include "ofi_iov.h" +#include "ofi_hmem.h" #include "smr.h" -static int smr_progress_fetch(struct smr_ep *ep, struct smr_cmd *pending, - uint64_t *ret) + +static inline void smr_try_progress_to_sar(struct smr_sar_msg *sar_msg, + struct smr_resp *resp, + struct smr_cmd *cmd, enum fi_hmem_iface iface, + uint64_t device, struct iovec *iov, + size_t iov_count, size_t *bytes_done, int *next) +{ + while (*bytes_done < cmd->msg.hdr.size && + smr_copy_to_sar(sar_msg, resp, cmd, iface, device, iov, + iov_count, bytes_done, next)); +} + +static inline void smr_try_progress_from_sar(struct smr_sar_msg *sar_msg, + struct smr_resp *resp, + struct smr_cmd *cmd, enum fi_hmem_iface iface, + uint64_t device, struct iovec *iov, + size_t iov_count, size_t *bytes_done, int *next) +{ + while (*bytes_done < cmd->msg.hdr.size && + smr_copy_from_sar(sar_msg, resp, cmd, iface, device, iov, + iov_count, bytes_done, next)); +} + +static int smr_progress_resp_entry(struct smr_ep *ep, struct smr_resp *resp, + struct smr_tx_entry *pending, uint64_t *err) { struct smr_region *peer_smr; - size_t inj_offset, size; - struct smr_inject_buf *tx_buf; + size_t inj_offset; + struct smr_inject_buf *tx_buf = NULL; + struct smr_sar_msg *sar_msg = NULL; uint8_t *src; - peer_smr = smr_peer_region(ep->region, pending->msg.hdr.addr); - if (fastlock_tryacquire(&peer_smr->lock)) - return -FI_EAGAIN; + peer_smr = smr_peer_region(ep->region, pending->peer_id); - inj_offset = (size_t) pending->msg.hdr.src_data; - tx_buf = (struct smr_inject_buf *) ((char **) peer_smr + - inj_offset); + switch (pending->cmd.msg.hdr.op_src) { + case smr_src_iov: + break; + case smr_src_ipc: + close(pending->fd); + break; + case smr_src_sar: + sar_msg = smr_get_ptr(peer_smr, pending->cmd.msg.data.sar); + if (pending->bytes_done == pending->cmd.msg.hdr.size && + sar_msg->sar[0].status == SMR_SAR_FREE && + sar_msg->sar[1].status == SMR_SAR_FREE) + break; - if (*ret) - goto out; + if (pending->cmd.msg.hdr.op == ofi_op_read_req) + smr_try_progress_from_sar(sar_msg, resp, + &pending->cmd, pending->iface, + pending->device, pending->iov, + pending->iov_count, &pending->bytes_done, + &pending->next); + else + smr_try_progress_to_sar(sar_msg, resp, + &pending->cmd, pending->iface, + pending->device, pending->iov, + pending->iov_count, &pending->bytes_done, + &pending->next); + if (pending->bytes_done != pending->cmd.msg.hdr.size || + sar_msg->sar[0].status != SMR_SAR_FREE || + sar_msg->sar[1].status != SMR_SAR_FREE) + return -FI_EAGAIN; + break; + case smr_src_mmap: + if (!pending->map_name) + break; + if (pending->cmd.msg.hdr.op == ofi_op_read_req) { + if (!*err) { + pending->bytes_done = ofi_copy_to_iov(pending->iov, + pending->iov_count, 0, + pending->map_ptr, + pending->cmd.msg.hdr.size); + if (pending->bytes_done != pending->cmd.msg.hdr.size) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "Incomplete copy from mmapped file\n"); + *err = -FI_EIO; + } + } + munmap(pending->map_ptr, pending->cmd.msg.hdr.size); + } + shm_unlink(pending->map_name->name); + dlist_remove(&pending->map_name->entry); + free(pending->map_name); + pending->map_name = NULL; + break; + case smr_src_inject: + inj_offset = (size_t) pending->cmd.msg.hdr.src_data; + tx_buf = smr_get_ptr(peer_smr, inj_offset); + if (*err || 
pending->bytes_done == pending->cmd.msg.hdr.size || + pending->cmd.msg.hdr.op == ofi_op_atomic) + break; - src = pending->msg.hdr.op == ofi_op_atomic_compare ? - tx_buf->buf : tx_buf->data; - size = ofi_copy_to_iov(pending->msg.data.iov, - pending->msg.data.iov_count, - 0, src, pending->msg.hdr.size); + src = pending->cmd.msg.hdr.op == ofi_op_atomic_compare ? + tx_buf->buf : tx_buf->data; + pending->bytes_done = ofi_copy_to_iov(pending->iov, pending->iov_count, + 0, src, pending->cmd.msg.hdr.size); - if (size != pending->msg.hdr.size) { + if (pending->bytes_done != pending->cmd.msg.hdr.size) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "Incomplete rma read/fetch buffer copied\n"); + *err = FI_EIO; + } + break; + default: FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "Incomplete atomic fetch buffer copied\n"); - *ret = FI_EIO; + "unidentified operation type\n"); + } + + //Skip locking on transfers from self since we already have + //the ep->region->lock + if (peer_smr != ep->region) { + if (fastlock_tryacquire(&peer_smr->lock)) + return -FI_EAGAIN; } -out: - smr_freestack_push(smr_inject_pool(peer_smr), tx_buf); peer_smr->cmd_cnt++; - fastlock_release(&peer_smr->lock); + if (tx_buf) { + smr_freestack_push(smr_inject_pool(peer_smr), tx_buf); + } else if (sar_msg) { + smr_freestack_push(smr_sar_pool(peer_smr), sar_msg); + peer_smr->sar_cnt++; + smr_peer_data(ep->region)[pending->peer_id].sar_status = 0; + } + + if (peer_smr != ep->region) + fastlock_release(&peer_smr->lock); + return 0; } static void smr_progress_resp(struct smr_ep *ep) { struct smr_resp *resp; - struct smr_cmd *pending; + struct smr_tx_entry *pending; int ret; fastlock_acquire(&ep->region->lock); @@ -89,31 +183,31 @@ static void smr_progress_resp(struct smr_ep *ep) if (resp->status == FI_EBUSY) break; - pending = (struct smr_cmd *) resp->msg_id; - if (pending->msg.hdr.op_flags & SMR_RMA_REQ && - smr_progress_fetch(ep, pending, &resp->status)) - break; + pending = (struct smr_tx_entry *) resp->msg_id; + if (smr_progress_resp_entry(ep, resp, pending, &resp->status)) + break; - ret = smr_complete_tx(ep, (void *) (uintptr_t) pending->msg.hdr.msg_id, - pending->msg.hdr.op, pending->msg.hdr.op_flags, + ret = smr_complete_tx(ep, pending->context, + pending->cmd.msg.hdr.op, pending->cmd.msg.hdr.op_flags, -(resp->status)); if (ret) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "unable to process tx completion\n"); break; } - freestack_push(ep->pend_fs, pending); + ofi_freestack_push(ep->pend_fs, pending); ofi_cirque_discard(smr_resp_queue(ep->region)); } fastlock_release(&ep->util_ep.tx_cq->cq_lock); fastlock_release(&ep->region->lock); } -static int smr_progress_inline(struct smr_cmd *cmd, struct iovec *iov, +static int smr_progress_inline(struct smr_cmd *cmd, enum fi_hmem_iface iface, + uint64_t device, struct iovec *iov, size_t iov_count, size_t *total_len) { - *total_len = ofi_copy_to_iov(iov, iov_count, 0, cmd->msg.data.msg, - cmd->msg.hdr.size); + *total_len = ofi_copy_to_hmem_iov(iface, device, iov, iov_count, 0, + cmd->msg.data.msg, cmd->msg.hdr.size); if (*total_len != cmd->msg.hdr.size) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "recv truncated"); @@ -122,7 +216,8 @@ static int smr_progress_inline(struct smr_cmd *cmd, struct iovec *iov, return 0; } -static int smr_progress_inject(struct smr_cmd *cmd, struct iovec *iov, +static int smr_progress_inject(struct smr_cmd *cmd, enum fi_hmem_iface iface, + uint64_t device, struct iovec *iov, size_t iov_count, size_t *total_len, struct smr_ep *ep, int err) { @@ -130,22 +225,29 @@ static int 
smr_progress_inject(struct smr_cmd *cmd, struct iovec *iov, size_t inj_offset; inj_offset = (size_t) cmd->msg.hdr.src_data; - tx_buf = (struct smr_inject_buf *) ((char **) ep->region + - inj_offset); - if (err) - goto out; + tx_buf = smr_get_ptr(ep->region, inj_offset); + + if (err) { + smr_freestack_push(smr_inject_pool(ep->region), tx_buf); + return err; + } + + if (cmd->msg.hdr.op == ofi_op_read_req) { + *total_len = ofi_copy_from_hmem_iov(tx_buf->data, cmd->msg.hdr.size, + iface, device, iov, iov_count, 0); + } else { + *total_len = ofi_copy_to_hmem_iov(iface, device, iov, iov_count, 0, + tx_buf->data, cmd->msg.hdr.size); + smr_freestack_push(smr_inject_pool(ep->region), tx_buf); + } - *total_len = ofi_copy_to_iov(iov, iov_count, 0, tx_buf->data, - cmd->msg.hdr.size); if (*total_len != cmd->msg.hdr.size) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "recv truncated"); - err = -FI_EIO; + return -FI_EIO; } -out: - smr_freestack_push(smr_inject_pool(ep->region), tx_buf); - return err; + return FI_SUCCESS; } static int smr_progress_iov(struct smr_cmd *cmd, struct iovec *iov, @@ -154,43 +256,208 @@ static int smr_progress_iov(struct smr_cmd *cmd, struct iovec *iov, { struct smr_region *peer_smr; struct smr_resp *resp; - int peer_id, ret; + int ret; - peer_id = (int) cmd->msg.hdr.addr; - peer_smr = smr_peer_region(ep->region, peer_id); - resp = (struct smr_resp *) ((char **) peer_smr + - (size_t) cmd->msg.hdr.src_data); + peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id); + resp = smr_get_ptr(peer_smr, cmd->msg.hdr.src_data); if (err) { ret = -err; goto out; } - if (cmd->msg.hdr.op == ofi_op_read_req) { - ret = process_vm_writev(peer_smr->pid, iov, iov_count, - cmd->msg.data.iov, - cmd->msg.data.iov_count, 0); - } else { - ret = process_vm_readv(peer_smr->pid, iov, iov_count, - cmd->msg.data.iov, - cmd->msg.data.iov_count, 0); + ret = smr_cma_loop(peer_smr->pid, iov, iov_count, cmd->msg.data.iov, + cmd->msg.data.iov_count, 0, cmd->msg.hdr.size, + cmd->msg.hdr.op == ofi_op_read_req); + if (!ret) + *total_len = cmd->msg.hdr.size; + +out: + //Status must be set last (signals peer: op done, valid resp entry) + resp->status = ret; + + return -ret; +} + +static int smr_mmap_peer_copy(struct smr_ep *ep, struct smr_cmd *cmd, + struct iovec *iov, size_t iov_count, + size_t *total_len) +{ + char shm_name[SMR_NAME_MAX]; + void *mapped_ptr; + int fd, num; + int ret = 0; + + num = smr_mmap_name(shm_name, + ep->region->map->peers[cmd->msg.hdr.id].peer.name, + cmd->msg.hdr.msg_id); + if (num < 0) { + FI_WARN(&smr_prov, FI_LOG_AV, "generating shm file name failed\n"); + return -errno; + } + + fd = shm_open(shm_name, O_RDWR, S_IRUSR | S_IWUSR); + if (fd < 0) { + FI_WARN(&smr_prov, FI_LOG_AV, "shm_open error\n"); + return -errno; } - if (ret != cmd->msg.hdr.size) { - if (ret < 0) { + mapped_ptr = mmap(NULL, cmd->msg.hdr.size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (mapped_ptr == MAP_FAILED) { + FI_WARN(&smr_prov, FI_LOG_AV, "mmap error %s\n", strerror(errno)); + ret = -errno; + goto unlink_close; + } + + if (cmd->msg.hdr.op == ofi_op_read_req) { + *total_len = ofi_total_iov_len(iov, iov_count); + if (ofi_copy_from_iov(mapped_ptr, *total_len, iov, iov_count, 0) + != *total_len) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "CMA write error\n"); - ret = errno; - } else { + "mmap iov copy in error\n"); + ret = -FI_EIO; + goto munmap; + } + } else { + *total_len = ofi_copy_to_iov(iov, iov_count, 0, mapped_ptr, + cmd->msg.hdr.size); + if (*total_len != cmd->msg.hdr.size) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - 
"partial read occurred\n"); - ret = FI_EIO; + "mmap iov copy out error\n"); + ret = -FI_EIO; + goto munmap; } + } + +munmap: + munmap(mapped_ptr, cmd->msg.hdr.size); +unlink_close: + shm_unlink(shm_name); + close(fd); + return ret; +} + +static int smr_progress_mmap(struct smr_cmd *cmd, struct iovec *iov, + size_t iov_count, size_t *total_len, + struct smr_ep *ep) +{ + struct smr_region *peer_smr; + struct smr_resp *resp; + int ret; + + peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id); + resp = smr_get_ptr(peer_smr, cmd->msg.hdr.src_data); + + ret = smr_mmap_peer_copy(ep, cmd, iov, iov_count, total_len); + + //Status must be set last (signals peer: op done, valid resp entry) + resp->status = ret; + + return ret; +} + +static struct smr_sar_entry *smr_progress_sar(struct smr_cmd *cmd, + struct smr_rx_entry *rx_entry, enum fi_hmem_iface iface, + uint64_t device, struct iovec *iov, size_t iov_count, + size_t *total_len, struct smr_ep *ep) +{ + struct smr_region *peer_smr; + struct smr_sar_entry *sar_entry; + struct smr_sar_msg *sar_msg; + struct smr_resp *resp; + struct iovec sar_iov[SMR_IOV_LIMIT]; + int next = 0; + + sar_msg = smr_get_ptr(ep->region, cmd->msg.data.sar); + peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id); + resp = smr_get_ptr(peer_smr, cmd->msg.hdr.src_data); + + memcpy(sar_iov, iov, sizeof(*iov) * iov_count); + (void) ofi_truncate_iov(sar_iov, &iov_count, cmd->msg.hdr.size); + + if (cmd->msg.hdr.op == ofi_op_read_req) + smr_try_progress_to_sar(sar_msg, resp, cmd, iface, device, + sar_iov, iov_count, total_len, &next); + else + smr_try_progress_from_sar(sar_msg, resp, cmd, iface, device, + sar_iov, iov_count, total_len, &next); + + if (*total_len == cmd->msg.hdr.size) + return NULL; + + sar_entry = ofi_freestack_pop(ep->sar_fs); + + sar_entry->cmd = *cmd; + sar_entry->bytes_done = *total_len; + sar_entry->next = next; + memcpy(sar_entry->iov, sar_iov, sizeof(*sar_iov) * iov_count); + sar_entry->iov_count = iov_count; + if (rx_entry) { + sar_entry->rx_entry = *rx_entry; + sar_entry->rx_entry.flags |= cmd->msg.hdr.op_flags; + sar_entry->rx_entry.flags &= ~SMR_MULTI_RECV; + } else { + sar_entry->rx_entry.flags = cmd->msg.hdr.op_flags; + } + + sar_entry->iface = iface; + sar_entry->device = device; + + dlist_insert_tail(&sar_entry->entry, &ep->sar_list); + *total_len = cmd->msg.hdr.size; + return sar_entry; +} + +static int smr_progress_ipc(struct smr_cmd *cmd, enum fi_hmem_iface iface, + uint64_t device, struct iovec *iov, + size_t iov_count, size_t *total_len, + struct smr_ep *ep, int err) +{ + struct smr_region *peer_smr; + struct smr_resp *resp; + void *base, *ptr; + uint64_t ipc_device; + int64_t id; + int ret, fd, ipc_fd; + + peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id); + resp = smr_get_ptr(peer_smr, cmd->msg.hdr.src_data); + + if (iface == FI_HMEM_ZE) { + id = cmd->msg.hdr.id; + ipc_device = cmd->msg.data.ipc_info.device; + fd = ep->sock_info->peers[id].device_fds[ipc_device]; + ret = ze_hmem_open_shared_handle(fd, + (void **) &cmd->msg.data.ipc_info.fd_handle, + &ipc_fd, ipc_device, &base); + } else { + ret = ofi_hmem_open_handle(iface, + (void **) &cmd->msg.data.ipc_info.ipc_handle, + device, &base); + } + if (ret) + goto out; + + ptr = base; + if (iface == FI_HMEM_ZE) + ptr = (char *) ptr + (uintptr_t) cmd->msg.data.ipc_info.offset; + + if (cmd->msg.hdr.op == ofi_op_read_req) { + *total_len = ofi_copy_from_hmem_iov(ptr, cmd->msg.hdr.size, + iface, device, iov, + iov_count, 0); } else { - *total_len = ret; - ret = 0; + *total_len = 
ofi_copy_to_hmem_iov(iface, device, iov, + iov_count, 0, ptr, + cmd->msg.hdr.size); } + if (!ret) + *total_len = cmd->msg.hdr.size; + if (iface == FI_HMEM_ZE) + close(ipc_fd); + ret = ofi_hmem_close_handle(iface, base); out: //Status must be set last (signals peer: op done, valid resp entry) resp->status = ret; @@ -198,29 +465,21 @@ static int smr_progress_iov(struct smr_cmd *cmd, struct iovec *iov, return -ret; } -static int smr_progress_multi_recv(struct smr_ep *ep, struct smr_queue *queue, - struct smr_ep_entry *entry, size_t len) +static bool smr_progress_multi_recv(struct smr_ep *ep, + struct smr_rx_entry *entry, size_t len) { size_t left; void *new_base; - int ret; left = entry->iov[0].iov_len - len; - if (left < ep->min_multi_recv_size) { - ret = smr_complete_rx(ep, entry->context, ofi_op_msg, - SMR_MULTI_RECV |entry->flags, 0, 0, - &entry->addr, 0, 0, 0); - freestack_push(ep->recv_fs, entry); - return ret; - } + if (left < ep->min_multi_recv_size) + return true; new_base = (void *) ((uintptr_t) entry->iov[0].iov_base + len); entry->iov[0].iov_len = left; entry->iov[0].iov_base = new_base; - dlist_insert_head(&entry->entry, &queue->list); - - return 0; + return false; } static void smr_do_atomic(void *src, void *dst, void *cmp, enum fi_datatype datatype, @@ -228,14 +487,17 @@ static void smr_do_atomic(void *src, void *dst, void *cmp, enum fi_datatype data { char tmp_result[SMR_INJECT_SIZE]; - if (op >= OFI_SWAP_OP_START) { - ofi_atomic_swap_handlers[op - OFI_SWAP_OP_START][datatype](dst, - src, cmp, tmp_result, cnt); - } else if (flags & SMR_RMA_REQ) { - ofi_atomic_readwrite_handlers[op][datatype](dst, src, - tmp_result, cnt); - } else if (op != FI_ATOMIC_READ) { - ofi_atomic_write_handlers[op][datatype](dst, src, cnt); + if (ofi_atomic_isswap_op(op)) { + ofi_atomic_swap_handler(op, datatype, dst, src, cmp, + tmp_result, cnt); + } else if (flags & SMR_RMA_REQ && ofi_atomic_isreadwrite_op(op)) { + ofi_atomic_readwrite_handler(op, datatype, dst, src, + tmp_result, cnt); + } else if (ofi_atomic_iswrite_op(op)) { + ofi_atomic_write_handler(op, datatype, dst, src, cnt); + } else { + FI_WARN(&smr_prov, FI_LOG_EP_DATA, + "invalid atomic operation\n"); } if (flags & SMR_RMA_REQ) @@ -285,8 +547,7 @@ static int smr_progress_inject_atomic(struct smr_cmd *cmd, struct fi_ioc *ioc, int i; inj_offset = (size_t) cmd->msg.hdr.src_data; - tx_buf = (struct smr_inject_buf *) ((char **) ep->region + - inj_offset); + tx_buf = smr_get_ptr(ep->region, inj_offset); if (err) goto out; @@ -321,16 +582,117 @@ static int smr_progress_inject_atomic(struct smr_cmd *cmd, struct fi_ioc *ioc, return err; } +static int smr_progress_msg_common(struct smr_ep *ep, struct smr_cmd *cmd, + struct smr_rx_entry *entry) +{ + struct smr_sar_entry *sar = NULL; + size_t total_len = 0; + uint16_t comp_flags; + void *comp_buf; + int ret; + bool free_entry = true; + + switch (cmd->msg.hdr.op_src) { + case smr_src_inline: + entry->err = smr_progress_inline(cmd, entry->iface, entry->device, + entry->iov, entry->iov_count, + &total_len); + ep->region->cmd_cnt++; + break; + case smr_src_inject: + entry->err = smr_progress_inject(cmd, entry->iface, entry->device, + entry->iov, entry->iov_count, + &total_len, ep, 0); + ep->region->cmd_cnt++; + break; + case smr_src_iov: + entry->err = smr_progress_iov(cmd, entry->iov, entry->iov_count, + &total_len, ep, 0); + break; + case smr_src_mmap: + entry->err = smr_progress_mmap(cmd, entry->iov, entry->iov_count, + &total_len, ep); + break; + case smr_src_sar: + sar = smr_progress_sar(cmd, entry, 
entry->iface, entry->device, + entry->iov, entry->iov_count, &total_len, ep); + break; + case smr_src_ipc: + entry->err = smr_progress_ipc(cmd, entry->iface, entry->device, + entry->iov, entry->iov_count, + &total_len, ep, 0); + break; + default: + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "unidentified operation type\n"); + entry->err = -FI_EINVAL; + } + + comp_buf = entry->iov[0].iov_base; + comp_flags = (cmd->msg.hdr.op_flags | entry->flags) & ~SMR_MULTI_RECV; + + if (entry->flags & SMR_MULTI_RECV) { + free_entry = smr_progress_multi_recv(ep, entry, total_len); + if (free_entry) { + comp_flags |= SMR_MULTI_RECV; + if (sar) + sar->rx_entry.flags |= SMR_MULTI_RECV; + } + } + + if (!sar) { + ret = smr_complete_rx(ep, entry->context, cmd->msg.hdr.op, + comp_flags, total_len, comp_buf, cmd->msg.hdr.id, + cmd->msg.hdr.tag, cmd->msg.hdr.data, entry->err); + if (ret) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "unable to process rx completion\n"); + } + } + + if (free_entry) { + dlist_remove(&entry->entry); + ofi_freestack_push(ep->recv_fs, entry); + return 1; + } + return 0; +} + +static void smr_progress_connreq(struct smr_ep *ep, struct smr_cmd *cmd) +{ + struct smr_region *peer_smr; + struct smr_inject_buf *tx_buf; + size_t inj_offset; + int64_t idx = -1; + int ret = 0; + + inj_offset = (size_t) cmd->msg.hdr.src_data; + tx_buf = smr_get_ptr(ep->region, inj_offset); + + ret = smr_map_add(&smr_prov, ep->region->map, + (char *) tx_buf->data, &idx); + if (ret) + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "Error processing mapping request\n"); + + peer_smr = smr_peer_region(ep->region, idx); + + smr_peer_data(peer_smr)[cmd->msg.hdr.id].addr.id = idx; + + smr_peer_data(ep->region)[idx].addr.id = cmd->msg.hdr.id; + + smr_freestack_push(smr_inject_pool(ep->region), tx_buf); + ofi_cirque_discard(smr_cmd_queue(ep->region)); + ep->region->cmd_cnt++; +} + static int smr_progress_cmd_msg(struct smr_ep *ep, struct smr_cmd *cmd) { struct smr_queue *recv_queue; struct smr_match_attr match_attr; struct dlist_entry *dlist_entry; - struct smr_ep_entry *entry; struct smr_unexp_msg *unexp; - fi_addr_t addr; - size_t total_len = 0; - int err, ret = 0; + int ret; if (ofi_cirque_isfull(ep->util_ep.rx_cq->cirq)) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, @@ -341,76 +703,45 @@ static int smr_progress_cmd_msg(struct smr_ep *ep, struct smr_cmd *cmd) recv_queue = (cmd->msg.hdr.op == ofi_op_tagged) ? 
&ep->trecv_queue : &ep->recv_queue; - if (dlist_empty(&recv_queue->list)) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "no recv entry available\n"); - return -FI_ENOMSG; - } - - match_attr.addr = cmd->msg.hdr.addr; + match_attr.id = cmd->msg.hdr.id; match_attr.tag = cmd->msg.hdr.tag; - dlist_entry = dlist_remove_first_match(&recv_queue->list, - recv_queue->match_func, - &match_attr); + dlist_entry = dlist_find_first_match(&recv_queue->list, + recv_queue->match_func, + &match_attr); if (!dlist_entry) { - if (freestack_isempty(ep->unexp_fs)) + if (ofi_freestack_isempty(ep->unexp_fs)) return -FI_EAGAIN; - unexp = freestack_pop(ep->unexp_fs); + unexp = ofi_freestack_pop(ep->unexp_fs); memcpy(&unexp->cmd, cmd, sizeof(*cmd)); ofi_cirque_discard(smr_cmd_queue(ep->region)); - dlist_insert_tail(&unexp->entry, &ep->unexp_queue.list); - return ret; - } - entry = container_of(dlist_entry, struct smr_ep_entry, entry); - - switch (cmd->msg.hdr.op_src) { - case smr_src_inline: - err = smr_progress_inline(cmd, entry->iov, entry->iov_count, - &total_len); - break; - case smr_src_inject: - err = smr_progress_inject(cmd, entry->iov, entry->iov_count, - &total_len, ep, 0); - break; - case smr_src_iov: - err = smr_progress_iov(cmd, entry->iov, entry->iov_count, - &total_len, ep, 0); - break; - default: - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "unidentified operation type\n"); - err = -FI_EINVAL; - } - ret = smr_complete_rx(ep, entry->context, cmd->msg.hdr.op, - cmd->msg.hdr.op_flags | (entry->flags & ~SMR_MULTI_RECV), - total_len, entry->iov[0].iov_base, &addr, cmd->msg.hdr.tag, - cmd->msg.hdr.data, err); - if (ret) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "unable to process rx completion\n"); + if (cmd->msg.hdr.op == ofi_op_msg) { + dlist_insert_tail(&unexp->entry, &ep->unexp_msg_queue.list); + } else { + assert(cmd->msg.hdr.op == ofi_op_tagged); + dlist_insert_tail(&unexp->entry, &ep->unexp_tagged_queue.list); + } + return 0; } + ret = smr_progress_msg_common(ep, cmd, + container_of(dlist_entry, struct smr_rx_entry, entry)); ofi_cirque_discard(smr_cmd_queue(ep->region)); - ep->region->cmd_cnt++; - - if (entry->flags & SMR_MULTI_RECV) { - ret = smr_progress_multi_recv(ep, recv_queue, entry, total_len); - return ret; - } - - freestack_push(ep->recv_fs, entry); - - return ret; + return ret < 0 ? 
ret : 0; } static int smr_progress_cmd_rma(struct smr_ep *ep, struct smr_cmd *cmd) { + struct smr_region *peer_smr; struct smr_domain *domain; struct smr_cmd *rma_cmd; + struct smr_resp *resp; struct iovec iov[SMR_IOV_LIMIT]; size_t iov_count; size_t total_len = 0; - int err, ret = 0; + int err = 0, ret = 0; + struct ofi_mr *mr; + enum fi_hmem_iface iface = FI_HMEM_SYSTEM; + uint64_t device = 0; domain = container_of(ep->util_ep.domain, struct smr_domain, util_domain); @@ -426,46 +757,79 @@ static int smr_progress_cmd_rma(struct smr_ep *ep, struct smr_cmd *cmd) ep->region->cmd_cnt++; rma_cmd = ofi_cirque_head(smr_cmd_queue(ep->region)); + fastlock_acquire(&domain->util_domain.lock); for (iov_count = 0; iov_count < rma_cmd->rma.rma_count; iov_count++) { - ret = ofi_mr_verify(&domain->util_domain.mr_map, - rma_cmd->rma.rma_iov[iov_count].len, + ret = ofi_mr_map_verify(&domain->util_domain.mr_map, (uintptr_t *) &(rma_cmd->rma.rma_iov[iov_count].addr), + rma_cmd->rma.rma_iov[iov_count].len, rma_cmd->rma.rma_iov[iov_count].key, - ofi_rx_mr_reg_flags(cmd->msg.hdr.op, 0)); + ofi_rx_mr_reg_flags(cmd->msg.hdr.op, 0), (void **) &mr); if (ret) break; iov[iov_count].iov_base = (void *) rma_cmd->rma.rma_iov[iov_count].addr; iov[iov_count].iov_len = rma_cmd->rma.rma_iov[iov_count].len; + + if (!iov_count) { + iface = mr->iface; + device = mr->device; + } else { + assert(mr->iface == iface && mr->device == device); + } } + fastlock_release(&domain->util_domain.lock); + ofi_cirque_discard(smr_cmd_queue(ep->region)); - ep->region->cmd_cnt++; - if (ret) + if (ret) { + ep->region->cmd_cnt++; return ret; + } switch (cmd->msg.hdr.op_src) { case smr_src_inline: - err = smr_progress_inline(cmd, iov, iov_count, &total_len); + err = smr_progress_inline(cmd, iface, device, iov, iov_count, + &total_len); + ep->region->cmd_cnt++; break; case smr_src_inject: - err = smr_progress_inject(cmd, iov, iov_count, &total_len, ep, ret); + err = smr_progress_inject(cmd, iface, device, iov, iov_count, + &total_len, ep, ret); + if (cmd->msg.hdr.op == ofi_op_read_req && cmd->msg.hdr.data) { + peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id); + resp = smr_get_ptr(peer_smr, cmd->msg.hdr.data); + resp->status = -err; + } else { + ep->region->cmd_cnt++; + } break; case smr_src_iov: err = smr_progress_iov(cmd, iov, iov_count, &total_len, ep, ret); break; + case smr_src_mmap: + err = smr_progress_mmap(cmd, iov, iov_count, &total_len, ep); + break; + case smr_src_sar: + if (smr_progress_sar(cmd, NULL, iface, device, iov, iov_count, + &total_len, ep)) + return ret; + break; + case smr_src_ipc: + err = smr_progress_ipc(cmd, iface, device, iov, iov_count, + &total_len, ep, ret); + break; default: FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "unidentified operation type\n"); err = -FI_EINVAL; } + ret = smr_complete_rx(ep, (void *) cmd->msg.hdr.msg_id, - cmd->msg.hdr.op, cmd->msg.hdr.op_flags, - total_len, iov_count ? iov[0].iov_base : NULL, - &cmd->msg.hdr.addr, 0, - cmd->msg.hdr.data, err); + cmd->msg.hdr.op, cmd->msg.hdr.op_flags, + total_len, iov_count ? 
iov[0].iov_base : NULL, + cmd->msg.hdr.id, 0, cmd->msg.hdr.data, err); if (ret) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "unable to process rx completion\n"); + "unable to process rx completion\n"); } return ret; @@ -521,26 +885,25 @@ static int smr_progress_cmd_atomic(struct smr_ep *ep, struct smr_cmd *cmd) "unidentified operation type\n"); err = -FI_EINVAL; } - if (!(cmd->msg.hdr.op_flags & SMR_RMA_REQ)) { - ep->region->cmd_cnt++; - } else { - peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.addr); - resp = (struct smr_resp *) ((char **) peer_smr + - (size_t) cmd->msg.hdr.data); + if (cmd->msg.hdr.data) { + peer_smr = smr_peer_region(ep->region, cmd->msg.hdr.id); + resp = smr_get_ptr(peer_smr, cmd->msg.hdr.data); resp->status = -err; + } else { + ep->region->cmd_cnt++; } + if (err) FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "error processing atomic op\n"); ret = smr_complete_rx(ep, NULL, cmd->msg.hdr.op, cmd->msg.hdr.op_flags, total_len, ioc_count ? ioc[0].addr : NULL, - &cmd->msg.hdr.addr, 0, - cmd->msg.hdr.data, err); + cmd->msg.hdr.id, 0, cmd->msg.hdr.data, err); if (ret) return ret; - return err; + return err; } static void smr_progress_cmd(struct smr_ep *ep) @@ -565,7 +928,8 @@ static void smr_progress_cmd(struct smr_ep *ep) break; case ofi_op_write_async: case ofi_op_read_async: - ofi_ep_rx_cntr_inc_func(&ep->util_ep, cmd->msg.hdr.op); + ofi_ep_rx_cntr_inc_func(&ep->util_ep, + cmd->msg.hdr.op); ofi_cirque_discard(smr_cmd_queue(ep->region)); ep->region->cmd_cnt++; break; @@ -574,12 +938,14 @@ static void smr_progress_cmd(struct smr_ep *ep) case ofi_op_atomic_compare: ret = smr_progress_cmd_atomic(ep, cmd); break; + case SMR_OP_MAX + ofi_ctrl_connreq: + smr_progress_connreq(ep, cmd); + break; default: FI_WARN(&smr_prov, FI_LOG_EP_CTRL, "unidentified operation type\n"); ret = -FI_EINVAL; } - if (ret) { if (ret != -FI_EAGAIN) { FI_WARN(&smr_prov, FI_LOG_EP_CTRL, @@ -592,6 +958,55 @@ static void smr_progress_cmd(struct smr_ep *ep) fastlock_release(&ep->region->lock); } +static void smr_progress_sar_list(struct smr_ep *ep) +{ + struct smr_region *peer_smr; + struct smr_sar_msg *sar_msg; + struct smr_sar_entry *sar_entry; + struct smr_resp *resp; + struct dlist_entry *tmp; + int ret; + + fastlock_acquire(&ep->region->lock); + fastlock_acquire(&ep->util_ep.rx_cq->cq_lock); + + dlist_foreach_container_safe(&ep->sar_list, struct smr_sar_entry, + sar_entry, entry, tmp) { + sar_msg = smr_get_ptr(ep->region, sar_entry->cmd.msg.data.sar); + peer_smr = smr_peer_region(ep->region, sar_entry->cmd.msg.hdr.id); + resp = smr_get_ptr(peer_smr, sar_entry->cmd.msg.hdr.src_data); + if (sar_entry->cmd.msg.hdr.op == ofi_op_read_req) + smr_try_progress_to_sar(sar_msg, resp, &sar_entry->cmd, + sar_entry->iface, sar_entry->device, + sar_entry->iov, sar_entry->iov_count, + &sar_entry->bytes_done, &sar_entry->next); + else + smr_try_progress_from_sar(sar_msg, resp, &sar_entry->cmd, + sar_entry->iface, sar_entry->device, + sar_entry->iov, sar_entry->iov_count, + &sar_entry->bytes_done, &sar_entry->next); + + if (sar_entry->bytes_done == sar_entry->cmd.msg.hdr.size) { + ret = smr_complete_rx(ep, sar_entry->rx_entry.context, + sar_entry->cmd.msg.hdr.op, + sar_entry->rx_entry.flags, + sar_entry->bytes_done, + sar_entry->rx_entry.iov[0].iov_base, + sar_entry->cmd.msg.hdr.id, + sar_entry->cmd.msg.hdr.tag, + sar_entry->cmd.msg.hdr.data, 0); + if (ret) { + FI_WARN(&smr_prov, FI_LOG_EP_CTRL, + "unable to process rx completion\n"); + } + dlist_remove(&sar_entry->entry); + ofi_freestack_push(ep->sar_fs, sar_entry); + } + } + 
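The shm changes above consistently replace open-coded pointer arithmetic such as (char **) ep->region + inj_offset with smr_get_ptr() and smr_get_offset() when translating between a shared-memory region base and the offsets carried in command headers. Those helpers are not defined in this excerpt; the following is a minimal sketch of their presumed behavior, and the exact definitions should be treated as an assumption.

/* Presumed offset/pointer translation helpers used throughout the shm
 * changes above (not shown in this patch); definitions are an assumption.
 */
static inline void *smr_get_ptr(void *base, uint64_t offset)
{
	/* offsets carried in command headers are relative to the region base */
	return (char *) base + (uintptr_t) offset;
}

static inline uint64_t smr_get_offset(void *base, void *addr)
{
	/* inverse mapping: store a region-relative offset in the command */
	return (uint64_t) ((char *) addr - (char *) base);
}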
fastlock_release(&ep->util_ep.rx_cq->cq_lock); + fastlock_release(&ep->region->lock); +} + void smr_ep_progress(struct util_ep *util_ep) { struct smr_ep *ep; @@ -600,74 +1015,41 @@ void smr_ep_progress(struct util_ep *util_ep) smr_progress_resp(ep); smr_progress_cmd(ep); + + smr_progress_sar_list(ep); } -int smr_progress_unexp(struct smr_ep *ep, struct smr_ep_entry *entry) +int smr_progress_unexp_queue(struct smr_ep *ep, struct smr_rx_entry *entry, + struct smr_queue *unexp_queue) { struct smr_match_attr match_attr; struct smr_unexp_msg *unexp_msg; struct dlist_entry *dlist_entry; - size_t total_len = 0; - int ret = 0; - - if (ofi_cirque_isfull(ep->util_ep.rx_cq->cirq)) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "rx cq full\n"); - ret = -FI_EAGAIN; - goto push_entry; - } + int multi_recv; + int ret; - match_attr.addr = entry->addr; + match_attr.id = entry->peer_id; match_attr.ignore = entry->ignore; match_attr.tag = entry->tag; - dlist_entry = dlist_remove_first_match(&ep->unexp_queue.list, - ep->unexp_queue.match_func, + + dlist_entry = dlist_remove_first_match(&unexp_queue->list, + unexp_queue->match_func, &match_attr); if (!dlist_entry) - return -FI_ENOMSG; - - unexp_msg = container_of(dlist_entry, struct smr_unexp_msg, entry); - - switch (unexp_msg->cmd.msg.hdr.op_src) { - case smr_src_inline: - entry->err = smr_progress_inline(&unexp_msg->cmd, entry->iov, - entry->iov_count, &total_len); - break; - case smr_src_inject: - entry->err = smr_progress_inject(&unexp_msg->cmd, entry->iov, - entry->iov_count, &total_len, - ep, 0); - break; - case smr_src_iov: - entry->err = smr_progress_iov(&unexp_msg->cmd, entry->iov, - entry->iov_count, &total_len, - ep, 0); - break; - default: - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "unidentified operation type\n"); - entry->err = FI_EINVAL; - } - - ret = smr_complete_rx(ep, entry->context, unexp_msg->cmd.msg.hdr.op, - unexp_msg->cmd.msg.hdr.op_flags | entry->flags, - total_len, entry->iov[0].iov_base, &entry->addr, entry->tag, - unexp_msg->cmd.msg.hdr.data, entry->err); - if (ret) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "unable to process rx completion\n"); - } - - ep->region->cmd_cnt++; - freestack_push(ep->unexp_fs, unexp_msg); + return 0; + + multi_recv = entry->flags & SMR_MULTI_RECV; + while (dlist_entry) { + unexp_msg = container_of(dlist_entry, struct smr_unexp_msg, entry); + ret = smr_progress_msg_common(ep, &unexp_msg->cmd, entry); + ofi_freestack_push(ep->unexp_fs, unexp_msg); + if (!multi_recv || ret) + break; - if (entry->flags & SMR_MULTI_RECV) { - ret = smr_progress_multi_recv(ep, &ep->trecv_queue, entry, - total_len); - return ret ? ret : -FI_ENOMSG; + dlist_entry = dlist_remove_first_match(&unexp_queue->list, + unexp_queue->match_func, + &match_attr); } -push_entry: - freestack_push(ep->recv_fs, entry); - return ret; + return ret < 0 ? ret : 0; } diff --git a/prov/shm/src/smr_rma.c b/prov/shm/src/smr_rma.c index fd970a12508..54c569f489b 100644 --- a/prov/shm/src/smr_rma.c +++ b/prov/shm/src/smr_rma.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2018 Intel Corporation. All rights reserved + * Copyright (c) 2013-2021 Intel Corporation. All rights reserved * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -49,7 +49,7 @@ static void smr_format_rma_resp(struct smr_cmd *cmd, fi_addr_t peer_id, const struct fi_rma_iov *rma_iov, size_t count, size_t total_len, uint32_t op, uint64_t op_flags) { - smr_generic_format(cmd, peer_id, op, 0, 0, 0, 0, op_flags); + smr_generic_format(cmd, peer_id, op, 0, 0, op_flags); cmd->msg.hdr.size = total_len; } @@ -59,10 +59,11 @@ ssize_t smr_rma_fast(struct smr_region *peer_smr, struct smr_cmd *cmd, void **desc, int peer_id, void *context, uint32_t op, uint64_t op_flags) { - struct iovec rma_iovec[SMR_IOV_LIMIT]; + struct iovec cma_iovec[SMR_IOV_LIMIT], rma_iovec[SMR_IOV_LIMIT]; size_t total_len; int ret, i; + memcpy(cma_iovec, iov, sizeof(*iov) * iov_count); for (i = 0; i < rma_count; i++) { rma_iovec[i].iov_base = (void *) rma_iov[i].addr; rma_iovec[i].iov_len = rma_iov[i].len; @@ -70,26 +71,11 @@ ssize_t smr_rma_fast(struct smr_region *peer_smr, struct smr_cmd *cmd, total_len = ofi_total_iov_len(iov, iov_count); - if (op == ofi_op_write) { - ret = process_vm_writev(peer_smr->pid, iov, iov_count, - rma_iovec, rma_count, 0); - } else { - ret = process_vm_readv(peer_smr->pid, iov, iov_count, - rma_iovec, rma_count, 0); - } + ret = smr_cma_loop(peer_smr->pid, cma_iovec, iov_count, rma_iovec, + rma_count, 0, total_len, op == ofi_op_write); - if (ret != total_len) { - if (ret < 0) { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "CMA write error\n"); - ret = -errno; - } else { - FI_WARN(&smr_prov, FI_LOG_EP_CTRL, - "unable to process tx completion\n"); - ret = -FI_EIO; - } + if (ret) return ret; - } smr_format_rma_resp(cmd, peer_id, rma_iov, rma_count, total_len, (op == ofi_op_write) ? ofi_op_write_async : @@ -106,9 +92,14 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov, struct smr_domain *domain; struct smr_region *peer_smr; struct smr_inject_buf *tx_buf; + struct smr_sar_msg *sar; struct smr_resp *resp; - struct smr_cmd *cmd, *pend; - int peer_id, cmds, err = 0, comp = 1; + struct smr_cmd *cmd; + struct smr_tx_entry *pend; + enum fi_hmem_iface iface; + uint64_t device; + int64_t id, peer_id; + int cmds, err = 0, comp = 1; uint16_t comp_flags; ssize_t ret = 0; size_t total_len; @@ -118,17 +109,20 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov, domain = container_of(ep->util_ep.domain, struct smr_domain, util_domain); - peer_id = (int) addr; - ret = smr_verify_peer(ep, peer_id); - if (ret) - return ret; + id = smr_verify_peer(ep, addr); + if (id < 0) + return -FI_EAGAIN; + + peer_id = smr_peer_data(ep->region)[id].addr.id; + peer_smr = smr_peer_region(ep->region, id); - cmds = 1 + !(domain->fast_rma && !(op_flags & FI_REMOTE_CQ_DATA) && - rma_count == 1); + cmds = 1 + !(domain->fast_rma && !(op_flags & + (FI_REMOTE_CQ_DATA | FI_DELIVERY_COMPLETE)) && + rma_count == 1 && smr_cma_enabled(ep, peer_smr)); - peer_smr = smr_peer_region(ep->region, peer_id); fastlock_acquire(&peer_smr->lock); - if (peer_smr->cmd_cnt < cmds) { + if (peer_smr->cmd_cnt < cmds || + smr_peer_data(ep->region)[id].sar_status) { ret = -FI_EAGAIN; goto unlock_region; } @@ -139,35 +133,84 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov, goto unlock_cq; } - cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr)); + cmd = ofi_cirque_next(smr_cmd_queue(peer_smr)); if (cmds == 1) { err = smr_rma_fast(peer_smr, cmd, iov, iov_count, rma_iov, - rma_count, desc, peer_id, context, op, op_flags); + rma_count, desc, peer_id, context, op, + op_flags); comp_flags = cmd->msg.hdr.op_flags; goto commit_comp; 
} + iface = smr_get_mr_hmem_iface(ep->util_ep.domain, desc, &device); + total_len = ofi_total_iov_len(iov, iov_count); - if (total_len <= SMR_MSG_DATA_LEN && op == ofi_op_write) { - smr_format_inline(cmd, smr_peer_addr(ep->region)[peer_id].addr, - iov, iov_count, op, 0, data, op_flags); - } else if (total_len <= SMR_INJECT_SIZE && op == ofi_op_write) { + smr_generic_format(cmd, peer_id, op, 0, data, op_flags); + if (total_len <= SMR_MSG_DATA_LEN && op == ofi_op_write && + !(op_flags & FI_DELIVERY_COMPLETE)) { + smr_format_inline(cmd, iface, device, iov, iov_count); + } else if (total_len <= SMR_INJECT_SIZE && + !(op_flags & FI_DELIVERY_COMPLETE)) { tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr)); - smr_format_inject(cmd, smr_peer_addr(ep->region)[peer_id].addr, - iov, iov_count, op, 0, data, op_flags, - peer_smr, tx_buf); + smr_format_inject(cmd, iface, device, iov, iov_count, peer_smr, tx_buf); + if (op == ofi_op_read_req) { + if (ofi_cirque_isfull(smr_resp_queue(ep->region))) { + smr_freestack_push(smr_inject_pool(peer_smr), tx_buf); + ret = -FI_EAGAIN; + goto unlock_cq; + } + cmd->msg.hdr.op_flags |= SMR_RMA_REQ; + resp = ofi_cirque_next(smr_resp_queue(ep->region)); + pend = ofi_freestack_pop(ep->pend_fs); + smr_format_pend_resp(pend, cmd, context, iface, device, iov, + iov_count, id, resp); + cmd->msg.hdr.data = smr_get_offset(ep->region, resp); + ofi_cirque_commit(smr_resp_queue(ep->region)); + comp = 0; + } } else { if (ofi_cirque_isfull(smr_resp_queue(ep->region))) { ret = -FI_EAGAIN; goto unlock_cq; } - resp = ofi_cirque_tail(smr_resp_queue(ep->region)); - pend = freestack_pop(ep->pend_fs); - smr_format_iov(cmd, smr_peer_addr(ep->region)[peer_id].addr, - iov, iov_count, total_len, op, 0, data, - op_flags, context, ep->region, resp, pend); + resp = ofi_cirque_next(smr_resp_queue(ep->region)); + pend = ofi_freestack_pop(ep->pend_fs); + if (smr_cma_enabled(ep, peer_smr) && iface == FI_HMEM_SYSTEM) { + smr_format_iov(cmd, iov, iov_count, total_len, ep->region, + resp); + } else { + if (iface == FI_HMEM_ZE && iov_count == 1 && + smr_ze_ipc_enabled(ep->region, peer_smr)) { + ret = smr_format_ze_ipc(ep, id, cmd, iov, + device, total_len, ep->region, + resp, pend); + } else if (total_len <= smr_env.sar_threshold || + iface != FI_HMEM_SYSTEM) { + if (!peer_smr->sar_cnt) { + ret = -FI_EAGAIN; + } else { + sar = smr_freestack_pop(smr_sar_pool(peer_smr)); + smr_format_sar(cmd, iface, device, iov, + iov_count, total_len, + ep->region, peer_smr, sar, + pend, resp); + peer_smr->sar_cnt--; + smr_peer_data(ep->region)[id].sar_status = 1; + } + } else { + ret = smr_format_mmap(ep, cmd, iov, iov_count, + total_len, pend, resp); + } + if (ret) { + ofi_freestack_push(ep->pend_fs, pend); + ret = -FI_EAGAIN; + goto unlock_cq; + } + } + smr_format_pend_resp(pend, cmd, context, iface, device, iov, + iov_count, id, resp); ofi_cirque_commit(smr_resp_queue(ep->region)); comp = 0; } @@ -175,7 +218,7 @@ ssize_t smr_generic_rma(struct smr_ep *ep, const struct iovec *iov, comp_flags = cmd->msg.hdr.op_flags; ofi_cirque_commit(smr_cmd_queue(peer_smr)); peer_smr->cmd_cnt--; - cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr)); + cmd = ofi_cirque_next(smr_cmd_queue(peer_smr)); smr_format_rma_iov(cmd, rma_iov, rma_count); commit_comp: @@ -213,7 +256,7 @@ ssize_t smr_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, rma_iov.len = len; rma_iov.key = key; - return smr_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc, + return smr_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc, src_addr, context, 
ofi_op_read_req, 0, smr_ep_tx_flags(ep)); } @@ -265,7 +308,7 @@ ssize_t smr_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc rma_iov.len = len; rma_iov.key = key; - return smr_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc, + return smr_generic_rma(ep, &msg_iov, 1, &rma_iov, 1, &desc, dest_addr, context, ofi_op_write, 0, smr_ep_tx_flags(ep)); } @@ -314,23 +357,27 @@ ssize_t smr_generic_rma_inject(struct fid_ep *ep_fid, const void *buf, struct smr_cmd *cmd; struct iovec iov; struct fi_rma_iov rma_iov; - int peer_id, cmds; + int64_t id, peer_id; + int cmds; ssize_t ret = 0; assert(len <= SMR_INJECT_SIZE); ep = container_of(ep_fid, struct smr_ep, util_ep.ep_fid.fid); domain = container_of(ep->util_ep.domain, struct smr_domain, util_domain); - peer_id = (int) dest_addr; - ret = smr_verify_peer(ep, peer_id); - if (ret) - return ret; + id = smr_verify_peer(ep, dest_addr); + if (id < 0) + return -FI_EAGAIN; - cmds = 1 + !(domain->fast_rma && !(flags & FI_REMOTE_CQ_DATA)); + peer_id = smr_peer_data(ep->region)[id].addr.id; + peer_smr = smr_peer_region(ep->region, id); + + cmds = 1 + !(domain->fast_rma && !(flags & FI_REMOTE_CQ_DATA) && + smr_cma_enabled(ep, peer_smr)); - peer_smr = smr_peer_region(ep->region, peer_id); fastlock_acquire(&peer_smr->lock); - if (peer_smr->cmd_cnt < cmds) { + if (peer_smr->cmd_cnt < cmds || + smr_peer_data(ep->region)[id].sar_status) { ret = -FI_EAGAIN; goto unlock_region; } @@ -341,7 +388,7 @@ ssize_t smr_generic_rma_inject(struct fid_ep *ep_fid, const void *buf, rma_iov.len = len; rma_iov.key = key; - cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr)); + cmd = ofi_cirque_next(smr_cmd_queue(peer_smr)); if (cmds == 1) { ret = smr_rma_fast(peer_smr, cmd, &iov, 1, &rma_iov, 1, NULL, @@ -351,19 +398,18 @@ ssize_t smr_generic_rma_inject(struct fid_ep *ep_fid, const void *buf, goto commit; } + smr_generic_format(cmd, peer_id, ofi_op_write, 0, data, flags); if (len <= SMR_MSG_DATA_LEN) { - smr_format_inline(cmd, smr_peer_addr(ep->region)[peer_id].addr, - &iov, 1, ofi_op_write, 0, data, flags); + smr_format_inline(cmd, FI_HMEM_SYSTEM, 0, &iov, 1); } else { tx_buf = smr_freestack_pop(smr_inject_pool(peer_smr)); - smr_format_inject(cmd, smr_peer_addr(ep->region)[peer_id].addr, - &iov, 1, ofi_op_write, 0, data, - flags, peer_smr, tx_buf); + smr_format_inject(cmd, FI_HMEM_SYSTEM, 0, &iov, 1, + peer_smr, tx_buf); } ofi_cirque_commit(smr_cmd_queue(peer_smr)); peer_smr->cmd_cnt--; - cmd = ofi_cirque_tail(smr_cmd_queue(peer_smr)); + cmd = ofi_cirque_next(smr_cmd_queue(peer_smr)); smr_format_rma_iov(cmd, &rma_iov, 1); commit: diff --git a/prov/shm/src/smr_signal.h b/prov/shm/src/smr_signal.h new file mode 100644 index 00000000000..8c2dde5773b --- /dev/null +++ b/prov/shm/src/smr_signal.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. + * Copyright (c) 2020-2021 Intel Corporation. + * All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _SMR_SIGNAL_H_ +#define _SMR_SIGNAL_H_ +#include +#include + +struct sigaction *old_action; + +static void smr_handle_signal(int signum, siginfo_t *info, void *ucontext) +{ + struct smr_ep_name *ep_name; + struct smr_sock_name *sock_name; + int ret; + + dlist_foreach_container(&ep_name_list, struct smr_ep_name, + ep_name, entry) { + shm_unlink(ep_name->name); + } + dlist_foreach_container(&sock_name_list, struct smr_sock_name, + sock_name, entry) { + unlink(sock_name->name); + } + + /* Register the original signum handler, SIG_DFL or otherwise */ + ret = sigaction(signum, &old_action[signum], NULL); + if (ret) + return; + + /* call the original handler */ + if (old_action[signum].sa_flags & SA_SIGINFO) + old_action[signum].sa_sigaction(signum, info, ucontext); + else if (old_action[signum].sa_handler == SIG_DFL || + old_action[signum].sa_handler == SIG_IGN) + return; + else + old_action[signum].sa_handler(signum); + +} + +static void smr_reg_sig_hander(int signum) +{ + struct sigaction action; + int ret; + + memset(&action, 0, sizeof(action)); + action.sa_sigaction = smr_handle_signal; + action.sa_flags |= SA_SIGINFO | SA_ONSTACK; + + ret = sigaction(signum, &action, &old_action[signum]); + if (ret) + FI_WARN(&smr_prov, FI_LOG_FABRIC, + "Unable to register handler for sig %d\n", signum); +} + +#endif /* _SMR_SIGNAL_H_ */ diff --git a/prov/sockets/Makefile.include b/prov/sockets/Makefile.include index 3cf4a0722db..2e8024cd7a7 100644 --- a/prov/sockets/Makefile.include +++ b/prov/sockets/Makefile.include @@ -5,6 +5,7 @@ if HAVE_SOCKETS AM_CPPFLAGS += -I$(top_srcdir)/prov/sockets/include -I$(top_srcdir)/prov/sockets _sockets_files = \ + prov/sockets/src/sock_attr.c \ prov/sockets/src/sock_av.c \ prov/sockets/src/sock_dom.c \ prov/sockets/src/sock_mr.c \ diff --git a/prov/sockets/include/sock.h b/prov/sockets/include/sock.h index 4fe87cf5bb2..9b8186ea96a 100644 --- a/prov/sockets/include/sock.h +++ b/prov/sockets/include/sock.h @@ -109,42 +109,11 @@ #define SOCK_CM_DEF_RETRY (5) #define SOCK_CM_CONN_IN_PROGRESS ((struct sock_conn *)(0x1L)) -#define SOCK_EP_RDM_PRI_CAP (FI_MSG | FI_RMA | FI_TAGGED | FI_ATOMICS | \ - FI_NAMED_RX_CTX | \ - FI_DIRECTED_RECV | \ - FI_READ | FI_WRITE | FI_RECV | FI_SEND | \ - FI_REMOTE_READ | FI_REMOTE_WRITE) - -#define SOCK_EP_RDM_SEC_CAP_BASE (FI_MULTI_RECV | FI_SOURCE | FI_RMA_EVENT | \ - FI_SHARED_AV | FI_FENCE | FI_TRIGGER) -extern uint64_t SOCK_EP_RDM_SEC_CAP; - -#define SOCK_EP_RDM_CAP_BASE (SOCK_EP_RDM_PRI_CAP | SOCK_EP_RDM_SEC_CAP_BASE) -extern uint64_t SOCK_EP_RDM_CAP; - -#define SOCK_EP_MSG_PRI_CAP SOCK_EP_RDM_PRI_CAP - -#define SOCK_EP_MSG_SEC_CAP_BASE SOCK_EP_RDM_SEC_CAP_BASE -extern uint64_t SOCK_EP_MSG_SEC_CAP; - -#define SOCK_EP_MSG_CAP_BASE (SOCK_EP_MSG_PRI_CAP | SOCK_EP_MSG_SEC_CAP_BASE) -extern uint64_t SOCK_EP_MSG_CAP; - 
-#define SOCK_EP_DGRAM_PRI_CAP (FI_MSG | FI_TAGGED | \ - FI_NAMED_RX_CTX | FI_DIRECTED_RECV | \ - FI_RECV | FI_SEND) - -#define SOCK_EP_DGRAM_SEC_CAP (FI_MULTI_RECV | FI_SOURCE | FI_SHARED_AV | \ - FI_FENCE | FI_TRIGGER) - -#define SOCK_EP_DGRAM_CAP (SOCK_EP_DGRAM_PRI_CAP | SOCK_EP_DGRAM_SEC_CAP) - #define SOCK_EP_MSG_ORDER (OFI_ORDER_RAR_SET | OFI_ORDER_RAW_SET | FI_ORDER_RAS| \ OFI_ORDER_WAR_SET | OFI_ORDER_WAW_SET | FI_ORDER_WAS | \ FI_ORDER_SAR | FI_ORDER_SAW | FI_ORDER_SAS) #define SOCK_EP_COMP_ORDER (FI_ORDER_STRICT | FI_ORDER_DATA) -#define SOCK_EP_DEFAULT_OP_FLAGS (FI_TRANSMIT_COMPLETE) #define SOCK_EP_CQ_FLAGS (FI_SEND | FI_TRANSMIT | FI_RECV | \ FI_SELECTIVE_COMPLETION) @@ -177,11 +146,26 @@ enum { SOCK_OPTS_KEEPALIVE = 1<<1 }; -#define SOCK_MAJOR_VERSION 2 -#define SOCK_MINOR_VERSION 0 - #define SOCK_WIRE_PROTO_VERSION (2) +extern struct fi_info sock_dgram_info; +extern struct fi_info sock_msg_info; + +extern struct util_prov sock_util_prov; +extern struct fi_domain_attr sock_domain_attr; +extern struct fi_fabric_attr sock_fabric_attr; +extern struct fi_tx_attr sock_msg_tx_attr; +extern struct fi_tx_attr sock_rdm_tx_attr; +extern struct fi_tx_attr sock_dgram_tx_attr; +extern struct fi_rx_attr sock_msg_rx_attr; +extern struct fi_rx_attr sock_rdm_rx_attr; +extern struct fi_rx_attr sock_dgram_rx_attr; +extern struct fi_ep_attr sock_msg_ep_attr; +extern struct fi_ep_attr sock_rdm_ep_attr; +extern struct fi_ep_attr sock_dgram_ep_attr; +extern struct fi_tx_attr sock_stx_attr; +extern struct fi_rx_attr sock_srx_attr; + struct sock_service_entry { int service; struct dlist_entry entry; @@ -212,7 +196,7 @@ struct sock_conn { struct sock_conn_map { struct sock_conn *table; - fi_epoll_t epoll_set; + ofi_epoll_t epoll_set; void **epoll_ctxs; int epoll_ctxs_sz; int used; @@ -221,20 +205,22 @@ struct sock_conn_map { }; struct sock_conn_listener { - fi_epoll_t emap; + ofi_epoll_t epollfd; struct fd_signal signal; fastlock_t signal_lock; /* acquire before map lock */ pthread_t listener_thread; int do_listen; + bool removed_from_epollfd; }; struct sock_ep_cm_head { - fi_epoll_t emap; + ofi_epoll_t epollfd; struct fd_signal signal; - fastlock_t signal_lock; + pthread_mutex_t signal_lock; pthread_t listener_thread; struct dlist_entry msg_list; int do_listen; + bool removed_from_epollfd; }; struct sock_domain { @@ -377,6 +363,7 @@ struct sock_av { int shared; struct dlist_entry ep_list; fastlock_t list_lock; + fastlock_t table_lock; }; struct sock_fid_list { @@ -480,7 +467,7 @@ struct sock_eq_entry { size_t len; uint64_t flags; struct dlist_entry entry; - char event[0]; + char event[]; }; struct sock_eq_err_data_entry { @@ -669,6 +656,8 @@ struct sock_rx_ctx { struct dlist_entry ep_list; fastlock_t lock; + struct dlist_entry *progress_start; + struct fi_rx_attr attr; struct sock_rx_entry *rx_entry_pool; struct slist pool_list; @@ -877,7 +866,7 @@ struct sock_pe { pthread_t progress_thread; volatile int do_progress; struct sock_pe_entry *pe_atomic; - fi_epoll_t epoll_set; + ofi_epoll_t epoll_set; }; typedef int (*sock_cq_report_fn) (struct sock_cq *cq, fi_addr_t addr, @@ -887,7 +876,7 @@ struct sock_cq_overflow_entry_t { size_t len; fi_addr_t addr; struct dlist_entry entry; - char cq_entry[0]; + char cq_entry[]; }; struct sock_cq { @@ -901,8 +890,8 @@ struct sock_cq { struct ofi_ringbuffd cq_rbfd; struct ofi_ringbuf cqerr_rb; struct dlist_entry overflow_list; - fastlock_t lock; - fastlock_t list_lock; + pthread_mutex_t lock; + pthread_mutex_t list_lock; struct fid_wait *waitset; int signal; @@ 
-927,7 +916,7 @@ struct sock_conn_req { struct sock_conn_hdr hdr; union ofi_sock_ip src_addr; uint64_t caps; - char cm_data[0]; + char cm_data[]; }; enum { @@ -999,20 +988,7 @@ union sock_tx_op { }; #define SOCK_EP_TX_ENTRY_SZ (sizeof(union sock_tx_op)) -int sock_verify_info(uint32_t version, const struct fi_info *hints); -int sock_verify_fabric_attr(const struct fi_fabric_attr *attr); -int sock_verify_domain_attr(uint32_t version, const struct fi_info *info); - size_t sock_get_tx_size(size_t size); -int sock_rdm_verify_ep_attr(const struct fi_ep_attr *ep_attr, - const struct fi_tx_attr *tx_attr, - const struct fi_rx_attr *rx_attr); -int sock_dgram_verify_ep_attr(const struct fi_ep_attr *ep_attr, - const struct fi_tx_attr *tx_attr, - const struct fi_rx_attr *rx_attr); -int sock_msg_verify_ep_attr(const struct fi_ep_attr *ep_attr, - const struct fi_tx_attr *tx_attr, - const struct fi_rx_attr *rx_attr); int sock_get_src_addr(union ofi_sock_ip *dest_addr, union ofi_sock_ip *src_addr); int sock_get_src_addr_from_hostname(union ofi_sock_ip *src_addr, @@ -1021,12 +997,6 @@ int sock_get_src_addr_from_hostname(union ofi_sock_ip *src_addr, struct fi_info *sock_fi_info(uint32_t version, enum fi_ep_type ep_type, const struct fi_info *hints, void *src_addr, void *dest_addr); -int sock_msg_fi_info(uint32_t version, void *src_addr, void *dest_addr, - const struct fi_info *hints, struct fi_info **info); -int sock_dgram_fi_info(uint32_t version, void *src_addr, void *dest_addr, - const struct fi_info *hints, struct fi_info **info); -int sock_rdm_fi_info(uint32_t version, void *src_addr, void *dest_addr, - const struct fi_info *hints, struct fi_info **info); void free_fi_info(struct fi_info *info); int sock_msg_getinfo(uint32_t version, const char *node, const char *service, @@ -1250,7 +1220,6 @@ static inline size_t sock_rx_avail_len(struct sock_rx_entry *rx_entry) int sock_ep_cm_start_thread(struct sock_ep_cm_head *cm_head); void sock_ep_cm_signal(struct sock_ep_cm_head *cm_head); -void sock_ep_cm_signal_locked(struct sock_ep_cm_head *cm_head); void sock_ep_cm_stop_thread(struct sock_ep_cm_head *cm_head); void sock_ep_cm_wait_handle_finalized(struct sock_ep_cm_head *cm_head, struct sock_conn_req_handle *handle); diff --git a/prov/sockets/src/sock_attr.c b/prov/sockets/src/sock_attr.c new file mode 100644 index 00000000000..6039c73b36d --- /dev/null +++ b/prov/sockets/src/sock_attr.c @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2020 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "sock.h" + +#define SOCK_MSG_TX_CAPS (OFI_TX_MSG_CAPS | FI_TAGGED | OFI_TX_RMA_CAPS | \ + FI_ATOMICS | FI_NAMED_RX_CTX | FI_FENCE | FI_TRIGGER) +#define SOCK_MSG_RX_CAPS (OFI_RX_MSG_CAPS | FI_TAGGED | OFI_RX_RMA_CAPS | \ + FI_ATOMICS | FI_DIRECTED_RECV | FI_MULTI_RECV | \ + FI_RMA_EVENT | FI_SOURCE | FI_TRIGGER) + +#define SOCK_RDM_TX_CAPS (OFI_TX_MSG_CAPS | FI_TAGGED | OFI_TX_RMA_CAPS | \ + FI_ATOMICS | FI_NAMED_RX_CTX | FI_FENCE | FI_TRIGGER | \ + FI_RMA_PMEM) +#define SOCK_RDM_RX_CAPS (OFI_RX_MSG_CAPS | FI_TAGGED | OFI_RX_RMA_CAPS | \ + FI_ATOMICS | FI_DIRECTED_RECV | FI_MULTI_RECV | \ + FI_RMA_EVENT | FI_SOURCE | FI_TRIGGER | FI_RMA_PMEM) + +#define SOCK_DGRAM_TX_CAPS (OFI_TX_MSG_CAPS | FI_TAGGED | FI_NAMED_RX_CTX | \ + FI_FENCE | FI_TRIGGER) +#define SOCK_DGRAM_RX_CAPS (OFI_RX_MSG_CAPS | FI_TAGGED | FI_DIRECTED_RECV | \ + FI_MULTI_RECV | FI_SOURCE | FI_TRIGGER) + +#define SOCK_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM | FI_SHARED_AV) + +#define SOCK_TX_OP_FLAGS (FI_COMMIT_COMPLETE | FI_COMPLETION | \ + FI_DELIVERY_COMPLETE | FI_INJECT | FI_INJECT_COMPLETE | \ + FI_MULTICAST | FI_TRANSMIT_COMPLETE) +#define SOCK_RX_OP_FLAGS (FI_COMMIT_COMPLETE | FI_COMPLETION | \ + FI_DELIVERY_COMPLETE | FI_INJECT | FI_INJECT_COMPLETE | \ + FI_MULTI_RECV | FI_TRANSMIT_COMPLETE) + +struct fi_ep_attr sock_msg_ep_attr = { + .type = FI_EP_MSG, + .protocol = FI_PROTO_SOCK_TCP, + .protocol_version = SOCK_WIRE_PROTO_VERSION, + .max_msg_size = SOCK_EP_MAX_MSG_SZ, + .msg_prefix_size = SOCK_EP_MSG_PREFIX_SZ, + .max_order_raw_size = SOCK_EP_MAX_ORDER_RAW_SZ, + .max_order_war_size = SOCK_EP_MAX_ORDER_WAR_SZ, + .max_order_waw_size = SOCK_EP_MAX_ORDER_WAW_SZ, + .mem_tag_format = SOCK_EP_MEM_TAG_FMT, + .tx_ctx_cnt = SOCK_EP_MAX_TX_CNT, + .rx_ctx_cnt = SOCK_EP_MAX_RX_CNT, +}; + +struct fi_tx_attr sock_msg_tx_attr = { + .caps = SOCK_MSG_TX_CAPS, + .mode = SOCK_MODE, + .op_flags = SOCK_TX_OP_FLAGS, + .msg_order = SOCK_EP_MSG_ORDER, + .inject_size = SOCK_EP_MAX_INJECT_SZ, + .size = SOCK_EP_TX_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, + .rma_iov_limit = SOCK_EP_MAX_IOV_LIMIT, +}; + +struct fi_rx_attr sock_msg_rx_attr = { + .caps = SOCK_MSG_RX_CAPS, + .mode = SOCK_MODE, + .op_flags = SOCK_RX_OP_FLAGS, + .msg_order = SOCK_EP_MSG_ORDER, + .comp_order = SOCK_EP_COMP_ORDER, + .total_buffered_recv = SOCK_EP_MAX_BUFF_RECV, + .size = SOCK_EP_RX_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, +}; + +struct fi_ep_attr sock_dgram_ep_attr = { + .type = FI_EP_DGRAM, + .protocol = FI_PROTO_SOCK_TCP, + .protocol_version = SOCK_WIRE_PROTO_VERSION, + .max_msg_size = SOCK_EP_MAX_MSG_SZ, + .msg_prefix_size = SOCK_EP_MSG_PREFIX_SZ, + .max_order_raw_size = SOCK_EP_MAX_ORDER_RAW_SZ, + .max_order_war_size = SOCK_EP_MAX_ORDER_WAR_SZ, + .max_order_waw_size = SOCK_EP_MAX_ORDER_WAW_SZ, + .mem_tag_format = SOCK_EP_MEM_TAG_FMT, + .tx_ctx_cnt = SOCK_EP_MAX_TX_CNT, + .rx_ctx_cnt = SOCK_EP_MAX_RX_CNT, +}; + +struct fi_ep_attr sock_rdm_ep_attr = { + .type = FI_EP_RDM, + .protocol = FI_PROTO_SOCK_TCP, + .protocol_version = SOCK_WIRE_PROTO_VERSION, + .max_msg_size = SOCK_EP_MAX_MSG_SZ, + .msg_prefix_size = SOCK_EP_MSG_PREFIX_SZ, + .max_order_raw_size = SOCK_EP_MAX_ORDER_RAW_SZ, + .max_order_war_size = SOCK_EP_MAX_ORDER_WAR_SZ, + .max_order_waw_size = 
SOCK_EP_MAX_ORDER_WAW_SZ, + .mem_tag_format = SOCK_EP_MEM_TAG_FMT, + .tx_ctx_cnt = SOCK_EP_MAX_TX_CNT, + .rx_ctx_cnt = SOCK_EP_MAX_RX_CNT, +}; + +struct fi_tx_attr sock_rdm_tx_attr = { + .caps = SOCK_RDM_TX_CAPS, + .mode = SOCK_MODE, + .op_flags = SOCK_TX_OP_FLAGS, + .msg_order = SOCK_EP_MSG_ORDER, + .inject_size = SOCK_EP_MAX_INJECT_SZ, + .size = SOCK_EP_TX_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, + .rma_iov_limit = SOCK_EP_MAX_IOV_LIMIT, +}; + +struct fi_rx_attr sock_rdm_rx_attr = { + .caps = SOCK_RDM_RX_CAPS, + .mode = SOCK_MODE, + .op_flags = SOCK_RX_OP_FLAGS, + .msg_order = SOCK_EP_MSG_ORDER, + .comp_order = SOCK_EP_COMP_ORDER, + .total_buffered_recv = SOCK_EP_MAX_BUFF_RECV, + .size = SOCK_EP_RX_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, +}; + +struct fi_tx_attr sock_dgram_tx_attr = { + .caps = SOCK_DGRAM_TX_CAPS, + .mode = SOCK_MODE, + .op_flags = SOCK_TX_OP_FLAGS, + .msg_order = SOCK_EP_MSG_ORDER, + .inject_size = SOCK_EP_MAX_INJECT_SZ, + .size = SOCK_EP_TX_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, + .rma_iov_limit = 0, +}; + +struct fi_rx_attr sock_dgram_rx_attr = { + .caps = SOCK_DGRAM_RX_CAPS, + .mode = SOCK_MODE, + .op_flags = SOCK_RX_OP_FLAGS, + .msg_order = SOCK_EP_MSG_ORDER, + .comp_order = SOCK_EP_COMP_ORDER, + .total_buffered_recv = SOCK_EP_MAX_BUFF_RECV, + .size = SOCK_EP_RX_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, +}; + +struct fi_tx_attr sock_stx_attr = { + .caps = SOCK_RDM_TX_CAPS | SOCK_RDM_RX_CAPS | SOCK_DOMAIN_CAPS, + .mode = SOCK_MODE, + .op_flags = FI_TRANSMIT_COMPLETE, + .msg_order = SOCK_EP_MSG_ORDER, + .inject_size = SOCK_EP_MAX_INJECT_SZ, + .size = SOCK_EP_TX_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, + .rma_iov_limit = SOCK_EP_MAX_IOV_LIMIT, +}; + +struct fi_rx_attr sock_srx_attr = { + .caps = SOCK_RDM_TX_CAPS | SOCK_RDM_RX_CAPS | SOCK_DOMAIN_CAPS, + .mode = SOCK_MODE, + .op_flags = 0, + .msg_order = SOCK_EP_MSG_ORDER, + .comp_order = SOCK_EP_COMP_ORDER, + .total_buffered_recv = 0, + .size = SOCK_EP_MAX_MSG_SZ, + .iov_limit = SOCK_EP_MAX_IOV_LIMIT, +}; + +struct fi_domain_attr sock_domain_attr = { + .name = "sockets", + .threading = FI_THREAD_SAFE, + .control_progress = FI_PROGRESS_AUTO, + .data_progress = FI_PROGRESS_AUTO, + .resource_mgmt = FI_RM_ENABLED, + /* Provider supports basic memory registration mode */ + .mr_mode = FI_MR_BASIC | FI_MR_SCALABLE, + .mr_key_size = sizeof(uint64_t), + .cq_data_size = sizeof(uint64_t), + .cq_cnt = SOCK_EP_MAX_CQ_CNT, + .ep_cnt = SOCK_EP_MAX_EP_CNT, + .tx_ctx_cnt = SOCK_EP_MAX_TX_CNT, + .rx_ctx_cnt = SOCK_EP_MAX_RX_CNT, + .max_ep_tx_ctx = SOCK_EP_MAX_TX_CNT, + .max_ep_rx_ctx = SOCK_EP_MAX_RX_CNT, + .max_ep_stx_ctx = SOCK_EP_MAX_EP_CNT, + .max_ep_srx_ctx = SOCK_EP_MAX_EP_CNT, + .cntr_cnt = SOCK_EP_MAX_CNTR_CNT, + .mr_iov_limit = SOCK_EP_MAX_IOV_LIMIT, + .max_err_data = SOCK_MAX_ERR_CQ_EQ_DATA_SZ, + .mr_cnt = SOCK_DOMAIN_MR_CNT, + .caps = SOCK_DOMAIN_CAPS, +}; + +struct fi_fabric_attr sock_fabric_attr = { + .name = "sockets", + .prov_version = OFI_VERSION_DEF_PROV, +}; + +struct fi_info sock_msg_info = { + .caps = SOCK_MSG_TX_CAPS | SOCK_MSG_RX_CAPS | SOCK_DOMAIN_CAPS, + .addr_format = FI_SOCKADDR, + .tx_attr = &sock_msg_tx_attr, + .rx_attr = &sock_msg_rx_attr, + .ep_attr = &sock_msg_ep_attr, + .domain_attr = &sock_domain_attr, + .fabric_attr = &sock_fabric_attr +}; + +struct fi_info sock_rdm_info = { + .next = &sock_msg_info, + .caps = SOCK_RDM_TX_CAPS | SOCK_RDM_RX_CAPS | SOCK_DOMAIN_CAPS, + .addr_format = FI_SOCKADDR, + .tx_attr = &sock_rdm_tx_attr, + .rx_attr = &sock_rdm_rx_attr, + .ep_attr = &sock_rdm_ep_attr, + 
.domain_attr = &sock_domain_attr, + .fabric_attr = &sock_fabric_attr +}; + +struct fi_info sock_dgram_info = { + .next = &sock_rdm_info, + .caps = SOCK_DGRAM_TX_CAPS | SOCK_DGRAM_RX_CAPS | SOCK_DOMAIN_CAPS, + .addr_format = FI_SOCKADDR, + .tx_attr = &sock_dgram_tx_attr, + .rx_attr = &sock_dgram_rx_attr, + .ep_attr = &sock_dgram_ep_attr, + .domain_attr = &sock_domain_attr, + .fabric_attr = &sock_fabric_attr +}; diff --git a/prov/sockets/src/sock_av.c b/prov/sockets/src/sock_av.c index e2760e2e262..f83312a6c1e 100644 --- a/prov/sockets/src/sock_av.c +++ b/prov/sockets/src/sock_av.c @@ -68,15 +68,19 @@ int sock_av_get_addr_index(struct sock_av *av, union ofi_sock_ip *addr) int i; struct sock_av_addr *av_addr; + fastlock_acquire(&av->table_lock); for (i = 0; i < (int)av->table_hdr->size; i++) { av_addr = &av->table[i]; if (!av_addr->valid) continue; if (ofi_equals_sockaddr((const struct sockaddr *) addr, - (const struct sockaddr *) &av_addr->addr)) + (const struct sockaddr *) &av_addr->addr)) { + fastlock_release(&av->table_lock); return i; + } } + fastlock_release(&av->table_lock); SOCK_LOG_DBG("failed to get index in AV\n"); return -1; } @@ -86,13 +90,16 @@ int sock_av_compare_addr(struct sock_av *av, { int index1, index2; struct sock_av_addr *av_addr1, *av_addr2; + int ret; index1 = ((uint64_t)addr1 & av->mask); index2 = ((uint64_t)addr2 & av->mask); + fastlock_acquire(&av->table_lock); if (index1 >= (int)av->table_hdr->size || index1 < 0 || index2 >= (int)av->table_hdr->size || index2 < 0) { SOCK_LOG_ERROR("requested rank is larger than av table\n"); + fastlock_release(&av->table_lock); return -1; } @@ -100,7 +107,9 @@ int sock_av_compare_addr(struct sock_av *av, av_addr2 = &av->table[index2]; /* Return 0 if the addresses match */ - return !ofi_equals_sockaddr(&av_addr1->addr.sa, &av_addr2->addr.sa); + ret = !ofi_equals_sockaddr(&av_addr1->addr.sa, &av_addr2->addr.sa); + fastlock_release(&av->table_lock); + return ret; } static inline void sock_av_report_success(struct sock_av *av, void *context, @@ -118,19 +127,22 @@ static inline void sock_av_report_success(struct sock_av *av, void *context, &eq_entry, sizeof(eq_entry), flags); } -static inline void sock_av_report_error(struct sock_av *av, - void *context, int index, int err) +static void sock_av_report_error(struct sock_av *av, fi_addr_t *fi_addr, + void *context, int index, int err, + uint64_t flags) { - if (!av->eq) - return; + int *sync_err; - sock_eq_report_error(av->eq, &av->av_fid.fid, - context, index, err, -err, NULL, 0); -} + if (fi_addr) { + fi_addr[index] = FI_ADDR_NOTAVAIL; + } else if (flags & FI_SYNC_ERR) { + sync_err = context; + sync_err[index] = err; + } -static int sock_av_is_valid_address(const struct sockaddr *addr) -{ - return ofi_sizeofaddr(addr); + if (av->eq) + sock_eq_report_error(av->eq, &av->av_fid.fid, + context, index, err, -err, NULL, 0); } static void sock_update_av_table(struct sock_av *_av, size_t count) @@ -160,6 +172,7 @@ static int sock_resize_av_table(struct sock_av *av) new_addr = realloc(av->table_hdr, table_sz); if (!new_addr) return -1; + memset((char *) new_addr + old_sz, 0, table_sz - old_sz); } av->table_hdr = new_addr; @@ -194,15 +207,20 @@ static int sock_check_table_in(struct sock_av *_av, const struct sockaddr *addr, if ((_av->attr.flags & FI_EVENT) && !_av->eq) return -FI_ENOEQ; + if (flags & FI_SYNC_ERR) { + if (fi_addr || !context || _av->eq) + return -FI_EBADFLAGS; + memset(context, 0, sizeof(int) * count); + } + if (_av->attr.flags & FI_READ) { for (i = 0; i < count; i++) { for (j = 
0; j < _av->table_hdr->size; j++) { if (_av->table[j].valid && - !sock_av_is_valid_address(&addr[i])) { - if (fi_addr) - fi_addr[i] = FI_ADDR_NOTAVAIL; - sock_av_report_error(_av, context, i, - FI_EINVAL); + !ofi_valid_dest_ipaddr(&addr[i])) { + sock_av_report_error(_av, fi_addr, + context, i, FI_EINVAL, + flags); continue; } @@ -221,19 +239,18 @@ static int sock_check_table_in(struct sock_av *_av, const struct sockaddr *addr, } for (i = 0, ret = 0; i < count; i++) { - if (!sock_av_is_valid_address(&addr[i])) { - if (fi_addr) - fi_addr[i] = FI_ADDR_NOTAVAIL; - sock_av_report_error(_av, context, i, FI_EINVAL); + if (!ofi_valid_dest_ipaddr(&addr[i])) { + sock_av_report_error(_av, fi_addr, context, i, FI_EINVAL, + flags); continue; } if (_av->table_hdr->stored == _av->table_hdr->size) { index = sock_av_get_next_index(_av); if (index < 0) { if (sock_resize_av_table(_av)) { - if (fi_addr) - fi_addr[i] = FI_ADDR_NOTAVAIL; - sock_av_report_error(_av, context, i, FI_ENOMEM); + sock_av_report_error(_av, fi_addr, + context, i, + FI_ENOMEM, flags); continue; } index = _av->table_hdr->stored++; @@ -264,9 +281,15 @@ static int sock_av_insert(struct fid_av *av, const void *addr, size_t count, fi_addr_t *fi_addr, uint64_t flags, void *context) { struct sock_av *_av; + int ret = 0; + _av = container_of(av, struct sock_av, av_fid); - return sock_check_table_in(_av, (const struct sockaddr *) addr, + + fastlock_acquire(&_av->table_lock); + ret = sock_check_table_in(_av, (const struct sockaddr *) addr, fi_addr, count, flags, context); + fastlock_release(&_av->table_lock); + return ret; } static int sock_av_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, @@ -278,13 +301,17 @@ static int sock_av_lookup(struct fid_av *av, fi_addr_t fi_addr, void *addr, _av = container_of(av, struct sock_av, av_fid); index = ((uint64_t)fi_addr & _av->mask); + + fastlock_acquire(&_av->table_lock); if (index >= (int)_av->table_hdr->size || index < 0) { SOCK_LOG_ERROR("requested address not inserted\n"); + fastlock_release(&_av->table_lock); return -EINVAL; } av_addr = &_av->table[index]; memcpy(addr, &av_addr->addr, MIN(*addrlen, (size_t)_av->addrlen)); + fastlock_release(&_av->table_lock); *addrlen = _av->addrlen; return 0; } @@ -307,14 +334,17 @@ static int _sock_av_insertsvc(struct fid_av *av, const char *node, ret = getaddrinfo(node, service, &sock_hints, &result); if (ret) { if (_av->eq) { - sock_av_report_error(_av, context, 0, FI_EINVAL); + sock_av_report_error(_av, fi_addr, context, 0, + FI_EINVAL, flags); sock_av_report_success(_av, context, 0, flags); } return -ret; } + fastlock_acquire(&_av->table_lock); ret = sock_check_table_in(_av, result->ai_addr, fi_addr, 1, flags, context); + fastlock_release(&_av->table_lock); freeaddrinfo(result); return ret; } @@ -357,8 +387,9 @@ static int sock_av_insertsym(struct fid_av *av, const char *node, size_t nodecnt else fmt = offset; - assert((hostlen-offset) < FI_NAME_MAX); - strncpy(base_host, node, hostlen - (offset)); + if (hostlen - offset >= FI_NAME_MAX) + return -FI_ETOOSMALL; + memcpy(base_host, node, hostlen - offset); var_port = atoi(service); var_host = atoi(node + hostlen - offset); @@ -376,7 +407,7 @@ static int sock_av_insertsym(struct fid_av *av, const char *node, size_t nodecnt err_code = ret; } else { SOCK_LOG_ERROR("Node/service value is not valid\n"); - err_code = FI_ETOOSMALL; + err_code = -FI_ETOOSMALL; } } } @@ -403,7 +434,7 @@ static int sock_av_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, sock_ep = container_of(fid_entry->fid, struct 
sock_ep, ep.fid); fastlock_acquire(&sock_ep->attr->cmap.lock); for (i = 0; i < count; i++) { - idx = fi_addr[i] & sock_ep->attr->av->mask; + idx = fi_addr[i] & sock_ep->attr->av->mask; conn = ofi_idm_lookup(&sock_ep->attr->av_idm, idx); if (conn) { /* A peer may be using the connection, so leave @@ -418,10 +449,12 @@ static int sock_av_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, } fastlock_release(&_av->list_lock); + fastlock_acquire(&_av->table_lock); for (i = 0; i < count; i++) { av_addr = &_av->table[fi_addr[i]]; av_addr->valid = 0; } + fastlock_release(&_av->table_lock); return 0; } @@ -477,6 +510,7 @@ static int sock_av_close(struct fid *fid) ofi_atomic_dec32(&av->domain->ref); fastlock_destroy(&av->list_lock); + fastlock_destroy(&av->table_lock); free(av); return 0; } @@ -625,6 +659,7 @@ int sock_av_open(struct fid_domain *domain, struct fi_av_attr *attr, } dlist_init(&_av->ep_list); fastlock_init(&_av->list_lock); + fastlock_init(&_av->table_lock); _av->rx_ctx_bits = attr->rx_ctx_bits; _av->mask = attr->rx_ctx_bits ? ((uint64_t)1 << (64 - attr->rx_ctx_bits)) - 1 : ~0; diff --git a/prov/sockets/src/sock_cntr.c b/prov/sockets/src/sock_cntr.c index 63793e23048..2bf5b8a3da7 100644 --- a/prov/sockets/src/sock_cntr.c +++ b/prov/sockets/src/sock_cntr.c @@ -325,7 +325,7 @@ static int sock_cntr_wait(struct fid_cntr *fid_cntr, uint64_t threshold, ofi_atomic_inc32(&cntr->num_waiting); if (timeout >= 0) { - start_ms = fi_gettime_ms(); + start_ms = ofi_gettime_ms(); end_ms = start_ms + timeout; } @@ -341,7 +341,7 @@ static int sock_cntr_wait(struct fid_cntr *fid_cntr, uint64_t threshold, ret = fi_wait_cond(&cntr->cond, &cntr->mut, remaining_ms); } - uint64_t curr_ms = fi_gettime_ms(); + uint64_t curr_ms = ofi_gettime_ms(); if (timeout >= 0) { if (curr_ms >= end_ms) { ret = -FI_ETIMEDOUT; diff --git a/prov/sockets/src/sock_conn.c b/prov/sockets/src/sock_conn.c index 5f1c59d1a79..0d39956a825 100644 --- a/prov/sockets/src/sock_conn.c +++ b/prov/sockets/src/sock_conn.c @@ -105,7 +105,7 @@ int sock_conn_map_init(struct sock_ep *ep, int init_size) if (!map->epoll_ctxs) goto err1; - ret = fi_epoll_create(&map->epoll_set); + ret = ofi_epoll_create(&map->epoll_set); if (ret < 0) { SOCK_LOG_ERROR("failed to create epoll set, " "error - %d (%s)\n", ret, @@ -157,18 +157,19 @@ void sock_conn_map_destroy(struct sock_ep_attr *ep_attr) cmap->epoll_ctxs = NULL; cmap->epoll_ctxs_sz = 0; cmap->used = cmap->size = 0; - fi_epoll_close(cmap->epoll_set); + ofi_epoll_close(cmap->epoll_set); fastlock_destroy(&cmap->lock); } void sock_conn_release_entry(struct sock_conn_map *map, struct sock_conn *conn) { - fi_epoll_del(map->epoll_set, conn->sock_fd); + ofi_epoll_del(map->epoll_set, conn->sock_fd); ofi_close_socket(conn->sock_fd); conn->address_published = 0; - conn->connected = 0; - conn->sock_fd = -1; + conn->av_index = FI_ADDR_NOTAVAIL; + conn->connected = 0; + conn->sock_fd = -1; } static int sock_conn_get_next_index(struct sock_conn_map *map) @@ -210,7 +211,7 @@ static struct sock_conn *sock_conn_map_insert(struct sock_ep_attr *ep_attr, (ep_attr->ep_type == FI_EP_MSG ? 
SOCK_OPTS_KEEPALIVE : 0)); - if (fi_epoll_add(map->epoll_set, conn_fd, FI_EPOLL_IN, &map->table[index])) + if (ofi_epoll_add(map->epoll_set, conn_fd, OFI_EPOLL_IN, &map->table[index])) SOCK_LOG_ERROR("failed to add to epoll set: %d\n", conn_fd); map->table[index].address_published = addr_published; @@ -306,7 +307,7 @@ int sock_conn_stop_listener_thread(struct sock_conn_listener *conn_listener) } fd_signal_free(&conn_listener->signal); - fi_epoll_close(conn_listener->emap); + ofi_epoll_close(conn_listener->epollfd); fastlock_destroy(&conn_listener->signal_lock); return 0; @@ -323,7 +324,7 @@ static void *sock_conn_listener_thread(void *arg) socklen_t addr_size; while (conn_listener->do_listen) { - num_fds = fi_epoll_wait(conn_listener->emap, ep_contexts, + num_fds = ofi_epoll_wait(conn_listener->epollfd, ep_contexts, SOCK_EPOLL_WAIT_EVENTS, -1); if (num_fds < 0) { SOCK_LOG_ERROR("poll failed : %s\n", strerror(errno)); @@ -331,6 +332,15 @@ static void *sock_conn_listener_thread(void *arg) } fastlock_acquire(&conn_listener->signal_lock); + if (conn_listener->removed_from_epollfd) { + /* The epoll set changed between calling wait and wait + * returning. Get an updated set of events to avoid + * possible use after free error. + */ + conn_listener->removed_from_epollfd = false; + goto skip; + } + for (i = 0; i < num_fds; i++) { conn_handle = ep_contexts[i]; @@ -359,6 +369,7 @@ static void *sock_conn_listener_thread(void *arg) fastlock_release(&ep_attr->cmap.lock); sock_pe_signal(ep_attr->domain->pe); } +skip: fastlock_release(&conn_listener->signal_lock); } @@ -371,7 +382,7 @@ int sock_conn_start_listener_thread(struct sock_conn_listener *conn_listener) fastlock_init(&conn_listener->signal_lock); - ret = fi_epoll_create(&conn_listener->emap); + ret = ofi_epoll_create(&conn_listener->epollfd); if (ret < 0) { SOCK_LOG_ERROR("failed to create epoll set\n"); goto err1; @@ -383,15 +394,16 @@ int sock_conn_start_listener_thread(struct sock_conn_listener *conn_listener) goto err2; } - ret = fi_epoll_add(conn_listener->emap, + ret = ofi_epoll_add(conn_listener->epollfd, conn_listener->signal.fd[FI_READ_FD], - FI_EPOLL_IN, NULL); + OFI_EPOLL_IN, NULL); if (ret != 0){ SOCK_LOG_ERROR("failed to add signal fd to epoll\n"); goto err3; } conn_listener->do_listen = 1; + conn_listener->removed_from_epollfd = false; ret = pthread_create(&conn_listener->listener_thread, NULL, sock_conn_listener_thread, conn_listener); if (ret < 0) { @@ -404,7 +416,7 @@ int sock_conn_start_listener_thread(struct sock_conn_listener *conn_listener) conn_listener->do_listen = 0; fd_signal_free(&conn_listener->signal); err2: - fi_epoll_close(conn_listener->emap); + ofi_epoll_close(conn_listener->epollfd); err1: fastlock_destroy(&conn_listener->signal_lock); return ret; @@ -463,8 +475,8 @@ int sock_conn_listen(struct sock_ep_attr *ep_attr) conn_handle->do_listen = 1; fastlock_acquire(&ep_attr->domain->conn_listener.signal_lock); - ret = fi_epoll_add(ep_attr->domain->conn_listener.emap, - conn_handle->sock, FI_EPOLL_IN, conn_handle); + ret = ofi_epoll_add(ep_attr->domain->conn_listener.epollfd, + conn_handle->sock, OFI_EPOLL_IN, conn_handle); fd_signal_set(&ep_attr->domain->conn_listener.signal); fastlock_release(&ep_attr->domain->conn_listener.signal_lock); if (ret) { @@ -501,7 +513,9 @@ int sock_ep_connect(struct sock_ep_attr *ep_attr, fi_addr_t index, addr = *ep_attr->dest_addr; ofi_addr_set_port(&addr.sa, ep_attr->msg_dest_port); } else { + fastlock_acquire(&ep_attr->av->table_lock); addr = ep_attr->av->table[index].addr; + 
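The listener-thread change above discards the events returned by ofi_epoll_wait() whenever removed_from_epollfd is set, closing a use-after-free window when an fd is pulled out of the epoll set while a wait is in flight. The code that raises the flag is outside this excerpt; the sketch below shows what that removal side presumably looks like, with the function name being a placeholder.

/* Hypothetical removal path implied by the removed_from_epollfd guard:
 * the fd is deleted and the flag raised under the same signal_lock the
 * listener thread takes before touching the contexts it got back.
 */
static void sock_conn_listener_del_fd(struct sock_conn_listener *conn_listener,
				      int fd)
{
	fastlock_acquire(&conn_listener->signal_lock);
	ofi_epoll_del(conn_listener->epollfd, fd);
	/* Contexts already returned by an in-flight ofi_epoll_wait() may
	 * now dangle; force the thread to re-poll instead of using them.
	 */
	conn_listener->removed_from_epollfd = true;
	fd_signal_set(&conn_listener->signal);
	fastlock_release(&conn_listener->signal_lock);
}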
fastlock_release(&ep_attr->av->table_lock); } do_connect: @@ -579,6 +593,8 @@ int sock_ep_connect(struct sock_ep_attr *ep_attr, fi_addr_t index, SOCK_LOG_ERROR("Connect error, retrying - %s - %d\n", strerror(ofi_sockerr()), conn_fd); + ofi_straddr_log(&sock_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, + "Retry connect to peer ", &addr.sa); goto do_connect; out: diff --git a/prov/sockets/src/sock_cq.c b/prov/sockets/src/sock_cq.c index 9ea7e764d7b..6f893fa39a9 100644 --- a/prov/sockets/src/sock_cq.c +++ b/prov/sockets/src/sock_cq.c @@ -55,7 +55,7 @@ void sock_cq_add_tx_ctx(struct sock_cq *cq, struct sock_tx_ctx *tx_ctx) { struct dlist_entry *entry; struct sock_tx_ctx *curr_ctx; - fastlock_acquire(&cq->list_lock); + pthread_mutex_lock(&cq->list_lock); for (entry = cq->tx_list.next; entry != &cq->tx_list; entry = entry->next) { curr_ctx = container_of(entry, struct sock_tx_ctx, cq_entry); @@ -65,22 +65,22 @@ void sock_cq_add_tx_ctx(struct sock_cq *cq, struct sock_tx_ctx *tx_ctx) dlist_insert_tail(&tx_ctx->cq_entry, &cq->tx_list); ofi_atomic_inc32(&cq->ref); out: - fastlock_release(&cq->list_lock); + pthread_mutex_unlock(&cq->list_lock); } void sock_cq_remove_tx_ctx(struct sock_cq *cq, struct sock_tx_ctx *tx_ctx) { - fastlock_acquire(&cq->list_lock); + pthread_mutex_lock(&cq->list_lock); dlist_remove(&tx_ctx->cq_entry); ofi_atomic_dec32(&cq->ref); - fastlock_release(&cq->list_lock); + pthread_mutex_unlock(&cq->list_lock); } void sock_cq_add_rx_ctx(struct sock_cq *cq, struct sock_rx_ctx *rx_ctx) { struct dlist_entry *entry; struct sock_rx_ctx *curr_ctx; - fastlock_acquire(&cq->list_lock); + pthread_mutex_lock(&cq->list_lock); for (entry = cq->rx_list.next; entry != &cq->rx_list; entry = entry->next) { @@ -91,15 +91,15 @@ void sock_cq_add_rx_ctx(struct sock_cq *cq, struct sock_rx_ctx *rx_ctx) dlist_insert_tail(&rx_ctx->cq_entry, &cq->rx_list); ofi_atomic_inc32(&cq->ref); out: - fastlock_release(&cq->list_lock); + pthread_mutex_unlock(&cq->list_lock); } void sock_cq_remove_rx_ctx(struct sock_cq *cq, struct sock_rx_ctx *rx_ctx) { - fastlock_acquire(&cq->list_lock); + pthread_mutex_lock(&cq->list_lock); dlist_remove(&rx_ctx->cq_entry); ofi_atomic_dec32(&cq->ref); - fastlock_release(&cq->list_lock); + pthread_mutex_unlock(&cq->list_lock); } int sock_cq_progress(struct sock_cq *cq) @@ -111,7 +111,7 @@ int sock_cq_progress(struct sock_cq *cq) if (cq->domain->progress_mode == FI_PROGRESS_AUTO) return 0; - fastlock_acquire(&cq->list_lock); + pthread_mutex_lock(&cq->list_lock); for (entry = cq->tx_list.next; entry != &cq->tx_list; entry = entry->next) { tx_ctx = container_of(entry, struct sock_tx_ctx, cq_entry); @@ -135,7 +135,7 @@ int sock_cq_progress(struct sock_cq *cq) else sock_pe_progress_ep_rx(cq->domain->pe, rx_ctx->ep_attr); } - fastlock_release(&cq->list_lock); + pthread_mutex_unlock(&cq->list_lock); return 0; } @@ -176,7 +176,7 @@ static ssize_t _sock_cq_write(struct sock_cq *cq, fi_addr_t addr, ssize_t ret; struct sock_cq_overflow_entry_t *overflow_entry; - fastlock_acquire(&cq->lock); + pthread_mutex_lock(&cq->lock); if (ofi_rbfdavail(&cq->cq_rbfd) < len) { SOCK_LOG_ERROR("Not enough space in CQ\n"); overflow_entry = calloc(1, sizeof(*overflow_entry) + len); @@ -208,7 +208,7 @@ static ssize_t _sock_cq_write(struct sock_cq *cq, fi_addr_t addr, if (cq->signal) sock_wait_signal(cq->waitset); out: - fastlock_release(&cq->lock); + pthread_mutex_unlock(&cq->lock); return ret; } @@ -349,24 +349,24 @@ static ssize_t sock_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count, else threshold = count; - 
start_ms = (timeout >= 0) ? fi_gettime_ms() : 0; + start_ms = (timeout >= 0) ? ofi_gettime_ms() : 0; if (sock_cq->domain->progress_mode == FI_PROGRESS_MANUAL) { while (1) { sock_cq_progress(sock_cq); - fastlock_acquire(&sock_cq->lock); + pthread_mutex_lock(&sock_cq->lock); avail = ofi_rbfdused(&sock_cq->cq_rbfd); if (avail) { ret = sock_cq_rbuf_read(sock_cq, buf, MIN(threshold, (size_t)(avail / cq_entry_len)), src_addr, cq_entry_len); } - fastlock_release(&sock_cq->lock); + pthread_mutex_unlock(&sock_cq->lock); if (ret) return ret; if (timeout >= 0) { - timeout -= (int) (fi_gettime_ms() - start_ms); + timeout -= (int) (ofi_gettime_ms() - start_ms); if (timeout <= 0) return -FI_EAGAIN; } @@ -378,7 +378,7 @@ static ssize_t sock_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count, }; } else { do { - fastlock_acquire(&sock_cq->lock); + pthread_mutex_lock(&sock_cq->lock); ret = 0; avail = ofi_rbfdused(&sock_cq->cq_rbfd); if (avail) { @@ -388,12 +388,12 @@ static ssize_t sock_cq_sreadfrom(struct fid_cq *cq, void *buf, size_t count, } else { ofi_rbfdreset(&sock_cq->cq_rbfd); } - fastlock_release(&sock_cq->lock); + pthread_mutex_unlock(&sock_cq->lock); if (ret && ret != -FI_EAGAIN) return ret; if (timeout >= 0) { - timeout -= (int) (fi_gettime_ms() - start_ms); + timeout -= (int) (ofi_gettime_ms() - start_ms); if (timeout <= 0) return -FI_EAGAIN; } @@ -440,7 +440,7 @@ static ssize_t sock_cq_readerr(struct fid_cq *cq, struct fi_cq_err_entry *buf, if (sock_cq->domain->progress_mode == FI_PROGRESS_MANUAL) sock_cq_progress(sock_cq); - fastlock_acquire(&sock_cq->lock); + pthread_mutex_lock(&sock_cq->lock); if (ofi_rbused(&sock_cq->cqerr_rb) >= sizeof(struct fi_cq_err_entry)) { api_version = sock_cq->domain->fab->fab_fid.api_version; ofi_rbread(&sock_cq->cqerr_rb, &entry, sizeof(entry)); @@ -463,7 +463,7 @@ static ssize_t sock_cq_readerr(struct fid_cq *cq, struct fi_cq_err_entry *buf, } else { ret = -FI_EAGAIN; } - fastlock_release(&sock_cq->lock); + pthread_mutex_unlock(&sock_cq->lock); return ret; } @@ -490,8 +490,8 @@ static int sock_cq_close(struct fid *fid) ofi_rbfree(&cq->cqerr_rb); ofi_rbfdfree(&cq->cq_rbfd); - fastlock_destroy(&cq->lock); - fastlock_destroy(&cq->list_lock); + pthread_mutex_destroy(&cq->lock); + pthread_mutex_destroy(&cq->list_lock); ofi_atomic_dec32(&cq->domain->ref); free(cq); @@ -504,9 +504,9 @@ static int sock_cq_signal(struct fid_cq *cq) sock_cq = container_of(cq, struct sock_cq, cq_fid); ofi_atomic_set32(&sock_cq->signaled, 1); - fastlock_acquire(&sock_cq->lock); + pthread_mutex_lock(&sock_cq->lock); ofi_rbfdsignal(&sock_cq->cq_rbfd); - fastlock_release(&sock_cq->lock); + pthread_mutex_unlock(&sock_cq->lock); return 0; } @@ -668,7 +668,7 @@ int sock_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, if (ret) goto err3; - fastlock_init(&sock_cq->lock); + pthread_mutex_init(&sock_cq->lock, NULL); switch (sock_cq->attr.wait_obj) { case FI_WAIT_NONE: @@ -713,7 +713,7 @@ int sock_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, *cq = &sock_cq->cq_fid; ofi_atomic_inc32(&sock_dom->ref); - fastlock_init(&sock_cq->list_lock); + pthread_mutex_init(&sock_cq->list_lock, NULL); return 0; @@ -735,7 +735,7 @@ int sock_cq_report_error(struct sock_cq *cq, struct sock_pe_entry *entry, int ret; struct fi_cq_err_entry err_entry; - fastlock_acquire(&cq->lock); + pthread_mutex_lock(&cq->lock); if (ofi_rbavail(&cq->cqerr_rb) < sizeof(err_entry)) { ret = -FI_ENOSPC; goto out; @@ -764,6 +764,6 @@ int sock_cq_report_error(struct sock_cq *cq, struct sock_pe_entry *entry, 
ofi_rbfdsignal(&cq->cq_rbfd); out: - fastlock_release(&cq->lock); + pthread_mutex_unlock(&cq->lock); return ret; } diff --git a/prov/sockets/src/sock_ctx.c b/prov/sockets/src/sock_ctx.c index 6d051bd5c2c..30fe13dd515 100644 --- a/prov/sockets/src/sock_ctx.c +++ b/prov/sockets/src/sock_ctx.c @@ -57,6 +57,8 @@ struct sock_rx_ctx *sock_rx_ctx_alloc(const struct fi_rx_attr *attr, dlist_init(&rx_ctx->rx_buffered_list); dlist_init(&rx_ctx->ep_list); + rx_ctx->progress_start = &rx_ctx->rx_buffered_list; + fastlock_init(&rx_ctx->lock); rx_ctx->ctx.fid.fclass = FI_CLASS_RX_CTX; @@ -69,6 +71,15 @@ struct sock_rx_ctx *sock_rx_ctx_alloc(const struct fi_rx_attr *attr, void sock_rx_ctx_free(struct sock_rx_ctx *rx_ctx) { + struct sock_rx_entry *rx_buffered; + + /* free any remaining buffered entries */ + while (!dlist_empty(&rx_ctx->rx_buffered_list)) { + dlist_pop_front(&rx_ctx->rx_buffered_list, + struct sock_rx_entry, rx_buffered, entry); + free(rx_buffered); + } + fastlock_destroy(&rx_ctx->lock); free(rx_ctx->rx_entry_pool); free(rx_ctx); diff --git a/prov/sockets/src/sock_dom.c b/prov/sockets/src/sock_dom.c index a78bf71c02b..8adec0ce371 100644 --- a/prov/sockets/src/sock_dom.c +++ b/prov/sockets/src/sock_dom.c @@ -46,133 +46,6 @@ extern struct fi_ops_mr sock_dom_mr_ops; -const struct fi_domain_attr sock_domain_attr = { - .name = NULL, - .threading = FI_THREAD_SAFE, - .control_progress = FI_PROGRESS_AUTO, - .data_progress = FI_PROGRESS_AUTO, - .resource_mgmt = FI_RM_ENABLED, - /* Provider supports basic memory registration mode */ - .mr_mode = FI_MR_BASIC | FI_MR_SCALABLE, - .mr_key_size = sizeof(uint64_t), - .cq_data_size = sizeof(uint64_t), - .cq_cnt = SOCK_EP_MAX_CQ_CNT, - .ep_cnt = SOCK_EP_MAX_EP_CNT, - .tx_ctx_cnt = SOCK_EP_MAX_TX_CNT, - .rx_ctx_cnt = SOCK_EP_MAX_RX_CNT, - .max_ep_tx_ctx = SOCK_EP_MAX_TX_CNT, - .max_ep_rx_ctx = SOCK_EP_MAX_RX_CNT, - .max_ep_stx_ctx = SOCK_EP_MAX_EP_CNT, - .max_ep_srx_ctx = SOCK_EP_MAX_EP_CNT, - .cntr_cnt = SOCK_EP_MAX_CNTR_CNT, - .mr_iov_limit = SOCK_EP_MAX_IOV_LIMIT, - .max_err_data = SOCK_MAX_ERR_CQ_EQ_DATA_SZ, - .mr_cnt = SOCK_DOMAIN_MR_CNT, -}; - -int sock_verify_domain_attr(uint32_t version, const struct fi_info *info) -{ - const struct fi_domain_attr *attr = info->domain_attr; - - if (!attr) - return 0; - - switch (attr->threading) { - case FI_THREAD_UNSPEC: - case FI_THREAD_SAFE: - case FI_THREAD_FID: - case FI_THREAD_DOMAIN: - case FI_THREAD_COMPLETION: - case FI_THREAD_ENDPOINT: - break; - default: - SOCK_LOG_DBG("Invalid threading model!\n"); - return -FI_ENODATA; - } - - switch (attr->control_progress) { - case FI_PROGRESS_UNSPEC: - case FI_PROGRESS_AUTO: - case FI_PROGRESS_MANUAL: - break; - - default: - SOCK_LOG_DBG("Control progress mode not supported!\n"); - return -FI_ENODATA; - } - - switch (attr->data_progress) { - case FI_PROGRESS_UNSPEC: - case FI_PROGRESS_AUTO: - case FI_PROGRESS_MANUAL: - break; - - default: - SOCK_LOG_DBG("Data progress mode not supported!\n"); - return -FI_ENODATA; - } - - switch (attr->resource_mgmt) { - case FI_RM_UNSPEC: - case FI_RM_DISABLED: - case FI_RM_ENABLED: - break; - - default: - SOCK_LOG_DBG("Resource mgmt not supported!\n"); - return -FI_ENODATA; - } - - switch (attr->av_type) { - case FI_AV_UNSPEC: - case FI_AV_MAP: - case FI_AV_TABLE: - break; - - default: - SOCK_LOG_DBG("AV type not supported!\n"); - return -FI_ENODATA; - } - - if (ofi_check_mr_mode(&sock_prov, version, - sock_domain_attr.mr_mode, info)) { - FI_INFO(&sock_prov, FI_LOG_CORE, - "Invalid memory registration mode\n"); - return -FI_ENODATA; - 
} - - if (attr->mr_key_size > sock_domain_attr.mr_key_size) - return -FI_ENODATA; - - if (attr->cq_data_size > sock_domain_attr.cq_data_size) - return -FI_ENODATA; - - if (attr->cq_cnt > sock_domain_attr.cq_cnt) - return -FI_ENODATA; - - if (attr->ep_cnt > sock_domain_attr.ep_cnt) - return -FI_ENODATA; - - if (attr->max_ep_tx_ctx > sock_domain_attr.max_ep_tx_ctx) - return -FI_ENODATA; - - if (attr->max_ep_rx_ctx > sock_domain_attr.max_ep_rx_ctx) - return -FI_ENODATA; - - if (attr->cntr_cnt > sock_domain_attr.cntr_cnt) - return -FI_ENODATA; - - if (attr->mr_iov_limit > sock_domain_attr.mr_iov_limit) - return -FI_ENODATA; - - if (attr->max_err_data > sock_domain_attr.max_err_data) - return -FI_ENODATA; - - if (attr->mr_cnt > sock_domain_attr.mr_cnt) - return -FI_ENODATA; - - return 0; -} static int sock_dom_close(struct fid *fid) { @@ -272,6 +145,7 @@ static struct fi_ops_domain sock_dom_ops = { .stx_ctx = sock_stx_ctx, .srx_ctx = sock_srx_ctx, .query_atomic = sock_query_atomic, + .query_collective = fi_no_query_collective, }; int sock_domain(struct fid_fabric *fabric, struct fi_info *info, @@ -281,12 +155,8 @@ int sock_domain(struct fid_fabric *fabric, struct fi_info *info, struct sock_fabric *fab; int ret; + assert(info && info->domain_attr); fab = container_of(fabric, struct sock_fabric, fab_fid); - if (info && info->domain_attr) { - ret = sock_verify_domain_attr(fabric->api_version, info); - if (ret) - return -FI_EINVAL; - } sock_domain = calloc(1, sizeof(*sock_domain)); if (!sock_domain) @@ -295,12 +165,8 @@ int sock_domain(struct fid_fabric *fabric, struct fi_info *info, fastlock_init(&sock_domain->lock); ofi_atomic_initialize32(&sock_domain->ref, 0); - if (info) { - sock_domain->info = *info; - } else { - SOCK_LOG_ERROR("invalid fi_info\n"); - goto err1; - } + sock_domain->info = *info; + sock_domain->info.domain_attr = NULL; sock_domain->dom_fid.fid.fclass = FI_CLASS_DOMAIN; sock_domain->dom_fid.fid.context = context; @@ -308,8 +174,7 @@ int sock_domain(struct fid_fabric *fabric, struct fi_info *info, sock_domain->dom_fid.ops = &sock_dom_ops; sock_domain->dom_fid.mr = &sock_dom_mr_ops; - if (!info->domain_attr || - info->domain_attr->data_progress == FI_PROGRESS_UNSPEC) + if (info->domain_attr->data_progress == FI_PROGRESS_UNSPEC) sock_domain->progress_mode = FI_PROGRESS_AUTO; else sock_domain->progress_mode = info->domain_attr->data_progress; @@ -323,10 +188,7 @@ int sock_domain(struct fid_fabric *fabric, struct fi_info *info, sock_domain->fab = fab; *dom = &sock_domain->dom_fid; - if (info->domain_attr) - sock_domain->attr = *(info->domain_attr); - else - sock_domain->attr = sock_domain_attr; + sock_domain->attr = *(info->domain_attr); ret = ofi_mr_map_init(&sock_prov, sock_domain->attr.mr_mode, &sock_domain->mr_map); diff --git a/prov/sockets/src/sock_ep.c b/prov/sockets/src/sock_ep.c index d6eb12785f7..8257fdeb714 100644 --- a/prov/sockets/src/sock_ep.c +++ b/prov/sockets/src/sock_ep.c @@ -64,31 +64,6 @@ extern struct fi_ops_ep sock_ep_ops; extern struct fi_ops sock_ep_fi_ops; extern struct fi_ops_ep sock_ctx_ep_ops; -extern const struct fi_domain_attr sock_domain_attr; -extern const struct fi_fabric_attr sock_fabric_attr; - -const struct fi_tx_attr sock_stx_attr = { - .caps = SOCK_EP_RDM_CAP_BASE, - .mode = SOCK_MODE, - .op_flags = FI_TRANSMIT_COMPLETE, - .msg_order = SOCK_EP_MSG_ORDER, - .inject_size = SOCK_EP_MAX_INJECT_SZ, - .size = SOCK_EP_TX_SZ, - .iov_limit = SOCK_EP_MAX_IOV_LIMIT, - .rma_iov_limit = SOCK_EP_MAX_IOV_LIMIT, -}; - -const struct fi_rx_attr sock_srx_attr = { - 
.caps = SOCK_EP_RDM_CAP_BASE, - .mode = SOCK_MODE, - .op_flags = 0, - .msg_order = SOCK_EP_MSG_ORDER, - .comp_order = SOCK_EP_COMP_ORDER, - .total_buffered_recv = 0, - .size = SOCK_EP_MAX_MSG_SZ, - .iov_limit = SOCK_EP_MAX_IOV_LIMIT, -}; - static void sock_tx_ctx_close(struct sock_tx_ctx *tx_ctx) { if (tx_ctx->comp.send_cq) @@ -705,8 +680,9 @@ static int sock_ep_close(struct fid *fid) if (sock_ep->attr->conn_handle.do_listen) { fastlock_acquire(&sock_ep->attr->domain->conn_listener.signal_lock); - fi_epoll_del(sock_ep->attr->domain->conn_listener.emap, + ofi_epoll_del(sock_ep->attr->domain->conn_listener.epollfd, sock_ep->attr->conn_handle.sock); + sock_ep->attr->domain->conn_listener.removed_from_epollfd = true; fastlock_release(&sock_ep->attr->domain->conn_listener.signal_lock); ofi_close_socket(sock_ep->attr->conn_handle.sock); sock_ep->attr->conn_handle.do_listen = 0; @@ -750,7 +726,7 @@ static int sock_ep_close(struct fid *fid) free(sock_ep->attr->dest_addr); fastlock_acquire(&sock_ep->attr->domain->pe->lock); - ofi_idm_reset(&sock_ep->attr->av_idm); + ofi_idm_reset(&sock_ep->attr->av_idm, NULL); sock_conn_map_destroy(sock_ep->attr); fastlock_release(&sock_ep->attr->domain->pe->lock); @@ -1146,9 +1122,11 @@ static int sock_ep_tx_ctx(struct fid_ep *ep, int index, struct fi_tx_attr *attr, if (attr) { if (ofi_check_tx_attr(&sock_prov, sock_ep->attr->info.tx_attr, attr, 0) || - ofi_check_attr_subset(&sock_prov, - sock_ep->attr->info.tx_attr->caps, attr->caps)) + ofi_check_attr_subset(&sock_prov, + sock_ep->attr->info.tx_attr->caps, + attr->caps & ~OFI_IGNORED_TX_CAPS)) { return -FI_ENODATA; + } tx_ctx = sock_tx_ctx_alloc(attr, context, 0); } else { tx_ctx = sock_tx_ctx_alloc(&sock_ep->tx_attr, context, 0); @@ -1191,9 +1169,10 @@ static int sock_ep_rx_ctx(struct fid_ep *ep, int index, struct fi_rx_attr *attr, if (attr) { if (ofi_check_rx_attr(&sock_prov, &sock_ep->attr->info, attr, 0) || - ofi_check_attr_subset(&sock_prov, sock_ep->attr->info.rx_attr->caps, - attr->caps)) + ofi_check_attr_subset(&sock_prov, sock_ep->attr->info.rx_attr->caps, + attr->caps & ~OFI_IGNORED_RX_CAPS)) { return -FI_ENODATA; + } rx_ctx = sock_rx_ctx_alloc(attr, context, 0); } else { rx_ctx = sock_rx_ctx_alloc(&sock_ep->rx_attr, context, 0); @@ -1623,16 +1602,10 @@ int sock_alloc_endpoint(struct fid_domain *domain, struct fi_info *info, struct sock_rx_ctx *rx_ctx; struct sock_domain *sock_dom; + assert(info); sock_dom = container_of(domain, struct sock_domain, dom_fid); - if (info) { - ret = sock_verify_info(sock_dom->fab->fab_fid.api_version, info); - if (ret) { - SOCK_LOG_DBG("Cannot support requested options!\n"); - return -FI_EINVAL; - } - } - sock_ep = (struct sock_ep *) calloc(1, sizeof(*sock_ep)); + sock_ep = calloc(1, sizeof(*sock_ep)); if (!sock_ep) return -FI_ENOMEM; @@ -1672,52 +1645,50 @@ int sock_alloc_endpoint(struct fid_domain *domain, struct fi_info *info, sock_ep->attr->fclass = fclass; *ep = sock_ep; - if (info) { - sock_ep->attr->info.caps = info->caps; - sock_ep->attr->info.addr_format = FI_SOCKADDR_IN; - - if (info->ep_attr) { - sock_ep->attr->ep_type = info->ep_attr->type; - sock_ep->attr->ep_attr.tx_ctx_cnt = info->ep_attr->tx_ctx_cnt; - sock_ep->attr->ep_attr.rx_ctx_cnt = info->ep_attr->rx_ctx_cnt; - } + sock_ep->attr->info.caps = info->caps; + sock_ep->attr->info.addr_format = info->addr_format; - if (info->src_addr) { - sock_ep->attr->src_addr = calloc(1, sizeof(*sock_ep-> - attr->src_addr)); - if (!sock_ep->attr->src_addr) { - ret = -FI_ENOMEM; - goto err2; - } - 
memcpy(sock_ep->attr->src_addr, info->src_addr, - info->src_addrlen); - } + if (info->ep_attr) { + sock_ep->attr->ep_type = info->ep_attr->type; + sock_ep->attr->ep_attr.tx_ctx_cnt = info->ep_attr->tx_ctx_cnt; + sock_ep->attr->ep_attr.rx_ctx_cnt = info->ep_attr->rx_ctx_cnt; + } - if (info->dest_addr) { - sock_ep->attr->dest_addr = calloc(1, sizeof(*sock_ep-> - attr->dest_addr)); - if (!sock_ep->attr->dest_addr) { - ret = -FI_ENOMEM; - goto err2; - } - memcpy(sock_ep->attr->dest_addr, info->dest_addr, - info->dest_addrlen); + if (info->src_addr) { + sock_ep->attr->src_addr = calloc(1, sizeof(*sock_ep-> + attr->src_addr)); + if (!sock_ep->attr->src_addr) { + ret = -FI_ENOMEM; + goto err2; } + memcpy(sock_ep->attr->src_addr, info->src_addr, + info->src_addrlen); + } - if (info->tx_attr) { - sock_ep->tx_attr = *info->tx_attr; - if (!(sock_ep->tx_attr.op_flags & (FI_INJECT_COMPLETE | - FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE))) - sock_ep->tx_attr.op_flags |= FI_TRANSMIT_COMPLETE; - sock_ep->tx_attr.size = sock_ep->tx_attr.size ? - sock_ep->tx_attr.size : SOCK_EP_TX_SZ; + if (info->dest_addr) { + sock_ep->attr->dest_addr = calloc(1, sizeof(*sock_ep-> + attr->dest_addr)); + if (!sock_ep->attr->dest_addr) { + ret = -FI_ENOMEM; + goto err2; } + memcpy(sock_ep->attr->dest_addr, info->dest_addr, + info->dest_addrlen); + } - if (info->rx_attr) - sock_ep->rx_attr = *info->rx_attr; - sock_ep->attr->info.handle = info->handle; + if (info->tx_attr) { + sock_ep->tx_attr = *info->tx_attr; + if (!(sock_ep->tx_attr.op_flags & (FI_INJECT_COMPLETE | + FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE))) + sock_ep->tx_attr.op_flags |= FI_TRANSMIT_COMPLETE; + sock_ep->tx_attr.size = sock_ep->tx_attr.size ? + sock_ep->tx_attr.size : SOCK_EP_TX_SZ; } + if (info->rx_attr) + sock_ep->rx_attr = *info->rx_attr; + sock_ep->attr->info.handle = info->handle; + if (!sock_ep->attr->src_addr && sock_ep_assign_src_addr(sock_ep, info)) { SOCK_LOG_ERROR("failed to get src_address\n"); ret = -FI_EINVAL; @@ -1788,8 +1759,7 @@ int sock_alloc_endpoint(struct fid_domain *domain, struct fi_info *info, /* default config */ sock_ep->attr->min_multi_recv = SOCK_EP_MIN_MULTI_RECV; - if (info) - memcpy(&sock_ep->attr->info, info, sizeof(struct fi_info)); + memcpy(&sock_ep->attr->info, info, sizeof(struct fi_info)); sock_ep->attr->domain = sock_dom; fastlock_init(&sock_ep->attr->cm.lock); @@ -1816,6 +1786,8 @@ int sock_alloc_endpoint(struct fid_domain *domain, struct fi_info *info, void sock_ep_remove_conn(struct sock_ep_attr *attr, struct sock_conn *conn) { + if (attr->cmap.used <= 0 || conn->sock_fd == -1) + return; sock_pe_poll_del(attr->domain->pe, conn->sock_fd); sock_conn_release_entry(&attr->cmap, conn); } @@ -1824,14 +1796,27 @@ struct sock_conn *sock_ep_lookup_conn(struct sock_ep_attr *attr, fi_addr_t index union ofi_sock_ip *addr) { int i; - uint16_t idx; + uint64_t idx; + char buf[8]; struct sock_conn *conn; idx = (attr->ep_type == FI_EP_MSG) ? index : index & attr->av->mask; conn = ofi_idm_lookup(&attr->av_idm, idx); if (conn && conn != SOCK_CM_CONN_IN_PROGRESS) { - if (conn->av_index == FI_ADDR_NOTAVAIL) + /* Verify that the existing connection is still usable, and + * that the peer didn't restart. 
+ */ + if (conn->connected == 0 || + (sock_comm_peek(conn, buf, 8) == 0 && conn->connected == 0)) { + sock_ep_remove_conn(attr, conn); + ofi_straddr_log(&sock_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, + "Peer disconnected", &addr->sa); + return NULL; + } + if (conn->av_index != FI_ADDR_NOTAVAIL) + assert(conn->av_index == idx); + else conn->av_index = idx; return conn; } @@ -1842,11 +1827,22 @@ struct sock_conn *sock_ep_lookup_conn(struct sock_ep_attr *attr, fi_addr_t index if (ofi_equals_sockaddr(&attr->cmap.table[i].addr.sa, &addr->sa)) { conn = &attr->cmap.table[i]; - if (conn->av_index == FI_ADDR_NOTAVAIL) - conn->av_index = idx; break; } } + if (conn && conn != SOCK_CM_CONN_IN_PROGRESS) { + if (conn->connected == 0 || + (sock_comm_peek(conn, buf, 8) == 0 && conn->connected == 0)) { + sock_ep_remove_conn(attr, conn); + ofi_straddr_log(&sock_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, + "Peer disconnected", &addr->sa); + return NULL; + } + if (conn->av_index != FI_ADDR_NOTAVAIL) + assert(conn->av_index == idx); + else + conn->av_index = idx; + } return conn; } @@ -1861,8 +1857,11 @@ int sock_ep_get_conn(struct sock_ep_attr *attr, struct sock_tx_ctx *tx_ctx, if (attr->ep_type == FI_EP_MSG) addr = attr->dest_addr; - else + else { + fastlock_acquire(&attr->av->table_lock); addr = &attr->av->table[av_index].addr; + fastlock_release(&attr->av->table_lock); + } fastlock_acquire(&attr->cmap.lock); conn = sock_ep_lookup_conn(attr, av_index, addr); @@ -1877,9 +1876,11 @@ int sock_ep_get_conn(struct sock_ep_attr *attr, struct sock_tx_ctx *tx_ctx, ret = sock_ep_connect(attr, av_index, &conn); if (!conn) { - SOCK_LOG_ERROR("Undable to find connection entry. " + SOCK_LOG_ERROR("Unable to find connection entry. " "Error in connecting: %s\n", fi_strerror(-ret)); + ofi_straddr_log(&sock_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, + "Unable to connect to", &addr->sa); return -FI_ENOENT; } diff --git a/prov/sockets/src/sock_ep_dgram.c b/prov/sockets/src/sock_ep_dgram.c index 3b7e62a40cc..7a18532be90 100644 --- a/prov/sockets/src/sock_ep_dgram.c +++ b/prov/sockets/src/sock_ep_dgram.c @@ -56,235 +56,6 @@ #define SOCK_LOG_DBG(...) _SOCK_LOG_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) #define SOCK_LOG_ERROR(...) 
_SOCK_LOG_ERROR(FI_LOG_EP_CTRL, __VA_ARGS__) -const struct fi_ep_attr sock_dgram_ep_attr = { - .type = FI_EP_DGRAM, - .protocol = FI_PROTO_SOCK_TCP, - .protocol_version = SOCK_WIRE_PROTO_VERSION, - .max_msg_size = SOCK_EP_MAX_MSG_SZ, - .msg_prefix_size = SOCK_EP_MSG_PREFIX_SZ, - .max_order_raw_size = SOCK_EP_MAX_ORDER_RAW_SZ, - .max_order_war_size = SOCK_EP_MAX_ORDER_WAR_SZ, - .max_order_waw_size = SOCK_EP_MAX_ORDER_WAW_SZ, - .mem_tag_format = SOCK_EP_MEM_TAG_FMT, - .tx_ctx_cnt = SOCK_EP_MAX_TX_CNT, - .rx_ctx_cnt = SOCK_EP_MAX_RX_CNT, -}; - -const struct fi_tx_attr sock_dgram_tx_attr = { - .caps = SOCK_EP_DGRAM_CAP, - .mode = SOCK_MODE, - .op_flags = SOCK_EP_DEFAULT_OP_FLAGS, - .msg_order = SOCK_EP_MSG_ORDER, - .inject_size = SOCK_EP_MAX_INJECT_SZ, - .size = SOCK_EP_TX_SZ, - .iov_limit = SOCK_EP_MAX_IOV_LIMIT, - .rma_iov_limit = 0, -}; - -const struct fi_rx_attr sock_dgram_rx_attr = { - .caps = SOCK_EP_DGRAM_CAP, - .mode = SOCK_MODE, - .op_flags = 0, - .msg_order = SOCK_EP_MSG_ORDER, - .comp_order = SOCK_EP_COMP_ORDER, - .total_buffered_recv = SOCK_EP_MAX_BUFF_RECV, - .size = SOCK_EP_RX_SZ, - .iov_limit = SOCK_EP_MAX_IOV_LIMIT, -}; - -static int sock_dgram_verify_rx_attr(const struct fi_rx_attr *attr) -{ - if (!attr) - return 0; - - if ((attr->caps | SOCK_EP_DGRAM_CAP) != SOCK_EP_DGRAM_CAP) - return -FI_ENODATA; - - if ((attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) - return -FI_ENODATA; - - if ((attr->comp_order | SOCK_EP_COMP_ORDER) != SOCK_EP_COMP_ORDER) - return -FI_ENODATA; - - if (attr->total_buffered_recv > sock_dgram_rx_attr.total_buffered_recv) - return -FI_ENODATA; - - if (sock_get_tx_size(attr->size) > - sock_get_tx_size(sock_dgram_rx_attr.size)) - return -FI_ENODATA; - - if (attr->iov_limit > sock_dgram_rx_attr.iov_limit) - return -FI_ENODATA; - - return 0; -} - -static int sock_dgram_verify_tx_attr(const struct fi_tx_attr *attr) -{ - if (!attr) - return 0; - - if ((attr->caps | SOCK_EP_DGRAM_CAP) != SOCK_EP_DGRAM_CAP) - return -FI_ENODATA; - - if ((attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) - return -FI_ENODATA; - - if (attr->inject_size > sock_dgram_tx_attr.inject_size) - return -FI_ENODATA; - - if (sock_get_tx_size(attr->size) > - sock_get_tx_size(sock_dgram_tx_attr.size)) - return -FI_ENODATA; - - if (attr->iov_limit > sock_dgram_tx_attr.iov_limit) - return -FI_ENODATA; - - if (attr->rma_iov_limit > sock_dgram_tx_attr.rma_iov_limit) - return -FI_ENODATA; - - return 0; -} - -int sock_dgram_verify_ep_attr(const struct fi_ep_attr *ep_attr, - const struct fi_tx_attr *tx_attr, - const struct fi_rx_attr *rx_attr) -{ - if (ep_attr) { - switch (ep_attr->protocol) { - case FI_PROTO_UNSPEC: - case FI_PROTO_SOCK_TCP: - break; - default: - return -FI_ENODATA; - } - - if (ep_attr->protocol_version && - (ep_attr->protocol_version != sock_dgram_ep_attr.protocol_version)) - return -FI_ENODATA; - - if (ep_attr->max_msg_size > sock_dgram_ep_attr.max_msg_size) - return -FI_ENODATA; - - if (ep_attr->msg_prefix_size > sock_dgram_ep_attr.msg_prefix_size) - return -FI_ENODATA; - - if (ep_attr->max_order_raw_size > - sock_dgram_ep_attr.max_order_raw_size) - return -FI_ENODATA; - - if (ep_attr->max_order_war_size > - sock_dgram_ep_attr.max_order_war_size) - return -FI_ENODATA; - - if (ep_attr->max_order_waw_size > - sock_dgram_ep_attr.max_order_waw_size) - return -FI_ENODATA; - - if ((ep_attr->tx_ctx_cnt > SOCK_EP_MAX_TX_CNT) && - ep_attr->tx_ctx_cnt != FI_SHARED_CONTEXT) - return -FI_ENODATA; - - if ((ep_attr->rx_ctx_cnt > SOCK_EP_MAX_RX_CNT) && - ep_attr->rx_ctx_cnt 
!= FI_SHARED_CONTEXT) - return -FI_ENODATA; - } - - if (sock_dgram_verify_tx_attr(tx_attr) || - sock_dgram_verify_rx_attr(rx_attr)) - return -FI_ENODATA; - - return 0; -} - -int sock_dgram_fi_info(uint32_t version, void *src_addr, void *dest_addr, - const struct fi_info *hints, struct fi_info **info) -{ - *info = sock_fi_info(version, FI_EP_DGRAM, hints, src_addr, dest_addr); - if (!*info) - return -FI_ENOMEM; - - *(*info)->tx_attr = sock_dgram_tx_attr; - (*info)->tx_attr->size = sock_get_tx_size(sock_dgram_tx_attr.size); - *(*info)->rx_attr = sock_dgram_rx_attr; - (*info)->rx_attr->size = sock_get_tx_size(sock_dgram_rx_attr.size); - *(*info)->ep_attr = sock_dgram_ep_attr; - - if (hints && hints->ep_attr) { - if (hints->ep_attr->rx_ctx_cnt) - (*info)->ep_attr->rx_ctx_cnt = hints->ep_attr->rx_ctx_cnt; - if (hints->ep_attr->tx_ctx_cnt) - (*info)->ep_attr->tx_ctx_cnt = hints->ep_attr->tx_ctx_cnt; - } - - if (hints && hints->rx_attr) { - (*info)->rx_attr->op_flags |= hints->rx_attr->op_flags; - if (hints->rx_attr->caps) - (*info)->rx_attr->caps = SOCK_EP_DGRAM_SEC_CAP | - hints->rx_attr->caps; - } - - if (hints && hints->tx_attr) { - (*info)->tx_attr->op_flags |= hints->tx_attr->op_flags; - if (hints->tx_attr->caps) - (*info)->tx_attr->caps = SOCK_EP_DGRAM_SEC_CAP | - hints->tx_attr->caps; - } - - (*info)->caps = SOCK_EP_DGRAM_CAP | - (*info)->rx_attr->caps | (*info)->tx_attr->caps; - if (hints && hints->caps) { - (*info)->caps = SOCK_EP_DGRAM_SEC_CAP | hints->caps; - (*info)->rx_attr->caps = SOCK_EP_DGRAM_SEC_CAP | - ((*info)->rx_attr->caps & (*info)->caps); - (*info)->tx_attr->caps = SOCK_EP_DGRAM_SEC_CAP | - ((*info)->tx_attr->caps & (*info)->caps); - } - return 0; -} - -static int sock_dgram_endpoint(struct fid_domain *domain, struct fi_info *info, - struct sock_ep **ep, void *context, size_t fclass) -{ - int ret; - - if (info) { - if (info->ep_attr) { - ret = sock_dgram_verify_ep_attr(info->ep_attr, - info->tx_attr, - info->rx_attr); - if (ret) - return -FI_EINVAL; - } - - if (info->tx_attr) { - ret = sock_dgram_verify_tx_attr(info->tx_attr); - if (ret) - return -FI_EINVAL; - } - - if (info->rx_attr) { - ret = sock_dgram_verify_rx_attr(info->rx_attr); - if (ret) - return -FI_EINVAL; - } - } - - ret = sock_alloc_endpoint(domain, info, ep, context, fclass); - if (ret) - return ret; - - if (!info || !info->ep_attr) - (*ep)->attr->ep_attr = sock_dgram_ep_attr; - - if (!info || !info->tx_attr) - (*ep)->tx_attr = sock_dgram_tx_attr; - - if (!info || !info->rx_attr) - (*ep)->rx_attr = sock_dgram_rx_attr; - - return 0; -} int sock_dgram_ep(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context) @@ -292,7 +63,7 @@ int sock_dgram_ep(struct fid_domain *domain, struct fi_info *info, int ret; struct sock_ep *endpoint; - ret = sock_dgram_endpoint(domain, info, &endpoint, context, FI_CLASS_EP); + ret = sock_alloc_endpoint(domain, info, &endpoint, context, FI_CLASS_EP); if (ret) return ret; @@ -306,7 +77,7 @@ int sock_dgram_sep(struct fid_domain *domain, struct fi_info *info, int ret; struct sock_ep *endpoint; - ret = sock_dgram_endpoint(domain, info, &endpoint, context, FI_CLASS_SEP); + ret = sock_alloc_endpoint(domain, info, &endpoint, context, FI_CLASS_SEP); if (ret) return ret; diff --git a/prov/sockets/src/sock_ep_msg.c b/prov/sockets/src/sock_ep_msg.c index 6b84b401e76..4261e8a7753 100644 --- a/prov/sockets/src/sock_ep_msg.c +++ b/prov/sockets/src/sock_ep_msg.c @@ -59,195 +59,6 @@ #define SOCK_LOG_DBG(...) 
_SOCK_LOG_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) #define SOCK_LOG_ERROR(...) _SOCK_LOG_ERROR(FI_LOG_EP_CTRL, __VA_ARGS__) -static const struct fi_ep_attr sock_msg_ep_attr = { - .type = FI_EP_MSG, - .protocol = FI_PROTO_SOCK_TCP, - .protocol_version = SOCK_WIRE_PROTO_VERSION, - .max_msg_size = SOCK_EP_MAX_MSG_SZ, - .msg_prefix_size = SOCK_EP_MSG_PREFIX_SZ, - .max_order_raw_size = SOCK_EP_MAX_ORDER_RAW_SZ, - .max_order_war_size = SOCK_EP_MAX_ORDER_WAR_SZ, - .max_order_waw_size = SOCK_EP_MAX_ORDER_WAW_SZ, - .mem_tag_format = SOCK_EP_MEM_TAG_FMT, - .tx_ctx_cnt = SOCK_EP_MAX_TX_CNT, - .rx_ctx_cnt = SOCK_EP_MAX_RX_CNT, -}; - -static const struct fi_tx_attr sock_msg_tx_attr = { - .caps = SOCK_EP_MSG_CAP_BASE, - .mode = SOCK_MODE, - .op_flags = SOCK_EP_DEFAULT_OP_FLAGS, - .msg_order = SOCK_EP_MSG_ORDER, - .inject_size = SOCK_EP_MAX_INJECT_SZ, - .size = SOCK_EP_TX_SZ, - .iov_limit = SOCK_EP_MAX_IOV_LIMIT, - .rma_iov_limit = SOCK_EP_MAX_IOV_LIMIT, -}; - -static const struct fi_rx_attr sock_msg_rx_attr = { - .caps = SOCK_EP_MSG_CAP_BASE, - .mode = SOCK_MODE, - .op_flags = 0, - .msg_order = SOCK_EP_MSG_ORDER, - .comp_order = SOCK_EP_COMP_ORDER, - .total_buffered_recv = SOCK_EP_MAX_BUFF_RECV, - .size = SOCK_EP_RX_SZ, - .iov_limit = SOCK_EP_MAX_IOV_LIMIT, -}; - -static int sock_msg_verify_rx_attr(const struct fi_rx_attr *attr) -{ - if (!attr) - return 0; - - if ((attr->caps | SOCK_EP_MSG_CAP) != SOCK_EP_MSG_CAP) - return -FI_ENODATA; - - if ((attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) - return -FI_ENODATA; - - if ((attr->comp_order | SOCK_EP_COMP_ORDER) != SOCK_EP_COMP_ORDER) - return -FI_ENODATA; - - if (attr->total_buffered_recv > sock_msg_rx_attr.total_buffered_recv) - return -FI_ENODATA; - - if (sock_get_tx_size(attr->size) > - sock_get_tx_size(sock_msg_rx_attr.size)) - return -FI_ENODATA; - - if (attr->iov_limit > sock_msg_rx_attr.iov_limit) - return -FI_ENODATA; - - return 0; -} - -static int sock_msg_verify_tx_attr(const struct fi_tx_attr *attr) -{ - if (!attr) - return 0; - - if ((attr->caps | SOCK_EP_MSG_CAP) != SOCK_EP_MSG_CAP) - return -FI_ENODATA; - - if ((attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) - return -FI_ENODATA; - - if (attr->inject_size > sock_msg_tx_attr.inject_size) - return -FI_ENODATA; - - if (sock_get_tx_size(attr->size) > - sock_get_tx_size(sock_msg_tx_attr.size)) - return -FI_ENODATA; - - if (attr->iov_limit > sock_msg_tx_attr.iov_limit) - return -FI_ENODATA; - - if (attr->rma_iov_limit > sock_msg_tx_attr.rma_iov_limit) - return -FI_ENODATA; - - return 0; -} - -int sock_msg_verify_ep_attr(const struct fi_ep_attr *ep_attr, - const struct fi_tx_attr *tx_attr, - const struct fi_rx_attr *rx_attr) -{ - if (ep_attr) { - switch (ep_attr->protocol) { - case FI_PROTO_UNSPEC: - case FI_PROTO_SOCK_TCP: - break; - default: - return -FI_ENODATA; - } - - if (ep_attr->protocol_version && - (ep_attr->protocol_version != sock_msg_ep_attr.protocol_version)) - return -FI_ENODATA; - - if (ep_attr->max_msg_size > sock_msg_ep_attr.max_msg_size) - return -FI_ENODATA; - - if (ep_attr->msg_prefix_size > sock_msg_ep_attr.msg_prefix_size) - return -FI_ENODATA; - - if (ep_attr->max_order_raw_size > - sock_msg_ep_attr.max_order_raw_size) - return -FI_ENODATA; - - if (ep_attr->max_order_war_size > - sock_msg_ep_attr.max_order_war_size) - return -FI_ENODATA; - - if (ep_attr->max_order_waw_size > - sock_msg_ep_attr.max_order_waw_size) - return -FI_ENODATA; - - if ((ep_attr->tx_ctx_cnt > SOCK_EP_MAX_TX_CNT) && - ep_attr->tx_ctx_cnt != FI_SHARED_CONTEXT) - return -FI_ENODATA; - - if 
((ep_attr->rx_ctx_cnt > SOCK_EP_MAX_RX_CNT) && - ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT) - return -FI_ENODATA; - - if (ep_attr->auth_key_size && - (ep_attr->auth_key_size != sock_msg_ep_attr.auth_key_size)) - return -FI_ENODATA; - } - - if (sock_msg_verify_tx_attr(tx_attr) || sock_msg_verify_rx_attr(rx_attr)) - return -FI_ENODATA; - - return 0; -} - -int sock_msg_fi_info(uint32_t version, void *src_addr, void *dest_addr, - const struct fi_info *hints, struct fi_info **info) -{ - *info = sock_fi_info(version, FI_EP_MSG, hints, src_addr, dest_addr); - if (!*info) - return -FI_ENOMEM; - - *(*info)->tx_attr = sock_msg_tx_attr; - (*info)->tx_attr->size = sock_get_tx_size(sock_msg_tx_attr.size); - *(*info)->rx_attr = sock_msg_rx_attr; - (*info)->rx_attr->size = sock_get_tx_size(sock_msg_rx_attr.size); - *(*info)->ep_attr = sock_msg_ep_attr; - - if (hints && hints->ep_attr) { - if (hints->ep_attr->rx_ctx_cnt) - (*info)->ep_attr->rx_ctx_cnt = hints->ep_attr->rx_ctx_cnt; - if (hints->ep_attr->tx_ctx_cnt) - (*info)->ep_attr->tx_ctx_cnt = hints->ep_attr->tx_ctx_cnt; - } - - if (hints && hints->rx_attr) { - (*info)->rx_attr->op_flags |= hints->rx_attr->op_flags; - if (hints->rx_attr->caps) - (*info)->rx_attr->caps = SOCK_EP_MSG_SEC_CAP | - hints->rx_attr->caps; - } - - if (hints && hints->tx_attr) { - (*info)->tx_attr->op_flags |= hints->tx_attr->op_flags; - if (hints->tx_attr->caps) - (*info)->tx_attr->caps = SOCK_EP_MSG_SEC_CAP | - hints->tx_attr->caps; - } - - (*info)->caps = SOCK_EP_MSG_CAP | - (*info)->rx_attr->caps | (*info)->tx_attr->caps; - if (hints && hints->caps) { - (*info)->caps = SOCK_EP_MSG_SEC_CAP | hints->caps; - (*info)->rx_attr->caps = SOCK_EP_MSG_SEC_CAP | - ((*info)->rx_attr->caps & (*info)->caps); - (*info)->tx_attr->caps = SOCK_EP_MSG_SEC_CAP | - ((*info)->tx_attr->caps & (*info)->caps); - } - return 0; -} static int sock_ep_cm_getname(fid_t fid, void *addr, size_t *addrlen) { @@ -412,13 +223,13 @@ static void sock_ep_cm_monitor_handle(struct sock_ep_cm_head *cm_head, { int ret; - fastlock_acquire(&cm_head->signal_lock); + pthread_mutex_lock(&cm_head->signal_lock); if (handle->monitored) goto unlock; /* Mark the handle as monitored before adding it to the pollset */ handle->monitored = 1; - ret = fi_epoll_add(cm_head->emap, handle->sock_fd, + ret = ofi_epoll_add(cm_head->epollfd, handle->sock_fd, events, handle); if (ret) { SOCK_LOG_ERROR("failed to monitor fd %d: %d\n", @@ -428,7 +239,7 @@ static void sock_ep_cm_monitor_handle(struct sock_ep_cm_head *cm_head, fd_signal_set(&cm_head->signal); } unlock: - fastlock_release(&cm_head->signal_lock); + pthread_mutex_unlock(&cm_head->signal_lock); } static void @@ -439,11 +250,12 @@ sock_ep_cm_unmonitor_handle_locked(struct sock_ep_cm_head *cm_head, int ret; if (handle->monitored) { - ret = fi_epoll_del(cm_head->emap, handle->sock_fd); + ret = ofi_epoll_del(cm_head->epollfd, handle->sock_fd); if (ret) SOCK_LOG_ERROR("failed to unmonitor fd %d: %d\n", handle->sock_fd, ret); handle->monitored = 0; + cm_head->removed_from_epollfd = true; } /* Multiple threads might call sock_ep_cm_unmonitor_handle() at the @@ -460,9 +272,9 @@ static void sock_ep_cm_unmonitor_handle(struct sock_ep_cm_head *cm_head, struct sock_conn_req_handle *handle, int close_socket) { - fastlock_acquire(&cm_head->signal_lock); + pthread_mutex_lock(&cm_head->signal_lock); sock_ep_cm_unmonitor_handle_locked(cm_head, handle, close_socket); - fastlock_release(&cm_head->signal_lock); + pthread_mutex_unlock(&cm_head->signal_lock); } static void 
sock_ep_cm_shutdown_report(struct sock_ep *ep, int send_shutdown) @@ -570,10 +382,15 @@ static void sock_ep_cm_connect_handler(struct sock_ep_cm_head *cm_head, struct fi_eq_cm_entry *cm_entry = NULL; int cm_data_sz, response_port; - assert(hdr->type == SOCK_CONN_ACCEPT - || hdr->type == SOCK_CONN_REJECT); + assert(hdr->type == SOCK_CONN_ACCEPT || + hdr->type == SOCK_CONN_REJECT); cm_data_sz = ntohs(hdr->cm_data_sz); + if (cm_data_sz > SOCK_EP_MAX_CM_DATA_SZ) { + SOCK_LOG_ERROR("CM data size too large\n"); + goto err; + } + response_port = ntohs(hdr->port); if (cm_data_sz) { param = calloc(1, cm_data_sz); @@ -717,7 +534,7 @@ static int sock_ep_cm_connect(struct fid_ep *ep, const void *addr, /* Monitor the connection */ _ep->attr->cm.state = SOCK_CM_STATE_REQUESTED; handle->sock_fd = sock_fd; - sock_ep_cm_monitor_handle(cm_head, handle, FI_EPOLL_IN); + sock_ep_cm_monitor_handle(cm_head, handle, OFI_EPOLL_IN); return 0; close_socket: @@ -782,7 +599,7 @@ static int sock_ep_cm_accept(struct fid_ep *ep, const void *param, size_t paraml } } /* Monitor the handle prior to report the event */ - sock_ep_cm_monitor_handle(cm_head, handle, FI_EPOLL_IN); + sock_ep_cm_monitor_handle(cm_head, handle, OFI_EPOLL_IN); sock_ep_enable(ep); memset(&cm_entry, 0, sizeof(cm_entry)); @@ -823,65 +640,22 @@ struct fi_ops_cm sock_ep_cm_ops = { .join = fi_no_join, }; -static int sock_msg_endpoint(struct fid_domain *domain, struct fi_info *info, - struct sock_ep **ep, void *context, size_t fclass) +int sock_msg_ep(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, void *context) { - int ret; + struct sock_ep *endpoint; struct sock_pep *pep; + int ret; - if (info) { - if (info->ep_attr) { - ret = sock_msg_verify_ep_attr(info->ep_attr, - info->tx_attr, - info->rx_attr); - if (ret) - return -FI_EINVAL; - } - - if (info->tx_attr) { - ret = sock_msg_verify_tx_attr(info->tx_attr); - if (ret) - return -FI_EINVAL; - } - - if (info->rx_attr) { - ret = sock_msg_verify_rx_attr(info->rx_attr); - if (ret) - return -FI_EINVAL; - } - } - - ret = sock_alloc_endpoint(domain, info, ep, context, fclass); + ret = sock_alloc_endpoint(domain, info, &endpoint, context, FI_CLASS_EP); if (ret) return ret; if (info && info->handle && info->handle->fclass == FI_CLASS_PEP) { pep = container_of(info->handle, struct sock_pep, pep.fid); - memcpy((*ep)->attr->src_addr, &pep->src_addr, sizeof *(*ep)->attr->src_addr); + *endpoint->attr->src_addr = pep->src_addr; } - if (!info || !info->ep_attr) - (*ep)->attr->ep_attr = sock_msg_ep_attr; - - if (!info || !info->tx_attr) - (*ep)->tx_attr = sock_msg_tx_attr; - - if (!info || !info->rx_attr) - (*ep)->rx_attr = sock_msg_rx_attr; - - return 0; -} - -int sock_msg_ep(struct fid_domain *domain, struct fi_info *info, - struct fid_ep **ep, void *context) -{ - int ret; - struct sock_ep *endpoint; - - ret = sock_msg_endpoint(domain, info, &endpoint, context, FI_CLASS_EP); - if (ret) - return ret; - *ep = &endpoint->ep; return 0; } @@ -945,8 +719,8 @@ static struct fi_info *sock_ep_msg_get_info(struct sock_pep *pep, struct fi_info hints; uint64_t requested, supported; - requested = req->caps & SOCK_EP_MSG_PRI_CAP; - supported = pep->info.caps & SOCK_EP_MSG_PRI_CAP; + requested = req->caps & sock_msg_info.caps; + supported = pep->info.caps & sock_msg_info.caps; supported = (supported & FI_RMA) ? 
(supported | FI_REMOTE_READ | FI_REMOTE_WRITE) : supported; if ((requested | supported) != supported) @@ -960,9 +734,9 @@ static struct fi_info *sock_ep_msg_get_info(struct sock_pep *pep, void sock_ep_cm_signal(struct sock_ep_cm_head *cm_head) { - fastlock_acquire(&cm_head->signal_lock); + pthread_mutex_lock(&cm_head->signal_lock); fd_signal_set(&cm_head->signal); - fastlock_release(&cm_head->signal_lock); + pthread_mutex_unlock(&cm_head->signal_lock); } static void sock_ep_cm_process_rejected(struct sock_ep_cm_head *cm_head, @@ -1015,13 +789,13 @@ sock_ep_cm_pop_from_msg_list(struct sock_ep_cm_head *cm_head) struct dlist_entry *entry; struct sock_conn_req_handle *hreq = NULL; - fastlock_acquire(&cm_head->signal_lock); + pthread_mutex_lock(&cm_head->signal_lock); if (!dlist_empty(&cm_head->msg_list)) { entry = cm_head->msg_list.next; dlist_remove(entry); hreq = container_of(entry, struct sock_conn_req_handle, entry); } - fastlock_release(&cm_head->signal_lock); + pthread_mutex_unlock(&cm_head->signal_lock); return hreq; } @@ -1077,6 +851,11 @@ static void sock_pep_req_handler(struct sock_ep_cm_head *cm_head, } req_cm_data_sz = ntohs(conn_req->hdr.cm_data_sz); + if (req_cm_data_sz > SOCK_EP_MAX_CM_DATA_SZ) { + SOCK_LOG_ERROR("CM data size is too large\n"); + goto err; + } + if (req_cm_data_sz) { ret = sock_cm_recv(handle->sock_fd, conn_req->cm_data, req_cm_data_sz); @@ -1172,7 +951,7 @@ static void *sock_pep_listener_thread(void *data) handle->pep = pep; /* Monitor the connection */ - sock_ep_cm_monitor_handle(&pep->cm_head, handle, FI_EPOLL_IN); + sock_ep_cm_monitor_handle(&pep->cm_head, handle, OFI_EPOLL_IN); } SOCK_LOG_DBG("PEP listener thread exiting\n"); @@ -1233,9 +1012,9 @@ static int sock_pep_reject(struct fid_pep *pep, fid_t handle, cm_head = &_pep->cm_head; hreq->state = SOCK_CONN_HANDLE_REJECTED; - fastlock_acquire(&cm_head->signal_lock); + pthread_mutex_lock(&cm_head->signal_lock); sock_ep_cm_add_to_msg_list(cm_head, hreq); - fastlock_release(&cm_head->signal_lock); + pthread_mutex_unlock(&cm_head->signal_lock); return 0; } @@ -1284,7 +1063,7 @@ int sock_msg_sep(struct fid_domain *domain, struct fi_info *info, int ret; struct sock_ep *endpoint; - ret = sock_msg_endpoint(domain, info, &endpoint, context, FI_CLASS_SEP); + ret = sock_alloc_endpoint(domain, info, &endpoint, context, FI_CLASS_SEP); if (ret) return ret; @@ -1299,52 +1078,40 @@ int sock_msg_passive_ep(struct fid_fabric *fabric, struct fi_info *info, struct sock_pep *_pep; struct addrinfo hints, *result; - if (info) { - ret = sock_verify_info(fabric->api_version, info); - if (ret) { - SOCK_LOG_DBG("Cannot support requested options!\n"); - return ret; - } - } - + assert(info); _pep = calloc(1, sizeof(*_pep)); if (!_pep) return -FI_ENOMEM; - if (info) { - if (info->src_addr) { - memcpy(&_pep->src_addr, info->src_addr, - info->src_addrlen); + if (info->src_addr) { + memcpy(&_pep->src_addr, info->src_addr, + info->src_addrlen); + } else { + memset(&hints, 0, sizeof(hints)); + hints.ai_socktype = SOCK_STREAM; + hints.ai_family = ofi_get_sa_family(info); + if (!hints.ai_family) + hints.ai_family = AF_INET; + + if (hints.ai_family == AF_INET) { + ret = getaddrinfo("127.0.0.1", NULL, &hints, + &result); + } else if (hints.ai_family == AF_INET6) { + ret = getaddrinfo("::1", NULL, &hints, &result); } else { - memset(&hints, 0, sizeof(hints)); - hints.ai_socktype = SOCK_STREAM; - hints.ai_family = ofi_get_sa_family(info); - if (!hints.ai_family) - hints.ai_family = AF_INET; - - if (hints.ai_family == AF_INET) { - ret = 
getaddrinfo("127.0.0.1", NULL, &hints, - &result); - } else if (hints.ai_family == AF_INET6) { - ret = getaddrinfo("::1", NULL, &hints, &result); - } else { - ret = getaddrinfo("localhost", NULL, &hints, - &result); - } - if (ret) { - ret = -FI_EINVAL; - SOCK_LOG_DBG("getaddrinfo failed!\n"); - goto err; - } - memcpy(&_pep->src_addr, result->ai_addr, - result->ai_addrlen); - freeaddrinfo(result); + ret = getaddrinfo("localhost", NULL, &hints, + &result); } - _pep->info = *info; - } else { - SOCK_LOG_ERROR("invalid fi_info\n"); - goto err; + if (ret) { + ret = -FI_EINVAL; + SOCK_LOG_DBG("getaddrinfo failed!\n"); + goto err; + } + memcpy(&_pep->src_addr, result->ai_addr, + result->ai_addrlen); + freeaddrinfo(result); } + _pep->info = *info; ret = socketpair(AF_UNIX, SOCK_STREAM, 0, _pep->cm.signal_fds); if (ret) { @@ -1410,14 +1177,23 @@ static void *sock_ep_cm_thread(void *arg) while (cm_head->do_listen) { sock_ep_cm_check_closing_rejected_list(cm_head); - num_fds = fi_epoll_wait(cm_head->emap, ep_contexts, + num_fds = ofi_epoll_wait(cm_head->epollfd, ep_contexts, SOCK_EPOLL_WAIT_EVENTS, -1); if (num_fds < 0) { SOCK_LOG_ERROR("poll failed : %s\n", strerror(errno)); continue; } - fastlock_acquire(&cm_head->signal_lock); + pthread_mutex_lock(&cm_head->signal_lock); + if (cm_head->removed_from_epollfd) { + /* If we removed a socket from the epollfd after + * ofi_epoll_wait returned, we can hit a use after + * free error. If a change was made, we skip processing + * and recheck for events. + */ + cm_head->removed_from_epollfd = false; + goto skip; + } for (i = 0; i < num_fds; i++) { handle = ep_contexts[i]; @@ -1439,7 +1215,8 @@ static void *sock_ep_cm_thread(void *arg) assert(handle->sock_fd != INVALID_SOCKET); sock_ep_cm_handle_rx(cm_head, handle); } - fastlock_release(&cm_head->signal_lock); +skip: + pthread_mutex_unlock(&cm_head->signal_lock); } return NULL; } @@ -1449,10 +1226,10 @@ int sock_ep_cm_start_thread(struct sock_ep_cm_head *cm_head) { assert(cm_head->do_listen == 0); - fastlock_init(&cm_head->signal_lock); + pthread_mutex_init(&cm_head->signal_lock, NULL); dlist_init(&cm_head->msg_list); - int ret = fi_epoll_create(&cm_head->emap); + int ret = ofi_epoll_create(&cm_head->epollfd); if (ret < 0) { SOCK_LOG_ERROR("failed to create epoll set\n"); goto err1; @@ -1465,15 +1242,16 @@ int sock_ep_cm_start_thread(struct sock_ep_cm_head *cm_head) goto err2; } - ret = fi_epoll_add(cm_head->emap, + ret = ofi_epoll_add(cm_head->epollfd, cm_head->signal.fd[FI_READ_FD], - FI_EPOLL_IN, NULL); + OFI_EPOLL_IN, NULL); if (ret != 0){ SOCK_LOG_ERROR("failed to add signal fd to epoll\n"); goto err3; } cm_head->do_listen = 1; + cm_head->removed_from_epollfd = false; ret = pthread_create(&cm_head->listener_thread, 0, sock_ep_cm_thread, cm_head); if (ret) { @@ -1486,7 +1264,7 @@ int sock_ep_cm_start_thread(struct sock_ep_cm_head *cm_head) cm_head->do_listen = 0; fd_signal_free(&cm_head->signal); err2: - fi_epoll_close(cm_head->emap); + ofi_epoll_close(cm_head->epollfd); err1: return ret; } @@ -1495,9 +1273,9 @@ void sock_ep_cm_wait_handle_finalized(struct sock_ep_cm_head *cm_head, struct sock_conn_req_handle *handle) { handle->state = SOCK_CONN_HANDLE_FINALIZING; - fastlock_acquire(&cm_head->signal_lock); + pthread_mutex_lock(&cm_head->signal_lock); sock_ep_cm_add_to_msg_list(cm_head, handle); - fastlock_release(&cm_head->signal_lock); + pthread_mutex_unlock(&cm_head->signal_lock); pthread_mutex_lock(&handle->finalized_mutex); while (handle->state != SOCK_CONN_HANDLE_FINALIZED) @@ -1516,10 +1294,10 @@ 
void sock_ep_cm_stop_thread(struct sock_ep_cm_head *cm_head) sock_ep_cm_signal(cm_head); if (cm_head->listener_thread && - pthread_join(cm_head->listener_thread, NULL)) { + pthread_join(cm_head->listener_thread, NULL)) { SOCK_LOG_DBG("pthread join failed\n"); } - fi_epoll_close(cm_head->emap); + ofi_epoll_close(cm_head->epollfd); fd_signal_free(&cm_head->signal); - fastlock_destroy(&cm_head->signal_lock); + pthread_mutex_destroy(&cm_head->signal_lock); } diff --git a/prov/sockets/src/sock_ep_rdm.c b/prov/sockets/src/sock_ep_rdm.c index 700f6b9239a..ede4bba67eb 100644 --- a/prov/sockets/src/sock_ep_rdm.c +++ b/prov/sockets/src/sock_ep_rdm.c @@ -57,278 +57,6 @@ #define SOCK_LOG_DBG(...) _SOCK_LOG_DBG(FI_LOG_EP_CTRL, __VA_ARGS__) #define SOCK_LOG_ERROR(...) _SOCK_LOG_ERROR(FI_LOG_EP_CTRL, __VA_ARGS__) -const struct fi_ep_attr sock_rdm_ep_attr = { - .type = FI_EP_RDM, - .protocol = FI_PROTO_SOCK_TCP, - .protocol_version = SOCK_WIRE_PROTO_VERSION, - .max_msg_size = SOCK_EP_MAX_MSG_SZ, - .msg_prefix_size = SOCK_EP_MSG_PREFIX_SZ, - .max_order_raw_size = SOCK_EP_MAX_ORDER_RAW_SZ, - .max_order_war_size = SOCK_EP_MAX_ORDER_WAR_SZ, - .max_order_waw_size = SOCK_EP_MAX_ORDER_WAW_SZ, - .mem_tag_format = SOCK_EP_MEM_TAG_FMT, - .tx_ctx_cnt = SOCK_EP_MAX_TX_CNT, - .rx_ctx_cnt = SOCK_EP_MAX_RX_CNT, -}; - -const struct fi_tx_attr sock_rdm_tx_attr = { - .caps = SOCK_EP_RDM_CAP_BASE, - .mode = SOCK_MODE, - .op_flags = SOCK_EP_DEFAULT_OP_FLAGS, - .msg_order = SOCK_EP_MSG_ORDER, - .inject_size = SOCK_EP_MAX_INJECT_SZ, - .size = SOCK_EP_TX_SZ, - .iov_limit = SOCK_EP_MAX_IOV_LIMIT, - .rma_iov_limit = SOCK_EP_MAX_IOV_LIMIT, -}; - -const struct fi_rx_attr sock_rdm_rx_attr = { - .caps = SOCK_EP_RDM_CAP_BASE, - .mode = SOCK_MODE, - .op_flags = 0, - .msg_order = SOCK_EP_MSG_ORDER, - .comp_order = SOCK_EP_COMP_ORDER, - .total_buffered_recv = SOCK_EP_MAX_BUFF_RECV, - .size = SOCK_EP_RX_SZ, - .iov_limit = SOCK_EP_MAX_IOV_LIMIT, -}; - -static int sock_rdm_verify_rx_attr(const struct fi_rx_attr *attr) -{ - if (!attr) - return 0; - - if ((attr->caps | SOCK_EP_RDM_CAP) != SOCK_EP_RDM_CAP) { - SOCK_LOG_DBG("Unsupported RDM rx caps\n"); - return -FI_ENODATA; - } - - if ((attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) { - SOCK_LOG_DBG("Unsuported rx message order\n"); - return -FI_ENODATA; - } - - if ((attr->comp_order | SOCK_EP_COMP_ORDER) != SOCK_EP_COMP_ORDER) { - SOCK_LOG_DBG("Unsuported rx completion order\n"); - return -FI_ENODATA; - } - - if (attr->total_buffered_recv > sock_rdm_rx_attr.total_buffered_recv) { - SOCK_LOG_DBG("Buffered receive size too large\n"); - return -FI_ENODATA; - } - - if (sock_get_tx_size(attr->size) > - sock_get_tx_size(sock_rdm_rx_attr.size)) { - SOCK_LOG_DBG("Rx size too large\n"); - return -FI_ENODATA; - } - - if (attr->iov_limit > sock_rdm_rx_attr.iov_limit) { - SOCK_LOG_DBG("Rx iov limit too large\n"); - return -FI_ENODATA; - } - - return 0; -} - -static int sock_rdm_verify_tx_attr(const struct fi_tx_attr *attr) -{ - if (!attr) - return 0; - - if ((attr->caps | SOCK_EP_RDM_CAP) != SOCK_EP_RDM_CAP) { - SOCK_LOG_DBG("Unsupported RDM tx caps\n"); - return -FI_ENODATA; - } - - if ((attr->msg_order | SOCK_EP_MSG_ORDER) != SOCK_EP_MSG_ORDER) { - SOCK_LOG_DBG("Unsupported tx message order\n"); - return -FI_ENODATA; - } - - if (attr->inject_size > sock_rdm_tx_attr.inject_size) { - SOCK_LOG_DBG("Inject size too large\n"); - return -FI_ENODATA; - } - - if (sock_get_tx_size(attr->size) > - sock_get_tx_size(sock_rdm_tx_attr.size)) { - SOCK_LOG_DBG("Tx size too large\n"); - return -FI_ENODATA; 
- } - - if (attr->iov_limit > sock_rdm_tx_attr.iov_limit) { - SOCK_LOG_DBG("Tx iov limit too large\n"); - return -FI_ENODATA; - } - - if (attr->rma_iov_limit > sock_rdm_tx_attr.rma_iov_limit) { - SOCK_LOG_DBG("RMA iov limit too large\n"); - return -FI_ENODATA; - } - - return 0; -} - -int sock_rdm_verify_ep_attr(const struct fi_ep_attr *ep_attr, - const struct fi_tx_attr *tx_attr, - const struct fi_rx_attr *rx_attr) -{ - int ret; - - if (ep_attr) { - switch (ep_attr->protocol) { - case FI_PROTO_UNSPEC: - case FI_PROTO_SOCK_TCP: - break; - default: - SOCK_LOG_DBG("Unsupported protocol\n"); - return -FI_ENODATA; - } - - if (ep_attr->protocol_version && - (ep_attr->protocol_version != sock_rdm_ep_attr.protocol_version)) { - SOCK_LOG_DBG("Invalid protocol version\n"); - return -FI_ENODATA; - } - - if (ep_attr->max_msg_size > sock_rdm_ep_attr.max_msg_size) { - SOCK_LOG_DBG("Message size too large\n"); - return -FI_ENODATA; - } - - if (ep_attr->msg_prefix_size > sock_rdm_ep_attr.msg_prefix_size) { - SOCK_LOG_DBG("Msg prefix size not supported\n"); - return -FI_ENODATA; - } - - if (ep_attr->max_order_raw_size > - sock_rdm_ep_attr.max_order_raw_size) { - SOCK_LOG_DBG("RAW order size too large\n"); - return -FI_ENODATA; - } - - if (ep_attr->max_order_war_size > - sock_rdm_ep_attr.max_order_war_size) { - SOCK_LOG_DBG("WAR order size too large\n"); - return -FI_ENODATA; - } - - if (ep_attr->max_order_waw_size > - sock_rdm_ep_attr.max_order_waw_size) { - SOCK_LOG_DBG("WAW order size too large\n"); - return -FI_ENODATA; - } - - if ((ep_attr->tx_ctx_cnt > SOCK_EP_MAX_TX_CNT) && - ep_attr->tx_ctx_cnt != FI_SHARED_CONTEXT) - return -FI_ENODATA; - - if ((ep_attr->rx_ctx_cnt > SOCK_EP_MAX_RX_CNT) && - ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT) - return -FI_ENODATA; - } - - ret = sock_rdm_verify_tx_attr(tx_attr); - if (ret) - return ret; - - ret = sock_rdm_verify_rx_attr(rx_attr); - if (ret) - return ret; - - return 0; -} - -int sock_rdm_fi_info(uint32_t version, void *src_addr, void *dest_addr, - const struct fi_info *hints, struct fi_info **info) -{ - *info = sock_fi_info(version, FI_EP_RDM, hints, src_addr, dest_addr); - if (!*info) - return -FI_ENOMEM; - - *(*info)->tx_attr = sock_rdm_tx_attr; - (*info)->tx_attr->size = sock_get_tx_size(sock_rdm_tx_attr.size); - *(*info)->rx_attr = sock_rdm_rx_attr; - (*info)->rx_attr->size = sock_get_tx_size(sock_rdm_rx_attr.size); - *(*info)->ep_attr = sock_rdm_ep_attr; - - if (hints && hints->ep_attr) { - if (hints->ep_attr->rx_ctx_cnt) - (*info)->ep_attr->rx_ctx_cnt = hints->ep_attr->rx_ctx_cnt; - if (hints->ep_attr->tx_ctx_cnt) - (*info)->ep_attr->tx_ctx_cnt = hints->ep_attr->tx_ctx_cnt; - } - - if (hints && hints->rx_attr) { - (*info)->rx_attr->op_flags |= hints->rx_attr->op_flags; - if (hints->rx_attr->caps) - (*info)->rx_attr->caps = SOCK_EP_RDM_SEC_CAP | - hints->rx_attr->caps; - } - - if (hints && hints->tx_attr) { - (*info)->tx_attr->op_flags |= hints->tx_attr->op_flags; - if (hints->tx_attr->caps) - (*info)->tx_attr->caps = SOCK_EP_RDM_SEC_CAP | - hints->tx_attr->caps; - } - - (*info)->caps = SOCK_EP_RDM_CAP | - (*info)->rx_attr->caps | (*info)->tx_attr->caps; - if (hints && hints->caps) { - (*info)->caps = SOCK_EP_RDM_SEC_CAP | hints->caps; - (*info)->rx_attr->caps = SOCK_EP_RDM_SEC_CAP | - ((*info)->rx_attr->caps & (*info)->caps); - (*info)->tx_attr->caps = SOCK_EP_RDM_SEC_CAP | - ((*info)->tx_attr->caps & (*info)->caps); - } - return 0; -} - -static int sock_rdm_endpoint(struct fid_domain *domain, struct fi_info *info, - struct sock_ep **ep, void *context, 
size_t fclass) -{ - int ret; - - if (info) { - if (info->ep_attr) { - ret = sock_rdm_verify_ep_attr(info->ep_attr, - info->tx_attr, - info->rx_attr); - if (ret) - return -FI_EINVAL; - } - - if (info->tx_attr) { - ret = sock_rdm_verify_tx_attr(info->tx_attr); - if (ret) - return -FI_EINVAL; - } - - if (info->rx_attr) { - ret = sock_rdm_verify_rx_attr(info->rx_attr); - if (ret) - return -FI_EINVAL; - } - } - - ret = sock_alloc_endpoint(domain, info, ep, context, fclass); - if (ret) - return ret; - - if (!info || !info->ep_attr) - (*ep)->attr->ep_attr = sock_rdm_ep_attr; - - if (!info || !info->tx_attr) - (*ep)->tx_attr = sock_rdm_tx_attr; - - if (!info || !info->rx_attr) - (*ep)->rx_attr = sock_rdm_rx_attr; - - return 0; -} int sock_rdm_ep(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context) @@ -336,7 +64,7 @@ int sock_rdm_ep(struct fid_domain *domain, struct fi_info *info, int ret; struct sock_ep *endpoint; - ret = sock_rdm_endpoint(domain, info, &endpoint, context, FI_CLASS_EP); + ret = sock_alloc_endpoint(domain, info, &endpoint, context, FI_CLASS_EP); if (ret) return ret; @@ -350,11 +78,10 @@ int sock_rdm_sep(struct fid_domain *domain, struct fi_info *info, int ret; struct sock_ep *endpoint; - ret = sock_rdm_endpoint(domain, info, &endpoint, context, FI_CLASS_SEP); + ret = sock_alloc_endpoint(domain, info, &endpoint, context, FI_CLASS_SEP); if (ret) return ret; *sep = &endpoint->ep; return 0; } - diff --git a/prov/sockets/src/sock_fabric.c b/prov/sockets/src/sock_fabric.c index 9e2e091c2cd..162892e1ee0 100644 --- a/prov/sockets/src/sock_fabric.c +++ b/prov/sockets/src/sock_fabric.c @@ -66,23 +66,9 @@ int sock_keepalive_time = INT_MAX; int sock_keepalive_intvl = INT_MAX; int sock_keepalive_probes = INT_MAX; -uint64_t SOCK_EP_RDM_SEC_CAP = SOCK_EP_RDM_SEC_CAP_BASE; -uint64_t SOCK_EP_RDM_CAP = SOCK_EP_RDM_CAP_BASE; -uint64_t SOCK_EP_MSG_SEC_CAP = SOCK_EP_MSG_SEC_CAP_BASE; -uint64_t SOCK_EP_MSG_CAP = SOCK_EP_MSG_CAP_BASE; - - -const struct fi_fabric_attr sock_fabric_attr = { - .fabric = NULL, - .name = NULL, - .prov_name = NULL, - .prov_version = FI_VERSION(SOCK_MAJOR_VERSION, SOCK_MINOR_VERSION), -}; - static struct dlist_entry sock_fab_list; static struct dlist_entry sock_dom_list; static fastlock_t sock_list_lock; -static struct slist sock_addr_list; static int read_default_params; void sock_dom_add_to_list(struct sock_domain *domain) @@ -206,101 +192,6 @@ struct sock_fabric *sock_fab_list_head(void) return fabric; } -int sock_verify_fabric_attr(const struct fi_fabric_attr *attr) -{ - if (!attr) - return 0; - - if (attr->prov_version) { - if (attr->prov_version != - FI_VERSION(SOCK_MAJOR_VERSION, SOCK_MINOR_VERSION)) - return -FI_ENODATA; - } - - return 0; -} - -int sock_verify_info(uint32_t version, const struct fi_info *hints) -{ - uint64_t caps; - enum fi_ep_type ep_type; - int ret; - struct sock_domain *domain; - struct sock_fabric *fabric; - - if (!hints) - return 0; - - ep_type = hints->ep_attr ? 
hints->ep_attr->type : FI_EP_UNSPEC; - switch (ep_type) { - case FI_EP_UNSPEC: - case FI_EP_MSG: - caps = SOCK_EP_MSG_CAP; - ret = sock_msg_verify_ep_attr(hints->ep_attr, - hints->tx_attr, - hints->rx_attr); - break; - case FI_EP_DGRAM: - caps = SOCK_EP_DGRAM_CAP; - ret = sock_dgram_verify_ep_attr(hints->ep_attr, - hints->tx_attr, - hints->rx_attr); - break; - case FI_EP_RDM: - caps = SOCK_EP_RDM_CAP; - ret = sock_rdm_verify_ep_attr(hints->ep_attr, - hints->tx_attr, - hints->rx_attr); - break; - default: - ret = -FI_ENODATA; - } - if (ret) - return ret; - - if ((caps | hints->caps) != caps) { - SOCK_LOG_DBG("Unsupported capabilities\n"); - return -FI_ENODATA; - } - - switch (hints->addr_format) { - case FI_FORMAT_UNSPEC: - case FI_SOCKADDR: - case FI_SOCKADDR_IN: - case FI_SOCKADDR_IN6: - break; - default: - SOCK_LOG_DBG("Unsupported address format\n"); - return -FI_ENODATA; - } - - if (hints->domain_attr && hints->domain_attr->domain) { - domain = container_of(hints->domain_attr->domain, - struct sock_domain, dom_fid); - if (!sock_dom_check_list(domain)) { - SOCK_LOG_DBG("no matching domain\n"); - return -FI_ENODATA; - } - } - ret = sock_verify_domain_attr(version, hints); - if (ret) - return ret; - - if (hints->fabric_attr && hints->fabric_attr->fabric) { - fabric = container_of(hints->fabric_attr->fabric, - struct sock_fabric, fab_fid); - if (!sock_fab_check_list(fabric)) { - SOCK_LOG_DBG("no matching fabric\n"); - return -FI_ENODATA; - } - } - ret = sock_verify_fabric_attr(hints->fabric_attr); - if (ret) - return ret; - - return 0; -} - static int sock_trywait(struct fid_fabric *fabric, struct fid **fids, int count) { /* we're always ready to wait! */ @@ -417,287 +308,34 @@ int sock_get_src_addr(union ofi_sock_ip *dest_addr, return ret; } -static int sock_fi_checkinfo(const struct fi_info *info, - const struct fi_info *hints) -{ - if (hints && hints->domain_attr && hints->domain_attr->name && - strcmp(info->domain_attr->name, hints->domain_attr->name)) - return -FI_ENODATA; - - if (hints && hints->fabric_attr && hints->fabric_attr->name && - strcmp(info->fabric_attr->name, hints->fabric_attr->name)) - return -FI_ENODATA; - - return 0; -} - -static int sock_ep_getinfo(uint32_t version, const char *node, - const char *service, uint64_t flags, - const struct fi_info *hints, enum fi_ep_type ep_type, - struct fi_info **info) -{ - struct addrinfo ai, *rai = NULL; - union ofi_sock_ip *src_addr = NULL, *dest_addr = NULL; - union ofi_sock_ip sip; - int ret; - - memset(&ai, 0, sizeof(ai)); - ai.ai_socktype = SOCK_STREAM; - ai.ai_family = ofi_get_sa_family(hints); - if (flags & FI_NUMERICHOST) - ai.ai_flags |= AI_NUMERICHOST; - - if (flags & FI_SOURCE) { - ai.ai_flags |= AI_PASSIVE; - ret = getaddrinfo(node, service, &ai, &rai); - if (ret) { - SOCK_LOG_DBG("getaddrinfo failed!\n"); - return -FI_ENODATA; - } - src_addr = (union ofi_sock_ip *) rai->ai_addr; - if (hints && hints->dest_addr) - dest_addr = hints->dest_addr; - } else { - if (node || service) { - ret = getaddrinfo(node, service, &ai, &rai); - if (ret) { - SOCK_LOG_DBG("getaddrinfo failed!\n"); - return -FI_ENODATA; - } - dest_addr = (union ofi_sock_ip *) rai->ai_addr; - } else if (hints) { - dest_addr = hints->dest_addr; - } - - if (hints && hints->src_addr) - src_addr = hints->src_addr; - } - - if (dest_addr && !src_addr) { - ret = sock_get_src_addr(dest_addr, &sip); - if (!ret) - src_addr = &sip; - } - - if (dest_addr) { - ofi_straddr_log(&sock_prov, FI_LOG_INFO, FI_LOG_CORE, - "dest addr: ", dest_addr); - } - if (src_addr) { - 
ofi_straddr_log(&sock_prov, FI_LOG_INFO, FI_LOG_CORE, - "src addr: ", src_addr); - } - switch (ep_type) { - case FI_EP_MSG: - ret = sock_msg_fi_info(version, src_addr, dest_addr, hints, info); - break; - case FI_EP_DGRAM: - ret = sock_dgram_fi_info(version, src_addr, dest_addr, hints, info); - break; - case FI_EP_RDM: - ret = sock_rdm_fi_info(version, src_addr, dest_addr, hints, info); - break; - default: - ret = -FI_ENODATA; - break; - } - - if (rai) - freeaddrinfo(rai); - - if (ret == 0) { - ret = sock_fi_checkinfo(*info, hints); - if (ret) - fi_freeinfo(*info); - } - - return ret; -} - -static void sock_init_addrlist(void) -{ - fastlock_acquire(&sock_list_lock); - if (slist_empty(&sock_addr_list)) - ofi_get_list_of_addr(&sock_prov, "iface", &sock_addr_list); - fastlock_release(&sock_list_lock); -} - -int sock_node_getinfo(uint32_t version, const char *node, const char *service, - uint64_t flags, const struct fi_info *hints, struct fi_info **info, - struct fi_info **tail) -{ - enum fi_ep_type ep_type; - struct fi_info *cur; - int ret; - - if (hints && hints->ep_attr) { - switch (hints->ep_attr->type) { - case FI_EP_RDM: - case FI_EP_DGRAM: - case FI_EP_MSG: - ret = sock_ep_getinfo(version, node, service, flags, - hints, hints->ep_attr->type, &cur); - if (ret) { - if (ret == -FI_ENODATA) - return ret; - goto err; - } - - if (!*info) - *info = cur; - else - (*tail)->next = cur; - (*tail) = cur; - return 0; - default: - break; - } - } - for (ep_type = FI_EP_MSG; ep_type <= FI_EP_RDM; ep_type++) { - ret = sock_ep_getinfo(version, node, service, flags, hints, - ep_type, &cur); - if (ret) { - if (ret == -FI_ENODATA) - continue; - goto err; - } - - if (!*info) - *info = cur; - else - (*tail)->next = cur; - (*tail) = cur; - } - if (!*info) { - ret = -FI_ENODATA; - goto err_no_free; - } - return 0; - -err: - fi_freeinfo(*info); - *info = NULL; -err_no_free: - return ret; -} - -static int sock_match_src_addr(struct slist_entry *entry, const void *src_addr) -{ - struct ofi_addr_list_entry *host_entry = - container_of(entry, struct ofi_addr_list_entry, entry); - - return ofi_equals_ipaddr(&host_entry->ipaddr.sa, src_addr); -} - -static int sock_addr_matches_interface(struct slist *addr_list, - struct sockaddr *src_addr) -{ - struct slist_entry *entry; - - /* Always match if it's localhost */ - if (ofi_is_loopback_addr(src_addr)) - return 1; - - entry = slist_find_first_match(addr_list, sock_match_src_addr, src_addr); - return entry ? 
1 : 0; -} - -static int sock_node_matches_interface(struct slist *addr_list, const char *node) -{ - union ofi_sock_ip addr; - struct addrinfo *rai = NULL, ai = { - .ai_socktype = SOCK_STREAM, - }; - - if (getaddrinfo(node, 0, &ai, &rai)) { - SOCK_LOG_DBG("getaddrinfo failed!\n"); - return -FI_EINVAL; - } - if (rai->ai_addrlen > sizeof(addr)) { - freeaddrinfo(rai); - return -FI_EINVAL; - } - - memset(&addr, 0, sizeof addr); - memcpy(&addr, rai->ai_addr, rai->ai_addrlen); - freeaddrinfo(rai); - - return sock_addr_matches_interface(addr_list, &addr.sa); -} - static int sock_getinfo(uint32_t version, const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct fi_info **info) { - int ret = 0; - struct slist_entry *entry, *prev; - struct ofi_addr_list_entry *host_entry; - struct fi_info *tail; - - if (!(flags & FI_SOURCE) && hints && hints->src_addr && - (hints->src_addrlen != ofi_sizeofaddr(hints->src_addr))) - return -FI_ENODATA; - - if (((!node && !service) || (flags & FI_SOURCE)) && - hints && hints->dest_addr && - (hints->dest_addrlen != ofi_sizeofaddr(hints->dest_addr))) - return -FI_ENODATA; - - ret = sock_verify_info(version, hints); - if (ret) - return ret; - - ret = 1; - sock_init_addrlist(); - if ((flags & FI_SOURCE) && node) { - ret = sock_node_matches_interface(&sock_addr_list, node); - } else if (hints && hints->src_addr) { - ret = sock_addr_matches_interface(&sock_addr_list, - hints->src_addr); - } - if (!ret) { - SOCK_LOG_ERROR("Couldn't find a match with local interfaces\n"); - return -FI_ENODATA; - } - - *info = tail = NULL; - if (node || - (!(flags & FI_SOURCE) && hints && hints->src_addr) || - (!(flags & FI_SOURCE) && hints && hints->dest_addr)) - return sock_node_getinfo(version, node, service, flags, - hints, info, &tail); - - (void) prev; /* Makes compiler happy */ - slist_foreach(&sock_addr_list, entry, prev) { - host_entry = container_of(entry, struct ofi_addr_list_entry, entry); - node = host_entry->ipstr; - flags |= FI_SOURCE; - ret = sock_node_getinfo(version, node, service, flags, hints, info, &tail); - if (ret) { - if (ret == -FI_ENODATA) - continue; - return ret; - } - } - - return (!*info) ? 
ret : 0; + return ofi_ip_getinfo(&sock_util_prov, version, node, service, flags, + hints, info); } static void fi_sockets_fini(void) { - ofi_free_list_of_addr(&sock_addr_list); fastlock_destroy(&sock_list_lock); } struct fi_provider sock_prov = { .name = sock_prov_name, - .version = FI_VERSION(SOCK_MAJOR_VERSION, SOCK_MINOR_VERSION), - .fi_version = FI_VERSION(1, 8), + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, .getinfo = sock_getinfo, .fabric = sock_fabric, .cleanup = fi_sockets_fini }; +struct util_prov sock_util_prov = { + .prov = &sock_prov, + .info = &sock_dgram_info, + .flags = 0, +}; + SOCKETS_INI { #if HAVE_SOCKETS_DL @@ -747,11 +385,6 @@ SOCKETS_INI fastlock_init(&sock_list_lock); dlist_init(&sock_fab_list); dlist_init(&sock_dom_list); - slist_init(&sock_addr_list); - SOCK_EP_RDM_SEC_CAP |= OFI_RMA_PMEM; - SOCK_EP_RDM_CAP |= OFI_RMA_PMEM; - SOCK_EP_MSG_SEC_CAP |= OFI_RMA_PMEM; - SOCK_EP_MSG_CAP |= OFI_RMA_PMEM; #if ENABLE_DEBUG fi_param_define(&sock_prov, "dgram_drop_rate", FI_PARAM_INT, "Drop every Nth dgram frame (debug only)"); diff --git a/prov/sockets/src/sock_mr.c b/prov/sockets/src/sock_mr.c index b033f7d54c0..11d77466962 100644 --- a/prov/sockets/src/sock_mr.c +++ b/prov/sockets/src/sock_mr.c @@ -133,6 +133,7 @@ static int sock_regattr(struct fid *fid, const struct fi_mr_attr *attr, { struct fi_eq_entry eq_entry; struct sock_domain *dom; + struct fi_mr_attr cur_abi_attr; struct sock_mr *_mr; uint64_t key; struct fid_domain *domain; @@ -149,6 +150,8 @@ static int sock_regattr(struct fid *fid, const struct fi_mr_attr *attr, if (!_mr) return -FI_ENOMEM; + ofi_mr_update_attr(dom->fab->fab_fid.api_version, dom->info.caps, + attr, &cur_abi_attr); fastlock_acquire(&dom->lock); _mr->mr_fid.fid.fclass = FI_CLASS_MR; @@ -158,12 +161,12 @@ static int sock_regattr(struct fid *fid, const struct fi_mr_attr *attr, _mr->domain = dom; _mr->flags = flags; - ret = ofi_mr_map_insert(&dom->mr_map, attr, &key, _mr); + ret = ofi_mr_map_insert(&dom->mr_map, &cur_abi_attr, &key, _mr); if (ret != 0) goto err; _mr->mr_fid.key = _mr->key = key; - _mr->mr_fid.mem_desc = (void *)(uintptr_t)key; + _mr->mr_fid.mem_desc = (void *) (uintptr_t) key; fastlock_release(&dom->lock); *mr = &_mr->mr_fid; diff --git a/prov/sockets/src/sock_msg.c b/prov/sockets/src/sock_msg.c index 0d09300b8bb..a0e44e6e1d6 100644 --- a/prov/sockets/src/sock_msg.c +++ b/prov/sockets/src/sock_msg.c @@ -135,6 +135,7 @@ ssize_t sock_ep_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, SOCK_LOG_DBG("New rx_entry: %p (ctx: %p)\n", rx_entry, rx_ctx); fastlock_acquire(&rx_ctx->lock); dlist_insert_tail(&rx_entry->entry, &rx_ctx->rx_entry_list); + rx_ctx->progress_start = &rx_ctx->rx_buffered_list; fastlock_release(&rx_ctx->lock); return 0; } @@ -479,6 +480,7 @@ ssize_t sock_ep_trecvmsg(struct fid_ep *ep, fastlock_acquire(&rx_ctx->lock); SOCK_LOG_DBG("New rx_entry: %p (ctx: %p)\n", rx_entry, rx_ctx); dlist_insert_tail(&rx_entry->entry, &rx_ctx->rx_entry_list); + rx_ctx->progress_start = &rx_ctx->rx_buffered_list; fastlock_release(&rx_ctx->lock); return 0; } diff --git a/prov/sockets/src/sock_poll.c b/prov/sockets/src/sock_poll.c index e856774a173..6493b143561 100644 --- a/prov/sockets/src/sock_poll.c +++ b/prov/sockets/src/sock_poll.c @@ -129,12 +129,12 @@ static int sock_poll_poll(struct fid_poll *pollset, void **context, int count) cq = container_of(list_item->fid, struct sock_cq, cq_fid); sock_cq_progress(cq); - fastlock_acquire(&cq->lock); + pthread_mutex_lock(&cq->lock); if (ofi_rbfdused(&cq->cq_rbfd) || 
ofi_rbused(&cq->cqerr_rb)) { *context++ = cq->cq_fid.fid.context; ret_count++; } - fastlock_release(&cq->lock); + pthread_mutex_unlock(&cq->lock); break; case FI_CLASS_CNTR: diff --git a/prov/sockets/src/sock_progress.c b/prov/sockets/src/sock_progress.c index aa2018e47b8..b8f21962fbb 100644 --- a/prov/sockets/src/sock_progress.c +++ b/prov/sockets/src/sock_progress.c @@ -68,8 +68,9 @@ (((uint64_t)_addr) >> (64 - _bits))) -static int sock_pe_progress_buffered_rx(struct sock_rx_ctx *rx_ctx); - +#define SOCK_EP_MAX_PROGRESS_CNT 10 +static int sock_pe_progress_buffered_rx(struct sock_rx_ctx *rx_ctx, + bool shallow); static inline int sock_pe_is_data_msg(int msg_id) { @@ -864,16 +865,15 @@ static void sock_pe_do_atomic(void *cmp, void *dst, void *src, { char tmp_result[SOCK_EP_MAX_ATOMIC_SZ]; - if (op >= OFI_SWAP_OP_START) { - ofi_atomic_swap_handlers[op - OFI_SWAP_OP_START][datatype](dst, - src, cmp, tmp_result, cnt); + if (ofi_atomic_isswap_op(op)) { + ofi_atomic_swap_handler(op, datatype, dst, src, cmp, + tmp_result, cnt); if (cmp != NULL) memcpy(cmp, tmp_result, ofi_datatype_size(datatype) * cnt); - } else if (fetch) { - ofi_atomic_readwrite_handlers[op][datatype](dst, src, - cmp /*results*/, cnt); - } else { - ofi_atomic_write_handlers[op][datatype](dst, src, cnt); + } else if (fetch && ofi_atomic_isreadwrite_op(op)) { + ofi_atomic_readwrite_handler(op, datatype, dst, src, cmp, cnt); + } else if (ofi_atomic_iswrite_op(op)) { + ofi_atomic_write_handler(op, datatype, dst, src, cnt); } } @@ -1059,7 +1059,7 @@ sock_pe_process_rx_tatomic(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx, pe_entry->pe.rx.rx_entry = rx_entry; - sock_pe_progress_buffered_rx(rx_ctx); + sock_pe_progress_buffered_rx(rx_ctx, true); fastlock_release(&rx_ctx->lock); pe_entry->is_complete = 1; @@ -1177,21 +1177,36 @@ ssize_t sock_rx_claim_recv(struct sock_rx_ctx *rx_ctx, void *context, return ret; } -static int sock_pe_progress_buffered_rx(struct sock_rx_ctx *rx_ctx) +/* Check buffered msg list against posted list. 
If shallow is true, + * we only check SOCK_EP_MAX_PROGRESS_CNT messages to prevent progress + * test taking too long */ +static int sock_pe_progress_buffered_rx(struct sock_rx_ctx *rx_ctx, + bool shallow) { struct dlist_entry *entry; struct sock_pe_entry pe_entry; struct sock_rx_entry *rx_buffered, *rx_posted; size_t i, rem = 0, offset, len, used_len, dst_offset, datatype_sz; + size_t max_cnt; char *src, *dst; if (dlist_empty(&rx_ctx->rx_entry_list) || dlist_empty(&rx_ctx->rx_buffered_list)) return 0; - for (entry = rx_ctx->rx_buffered_list.next; - entry != &rx_ctx->rx_buffered_list;) { - + if (!shallow) { + /* ignoring rx_ctx->progress_start */ + entry = rx_ctx->rx_buffered_list.next; + max_cnt = SIZE_MAX; + } else { + /* continue where last time left off */ + entry = rx_ctx->progress_start; + if (entry == &rx_ctx->rx_buffered_list) { + entry = entry->next; + } + max_cnt = SOCK_EP_MAX_PROGRESS_CNT; + } + for (i = 0; i < max_cnt && entry != &rx_ctx->rx_buffered_list; i++) { rx_buffered = container_of(entry, struct sock_rx_entry, entry); entry = entry->next; @@ -1294,6 +1309,8 @@ static int sock_pe_progress_buffered_rx(struct sock_rx_ctx *rx_ctx) rx_ctx->num_left++; } } + /* remember where we left off for next shallow progress */ + rx_ctx->progress_start = entry; return 0; } @@ -1308,6 +1325,10 @@ static int sock_pe_process_rx_send(struct sock_pe *pe, offset = 0; len = sizeof(struct sock_msg_hdr); + if (pe_entry->addr == FI_ADDR_NOTAVAIL && + pe_entry->ep_attr->ep_type == FI_EP_RDM && pe_entry->ep_attr->av) + pe_entry->addr = pe_entry->conn->av_index; + if (pe_entry->msg_hdr.op_type == SOCK_OP_TSEND) { if (sock_pe_recv_field(pe_entry, &pe_entry->tag, SOCK_TAG_SIZE, len)) @@ -1325,7 +1346,8 @@ static int sock_pe_process_rx_send(struct sock_pe *pe, data_len = pe_entry->msg_hdr.msg_len - len; if (pe_entry->done_len == len && !pe_entry->pe.rx.rx_entry) { fastlock_acquire(&rx_ctx->lock); - sock_pe_progress_buffered_rx(rx_ctx); + rx_ctx->progress_start = &rx_ctx->rx_buffered_list; + sock_pe_progress_buffered_rx(rx_ctx, false); rx_entry = sock_rx_get_entry(rx_ctx, pe_entry->addr, pe_entry->tag, pe_entry->msg_hdr.op_type == SOCK_OP_TSEND ? 
1 : 0); @@ -1923,13 +1945,12 @@ static int sock_pe_progress_tx_entry(struct sock_pe *pe, goto out; if (sock_comm_is_disconnected(pe_entry)) { - SOCK_LOG_DBG("conn disconnected: removing fd from pollset\n"); - if (pe_entry->ep_attr->cmap.used > 0 && - pe_entry->conn->sock_fd != -1) { - fastlock_acquire(&pe_entry->ep_attr->cmap.lock); - sock_ep_remove_conn(pe_entry->ep_attr, pe_entry->conn); - fastlock_release(&pe_entry->ep_attr->cmap.lock); - } + ofi_straddr_log(&sock_prov, FI_LOG_WARN, FI_LOG_EP_DATA, + "Peer disconnected: removing fd from pollset", + &pe_entry->conn->addr.sa); + fastlock_acquire(&pe_entry->ep_attr->cmap.lock); + sock_ep_remove_conn(pe_entry->ep_attr, pe_entry->conn); + fastlock_release(&pe_entry->ep_attr->cmap.lock); sock_pe_report_tx_error(pe_entry, 0, FI_EIO); pe_entry->is_complete = 1; @@ -2002,13 +2023,12 @@ static int sock_pe_progress_rx_pe_entry(struct sock_pe *pe, int ret; if (sock_comm_is_disconnected(pe_entry)) { - SOCK_LOG_DBG("conn disconnected: removing fd from pollset\n"); - if (pe_entry->ep_attr->cmap.used > 0 && - pe_entry->conn->sock_fd != -1) { - fastlock_acquire(&pe_entry->ep_attr->cmap.lock); - sock_ep_remove_conn(pe_entry->ep_attr, pe_entry->conn); - fastlock_release(&pe_entry->ep_attr->cmap.lock); - } + ofi_straddr_log(&sock_prov, FI_LOG_WARN, FI_LOG_EP_DATA, + "Peer disconnected: removing fd from pollset", + &pe_entry->conn->addr.sa); + fastlock_acquire(&pe_entry->ep_attr->cmap.lock); + sock_ep_remove_conn(pe_entry->ep_attr, pe_entry->conn); + fastlock_release(&pe_entry->ep_attr->cmap.lock); if (pe_entry->pe.rx.header_read) sock_pe_report_rx_error(pe_entry, 0, FI_EIO); @@ -2277,7 +2297,7 @@ void sock_pe_signal(struct sock_pe *pe) void sock_pe_poll_add(struct sock_pe *pe, int fd) { fastlock_acquire(&pe->signal_lock); - if (fi_epoll_add(pe->epoll_set, fd, FI_EPOLL_IN, NULL)) + if (ofi_epoll_add(pe->epoll_set, fd, OFI_EPOLL_IN, NULL)) SOCK_LOG_ERROR("failed to add to epoll set: %d\n", fd); fastlock_release(&pe->signal_lock); } @@ -2285,7 +2305,7 @@ void sock_pe_poll_add(struct sock_pe *pe, int fd) void sock_pe_poll_del(struct sock_pe *pe, int fd) { fastlock_acquire(&pe->signal_lock); - if (fi_epoll_del(pe->epoll_set, fd)) + if (ofi_epoll_del(pe->epoll_set, fd)) SOCK_LOG_DBG("failed to del from epoll set: %d\n", fd); fastlock_release(&pe->signal_lock); } @@ -2366,7 +2386,7 @@ static int sock_pe_progress_rx_ep(struct sock_pe *pe, } } - num_fds = fi_epoll_wait(map->epoll_set, map->epoll_ctxs, + num_fds = ofi_epoll_wait(map->epoll_set, map->epoll_ctxs, MIN(map->used, map->epoll_ctxs_sz), 0); if (num_fds < 0 || num_fds == 0) { if (num_fds < 0) @@ -2400,7 +2420,7 @@ int sock_pe_progress_rx_ctx(struct sock_pe *pe, struct sock_rx_ctx *rx_ctx) fastlock_acquire(&pe->lock); fastlock_acquire(&rx_ctx->lock); - sock_pe_progress_buffered_rx(rx_ctx); + sock_pe_progress_buffered_rx(rx_ctx, true); fastlock_release(&rx_ctx->lock); /* check for incoming data */ @@ -2538,7 +2558,7 @@ static int sock_pe_wait_ok(struct sock_pe *pe) struct sock_tx_ctx *tx_ctx; struct sock_rx_ctx *rx_ctx; - if (pe->waittime && ((fi_gettime_ms() - pe->waittime) < (uint64_t)sock_pe_waittime)) + if (pe->waittime && ((ofi_gettime_ms() - pe->waittime) < (uint64_t)sock_pe_waittime)) return 0; if (dlist_empty(&pe->tx_list) && dlist_empty(&pe->rx_list)) @@ -2577,7 +2597,7 @@ static void sock_pe_wait(struct sock_pe *pe) int ret; void *ep_contexts[1]; - ret = fi_epoll_wait(pe->epoll_set, ep_contexts, 1, -1); + ret = ofi_epoll_wait(pe->epoll_set, ep_contexts, 1, -1); if (ret < 0) SOCK_LOG_ERROR("poll 
failed : %s\n", strerror(ofi_sockerr())); @@ -2589,7 +2609,7 @@ static void sock_pe_wait(struct sock_pe *pe) SOCK_LOG_ERROR("Invalid signal\n"); } fastlock_release(&pe->signal_lock); - pe->waittime = fi_gettime_ms(); + pe->waittime = ofi_gettime_ms(); } static void sock_pe_set_affinity(void) @@ -2697,7 +2717,7 @@ struct sock_pe *sock_pe_init(struct sock_domain *domain) pthread_mutex_init(&pe->list_lock, NULL); pe->domain = domain; - + ret = ofi_bufpool_create(&pe->pe_rx_pool, sizeof(struct sock_pe_entry), 16, 0, 1024, 0); if (ret) { @@ -2712,7 +2732,7 @@ struct sock_pe *sock_pe_init(struct sock_domain *domain) goto err2; } - if (fi_epoll_create(&pe->epoll_set) < 0) { + if (ofi_epoll_create(&pe->epoll_set) < 0) { SOCK_LOG_ERROR("failed to create epoll set\n"); goto err3; } @@ -2722,9 +2742,9 @@ struct sock_pe *sock_pe_init(struct sock_domain *domain) goto err4; if (fd_set_nonblock(pe->signal_fds[SOCK_SIGNAL_RD_FD]) || - fi_epoll_add(pe->epoll_set, + ofi_epoll_add(pe->epoll_set, pe->signal_fds[SOCK_SIGNAL_RD_FD], - FI_EPOLL_IN, NULL)) + OFI_EPOLL_IN, NULL)) goto err5; pe->do_progress = 1; @@ -2741,7 +2761,7 @@ struct sock_pe *sock_pe_init(struct sock_domain *domain) ofi_close_socket(pe->signal_fds[0]); ofi_close_socket(pe->signal_fds[1]); err4: - fi_epoll_close(pe->epoll_set); + ofi_epoll_close(pe->epoll_set); err3: ofi_bufpool_destroy(pe->atomic_rx_pool); err2: @@ -2788,7 +2808,7 @@ void sock_pe_finalize(struct sock_pe *pe) fastlock_destroy(&pe->lock); fastlock_destroy(&pe->signal_lock); pthread_mutex_destroy(&pe->list_lock); - fi_epoll_close(pe->epoll_set); + ofi_epoll_close(pe->epoll_set); free(pe); SOCK_LOG_DBG("Progress engine finalize: OK\n"); } diff --git a/prov/sockets/src/sock_rx_entry.c b/prov/sockets/src/sock_rx_entry.c index 7ff4b5576b7..8f3e082df6a 100644 --- a/prov/sockets/src/sock_rx_entry.c +++ b/prov/sockets/src/sock_rx_entry.c @@ -124,6 +124,7 @@ struct sock_rx_entry *sock_rx_new_buffered_entry(struct sock_rx_ctx *rx_ctx, rx_ctx->buffered_len += len; dlist_insert_tail(&rx_entry->entry, &rx_ctx->rx_buffered_list); + rx_ctx->progress_start = &rx_ctx->rx_buffered_list; return rx_entry; } diff --git a/prov/sockets/src/sock_wait.c b/prov/sockets/src/sock_wait.c index 6f53cab16b2..578c8b15f88 100644 --- a/prov/sockets/src/sock_wait.c +++ b/prov/sockets/src/sock_wait.c @@ -127,7 +127,7 @@ static int sock_wait_wait(struct fid_wait *wait_fid, int timeout) wait = container_of(wait_fid, struct sock_wait, wait_fid); if (timeout > 0) - start_ms = fi_gettime_ms(); + start_ms = ofi_gettime_ms(); head = &wait->fid_list; for (p = head->next; p != head; p = p->next) { @@ -149,7 +149,7 @@ static int sock_wait_wait(struct fid_wait *wait_fid, int timeout) } } if (timeout > 0) { - end_ms = fi_gettime_ms(); + end_ms = ofi_gettime_ms(); timeout -= (int) (end_ms - start_ms); timeout = timeout < 0 ? 0 : timeout; } diff --git a/prov/tcp/src/tcpx.h b/prov/tcp/src/tcpx.h index feca63b2c73..585a5d7fb53 100644 --- a/prov/tcp/src/tcpx.h +++ b/prov/tcp/src/tcpx.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2017-2020 Intel Corporation, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -62,18 +62,14 @@ #ifndef _TCP_H_ #define _TCP_H_ -#define TCPX_MAJOR_VERSION 1 -#define TCPX_MINOR_VERSION 0 - #define TCPX_HDR_VERSION 3 #define TCPX_CTRL_HDR_VERSION 3 -#define TCPX_MAX_CM_DATA_SIZE (1<<8) +#define TCPX_MAX_CM_DATA_SIZE (1 << 8) #define TCPX_IOV_LIMIT (4) #define TCPX_MAX_INJECT_SZ (64) -#define MAX_EPOLL_EVENTS 100 -#define STAGE_BUF_SIZE 512 +#define MAX_POLL_EVENTS 100 #define TCPX_MIN_MULTI_RECV 16384 @@ -83,6 +79,7 @@ extern struct fi_provider tcpx_prov; extern struct util_prov tcpx_util_prov; extern struct fi_info tcpx_info; extern struct tcpx_port_range port_range; +extern int tcpx_nodelay; struct tcpx_xfer_entry; struct tcpx_ep; @@ -98,19 +95,26 @@ enum tcpx_xfer_op_codes { TCPX_OP_CODE_MAX, }; -enum tcpx_cm_event_type { - SERVER_SOCK_ACCEPT, - CLIENT_SEND_CONNREQ, - SERVER_RECV_CONNREQ, - SERVER_SEND_CM_ACCEPT, - CLIENT_RECV_CONNRESP, +enum tcpx_cm_state { + TCPX_CM_LISTENING, + TCPX_CM_CONNECTING, + TCPX_CM_WAIT_REQ, + TCPX_CM_REQ_SENT, + TCPX_CM_REQ_RVCD, + TCPX_CM_RESP_READY, + /* CM context is freed once connected */ +}; + +struct tcpx_cm_msg { + struct ofi_ctrl_hdr hdr; + char data[TCPX_MAX_CM_DATA_SIZE]; }; struct tcpx_cm_context { fid_t fid; - enum tcpx_cm_event_type type; + enum tcpx_cm_state state; size_t cm_data_sz; - char cm_data[TCPX_MAX_CM_DATA_SIZE]; + struct tcpx_cm_msg msg; }; struct tcpx_port_range { @@ -121,7 +125,7 @@ struct tcpx_port_range { struct tcpx_conn_handle { struct fid handle; struct tcpx_pep *pep; - SOCKET conn_fd; + SOCKET sock; bool endian_match; }; @@ -132,11 +136,13 @@ struct tcpx_pep { struct tcpx_cm_context cm_ctx; }; -enum tcpx_cm_state { - TCPX_EP_CONNECTING, - TCPX_EP_CONNECTED, - TCPX_EP_SHUTDOWN, - TCPX_EP_ERROR, +enum tcpx_state { + TCPX_IDLE, + TCPX_CONNECTING, + TCPX_RCVD_REQ, + TCPX_ACCEPTING, + TCPX_CONNECTED, + TCPX_DISCONNECTED, }; struct tcpx_base_hdr { @@ -161,7 +167,7 @@ struct tcpx_cq_data_hdr { TCPX_IOV_LIMIT + \ TCPX_MAX_INJECT_SZ) -struct tcpx_rx_detect { +struct tcpx_cur_rx_msg { union { struct tcpx_base_hdr base_hdr; uint8_t max_hdr[TCPX_MAX_HDR_SZ]; @@ -179,20 +185,21 @@ struct tcpx_rx_ctx { }; typedef int (*tcpx_rx_process_fn_t)(struct tcpx_xfer_entry *rx_entry); -typedef void (*tcpx_ep_progress_func_t)(struct tcpx_ep *ep); -typedef int (*tcpx_get_rx_func_t)(struct tcpx_ep *ep); + +enum { + STAGE_BUF_SIZE = 512 +}; struct stage_buf { uint8_t buf[STAGE_BUF_SIZE]; - size_t size; - size_t len; - size_t off; + size_t bytes_avail; + size_t cur_pos; }; struct tcpx_ep { struct util_ep util_ep; - SOCKET conn_fd; - struct tcpx_rx_detect rx_detect; + SOCKET sock; + struct tcpx_cur_rx_msg cur_rx_msg; struct tcpx_xfer_entry *cur_rx_entry; tcpx_rx_process_fn_t cur_rx_proc_fn; struct dlist_entry ep_entry; @@ -201,23 +208,20 @@ struct tcpx_ep { struct slist tx_rsp_pend_queue; struct slist rma_read_queue; struct tcpx_rx_ctx *srx_ctx; - enum tcpx_cm_state cm_state; - /* lock for protecting tx/rx queues,rma list,cm_state*/ + enum tcpx_state state; + /* lock for protecting tx/rx queues, rma list, state*/ fastlock_t lock; - tcpx_ep_progress_func_t progress_func; - tcpx_get_rx_func_t get_rx_entry[ofi_op_write + 1]; + int (*start_op[ofi_op_write + 1])(struct tcpx_ep *ep); void (*hdr_bswap)(struct tcpx_base_hdr *hdr); struct stage_buf stage_buf; size_t min_multi_recv_size; - bool send_ready_monitor; + bool pollout_set; }; struct tcpx_fabric { struct util_fabric util_fabric; }; -typedef void (*release_func_t)(struct tcpx_xfer_entry *xfer_entry); - struct 
tcpx_xfer_entry { struct slist_entry entry; union { @@ -232,13 +236,22 @@ struct tcpx_xfer_entry { void *context; uint64_t rem_len; void *mrecv_msg_start; - release_func_t rx_msg_release_fn; }; struct tcpx_domain { - struct util_domain util_domain; + struct util_domain util_domain; + struct ofi_ops_dynamic_rbuf *dynamic_rbuf; }; +static inline struct ofi_ops_dynamic_rbuf *tcpx_dynamic_rbuf(struct tcpx_ep *ep) +{ + struct tcpx_domain *domain; + + domain = container_of(ep->util_ep.domain, struct tcpx_domain, + util_domain); + return domain->dynamic_rbuf; +} + struct tcpx_buf_pool { struct ofi_bufpool *pool; enum tcpx_xfer_op_codes op_type; @@ -274,6 +287,7 @@ int tcpx_domain_open(struct fid_fabric *fabric, struct fi_info *info, int tcpx_endpoint(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep_fid, void *context); +void tcpx_ep_disable(struct tcpx_ep *ep, int cm_err); int tcpx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, @@ -285,33 +299,29 @@ void tcpx_cq_report_error(struct util_cq *cq, int err); +ssize_t tcpx_recv_hdr(SOCKET sock, struct stage_buf *stage_buf, + struct tcpx_cur_rx_msg *cur_rx_msg); int tcpx_recv_msg_data(struct tcpx_xfer_entry *recv_entry); int tcpx_send_msg(struct tcpx_xfer_entry *tx_entry); -int tcpx_recv_hdr(SOCKET sock, struct stage_buf *sbuf, - struct tcpx_rx_detect *rx_detect); int tcpx_read_to_buffer(SOCKET sock, struct stage_buf *stage_buf); struct tcpx_xfer_entry *tcpx_xfer_entry_alloc(struct tcpx_cq *cq, enum tcpx_xfer_op_codes type); -void tcpx_xfer_entry_release(struct tcpx_cq *tcpx_cq, - struct tcpx_xfer_entry *xfer_entry); -void tcpx_srx_xfer_release(struct tcpx_rx_ctx *srx_ctx, - struct tcpx_xfer_entry *xfer_entry); - -void tcpx_rx_msg_release(struct tcpx_xfer_entry *rx_entry); -void tcpx_rx_multi_recv_release(struct tcpx_xfer_entry *rx_entry); -struct tcpx_xfer_entry * -tcpx_srx_next_xfer_entry(struct tcpx_rx_ctx *srx_ctx, - struct tcpx_ep *ep, size_t entry_size); - -void tcpx_progress(struct util_ep *util_ep); -void tcpx_ep_progress(struct tcpx_ep *ep); +struct tcpx_xfer_entry *tcpx_srx_entry_alloc(struct tcpx_rx_ctx *srx_ctx, + struct tcpx_ep *ep); +void tcpx_xfer_entry_free(struct tcpx_cq *tcpx_cq, + struct tcpx_xfer_entry *xfer_entry); +void tcpx_srx_entry_free(struct tcpx_rx_ctx *srx_ctx, + struct tcpx_xfer_entry *xfer_entry); +void tcpx_rx_entry_free(struct tcpx_xfer_entry *rx_entry); + +void tcpx_progress_tx(struct tcpx_ep *ep); +void tcpx_progress_rx(struct tcpx_ep *ep); +int tcpx_try_func(void *util_ep); void tcpx_hdr_none(struct tcpx_base_hdr *hdr); void tcpx_hdr_bswap(struct tcpx_base_hdr *hdr); -int tcpx_ep_shutdown_report(struct tcpx_ep *ep, fid_t fid); -int tcpx_cq_wait_ep_add(struct tcpx_ep *ep); void tcpx_tx_queue_insert(struct tcpx_ep *tcpx_ep, struct tcpx_xfer_entry *tx_entry); @@ -320,10 +330,10 @@ int tcpx_eq_wait_try_func(void *arg); int tcpx_eq_create(struct fid_fabric *fabric_fid, struct fi_eq_attr *attr, struct fid_eq **eq_fid, void *context); -int tcpx_get_rx_entry_op_invalid(struct tcpx_ep *tcpx_ep); -int tcpx_get_rx_entry_op_msg(struct tcpx_ep *tcpx_ep); -int tcpx_get_rx_entry_op_read_req(struct tcpx_ep *tcpx_ep); -int tcpx_get_rx_entry_op_write(struct tcpx_ep *tcpx_ep); -int tcpx_get_rx_entry_op_read_rsp(struct tcpx_ep *tcpx_ep); +int tcpx_op_invalid(struct tcpx_ep *tcpx_ep); +int tcpx_op_msg(struct tcpx_ep *tcpx_ep); +int tcpx_op_read_req(struct tcpx_ep *tcpx_ep); +int tcpx_op_write(struct tcpx_ep *tcpx_ep); +int tcpx_op_read_rsp(struct tcpx_ep *tcpx_ep); #endif //_TCP_H_ diff --git 
a/prov/tcp/src/tcpx_attr.c b/prov/tcp/src/tcpx_attr.c index db67084d6df..f0f59d64873 100644 --- a/prov/tcp/src/tcpx_attr.c +++ b/prov/tcp/src/tcpx_attr.c @@ -37,7 +37,7 @@ #define TCPX_EP_CAPS (FI_MSG | FI_RMA | FI_RMA_PMEM) #define TCPX_TX_CAPS (FI_SEND | FI_WRITE | FI_READ) #define TCPX_RX_CAPS (FI_RECV | FI_REMOTE_READ | \ - FI_REMOTE_WRITE | FI_MULTI_RECV) + FI_REMOTE_WRITE) #define TCPX_MSG_ORDER (OFI_ORDER_RAR_SET | OFI_ORDER_RAW_SET | FI_ORDER_RAS | \ @@ -48,7 +48,7 @@ (FI_INJECT | FI_INJECT_COMPLETE | FI_TRANSMIT_COMPLETE | \ FI_DELIVERY_COMPLETE | FI_COMMIT_COMPLETE | FI_COMPLETION) -#define TCPX_RX_OP_FLAGS (FI_MULTI_RECV | FI_COMPLETION) +#define TCPX_RX_OP_FLAGS (FI_COMPLETION) static struct fi_tx_attr tcpx_tx_attr = { .caps = TCPX_EP_CAPS | TCPX_TX_CAPS, @@ -104,7 +104,7 @@ static struct fi_domain_attr tcpx_domain_attr = { static struct fi_fabric_attr tcpx_fabric_attr = { .name = "TCP-IP", - .prov_version = FI_VERSION(TCPX_MAJOR_VERSION, TCPX_MINOR_VERSION), + .prov_version = OFI_VERSION_DEF_PROV, }; struct fi_info tcpx_info = { diff --git a/prov/tcp/src/tcpx_comm.c b/prov/tcp/src/tcpx_comm.c index ff0fce6dfe6..d790b01379f 100644 --- a/prov/tcp/src/tcpx_comm.c +++ b/prov/tcp/src/tcpx_comm.c @@ -45,8 +45,7 @@ int tcpx_send_msg(struct tcpx_xfer_entry *tx_entry) msg.msg_iov = tx_entry->iov; msg.msg_iovlen = tx_entry->iov_cnt; - bytes_sent = ofi_sendmsg_tcp(tx_entry->ep->conn_fd, - &msg, MSG_NOSIGNAL); + bytes_sent = ofi_sendmsg_tcp(tx_entry->ep->sock, &msg, MSG_NOSIGNAL); if (bytes_sent < 0) return ofi_sockerr() == EPIPE ? -FI_ENOTCONN : -ofi_sockerr(); @@ -58,76 +57,51 @@ int tcpx_send_msg(struct tcpx_xfer_entry *tx_entry) return FI_SUCCESS; } -static ssize_t tcpx_read_from_buffer(struct stage_buf *sbuf, +static ssize_t tcpx_read_from_buffer(struct stage_buf *stage_buf, uint8_t *buf, size_t len) { size_t rem_size; ssize_t ret; - assert(sbuf->len >= sbuf->off); - rem_size = sbuf->len - sbuf->off; - assert(rem_size); - ret = (rem_size >= len)? len : rem_size; - memcpy(buf, &sbuf->buf[sbuf->off], ret); - sbuf->off += ret; + assert(stage_buf->cur_pos < stage_buf->bytes_avail); + rem_size = stage_buf->bytes_avail - stage_buf->cur_pos; + ret = (rem_size >= len) ? len : rem_size; + memcpy(buf, &stage_buf->buf[stage_buf->cur_pos], ret); + stage_buf->cur_pos += ret; return ret; } -int tcpx_recv_rem_hdr(SOCKET sock, struct stage_buf *sbuf, - struct tcpx_rx_detect *rx_detect) +ssize_t tcpx_recv_hdr(SOCKET sock, struct stage_buf *stage_buf, + struct tcpx_cur_rx_msg *cur_rx_msg) { - void *rem_buf; + ssize_t bytes_recvd, bytes_read; size_t rem_len; - ssize_t bytes_recvd; - - rem_buf = (uint8_t *) &rx_detect->hdr + rx_detect->done_len; - rem_len = rx_detect->hdr_len - rx_detect->done_len; - - if (sbuf->len != sbuf->off) { - bytes_recvd = tcpx_read_from_buffer(sbuf, rem_buf, rem_len); - } else { - bytes_recvd = ofi_recv_socket(sock, rem_buf, rem_len, 0); - } - if (bytes_recvd <= 0) - return (bytes_recvd)? -ofi_sockerr(): -FI_ENOTCONN; - - rx_detect->done_len += bytes_recvd; - return (rx_detect->done_len == rx_detect->hdr_len)? 
- FI_SUCCESS : -FI_EAGAIN; -} - -int tcpx_recv_hdr(SOCKET sock, struct stage_buf *sbuf, - struct tcpx_rx_detect *rx_detect) -{ void *rem_buf; - size_t rem_len; - ssize_t bytes_recvd; - rem_buf = (uint8_t *) &rx_detect->hdr + rx_detect->done_len; - rem_len = rx_detect->hdr_len - rx_detect->done_len; + rem_buf = (uint8_t *) &cur_rx_msg->hdr + cur_rx_msg->done_len; + rem_len = cur_rx_msg->hdr_len - cur_rx_msg->done_len; + + if (stage_buf->cur_pos < stage_buf->bytes_avail) { + bytes_read = tcpx_read_from_buffer(stage_buf, rem_buf, rem_len); + rem_len -= bytes_read; + if (!rem_len) + return bytes_read; - if (sbuf->len != sbuf->off) { - bytes_recvd = tcpx_read_from_buffer(sbuf, rem_buf, rem_len); + rem_buf = (char *) rem_buf + bytes_read; } else { - bytes_recvd = ofi_recv_socket(sock, rem_buf, rem_len, 0); + bytes_read = 0; } - if (bytes_recvd <= 0) - return (bytes_recvd)? -ofi_sockerr(): -FI_ENOTCONN; - rx_detect->done_len += bytes_recvd; - - if (rx_detect->done_len == sizeof(rx_detect->hdr.base_hdr)) { - rx_detect->hdr_len = (size_t) rx_detect->hdr.base_hdr.payload_off; - - if (rx_detect->hdr_len > rx_detect->done_len) - return tcpx_recv_rem_hdr(sock, sbuf, rx_detect); - } + bytes_recvd = ofi_recv_socket(sock, rem_buf, rem_len, 0); + if (bytes_recvd < 0) + return bytes_read ? bytes_read : -ofi_sockerr(); + else if (bytes_recvd == 0) + return -FI_ENOTCONN; - return (rx_detect->done_len == rx_detect->hdr_len)? - FI_SUCCESS : -FI_EAGAIN; + return bytes_read + bytes_recvd; } -static ssize_t tcpx_readv_from_buffer(struct stage_buf *sbuf, +static ssize_t tcpx_readv_from_buffer(struct stage_buf *stage_buf, struct iovec *iov, int iov_cnt) { @@ -136,15 +110,15 @@ static ssize_t tcpx_readv_from_buffer(struct stage_buf *sbuf, int i; if (iov_cnt == 1) - return tcpx_read_from_buffer(sbuf, iov[0].iov_base, + return tcpx_read_from_buffer(stage_buf, iov[0].iov_base, iov[0].iov_len); for (i = 0; i < iov_cnt; i++) { - bytes_read = tcpx_read_from_buffer(sbuf, iov[i].iov_base, + bytes_read = tcpx_read_from_buffer(stage_buf, iov[i].iov_base, iov[i].iov_len); ret += bytes_read; if ((bytes_read < iov[i].iov_len) || - !(sbuf->len - sbuf->off)) + !(stage_buf->bytes_avail - stage_buf->cur_pos)) break; } return ret; @@ -152,23 +126,34 @@ static ssize_t tcpx_readv_from_buffer(struct stage_buf *sbuf, int tcpx_recv_msg_data(struct tcpx_xfer_entry *rx_entry) { - ssize_t bytes_recvd; - - if (rx_entry->ep->stage_buf.len != rx_entry->ep->stage_buf.off) { - bytes_recvd = tcpx_readv_from_buffer(&rx_entry->ep->stage_buf, - rx_entry->iov, - rx_entry->iov_cnt); - }else { - bytes_recvd = ofi_readv_socket(rx_entry->ep->conn_fd, - rx_entry->iov, - rx_entry->iov_cnt); + struct stage_buf *stage_buf; + ssize_t bytes_recvd, bytes_read; + + if (!rx_entry->iov_cnt || !rx_entry->iov[0].iov_len) + return FI_SUCCESS; + + stage_buf = &rx_entry->ep->stage_buf; + if (stage_buf->cur_pos < stage_buf->bytes_avail) { + bytes_read = tcpx_readv_from_buffer(stage_buf, + rx_entry->iov, + rx_entry->iov_cnt); + ofi_consume_iov(rx_entry->iov, &rx_entry->iov_cnt, bytes_read); + if (!rx_entry->iov_cnt || !rx_entry->iov[0].iov_len) + return FI_SUCCESS; + } else { + bytes_read = 0; } - if (bytes_recvd <= 0) - return (bytes_recvd)? -ofi_sockerr(): -FI_ENOTCONN; + + bytes_recvd = ofi_readv_socket(rx_entry->ep->sock, rx_entry->iov, + rx_entry->iov_cnt); + if (bytes_recvd < 0) + return bytes_read ? 
-FI_EAGAIN : -ofi_sockerr(); + else if (bytes_recvd == 0) + return -FI_ENOTCONN; ofi_consume_iov(rx_entry->iov, &rx_entry->iov_cnt, bytes_recvd); - return (rx_entry->iov_cnt && rx_entry->iov[0].iov_len)? - -FI_EAGAIN: FI_SUCCESS; + return (!rx_entry->iov_cnt || !rx_entry->iov[0].iov_len) ? + FI_SUCCESS : -FI_EAGAIN; } int tcpx_read_to_buffer(SOCKET sock, struct stage_buf *stage_buf) @@ -176,11 +161,11 @@ int tcpx_read_to_buffer(SOCKET sock, struct stage_buf *stage_buf) int bytes_recvd; bytes_recvd = ofi_recv_socket(sock, stage_buf->buf, - stage_buf->size, 0); + sizeof(stage_buf->buf), 0); if (bytes_recvd <= 0) - return (bytes_recvd)? -ofi_sockerr(): -FI_ENOTCONN; + return (bytes_recvd) ? -ofi_sockerr(): -FI_ENOTCONN; - stage_buf->len = bytes_recvd; - stage_buf->off = 0; + stage_buf->bytes_avail = bytes_recvd; + stage_buf->cur_pos = 0; return FI_SUCCESS; } diff --git a/prov/tcp/src/tcpx_conn_mgr.c b/prov/tcp/src/tcpx_conn_mgr.c index 6be176fff31..8ed3299d27f 100644 --- a/prov/tcp/src/tcpx_conn_mgr.c +++ b/prov/tcp/src/tcpx_conn_mgr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2017-2020 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -39,247 +39,289 @@ #include -static int read_cm_data(SOCKET fd, struct tcpx_cm_context *cm_ctx, - struct ofi_ctrl_hdr *hdr) +/* The underlying socket has the POLLIN event set. The entire + * CM message should be readable, as it fits within a single MTU + * and is the first data transferred over the socket. + */ +static int rx_cm_data(SOCKET fd, int type, struct tcpx_cm_context *cm_ctx) { - cm_ctx->cm_data_sz = ntohs(hdr->seg_size); - if (cm_ctx->cm_data_sz) { - size_t data_sz = MIN(cm_ctx->cm_data_sz, - TCPX_MAX_CM_DATA_SIZE); - ssize_t ret = ofi_recv_socket(fd, cm_ctx->cm_data, - data_sz, MSG_WAITALL); - if ((size_t) ret != data_sz) - return -FI_EIO; - cm_ctx->cm_data_sz = data_sz; - - if (OFI_UNLIKELY(cm_ctx->cm_data_sz > - TCPX_MAX_CM_DATA_SIZE)) { - ofi_discard_socket(fd, cm_ctx->cm_data_sz - - TCPX_MAX_CM_DATA_SIZE); - } + size_t data_size = 0; + ssize_t ret; + + ret = ofi_recv_socket(fd, &cm_ctx->msg.hdr, sizeof(cm_ctx->msg.hdr), 0); + if (ret != sizeof(cm_ctx->msg.hdr)) { + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "Failed to read cm header\n"); + ret = ofi_sockerr() ? 
-ofi_sockerr() : -FI_EIO; + goto out; } - return FI_SUCCESS; -} -static int rx_cm_data(SOCKET fd, struct ofi_ctrl_hdr *hdr, - int type, struct tcpx_cm_context *cm_ctx) -{ - ssize_t ret; + if (cm_ctx->msg.hdr.version != TCPX_CTRL_HDR_VERSION) { + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "cm protocol version mismatch\n"); + ret = -FI_ENOPROTOOPT; + goto out; + } - ret = ofi_recv_socket(fd, hdr, - sizeof(*hdr), MSG_WAITALL); - if (ret != sizeof(*hdr)) - return -FI_EIO; + if (cm_ctx->msg.hdr.type != type && + cm_ctx->msg.hdr.type != ofi_ctrl_nack) { + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "unexpected cm message type, expected %d or %d got: %d\n", + type, ofi_ctrl_nack, cm_ctx->msg.hdr.type); + ret = -FI_ECONNREFUSED; + goto out; + } - if (hdr->version != TCPX_CTRL_HDR_VERSION) - return -FI_ENOPROTOOPT; + data_size = ntohs(cm_ctx->msg.hdr.seg_size); + if (data_size) { + if (data_size > TCPX_MAX_CM_DATA_SIZE) + data_size = TCPX_MAX_CM_DATA_SIZE; + + ret = ofi_recv_socket(fd, cm_ctx->msg.data, data_size, 0); + if ((size_t) ret != data_size) { + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "Failed to read cm data\n"); + ret = ofi_sockerr() ? -ofi_sockerr() : -FI_EIO; + data_size = 0; + goto out; + } - ret = read_cm_data(fd, cm_ctx, hdr); - if (hdr->type != type) { + if (ntohs(cm_ctx->msg.hdr.seg_size) > TCPX_MAX_CM_DATA_SIZE) { + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "Discarding unexpected cm data\n"); + ofi_discard_socket(fd, ntohs(cm_ctx->msg.hdr.seg_size) - + TCPX_MAX_CM_DATA_SIZE); + } + } + + if (cm_ctx->msg.hdr.type == ofi_ctrl_nack) { + FI_INFO(&tcpx_prov, FI_LOG_EP_CTRL, + "Connection refused from remote\n"); ret = -FI_ECONNREFUSED; + goto out; } + + ret = 0; +out: + cm_ctx->cm_data_sz = data_size; return ret; } +/* The underlying socket has the POLLOUT event set. It is ready + * to accept outbound data. We expect to transfer the entire CM + * message as it fits into a single MTU and is the first data + * transferred over the socket. + */ static int tx_cm_data(SOCKET fd, uint8_t type, struct tcpx_cm_context *cm_ctx) { - struct ofi_ctrl_hdr hdr; ssize_t ret; - memset(&hdr, 0, sizeof(hdr)); - hdr.version = TCPX_CTRL_HDR_VERSION; - hdr.type = type; - hdr.seg_size = htons((uint16_t) cm_ctx->cm_data_sz); - hdr.conn_data = 1; /* For testing endianess mismatch at peer */ - - ret = ofi_send_socket(fd, &hdr, sizeof(hdr), MSG_NOSIGNAL); - if (ret != sizeof(hdr)) - return -FI_EIO; - - if (cm_ctx->cm_data_sz) { - ret = ofi_send_socket(fd, cm_ctx->cm_data, - cm_ctx->cm_data_sz, MSG_NOSIGNAL); - if ((size_t) ret != cm_ctx->cm_data_sz) - return -FI_EIO; - } + memset(&cm_ctx->msg.hdr, 0, sizeof(cm_ctx->msg.hdr)); + cm_ctx->msg.hdr.version = TCPX_CTRL_HDR_VERSION; + cm_ctx->msg.hdr.type = type; + cm_ctx->msg.hdr.seg_size = htons((uint16_t) cm_ctx->cm_data_sz); + cm_ctx->msg.hdr.conn_data = 1; /* tests endianess mismatch at peer */ + + ret = ofi_send_socket(fd, &cm_ctx->msg, sizeof(cm_ctx->msg.hdr) + + cm_ctx->cm_data_sz, MSG_NOSIGNAL); + if (ret != sizeof(cm_ctx->msg.hdr) + cm_ctx->cm_data_sz) + return ofi_sockerr() ? 
-ofi_sockerr() : -FI_EIO; + return FI_SUCCESS; } -static int tcpx_ep_msg_xfer_enable(struct tcpx_ep *ep) +static int tcpx_ep_enable(struct tcpx_ep *ep) { - int ret; + int ret = 0; - fastlock_acquire(&ep->lock); - if (ep->cm_state != TCPX_EP_CONNECTING) { - fastlock_release(&ep->lock); - return -FI_EINVAL; - } - ep->progress_func = tcpx_ep_progress; - ret = fi_fd_nonblock(ep->conn_fd); - if (ret) { - fastlock_release(&ep->lock); - return ret; + if (!ep->util_ep.rx_cq && !ep->util_ep.tx_cq) { + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "ep must be bound to cq's\n"); + return -FI_ENOCQ; } - ep->cm_state = TCPX_EP_CONNECTED; - fastlock_release(&ep->lock); - - return tcpx_cq_wait_ep_add(ep); -} -static int proc_conn_resp(struct tcpx_cm_context *cm_ctx, - struct tcpx_ep *ep) -{ - struct ofi_ctrl_hdr conn_resp; - struct fi_eq_cm_entry *cm_entry; - ssize_t len; - int ret = FI_SUCCESS; - - ret = rx_cm_data(ep->conn_fd, &conn_resp, ofi_ctrl_connresp, cm_ctx); - if (ret) - return ret; + fastlock_acquire(&ep->lock); + if (ep->state != TCPX_CONNECTING && ep->state != TCPX_ACCEPTING) { + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "ep is in invalid state\n"); + ret = -FI_EINVAL; + goto unlock; + } - cm_entry = calloc(1, sizeof(*cm_entry) + cm_ctx->cm_data_sz); - if (!cm_entry) - return -FI_ENOMEM; + ep->state = TCPX_CONNECTED; + fastlock_release(&ep->lock); - cm_entry->fid = cm_ctx->fid; - memcpy(cm_entry->data, cm_ctx->cm_data, cm_ctx->cm_data_sz); + if (ep->util_ep.rx_cq) { + ret = ofi_wait_add_fd(ep->util_ep.rx_cq->wait, + ep->sock, POLLIN, tcpx_try_func, + (void *) &ep->util_ep, + &ep->util_ep.ep_fid.fid); + if (ret) { + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "Failed to add fd to rx_cq\n"); + return ret; + } + } - ep->hdr_bswap = (conn_resp.conn_data == 1)? - tcpx_hdr_none:tcpx_hdr_bswap; + if (ep->util_ep.tx_cq) { + ret = ofi_wait_add_fd(ep->util_ep.tx_cq->wait, + ep->sock, POLLIN, tcpx_try_func, + (void *) &ep->util_ep, + &ep->util_ep.ep_fid.fid); + if (ret) { + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "Failed to add fd to tx_cq\n"); + return ret; + } + } - ret = tcpx_ep_msg_xfer_enable(ep); - if (ret) - goto err; + /* TODO: Move writing CONNECTED event here */ - len = fi_eq_write(&ep->util_ep.eq->eq_fid, FI_CONNECTED, cm_entry, - sizeof(*cm_entry) + cm_ctx->cm_data_sz, 0); - if (len < 0) { - ret = (int) len; - goto err; - } -err: - free(cm_entry); + return ret; +unlock: + fastlock_release(&ep->lock); return ret; } -int tcpx_eq_wait_try_func(void *arg) -{ - return FI_SUCCESS; -} - -static void client_recv_connresp(struct util_wait *wait, - struct tcpx_cm_context *cm_ctx) +static void tcpx_cm_recv_resp(struct util_wait *wait, + struct tcpx_cm_context *cm_ctx) { - struct fi_eq_err_entry err_entry = { 0 }; + struct fi_eq_cm_entry *cm_entry; struct tcpx_ep *ep; - ssize_t ret; + int ret; + FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Handling accept from server\n"); assert(cm_ctx->fid->fclass == FI_CLASS_EP); ep = container_of(cm_ctx->fid, struct tcpx_ep, util_ep.ep_fid.fid); - ret = ofi_wait_fd_del(wait, ep->conn_fd); + ret = rx_cm_data(ep->sock, ofi_ctrl_connresp, cm_ctx); + if (ret) { + if (ret == -FI_EAGAIN) + return; + + enum fi_log_level level = (ret == -FI_ECONNREFUSED) ? 
+ FI_LOG_INFO : FI_LOG_WARN; + FI_LOG(&tcpx_prov, level, FI_LOG_EP_CTRL, + "Failed to receive connect response\n"); + ofi_wait_del_fd(wait, ep->sock); + goto err1; + } + + ret = ofi_wait_del_fd(wait, ep->sock); if (ret) { FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "Could not remove fd from wait\n"); - goto err; + goto err1; } - ret = proc_conn_resp(cm_ctx, ep); + cm_entry = calloc(1, sizeof(*cm_entry) + cm_ctx->cm_data_sz); + if (!cm_entry) + goto err1; + + cm_entry->fid = cm_ctx->fid; + memcpy(cm_entry->data, cm_ctx->msg.data, cm_ctx->cm_data_sz); + + ep->hdr_bswap = (cm_ctx->msg.hdr.conn_data == 1) ? + tcpx_hdr_none : tcpx_hdr_bswap; + + ret = tcpx_ep_enable(ep); if (ret) - goto err; + goto err2; - FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Received Accept from server\n"); + ret = (int) fi_eq_write(&ep->util_ep.eq->eq_fid, FI_CONNECTED, cm_entry, + sizeof(*cm_entry) + cm_ctx->cm_data_sz, 0); + if (ret < 0) + goto err2; + + free(cm_entry); free(cm_ctx); return; -err: - err_entry.fid = cm_ctx->fid; - err_entry.context = cm_ctx->fid->context; - err_entry.err = -ret; - if (cm_ctx->cm_data_sz) { - err_entry.err_data = calloc(1, cm_ctx->cm_data_sz); - if (OFI_LIKELY(err_entry.err_data != NULL)) { - memcpy(err_entry.err_data, cm_ctx->cm_data, - cm_ctx->cm_data_sz); - err_entry.err_data_size = cm_ctx->cm_data_sz; - } - } - FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, - "fi_eq_write the conn refused %"PRId64"\n", ret); + +err2: + free(cm_entry); +err1: + tcpx_ep_disable(ep, -ret); free(cm_ctx); - /* `err_entry.err_data` must live until it is passed to user */ - ret = fi_eq_write(&ep->util_ep.eq->eq_fid, FI_NOTIFY, - &err_entry, sizeof(err_entry), UTIL_FLAG_ERROR); - if (OFI_UNLIKELY(ret < 0)) { - free(err_entry.err_data); - } } -static void server_send_cm_accept(struct util_wait *wait, - struct tcpx_cm_context *cm_ctx) +int tcpx_eq_wait_try_func(void *arg) +{ + return FI_SUCCESS; +} + +static void tcpx_cm_send_resp(struct util_wait *wait, + struct tcpx_cm_context *cm_ctx) { struct fi_eq_cm_entry cm_entry = {0}; - struct fi_eq_err_entry err_entry; struct tcpx_ep *ep; int ret; + FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Send connect (accept) response\n"); assert(cm_ctx->fid->fclass == FI_CLASS_EP); ep = container_of(cm_ctx->fid, struct tcpx_ep, util_ep.ep_fid.fid); - ret = tx_cm_data(ep->conn_fd, ofi_ctrl_connresp, cm_ctx); - if (ret) - goto err; - - cm_entry.fid = cm_ctx->fid; - ret = (int) fi_eq_write(&ep->util_ep.eq->eq_fid, FI_CONNECTED, - &cm_entry, sizeof(cm_entry), 0); - if (ret < 0) { - FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "Error writing to EQ\n"); + ret = tx_cm_data(ep->sock, ofi_ctrl_connresp, cm_ctx); + if (ret) { + if (ret == -FI_EAGAIN) + return; + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "Failed to send connect (accept) response\n"); + goto delfd; } - ret = ofi_wait_fd_del(wait, ep->conn_fd); + ret = ofi_wait_del_fd(wait, ep->sock); if (ret) { FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "Could not remove fd from wait\n"); - goto err; + goto disable; } - ret = tcpx_ep_msg_xfer_enable(ep); + cm_entry.fid = cm_ctx->fid; + ret = (int) fi_eq_write(&ep->util_ep.eq->eq_fid, FI_CONNECTED, + &cm_entry, sizeof(cm_entry), 0); + if (ret < 0) + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "Error writing to EQ\n"); + + ret = tcpx_ep_enable(ep); if (ret) - goto err; + goto disable; FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Connection Accept Successful\n"); free(cm_ctx); return; -err: - memset(&err_entry, 0, sizeof err_entry); - err_entry.fid = cm_ctx->fid; - err_entry.context = cm_ctx->fid->context; - err_entry.err = -ret; +delfd: + ofi_wait_del_fd(wait, 
ep->sock); +disable: + tcpx_ep_disable(ep, -ret); free(cm_ctx); - fi_eq_write(&ep->util_ep.eq->eq_fid, FI_NOTIFY, - &err_entry, sizeof(err_entry), UTIL_FLAG_ERROR); } -static void server_recv_connreq(struct util_wait *wait, - struct tcpx_cm_context *cm_ctx) +static void tcpx_cm_recv_req(struct util_wait *wait, + struct tcpx_cm_context *cm_ctx) { struct tcpx_conn_handle *handle; struct fi_eq_cm_entry *cm_entry; - struct ofi_ctrl_hdr conn_req; socklen_t len; int ret; - assert(cm_ctx->fid->fclass == FI_CLASS_CONNREQ); + FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Server receive connect request\n"); + handle = container_of(cm_ctx->fid, struct tcpx_conn_handle, handle); - handle = container_of(cm_ctx->fid, - struct tcpx_conn_handle, - handle); + ret = rx_cm_data(handle->sock, ofi_ctrl_connreq, cm_ctx); + if (ret) { + if (ret == -FI_EAGAIN) + return; + ofi_wait_del_fd(wait, handle->sock); + goto err1; + } - ret = rx_cm_data(handle->conn_fd, &conn_req, ofi_ctrl_connreq, cm_ctx); - if (ret) + ret = ofi_wait_del_fd(wait, handle->sock); + if (ret) { + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "fd deletion from ofi_wait failed\n"); goto err1; + } cm_entry = calloc(1, sizeof(*cm_entry) + cm_ctx->cm_data_sz); if (!cm_entry) @@ -295,24 +337,23 @@ static void server_recv_connreq(struct util_wait *wait, if (!cm_entry->info->dest_addr) goto err3; - ret = ofi_getpeername(handle->conn_fd, cm_entry->info->dest_addr, &len); + ret = ofi_getpeername(handle->sock, cm_entry->info->dest_addr, &len); if (ret) goto err3; - handle->endian_match = (conn_req.conn_data == 1); + handle->endian_match = (cm_ctx->msg.hdr.conn_data == 1); cm_entry->info->handle = &handle->handle; - memcpy(cm_entry->data, cm_ctx->cm_data, cm_ctx->cm_data_sz); + memcpy(cm_entry->data, cm_ctx->msg.data, cm_ctx->cm_data_sz); + cm_ctx->state = TCPX_CM_REQ_RVCD; - ret = (int) fi_eq_write(&handle->pep->util_pep.eq->eq_fid, FI_CONNREQ, cm_entry, + ret = (int) fi_eq_write(&handle->pep->util_pep.eq->eq_fid, + FI_CONNREQ, cm_entry, sizeof(*cm_entry) + cm_ctx->cm_data_sz, 0); if (ret < 0) { FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "Error writing to EQ\n"); goto err3; } - ret = ofi_wait_fd_del(wait, handle->conn_fd); - if (ret) - FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, - "fd deletion from ofi_wait failed\n"); + free(cm_entry); free(cm_ctx); return; @@ -321,62 +362,58 @@ static void server_recv_connreq(struct util_wait *wait, err2: free(cm_entry); err1: - ofi_wait_fd_del(wait, handle->conn_fd); - ofi_close_socket(handle->conn_fd); + ofi_close_socket(handle->sock); free(cm_ctx); free(handle); } -static void client_send_connreq(struct util_wait *wait, - struct tcpx_cm_context *cm_ctx) +static void tcpx_cm_send_req(struct util_wait *wait, + struct tcpx_cm_context *cm_ctx) { struct tcpx_ep *ep; - struct fi_eq_err_entry err_entry; socklen_t len; int status, ret = FI_SUCCESS; FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "client send connreq\n"); - assert(cm_ctx->fid->fclass == FI_CLASS_EP); - ep = container_of(cm_ctx->fid, struct tcpx_ep, util_ep.ep_fid.fid); len = sizeof(status); - ret = getsockopt(ep->conn_fd, SOL_SOCKET, SO_ERROR, (char *) &status, &len); + ret = getsockopt(ep->sock, SOL_SOCKET, SO_ERROR, (char *) &status, &len); if (ret < 0 || status) { - FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "connection failure\n"); ret = (ret < 0)? 
-ofi_sockerr() : status; - goto err; + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "connection failure\n"); + goto delfd; } - ret = tx_cm_data(ep->conn_fd, ofi_ctrl_connreq, cm_ctx); + ret = tx_cm_data(ep->sock, ofi_ctrl_connreq, cm_ctx); if (ret) - goto err; + goto delfd; - ret = ofi_wait_fd_del(wait, ep->conn_fd); - if (ret) - goto err; + ret = ofi_wait_del_fd(wait, ep->sock); + if (ret) { + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "Could not remove fd from wait: %s\n", + fi_strerror(-ret)); + goto disable; + } - cm_ctx->type = CLIENT_RECV_CONNRESP; - ret = ofi_wait_fd_add(wait, ep->conn_fd, FI_EPOLL_IN, + cm_ctx->state = TCPX_CM_REQ_SENT; + ret = ofi_wait_add_fd(wait, ep->sock, POLLIN, tcpx_eq_wait_try_func, NULL, cm_ctx); if (ret) - goto err; + goto disable; - wait->signal(wait); return; -err: - memset(&err_entry, 0, sizeof err_entry); - err_entry.fid = cm_ctx->fid; - err_entry.context = cm_ctx->fid->context; - err_entry.err = -ret; +delfd: + ofi_wait_del_fd(wait, ep->sock); +disable: + tcpx_ep_disable(ep, -ret); free(cm_ctx); - fi_eq_write(&ep->util_ep.eq->eq_fid, FI_NOTIFY, - &err_entry, sizeof(err_entry), UTIL_FLAG_ERROR); } -static void server_sock_accept(struct util_wait *wait, - struct tcpx_cm_context *cm_ctx) +static void tcpx_accept(struct util_wait *wait, + struct tcpx_cm_context *cm_ctx) { struct tcpx_conn_handle *handle; struct tcpx_cm_context *rx_req_cm_ctx; @@ -384,15 +421,16 @@ static void server_sock_accept(struct util_wait *wait, SOCKET sock; int ret; - FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "Received Connreq\n"); + FI_DBG(&tcpx_prov, FI_LOG_EP_CTRL, "accepting connection\n"); assert(cm_ctx->fid->fclass == FI_CLASS_PEP); - pep = container_of(cm_ctx->fid, struct tcpx_pep, - util_pep.pep_fid.fid); + pep = container_of(cm_ctx->fid, struct tcpx_pep, util_pep.pep_fid.fid); sock = accept(pep->sock, NULL, 0); if (sock < 0) { - FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, - "accept error: %d\n", ofi_sockerr()); + if (!OFI_SOCK_TRY_ACCEPT_AGAIN(ofi_sockerr())) { + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "accept error: %d\n", ofi_sockerr()); + } return; } @@ -407,18 +445,18 @@ static void server_sock_accept(struct util_wait *wait, if (!rx_req_cm_ctx) goto err2; - handle->conn_fd = sock; + handle->sock = sock; handle->handle.fclass = FI_CLASS_CONNREQ; handle->pep = pep; rx_req_cm_ctx->fid = &handle->handle; - rx_req_cm_ctx->type = SERVER_RECV_CONNREQ; + rx_req_cm_ctx->state = TCPX_CM_WAIT_REQ; - ret = ofi_wait_fd_add(wait, sock, FI_EPOLL_IN, + ret = ofi_wait_add_fd(wait, sock, POLLIN, tcpx_eq_wait_try_func, NULL, (void *) rx_req_cm_ctx); if (ret) goto err3; - wait->signal(wait); + return; err3: free(rx_req_cm_ctx); @@ -431,33 +469,51 @@ static void server_sock_accept(struct util_wait *wait, static void process_cm_ctx(struct util_wait *wait, struct tcpx_cm_context *cm_ctx) { - switch (cm_ctx->type) { - case SERVER_SOCK_ACCEPT: - server_sock_accept(wait,cm_ctx); + switch (cm_ctx->state) { + case TCPX_CM_LISTENING: + assert(cm_ctx->fid->fclass == FI_CLASS_PEP); + tcpx_accept(wait, cm_ctx); break; - case CLIENT_SEND_CONNREQ: - client_send_connreq(wait, cm_ctx); + case TCPX_CM_CONNECTING: + assert((cm_ctx->fid->fclass == FI_CLASS_EP) && + (container_of(cm_ctx->fid, struct tcpx_ep, + util_ep.ep_fid.fid)->state == + TCPX_CONNECTING)); + tcpx_cm_send_req(wait, cm_ctx); break; - case SERVER_RECV_CONNREQ: - server_recv_connreq(wait, cm_ctx); + case TCPX_CM_WAIT_REQ: + assert(cm_ctx->fid->fclass == FI_CLASS_CONNREQ); + tcpx_cm_recv_req(wait, cm_ctx); break; - case SERVER_SEND_CM_ACCEPT: - server_send_cm_accept(wait, 
cm_ctx); + case TCPX_CM_RESP_READY: + assert((cm_ctx->fid->fclass == FI_CLASS_EP) && + (container_of(cm_ctx->fid, struct tcpx_ep, + util_ep.ep_fid.fid)->state == + TCPX_ACCEPTING)); + tcpx_cm_send_resp(wait, cm_ctx); break; - case CLIENT_RECV_CONNRESP: - client_recv_connresp(wait, cm_ctx); + case TCPX_CM_REQ_SENT: + assert((cm_ctx->fid->fclass == FI_CLASS_EP) && + (container_of(cm_ctx->fid, struct tcpx_ep, + util_ep.ep_fid.fid)->state == + TCPX_CONNECTING)); + tcpx_cm_recv_resp(wait, cm_ctx); break; default: - FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, - "should never end up here\n"); + break; } } +/* The implementation assumes that the EQ does not share a wait set with + * a CQ. This is true for internally created wait sets, but not if the + * application manages the wait set. To fix, we need to distinguish + * whether the wait_context references a fid or tcpx_cm_context. + */ void tcpx_conn_mgr_run(struct util_eq *eq) { struct util_wait_fd *wait_fd; struct tcpx_eq *tcpx_eq; - void *wait_contexts[MAX_EPOLL_EVENTS]; + void *wait_contexts[MAX_POLL_EVENTS]; int num_fds = 0, i; assert(eq->wait != NULL); @@ -467,21 +523,22 @@ void tcpx_conn_mgr_run(struct util_eq *eq) tcpx_eq = container_of(eq, struct tcpx_eq, util_eq); fastlock_acquire(&tcpx_eq->close_lock); - num_fds = fi_epoll_wait(wait_fd->epoll_fd, wait_contexts, - MAX_EPOLL_EVENTS, 0); + num_fds = (wait_fd->util_wait.wait_obj == FI_WAIT_FD) ? + ofi_epoll_wait(wait_fd->epoll_fd, wait_contexts, + MAX_POLL_EVENTS, 0) : + ofi_pollfds_wait(wait_fd->pollfds, wait_contexts, + MAX_POLL_EVENTS, 0); if (num_fds < 0) { fastlock_release(&tcpx_eq->close_lock); return; } for ( i = 0; i < num_fds; i++) { - /* skip wake up signals */ if (&wait_fd->util_wait.wait_fid.fid == wait_contexts[i]) continue; - process_cm_ctx(eq->wait, - (struct tcpx_cm_context *) + process_cm_ctx(eq->wait, (struct tcpx_cm_context *) wait_contexts[i]); } fastlock_release(&tcpx_eq->close_lock); diff --git a/prov/tcp/src/tcpx_cq.c b/prov/tcp/src/tcpx_cq.c index 868285cbcd6..a2d55ac1bb0 100644 --- a/prov/tcp/src/tcpx_cq.c +++ b/prov/tcp/src/tcpx_cq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2017-2020 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -37,6 +37,56 @@ #define TCPX_DEF_CQ_SIZE (1024) + +void tcpx_cq_progress(struct util_cq *cq) +{ + void *wait_contexts[MAX_POLL_EVENTS]; + struct fid_list_entry *fid_entry; + struct util_wait_fd *wait_fd; + struct dlist_entry *item; + struct tcpx_ep *ep; + struct fid *fid; + int nfds, i; + + wait_fd = container_of(cq->wait, struct util_wait_fd, util_wait); + + cq->cq_fastlock_acquire(&cq->ep_list_lock); + dlist_foreach(&cq->ep_list, item) { + fid_entry = container_of(item, struct fid_list_entry, entry); + ep = container_of(fid_entry->fid, struct tcpx_ep, + util_ep.ep_fid.fid); + tcpx_try_func(&ep->util_ep); + fastlock_acquire(&ep->lock); + tcpx_progress_tx(ep); + if (ep->stage_buf.cur_pos < ep->stage_buf.bytes_avail) + tcpx_progress_rx(ep); + fastlock_release(&ep->lock); + } + + nfds = (wait_fd->util_wait.wait_obj == FI_WAIT_FD) ? 
+ ofi_epoll_wait(wait_fd->epoll_fd, wait_contexts, + MAX_POLL_EVENTS, 0) : + ofi_pollfds_wait(wait_fd->pollfds, wait_contexts, + MAX_POLL_EVENTS, 0); + if (nfds <= 0) + goto unlock; + + for (i = 0; i < nfds; i++) { + fid = wait_contexts[i]; + if (fid->fclass != FI_CLASS_EP) { + fd_signal_reset(&wait_fd->signal); + continue; + } + + ep = container_of(fid, struct tcpx_ep, util_ep.ep_fid.fid); + fastlock_acquire(&ep->lock); + tcpx_progress_rx(ep); + fastlock_release(&ep->lock); + } +unlock: + cq->cq_fastlock_release(&cq->ep_list_lock); +} + static void tcpx_buf_pools_destroy(struct tcpx_buf_pool *buf_pools) { int i; @@ -66,29 +116,20 @@ struct tcpx_xfer_entry *tcpx_xfer_entry_alloc(struct tcpx_cq *tcpx_cq, struct tcpx_xfer_entry *xfer_entry; tcpx_cq->util_cq.cq_fastlock_acquire(&tcpx_cq->util_cq.cq_lock); - - /* optimization: don't allocate queue_entry when cq is full */ - if (ofi_cirque_isfull(tcpx_cq->util_cq.cirq)) { - tcpx_cq->util_cq.cq_fastlock_release(&tcpx_cq->util_cq.cq_lock); - return NULL; - } - - xfer_entry = ofi_buf_alloc(tcpx_cq->buf_pools[type].pool); - if (!xfer_entry) { - tcpx_cq->util_cq.cq_fastlock_release(&tcpx_cq->util_cq.cq_lock); - FI_INFO(&tcpx_prov, FI_LOG_DOMAIN,"failed to get buffer\n"); - return NULL; - } + if (!ofi_cirque_isfull(tcpx_cq->util_cq.cirq)) + xfer_entry = ofi_buf_alloc(tcpx_cq->buf_pools[type].pool); + else + xfer_entry = NULL; tcpx_cq->util_cq.cq_fastlock_release(&tcpx_cq->util_cq.cq_lock); + return xfer_entry; } -void tcpx_xfer_entry_release(struct tcpx_cq *tcpx_cq, - struct tcpx_xfer_entry *xfer_entry) +void tcpx_xfer_entry_free(struct tcpx_cq *tcpx_cq, + struct tcpx_xfer_entry *xfer_entry) { - if (xfer_entry->ep->cur_rx_entry == xfer_entry) { + if (xfer_entry->ep->cur_rx_entry == xfer_entry) xfer_entry->ep->cur_rx_entry = NULL; - } xfer_entry->hdr.base_hdr.flags = 0; @@ -111,24 +152,17 @@ void tcpx_cq_report_success(struct util_cq *cq, flags = xfer_entry->flags; - if (!(flags & FI_MULTI_RECV) && !(flags & FI_COMPLETION)) + if (!(flags & FI_COMPLETION)) return; len = xfer_entry->hdr.base_hdr.size - - xfer_entry->hdr.base_hdr.payload_off; + xfer_entry->hdr.base_hdr.payload_off; if (xfer_entry->hdr.base_hdr.flags & OFI_REMOTE_CQ_DATA) { flags |= FI_REMOTE_CQ_DATA; data = xfer_entry->hdr.cq_data_hdr.cq_data; } - if ((flags & FI_MULTI_RECV) && - (xfer_entry->rem_len >= xfer_entry->ep->min_multi_recv_size)) { - buf = xfer_entry->mrecv_msg_start; - } else { - flags &= ~FI_MULTI_RECV; - } - ofi_cq_write(cq, xfer_entry->context, flags, len, buf, data, 0); if (cq->wait) @@ -142,9 +176,6 @@ void tcpx_cq_report_error(struct util_cq *cq, struct fi_cq_err_entry err_entry; uint64_t data = 0; - if (!(xfer_entry->flags & FI_COMPLETION)) - return; - if (xfer_entry->hdr.base_hdr.flags & OFI_REMOTE_CQ_DATA) { xfer_entry->flags |= FI_REMOTE_CQ_DATA; data = xfer_entry->hdr.cq_data_hdr.cq_data; @@ -174,18 +205,17 @@ static int tcpx_cq_control(struct fid *fid, int command, void *arg) switch(command) { case FI_GETWAIT: + case FI_GETWAITOBJ: if (!cq->wait) - return -FI_ENOSYS; - - ret = fi_control(&cq->wait->wait_fid.fid, - command, arg); - if (ret) - return ret; + return -FI_ENODATA; - return FI_SUCCESS; + ret = fi_control(&cq->wait->wait_fid.fid, command, arg); + break; default: return -FI_ENOSYS; } + + return ret; } static struct fi_ops tcpx_cq_fi_ops = { @@ -262,8 +292,9 @@ static int tcpx_buf_pools_create(struct tcpx_buf_pool *buf_pools) int tcpx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq_fid, void *context) { - int ret; struct 
tcpx_cq *tcpx_cq; + struct fi_cq_attr cq_attr; + int ret; tcpx_cq = calloc(1, sizeof(*tcpx_cq)); if (!tcpx_cq) @@ -276,8 +307,15 @@ int tcpx_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, if (ret) goto free_cq; + if (attr->wait_obj == FI_WAIT_NONE || + attr->wait_obj == FI_WAIT_UNSPEC) { + cq_attr = *attr; + cq_attr.wait_obj = FI_WAIT_POLLFD; + attr = &cq_attr; + } + ret = ofi_cq_init(&tcpx_prov, domain, attr, &tcpx_cq->util_cq, - &ofi_cq_progress, context); + &tcpx_cq_progress, context); if (ret) goto destroy_pool; diff --git a/prov/tcp/src/tcpx_domain.c b/prov/tcp/src/tcpx_domain.c index 4d623344d6e..2c3a22aef7a 100644 --- a/prov/tcp/src/tcpx_domain.c +++ b/prov/tcp/src/tcpx_domain.c @@ -115,8 +115,32 @@ static struct fi_ops_domain tcpx_domain_ops = { .stx_ctx = fi_no_stx_context, .srx_ctx = tcpx_srx_ctx, .query_atomic = fi_no_query_atomic, + .query_collective = fi_no_query_collective, }; +static int tcpx_set_ops(struct fid *fid, const char *name, + uint64_t flags, void *ops, void *context) +{ + struct tcpx_domain *domain; + + domain = container_of(fid, struct tcpx_domain, + util_domain.domain_fid.fid); + if (flags) + return -FI_EBADFLAGS; + + if (!strcasecmp(name, OFI_OPS_DYNAMIC_RBUF)) { + domain->dynamic_rbuf = ops; + if (domain->dynamic_rbuf->size != sizeof(*domain->dynamic_rbuf)) { + domain->dynamic_rbuf = NULL; + return -FI_ENOSYS; + } + + return 0; + } + + return -FI_ENOSYS; +} + static int tcpx_domain_close(fid_t fid) { struct tcpx_domain *tcpx_domain; @@ -130,15 +154,17 @@ static int tcpx_domain_close(fid_t fid) return ret; free(tcpx_domain); - return 0; + return FI_SUCCESS; } static struct fi_ops tcpx_domain_fi_ops = { .size = sizeof(struct fi_ops), .close = tcpx_domain_close, - .bind = fi_no_bind, + .bind = ofi_domain_bind, .control = fi_no_control, .ops_open = fi_no_ops_open, + .tostr = NULL, + .ops_set = tcpx_set_ops, }; static struct fi_ops_mr tcpx_domain_fi_ops_mr = { @@ -171,7 +197,7 @@ int tcpx_domain_open(struct fid_fabric *fabric, struct fi_info *info, (*domain)->ops = &tcpx_domain_ops; (*domain)->mr = &tcpx_domain_fi_ops_mr; - return 0; + return FI_SUCCESS; err: free(tcpx_domain); return ret; diff --git a/prov/tcp/src/tcpx_ep.c b/prov/tcp/src/tcpx_ep.c index 6280cad427a..a76a96d2184 100644 --- a/prov/tcp/src/tcpx_ep.c +++ b/prov/tcp/src/tcpx_ep.c @@ -42,7 +42,10 @@ extern struct fi_ops_rma tcpx_rma_ops; extern struct fi_ops_msg tcpx_msg_ops; -void tcpx_hdr_none(struct tcpx_base_hdr *hdr) {} +void tcpx_hdr_none(struct tcpx_base_hdr *hdr) +{ + /* no-op */ +} void tcpx_hdr_bswap(struct tcpx_base_hdr *hdr) { @@ -66,7 +69,7 @@ void tcpx_hdr_bswap(struct tcpx_base_hdr *hdr) } } -static int tcpx_setup_socket(SOCKET sock) +static int tcpx_setup_socket(SOCKET sock, struct fi_info *info) { int ret, optval = 1; @@ -74,17 +77,30 @@ static int tcpx_setup_socket(SOCKET sock) sizeof(optval)); if (ret) { FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,"setsockopt reuseaddr failed\n"); - return ret; + return -ofi_sockerr(); } + if ((tcpx_nodelay == 0) || ((tcpx_nodelay < 0) && + (info->fabric_attr->api_version >= FI_VERSION(1, 9) && + info->tx_attr->tclass == FI_TC_BULK_DATA))) + return 0; + ret = setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (char *) &optval, - sizeof(optval)); + sizeof(optval)); if (ret) { - FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,"setsockopt nodelay failed\n"); + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "setsockopt nodelay failed\n"); + return -ofi_sockerr(); + } + + ret = fi_fd_nonblock(sock); + if (ret) { + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "failed to set socket to 
nonblocking\n"); return ret; } - return ret; + return 0; } static int tcpx_ep_connect(struct fid_ep *ep, const void *addr, @@ -94,7 +110,8 @@ static int tcpx_ep_connect(struct fid_ep *ep, const void *addr, struct tcpx_cm_context *cm_ctx; int ret; - if (!addr || !tcpx_ep->conn_fd || paramlen > TCPX_MAX_CM_DATA_SIZE) + if (!addr || !tcpx_ep->sock || paramlen > TCPX_MAX_CM_DATA_SIZE || + tcpx_ep->state != TCPX_IDLE) return -FI_EINVAL; cm_ctx = calloc(1, sizeof(*cm_ctx)); @@ -104,29 +121,33 @@ static int tcpx_ep_connect(struct fid_ep *ep, const void *addr, return -FI_ENOMEM; } - ret = connect(tcpx_ep->conn_fd, (struct sockaddr *) addr, + tcpx_ep->state = TCPX_CONNECTING; + ret = connect(tcpx_ep->sock, (struct sockaddr *) addr, (socklen_t) ofi_sizeofaddr(addr)); - if (ret && ofi_sockerr() != FI_EINPROGRESS) { + if (ret && !OFI_SOCK_TRY_CONN_AGAIN(ofi_sockerr())) { + tcpx_ep->state = TCPX_IDLE; ret = -ofi_sockerr(); - goto err; + goto free; } cm_ctx->fid = &tcpx_ep->util_ep.ep_fid.fid; - cm_ctx->type = CLIENT_SEND_CONNREQ; + cm_ctx->state = TCPX_CM_CONNECTING; if (paramlen) { cm_ctx->cm_data_sz = paramlen; - memcpy(cm_ctx->cm_data, param, paramlen); + memcpy(cm_ctx->msg.data, param, paramlen); } - ret = ofi_wait_fd_add(tcpx_ep->util_ep.eq->wait, tcpx_ep->conn_fd, - FI_EPOLL_OUT, tcpx_eq_wait_try_func, NULL,cm_ctx); + ret = ofi_wait_add_fd(tcpx_ep->util_ep.eq->wait, tcpx_ep->sock, + POLLOUT, tcpx_eq_wait_try_func, NULL,cm_ctx); if (ret) - goto err; + goto disable; - tcpx_ep->util_ep.eq->wait->signal(tcpx_ep->util_ep.eq->wait); return 0; -err: + +disable: + tcpx_ep_disable(tcpx_ep, -ret); +free: free(cm_ctx); return ret; } @@ -137,7 +158,7 @@ static int tcpx_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen) struct tcpx_cm_context *cm_ctx; int ret; - if (tcpx_ep->conn_fd == INVALID_SOCKET) + if (tcpx_ep->sock == INVALID_SOCKET || tcpx_ep->state != TCPX_RCVD_REQ) return -FI_EINVAL; cm_ctx = calloc(1, sizeof(*cm_ctx)); @@ -147,21 +168,109 @@ static int tcpx_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen) return -FI_ENOMEM; } + tcpx_ep->state = TCPX_ACCEPTING; cm_ctx->fid = &tcpx_ep->util_ep.ep_fid.fid; - cm_ctx->type = SERVER_SEND_CM_ACCEPT; + cm_ctx->state = TCPX_CM_RESP_READY; if (paramlen) { cm_ctx->cm_data_sz = paramlen; - memcpy(cm_ctx->cm_data, param, paramlen); + memcpy(cm_ctx->msg.data, param, paramlen); } - ret = ofi_wait_fd_add(tcpx_ep->util_ep.eq->wait, tcpx_ep->conn_fd, - FI_EPOLL_OUT, tcpx_eq_wait_try_func, NULL, cm_ctx); - if (ret) { - free(cm_ctx); - return ret; - } - tcpx_ep->util_ep.eq->wait->signal(tcpx_ep->util_ep.eq->wait); + ret = ofi_wait_add_fd(tcpx_ep->util_ep.eq->wait, tcpx_ep->sock, + POLLOUT, tcpx_eq_wait_try_func, NULL, cm_ctx); + if (ret) + goto free; + return 0; + +free: + tcpx_ep->state = TCPX_RCVD_REQ; + free(cm_ctx); + return ret; +} + +/* must hold ep->lock */ +static void tcpx_ep_flush_queue(struct slist *queue, + struct tcpx_cq *tcpx_cq) +{ + struct tcpx_xfer_entry *xfer_entry; + + while (!slist_empty(queue)) { + xfer_entry = container_of(queue->head, struct tcpx_xfer_entry, + entry); + slist_remove_head(queue); + tcpx_cq_report_error(&tcpx_cq->util_cq, xfer_entry, FI_ECANCELED); + tcpx_xfer_entry_free(tcpx_cq, xfer_entry); + } +} + +/* must hold ep->lock */ +static void tcpx_ep_flush_all_queues(struct tcpx_ep *ep) +{ + struct tcpx_cq *tcpx_cq; + + tcpx_cq = container_of(ep->util_ep.tx_cq, struct tcpx_cq, util_cq); + tcpx_ep_flush_queue(&ep->tx_queue, tcpx_cq); + tcpx_ep_flush_queue(&ep->rma_read_queue, tcpx_cq); + 
tcpx_ep_flush_queue(&ep->tx_rsp_pend_queue, tcpx_cq); + + tcpx_cq = container_of(ep->util_ep.rx_cq, struct tcpx_cq, util_cq); + tcpx_ep_flush_queue(&ep->rx_queue, tcpx_cq); +} + +/* must hold ep->lock */ +void tcpx_ep_disable(struct tcpx_ep *ep, int cm_err) +{ + struct util_wait_fd *wait; + struct fi_eq_cm_entry cm_entry = {0}; + struct fi_eq_err_entry err_entry = {0}; + + switch (ep->state) { + case TCPX_RCVD_REQ: + break; + case TCPX_CONNECTED: + /* We need to remove the socket from the CQ's fdset, + * or the CQ will be left in a 'signaled' state. This + * can result in threads spinning on the CQs fdset. + */ + if (ep->util_ep.tx_cq) { + wait = container_of(ep->util_ep.tx_cq->wait, + struct util_wait_fd, util_wait); + ofi_wait_fdset_del(wait, ep->sock); + } + + if (ep->util_ep.rx_cq) { + wait = container_of(ep->util_ep.rx_cq->wait, + struct util_wait_fd, util_wait); + ofi_wait_fdset_del(wait, ep->sock); + } + /* fall through */ + case TCPX_ACCEPTING: + case TCPX_CONNECTING: + wait = container_of(ep->util_ep.eq->wait, + struct util_wait_fd, util_wait); + ofi_wait_fdset_del(wait, ep->sock); + break; + + default: + return; + } + + tcpx_ep_flush_all_queues(ep); + + if (cm_err) { + err_entry.fid = &ep->util_ep.ep_fid.fid; + err_entry.context = ep->util_ep.ep_fid.fid.context; + err_entry.err = cm_err; + (void) fi_eq_write(&ep->util_ep.eq->eq_fid, FI_SHUTDOWN, + &err_entry, sizeof(err_entry), + UTIL_FLAG_ERROR); + } else { + cm_entry.fid = &ep->util_ep.ep_fid.fid; + (void) fi_eq_write(&ep->util_ep.eq->eq_fid, FI_SHUTDOWN, + &cm_entry, sizeof(cm_entry), 0); + } + ep->state = TCPX_DISCONNECTED; } static int tcpx_ep_shutdown(struct fid_ep *ep, uint64_t flags) @@ -171,47 +280,44 @@ static int tcpx_ep_shutdown(struct fid_ep *ep, uint64_t flags) tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid); - ret = ofi_shutdown(tcpx_ep->conn_fd, SHUT_RDWR); + ret = ofi_shutdown(tcpx_ep->sock, SHUT_RDWR); if (ret && ofi_sockerr() != ENOTCONN) { FI_WARN(&tcpx_prov, FI_LOG_EP_DATA, "ep shutdown unsuccessful\n"); } fastlock_acquire(&tcpx_ep->lock); - ret = tcpx_ep_shutdown_report(tcpx_ep, &ep->fid); + tcpx_ep_disable(tcpx_ep, 0); fastlock_release(&tcpx_ep->lock); - if (ret) { - FI_WARN(&tcpx_prov, FI_LOG_EP_DATA, "Error writing to EQ\n"); - } - return ret; + return FI_SUCCESS; } static int tcpx_bind_to_port_range(SOCKET sock, void* src_addr, size_t addrlen) { int ret, i, rand_port_number; + static uint32_t seed; + if (!seed) + seed = ofi_generate_seed(); - rand_port_number = rand() % (port_range.high + 1 - port_range.low) + - port_range.low; + rand_port_number = ofi_xorshift_random_r(&seed) % + (port_range.high + 1 - port_range.low) + port_range.low; - for (i = port_range.low; i <= port_range.high; - i++, rand_port_number++) { - if (rand_port_number > port_range.high) { + for (i = port_range.low; i <= port_range.high; i++, rand_port_number++) { + if (rand_port_number > port_range.high) rand_port_number = port_range.low; - } + ofi_addr_set_port(src_addr, rand_port_number); ret = bind(sock, src_addr, (socklen_t) addrlen); if (ret) { - if (errno == EADDRINUSE) { + if (ofi_sockerr() == EADDRINUSE) continue; - } else { - FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, - "failed to bind listener: %s\n", - strerror(ofi_sockerr())); - return -errno; - } - } else { - break; + + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "failed to bind listener: %s\n", + strerror(ofi_sockerr())); + return -ofi_sockerr(); } + break; } return (i <= port_range.high) ? 
FI_SUCCESS : -FI_EADDRNOTAVAIL; } @@ -239,13 +345,23 @@ static int tcpx_pep_sock_create(struct tcpx_pep *pep) strerror(ofi_sockerr())); return -FI_EIO; } - ret = tcpx_setup_socket(pep->sock); + ret = tcpx_setup_socket(pep->sock, pep->info); + if (ret) { + goto err; + } + + ret = fi_fd_nonblock(pep->sock); if (ret) { + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "failed to set listener socket to nonblocking\n"); goto err; } + if (ofi_addr_get_port(pep->info->src_addr) != 0 || port_range.high == 0) { ret = bind(pep->sock, pep->info->src_addr, (socklen_t) pep->info->src_addrlen); + if (ret) + ret = -ofi_sockerr(); } else { ret = tcpx_bind_to_port_range(pep->sock, pep->info->src_addr, pep->info->src_addrlen); @@ -271,7 +387,7 @@ static int tcpx_ep_getname(fid_t fid, void *addr, size_t *addrlen) int ret; tcpx_ep = container_of(fid, struct tcpx_ep, util_ep.ep_fid); - ret = ofi_getsockname(tcpx_ep->conn_fd, addr, (socklen_t *)addrlen); + ret = ofi_getsockname(tcpx_ep->sock, addr, (socklen_t *)addrlen); if (ret) return -ofi_sockerr(); @@ -285,11 +401,11 @@ static int tcpx_ep_getpeer(struct fid_ep *ep, void *addr, size_t *addrlen) int ret; tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid); - ret = ofi_getpeername(tcpx_ep->conn_fd, addr, (socklen_t *)addrlen); + ret = ofi_getpeername(tcpx_ep->sock, addr, (socklen_t *)addrlen); if (ret) return -ofi_sockerr(); - return (addrlen_in < *addrlen)? -FI_ETOOSMALL: FI_SUCCESS; + return (addrlen_in < *addrlen) ? -FI_ETOOSMALL: FI_SUCCESS; } static struct fi_ops_cm tcpx_cm_ops = { @@ -305,98 +421,107 @@ static struct fi_ops_cm tcpx_cm_ops = { .join = fi_no_join, }; -void tcpx_rx_multi_recv_release(struct tcpx_xfer_entry *rx_entry) -{ - assert(rx_entry->iov_cnt == 1); - rx_entry->ep->cur_rx_entry = NULL; - rx_entry->iov[0].iov_len = rx_entry->rem_len; -} - -void tcpx_rx_msg_release(struct tcpx_xfer_entry *rx_entry) +void tcpx_rx_entry_free(struct tcpx_xfer_entry *rx_entry) { struct tcpx_cq *tcpx_cq; assert(rx_entry->hdr.base_hdr.op_data == TCPX_OP_MSG_RECV); if (rx_entry->ep->srx_ctx) { - tcpx_srx_xfer_release(rx_entry->ep->srx_ctx, rx_entry); + tcpx_srx_entry_free(rx_entry->ep->srx_ctx, rx_entry); } else { tcpx_cq = container_of(rx_entry->ep->util_ep.rx_cq, struct tcpx_cq, util_cq); - tcpx_xfer_entry_release(tcpx_cq, rx_entry); + tcpx_xfer_entry_free(tcpx_cq, rx_entry); } } -static void tcpx_ep_tx_rx_queues_release(struct tcpx_ep *ep) +/* Must hold ep->lock. */ +static void tcpx_ep_cancel_rx(struct tcpx_ep *ep, void *context) { - struct slist_entry *entry; + struct slist_entry *cur, *prev; struct tcpx_xfer_entry *xfer_entry; - struct tcpx_cq *tcpx_cq; - - fastlock_acquire(&ep->lock); - while (!slist_empty(&ep->tx_queue)) { - entry = ep->tx_queue.head; - xfer_entry = container_of(entry, struct tcpx_xfer_entry, entry); - slist_remove_head(&ep->tx_queue); - tcpx_cq = container_of(xfer_entry->ep->util_ep.tx_cq, - struct tcpx_cq, util_cq); - tcpx_xfer_entry_release(tcpx_cq, xfer_entry); + struct tcpx_cq *cq; + + /* To cancel an active receive, we would need to flush the socket of + * all data associated with that message. Since some of that data + * may not have arrived yet, this would require additional state + * tracking and complexity. Fail the cancel in this case, since + * the receive is already in process anyway. 
+ */ + slist_foreach(&ep->rx_queue, cur, prev) { + xfer_entry = container_of(cur, struct tcpx_xfer_entry, entry); + if (xfer_entry->context == context) { + if (ep->cur_rx_entry == xfer_entry) + goto found; + break; + } } - while (!slist_empty(&ep->rx_queue)) { - entry = ep->rx_queue.head; - xfer_entry = container_of(entry, struct tcpx_xfer_entry, entry); - slist_remove_head(&ep->rx_queue); - tcpx_cq = container_of(xfer_entry->ep->util_ep.rx_cq, - struct tcpx_cq, util_cq); - tcpx_xfer_entry_release(tcpx_cq, xfer_entry); - } + return; - while (!slist_empty(&ep->rma_read_queue)) { - entry = ep->rma_read_queue.head; - xfer_entry = container_of(entry, struct tcpx_xfer_entry, entry); - slist_remove_head(&ep->rma_read_queue); - tcpx_cq = container_of(xfer_entry->ep->util_ep.tx_cq, - struct tcpx_cq, util_cq); - tcpx_xfer_entry_release(tcpx_cq, xfer_entry); - } +found: + cq = container_of(ep->util_ep.rx_cq, struct tcpx_cq, util_cq); - while (!slist_empty(&ep->tx_rsp_pend_queue)) { - entry = ep->tx_rsp_pend_queue.head; - xfer_entry = container_of(entry, struct tcpx_xfer_entry, entry); - slist_remove_head(&ep->tx_rsp_pend_queue); - tcpx_cq = container_of(xfer_entry->ep->util_ep.tx_cq, - struct tcpx_cq, util_cq); - tcpx_xfer_entry_release(tcpx_cq, xfer_entry); - } + slist_remove(&ep->rx_queue, cur, prev); + tcpx_cq_report_error(&cq->util_cq, xfer_entry, FI_ECANCELED); + tcpx_xfer_entry_free(cq, xfer_entry); +} + +/* We currently only support canceling receives, which is the common case. + * Canceling an operation from the other queues is not trivial, + * especially if the operation has already been initiated. + */ +static ssize_t tcpx_ep_cancel(fid_t fid, void *context) +{ + struct tcpx_ep *ep; + + ep = container_of(fid, struct tcpx_ep, util_ep.ep_fid.fid); + fastlock_acquire(&ep->lock); + tcpx_ep_cancel_rx(ep, context); fastlock_release(&ep->lock); + + return 0; } static int tcpx_ep_close(struct fid *fid) { + struct tcpx_ep *ep; struct tcpx_eq *eq; - struct tcpx_ep *ep = container_of(fid, struct tcpx_ep, - util_ep.ep_fid.fid); - - eq = container_of(ep->util_ep.eq, struct tcpx_eq, - util_eq); - - tcpx_ep_tx_rx_queues_release(ep); - - /* eq->close_lock protects from processing stale ep connection - events*/ - fastlock_acquire(&eq->close_lock); - if (ep->util_ep.rx_cq->wait) - ofi_wait_fd_del(ep->util_ep.rx_cq->wait, - ep->conn_fd); - - if (ep->util_ep.eq->wait) - ofi_wait_fd_del(ep->util_ep.eq->wait, ep->conn_fd); - fastlock_release(&eq->close_lock); - ofi_eq_remove_fid_events(ep->util_ep.eq, - &ep->util_ep.ep_fid.fid); - ofi_close_socket(ep->conn_fd); + + ep = container_of(fid, struct tcpx_ep, util_ep.ep_fid.fid); + eq = ep->util_ep.eq ? + container_of(ep->util_ep.eq, struct tcpx_eq, util_eq) : NULL; + + /* eq->close_lock protects from processing stale connection events */ + if (eq) + fastlock_acquire(&eq->close_lock); + + if (ep->util_ep.rx_cq) + ofi_wait_del_fd(ep->util_ep.rx_cq->wait, ep->sock); + + if (ep->util_ep.tx_cq) + ofi_wait_del_fd(ep->util_ep.tx_cq->wait, ep->sock); + + if (ep->util_ep.eq && ep->util_ep.eq->wait) + ofi_wait_del_fd(ep->util_ep.eq->wait, ep->sock); + + if (eq) + fastlock_release(&eq->close_lock); + + /* Lock not technically needed, since we're freeing the EP. But it's + * harmless to acquire and silences static code analysis tools. 
+ */ + fastlock_acquire(&ep->lock); + tcpx_ep_flush_all_queues(ep); + fastlock_release(&ep->lock); + + if (eq) { + ofi_eq_remove_fid_events(ep->util_ep.eq, + &ep->util_ep.ep_fid.fid); + } + ofi_close_socket(ep->sock); ofi_endpoint_close(&ep->util_ep); fastlock_destroy(&ep->lock); @@ -411,14 +536,20 @@ static int tcpx_ep_ctrl(struct fid *fid, int command, void *arg) ep = container_of(fid, struct tcpx_ep, util_ep.ep_fid.fid); switch (command) { case FI_ENABLE: - if (!ep->util_ep.rx_cq || !ep->util_ep.tx_cq) + if ((ofi_needs_rx(ep->util_ep.caps) && !ep->util_ep.rx_cq) || + (ofi_needs_tx(ep->util_ep.caps) && !ep->util_ep.tx_cq)) { + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "missing needed CQ binding\n"); return -FI_ENOCQ; + } break; default: + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "unsupported command\n"); return -FI_ENOSYS; } - return 0; + return FI_SUCCESS; } + static int tcpx_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) { struct tcpx_ep *tcpx_ep; @@ -442,6 +573,7 @@ static struct fi_ops tcpx_ep_fi_ops = { .control = tcpx_ep_ctrl, .ops_open = fi_no_ops_open, }; + static int tcpx_ep_getopt(fid_t fid, int level, int optname, void *optval, size_t *optlen) { @@ -488,17 +620,16 @@ int tcpx_ep_setopt(fid_t fid, int level, int optname, return -FI_EINVAL; ep = container_of(fid, struct tcpx_ep, util_ep.ep_fid.fid); - ep->min_multi_recv_size = *(size_t *)optval; + ep->min_multi_recv_size = *(size_t *) optval; FI_INFO(&tcpx_prov, FI_LOG_EP_CTRL, - "FI_OPT_MIN_MULTI_RECV set to %zu\n", - ep->min_multi_recv_size); + "FI_OPT_MIN_MULTI_RECV set to %zu\n", ep->min_multi_recv_size); return FI_SUCCESS; } static struct fi_ops_ep tcpx_ep_ops = { .size = sizeof(struct fi_ops_ep), - .cancel = fi_no_cancel, + .cancel = tcpx_ep_cancel, .getopt = tcpx_ep_getopt, .setopt = tcpx_ep_setopt, .tx_ctx = fi_no_tx_ctx, @@ -507,10 +638,6 @@ static struct fi_ops_ep tcpx_ep_ops = { .tx_size_left = fi_no_tx_size_left, }; -static void tcpx_empty_progress(struct tcpx_ep *ep) -{ -} - int tcpx_endpoint(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep_fid, void *context) { @@ -524,7 +651,7 @@ int tcpx_endpoint(struct fid_domain *domain, struct fi_info *info, return -FI_ENOMEM; ret = ofi_endpoint_init(domain, &tcpx_util_prov, info, &ep->util_ep, - context, tcpx_progress); + context, NULL); if (ret) goto err1; @@ -533,49 +660,44 @@ int tcpx_endpoint(struct fid_domain *domain, struct fi_info *info, pep = container_of(info->handle, struct tcpx_pep, util_pep.pep_fid.fid); - ep->conn_fd = pep->sock; + ep->sock = pep->sock; pep->sock = INVALID_SOCKET; } else { + ep->state = TCPX_RCVD_REQ; handle = container_of(info->handle, struct tcpx_conn_handle, handle); - ep->conn_fd = handle->conn_fd; + ep->sock = handle->sock; ep->hdr_bswap = handle->endian_match ? 
tcpx_hdr_none : tcpx_hdr_bswap; free(handle); - ret = tcpx_setup_socket(ep->conn_fd); + ret = tcpx_setup_socket(ep->sock, info); if (ret) goto err3; } } else { - ep->conn_fd = ofi_socket(ofi_get_sa_family(info), SOCK_STREAM, 0); - if (ep->conn_fd == INVALID_SOCKET) { + ep->sock = ofi_socket(ofi_get_sa_family(info), SOCK_STREAM, 0); + if (ep->sock == INVALID_SOCKET) { ret = -ofi_sockerr(); goto err2; } - ret = tcpx_setup_socket(ep->conn_fd); + ret = tcpx_setup_socket(ep->sock, info); if (ret) goto err3; } - ep->cm_state = TCPX_EP_CONNECTING; - ep->progress_func = tcpx_empty_progress; ret = fastlock_init(&ep->lock); if (ret) goto err3; - ep->stage_buf.size = STAGE_BUF_SIZE; - ep->stage_buf.len = 0; - ep->stage_buf.off = 0; - slist_init(&ep->rx_queue); slist_init(&ep->tx_queue); slist_init(&ep->rma_read_queue); slist_init(&ep->tx_rsp_pend_queue); - ep->rx_detect.done_len = 0; - ep->rx_detect.hdr_len = sizeof(ep->rx_detect.hdr.base_hdr); + ep->cur_rx_msg.done_len = 0; + ep->cur_rx_msg.hdr_len = sizeof(ep->cur_rx_msg.hdr.base_hdr); ep->min_multi_recv_size = TCPX_MIN_MULTI_RECV; *ep_fid = &ep->util_ep.ep_fid; @@ -585,14 +707,14 @@ int tcpx_endpoint(struct fid_domain *domain, struct fi_info *info, (*ep_fid)->msg = &tcpx_msg_ops; (*ep_fid)->rma = &tcpx_rma_ops; - ep->get_rx_entry[ofi_op_msg] = tcpx_get_rx_entry_op_msg; - ep->get_rx_entry[ofi_op_tagged] = tcpx_get_rx_entry_op_invalid; - ep->get_rx_entry[ofi_op_read_req] = tcpx_get_rx_entry_op_read_req; - ep->get_rx_entry[ofi_op_read_rsp] = tcpx_get_rx_entry_op_read_rsp; - ep->get_rx_entry[ofi_op_write] =tcpx_get_rx_entry_op_write; + ep->start_op[ofi_op_msg] = tcpx_op_msg; + ep->start_op[ofi_op_tagged] = tcpx_op_invalid; + ep->start_op[ofi_op_read_req] = tcpx_op_read_req; + ep->start_op[ofi_op_read_rsp] = tcpx_op_read_rsp; + ep->start_op[ofi_op_write] = tcpx_op_write; return 0; err3: - ofi_close_socket(ep->conn_fd); + ofi_close_socket(ep->sock); err2: ofi_endpoint_close(&ep->util_ep); err1: @@ -606,7 +728,7 @@ static int tcpx_pep_fi_close(struct fid *fid) pep = container_of(fid, struct tcpx_pep, util_pep.pep_fid.fid); if (pep->util_pep.eq) - ofi_wait_fd_del(pep->util_pep.eq->wait, pep->sock); + ofi_wait_del_fd(pep->util_pep.eq->wait, pep->sock); ofi_close_socket(pep->sock); ofi_pep_close(&pep->util_pep); @@ -661,7 +783,6 @@ static int tcpx_pep_setname(fid_t fid, void *addr, size_t addrlen) tcpx_pep->info->src_addrlen = 0; } - tcpx_pep->info->src_addr = mem_dup(addr, addrlen); if (!tcpx_pep->info->src_addr) return -FI_ENOMEM; @@ -681,7 +802,7 @@ static int tcpx_pep_getname(fid_t fid, void *addr, size_t *addrlen) if (ret) return -ofi_sockerr(); - return (addrlen_in < *addrlen)? -FI_ETOOSMALL: FI_SUCCESS; + return (addrlen_in < *addrlen) ? 
-FI_ETOOSMALL: FI_SUCCESS; } static int tcpx_pep_listen(struct fid_pep *pep) @@ -691,43 +812,44 @@ static int tcpx_pep_listen(struct fid_pep *pep) tcpx_pep = container_of(pep,struct tcpx_pep, util_pep.pep_fid); - if (listen(tcpx_pep->sock, SOMAXCONN)) { + /* arbitrary backlog value to support larger scale jobs */ + if (listen(tcpx_pep->sock, 4096)) { FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, "socket listen failed\n"); return -ofi_sockerr(); } - ret = ofi_wait_fd_add(tcpx_pep->util_pep.eq->wait, tcpx_pep->sock, - FI_EPOLL_IN, tcpx_eq_wait_try_func, + ret = ofi_wait_add_fd(tcpx_pep->util_pep.eq->wait, tcpx_pep->sock, + POLLIN, tcpx_eq_wait_try_func, NULL, &tcpx_pep->cm_ctx); - tcpx_pep->util_pep.eq->wait->signal(tcpx_pep->util_pep.eq->wait); return ret; } static int tcpx_pep_reject(struct fid_pep *pep, fid_t handle, const void *param, size_t paramlen) { - struct ofi_ctrl_hdr hdr; + struct tcpx_cm_msg msg; struct tcpx_conn_handle *tcpx_handle; int ret; tcpx_handle = container_of(handle, struct tcpx_conn_handle, handle); - memset(&hdr, 0, sizeof(hdr)); - hdr.version = TCPX_CTRL_HDR_VERSION; - hdr.type = ofi_ctrl_nack; - hdr.seg_size = htons((uint16_t) paramlen); + memset(&msg.hdr, 0, sizeof(msg.hdr)); + msg.hdr.version = TCPX_CTRL_HDR_VERSION; + msg.hdr.type = ofi_ctrl_nack; + msg.hdr.seg_size = htons((uint16_t) paramlen); + if (paramlen) + memcpy(&msg.data, param, paramlen); - ret = ofi_send_socket(tcpx_handle->conn_fd, &hdr, - sizeof(hdr), MSG_NOSIGNAL); - - if ((ret == sizeof(hdr)) && paramlen) - (void) ofi_send_socket(tcpx_handle->conn_fd, param, - paramlen, MSG_NOSIGNAL); + ret = ofi_send_socket(tcpx_handle->sock, &msg, + sizeof(msg.hdr) + paramlen, MSG_NOSIGNAL); + if (ret != sizeof(msg.hdr) + paramlen) + FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL, + "sending of reject message failed\n"); - ofi_shutdown(tcpx_handle->conn_fd, SHUT_RDWR); - ret = ofi_close_socket(tcpx_handle->conn_fd); + ofi_shutdown(tcpx_handle->sock, SHUT_RDWR); + ret = ofi_close_socket(tcpx_handle->sock); if (ret) return ret; @@ -775,7 +897,6 @@ static struct fi_ops_ep tcpx_pep_ops = { .tx_size_left = fi_no_tx_size_left, }; - int tcpx_passive_ep(struct fid_fabric *fabric, struct fi_info *info, struct fid_pep **pep, void *context) { @@ -805,21 +926,23 @@ int tcpx_passive_ep(struct fid_fabric *fabric, struct fi_info *info, _pep->util_pep.pep_fid.ops = &tcpx_pep_ops; _pep->info = fi_dupinfo(info); - if (!_pep->info) + if (!_pep->info) { + ret = -FI_ENOMEM; goto err2; + } _pep->cm_ctx.fid = &_pep->util_pep.pep_fid.fid; - _pep->cm_ctx.type = SERVER_SOCK_ACCEPT; + _pep->cm_ctx.state = TCPX_CM_LISTENING; _pep->cm_ctx.cm_data_sz = 0; _pep->sock = INVALID_SOCKET; - *pep = &_pep->util_pep.pep_fid; - if (info->src_addr) { ret = tcpx_pep_sock_create(_pep); if (ret) goto err3; } + + *pep = &_pep->util_pep.pep_fid; return FI_SUCCESS; err3: fi_freeinfo(_pep->info); diff --git a/prov/tcp/src/tcpx_eq.c b/prov/tcp/src/tcpx_eq.c index 8cedf373628..96e3c8a2f16 100644 --- a/prov/tcp/src/tcpx_eq.c +++ b/prov/tcp/src/tcpx_eq.c @@ -42,13 +42,8 @@ static ssize_t tcpx_eq_read(struct fid_eq *eq_fid, uint32_t *event, eq = container_of(eq_fid, struct util_eq, eq_fid); - fastlock_acquire(&eq->lock); - if (slist_empty(&eq->list)) { - fastlock_release(&eq->lock); - tcpx_conn_mgr_run(eq); - } else { - fastlock_release(&eq->lock); - } + tcpx_conn_mgr_run(eq); + return ofi_eq_read(eq_fid, event, buf, len, flags); } @@ -61,8 +56,7 @@ static int tcpx_eq_close(struct fid *fid) if (ret) return ret; - eq = container_of(fid, struct tcpx_eq, - util_eq.eq_fid.fid); + eq = 
container_of(fid, struct tcpx_eq, util_eq.eq_fid.fid); fastlock_destroy(&eq->close_lock); free(eq); @@ -114,7 +108,7 @@ int tcpx_eq_create(struct fid_fabric *fabric_fid, struct fi_eq_attr *attr, if (!eq->util_eq.wait) { memset(&wait_attr, 0, sizeof wait_attr); - wait_attr.wait_obj = FI_WAIT_FD; + wait_attr.wait_obj = FI_WAIT_POLLFD; ret = fi_wait_open(fabric_fid, &wait_attr, &wait); if (ret) { FI_WARN(&tcpx_prov, FI_LOG_EQ, diff --git a/prov/tcp/src/tcpx_init.c b/prov/tcp/src/tcpx_init.c index 4eb729ab211..c0f1bab7454 100644 --- a/prov/tcp/src/tcpx_init.c +++ b/prov/tcp/src/tcpx_init.c @@ -36,85 +36,15 @@ #include "tcpx.h" #include -#include -#include #include #include -#if HAVE_GETIFADDRS -static void tcpx_getinfo_ifs(struct fi_info **info) -{ - struct fi_info *head = NULL, *tail = NULL, *cur; - struct slist addr_list; - size_t addrlen; - uint32_t addr_format; - struct slist_entry *entry, *prev; - struct ofi_addr_list_entry *addr_entry; - - slist_init(&addr_list); - - ofi_get_list_of_addr(&tcpx_prov, "iface", &addr_list); - - (void) prev; /* Makes compiler happy */ - slist_foreach(&addr_list, entry, prev) { - addr_entry = container_of(entry, struct ofi_addr_list_entry, entry); - - cur = fi_dupinfo(*info); - if (!cur) - break; - - if (!head) - head = cur; - else - tail->next = cur; - tail = cur; - - switch (addr_entry->ipaddr.sin.sin_family) { - case AF_INET: - addrlen = sizeof(struct sockaddr_in); - addr_format = FI_SOCKADDR_IN; - break; - case AF_INET6: - addrlen = sizeof(struct sockaddr_in6); - addr_format = FI_SOCKADDR_IN6; - break; - default: - continue; - } - - cur->src_addr = mem_dup(&addr_entry->ipaddr.sa, addrlen); - if (cur->src_addr) { - cur->src_addrlen = addrlen; - cur->addr_format = addr_format; - } - /* TODO: rework util code - util_set_fabric_domain(&tcpx_prov, cur); - */ - } - - ofi_free_list_of_addr(&addr_list); - fi_freeinfo(*info); - *info = head; -} -#else -#define tcpx_getinfo_ifs(info) do{ } while(0) -#endif - static int tcpx_getinfo(uint32_t version, const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct fi_info **info) { - int ret; - - ret = util_getinfo(&tcpx_util_prov, version, node, service, flags, - hints, info); - if (ret) - return ret; - - if (!(*info)->src_addr && !(*info)->dest_addr) - tcpx_getinfo_ifs(info); - - return 0; + return ofi_ip_getinfo(&tcpx_util_prov, version, node, service, flags, + hints, info); } struct tcpx_port_range port_range = { @@ -122,16 +52,30 @@ struct tcpx_port_range port_range = { .high = 0, }; -static int tcpx_init_env(void) +int tcpx_nodelay = -1; + + +static void tcpx_init_env(void) { - srand(getpid()); + fi_param_define(&tcpx_prov, "iface", FI_PARAM_STRING, + "Specify interface name"); + + fi_param_define(&tcpx_prov,"port_low_range", FI_PARAM_INT, + "define port low range"); + + fi_param_define(&tcpx_prov,"port_high_range", FI_PARAM_INT, + "define port high range"); + + fi_param_define(&tcpx_prov, "nodelay", FI_PARAM_BOOL, + "overrides default TCP_NODELAY socket setting"); + fi_param_get_bool(&tcpx_prov, "nodelay", &tcpx_nodelay); fi_param_get_int(&tcpx_prov, "port_high_range", &port_range.high); fi_param_get_int(&tcpx_prov, "port_low_range", &port_range.low); - if (port_range.high > TCPX_PORT_MAX_RANGE) { + if (port_range.high > TCPX_PORT_MAX_RANGE) port_range.high = TCPX_PORT_MAX_RANGE; - } + if (port_range.low < 0 || port_range.high < 0 || port_range.low > port_range.high) { FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,"User provided " @@ -139,7 +83,6 @@ static int tcpx_init_env(void) port_range.low 
= 0; port_range.high = 0; } - return 0; } static void fi_tcp_fini(void) @@ -149,8 +92,8 @@ static void fi_tcp_fini(void) struct fi_provider tcpx_prov = { .name = "tcp", - .version = FI_VERSION(TCPX_MAJOR_VERSION,TCPX_MINOR_VERSION), - .fi_version = FI_VERSION(1, 8), + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, .getinfo = tcpx_getinfo, .fabric = tcpx_create_fabric, .cleanup = fi_tcp_fini, @@ -161,18 +104,6 @@ TCP_INI #if HAVE_TCP_DL ofi_pmem_init(); #endif - fi_param_define(&tcpx_prov, "iface", FI_PARAM_STRING, - "Specify interface name"); - - fi_param_define(&tcpx_prov,"port_low_range", FI_PARAM_INT, - "define port low range"); - - fi_param_define(&tcpx_prov,"port_high_range", FI_PARAM_INT, - "define port high range"); - - if (tcpx_init_env()) { - FI_WARN(&tcpx_prov, FI_LOG_EP_CTRL,"Invalid info\n"); - return NULL; - } + tcpx_init_env(); return &tcpx_prov; } diff --git a/prov/tcp/src/tcpx_msg.c b/prov/tcp/src/tcpx_msg.c index ce0f369f06a..3e8581b32c5 100644 --- a/prov/tcp/src/tcpx_msg.c +++ b/prov/tcp/src/tcpx_msg.c @@ -54,13 +54,12 @@ tcpx_alloc_recv_entry(struct tcpx_ep *tcpx_ep) struct tcpx_xfer_entry *recv_entry; struct tcpx_cq *tcpx_cq; - tcpx_cq = container_of(tcpx_ep->util_ep.rx_cq, struct tcpx_cq, - util_cq); + tcpx_cq = container_of(tcpx_ep->util_ep.rx_cq, struct tcpx_cq, util_cq); recv_entry = tcpx_xfer_entry_alloc(tcpx_cq, TCPX_OP_MSG_RECV); - if (recv_entry) { + if (recv_entry) recv_entry->ep = tcpx_ep; - } + return recv_entry; } @@ -70,13 +69,12 @@ tcpx_alloc_send_entry(struct tcpx_ep *tcpx_ep) struct tcpx_xfer_entry *send_entry; struct tcpx_cq *tcpx_cq; - tcpx_cq = container_of(tcpx_ep->util_ep.tx_cq, struct tcpx_cq, - util_cq); + tcpx_cq = container_of(tcpx_ep->util_ep.tx_cq, struct tcpx_cq, util_cq); send_entry = tcpx_xfer_entry_alloc(tcpx_cq, TCPX_OP_MSG_SEND); - if (send_entry) { + if (send_entry) send_entry->ep = tcpx_ep; - } + return send_entry; } @@ -97,8 +95,6 @@ static ssize_t tcpx_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid); assert(msg->iov_count <= TCPX_IOV_LIMIT); - assert(!(tcpx_ep->util_ep.rx_op_flags & - flags & FI_MULTI_RECV) || msg->iov_count == 1); recv_entry = tcpx_alloc_recv_entry(tcpx_ep); if (!recv_entry) @@ -108,8 +104,8 @@ static ssize_t tcpx_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, memcpy(&recv_entry->iov[0], &msg->msg_iov[0], msg->iov_count * sizeof(struct iovec)); - recv_entry->flags = (tcpx_ep->util_ep.rx_msg_flags | flags | - FI_MSG | FI_RECV); + recv_entry->flags = tcpx_ep->util_ep.rx_msg_flags | flags | + FI_MSG | FI_RECV; recv_entry->context = msg->context; tcpx_queue_recv(tcpx_ep, recv_entry); @@ -132,9 +128,8 @@ static ssize_t tcpx_recv(struct fid_ep *ep, void *buf, size_t len, void *desc, recv_entry->iov[0].iov_base = buf; recv_entry->iov[0].iov_len = len; - recv_entry->flags = ((tcpx_ep->util_ep.rx_op_flags & - (FI_COMPLETION | FI_MULTI_RECV)) | - FI_MSG | FI_RECV); + recv_entry->flags = (tcpx_ep->util_ep.rx_op_flags & FI_COMPLETION) | + FI_MSG | FI_RECV; recv_entry->context = context; tcpx_queue_recv(tcpx_ep, recv_entry); @@ -150,7 +145,6 @@ static ssize_t tcpx_recvv(struct fid_ep *ep, const struct iovec *iov, void **des tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid); assert(count <= TCPX_IOV_LIMIT); - assert(!(tcpx_ep->util_ep.rx_op_flags & FI_MULTI_RECV) || count == 1); recv_entry = tcpx_alloc_recv_entry(tcpx_ep); if (!recv_entry) @@ -159,9 +153,8 @@ static ssize_t tcpx_recvv(struct fid_ep *ep, const struct iovec *iov, 
void **des recv_entry->iov_cnt = count; memcpy(recv_entry->iov, iov, count * sizeof(*iov)); - recv_entry->flags = ((tcpx_ep->util_ep.rx_op_flags & - (FI_COMPLETION | FI_MULTI_RECV)) | - FI_MSG | FI_RECV); + recv_entry->flags = (tcpx_ep->util_ep.rx_op_flags & FI_COMPLETION) | + FI_MSG | FI_RECV; recv_entry->context = context; tcpx_queue_recv(tcpx_ep, recv_entry); @@ -220,9 +213,8 @@ static ssize_t tcpx_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, tx_entry->flags = ((tcpx_ep->util_ep.tx_op_flags & FI_COMPLETION) | flags | FI_MSG | FI_SEND); - if (flags & (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE)) { + if (flags & (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE)) tx_entry->hdr.base_hdr.flags |= OFI_DELIVERY_COMPLETE; - } tx_entry->ep = tcpx_ep; tx_entry->context = msg->context; @@ -247,10 +239,9 @@ static ssize_t tcpx_send(struct fid_ep *ep, const void *buf, size_t len, if (!tx_entry) return -FI_EAGAIN; - tx_entry->hdr.base_hdr.size = - (len + sizeof(tx_entry->hdr.base_hdr)); - tx_entry->hdr.base_hdr.payload_off = - (uint8_t)sizeof(tx_entry->hdr.base_hdr); + tx_entry->hdr.base_hdr.size = len + sizeof(tx_entry->hdr.base_hdr); + tx_entry->hdr.base_hdr.payload_off = (uint8_t) + sizeof(tx_entry->hdr.base_hdr); tx_entry->iov[0].iov_base = (void *) &tx_entry->hdr; tx_entry->iov[0].iov_len = sizeof(tx_entry->hdr.base_hdr); @@ -260,13 +251,12 @@ static ssize_t tcpx_send(struct fid_ep *ep, const void *buf, size_t len, tx_entry->iov_cnt = 2; tx_entry->context = context; tx_entry->rem_len = tx_entry->hdr.base_hdr.size; - tx_entry->flags = ((tcpx_ep->util_ep.tx_op_flags & FI_COMPLETION) | - FI_MSG | FI_SEND); + tx_entry->flags = (tcpx_ep->util_ep.tx_op_flags & FI_COMPLETION) | + FI_MSG | FI_SEND; if (tcpx_ep->util_ep.tx_op_flags & - (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE)) { + (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE)) tx_entry->hdr.base_hdr.flags |= OFI_DELIVERY_COMPLETE; - } tcpx_ep->hdr_bswap(&tx_entry->hdr.base_hdr); fastlock_acquire(&tcpx_ep->lock); @@ -291,10 +281,9 @@ static ssize_t tcpx_sendv(struct fid_ep *ep, const struct iovec *iov, assert(count <= TCPX_IOV_LIMIT); data_len = ofi_total_iov_len(iov, count); - tx_entry->hdr.base_hdr.size = - (data_len + sizeof(tx_entry->hdr.base_hdr)); - tx_entry->hdr.base_hdr.payload_off = - (uint8_t)sizeof(tx_entry->hdr.base_hdr); + tx_entry->hdr.base_hdr.size = data_len + sizeof(tx_entry->hdr.base_hdr); + tx_entry->hdr.base_hdr.payload_off = (uint8_t) + sizeof(tx_entry->hdr.base_hdr); tx_entry->iov[0].iov_base = (void *) &tx_entry->hdr; tx_entry->iov[0].iov_len = sizeof(tx_entry->hdr.base_hdr); @@ -303,13 +292,12 @@ static ssize_t tcpx_sendv(struct fid_ep *ep, const struct iovec *iov, tx_entry->context = context; tx_entry->rem_len = tx_entry->hdr.base_hdr.size; - tx_entry->flags = ((tcpx_ep->util_ep.tx_op_flags & FI_COMPLETION) | - FI_MSG | FI_SEND); + tx_entry->flags = (tcpx_ep->util_ep.tx_op_flags & FI_COMPLETION) | + FI_MSG | FI_SEND; if (tcpx_ep->util_ep.tx_op_flags & - (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE)) { + (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE)) tx_entry->hdr.base_hdr.flags |= OFI_DELIVERY_COMPLETE; - } tcpx_ep->hdr_bswap(&tx_entry->hdr.base_hdr); fastlock_acquire(&tcpx_ep->lock); @@ -333,8 +321,7 @@ static ssize_t tcpx_inject(struct fid_ep *ep, const void *buf, size_t len, return -FI_EAGAIN; assert(len <= TCPX_MAX_INJECT_SZ); - tx_entry->hdr.base_hdr.size = - (len + sizeof(tx_entry->hdr.base_hdr)); + tx_entry->hdr.base_hdr.size = len + sizeof(tx_entry->hdr.base_hdr); offset = sizeof(tx_entry->hdr.base_hdr); 
tx_entry->hdr.base_hdr.payload_off = (uint8_t) offset; @@ -367,13 +354,13 @@ static ssize_t tcpx_senddata(struct fid_ep *ep, const void *buf, size_t len, return -FI_EAGAIN; tx_entry->hdr.cq_data_hdr.base_hdr.size = - (len + sizeof(tx_entry->hdr.cq_data_hdr)); + len + sizeof(tx_entry->hdr.cq_data_hdr); tx_entry->hdr.cq_data_hdr.base_hdr.flags = OFI_REMOTE_CQ_DATA; tx_entry->hdr.cq_data_hdr.cq_data = data; tx_entry->hdr.cq_data_hdr.base_hdr.payload_off = - (uint8_t)sizeof(tx_entry->hdr.cq_data_hdr); + (uint8_t) sizeof(tx_entry->hdr.cq_data_hdr); tx_entry->iov[0].iov_base = (void *) &tx_entry->hdr; tx_entry->iov[0].iov_len = sizeof(tx_entry->hdr.cq_data_hdr); @@ -385,13 +372,12 @@ static ssize_t tcpx_senddata(struct fid_ep *ep, const void *buf, size_t len, tx_entry->context = context; tx_entry->rem_len = tx_entry->hdr.base_hdr.size; - tx_entry->flags = ((tcpx_ep->util_ep.tx_op_flags & FI_COMPLETION) | - FI_MSG | FI_SEND); + tx_entry->flags = (tcpx_ep->util_ep.tx_op_flags & FI_COMPLETION) | + FI_MSG | FI_SEND; if (tcpx_ep->util_ep.tx_op_flags & - (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE)) { + (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE)) tx_entry->hdr.base_hdr.flags |= OFI_DELIVERY_COMPLETE; - } tcpx_ep->hdr_bswap(&tx_entry->hdr.base_hdr); fastlock_acquire(&tcpx_ep->lock); @@ -417,10 +403,9 @@ static ssize_t tcpx_injectdata(struct fid_ep *ep, const void *buf, size_t len, tx_entry->hdr.cq_data_hdr.base_hdr.flags = OFI_REMOTE_CQ_DATA; tx_entry->hdr.cq_data_hdr.cq_data = data; - tx_entry->hdr.base_hdr.size = - (len + sizeof(tx_entry->hdr.cq_data_hdr)); - tx_entry->hdr.base_hdr.payload_off = - (uint8_t)sizeof(tx_entry->hdr.cq_data_hdr); + tx_entry->hdr.base_hdr.size = len + sizeof(tx_entry->hdr.cq_data_hdr); + tx_entry->hdr.base_hdr.payload_off = (uint8_t) + sizeof(tx_entry->hdr.cq_data_hdr); memcpy((uint8_t *) &tx_entry->hdr + sizeof(tx_entry->hdr.cq_data_hdr), (uint8_t *) buf, len); diff --git a/prov/tcp/src/tcpx_progress.c b/prov/tcp/src/tcpx_progress.c index 0c0af1526b4..85863470361 100644 --- a/prov/tcp/src/tcpx_progress.c +++ b/prov/tcp/src/tcpx_progress.c @@ -42,56 +42,8 @@ #include #include -static void tcpx_cq_report_xfer_fail(struct tcpx_ep *tcpx_ep, int err) -{ - struct slist_entry *entry; - struct tcpx_xfer_entry *tx_entry; - struct tcpx_cq *tcpx_cq; - - while (!slist_empty(&tcpx_ep->tx_rsp_pend_queue)) { - entry = slist_remove_head(&tcpx_ep->tx_rsp_pend_queue); - tx_entry = container_of(entry, struct tcpx_xfer_entry, entry); - tcpx_cq_report_error(tx_entry->ep->util_ep.tx_cq, - tx_entry, -err); - - tcpx_cq = container_of(tx_entry->ep->util_ep.tx_cq, - struct tcpx_cq, util_cq); - tcpx_xfer_entry_release(tcpx_cq, tx_entry); - } -} - -static void tcpx_report_error(struct tcpx_ep *tcpx_ep, int err) -{ - struct fi_eq_err_entry err_entry = {0}; - - tcpx_cq_report_xfer_fail(tcpx_ep, err); - err_entry.fid = &tcpx_ep->util_ep.ep_fid.fid; - err_entry.context = tcpx_ep->util_ep.ep_fid.fid.context; - err_entry.err = -err; - fi_eq_write(&tcpx_ep->util_ep.eq->eq_fid, FI_NOTIFY, - &err_entry, sizeof(err_entry), UTIL_FLAG_ERROR); -} - -int tcpx_ep_shutdown_report(struct tcpx_ep *ep, fid_t fid) -{ - struct fi_eq_cm_entry cm_entry = {0}; - ssize_t len; - - if (ep->cm_state == TCPX_EP_SHUTDOWN) - return FI_SUCCESS; - tcpx_cq_report_xfer_fail(ep, -FI_ENOTCONN); - ep->cm_state = TCPX_EP_SHUTDOWN; - cm_entry.fid = fid; - len = fi_eq_write(&ep->util_ep.eq->eq_fid, FI_SHUTDOWN, - &cm_entry, sizeof(cm_entry), 0); - if (len < 0) - return (int) len; - - return FI_SUCCESS; -} - -static void 
process_tx_entry(struct tcpx_xfer_entry *tx_entry) +static void tcpx_process_tx_entry(struct tcpx_xfer_entry *tx_entry) { struct tcpx_cq *tcpx_cq; int ret; @@ -106,10 +58,8 @@ static void process_tx_entry(struct tcpx_xfer_entry *tx_entry) if (ret) { FI_WARN(&tcpx_prov, FI_LOG_DOMAIN, "msg send failed\n"); - tcpx_ep_shutdown_report(tx_entry->ep, - &tx_entry->ep->util_ep.ep_fid.fid); tcpx_cq_report_error(tx_entry->ep->util_ep.tx_cq, - tx_entry, ret); + tx_entry, -ret); } else { if (tx_entry->hdr.base_hdr.flags & (OFI_DELIVERY_COMPLETE | OFI_COMMIT_COMPLETE)) { @@ -122,7 +72,7 @@ static void process_tx_entry(struct tcpx_xfer_entry *tx_entry) tcpx_cq = container_of(tx_entry->ep->util_ep.tx_cq, struct tcpx_cq, util_cq); - tcpx_xfer_entry_release(tcpx_cq, tx_entry); + tcpx_xfer_entry_free(tcpx_cq, tx_entry); } static int tcpx_prepare_rx_entry_resp(struct tcpx_xfer_entry *rx_entry) @@ -155,36 +105,87 @@ static int tcpx_prepare_rx_entry_resp(struct tcpx_xfer_entry *rx_entry) tcpx_tx_queue_insert(resp_entry->ep, resp_entry); tcpx_cq_report_success(rx_entry->ep->util_ep.rx_cq, rx_entry); - rx_entry->rx_msg_release_fn(rx_entry); + tcpx_rx_entry_free(rx_entry); return FI_SUCCESS; } -static int process_rx_entry(struct tcpx_xfer_entry *rx_entry) +static int tcpx_update_rx_iov(struct tcpx_xfer_entry *rx_entry) { - int ret = FI_SUCCESS; + struct fi_cq_data_entry cq_entry; + int ret; - ret = tcpx_recv_msg_data(rx_entry); - if (OFI_SOCK_TRY_SND_RCV_AGAIN(-ret)) + assert(tcpx_dynamic_rbuf(rx_entry->ep)); + + cq_entry.op_context = rx_entry->context; + cq_entry.flags = rx_entry->flags; + cq_entry.len = (rx_entry->hdr.base_hdr.size - + rx_entry->hdr.base_hdr.payload_off) - + rx_entry->rem_len; + cq_entry.buf = rx_entry->mrecv_msg_start; + cq_entry.data = 0; + + rx_entry->iov_cnt = TCPX_IOV_LIMIT; + ret = (int) tcpx_dynamic_rbuf(rx_entry->ep)-> + get_rbuf(&cq_entry, &rx_entry->iov[0], &rx_entry->iov_cnt); + if (ret) { + FI_WARN(&tcpx_prov, FI_LOG_EP_DATA, + "get_rbuf callback failed %s\n", + fi_strerror(-ret)); return ret; + } + assert(rx_entry->iov_cnt && rx_entry->iov[0].iov_len && + rx_entry->iov_cnt <= TCPX_IOV_LIMIT); + ret = ofi_truncate_iov(rx_entry->iov, &rx_entry->iov_cnt, + rx_entry->rem_len); if (ret) { FI_WARN(&tcpx_prov, FI_LOG_EP_DATA, - "msg recv Failed ret = %d\n", ret); + "dynamically provided rbuf is too small\n"); + return ret; + } + + return 0; +} - tcpx_ep_shutdown_report(rx_entry->ep, - &rx_entry->ep->util_ep.ep_fid.fid); - tcpx_cq_report_error(rx_entry->ep->util_ep.rx_cq, - rx_entry, ret); - rx_entry->rx_msg_release_fn(rx_entry); - } else if (rx_entry->hdr.base_hdr.flags & OFI_DELIVERY_COMPLETE) { +static int tcpx_process_recv(struct tcpx_xfer_entry *rx_entry) +{ + int ret; + +retry: + ret = tcpx_recv_msg_data(rx_entry); + if (ret) { + if (OFI_SOCK_TRY_SND_RCV_AGAIN(-ret)) + return ret; + + FI_WARN(&tcpx_prov, FI_LOG_EP_DATA, + "msg recv failed ret = %d (%s)\n", ret, + fi_strerror(-ret)); + goto shutdown; + } + + /* iov has been consumed, check for dynamic rbuf handling */ + if (rx_entry->rem_len) { + ret = tcpx_update_rx_iov(rx_entry); + if (ret) + goto shutdown; + + rx_entry->rem_len = 0; + goto retry; + } + + if (rx_entry->hdr.base_hdr.flags & OFI_DELIVERY_COMPLETE) { if (tcpx_prepare_rx_entry_resp(rx_entry)) rx_entry->ep->cur_rx_proc_fn = tcpx_prepare_rx_entry_resp; } else { - tcpx_cq_report_success(rx_entry->ep->util_ep.rx_cq, - rx_entry); - - rx_entry->rx_msg_release_fn(rx_entry); + tcpx_cq_report_success(rx_entry->ep->util_ep.rx_cq, rx_entry); + tcpx_rx_entry_free(rx_entry); } + 
return 0; + +shutdown: + tcpx_ep_disable(rx_entry->ep, 0); + tcpx_cq_report_error(rx_entry->ep->util_ep.rx_cq, rx_entry, -ret); + tcpx_rx_entry_free(rx_entry); return ret; } @@ -194,7 +195,7 @@ static int tcpx_prepare_rx_write_resp(struct tcpx_xfer_entry *rx_entry) struct tcpx_xfer_entry *resp_entry; tcpx_tx_cq = container_of(rx_entry->ep->util_ep.tx_cq, - struct tcpx_cq, util_cq); + struct tcpx_cq, util_cq); resp_entry = tcpx_xfer_entry_alloc(tcpx_tx_cq, TCPX_OP_MSG_RESP); if (!resp_entry) @@ -206,8 +207,8 @@ static int tcpx_prepare_rx_write_resp(struct tcpx_xfer_entry *rx_entry) resp_entry->hdr.base_hdr.op = ofi_op_msg; resp_entry->hdr.base_hdr.size = sizeof(resp_entry->hdr.base_hdr); - resp_entry->hdr.base_hdr.payload_off = - (uint8_t)sizeof(resp_entry->hdr.base_hdr); + resp_entry->hdr.base_hdr.payload_off = (uint8_t) + sizeof(resp_entry->hdr.base_hdr); resp_entry->flags &= ~FI_COMPLETION; resp_entry->context = NULL; @@ -218,8 +219,8 @@ static int tcpx_prepare_rx_write_resp(struct tcpx_xfer_entry *rx_entry) tcpx_cq_report_success(rx_entry->ep->util_ep.rx_cq, rx_entry); tcpx_rx_cq = container_of(rx_entry->ep->util_ep.rx_cq, - struct tcpx_cq, util_cq); - tcpx_xfer_entry_release(tcpx_rx_cq, rx_entry); + struct tcpx_cq, util_cq); + tcpx_xfer_entry_free(tcpx_rx_cq, rx_entry); return FI_SUCCESS; } @@ -242,13 +243,12 @@ static void tcpx_pmem_commit(struct tcpx_xfer_entry *rx_entry) rma_iov = (struct ofi_rma_iov *)((uint8_t *)&rx_entry->hdr + offset); for (i = 0; i < rx_entry->hdr.base_hdr.rma_iov_cnt; i++) { - (*ofi_pmem_commit)((const void *) (uintptr_t) - rma_iov[i].addr, + (*ofi_pmem_commit)((const void *) (uintptr_t) rma_iov[i].addr, rma_iov[i].len); } } -static int process_rx_remote_write_entry(struct tcpx_xfer_entry *rx_entry) +static int tcpx_process_remote_write(struct tcpx_xfer_entry *rx_entry) { struct tcpx_cq *tcpx_cq; int ret = FI_SUCCESS; @@ -262,13 +262,11 @@ static int process_rx_remote_write_entry(struct tcpx_xfer_entry *rx_entry) "remote write Failed ret = %d\n", ret); - tcpx_ep_shutdown_report(rx_entry->ep, - &rx_entry->ep->util_ep.ep_fid.fid); - tcpx_cq_report_error(rx_entry->ep->util_ep.rx_cq, - rx_entry, ret); + tcpx_ep_disable(rx_entry->ep, 0); + tcpx_cq_report_error(rx_entry->ep->util_ep.rx_cq, rx_entry, -ret); tcpx_cq = container_of(rx_entry->ep->util_ep.rx_cq, struct tcpx_cq, util_cq); - tcpx_xfer_entry_release(tcpx_cq, rx_entry); + tcpx_xfer_entry_free(tcpx_cq, rx_entry); } else if (rx_entry->hdr.base_hdr.flags & (OFI_DELIVERY_COMPLETE | OFI_COMMIT_COMPLETE)) { @@ -279,16 +277,15 @@ static int process_rx_remote_write_entry(struct tcpx_xfer_entry *rx_entry) if (tcpx_prepare_rx_write_resp(rx_entry)) rx_entry->ep->cur_rx_proc_fn = tcpx_prepare_rx_write_resp; } else { - tcpx_cq_report_success(rx_entry->ep->util_ep.rx_cq, - rx_entry); + tcpx_cq_report_success(rx_entry->ep->util_ep.rx_cq, rx_entry); tcpx_cq = container_of(rx_entry->ep->util_ep.rx_cq, struct tcpx_cq, util_cq); - tcpx_xfer_entry_release(tcpx_cq, rx_entry); + tcpx_xfer_entry_free(tcpx_cq, rx_entry); } return ret; } -static int process_rx_read_entry(struct tcpx_xfer_entry *rx_entry) +static int tcpx_process_remote_read(struct tcpx_xfer_entry *rx_entry) { struct tcpx_cq *tcpx_cq; int ret = FI_SUCCESS; @@ -300,10 +297,8 @@ static int process_rx_read_entry(struct tcpx_xfer_entry *rx_entry) if (ret) { FI_WARN(&tcpx_prov, FI_LOG_DOMAIN, "msg recv Failed ret = %d\n", ret); - tcpx_ep_shutdown_report(rx_entry->ep, - &rx_entry->ep->util_ep.ep_fid.fid); - tcpx_cq_report_error(rx_entry->ep->util_ep.tx_cq, rx_entry, - 
ret); + tcpx_ep_disable(rx_entry->ep, 0); + tcpx_cq_report_error(rx_entry->ep->util_ep.tx_cq, rx_entry, -ret); } else { tcpx_cq_report_success(rx_entry->ep->util_ep.tx_cq, rx_entry); } @@ -311,7 +306,7 @@ static int process_rx_read_entry(struct tcpx_xfer_entry *rx_entry) slist_remove_head(&rx_entry->ep->rma_read_queue); tcpx_cq = container_of(rx_entry->ep->util_ep.tx_cq, struct tcpx_cq, util_cq); - tcpx_xfer_entry_release(tcpx_cq, rx_entry); + tcpx_xfer_entry_free(tcpx_cq, rx_entry); return ret; } @@ -327,7 +322,7 @@ static void tcpx_copy_rma_iov_to_msg_iov(struct tcpx_xfer_entry *xfer_entry) else offset = sizeof(xfer_entry->hdr.base_hdr); - rma_iov = (struct ofi_rma_iov *)((uint8_t *)&xfer_entry->hdr + offset); + rma_iov = (struct ofi_rma_iov *) ((uint8_t *) &xfer_entry->hdr + offset); xfer_entry->iov_cnt = xfer_entry->hdr.base_hdr.rma_iov_cnt; for ( i = 0 ; i < xfer_entry->hdr.base_hdr.rma_iov_cnt; i++ ) { @@ -344,8 +339,8 @@ static int tcpx_prepare_rx_remote_read_resp(struct tcpx_xfer_entry *resp_entry) resp_entry->iov[0].iov_base = (void *) &resp_entry->hdr; resp_entry->iov[0].iov_len = sizeof(resp_entry->hdr.base_hdr); - rma_iov = (struct ofi_rma_iov *)((uint8_t *)&resp_entry->hdr + - sizeof(resp_entry->hdr.base_hdr)); + rma_iov = (struct ofi_rma_iov *) ((uint8_t *) + &resp_entry->hdr + sizeof(resp_entry->hdr.base_hdr)); resp_entry->iov_cnt = 1 + resp_entry->hdr.base_hdr.rma_iov_cnt; resp_entry->hdr.base_hdr.size = resp_entry->iov[0].iov_len; @@ -356,8 +351,8 @@ static int tcpx_prepare_rx_remote_read_resp(struct tcpx_xfer_entry *resp_entry) } resp_entry->hdr.base_hdr.op = ofi_op_read_rsp; - resp_entry->hdr.base_hdr.payload_off = - (uint8_t)sizeof(resp_entry->hdr.base_hdr); + resp_entry->hdr.base_hdr.payload_off = (uint8_t) + sizeof(resp_entry->hdr.base_hdr); resp_entry->flags &= ~FI_COMPLETION; resp_entry->context = NULL; @@ -382,7 +377,7 @@ static int tcpx_validate_rx_rma_data(struct tcpx_xfer_entry *rx_entry, else offset = sizeof(rx_entry->hdr.base_hdr); - rma_iov = (struct ofi_rma_iov *)((uint8_t *)&rx_entry->hdr + offset); + rma_iov = (struct ofi_rma_iov *) ((uint8_t *) &rx_entry->hdr + offset); for ( i = 0 ; i < rx_entry->hdr.base_hdr.rma_iov_cnt ; i++) { ret = ofi_mr_verify(map, rma_iov[i].len, @@ -397,29 +392,46 @@ static int tcpx_validate_rx_rma_data(struct tcpx_xfer_entry *rx_entry, return FI_SUCCESS; } -int tcpx_get_rx_entry_op_invalid(struct tcpx_ep *tcpx_ep) +int tcpx_op_invalid(struct tcpx_ep *tcpx_ep) { return -FI_EINVAL; } -static inline void -tcpx_rx_detect_init(struct tcpx_rx_detect *rx_detect) +/* Must hold ep lock */ +static struct tcpx_xfer_entry *tcpx_rx_entry_alloc(struct tcpx_ep *ep) +{ + struct tcpx_xfer_entry *rx_entry; + + if (slist_empty(&ep->rx_queue)) + return NULL; + + rx_entry = container_of(ep->rx_queue.head, struct tcpx_xfer_entry, + entry); + slist_remove_head(&ep->rx_queue); + return rx_entry; +} +static void tcpx_rx_setup(struct tcpx_ep *ep, struct tcpx_xfer_entry *rx_entry, + tcpx_rx_process_fn_t process_fn) { - rx_detect->hdr_len = sizeof(rx_detect->hdr.base_hdr); - rx_detect->done_len = 0; + ep->cur_rx_entry = rx_entry; + ep->cur_rx_proc_fn = process_fn; + + /* Reset to receive next message */ + ep->cur_rx_msg.hdr_len = sizeof(ep->cur_rx_msg.hdr.base_hdr); + ep->cur_rx_msg.done_len = 0; } -int tcpx_get_rx_entry_op_msg(struct tcpx_ep *tcpx_ep) +int tcpx_op_msg(struct tcpx_ep *tcpx_ep) { struct tcpx_xfer_entry *rx_entry; struct tcpx_xfer_entry *tx_entry; struct tcpx_cq *tcpx_cq; - struct tcpx_rx_detect *rx_detect = &tcpx_ep->rx_detect; + struct 
tcpx_cur_rx_msg *cur_rx_msg = &tcpx_ep->cur_rx_msg; size_t msg_len; int ret; - if (rx_detect->hdr.base_hdr.op_data == TCPX_OP_MSG_RESP) { + if (cur_rx_msg->hdr.base_hdr.op_data == TCPX_OP_MSG_RESP) { assert(!slist_empty(&tcpx_ep->tx_rsp_pend_queue)); tx_entry = container_of(tcpx_ep->tx_rsp_pend_queue.head, struct tcpx_xfer_entry, entry); @@ -429,69 +441,58 @@ int tcpx_get_rx_entry_op_msg(struct tcpx_ep *tcpx_ep) tcpx_cq_report_success(tx_entry->ep->util_ep.tx_cq, tx_entry); slist_remove_head(&tx_entry->ep->tx_rsp_pend_queue); - tcpx_xfer_entry_release(tcpx_cq, tx_entry); - tcpx_rx_detect_init(rx_detect); + tcpx_xfer_entry_free(tcpx_cq, tx_entry); + tcpx_rx_setup(tcpx_ep, NULL, NULL); return -FI_EAGAIN; } - msg_len = (tcpx_ep->rx_detect.hdr.base_hdr.size - - tcpx_ep->rx_detect.hdr.base_hdr.payload_off); + msg_len = (tcpx_ep->cur_rx_msg.hdr.base_hdr.size - + tcpx_ep->cur_rx_msg.hdr.base_hdr.payload_off); - if (tcpx_ep->srx_ctx){ - rx_entry = tcpx_srx_next_xfer_entry(tcpx_ep->srx_ctx, - tcpx_ep, msg_len); + if (tcpx_ep->srx_ctx) { + rx_entry = tcpx_srx_entry_alloc(tcpx_ep->srx_ctx, tcpx_ep); if (!rx_entry) return -FI_EAGAIN; rx_entry->flags |= tcpx_ep->util_ep.rx_op_flags & FI_COMPLETION; } else { - if (slist_empty(&tcpx_ep->rx_queue)) + rx_entry = tcpx_rx_entry_alloc(tcpx_ep); + if (!rx_entry) return -FI_EAGAIN; - - rx_entry = container_of(tcpx_ep->rx_queue.head, - struct tcpx_xfer_entry, entry); - - rx_entry->rem_len = - ofi_total_iov_len(rx_entry->iov, rx_entry->iov_cnt)- - msg_len; - - if (!(rx_entry->flags & FI_MULTI_RECV) || - rx_entry->rem_len < tcpx_ep->min_multi_recv_size) { - slist_remove_head(&tcpx_ep->rx_queue); - rx_entry->rx_msg_release_fn = tcpx_rx_msg_release; - } else { - rx_entry->rx_msg_release_fn = tcpx_rx_multi_recv_release; - } } - memcpy(&rx_entry->hdr, &tcpx_ep->rx_detect.hdr, - (size_t) tcpx_ep->rx_detect.hdr.base_hdr.payload_off); + memcpy(&rx_entry->hdr, &tcpx_ep->cur_rx_msg.hdr, + (size_t) tcpx_ep->cur_rx_msg.hdr.base_hdr.payload_off); rx_entry->ep = tcpx_ep; rx_entry->hdr.base_hdr.op_data = TCPX_OP_MSG_RECV; rx_entry->mrecv_msg_start = rx_entry->iov[0].iov_base; - ret = ofi_truncate_iov(rx_entry->iov, - &rx_entry->iov_cnt, - msg_len); + ret = ofi_truncate_iov(rx_entry->iov, &rx_entry->iov_cnt, msg_len); if (ret) { - FI_WARN(&tcpx_prov, FI_LOG_DOMAIN, - "posted rx buffer size is not big enough\n"); - tcpx_cq_report_error(rx_entry->ep->util_ep.rx_cq, - rx_entry, -ret); - rx_entry->rx_msg_release_fn(rx_entry); - return ret; + if (!tcpx_dynamic_rbuf(tcpx_ep)) + goto truncate_err; + + rx_entry->rem_len = msg_len - + ofi_total_iov_len(rx_entry->iov, + rx_entry->iov_cnt); } - tcpx_ep->cur_rx_proc_fn = process_rx_entry; - if (rx_detect->hdr.base_hdr.flags & OFI_REMOTE_CQ_DATA) + if (cur_rx_msg->hdr.base_hdr.flags & OFI_REMOTE_CQ_DATA) rx_entry->flags |= FI_REMOTE_CQ_DATA; - tcpx_rx_detect_init(rx_detect); - tcpx_ep->cur_rx_entry = rx_entry; + tcpx_rx_setup(tcpx_ep, rx_entry, tcpx_process_recv); return FI_SUCCESS; + +truncate_err: + FI_WARN(&tcpx_prov, FI_LOG_EP_DATA, + "posted rx buffer size is not big enough\n"); + tcpx_cq_report_error(rx_entry->ep->util_ep.rx_cq, + rx_entry, -ret); + tcpx_rx_entry_free(rx_entry); + return ret; } -int tcpx_get_rx_entry_op_read_req(struct tcpx_ep *tcpx_ep) +int tcpx_op_read_req(struct tcpx_ep *tcpx_ep) { struct tcpx_xfer_entry *rx_entry; struct tcpx_cq *tcpx_cq; @@ -508,28 +509,24 @@ int tcpx_get_rx_entry_op_read_req(struct tcpx_ep *tcpx_ep) if (!rx_entry) return -FI_EAGAIN; - memcpy(&rx_entry->hdr, &tcpx_ep->rx_detect.hdr, - (size_t) 
tcpx_ep->rx_detect.hdr.base_hdr.payload_off); + memcpy(&rx_entry->hdr, &tcpx_ep->cur_rx_msg.hdr, + (size_t) tcpx_ep->cur_rx_msg.hdr.base_hdr.payload_off); rx_entry->hdr.base_hdr.op_data = TCPX_OP_REMOTE_READ; rx_entry->ep = tcpx_ep; - rx_entry->rem_len = (rx_entry->hdr.base_hdr.size - - tcpx_ep->rx_detect.done_len); ret = tcpx_validate_rx_rma_data(rx_entry, FI_REMOTE_READ); if (ret) { FI_WARN(&tcpx_prov, FI_LOG_DOMAIN, "invalid rma data\n"); - tcpx_xfer_entry_release(tcpx_cq, rx_entry); + tcpx_xfer_entry_free(tcpx_cq, rx_entry); return ret; } - tcpx_rx_detect_init(&tcpx_ep->rx_detect); - tcpx_ep->cur_rx_entry = rx_entry; - tcpx_ep->cur_rx_proc_fn = tcpx_prepare_rx_remote_read_resp; + tcpx_rx_setup(tcpx_ep, rx_entry, tcpx_prepare_rx_remote_read_resp); return FI_SUCCESS; } -int tcpx_get_rx_entry_op_write(struct tcpx_ep *tcpx_ep) +int tcpx_op_write(struct tcpx_ep *tcpx_ep) { struct tcpx_xfer_entry *rx_entry; struct tcpx_cq *tcpx_cq; @@ -543,34 +540,30 @@ int tcpx_get_rx_entry_op_write(struct tcpx_ep *tcpx_ep) return -FI_EAGAIN; rx_entry->flags = 0; - if (tcpx_ep->rx_detect.hdr.base_hdr.flags & OFI_REMOTE_CQ_DATA) + if (tcpx_ep->cur_rx_msg.hdr.base_hdr.flags & OFI_REMOTE_CQ_DATA) rx_entry->flags = (FI_COMPLETION | FI_REMOTE_CQ_DATA | FI_REMOTE_WRITE); - memcpy(&rx_entry->hdr, &tcpx_ep->rx_detect.hdr, - (size_t) tcpx_ep->rx_detect.hdr.base_hdr.payload_off); + memcpy(&rx_entry->hdr, &tcpx_ep->cur_rx_msg.hdr, + (size_t) tcpx_ep->cur_rx_msg.hdr.base_hdr.payload_off); rx_entry->hdr.base_hdr.op_data = TCPX_OP_REMOTE_WRITE; rx_entry->ep = tcpx_ep; - rx_entry->rem_len = (rx_entry->hdr.base_hdr.size - - tcpx_ep->rx_detect.done_len); ret = tcpx_validate_rx_rma_data(rx_entry, FI_REMOTE_WRITE); if (ret) { FI_WARN(&tcpx_prov, FI_LOG_DOMAIN, "invalid rma data\n"); - tcpx_xfer_entry_release(tcpx_cq, rx_entry); + tcpx_xfer_entry_free(tcpx_cq, rx_entry); return ret; } tcpx_copy_rma_iov_to_msg_iov(rx_entry); - tcpx_rx_detect_init(&tcpx_ep->rx_detect); - tcpx_ep->cur_rx_entry = rx_entry; - tcpx_ep->cur_rx_proc_fn = process_rx_remote_write_entry; + tcpx_rx_setup(tcpx_ep, rx_entry, tcpx_process_remote_write); return FI_SUCCESS; } -int tcpx_get_rx_entry_op_read_rsp(struct tcpx_ep *tcpx_ep) +int tcpx_op_read_rsp(struct tcpx_ep *tcpx_ep) { struct tcpx_xfer_entry *rx_entry; struct slist_entry *entry; @@ -582,62 +575,84 @@ int tcpx_get_rx_entry_op_read_rsp(struct tcpx_ep *tcpx_ep) rx_entry = container_of(entry, struct tcpx_xfer_entry, entry); - memcpy(&rx_entry->hdr, &tcpx_ep->rx_detect.hdr, - (size_t) tcpx_ep->rx_detect.hdr.base_hdr.payload_off); + memcpy(&rx_entry->hdr, &tcpx_ep->cur_rx_msg.hdr, + (size_t) tcpx_ep->cur_rx_msg.hdr.base_hdr.payload_off); rx_entry->hdr.base_hdr.op_data = TCPX_OP_READ_RSP; - rx_entry->rem_len = (rx_entry->hdr.base_hdr.size - - tcpx_ep->rx_detect.done_len); - tcpx_rx_detect_init(&tcpx_ep->rx_detect); - tcpx_ep->cur_rx_entry = rx_entry; - tcpx_ep->cur_rx_proc_fn = process_rx_read_entry; + tcpx_rx_setup(tcpx_ep, rx_entry, tcpx_process_remote_read); return FI_SUCCESS; } -static inline int tcpx_get_next_rx_hdr(struct tcpx_ep *ep) +static int tcpx_get_next_rx_hdr(struct tcpx_ep *ep) { - int ret; + ssize_t ret; - /* hdr already read from socket in previous call */ - if (ep->rx_detect.hdr_len == ep->rx_detect.done_len) - return FI_SUCCESS; + ret = tcpx_recv_hdr(ep->sock, &ep->stage_buf, &ep->cur_rx_msg); + if (ret < 0) + return (int) ret; - ret = tcpx_recv_hdr(ep->conn_fd, - &ep->stage_buf, - &ep->rx_detect); - if (ret) - return ret; + ep->cur_rx_msg.done_len += ret; + if 
(ep->cur_rx_msg.done_len >= sizeof(ep->cur_rx_msg.hdr.base_hdr)) { + if (ep->cur_rx_msg.hdr.base_hdr.payload_off > TCPX_MAX_HDR_SZ) { + FI_WARN(&tcpx_prov, FI_LOG_EP_DATA, + "Payload offset is too large\n"); + return -FI_EIO; + } + ep->cur_rx_msg.hdr_len = (size_t) ep->cur_rx_msg.hdr. + base_hdr.payload_off; + + if (ep->cur_rx_msg.hdr_len > ep->cur_rx_msg.done_len) { + ret = tcpx_recv_hdr(ep->sock, &ep->stage_buf, + &ep->cur_rx_msg); + if (ret < 0) + return (int) ret; + + ep->cur_rx_msg.done_len += ret; + } + } + + if (ep->cur_rx_msg.done_len < ep->cur_rx_msg.hdr_len) + return -FI_EAGAIN; - ep->hdr_bswap(&ep->rx_detect.hdr.base_hdr); + ep->hdr_bswap(&ep->cur_rx_msg.hdr.base_hdr); return FI_SUCCESS; } -static void tcpx_process_rx_msg(struct tcpx_ep *ep) +/* Must hold ep lock */ +void tcpx_progress_rx(struct tcpx_ep *ep) { int ret; if (!ep->cur_rx_entry && - (ep->stage_buf.len == ep->stage_buf.off)) { - ret = tcpx_read_to_buffer(ep->conn_fd, - &ep->stage_buf); + (ep->stage_buf.cur_pos == ep->stage_buf.bytes_avail)) { + ret = tcpx_read_to_buffer(ep->sock, &ep->stage_buf); if (ret) goto err; } do { if (!ep->cur_rx_entry) { - ret = tcpx_get_next_rx_hdr(ep); - if (ret) + if (ep->cur_rx_msg.done_len < ep->cur_rx_msg.hdr_len) { + ret = tcpx_get_next_rx_hdr(ep); + if (ret) + goto err; + } + + if (ep->cur_rx_msg.hdr.base_hdr.op >= + ARRAY_SIZE(ep->start_op)) { + FI_WARN(&tcpx_prov, FI_LOG_EP_DATA, + "Received invalid opcode\n"); + ret = -FI_ENOTCONN; /* force shutdown */ goto err; - - ret = ep->get_rx_entry[ep->rx_detect.hdr.base_hdr.op](ep); + } + ret = ep->start_op[ep->cur_rx_msg.hdr.base_hdr.op](ep); if (ret) goto err; } - assert(ep->cur_rx_proc_fn != NULL); + assert(ep->cur_rx_proc_fn); ep->cur_rx_proc_fn(ep->cur_rx_entry); - } while (ep->stage_buf.len != ep->stage_buf.off); + } while (ep->stage_buf.cur_pos < ep->stage_buf.bytes_avail); return; err: @@ -645,42 +660,23 @@ static void tcpx_process_rx_msg(struct tcpx_ep *ep) return; if (ret == -FI_ENOTCONN) - tcpx_ep_shutdown_report(ep, &ep->util_ep.ep_fid.fid); - else - tcpx_report_error(ep, ret); + tcpx_ep_disable(ep, 0); } -static void process_tx_queue(struct tcpx_ep *ep) +/* Must hold ep lock */ +void tcpx_progress_tx(struct tcpx_ep *ep) { struct tcpx_xfer_entry *tx_entry; struct slist_entry *entry; - if (slist_empty(&ep->tx_queue)) - return; - - entry = ep->tx_queue.head; - tx_entry = container_of(entry, struct tcpx_xfer_entry, entry); - process_tx_entry(tx_entry); -} - -void tcpx_ep_progress(struct tcpx_ep *ep) -{ - tcpx_process_rx_msg(ep); - process_tx_queue(ep); -} - -void tcpx_progress(struct util_ep *util_ep) -{ - struct tcpx_ep *ep; - - ep = container_of(util_ep, struct tcpx_ep, util_ep); - fastlock_acquire(&ep->lock); - ep->progress_func(ep); - fastlock_release(&ep->lock); - return; + if (!slist_empty(&ep->tx_queue)) { + entry = ep->tx_queue.head; + tx_entry = container_of(entry, struct tcpx_xfer_entry, entry); + tcpx_process_tx_entry(tx_entry); + } } -static int tcpx_try_func(void *util_ep) +int tcpx_try_func(void *util_ep) { uint32_t events; struct util_wait_fd *wait_fd; @@ -688,42 +684,37 @@ static int tcpx_try_func(void *util_ep) int ret; ep = container_of(util_ep, struct tcpx_ep, util_ep); - wait_fd = container_of(((struct util_ep *)util_ep)->rx_cq->wait, + wait_fd = container_of(((struct util_ep *) util_ep)->tx_cq->wait, struct util_wait_fd, util_wait); fastlock_acquire(&ep->lock); - if (!slist_empty(&ep->tx_queue) && !ep->send_ready_monitor) { - ep->send_ready_monitor = true; - events = FI_EPOLL_IN | FI_EPOLL_OUT; + if 
(!slist_empty(&ep->tx_queue) && !ep->pollout_set) { + ep->pollout_set = true; + events = (wait_fd->util_wait.wait_obj == FI_WAIT_FD) ? + (OFI_EPOLL_IN | OFI_EPOLL_OUT) : (POLLIN | POLLOUT); goto epoll_mod; - } else if (slist_empty(&ep->tx_queue) && ep->send_ready_monitor) { - ep->send_ready_monitor = false; - events = FI_EPOLL_IN; + } else if (slist_empty(&ep->tx_queue) && ep->pollout_set) { + ep->pollout_set = false; + events = (wait_fd->util_wait.wait_obj == FI_WAIT_FD) ? + OFI_EPOLL_IN : POLLIN; goto epoll_mod; } fastlock_release(&ep->lock); return FI_SUCCESS; epoll_mod: - ret = fi_epoll_mod(wait_fd->epoll_fd, ep->conn_fd, events, NULL); + ret = (wait_fd->util_wait.wait_obj == FI_WAIT_FD) ? + ofi_epoll_mod(wait_fd->epoll_fd, ep->sock, events, + &ep->util_ep.ep_fid.fid) : + ofi_pollfds_mod(wait_fd->pollfds, ep->sock, events, + &ep->util_ep.ep_fid.fid); if (ret) FI_WARN(&tcpx_prov, FI_LOG_EP_DATA, - "invalid op type\n"); + "epoll modify failed\n"); fastlock_release(&ep->lock); return ret; } -int tcpx_cq_wait_ep_add(struct tcpx_ep *ep) -{ - if (!ep->util_ep.rx_cq->wait) - return FI_SUCCESS; - - return ofi_wait_fd_add(ep->util_ep.rx_cq->wait, - ep->conn_fd, FI_EPOLL_IN, - tcpx_try_func, (void *)&ep->util_ep, - NULL); -} - void tcpx_tx_queue_insert(struct tcpx_ep *tcpx_ep, struct tcpx_xfer_entry *tx_entry) { @@ -734,7 +725,7 @@ void tcpx_tx_queue_insert(struct tcpx_ep *tcpx_ep, slist_insert_tail(&tx_entry->entry, &tcpx_ep->tx_queue); if (empty) { - process_tx_entry(tx_entry); + tcpx_process_tx_entry(tx_entry); if (!slist_empty(&tcpx_ep->tx_queue) && wait) wait->signal(wait); diff --git a/prov/tcp/src/tcpx_rma.c b/prov/tcp/src/tcpx_rma.c index 8215d791fc3..e3809541a1a 100644 --- a/prov/tcp/src/tcpx_rma.c +++ b/prov/tcp/src/tcpx_rma.c @@ -57,7 +57,7 @@ static void tcpx_rma_read_send_entry_fill(struct tcpx_xfer_entry *send_entry, size_t offset; offset = sizeof(send_entry->hdr.base_hdr); - rma_iov = (struct ofi_rma_iov *)((uint8_t *)&send_entry->hdr + offset); + rma_iov = (struct ofi_rma_iov *) ((uint8_t *) &send_entry->hdr + offset); send_entry->hdr.base_hdr.rma_iov_cnt = msg->rma_iov_count; memcpy(rma_iov, msg->rma_iov, @@ -111,7 +111,7 @@ static ssize_t tcpx_rma_readmsg(struct fid_ep *ep, const struct fi_msg_rma *msg, recv_entry = tcpx_xfer_entry_alloc(tcpx_cq, TCPX_OP_READ_RSP); if (!recv_entry) { - tcpx_xfer_entry_release(tcpx_cq, send_entry); + tcpx_xfer_entry_free(tcpx_cq, send_entry); return -FI_EAGAIN; } tcpx_rma_read_send_entry_fill(send_entry, tcpx_ep, msg); @@ -185,7 +185,6 @@ static ssize_t tcpx_rma_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg uint64_t *cq_data; size_t offset; - tcpx_ep = container_of(ep, struct tcpx_ep, util_ep.ep_fid); tcpx_cq = container_of(tcpx_ep->util_ep.tx_cq, struct tcpx_cq, util_cq); @@ -214,8 +213,7 @@ static ssize_t tcpx_rma_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg msg->rma_iov_count * sizeof(msg->rma_iov[0])); send_entry->hdr.base_hdr.rma_iov_cnt = msg->rma_iov_count; - offset += (send_entry->hdr.base_hdr.rma_iov_cnt * - sizeof(*rma_iov)); + offset += (send_entry->hdr.base_hdr.rma_iov_cnt * sizeof(*rma_iov)); send_entry->hdr.base_hdr.payload_off = (uint8_t)offset; send_entry->hdr.base_hdr.size = data_len + offset; @@ -235,16 +233,14 @@ static ssize_t tcpx_rma_writemsg(struct fid_ep *ep, const struct fi_msg_rma *msg send_entry->iov[0].iov_base = (void *) &send_entry->hdr; send_entry->iov[0].iov_len = offset; - send_entry->flags = ((tcpx_ep->util_ep.tx_op_flags & FI_COMPLETION) | - flags | FI_RMA | FI_WRITE); + send_entry->flags 
= (tcpx_ep->util_ep.tx_op_flags & FI_COMPLETION) | + flags | FI_RMA | FI_WRITE; - if (flags & (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE)) { + if (flags & (FI_TRANSMIT_COMPLETE | FI_DELIVERY_COMPLETE)) send_entry->hdr.base_hdr.flags |= OFI_DELIVERY_COMPLETE; - } - if (flags & FI_COMMIT_COMPLETE) { + if (flags & FI_COMMIT_COMPLETE) send_entry->hdr.base_hdr.flags |= OFI_COMMIT_COMPLETE; - } send_entry->ep = tcpx_ep; send_entry->context = msg->context; @@ -393,15 +389,16 @@ static ssize_t tcpx_rma_inject_common(struct fid_ep *ep, const void *buf, static ssize_t tcpx_rma_inject(struct fid_ep *ep, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { - return tcpx_rma_inject_common(ep, buf, len, dest_addr, - 0, addr, key, FI_INJECT); + return tcpx_rma_inject_common(ep, buf, len, 0 ,dest_addr, + addr, key, FI_INJECT); } -static ssize_t tcpx_rma_injectdata(struct fid_ep *ep, const void *buf, size_t len, - uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key) +static ssize_t +tcpx_rma_injectdata(struct fid_ep *ep, const void *buf, size_t len, + uint64_t data, fi_addr_t dest_addr, uint64_t addr, + uint64_t key) { - return tcpx_rma_inject_common(ep, buf, len, dest_addr, - data, addr, key, + return tcpx_rma_inject_common(ep, buf, len, data, dest_addr, addr, key, FI_INJECT | FI_REMOTE_CQ_DATA); } diff --git a/prov/tcp/src/tcpx_shared_ctx.c b/prov/tcp/src/tcpx_shared_ctx.c index 66b196b4302..4eeeb72eb2d 100644 --- a/prov/tcp/src/tcpx_shared_ctx.c +++ b/prov/tcp/src/tcpx_shared_ctx.c @@ -38,8 +38,8 @@ #include #include -void tcpx_srx_xfer_release(struct tcpx_rx_ctx *srx_ctx, - struct tcpx_xfer_entry *xfer_entry) +void tcpx_srx_entry_free(struct tcpx_rx_ctx *srx_ctx, + struct tcpx_xfer_entry *xfer_entry) { if (xfer_entry->ep->cur_rx_entry == xfer_entry) xfer_entry->ep->cur_rx_entry = NULL; @@ -49,54 +49,21 @@ void tcpx_srx_xfer_release(struct tcpx_rx_ctx *srx_ctx, fastlock_release(&srx_ctx->lock); } -static inline void tcpx_srx_recv_init(struct tcpx_xfer_entry *recv_entry, - uint64_t base_flags, void *context) -{ - recv_entry->flags = base_flags | FI_MSG | FI_RECV; - recv_entry->context = context; -} - -static inline void tcpx_srx_recv_init_iov(struct tcpx_xfer_entry *recv_entry, - size_t count, const struct iovec *iov) -{ - recv_entry->iov_cnt = count; - memcpy(&recv_entry->iov[0], iov, count * sizeof(*iov)); -} - struct tcpx_xfer_entry * -tcpx_srx_next_xfer_entry(struct tcpx_rx_ctx *srx_ctx, - struct tcpx_ep *ep, size_t entry_size) +tcpx_srx_entry_alloc(struct tcpx_rx_ctx *srx_ctx, struct tcpx_ep *ep) { - struct tcpx_xfer_entry *xfer_entry = NULL; - struct tcpx_xfer_entry *new_entry; + struct tcpx_xfer_entry *rx_entry = NULL; fastlock_acquire(&srx_ctx->lock); if (slist_empty(&srx_ctx->rx_queue)) goto out; - xfer_entry = container_of(srx_ctx->rx_queue.head, - struct tcpx_xfer_entry, entry); - xfer_entry->rem_len = - ofi_total_iov_len(xfer_entry->iov, xfer_entry->iov_cnt)- - entry_size; - - if (!(xfer_entry->flags & FI_MULTI_RECV) && - xfer_entry->rem_len < ep->min_multi_recv_size) { - slist_remove_head(&srx_ctx->rx_queue); - xfer_entry->rx_msg_release_fn = tcpx_rx_msg_release; - } else { - new_entry = ofi_buf_alloc(srx_ctx->buf_pool); - if (new_entry) { - memcpy(new_entry, xfer_entry, sizeof(*new_entry)); - ofi_consume_iov(xfer_entry->iov, &xfer_entry->iov_cnt, - entry_size); - new_entry->rx_msg_release_fn = tcpx_rx_msg_release; - } - xfer_entry = new_entry; - } + rx_entry = container_of(srx_ctx->rx_queue.head, + struct tcpx_xfer_entry, entry); + 
slist_remove_head(&srx_ctx->rx_queue); out: fastlock_release(&srx_ctx->lock); - return xfer_entry; + return rx_entry; } static ssize_t tcpx_srx_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, @@ -108,8 +75,6 @@ static ssize_t tcpx_srx_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, srx_ctx = container_of(ep, struct tcpx_rx_ctx, rx_fid); assert(msg->iov_count <= TCPX_IOV_LIMIT); - assert(!(srx_ctx->op_flags & flags & FI_MULTI_RECV) || - msg->iov_count == 1); fastlock_acquire(&srx_ctx->lock); recv_entry = ofi_buf_alloc(srx_ctx->buf_pool); @@ -118,8 +83,11 @@ static ssize_t tcpx_srx_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, goto unlock; } - tcpx_srx_recv_init(recv_entry, flags, msg->context); - tcpx_srx_recv_init_iov(recv_entry, msg->iov_count, msg->msg_iov); + recv_entry->flags = flags | FI_MSG | FI_RECV; + recv_entry->context = msg->context; + recv_entry->iov_cnt = msg->iov_count; + memcpy(&recv_entry->iov[0], msg->msg_iov, + msg->iov_count * sizeof(*msg->msg_iov)); slist_insert_tail(&recv_entry->entry, &srx_ctx->rx_queue); unlock: @@ -143,12 +111,11 @@ static ssize_t tcpx_srx_recv(struct fid_ep *ep, void *buf, size_t len, void *des goto unlock; } - tcpx_srx_recv_init(recv_entry, srx_ctx->op_flags & FI_MULTI_RECV, - context); + recv_entry->flags = FI_MSG | FI_RECV; + recv_entry->context = context; recv_entry->iov_cnt = 1; recv_entry->iov[0].iov_base = buf; recv_entry->iov[0].iov_len = len; - recv_entry->rem_len = len; slist_insert_tail(&recv_entry->entry, &srx_ctx->rx_queue); unlock: @@ -165,7 +132,6 @@ static ssize_t tcpx_srx_recvv(struct fid_ep *ep, const struct iovec *iov, void * srx_ctx = container_of(ep, struct tcpx_rx_ctx, rx_fid); assert(count <= TCPX_IOV_LIMIT); - assert(!(srx_ctx->op_flags & FI_MULTI_RECV) || count == 1); fastlock_acquire(&srx_ctx->lock); recv_entry = ofi_buf_alloc(srx_ctx->buf_pool); @@ -174,9 +140,10 @@ static ssize_t tcpx_srx_recvv(struct fid_ep *ep, const struct iovec *iov, void * goto unlock; } - tcpx_srx_recv_init(recv_entry, srx_ctx->op_flags & FI_MULTI_RECV, - context); - tcpx_srx_recv_init_iov(recv_entry, count, iov); + recv_entry->flags = FI_MSG | FI_RECV; + recv_entry->context = context; + recv_entry->iov_cnt = count; + memcpy(&recv_entry->iov[0], iov, count * sizeof(*iov)); slist_insert_tail(&recv_entry->entry, &srx_ctx->rx_queue); unlock: diff --git a/prov/udp/Makefile.include b/prov/udp/Makefile.include index 6cd0428c5fb..c291b6568a1 100644 --- a/prov/udp/Makefile.include +++ b/prov/udp/Makefile.include @@ -11,12 +11,11 @@ _udp_files = \ if HAVE_UDP_DL pkglib_LTLIBRARIES += libudp-fi.la libudp_fi_la_SOURCES = $(_udp_files) $(common_srcs) -libudp_fi_la_LIBADD = $(linkback) $(udp_shm_LIBS) +libudp_fi_la_LIBADD = $(linkback) libudp_fi_la_LDFLAGS = -module -avoid-version -shared -export-dynamic libudp_fi_la_DEPENDENCIES = $(linkback) else !HAVE_UDP_DL src_libfabric_la_SOURCES += $(_udp_files) -src_libfabric_la_LIBADD += $(udp_shm_LIBS) endif !HAVE_UDP_DL prov_install_man_pages += man/man7/fi_udp.7 diff --git a/prov/udp/configure.m4 b/prov/udp/configure.m4 index 8330abb2a1f..3538cfc5a16 100644 --- a/prov/udp/configure.m4 +++ b/prov/udp/configure.m4 @@ -10,30 +10,10 @@ dnl AC_DEFUN([FI_UDP_CONFIGURE],[ # Determine if we can support the udp provider udp_h_happy=0 - udp_shm_happy=0 AS_IF([test x"$enable_udp" != x"no"], [AC_CHECK_HEADER([sys/socket.h], [udp_h_happy=1], [udp_h_happy=0]) - - - # check if shm_open is already present - AC_CHECK_FUNC([shm_open], - [udp_shm_happy=1], - [udp_shm_happy=0]) - - # look for shm_open in librt if not 
already present - AS_IF([test $udp_shm_happy -eq 0], - [FI_CHECK_PACKAGE([udp_shm], - [sys/mman.h], - [rt], - [shm_open], - [], - [], - [], - [udp_shm_happy=1], - [udp_shm_happy=0])]) ]) - AS_IF([test $udp_h_happy -eq 1 && \ - test $udp_shm_happy -eq 1], [$1], [$2]) + AS_IF([test $udp_h_happy -eq 1], [$1], [$2]) ]) diff --git a/prov/udp/libfabric-udp.spec.in b/prov/udp/libfabric-udp.spec.in index 9e11096ace3..0ad0d5c2b54 100644 --- a/prov/udp/libfabric-udp.spec.in +++ b/prov/udp/libfabric-udp.spec.in @@ -1,11 +1,12 @@ %{!?configopts: %global configopts LDFLAGS=-Wl,--build-id} -%{!?provider: %define provider usnic} -%{!?provider_formal: %define provider_formal usNIC} +%{!?provider: %define provider udp} +%{!?provider_formal: %define provider_formal udp} Name: libfabric-%{provider} Version: @VERSION@ Release: 1%{?dist} Summary: Dynamic %{provider_formal} provider for user-space Open Fabric Interfaces + Group: System Environment/Libraries License: GPLv2 or BSD Url: http://www.github.com/ofiwg/libfabric diff --git a/prov/udp/src/udpx.h b/prov/udp/src/udpx.h index ef2550d8a8c..a52ff392b99 100644 --- a/prov/udp/src/udpx.h +++ b/prov/udp/src/udpx.h @@ -64,10 +64,6 @@ #define _UDPX_H_ -#define UDPX_MAJOR_VERSION 1 -#define UDPX_MINOR_VERSION 1 - - extern struct fi_provider udpx_prov; extern struct util_prov udpx_util_prov; extern struct fi_info udpx_info; @@ -121,6 +117,7 @@ struct udpx_mc { struct fid_mc mc_fid; union { struct sockaddr_in sin; + struct sockaddr_in6 sin6; } addr; struct udpx_ep *ep; }; diff --git a/prov/udp/src/udpx_attr.c b/prov/udp/src/udpx_attr.c index 029085ff448..fb1f974c9a6 100644 --- a/prov/udp/src/udpx_attr.c +++ b/prov/udp/src/udpx_attr.c @@ -32,9 +32,12 @@ #include "udpx.h" +#define UDPX_TX_CAPS (OFI_TX_MSG_CAPS | FI_MULTICAST) +#define UDPX_RX_CAPS (FI_SOURCE | OFI_RX_MSG_CAPS) +#define UDPX_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM) struct fi_tx_attr udpx_tx_attr = { - .caps = FI_MSG | FI_SEND | FI_MULTICAST, + .caps = UDPX_TX_CAPS, .comp_order = FI_ORDER_STRICT, .inject_size = 1472, .size = 1024, @@ -42,7 +45,7 @@ struct fi_tx_attr udpx_tx_attr = { }; struct fi_rx_attr udpx_rx_attr = { - .caps = FI_MSG | FI_RECV | FI_SOURCE | FI_MULTICAST, + .caps = UDPX_RX_CAPS, .comp_order = FI_ORDER_STRICT, .total_buffered_recv = (1 << 16), .size = 1024, @@ -59,13 +62,15 @@ struct fi_ep_attr udpx_ep_attr = { }; struct fi_domain_attr udpx_domain_attr = { + .caps = UDPX_DOMAIN_CAPS, .name = "udp", .threading = FI_THREAD_SAFE, .control_progress = FI_PROGRESS_AUTO, .data_progress = FI_PROGRESS_AUTO, .resource_mgmt = FI_RM_ENABLED, .av_type = FI_AV_UNSPEC, - .mr_mode = 0, + .mr_mode = FI_MR_BASIC | FI_MR_SCALABLE, + .mr_key_size = sizeof(uint64_t), .cq_cnt = 256, .ep_cnt = 256, .tx_ctx_cnt = 256, @@ -76,11 +81,11 @@ struct fi_domain_attr udpx_domain_attr = { struct fi_fabric_attr udpx_fabric_attr = { .name = "UDP-IP", - .prov_version = FI_VERSION(UDPX_MAJOR_VERSION, UDPX_MINOR_VERSION) + .prov_version = OFI_VERSION_DEF_PROV }; struct fi_info udpx_info = { - .caps = FI_MSG | FI_SEND | FI_RECV | FI_SOURCE | FI_MULTICAST, + .caps = UDPX_DOMAIN_CAPS | UDPX_TX_CAPS | UDPX_RX_CAPS, .addr_format = FI_SOCKADDR, .tx_attr = &udpx_tx_attr, .rx_attr = &udpx_rx_attr, diff --git a/prov/udp/src/udpx_domain.c b/prov/udp/src/udpx_domain.c index a40e8f25bef..e79d0b2ba90 100644 --- a/prov/udp/src/udpx_domain.c +++ b/prov/udp/src/udpx_domain.c @@ -47,6 +47,7 @@ static struct fi_ops_domain udpx_domain_ops = { .stx_ctx = fi_no_stx_context, .srx_ctx = fi_no_srx_context, .query_atomic = fi_no_query_atomic, + 
.query_collective = fi_no_query_collective, }; static int udpx_domain_close(fid_t fid) @@ -69,6 +70,13 @@ static struct fi_ops udpx_domain_fi_ops = { .ops_open = fi_no_ops_open, }; +static struct fi_ops_mr udpx_mr_ops = { + .size = sizeof(struct fi_ops_mr), + .reg = ofi_mr_reg, + .regv = ofi_mr_regv, + .regattr = ofi_mr_regattr, +}; + int udpx_domain_open(struct fid_fabric *fabric, struct fi_info *info, struct fid_domain **domain, void *context) { @@ -92,5 +100,6 @@ int udpx_domain_open(struct fid_fabric *fabric, struct fi_info *info, *domain = &util_domain->domain_fid; (*domain)->fid.ops = &udpx_domain_fi_ops; (*domain)->ops = &udpx_domain_ops; + (*domain)->mr = &udpx_mr_ops; return 0; } diff --git a/prov/udp/src/udpx_ep.c b/prov/udp/src/udpx_ep.c index 2a1145f1cf7..274f7a86f0e 100644 --- a/prov/udp/src/udpx_ep.c +++ b/prov/udp/src/udpx_ep.c @@ -219,7 +219,7 @@ static void udpx_tx_comp(struct udpx_ep *ep, void *context) { struct fi_cq_tagged_entry *comp; - comp = ofi_cirque_tail(ep->util_ep.tx_cq->cirq); + comp = ofi_cirque_next(ep->util_ep.tx_cq->cirq); comp->op_context = context; comp->flags = FI_SEND; comp->len = 0; @@ -239,7 +239,7 @@ static void udpx_rx_comp(struct udpx_ep *ep, void *context, uint64_t flags, { struct fi_cq_tagged_entry *comp; - comp = ofi_cirque_tail(ep->util_ep.rx_cq->cirq); + comp = ofi_cirque_next(ep->util_ep.rx_cq->cirq); comp->op_context = context; comp->flags = FI_RECV | flags; comp->len = len; @@ -316,7 +316,7 @@ static ssize_t udpx_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, goto out; } - entry = ofi_cirque_tail(ep->rxq); + entry = ofi_cirque_next(ep->rxq); entry->context = msg->context; for (entry->iov_count = 0; entry->iov_count < msg->iov_count; entry->iov_count++) { @@ -357,7 +357,7 @@ static ssize_t udpx_recv(struct fid_ep *ep_fid, void *buf, size_t len, goto out; } - entry = ofi_cirque_tail(ep->rxq); + entry = ofi_cirque_next(ep->rxq); entry->context = context; entry->iov_count = 1; entry->iov[0].iov_base = buf; @@ -561,7 +561,7 @@ static int udpx_ep_close(struct fid *fid) if (ep->util_ep.rx_cq->wait) { wait = container_of(ep->util_ep.rx_cq->wait, struct util_wait_fd, util_wait); - fi_epoll_del(wait->epoll_fd, (int)ep->sock); + ofi_epoll_del(wait->epoll_fd, (int)ep->sock); } fid_list_remove(&ep->util_ep.rx_cq->ep_list, &ep->util_ep.rx_cq->ep_list_lock, @@ -604,8 +604,8 @@ static int udpx_ep_bind_cq(struct udpx_ep *ep, struct util_cq *cq, wait = container_of(cq->wait, struct util_wait_fd, util_wait); - ret = fi_epoll_add(wait->epoll_fd, (int)ep->sock, - FI_EPOLL_IN, &ep->util_ep.ep_fid.fid); + ret = ofi_epoll_add(wait->epoll_fd, (int)ep->sock, + OFI_EPOLL_IN, &ep->util_ep.ep_fid.fid); if (ret) return ret; } else { @@ -698,7 +698,8 @@ static int udpx_ep_ctrl(struct fid *fid, int command, void *arg) ep = container_of(fid, struct udpx_ep, util_ep.ep_fid.fid); switch (command) { case FI_ENABLE: - if (!ep->util_ep.rx_cq || !ep->util_ep.tx_cq) + if ((ofi_needs_rx(ep->util_ep.caps) && !ep->util_ep.rx_cq) || + (ofi_needs_tx(ep->util_ep.caps) && !ep->util_ep.tx_cq)) return -FI_ENOCQ; if (!ep->util_ep.av) return -FI_ENOAV; diff --git a/prov/udp/src/udpx_init.c b/prov/udp/src/udpx_init.c index 5462cc2c3f2..85ad70f749b 100644 --- a/prov/udp/src/udpx_init.c +++ b/prov/udp/src/udpx_init.c @@ -36,81 +36,14 @@ #include "udpx.h" #include -#include -#include -#if HAVE_GETIFADDRS -static void udpx_getinfo_ifs(struct fi_info **info) -{ - struct fi_info *head = NULL, *tail = NULL, *cur; - struct slist addr_list; - size_t addrlen; - uint32_t addr_format; - struct 
slist_entry *entry, *prev; - struct ofi_addr_list_entry *addr_entry; - - slist_init(&addr_list); - - ofi_get_list_of_addr(&udpx_prov, "iface", &addr_list); - - (void) prev; /* Makes compiler happy */ - slist_foreach(&addr_list, entry, prev) { - addr_entry = container_of(entry, struct ofi_addr_list_entry, entry); - - cur = fi_dupinfo(*info); - if (!cur) - break; - - if (!head) - head = cur; - else - tail->next = cur; - tail = cur; - - switch (addr_entry->ipaddr.sin.sin_family) { - case AF_INET: - addrlen = sizeof(struct sockaddr_in); - addr_format = FI_SOCKADDR_IN; - break; - case AF_INET6: - addrlen = sizeof(struct sockaddr_in6); - addr_format = FI_SOCKADDR_IN6; - break; - default: - continue; - } - - cur->src_addr = mem_dup(&addr_entry->ipaddr.sa, addrlen); - if (cur->src_addr) { - cur->src_addrlen = addrlen; - cur->addr_format = addr_format; - } - } - - ofi_free_list_of_addr(&addr_list); - fi_freeinfo(*info); - *info = head; -} -#else -#define udpx_getinfo_ifs(info) do{}while(0) -#endif - static int udpx_getinfo(uint32_t version, const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct fi_info **info) { - int ret; - - ret = util_getinfo(&udpx_util_prov, version, node, service, flags, - hints, info); - if (ret) - return ret; - - if (!(*info)->src_addr && !(*info)->dest_addr) - udpx_getinfo_ifs(info); - - return 0; + return ofi_ip_getinfo(&udpx_util_prov, version, node, service, flags, + hints, info); } static void udpx_fini(void) @@ -119,9 +52,9 @@ static void udpx_fini(void) } struct fi_provider udpx_prov = { - .name = "UDP", - .version = FI_VERSION(UDPX_MAJOR_VERSION, UDPX_MINOR_VERSION), - .fi_version = FI_VERSION(1, 8), + .name = "udp", + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, .getinfo = udpx_getinfo, .fabric = udpx_fabric, .cleanup = udpx_fini diff --git a/prov/usnic/Makefile.include b/prov/usnic/Makefile.include index eb49540ee4c..74ff3d6c02b 100644 --- a/prov/usnic/Makefile.include +++ b/prov/usnic/Makefile.include @@ -1,5 +1,5 @@ # -# Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. # # This software is available to you under a choice of one of two # licenses. You may choose to be licensed under the terms of the GNU @@ -114,18 +114,12 @@ _usnic_files = \ prov/usnic/src/usdf_endpoint.c \ prov/usnic/src/usdf_endpoint.h \ prov/usnic/src/usdf_ep_dgram.c \ - prov/usnic/src/usdf_ep_msg.c \ - prov/usnic/src/usdf_ep_rdm.c \ prov/usnic/src/usdf_eq.c \ prov/usnic/src/usdf_fabric.c \ prov/usnic/src/usdf_mem.c \ - prov/usnic/src/usdf_msg.c \ - prov/usnic/src/usdf_msg.h \ prov/usnic/src/usdf_pep.c \ prov/usnic/src/usdf_progress.c \ prov/usnic/src/usdf_progress.h \ - prov/usnic/src/usdf_rdm.c \ - prov/usnic/src/usdf_rdm.h \ prov/usnic/src/usdf_rudp.h \ prov/usnic/src/usdf_timer.c \ prov/usnic/src/usdf_timer.h \ diff --git a/prov/usnic/src/usdf.h b/prov/usnic/src/usdf.h index 6280b515c41..991eb57a615 100644 --- a/prov/usnic/src/usdf.h +++ b/prov/usnic/src/usdf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -40,6 +40,7 @@ #include #include +#include #include "usdf_progress.h" #include "usd.h" @@ -120,7 +121,7 @@ struct usdf_fabric { /* progression */ pthread_t fab_thread; int fab_exit; - int fab_epollfd; + ofi_epoll_t fab_epollfd; int fab_eventfd; struct usdf_poll_item fab_poll_item; @@ -147,11 +148,6 @@ struct usdf_domain { TAILQ_HEAD(,usdf_tx) dom_tx_ready; TAILQ_HEAD(,usdf_cq_hard) dom_hcq_list; - struct usdf_rdm_connection **dom_rdc_hashtab; - SLIST_HEAD(,usdf_rdm_connection) dom_rdc_free; - ofi_atomic32_t dom_rdc_free_cnt; - size_t dom_rdc_total; - /* used only by connected endpoints */ struct usdf_ep **dom_peer_tab; uint32_t dom_next_peer; @@ -429,7 +425,7 @@ enum { struct usdf_err_data_entry { struct slist_entry entry; uint8_t seen; - uint8_t err_data[0]; + uint8_t err_data[]; }; struct usdf_event { diff --git a/prov/usnic/src/usdf_av.c b/prov/usnic/src/usdf_av.c index de69541ac60..0c41e17e846 100644 --- a/prov/usnic/src/usdf_av.c +++ b/prov/usnic/src/usdf_av.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -65,7 +65,6 @@ #include "usdf_av.h" #include "usdf_cm.h" #include "usdf_timer.h" -#include "usdf_rdm.h" #include "fi_ext_usnic.h" @@ -77,28 +76,14 @@ static int usdf_av_alloc_dest(struct usdf_dest **dest_o) if (dest == NULL) return -errno; - SLIST_INIT(&dest->ds_rdm_rdc_list); - *dest_o = dest; return 0; } static void usdf_av_free_dest(struct usdf_dest *dest) { - struct usdf_rdm_connection *rdc = NULL; - LIST_REMOVE(dest, ds_addresses_entry); - while (!SLIST_EMPTY(&dest->ds_rdm_rdc_list)) { - rdc = SLIST_FIRST(&dest->ds_rdm_rdc_list); - rdc->dc_dest = NULL; - - SLIST_REMOVE(&dest->ds_rdm_rdc_list, rdc, usdf_rdm_connection, - dc_addr_link); - if (rdc) - rdc->dc_dest = NULL; - } - free(dest); } diff --git a/prov/usnic/src/usdf_av.h b/prov/usnic/src/usdf_av.h index d14f6dbaa75..24e3cd511e6 100644 --- a/prov/usnic/src/usdf_av.h +++ b/prov/usnic/src/usdf_av.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -49,7 +49,6 @@ struct usdf_rdm_connection; struct usdf_dest { struct usd_dest ds_dest; - SLIST_HEAD(,usdf_rdm_connection) ds_rdm_rdc_list; LIST_ENTRY(usdf_dest) ds_addresses_entry; }; diff --git a/prov/usnic/src/usdf_cm.c b/prov/usnic/src/usdf_cm.c index d372fe62bb2..cc2198e9b7a 100644 --- a/prov/usnic/src/usdf_cm.c +++ b/prov/usnic/src/usdf_cm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include @@ -60,155 +60,9 @@ #include "usdf.h" #include "usdf_endpoint.h" #include "usdf_dgram.h" -#include "usdf_msg.h" #include "usdf_av.h" #include "usdf_cm.h" -void -usdf_cm_msg_connreq_cleanup(struct usdf_connreq *crp) -{ - struct usdf_ep *ep; - struct usdf_pep *pep; - struct usdf_fabric *fp; - - ep = crp->cr_ep; - pep = crp->cr_pep; - if (pep != NULL) { - fp = pep->pep_fabric; - } else { - fp = ep->ep_domain->dom_fabric; - } - - if (crp->cr_pollitem.pi_rtn != NULL) { - (void) epoll_ctl(fp->fab_epollfd, EPOLL_CTL_DEL, crp->cr_sockfd, NULL); - crp->cr_pollitem.pi_rtn = NULL; - } - if (crp->cr_sockfd != -1) { - close(crp->cr_sockfd); - crp->cr_sockfd = -1; - } - - /* If there is a passive endpoint, recycle the crp */ - if (pep != NULL) { - if (TAILQ_ON_LIST(crp, cr_link)) { - TAILQ_REMOVE(&pep->pep_cr_pending, crp, cr_link); - } - TAILQ_INSERT_TAIL(&pep->pep_cr_free, crp, cr_link); - } else { - free(crp); - } -} - -static int -usdf_cm_msg_accept_complete(struct usdf_connreq *crp) -{ - struct usdf_ep *ep; - struct fi_eq_cm_entry entry; - int ret; - - ep = crp->cr_ep; - - /* post EQ entry */ - entry.fid = ep_utofid(ep); - entry.info = NULL; - ret = usdf_eq_write_internal(ep->ep_eq, FI_CONNECTED, &entry, - sizeof(entry), 0); - if (ret != sizeof(entry)) { - usdf_cm_report_failure(crp, ret, false); - return 0; - } - - usdf_cm_msg_connreq_cleanup(crp); - - return 0; -} - -int -usdf_cm_msg_accept(struct fid_ep *fep, const void *param, size_t paramlen) -{ - struct usdf_ep *ep; - struct usdf_rx *rx; - struct usdf_domain *udp; - struct usdf_fabric *fp; - struct usdf_connreq *crp; - struct usdf_connreq_msg *reqp; - struct usd_qp_impl *qp; - int ret; - int n; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - if (paramlen > USDF_MAX_CONN_DATA) - return -FI_EINVAL; - - ep = ep_ftou(fep); - udp = ep->ep_domain; - fp = udp->dom_fabric; - crp = ep->e.msg.ep_connreq; - if (crp == NULL) { - return -FI_ENOTCONN; - } - if (ep->ep_eq == NULL) { - return -FI_ENOEQ; - } - crp->cr_ep = ep; - reqp = (struct usdf_connreq_msg *)crp->cr_data; - - ep->e.msg.ep_lcl_peer_id = ntohs(reqp->creq_peer_id); - - /* start creating the dest early */ - ret = usd_create_dest(udp->dom_dev, reqp->creq_ipaddr, - reqp->creq_port, &ep->e.msg.ep_dest); - if (ret != 0) { - goto fail; - } - - ep->e.msg.ep_dest->ds_dest.ds_udp.u_hdr.uh_ip.frag_off |= htons(IP_DF); - - ret = usdf_ep_msg_get_queues(ep); - if (ret != 0) { - goto fail; - } - rx = ep->ep_rx; - qp = to_qpi(rx->rx_qp); - - /* allocate a peer ID */ - ep->e.msg.ep_rem_peer_id = udp->dom_next_peer; - udp->dom_peer_tab[udp->dom_next_peer] = ep; - ++udp->dom_next_peer; - - crp->cr_ptr = crp->cr_data; - crp->cr_resid = sizeof(*reqp) + paramlen; - - reqp->creq_peer_id = htons(ep->e.msg.ep_rem_peer_id); - reqp->creq_ipaddr = fp->fab_dev_attrs->uda_ipaddr_be; - reqp->creq_port = - qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; - reqp->creq_result = htonl(0); - reqp->creq_datalen = htonl(paramlen); - memcpy(reqp->creq_data, param, paramlen); - - n = write(crp->cr_sockfd, crp->cr_ptr, crp->cr_resid); - if (n == -1) { - usdf_cm_msg_connreq_cleanup(crp); - ret = -errno; - goto fail; - } - - crp->cr_resid -= n; - if (crp->cr_resid == 0) { - usdf_cm_msg_accept_complete(crp); - } else { - // XXX set up epoll junk to send rest - } - - return 0; -fail: - free(ep->e.msg.ep_dest); - /* XXX release queues */ - return ret; -} - /* Given a connection request structure 
containing data, make a copy of the data * that can be accessed in error entries on the EQ. The return value is the size * of the data stored in the error entry. If the return value is a non-negative @@ -308,266 +162,6 @@ void usdf_cm_report_failure(struct usdf_connreq *crp, int error, bool copy_data) err.err = -error; usdf_eq_write_internal(eq, 0, &err, sizeof(err), USDF_EVENT_FLAG_ERROR); - - usdf_cm_msg_connreq_cleanup(crp); -} - -/* - * read connection request response from the listener - */ -static int -usdf_cm_msg_connect_cb_rd(void *v) -{ - struct usdf_connreq *crp; - struct usdf_ep *ep; - struct usdf_fabric *fp; - struct usdf_domain *udp; - struct usdf_connreq_msg *reqp; - struct fi_eq_cm_entry *entry; - size_t entry_len; - int ret; - - crp = v; - ep = crp->cr_ep; - fp = ep->ep_domain->dom_fabric; - - ret = read(crp->cr_sockfd, crp->cr_ptr, crp->cr_resid); - if (ret == -1) - goto report_failure_skip_data; - - crp->cr_ptr += ret; - crp->cr_resid -= ret; - - reqp = (struct usdf_connreq_msg *)crp->cr_data; - if (crp->cr_resid == 0 && crp->cr_ptr == crp->cr_data + sizeof(*reqp)) { - reqp->creq_datalen = ntohl(reqp->creq_datalen); - crp->cr_resid = reqp->creq_datalen; - } - - /* if resid is 0 now, completely done */ - if (crp->cr_resid == 0) { - reqp->creq_result = ntohl(reqp->creq_result); - - ret = epoll_ctl(fp->fab_epollfd, EPOLL_CTL_DEL, - crp->cr_sockfd, NULL); - close(crp->cr_sockfd); - crp->cr_sockfd = -1; - - if (reqp->creq_result != FI_SUCCESS) { - /* Copy the data since this was an explicit rejection. - */ - usdf_cm_report_failure(crp, reqp->creq_result, true); - return 0; - } - - entry_len = sizeof(*entry) + reqp->creq_datalen; - entry = malloc(entry_len); - if (entry == NULL) - goto report_failure_skip_data; - - udp = ep->ep_domain; - ep->e.msg.ep_lcl_peer_id = ntohs(reqp->creq_peer_id); - ret = usd_create_dest(udp->dom_dev, reqp->creq_ipaddr, - reqp->creq_port, &ep->e.msg.ep_dest); - if (ret != 0) - goto free_entry_and_report_failure; - - ep->e.msg.ep_dest->ds_dest.ds_udp.u_hdr.uh_ip.frag_off |= - htons(IP_DF); - - entry->fid = ep_utofid(ep); - entry->info = NULL; - memcpy(entry->data, reqp->creq_data, reqp->creq_datalen); - ret = usdf_eq_write_internal(ep->ep_eq, FI_CONNECTED, entry, - entry_len, 0); - if (ret != (int)entry_len) { - free(ep->e.msg.ep_dest); - ep->e.msg.ep_dest = NULL; - - goto free_entry_and_report_failure; - } - - free(entry); - usdf_cm_msg_connreq_cleanup(crp); - } - return 0; - -free_entry_and_report_failure: - free(entry); -report_failure_skip_data: - usdf_cm_report_failure(crp, ret, false); - return 0; -} - -/* - * Write connection request data to the listener - * Once everything is written, switch over into listening mode to - * capture the listener response. 
- */ -static int -usdf_cm_msg_connect_cb_wr(void *v) -{ - struct usdf_connreq *crp; - struct usdf_ep *ep; - struct usdf_fabric *fp; - struct epoll_event ev; - int ret; - - crp = v; - ep = crp->cr_ep; - fp = ep->ep_domain->dom_fabric; - - ret = write(crp->cr_sockfd, crp->cr_ptr, crp->cr_resid); - if (ret == -1) { - usdf_cm_report_failure(crp, -errno, false); - return 0; - } - - crp->cr_resid -= ret; - if (crp->cr_resid == 0) { - crp->cr_pollitem.pi_rtn = usdf_cm_msg_connect_cb_rd; - crp->cr_ptr = crp->cr_data; - crp->cr_resid = sizeof(struct usdf_connreq_msg); - - ev.events = EPOLLIN; - ev.data.ptr = &crp->cr_pollitem; - ret = epoll_ctl(fp->fab_epollfd, EPOLL_CTL_MOD, - crp->cr_sockfd, &ev); - if (ret != 0) { - usdf_cm_report_failure(crp, -errno, false); - return 0; - } - } - return 0; -} - -int -usdf_cm_msg_connect(struct fid_ep *fep, const void *addr, - const void *param, size_t paramlen) -{ - struct usdf_connreq *crp; - struct usdf_ep *ep; - struct usdf_rx *rx; - struct usdf_domain *udp; - const struct sockaddr_in *sin; - struct epoll_event ev; - struct usdf_fabric *fp; - struct usdf_connreq_msg *reqp; - struct usd_qp_impl *qp; - struct fi_info *info; - size_t request_size; - int ret; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - if (paramlen > USDF_MAX_CONN_DATA) - return -FI_EINVAL; - - ep = ep_ftou(fep); - udp = ep->ep_domain; - fp = udp->dom_fabric; - info = ep->ep_domain->dom_info; - - sin = usdf_format_to_sin(info, addr); - - /* Although paramlen may be less than USDF_MAX_CONN_DATA, the same crp - * struct is used for receiving the accept and reject payload. The - * structure has to be prepared to receive the maximum allowable amount - * of data per transfer. The maximum size includes the connection - * request structure, the connection request message, and the maximum - * amount of data per connection request message. - */ - request_size = sizeof(*crp) + sizeof(*reqp) + USDF_MAX_CONN_DATA; - crp = calloc(1, request_size); - if (crp == NULL) { - ret = -errno; - goto fail; - } - ep->e.msg.ep_connreq = crp; - - crp->handle.fclass = FI_CLASS_CONNREQ; - - if (ep->e.msg.ep_cm_sock == -1) { - crp->cr_sockfd = socket(AF_INET, SOCK_STREAM, 0); - if (crp->cr_sockfd == -1) { - ret = -errno; - goto fail; - } - } else { - crp->cr_sockfd = ep->e.msg.ep_cm_sock; - ep->e.msg.ep_cm_sock = -1; - } - - ret = fi_fd_nonblock(crp->cr_sockfd); - if (ret) { - ret = -errno; - goto fail; - } - - ret = usdf_ep_msg_get_queues(ep); - if (ret != 0) { - goto fail; - } - rx = ep->ep_rx; - qp = to_qpi(rx->rx_qp); - - ret = connect(crp->cr_sockfd, (struct sockaddr *)sin, sizeof(*sin)); - if (ret != 0 && errno != EINPROGRESS) { - ret = -errno; - goto fail; - } - - /* If cr_sockfd was previously unbound, connect(2) will do a a bind(2) - * for us. Update our snapshot of the locally bound address. 
*/ - ret = usdf_msg_upd_lcl_addr(ep); - if (ret) - goto fail; - - /* allocate remote peer ID */ - ep->e.msg.ep_rem_peer_id = udp->dom_next_peer; - udp->dom_peer_tab[udp->dom_next_peer] = ep; - ++udp->dom_next_peer; - - crp->cr_ep = ep; - reqp = (struct usdf_connreq_msg *)crp->cr_data; - crp->cr_ptr = crp->cr_data; - crp->cr_resid = sizeof(*reqp) + paramlen; - - reqp->creq_peer_id = htons(ep->e.msg.ep_rem_peer_id); - reqp->creq_ipaddr = fp->fab_dev_attrs->uda_ipaddr_be; - reqp->creq_port = - qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; - reqp->creq_datalen = htonl(paramlen); - memcpy(reqp->creq_data, param, paramlen); - - /* register for notification when connect completes */ - crp->cr_pollitem.pi_rtn = usdf_cm_msg_connect_cb_wr; - crp->cr_pollitem.pi_context = crp; - ev.events = EPOLLOUT; - ev.data.ptr = &crp->cr_pollitem; - ret = epoll_ctl(fp->fab_epollfd, EPOLL_CTL_ADD, crp->cr_sockfd, &ev); - if (ret != 0) { - crp->cr_pollitem.pi_rtn = NULL; - ret = -errno; - goto fail; - } - - usdf_free_sin_if_needed(info, (struct sockaddr_in *)sin); - - return 0; - -fail: - usdf_free_sin_if_needed(info, (struct sockaddr_in *)sin); - - if (crp != NULL) { - if (crp->cr_sockfd != -1) { - close(crp->cr_sockfd); - } - free(crp); - ep->e.msg.ep_connreq = NULL; - } - usdf_ep_msg_release_queues(ep); - return ret; } /* A wrapper to core function to translate string address to @@ -659,32 +253,6 @@ static int usdf_cm_copy_name(struct fi_info *info, struct sockaddr_in *sin, return ret; } -int usdf_cm_rdm_getname(fid_t fid, void *addr, size_t *addrlen) -{ - struct usdf_ep *ep; - struct usdf_rx *rx; - struct sockaddr_in sin; - struct fi_info *info; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - ep = ep_fidtou(fid); - rx = ep->ep_rx; - info = ep->ep_domain->dom_info; - - memset(&sin, 0, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = - ep->ep_domain->dom_fabric->fab_dev_attrs->uda_ipaddr_be; - if (rx == NULL || rx->rx_qp == NULL) { - sin.sin_port = 0; - } else { - sin.sin_port = to_qpi(rx->rx_qp)->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; - } - - return usdf_cm_copy_name(info, &sin, addr, addrlen); -} - int usdf_cm_dgram_getname(fid_t fid, void *addr, size_t *addrlen) { int ret; @@ -719,19 +287,6 @@ int usdf_cm_dgram_getname(fid_t fid, void *addr, size_t *addrlen) return usdf_cm_copy_name(info, &sin, addr, addrlen); } -int usdf_cm_msg_getname(fid_t fid, void *addr, size_t *addrlen) -{ - struct usdf_ep *ep; - struct fi_info *info; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - ep = ep_fidtou(fid); - info = ep->ep_domain->dom_info; - - return usdf_cm_copy_name(info, &ep->e.msg.ep_lcl_addr, addr, addrlen); -} - /* Checks that the given address is actually a sockaddr_in of appropriate * length. "addr_format" is an FI_ constant like FI_SOCKADDR_IN indicating the * claimed type of the given address. diff --git a/prov/usnic/src/usdf_cm.h b/prov/usnic/src/usdf_cm.h index 48b92e824ef..d361818055b 100644 --- a/prov/usnic/src/usdf_cm.h +++ b/prov/usnic/src/usdf_cm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -51,7 +51,7 @@ struct usdf_connreq_msg { uint32_t creq_result; uint32_t creq_reason; uint32_t creq_datalen; - uint8_t creq_data[0]; + uint8_t creq_data[]; } __attribute__((packed)); struct usdf_connreq { @@ -67,16 +67,13 @@ struct usdf_connreq { size_t cr_resid; size_t cr_datalen; - uint8_t cr_data[0]; + uint8_t cr_data[]; }; void usdf_cm_report_failure(struct usdf_connreq *crp, int error, bool skip_data); -void usdf_cm_msg_connreq_cleanup(struct usdf_connreq *crp); -int usdf_cm_rdm_getname(fid_t fid, void *addr, size_t *addrlen); int usdf_cm_dgram_getname(fid_t fid, void *addr, size_t *addrlen); -int usdf_cm_msg_getname(fid_t fid, void *addr, size_t *addrlen); bool usdf_cm_addr_is_valid_sin(void *addr, size_t addrlen, uint32_t addr_format); diff --git a/prov/usnic/src/usdf_cq.c b/prov/usnic/src/usdf_cq.c index 0b67a18e6be..604ca15ad6f 100644 --- a/prov/usnic/src/usdf_cq.c +++ b/prov/usnic/src/usdf_cq.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include @@ -735,7 +735,6 @@ static int usdf_cq_unbind_wait(struct usdf_cq *cq) { int ret; struct usdf_wait *wait_priv; - struct epoll_event event = {0}; if (!cq->cq_attr.wait_set) { USDF_DBG_SYS(CQ, "can't unbind from non-existent wait set\n"); @@ -744,12 +743,10 @@ static int usdf_cq_unbind_wait(struct usdf_cq *cq) wait_priv = wait_ftou(cq->cq_attr.wait_set); - ret = epoll_ctl(wait_priv->object.epfd, EPOLL_CTL_DEL, - cq->object.fd, &event); + ret = ofi_epoll_del(wait_priv->object.epfd, cq->object.fd); if (ret) { - USDF_WARN_SYS(CQ, - "failed to remove FD from wait set\n"); - return -errno; + USDF_WARN_SYS(CQ, "failed to remove FD from wait set\n"); + return ret; } fid_list_remove(&wait_priv->list, &wait_priv->lock, &cq->cq_fid.fid); @@ -1166,7 +1163,6 @@ static int usdf_cq_bind_wait(struct usdf_cq *cq) { int ret; struct usdf_wait *wait_priv; - struct epoll_event event = {0}; if (!cq->cq_attr.wait_set) { USDF_DBG_SYS(CQ, "can't bind to non-existent wait set\n"); @@ -1181,9 +1177,6 @@ static int usdf_cq_bind_wait(struct usdf_cq *cq) */ wait_priv = wait_ftou(cq->cq_attr.wait_set); - event.data.ptr = cq; - event.events = EPOLLIN; - ret = fid_list_insert(&wait_priv->list, &wait_priv->lock, &cq->cq_fid.fid); if (ret) { @@ -1192,8 +1185,8 @@ static int usdf_cq_bind_wait(struct usdf_cq *cq) return ret; } - ret = epoll_ctl(wait_priv->object.epfd, EPOLL_CTL_ADD, cq->object.fd, - &event); + ret = ofi_epoll_add(wait_priv->object.epfd, cq->object.fd, + OFI_EPOLL_IN, cq); if (ret) { USDF_WARN_SYS(CQ, "failed to associate FD with wait set\n"); goto err; diff --git a/prov/usnic/src/usdf_domain.c b/prov/usnic/src/usdf_domain.c index 62981adaff3..fb4aa4caf54 100644 --- a/prov/usnic/src/usdf_domain.c +++ b/prov/usnic/src/usdf_domain.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2018, Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -55,7 +55,6 @@ #include "usnic_direct.h" #include "usdf.h" -#include "usdf_rdm.h" #include "usdf_timer.h" #include "usdf_poll.h" #include "usdf_cm.h" @@ -90,81 +89,6 @@ usdf_domain_bind(struct fid *fid, struct fid *bfid, uint64_t flags) return 0; } -static void -usdf_dom_rdc_free_data(struct usdf_domain *udp) -{ - struct usdf_rdm_connection *rdc; - int i; - - if (udp->dom_rdc_hashtab != NULL) { - - pthread_spin_lock(&udp->dom_progress_lock); - for (i = 0; i < USDF_RDM_HASH_SIZE; ++i) { - rdc = udp->dom_rdc_hashtab[i]; - while (rdc != NULL) { - usdf_timer_reset(udp->dom_fabric, - rdc->dc_timer, 0); - rdc = rdc->dc_hash_next; - } - } - pthread_spin_unlock(&udp->dom_progress_lock); - - /* XXX probably want a timeout here... */ - while (ofi_atomic_get32(&udp->dom_rdc_free_cnt) < - (int)udp->dom_rdc_total) { - pthread_yield(); - } - - free(udp->dom_rdc_hashtab); - udp->dom_rdc_hashtab = NULL; - } - - while (!SLIST_EMPTY(&udp->dom_rdc_free)) { - rdc = SLIST_FIRST(&udp->dom_rdc_free); - SLIST_REMOVE_HEAD(&udp->dom_rdc_free, dc_addr_link); - usdf_timer_free(udp->dom_fabric, rdc->dc_timer); - free(rdc); - } -} - -static int -usdf_dom_rdc_alloc_data(struct usdf_domain *udp) -{ - struct usdf_rdm_connection *rdc; - int ret; - int i; - - udp->dom_rdc_hashtab = calloc(USDF_RDM_HASH_SIZE, - sizeof(*udp->dom_rdc_hashtab)); - if (udp->dom_rdc_hashtab == NULL) { - return -FI_ENOMEM; - } - SLIST_INIT(&udp->dom_rdc_free); - ofi_atomic_initialize32(&udp->dom_rdc_free_cnt, 0); - for (i = 0; i < USDF_RDM_FREE_BLOCK; ++i) { - rdc = calloc(1, sizeof(*rdc)); - if (rdc == NULL) { - return -FI_ENOMEM; - } - ret = usdf_timer_alloc(usdf_rdm_rdc_timeout, rdc, - &rdc->dc_timer); - if (ret != 0) { - free(rdc); - return ret; - } - rdc->dc_flags = USDF_DCS_UNCONNECTED | USDF_DCF_NEW_RX; - rdc->dc_next_rx_seq = 0; - rdc->dc_next_tx_seq = 0; - rdc->dc_last_rx_ack = rdc->dc_next_tx_seq - 1; - TAILQ_INIT(&rdc->dc_wqe_posted); - TAILQ_INIT(&rdc->dc_wqe_sent); - SLIST_INSERT_HEAD(&udp->dom_rdc_free, rdc, dc_addr_link); - ofi_atomic_inc32(&udp->dom_rdc_free_cnt); - } - udp->dom_rdc_total = USDF_RDM_FREE_BLOCK; - return 0; -} - static int usdf_domain_close(fid_t fid) { @@ -184,7 +108,6 @@ usdf_domain_close(fid_t fid) return ret; } } - usdf_dom_rdc_free_data(udp); if (udp->dom_eq != NULL) { ofi_atomic_dec32(&udp->dom_eq->eq_refcnt); @@ -223,6 +146,7 @@ static struct fi_ops_domain usdf_domain_ops = { .stx_ctx = fi_no_stx_context, .srx_ctx = fi_no_srx_context, .query_atomic = usdf_query_atomic, + .query_collective = fi_no_query_collective, }; int @@ -344,11 +268,6 @@ usdf_domain_open(struct fid_fabric *fabric, struct fi_info *info, udp->dom_info->dest_addr = NULL; } - ret = usdf_dom_rdc_alloc_data(udp); - if (ret != 0) { - goto fail; - } - udp->dom_fabric = fp; LIST_INSERT_HEAD(&fp->fab_domain_list, udp, dom_link); ofi_atomic_initialize32(&udp->dom_refcnt, 0); @@ -365,91 +284,84 @@ usdf_domain_open(struct fid_fabric *fabric, struct fi_info *info, if (udp->dom_dev != NULL) { usd_close(udp->dom_dev); } - usdf_dom_rdc_free_data(udp); free(udp); } return ret; } +/* In pre-1.4, the domain name was NULL. + * + * There used to be elaborate schemes to try to preserve this pre-1.4 + * behavior. In Nov 2019 discussions, however, it was determined that + * we could rationalize classifying this as buggy behavior. + * Specifically: we should just now always return a domain name -- + * even if the requested version is <1.4. 
+ * + * This greatly simplifies the logic here, and also greatly simplifies + * layering with the rxd provider. + */ int usdf_domain_getname(uint32_t version, struct usd_device_attrs *dap, char **name) { int ret = FI_SUCCESS; char *buf = NULL; - if (FI_VERSION_GE(version, FI_VERSION(1, 4))) { - buf = strdup(dap->uda_devname); - if (!buf) { - ret = -errno; - USDF_DBG("strdup failed while creating domain name\n"); - } + buf = strdup(dap->uda_devname); + if (NULL == buf) { + ret = -errno; + USDF_DBG("strdup failed while creating domain name\n"); + } else { + *name = buf; } - *name = buf; return ret; } -/* In pre-1.4 the domain name was NULL. This is unfortunate as it makes it - * difficult to tell whether providing a name was intended. In this case, it can - * be broken into 4 cases: +/* Check to see if the name supplied in a hint matches the name of our + * current domain. + * + * In pre-1.4, the domain name was NULL. * - * 1. Version is greater than or equal to 1.4 and a non-NULL hint is provided. - * Just do a string compare. - * 2. Version is greater than or equal to 1.4 and provided hint is NULL. Treat - * this as _valid_ as it could be an application requesting a 1.4 domain name - * but not providing an explicit hint. - * 3. Version is less than 1.4 and a name hint is provided. This should always - * be _invalid_. - * 4. Version is less than 1.4 and name hint is NULL. This will always be - * _valid_. + * There used to be elaborate schemes to try to preserve this pre-1.4 + * behavior. In Nov 2019 discussions, however, it was determined that + * we could rationalize classifying this as buggy behavior. + * Specifically: we should just now always return a domain name -- + * even if the requested version is <1.4. + * + * This greatly simplifies the logic here, and also greatly simplifies + * layering with the rxd provider. + * + * Hence, if a hint was provided, check the domain name (that we now + * always have) against the hint. */ bool usdf_domain_checkname(uint32_t version, struct usd_device_attrs *dap, const char *hint) { - char *reference; + char *reference = NULL; bool valid; int ret; - USDF_DBG("checking domain name: version=%d, domain name='%s'\n", - version, hint); - - if (version) { - valid = false; - - ret = usdf_domain_getname(version, dap, &reference); - if (ret < 0) - return false; - - /* If the reference name exists, then this is version 1.4 or - * greater. - */ - if (reference) { - if (hint) { - /* Case 1 */ - valid = (strcmp(reference, hint) == 0); - } else { - /* Case 2 */ - valid = true; - } - } else { - /* Case 3 & 4 */ - valid = (hint == NULL); - } + /* If no hint was provided, then by definition, we agree with + * the hint. */ + if (NULL == hint) { + return true; + } - if (!valid) - USDF_DBG("given hint %s does not match %s -- invalid\n", - hint, reference); + USDF_DBG("checking domain name: domain name='%s'\n", hint); - free(reference); - return valid; + ret = usdf_domain_getname(version, dap, &reference); + if (ret < 0) { + return false; } - /* If hint is non-NULL then assume the version is 1.4 if not provided. - */ - if (hint) - return usdf_domain_checkname(FI_VERSION(1, 4), dap, hint); + valid = (strcmp(reference, hint) == 0); + if (!valid) { + USDF_DBG("given hint %s does not match %s -- invalid\n", + hint, reference); + } - return usdf_domain_checkname(FI_VERSION(1, 3), dap, hint); + free(reference); + return valid; } /* Query domain's atomic capability. 
diff --git a/prov/usnic/src/usdf_endpoint.c b/prov/usnic/src/usdf_endpoint.c index 2dedac136c4..1fa33ce1728 100644 --- a/prov/usnic/src/usdf_endpoint.c +++ b/prov/usnic/src/usdf_endpoint.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2016, Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include #include @@ -68,10 +68,6 @@ usdf_endpoint_open(struct fid_domain *domain, struct fi_info *info, switch (info->ep_attr->type) { case FI_EP_DGRAM: return usdf_ep_dgram_open(domain, info, ep_o, context); - case FI_EP_MSG: - return usdf_ep_msg_open(domain, info, ep_o, context); - case FI_EP_RDM: - return usdf_ep_rdm_open(domain, info, ep_o, context); default: return -FI_ENODEV; } diff --git a/prov/usnic/src/usdf_endpoint.h b/prov/usnic/src/usdf_endpoint.h index 994f76c1519..1bbad52869e 100644 --- a/prov/usnic/src/usdf_endpoint.h +++ b/prov/usnic/src/usdf_endpoint.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2016, Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -39,12 +39,6 @@ int usdf_ep_port_bind(struct usdf_ep *ep, struct fi_info *info); int usdf_ep_dgram_open(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); -int usdf_ep_msg_open(struct fid_domain *domain, struct fi_info *info, - struct fid_ep **ep, void *context); -int usdf_ep_rdm_open(struct fid_domain *domain, struct fi_info *info, - struct fid_ep **ep, void *context); -int usdf_ep_msg_get_queues(struct usdf_ep *ep); -void usdf_ep_msg_release_queues(struct usdf_ep *ep); int usdf_msg_upd_lcl_addr(struct usdf_ep *ep); int usdf_ep_getopt_connected(fid_t fid, int level, int optname, void *optval, diff --git a/prov/usnic/src/usdf_ep_dgram.c b/prov/usnic/src/usdf_ep_dgram.c index 6cd6f109b15..d260308dbb2 100644 --- a/prov/usnic/src/usdf_ep_dgram.c +++ b/prov/usnic/src/usdf_ep_dgram.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2018, Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -565,6 +565,14 @@ int usdf_dgram_fill_dom_attr(uint32_t version, const struct fi_info *hints, return -FI_ENODATA; } + switch (hints->domain_attr->av_type) { + case FI_AV_UNSPEC: + case FI_AV_MAP: + break; + default: + return -FI_ENODATA; + } + if (ofi_check_mr_mode(&usdf_ops, version, defaults.mr_mode, hints)) return -FI_ENODATA; diff --git a/prov/usnic/src/usdf_ep_msg.c b/prov/usnic/src/usdf_ep_msg.c deleted file mode 100644 index 83e7a3a6012..00000000000 --- a/prov/usnic/src/usdf_ep_msg.c +++ /dev/null @@ -1,1147 +0,0 @@ -/* - * Copyright (c) 2014-2018, Cisco Systems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include "ofi.h" -#include "ofi_enosys.h" -#include "ofi_util.h" - -#include "usnic_direct.h" -#include "usd.h" -#include "usdf.h" -#include "usdf_cm.h" -#include "usdf_endpoint.h" -#include "fi_ext_usnic.h" -#include "usdf_rudp.h" -#include "usdf_msg.h" -#include "usdf_cq.h" -#include "usdf_timer.h" - -/******************************************************************************* - * Default values for msg attributes - ******************************************************************************/ -static const struct fi_tx_attr msg_dflt_tx_attr = { - .caps = USDF_MSG_CAPS, - .mode = USDF_MSG_SUPP_MODE, - .op_flags = 0, - .msg_order = USDF_MSG_MSG_ORDER, - .comp_order = USDF_MSG_COMP_ORDER, - .inject_size = USDF_MSG_MAX_INJECT_SIZE, - .size = USDF_MSG_DFLT_CTX_SIZE, - .iov_limit = USDF_MSG_MAX_SGE, - .rma_iov_limit = USDF_MSG_RMA_IOV_LIMIT -}; - -static const struct fi_rx_attr msg_dflt_rx_attr = { - .caps = USDF_MSG_CAPS, - .mode = USDF_MSG_SUPP_MODE, - .op_flags = 0, - .msg_order = USDF_MSG_MSG_ORDER, - .comp_order = USDF_MSG_COMP_ORDER, - .size = USDF_MSG_DFLT_CTX_SIZE, - .total_buffered_recv = 0, - .iov_limit = USDF_MSG_IOV_LIMIT -}; - -/* The protocol for MSG is still under development. Version 0 does not provide - * any interoperability. 
- */ -static const struct fi_ep_attr msg_dflt_ep_attr = { - .type = FI_EP_MSG, - .protocol = FI_PROTO_RUDP, - .protocol_version = 0, - .msg_prefix_size = 0, - .max_msg_size = USDF_MSG_MAX_MSG, - .max_order_raw_size = 0, - .max_order_war_size = 0, - .max_order_waw_size = 0, - .mem_tag_format = 0, - .tx_ctx_cnt = 1, - .rx_ctx_cnt = 1 -}; - -static const struct fi_domain_attr msg_dflt_domain_attr = { - .caps = USDF_DOM_CAPS, - .threading = FI_THREAD_UNSPEC, - .control_progress = FI_PROGRESS_AUTO, - .data_progress = FI_PROGRESS_MANUAL, - .resource_mgmt = FI_RM_DISABLED, - .mr_mode = FI_MR_ALLOCATED | FI_MR_LOCAL | FI_MR_BASIC, - .cntr_cnt = USDF_MSG_CNTR_CNT, - .mr_iov_limit = USDF_MSG_MR_IOV_LIMIT, - .mr_cnt = USDF_MSG_MR_CNT, -}; - -static struct fi_ops_atomic usdf_msg_atomic_ops = { - .size = sizeof(struct fi_ops_atomic), - .write = fi_no_atomic_write, - .writev = fi_no_atomic_writev, - .writemsg = fi_no_atomic_writemsg, - .inject = fi_no_atomic_inject, - .readwrite = fi_no_atomic_readwrite, - .readwritev = fi_no_atomic_readwritev, - .readwritemsg = fi_no_atomic_readwritemsg, - .compwrite = fi_no_atomic_compwrite, - .compwritev = fi_no_atomic_compwritev, - .compwritemsg = fi_no_atomic_compwritemsg, - .writevalid = fi_no_atomic_writevalid, - .readwritevalid = fi_no_atomic_readwritevalid, - .compwritevalid = fi_no_atomic_compwritevalid, -}; - -/******************************************************************************* - * Fill functions for attributes - ******************************************************************************/ -int usdf_msg_fill_ep_attr(const struct fi_info *hints, struct fi_info *fi, - struct usd_device_attrs *dap) -{ - struct fi_ep_attr defaults; - - defaults = msg_dflt_ep_attr; - - if (!hints || !hints->ep_attr) - goto out; - - if (hints->ep_attr->max_msg_size > defaults.max_msg_size) - return -FI_ENODATA; - - switch (hints->ep_attr->protocol) { - case FI_PROTO_UNSPEC: - case FI_PROTO_RUDP: - break; - default: - return -FI_ENODATA; - } - - if (hints->ep_attr->tx_ctx_cnt > defaults.tx_ctx_cnt) - return -FI_ENODATA; - - if (hints->ep_attr->rx_ctx_cnt > defaults.rx_ctx_cnt) - return -FI_ENODATA; - - if (hints->ep_attr->max_order_raw_size > defaults.max_order_raw_size) - return -FI_ENODATA; - - if (hints->ep_attr->max_order_war_size > defaults.max_order_war_size) - return -FI_ENODATA; - - if (hints->ep_attr->max_order_waw_size > defaults.max_order_waw_size) - return -FI_ENODATA; - -out: - *fi->ep_attr = defaults; - - return FI_SUCCESS; -} - -int usdf_msg_fill_dom_attr(uint32_t version, const struct fi_info *hints, - struct fi_info *fi, struct usd_device_attrs *dap) -{ - int ret; - struct fi_domain_attr defaults; - - defaults = msg_dflt_domain_attr; - ret = usdf_domain_getname(version, dap, &defaults.name); - if (ret < 0) - return -FI_ENODATA; - - if (!hints || !hints->domain_attr) - goto catch; - - /* how to handle fi_thread_fid, fi_thread_completion, etc? - */ - switch (hints->domain_attr->threading) { - case FI_THREAD_UNSPEC: - case FI_THREAD_ENDPOINT: - break; - default: - return -FI_ENODATA; - } - - /* how to handle fi_progress_manual? 
- */ - switch (hints->domain_attr->control_progress) { - case FI_PROGRESS_UNSPEC: - case FI_PROGRESS_AUTO: - break; - default: - return -FI_ENODATA; - } - - switch (hints->domain_attr->data_progress) { - case FI_PROGRESS_UNSPEC: - case FI_PROGRESS_MANUAL: - break; - default: - return -FI_ENODATA; - } - - switch (hints->domain_attr->resource_mgmt) { - case FI_RM_UNSPEC: - case FI_RM_DISABLED: - break; - default: - return -FI_ENODATA; - } - - switch (hints->domain_attr->caps) { - case 0: - case FI_REMOTE_COMM: - break; - default: - USDF_WARN_SYS(DOMAIN, - "invalid domain capabilities\n"); - return -FI_ENODATA; - } - - if (ofi_check_mr_mode(&usdf_ops, version, defaults.mr_mode, hints)) - return -FI_ENODATA; - - if (hints->domain_attr->mr_cnt <= USDF_MSG_MR_CNT) { - defaults.mr_cnt = hints->domain_attr->mr_cnt; - } else { - USDF_DBG_SYS(DOMAIN, "mr_count exceeded provider limit\n"); - return -FI_ENODATA; - } - -catch: - /* catch the version changes here. */ - ret = usdf_catch_dom_attr(version, hints, &defaults); - if (ret) - return ret; - - *fi->domain_attr = defaults; - - return FI_SUCCESS; -} - -int usdf_msg_fill_tx_attr(uint32_t version, const struct fi_info *hints, - struct fi_info *fi) -{ - int ret; - struct fi_tx_attr defaults; - - defaults = msg_dflt_tx_attr; - - if (!hints || !hints->tx_attr) - goto catch; - - /* make sure we can support the caps that are requested*/ - if (hints->tx_attr->caps & ~USDF_MSG_CAPS) - return -FI_ENODATA; - - /* clear the mode bits the app doesn't support */ - if (hints->mode || hints->tx_attr->mode) - defaults.mode &= (hints->mode | hints->tx_attr->mode); - - defaults.op_flags |= hints->tx_attr->op_flags; - - if ((hints->tx_attr->msg_order | USDF_MSG_MSG_ORDER) != - USDF_MSG_MSG_ORDER) - return -FI_ENODATA; - - if ((hints->tx_attr->comp_order | USDF_MSG_COMP_ORDER) != - USDF_MSG_COMP_ORDER) - return -FI_ENODATA; - - if (hints->tx_attr->inject_size > defaults.inject_size) - return -FI_ENODATA; - - if (hints->tx_attr->iov_limit > defaults.iov_limit) - return -FI_ENODATA; - - if (hints->tx_attr->rma_iov_limit > defaults.rma_iov_limit) - return -FI_ENODATA; - - if (hints->tx_attr->size > defaults.size) - return -FI_ENODATA; - -catch: - /* catch version changes here. */ - ret = usdf_catch_tx_attr(version, &defaults); - if (ret) - return ret; - - *fi->tx_attr = defaults; - - return FI_SUCCESS; -} - -int usdf_msg_fill_rx_attr(uint32_t version, const struct fi_info *hints, struct fi_info *fi) -{ - int ret; - struct fi_rx_attr defaults; - - defaults = msg_dflt_rx_attr; - - if (!hints || !hints->rx_attr) - goto catch; - - /* make sure we can support the capabilities that are requested */ - if (hints->rx_attr->caps & ~USDF_MSG_CAPS) - return -FI_ENODATA; - - /* clear the mode bits the app doesn't support */ - if (hints->mode || hints->rx_attr->mode) - defaults.mode &= (hints->mode | hints->rx_attr->mode); - - defaults.op_flags |= hints->rx_attr->op_flags; - - if ((hints->rx_attr->msg_order | USDF_MSG_MSG_ORDER) != - USDF_MSG_MSG_ORDER) - return -FI_ENODATA; - if ((hints->rx_attr->comp_order | USDF_MSG_COMP_ORDER) != - USDF_MSG_COMP_ORDER) - return -FI_ENODATA; - - if (hints->rx_attr->total_buffered_recv > - defaults.total_buffered_recv) - return -FI_ENODATA; - - if (hints->rx_attr->iov_limit > defaults.iov_limit) - return -FI_ENODATA; - - if (hints->rx_attr->size > defaults.size) - return -FI_ENODATA; - -catch: - /* catch version changes here. 
*/ - ret = usdf_catch_rx_attr(version, &defaults); - if (ret) - return ret; - - *fi->rx_attr = defaults; - - return FI_SUCCESS; -} - -static int -usdf_tx_msg_enable(struct usdf_tx *tx) -{ - struct usdf_msg_qe *wqe; - struct usdf_domain *udp; - struct usdf_cq_hard *hcq; - struct usd_filter filt; - int ret; - size_t i; - - udp = tx->tx_domain; - - hcq = tx->t.msg.tx_hcq; - if (hcq == NULL) { - return -FI_ENOCQ; - } - - USDF_INFO("allocating 1 QP for FI_EP_MSG TX context\n"); - /* XXX temp until we can allocate WQ and RQ independently */ - filt.uf_type = USD_FTY_UDP; - filt.uf_filter.uf_udp.u_port = 0; - ret = usd_create_qp(udp->dom_dev, - USD_QTR_UDP, - USD_QTY_UD, - hcq->cqh_ucq, - hcq->cqh_ucq, - udp->dom_fabric->fab_dev_attrs->uda_max_send_credits, - udp->dom_fabric->fab_dev_attrs->uda_max_recv_credits, - &filt, - &tx->tx_qp); - if (ret != 0) { - USDF_INFO("QP allocation failed (%s)\n", strerror(-ret)); - goto fail; - } - tx->tx_qp->uq_context = tx; - - /* msg send queue */ - tx->t.msg.tx_wqe_buf = malloc(tx->tx_attr.size * - sizeof(struct usdf_msg_qe)); - if (tx->t.msg.tx_wqe_buf == NULL) { - ret = -errno; - USDF_INFO("malloc failed (%s)\n", strerror(-ret)); - goto fail; - } - - ret = usd_alloc_mr(tx->tx_domain->dom_dev, - tx->tx_attr.size * USDF_MSG_MAX_INJECT_SIZE, - (void **)&tx->t.msg.tx_inject_bufs); - if (ret) { - USDF_INFO("usd_alloc_mr failed (%s)\n", strerror(-ret)); - goto fail; - } - - /* populate free list */ - TAILQ_INIT(&tx->t.msg.tx_free_wqe); - wqe = tx->t.msg.tx_wqe_buf; - for (i = 0; i < tx->tx_attr.size; ++i) { - wqe->ms_inject_buf = - &tx->t.msg.tx_inject_bufs[USDF_MSG_MAX_INJECT_SIZE * i]; - TAILQ_INSERT_TAIL(&tx->t.msg.tx_free_wqe, wqe, ms_link); - ++wqe; - } - tx->t.msg.tx_num_free_wqe = tx->tx_attr.size; - - return 0; - -fail: - if (tx->t.msg.tx_wqe_buf != NULL) { - free(tx->t.msg.tx_wqe_buf); - tx->t.msg.tx_wqe_buf = NULL; - TAILQ_INIT(&tx->t.msg.tx_free_wqe); - tx->t.msg.tx_num_free_wqe = 0; - } - - if (tx->t.msg.tx_inject_bufs != NULL) { - usd_free_mr(tx->t.msg.tx_inject_bufs); - tx->t.msg.tx_inject_bufs = NULL; - } - - if (tx->tx_qp != NULL) { - usd_destroy_qp(tx->tx_qp); - } - return ret; -} - -static int -usdf_rx_msg_enable(struct usdf_rx *rx) -{ - struct usdf_domain *udp; - struct usdf_cq_hard *hcq; - struct usdf_msg_qe *rqe; - struct usd_filter filt; - struct usd_qp_impl *qp; - uint8_t *ptr; - size_t mtu; - int ret; - size_t i; - - udp = rx->rx_domain; - - hcq = rx->r.msg.rx_hcq; - if (hcq == NULL) { - return -FI_ENOCQ; - } - - USDF_INFO("allocating 1 QP for FI_EP_MSG RX context\n"); - /* XXX temp until we can allocate WQ and RQ independently */ - filt.uf_type = USD_FTY_UDP; - filt.uf_filter.uf_udp.u_port = 0; - ret = usd_create_qp(udp->dom_dev, - USD_QTR_UDP, - USD_QTY_UD, - hcq->cqh_ucq, - hcq->cqh_ucq, - udp->dom_fabric->fab_dev_attrs->uda_max_send_credits, - udp->dom_fabric->fab_dev_attrs->uda_max_recv_credits, - &filt, - &rx->rx_qp); - if (ret != 0) { - USDF_INFO("QP allocation failed (%s)\n", strerror(-ret)); - goto fail; - } - rx->rx_qp->uq_context = rx; - qp = to_qpi(rx->rx_qp); - - /* receive buffers */ - mtu = rx->rx_domain->dom_fabric->fab_dev_attrs->uda_mtu; - ret = usd_alloc_mr(rx->rx_domain->dom_dev, - qp->uq_rq.urq_num_entries * mtu, - (void **)&rx->r.msg.rx_bufs); - if (ret != 0) { - USDF_INFO("usd_alloc_mr failed (%s)\n", strerror(-ret)); - goto fail; - } - - /* post all the buffers */ - ptr = rx->r.msg.rx_bufs; - for (i = 0; i < qp->uq_rq.urq_num_entries - 1; ++i) { - usdf_msg_post_recv(rx, ptr, mtu); - ptr += mtu; - } - - /* msg recv 
queue */ - rx->r.msg.rx_rqe_buf = malloc(rx->rx_attr.size * - sizeof(struct usdf_msg_qe)); - if (rx->r.msg.rx_rqe_buf == NULL) { - ret = -errno; - USDF_INFO("malloc failed (%s)\n", strerror(-ret)); - goto fail; - } - - /* populate free list */ - TAILQ_INIT(&rx->r.msg.rx_free_rqe); - rqe = rx->r.msg.rx_rqe_buf; - for (i = 0; i < rx->rx_attr.size; ++i) { - TAILQ_INSERT_TAIL(&rx->r.msg.rx_free_rqe, rqe, ms_link); - ++rqe; - } - rx->r.msg.rx_num_free_rqe = rx->rx_attr.size; - - return 0; - -fail: - if (rx->r.msg.rx_rqe_buf != NULL) { - free(rx->r.msg.rx_rqe_buf); - rx->r.msg.rx_rqe_buf = NULL; - TAILQ_INIT(&rx->r.msg.rx_free_rqe); - rx->r.msg.rx_num_free_rqe = 0; - } - if (rx->r.msg.rx_bufs != NULL) { - usd_free_mr(rx->r.msg.rx_bufs); - rx->r.msg.rx_bufs = NULL; - } - if (rx->rx_qp != NULL) { - usd_destroy_qp(rx->rx_qp); - } - return ret; -} - -/* - * release queue resources - */ -void -usdf_ep_msg_release_queues(struct usdf_ep *ep) -{ - /* XXX */ -} - -/* - * Allocate any missing queue resources for this endpoint - */ -int -usdf_ep_msg_get_queues(struct usdf_ep *ep) -{ - struct usdf_tx *tx; - struct usdf_rx *rx; - int ret; - - /* Must have TX context at this point */ - tx = ep->ep_tx; - if (tx == NULL) { - ret = -FI_EINVAL; - goto fail; - } - if (tx->tx_qp == NULL) { - ret = usdf_tx_msg_enable(tx); - if (ret != 0) { - goto fail; - } - } - - /* Must have RX context at this point */ - rx = ep->ep_rx; - if (rx == NULL) { - ret = -FI_EINVAL; - goto fail; - } - if (rx->rx_qp == NULL) { - ret = usdf_rx_msg_enable(rx); - if (ret != 0) { - goto fail; - } - } - - return 0; -fail: - return ret; -} - -static int -usdf_ep_msg_enable(struct fid_ep *fep) -{ - struct usdf_ep *ep; - int ret; - - ep = ep_ftou(fep); - - ret = usdf_ep_msg_get_queues(ep); - if (ret == FI_SUCCESS) - ep->flags |= USDF_EP_ENABLED; - - return ret; -} - -static ssize_t -usdf_ep_msg_cancel(fid_t fid, void *context) -{ - USDF_TRACE_SYS(EP_CTRL, "\n"); - /* XXX should this have a non-empty implementation? 
*/ - return 0; -} - -/* - * Find a hard CQ within this soft CQ that services message EPs - */ -static struct usdf_cq_hard * -usdf_ep_msg_find_cqh(struct usdf_cq *cq) -{ - struct usdf_cq_hard *hcq; - - TAILQ_FOREACH(hcq, &cq->c.soft.cq_list, cqh_link) { - if (hcq->cqh_progress == usdf_msg_hcq_progress) { - return hcq; - } - } - return NULL; -} - -static int -usdf_ep_msg_bind_cq(struct usdf_ep *ep, struct usdf_cq *cq, uint64_t flags) -{ - struct usdf_cq_hard **hcqp; - struct usdf_cq_hard *hcq; - int ret; - - /* - * The CQ is actually bound the RX or TX ctx, not the EP directly - */ - if (flags & FI_SEND) { - /* if TX is shared, but bind directly */ - if (ep->ep_tx->tx_fid.fid.fclass == FI_CLASS_STX_CTX) { - return -FI_EINVAL; - } - hcqp = &ep->ep_tx->t.msg.tx_hcq; - } else { - /* if RX is shared, but bind directly */ - if (ep->ep_rx->rx_fid.fid.fclass == FI_CLASS_SRX_CTX) { - return -FI_EINVAL; - } - hcqp = &ep->ep_rx->r.msg.rx_hcq; - } - if (*hcqp != NULL) { - return -FI_EINVAL; - } - - /* Make sure this CQ is "soft" */ - ret = usdf_cq_make_soft(cq); - if (ret != 0) { - return ret; - } - - if ((cq->cq_attr.wait_obj == FI_WAIT_FD) || - (cq->cq_attr.wait_obj == FI_WAIT_SET)) { - cq->object.fd = eventfd(0, EFD_NONBLOCK); - if (cq->object.fd == -1) { - USDF_DBG_SYS(CQ, "creating eventfd failed: %s\n", - strerror(errno)); - return -errno; - } - - USDF_DBG_SYS(CQ, "successfully created eventfd: %d\n", - cq->object.fd); - } - - /* Use existing msg CQ if present */ - hcq = usdf_ep_msg_find_cqh(cq); - if (hcq == NULL) { - hcq = malloc(sizeof(*hcq)); - if (hcq == NULL) { - return -errno; - } - - ret = usdf_cq_create_cq(cq, &hcq->cqh_ucq, false); - if (ret) - goto fail; - - hcq->cqh_cq = cq; - ofi_atomic_initialize32(&hcq->cqh_refcnt, 0); - hcq->cqh_progress = usdf_msg_hcq_progress; - hcq->cqh_post = usdf_cq_post_soft; - TAILQ_INSERT_TAIL(&cq->c.soft.cq_list, hcq, cqh_link); - - /* add to domain progression list */ - TAILQ_INSERT_TAIL(&ep->ep_domain->dom_hcq_list, - hcq, cqh_dom_link); - } - ofi_atomic_inc32(&hcq->cqh_refcnt); - ofi_atomic_inc32(&cq->cq_refcnt); - *hcqp = hcq; - return 0; - -fail: - free(hcq); - return ret; -} - -static int -usdf_ep_msg_bind(struct fid *fid, struct fid *bfid, uint64_t flags) -{ - int ret; - struct usdf_ep *ep; - struct usdf_cq *cq; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - /* Validate the flags. 
*/ - ret = ofi_ep_bind_valid(&usdf_ops, bfid, flags); - if (ret) - return ret; - - ep = ep_fidtou(fid); - - switch (bfid->fclass) { - - case FI_CLASS_CQ: - if (flags & FI_SEND) { - cq = cq_fidtou(bfid); - if (flags & FI_SELECTIVE_COMPLETION) - ep->ep_tx_dflt_signal_comp = 0; - else - ep->ep_tx_dflt_signal_comp = 1; - usdf_ep_msg_bind_cq(ep, cq, FI_SEND); - } - - if (flags & FI_RECV) { - cq = cq_fidtou(bfid); - if (flags & FI_SELECTIVE_COMPLETION) - ep->ep_rx_dflt_signal_comp = 0; - else - ep->ep_rx_dflt_signal_comp = 1; - usdf_ep_msg_bind_cq(ep, cq, FI_RECV); - } - break; - - case FI_CLASS_EQ: - if (ep->ep_eq != NULL) { - return -FI_EINVAL; - } - ep->ep_eq = eq_fidtou(bfid); - ofi_atomic_inc32(&ep->ep_eq->eq_refcnt); - break; - default: - return -FI_EINVAL; - } - - return 0; -} - -static int -usdf_msg_rx_ctx_close(fid_t fid) -{ - struct usdf_rx *rx; - struct usdf_cq_hard *hcq; - - rx = rx_fidtou(fid); - - if (ofi_atomic_get32(&rx->rx_refcnt) > 0) { - return -FI_EBUSY; - } - - hcq = rx->r.msg.rx_hcq; - if (hcq != NULL) { - ofi_atomic_dec32(&hcq->cqh_refcnt); - ofi_atomic_dec32(&hcq->cqh_cq->cq_refcnt); - } - - if (rx->rx_qp != NULL) { - usd_free_mr(rx->r.msg.rx_bufs); - free(rx->r.msg.rx_rqe_buf); - usd_destroy_qp(rx->rx_qp); - } - ofi_atomic_dec32(&rx->rx_domain->dom_refcnt); - - free(rx); - - return 0; -} - -static int -usdf_msg_tx_ctx_close(fid_t fid) -{ - struct usdf_tx *tx; - struct usdf_cq_hard *hcq; - - tx = tx_fidtou(fid); - - if (ofi_atomic_get32(&tx->tx_refcnt) > 0) { - return -FI_EBUSY; - } - - hcq = tx->t.msg.tx_hcq; - if (hcq != NULL) { - ofi_atomic_dec32(&hcq->cqh_refcnt); - ofi_atomic_dec32(&hcq->cqh_cq->cq_refcnt); - } - - if (tx->tx_qp != NULL) { - usd_free_mr(tx->t.msg.tx_inject_bufs); - free(tx->t.msg.tx_wqe_buf); - usd_destroy_qp(tx->tx_qp); - } - ofi_atomic_dec32(&tx->tx_domain->dom_refcnt); - - free(tx); - - return 0; -} - -static int -usdf_ep_msg_close(fid_t fid) -{ - struct usdf_ep *ep; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - ep = ep_fidtou(fid); - - if (ofi_atomic_get32(&ep->ep_refcnt) > 0) { - return -FI_EBUSY; - } - - if (ep->ep_rx != NULL) { - ofi_atomic_dec32(&ep->ep_rx->rx_refcnt); - if (rx_utofid(ep->ep_rx)->fclass == FI_CLASS_RX_CTX) { - (void) usdf_msg_rx_ctx_close(rx_utofid(ep->ep_rx)); - } - } - - if (ep->ep_tx != NULL) { - ofi_atomic_dec32(&ep->ep_tx->tx_refcnt); - if (tx_utofid(ep->ep_tx)->fclass == FI_CLASS_TX_CTX) { - (void) usdf_msg_tx_ctx_close(tx_utofid(ep->ep_tx)); - } - } - - ofi_atomic_dec32(&ep->ep_domain->dom_refcnt); - if (ep->ep_eq != NULL) { - ofi_atomic_dec32(&ep->ep_eq->eq_refcnt); - } - usdf_timer_free(ep->ep_domain->dom_fabric, ep->e.msg.ep_ack_timer); - - free(ep); - return 0; -} - -static struct fi_ops_ep usdf_base_msg_ops = { - .size = sizeof(struct fi_ops_ep), - .cancel = usdf_ep_msg_cancel, - .getopt = usdf_ep_getopt_connected, - .setopt = usdf_ep_setopt, - .tx_ctx = fi_no_tx_ctx, - .rx_ctx = fi_no_rx_ctx, - .rx_size_left = usdf_msg_rx_size_left, - .tx_size_left = usdf_msg_tx_size_left, -}; - -static struct fi_ops_cm usdf_cm_msg_ops = { - .size = sizeof(struct fi_ops_cm), - .setname = fi_no_setname, - .getname = usdf_cm_msg_getname, - .getpeer = fi_no_getpeer, - .connect = usdf_cm_msg_connect, - .listen = fi_no_listen, - .accept = usdf_cm_msg_accept, - .reject = fi_no_reject, - .shutdown = fi_no_shutdown, - .join = fi_no_join, -}; - -static struct fi_ops_msg usdf_msg_ops = { - .size = sizeof(struct fi_ops_msg), - .recv = usdf_msg_recv, - .recvv = usdf_msg_recvv, - .recvmsg = usdf_msg_recvmsg, - .send = usdf_msg_send, - .sendv = 
usdf_msg_sendv, - .sendmsg = usdf_msg_sendmsg, - .inject = usdf_msg_inject, - .senddata = fi_no_msg_senddata, - .injectdata = fi_no_msg_injectdata, -}; - -static int usdf_ep_msg_control(struct fid *fid, int command, void *arg) -{ - struct fid_ep *ep; - int ret; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - switch (fid->fclass) { - case FI_CLASS_EP: - ep = container_of(fid, struct fid_ep, fid); - switch (command) { - case FI_ENABLE: - ret = usdf_ep_msg_enable(ep); - break; - default: - ret = -FI_ENOSYS; - } - break; - default: - ret = -FI_ENOSYS; - } - - return ret; -} - -static struct fi_ops usdf_ep_msg_ops = { - .size = sizeof(struct fi_ops), - .close = usdf_ep_msg_close, - .bind = usdf_ep_msg_bind, - .control = usdf_ep_msg_control, - .ops_open = fi_no_ops_open -}; - -/* update the EP's local address field based on the current state of the EP */ -int usdf_msg_upd_lcl_addr(struct usdf_ep *ep) -{ - int ret; - int lower_sockfd; - socklen_t slen; - - if (ep->e.msg.ep_connreq == NULL) { - /* might be -1 if no parent PEP was passed at open time */ - lower_sockfd = ep->e.msg.ep_cm_sock; - } else { - lower_sockfd = ep->e.msg.ep_connreq->cr_sockfd; - } - - if (lower_sockfd == -1) { - USDF_DBG_SYS(EP_CTRL, "no CM socket yet, use fabric addr\n"); - ep->e.msg.ep_lcl_addr.sin_family = AF_INET; - ep->e.msg.ep_lcl_addr.sin_addr.s_addr = - ep->ep_domain->dom_fabric->fab_dev_attrs->uda_ipaddr_be; - ep->e.msg.ep_lcl_addr.sin_port = 0; - } else { - slen = sizeof(ep->e.msg.ep_lcl_addr); - ret = getsockname(lower_sockfd, &ep->e.msg.ep_lcl_addr, &slen); - if (ret == -1) { - return -errno; - } - assert(((struct sockaddr *)&ep->e.msg.ep_lcl_addr)->sa_family == AF_INET); - assert(slen == sizeof(ep->e.msg.ep_lcl_addr)); - } - - return 0; -} - -int -usdf_ep_msg_open(struct fid_domain *domain, struct fi_info *info, - struct fid_ep **ep_o, void *context) -{ - struct usdf_domain *udp; - struct usdf_fabric *fp; - struct usdf_tx *tx; - struct usdf_rx *rx; - struct usdf_ep *ep; - int ret; - struct usdf_connreq *connreq; - struct usdf_pep *parent_pep; - int is_bound; - uint32_t api_version; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - connreq = NULL; - parent_pep = NULL; - - ep = NULL; - rx = NULL; - tx = NULL; - if ((info->caps & ~USDF_MSG_CAPS) != 0) { - return -FI_EBADFLAGS; - } - - if (info->handle != NULL) { - switch (info->handle->fclass) { - case FI_CLASS_CONNREQ: - connreq = (struct usdf_connreq *)info->handle; - break; - case FI_CLASS_PEP: - parent_pep = pep_fidtou(info->handle); - break; - default: - USDF_WARN_SYS(EP_CTRL, - "\"handle\" should be a PEP, CONNREQ (or NULL)\n"); - return -FI_EINVAL; - } - } - - udp = dom_ftou(domain); - fp = udp->dom_fabric; - api_version = fp->fab_attr.fabric->api_version; - - /* allocate peer table if not done */ - if (udp->dom_peer_tab == NULL) { - udp->dom_peer_tab = calloc(USDF_MAX_PEERS, sizeof(ep)); - } - if (udp->dom_peer_tab == NULL) { - ret = -errno; - goto fail; - } - - ep = calloc(1, sizeof(*ep)); - if (ep == NULL) { - ret = -errno; - goto fail; - } - - ep->ep_fid.fid.fclass = FI_CLASS_EP; - ep->ep_fid.fid.context = context; - ep->ep_fid.fid.ops = &usdf_ep_msg_ops; - ep->ep_fid.ops = &usdf_base_msg_ops; - ep->ep_fid.cm = &usdf_cm_msg_ops; - ep->ep_fid.msg = &usdf_msg_ops; - ep->ep_fid.atomic = &usdf_msg_atomic_ops; - ep->ep_domain = udp; - ep->ep_caps = info->caps; - ep->ep_mode = info->mode; - ep->e.msg.ep_connreq = connreq; - ep->e.msg.ep_cm_sock = -1; - ep->ep_tx_dflt_signal_comp = 1; - ep->ep_rx_dflt_signal_comp = 1; - - ep->e.msg.ep_seq_credits = USDF_RUDP_SEQ_CREDITS; - 
TAILQ_INIT(&ep->e.msg.ep_posted_wqe); - TAILQ_INIT(&ep->e.msg.ep_sent_wqe); - --ep->e.msg.ep_last_rx_ack; - - ep->e.msg.ep_lcl_addr.sin_family = AF_INET; - ep->e.msg.ep_lcl_addr.sin_addr.s_addr = - ep->ep_domain->dom_fabric->fab_dev_attrs->uda_ipaddr_be; - ep->e.msg.ep_lcl_addr.sin_port = 0; - - if (parent_pep != NULL) { - ret = usdf_pep_steal_socket(parent_pep, &is_bound, - &ep->e.msg.ep_cm_sock); - if (ret) { - goto fail; - } - } - - ret = usdf_msg_upd_lcl_addr(ep); - if (ret) - goto fail; - - ret = usdf_timer_alloc(usdf_msg_ep_timeout, ep, - &ep->e.msg.ep_ack_timer); - if (ret != 0) { - goto fail; - } - - /* implicitly create TX context if not to be shared */ - if (info->ep_attr == NULL || - info->ep_attr->tx_ctx_cnt != FI_SHARED_CONTEXT) { - tx = calloc(1, sizeof(*tx)); - if (tx == NULL) { - ret = -errno; - goto fail; - } - tx->tx_fid.fid.fclass = FI_CLASS_TX_CTX; - ofi_atomic_initialize32(&tx->tx_refcnt, 0); - tx->tx_domain = udp; - tx->tx_progress = usdf_msg_tx_progress; - ofi_atomic_inc32(&udp->dom_refcnt); - - /* use info as the hints structure, and the output structure */ - ret = usdf_msg_fill_tx_attr(api_version, info, info); - if (ret != 0) - goto fail; - tx->tx_attr = *info->tx_attr; - - TAILQ_INIT(&tx->t.msg.tx_free_wqe); - TAILQ_INIT(&tx->t.msg.tx_ep_ready); - TAILQ_INIT(&tx->t.msg.tx_ep_have_acks); - - ep->ep_tx = tx; - ofi_atomic_inc32(&tx->tx_refcnt); - } - TAILQ_INIT(&ep->e.msg.ep_posted_wqe); - - /* implicitly create RX context if not to be shared */ - if (info->ep_attr == NULL || - info->ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT) { - rx = calloc(1, sizeof(*rx)); - if (rx == NULL) { - ret = -errno; - goto fail; - } - rx->rx_fid.fid.fclass = FI_CLASS_RX_CTX; - ofi_atomic_initialize32(&rx->rx_refcnt, 0); - rx->rx_domain = udp; - ofi_atomic_inc32(&udp->dom_refcnt); - - /* info serves as both the hints and the output */ - ret = usdf_msg_fill_rx_attr(api_version, info, info); - if (ret != 0) - goto fail; - rx->rx_attr = *info->rx_attr; - - TAILQ_INIT(&rx->r.msg.rx_free_rqe); - TAILQ_INIT(&rx->r.msg.rx_posted_rqe); - - ep->ep_rx = rx; - ofi_atomic_inc32(&rx->rx_refcnt); - } - - ofi_atomic_initialize32(&ep->ep_refcnt, 0); - ofi_atomic_inc32(&udp->dom_refcnt); - - *ep_o = ep_utof(ep); - return 0; -fail: - if (rx != NULL) { - free(rx); - ofi_atomic_dec32(&udp->dom_refcnt); - } - if (tx != NULL) { - free(tx); - ofi_atomic_dec32(&udp->dom_refcnt); - } - if (ep != NULL) { - if (ep->e.msg.ep_ack_timer != NULL) { - usdf_timer_free(fp, ep->e.msg.ep_ack_timer); - } - free(ep); - } - return ret; -} diff --git a/prov/usnic/src/usdf_ep_rdm.c b/prov/usnic/src/usdf_ep_rdm.c deleted file mode 100644 index 183a8e9d696..00000000000 --- a/prov/usnic/src/usdf_ep_rdm.c +++ /dev/null @@ -1,1146 +0,0 @@ -/* - * Copyright (c) 2014-2018, Cisco Systems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. 
- * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include "ofi.h" -#include "ofi_enosys.h" -#include "ofi_util.h" - -#include "usd.h" -#include "usdf.h" -#include "usnic_direct.h" -#include "usdf_endpoint.h" -#include "fi_ext_usnic.h" -#include "usdf_rudp.h" -#include "usdf_cq.h" -#include "usdf_cm.h" -#include "usdf_av.h" -#include "usdf_timer.h" -#include "usdf_rdm.h" - - -/******************************************************************************* - * Default values for rdm attributes - ******************************************************************************/ -static const struct fi_tx_attr rdm_dflt_tx_attr = { - .caps = USDF_RDM_CAPS, - .mode = USDF_RDM_SUPP_MODE, - .size = USDF_RDM_DFLT_CTX_SIZE, - .op_flags = 0, - .msg_order = USDF_RDM_MSG_ORDER, - .comp_order = USDF_RDM_COMP_ORDER, - .inject_size = USDF_RDM_MAX_INJECT_SIZE, - .iov_limit = USDF_RDM_IOV_LIMIT, - .rma_iov_limit = USDF_RDM_RMA_IOV_LIMIT -}; - -static const struct fi_rx_attr rdm_dflt_rx_attr = { - .caps = USDF_RDM_CAPS, - .mode = USDF_RDM_SUPP_MODE, - .size = USDF_RDM_DFLT_CTX_SIZE, - .op_flags = 0, - .msg_order = USDF_RDM_MSG_ORDER, - .comp_order = USDF_RDM_COMP_ORDER, - .total_buffered_recv = 0, - .iov_limit = USDF_RDM_DFLT_SGE -}; - -/* The protocol for RDM is still under development. Version 0 does not provide - * any interoperability. 
- */ -static const struct fi_ep_attr rdm_dflt_ep_attr = { - .type = FI_EP_RDM, - .protocol = FI_PROTO_RUDP, - .protocol_version = 0, - .max_msg_size = USDF_RDM_MAX_MSG, - .msg_prefix_size = 0, - .max_order_raw_size = 0, - .max_order_war_size = 0, - .max_order_waw_size = 0, - .mem_tag_format = 0, - .tx_ctx_cnt = 1, - .rx_ctx_cnt = 1 -}; - -static const struct fi_domain_attr rdm_dflt_domain_attr = { - .caps = USDF_DOM_CAPS, - .threading = FI_THREAD_ENDPOINT, - .control_progress = FI_PROGRESS_AUTO, - .data_progress = FI_PROGRESS_MANUAL, - .resource_mgmt = FI_RM_DISABLED, - .mr_mode = FI_MR_ALLOCATED | FI_MR_LOCAL | FI_MR_BASIC, - .cntr_cnt = USDF_RDM_CNTR_CNT, - .mr_iov_limit = USDF_RDM_MR_IOV_LIMIT, - .mr_cnt = USDF_RDM_MR_CNT, -}; - -static struct fi_ops_atomic usdf_rdm_atomic_ops = { - .size = sizeof(struct fi_ops_atomic), - .write = fi_no_atomic_write, - .writev = fi_no_atomic_writev, - .writemsg = fi_no_atomic_writemsg, - .inject = fi_no_atomic_inject, - .readwrite = fi_no_atomic_readwrite, - .readwritev = fi_no_atomic_readwritev, - .readwritemsg = fi_no_atomic_readwritemsg, - .compwrite = fi_no_atomic_compwrite, - .compwritev = fi_no_atomic_compwritev, - .compwritemsg = fi_no_atomic_compwritemsg, - .writevalid = fi_no_atomic_writevalid, - .readwritevalid = fi_no_atomic_readwritevalid, - .compwritevalid = fi_no_atomic_compwritevalid, -}; - -/******************************************************************************* - * Fill functions for attributes - ******************************************************************************/ -int usdf_rdm_fill_ep_attr(const struct fi_info *hints, struct fi_info *fi, - struct usd_device_attrs *dap) -{ - struct fi_ep_attr defaults; - - defaults = rdm_dflt_ep_attr; - - if (!hints || !hints->ep_attr) - goto out; - - if (hints->ep_attr->max_msg_size > defaults.max_msg_size) - return -FI_ENODATA; - - switch (hints->ep_attr->protocol) { - case FI_PROTO_UNSPEC: - case FI_PROTO_RUDP: - break; - default: - return -FI_ENODATA; - } - - if (hints->ep_attr->tx_ctx_cnt > defaults.tx_ctx_cnt) - return -FI_ENODATA; - - if (hints->ep_attr->rx_ctx_cnt > defaults.rx_ctx_cnt) - return -FI_ENODATA; - - if (hints->ep_attr->max_order_raw_size > defaults.max_order_raw_size) - return -FI_ENODATA; - - if (hints->ep_attr->max_order_war_size > defaults.max_order_war_size) - return -FI_ENODATA; - - if (hints->ep_attr->max_order_waw_size > defaults.max_order_waw_size) - return -FI_ENODATA; - -out: - *fi->ep_attr = defaults; - - return FI_SUCCESS; - -} - -int usdf_rdm_fill_dom_attr(uint32_t version, const struct fi_info *hints, - struct fi_info *fi, struct usd_device_attrs *dap) -{ - int ret; - struct fi_domain_attr defaults; - - defaults = rdm_dflt_domain_attr; - ret = usdf_domain_getname(version, dap, &defaults.name); - if (ret < 0) - return -FI_ENODATA; - - if (!hints || !hints->domain_attr) - goto catch; - - /* how to handle fi_thread_fid, fi_thread_completion, etc? - */ - switch (hints->domain_attr->threading) { - case FI_THREAD_UNSPEC: - case FI_THREAD_ENDPOINT: - break; - default: - return -FI_ENODATA; - } - - /* how to handle fi_progress_manual? 
- */ - switch (hints->domain_attr->control_progress) { - case FI_PROGRESS_UNSPEC: - case FI_PROGRESS_AUTO: - break; - default: - return -FI_ENODATA; - } - - switch (hints->domain_attr->data_progress) { - case FI_PROGRESS_UNSPEC: - case FI_PROGRESS_MANUAL: - break; - default: - return -FI_ENODATA; - } - - switch (hints->domain_attr->resource_mgmt) { - case FI_RM_UNSPEC: - case FI_RM_DISABLED: - break; - default: - return -FI_ENODATA; - } - - switch (hints->domain_attr->caps) { - case 0: - case FI_REMOTE_COMM: - break; - default: - USDF_WARN_SYS(DOMAIN, - "invalid domain capabilities\n"); - return -FI_ENODATA; - } - - if (ofi_check_mr_mode(&usdf_ops, version, defaults.mr_mode, hints)) - return -FI_ENODATA; - - if (hints->domain_attr->mr_cnt <= USDF_RDM_MR_CNT) { - defaults.mr_cnt = hints->domain_attr->mr_cnt; - } else { - USDF_DBG_SYS(DOMAIN, "mr_count exceeded provider limit\n"); - return -FI_ENODATA; - } - -catch: - /* catch the version changes here. */ - ret = usdf_catch_dom_attr(version, hints, &defaults); - if (ret) - return ret; - - *fi->domain_attr = defaults; - - return FI_SUCCESS; -} - -int usdf_rdm_fill_tx_attr(uint32_t version, const struct fi_info *hints, - struct fi_info *fi) -{ - int ret; - struct fi_tx_attr defaults; - - defaults = rdm_dflt_tx_attr; - - if (!hints || !hints->tx_attr) - goto catch; - - /* make sure we can support the caps that are requested*/ - if (hints->tx_attr->caps & ~USDF_RDM_CAPS) - return -FI_ENODATA; - - /* clear the mode bits the app doesn't support */ - if (hints->mode || hints->tx_attr->mode) - defaults.mode &= (hints->mode | hints->tx_attr->mode); - - defaults.op_flags |= hints->tx_attr->op_flags; - - if ((hints->tx_attr->msg_order | USDF_RDM_MSG_ORDER) != - USDF_RDM_MSG_ORDER) - return -FI_ENODATA; - - if ((hints->tx_attr->comp_order | USDF_RDM_COMP_ORDER) != - USDF_RDM_COMP_ORDER) - return -FI_ENODATA; - - if (hints->tx_attr->inject_size > defaults.inject_size) - return -FI_ENODATA; - - if (hints->tx_attr->iov_limit > defaults.iov_limit) - return -FI_ENODATA; - - if (hints->tx_attr->rma_iov_limit > defaults.rma_iov_limit) - return -FI_ENODATA; - - if (hints->tx_attr->size > defaults.size) - return -FI_ENODATA; - -catch: - /* catch version changes here. */ - ret = usdf_catch_tx_attr(version, &defaults); - if (ret) - return ret; - - *fi->tx_attr = defaults; - - return FI_SUCCESS; -} - -int usdf_rdm_fill_rx_attr(uint32_t version, const struct fi_info *hints, - struct fi_info *fi) -{ - int ret; - struct fi_rx_attr defaults; - - defaults = rdm_dflt_rx_attr; - - if (!hints || !hints->rx_attr) - goto catch; - - /* make sure we can support the capabilities that are requested */ - if (hints->rx_attr->caps & ~USDF_RDM_CAPS) - return -FI_ENODATA; - - /* clear the mode bits the app doesn't support */ - if (hints->mode || hints->rx_attr->mode) - defaults.mode &= (hints->mode | hints->rx_attr->mode); - - defaults.op_flags |= hints->rx_attr->op_flags; - - if ((hints->rx_attr->msg_order | USDF_RDM_MSG_ORDER) != - USDF_RDM_MSG_ORDER) - return -FI_ENODATA; - if ((hints->rx_attr->comp_order | USDF_RDM_COMP_ORDER) != - USDF_RDM_COMP_ORDER) - return -FI_ENODATA; - - if (hints->rx_attr->total_buffered_recv > - defaults.total_buffered_recv) - return -FI_ENODATA; - - if (hints->rx_attr->iov_limit > defaults.iov_limit) - return -FI_ENODATA; - - if (hints->rx_attr->size > defaults.size) - return -FI_ENODATA; - -catch: - /* catch version changes here. 
*/ - ret = usdf_catch_rx_attr(version, &defaults); - if (ret) - return ret; - - *fi->rx_attr = defaults; - - return FI_SUCCESS; -} - -static int -usdf_tx_rdm_enable(struct usdf_tx *tx) -{ - struct usdf_rdm_qe *wqe; - struct usdf_domain *udp; - struct usdf_cq_hard *hcq; - struct usd_filter filt; - int ret; - size_t i; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - udp = tx->tx_domain; - - hcq = tx->t.rdm.tx_hcq; - if (hcq == NULL) { - return -FI_ENOCQ; - } - - /* XXX temp until we can allocate WQ and RQ independently */ - filt.uf_type = USD_FTY_UDP; - filt.uf_filter.uf_udp.u_port = 0; - ret = usd_create_qp(udp->dom_dev, - USD_QTR_UDP, - USD_QTY_UD, - hcq->cqh_ucq, - hcq->cqh_ucq, - udp->dom_fabric->fab_dev_attrs->uda_max_send_credits, - udp->dom_fabric->fab_dev_attrs->uda_max_recv_credits, - &filt, - &tx->tx_qp); - if (ret != 0) { - goto fail; - } - tx->tx_qp->uq_context = tx; - - /* rdm send queue */ - tx->t.rdm.tx_wqe_buf = malloc(tx->tx_attr.size * - sizeof(struct usdf_rdm_qe)); - if (tx->t.rdm.tx_wqe_buf == NULL) { - ret = -errno; - goto fail; - } - - ret = usd_alloc_mr(tx->tx_domain->dom_dev, - tx->tx_attr.size * USDF_RDM_MAX_INJECT_SIZE, - (void **)&tx->t.rdm.tx_inject_bufs); - if (ret) { - USDF_INFO("usd_alloc_mr failed (%s)\n", strerror(-ret)); - goto fail; - } - - /* populate free list */ - TAILQ_INIT(&tx->t.rdm.tx_free_wqe); - wqe = tx->t.rdm.tx_wqe_buf; - for (i = 0; i < tx->tx_attr.size; ++i) { - wqe->rd_inject_buf = - &tx->t.rdm.tx_inject_bufs[USDF_RDM_MAX_INJECT_SIZE * i]; - TAILQ_INSERT_TAIL(&tx->t.rdm.tx_free_wqe, wqe, rd_link); - ++wqe; - } - tx->t.rdm.tx_num_free_wqe = tx->tx_attr.size; - - return 0; - -fail: - if (tx->t.rdm.tx_wqe_buf != NULL) { - free(tx->t.rdm.tx_wqe_buf); - tx->t.rdm.tx_wqe_buf = NULL; - TAILQ_INIT(&tx->t.rdm.tx_free_wqe); - tx->t.rdm.tx_num_free_wqe = 0; - } - - if (tx->t.rdm.tx_inject_bufs != NULL) { - usd_free_mr(tx->t.rdm.tx_inject_bufs); - tx->t.rdm.tx_inject_bufs = NULL; - } - - if (tx->tx_qp != NULL) { - usd_destroy_qp(tx->tx_qp); - } - return ret; -} - -static int -usdf_rx_rdm_enable(struct usdf_rx *rx) -{ - struct usdf_domain *udp; - struct usdf_cq_hard *hcq; - struct usdf_rdm_qe *rqe; - struct usd_filter filt; - struct usd_qp_impl *qp; - uint8_t *ptr; - size_t mtu; - int ret; - size_t i; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - udp = rx->rx_domain; - - hcq = rx->r.rdm.rx_hcq; - if (hcq == NULL) { - return -FI_ENOCQ; - } - - /* XXX temp until we can allocate WQ and RQ independently */ - filt.uf_type = USD_FTY_UDP_SOCK; - filt.uf_filter.uf_udp_sock.u_sock = rx->r.rdm.rx_sock; - ret = usd_create_qp(udp->dom_dev, - USD_QTR_UDP, - USD_QTY_UD, - hcq->cqh_ucq, - hcq->cqh_ucq, - udp->dom_fabric->fab_dev_attrs->uda_max_send_credits, - udp->dom_fabric->fab_dev_attrs->uda_max_recv_credits, - &filt, - &rx->rx_qp); - if (ret != 0) { - goto fail; - } - rx->rx_qp->uq_context = rx; - qp = to_qpi(rx->rx_qp); - - /* receive buffers */ - mtu = rx->rx_domain->dom_fabric->fab_dev_attrs->uda_mtu; - ret = usd_alloc_mr(rx->rx_domain->dom_dev, - qp->uq_rq.urq_num_entries * mtu, - (void **)&rx->r.rdm.rx_bufs); - if (ret != 0) { - goto fail; - } - - /* post all the buffers */ - ptr = rx->r.rdm.rx_bufs; - for (i = 0; i < qp->uq_rq.urq_num_entries - 1; ++i) { - usdf_rdm_post_recv(rx, ptr, mtu); - ptr += mtu; - } - - /* rdm recv queue */ - rx->r.rdm.rx_rqe_buf = malloc(rx->rx_attr.size * - sizeof(struct usdf_rdm_qe)); - if (rx->r.rdm.rx_rqe_buf == NULL) { - ret = -errno; - goto fail; - } - - /* populate free list */ - TAILQ_INIT(&rx->r.rdm.rx_free_rqe); - rqe = 
rx->r.rdm.rx_rqe_buf; - for (i = 0; i < rx->rx_attr.size; ++i) { - TAILQ_INSERT_TAIL(&rx->r.rdm.rx_free_rqe, rqe, rd_link); - ++rqe; - } - rx->r.rdm.rx_num_free_rqe = rx->rx_attr.size; - - return 0; - -fail: - if (rx->r.rdm.rx_rqe_buf != NULL) { - free(rx->r.rdm.rx_rqe_buf); - rx->r.rdm.rx_rqe_buf = NULL; - TAILQ_INIT(&rx->r.rdm.rx_free_rqe); - rx->r.rdm.rx_num_free_rqe = 0; - } - if (rx->r.rdm.rx_bufs != NULL) { - usd_free_mr(rx->r.rdm.rx_bufs); - rx->r.rdm.rx_bufs = NULL; - } - if (rx->rx_qp != NULL) { - usd_destroy_qp(rx->rx_qp); - } - return ret; -} - -/* - * Allocate any missing queue resources for this endpoint - */ -static int -usdf_ep_rdm_get_queues(struct usdf_ep *ep) -{ - struct usdf_tx *tx; - struct usdf_rx *rx; - int ret; - - /* Must have TX context at this point */ - tx = ep->ep_tx; - if (tx == NULL) { - ret = -FI_EINVAL; - goto fail; - } - if (tx->tx_qp == NULL) { - ret = usdf_tx_rdm_enable(tx); - if (ret != 0) { - goto fail; - } - } - - /* Must have RX context at this point */ - rx = ep->ep_rx; - if (rx == NULL) { - ret = -FI_EINVAL; - goto fail; - } - if (rx->rx_qp == NULL) { - ret = usdf_rx_rdm_enable(rx); - if (ret != 0) { - goto fail; - } - } - - return 0; -fail: - return ret; -} - -static int -usdf_ep_rdm_enable(struct fid_ep *fep) -{ - struct usdf_ep *ep; - int ret; - - ep = ep_ftou(fep); - - ret = usdf_ep_rdm_get_queues(ep); - if (ret == FI_SUCCESS) - ep->flags |= USDF_EP_ENABLED; - - return ret; -} - -static ssize_t -usdf_ep_rdm_cancel(fid_t fid, void *context) -{ - USDF_TRACE_SYS(EP_CTRL, "\n"); - /* XXX should this have a non-empty implementation? */ - return 0; -} - -/* - * Find a hard CQ within this soft CQ that services message EPs - */ -static struct usdf_cq_hard * -usdf_ep_rdm_find_cqh(struct usdf_cq *cq) -{ - struct usdf_cq_hard *hcq; - - TAILQ_FOREACH(hcq, &cq->c.soft.cq_list, cqh_link) { - if (hcq->cqh_progress == usdf_rdm_hcq_progress) { - return hcq; - } - } - return NULL; -} - -static int -usdf_ep_rdm_bind_cq(struct usdf_ep *ep, struct usdf_cq *cq, uint64_t flags) -{ - struct usdf_cq_hard **hcqp; - struct usdf_cq_hard *hcq; - int ret; - - /* - * The CQ is actually bound the RX or TX ctx, not the EP directly - */ - if (flags & FI_SEND) { - /* if TX is shared, but bind directly */ - if (ep->ep_tx->tx_fid.fid.fclass == FI_CLASS_STX_CTX) { - return -FI_EINVAL; - } - hcqp = &ep->ep_tx->t.rdm.tx_hcq; - } else { - /* if RX is shared, but bind directly */ - if (ep->ep_rx->rx_fid.fid.fclass == FI_CLASS_SRX_CTX) { - return -FI_EINVAL; - } - hcqp = &ep->ep_rx->r.rdm.rx_hcq; - } - if (*hcqp != NULL) { - return -FI_EINVAL; - } - - /* Make sure this CQ is "soft" */ - ret = usdf_cq_make_soft(cq); - if (ret != 0) { - return ret; - } - - if ((cq->cq_attr.wait_obj == FI_WAIT_FD) || - (cq->cq_attr.wait_obj == FI_WAIT_SET)) { - cq->object.fd = eventfd(0, EFD_NONBLOCK); - if (cq->object.fd == -1) { - USDF_DBG_SYS(CQ, "creating eventfd failed: %s\n", - strerror(errno)); - return -errno; - } - - USDF_DBG_SYS(CQ, "successfully created eventfd: %d\n", - cq->object.fd); - } - - /* Use existing rdm CQ if present */ - hcq = usdf_ep_rdm_find_cqh(cq); - if (hcq == NULL) { - hcq = malloc(sizeof(*hcq)); - if (hcq == NULL) { - return -errno; - } - - ret = usdf_cq_create_cq(cq, &hcq->cqh_ucq, false); - if (ret) - goto fail; - - hcq->cqh_cq = cq; - ofi_atomic_initialize32(&hcq->cqh_refcnt, 0); - hcq->cqh_progress = usdf_rdm_hcq_progress; - hcq->cqh_post = usdf_cq_post_soft; - TAILQ_INSERT_TAIL(&cq->c.soft.cq_list, hcq, cqh_link); - - /* add to domain progression list */ - 
TAILQ_INSERT_TAIL(&ep->ep_domain->dom_hcq_list, - hcq, cqh_dom_link); - } - ofi_atomic_inc32(&hcq->cqh_refcnt); - ofi_atomic_inc32(&cq->cq_refcnt); - *hcqp = hcq; - return 0; - -fail: - if (hcq != NULL) { - free(hcq); - } - return ret; -} - -static int -usdf_ep_rdm_bind(struct fid *fid, struct fid *bfid, uint64_t flags) -{ - int ret; - struct usdf_ep *ep; - struct usdf_cq *cq; - struct usdf_av *av; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - /* Check if the binding flags are valid. */ - ret = ofi_ep_bind_valid(&usdf_ops, bfid, flags); - if (ret) - return ret; - - ep = ep_fidtou(fid); - - switch (bfid->fclass) { - - case FI_CLASS_AV: - if (ep->e.rdm.ep_av != NULL) { - return -FI_EINVAL; - } - - av = av_fidtou(bfid); - ep->e.rdm.ep_av = av; - ofi_atomic_inc32(&av->av_refcnt); - break; - - case FI_CLASS_CQ: - if (flags & FI_SEND) { - cq = cq_fidtou(bfid); - if (flags & FI_SELECTIVE_COMPLETION) - ep->ep_tx_dflt_signal_comp = 0; - else - ep->ep_tx_dflt_signal_comp = 1; - usdf_ep_rdm_bind_cq(ep, cq, FI_SEND); - } - - if (flags & FI_RECV) { - cq = cq_fidtou(bfid); - if (flags & FI_SELECTIVE_COMPLETION) - ep->ep_rx_dflt_signal_comp = 0; - else - ep->ep_rx_dflt_signal_comp = 1; - usdf_ep_rdm_bind_cq(ep, cq, FI_RECV); - } - break; - - case FI_CLASS_EQ: - if (ep->ep_eq != NULL) { - return -FI_EINVAL; - } - ep->ep_eq = eq_fidtou(bfid); - ofi_atomic_inc32(&ep->ep_eq->eq_refcnt); - break; - default: - return -FI_EINVAL; - } - - return 0; -} - -/* - * XXX clean up pending transmits - */ -static int -usdf_rdm_rx_ctx_close(fid_t fid) -{ - struct usdf_rx *rx; - struct usdf_cq_hard *hcq; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - rx = rx_fidtou(fid); - - if (ofi_atomic_get32(&rx->rx_refcnt) > 0) { - return -FI_EBUSY; - } - - hcq = rx->r.rdm.rx_hcq; - if (hcq != NULL) { - ofi_atomic_dec32(&hcq->cqh_refcnt); - ofi_atomic_dec32(&hcq->cqh_cq->cq_refcnt); - } - if (rx->r.rdm.rx_sock != -1) { - close(rx->r.rdm.rx_sock); - } - - if (rx->rx_qp != NULL) { - usd_free_mr(rx->r.rdm.rx_bufs); - free(rx->r.rdm.rx_rqe_buf); - usd_destroy_qp(rx->rx_qp); - } - ofi_atomic_dec32(&rx->rx_domain->dom_refcnt); - - free(rx); - - return 0; -} - -/* - * XXX clean up pending receives - */ -static int -usdf_rdm_tx_ctx_close(fid_t fid) -{ - struct usdf_tx *tx; - struct usdf_cq_hard *hcq; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - tx = tx_fidtou(fid); - - if (ofi_atomic_get32(&tx->tx_refcnt) > 0) { - return -FI_EBUSY; - } - - hcq = tx->t.rdm.tx_hcq; - if (hcq != NULL) { - ofi_atomic_dec32(&hcq->cqh_refcnt); - ofi_atomic_dec32(&hcq->cqh_cq->cq_refcnt); - } - - if (tx->tx_qp != NULL) { - usd_free_mr(tx->t.rdm.tx_inject_bufs); - free(tx->t.rdm.tx_wqe_buf); - usd_destroy_qp(tx->tx_qp); - } - ofi_atomic_dec32(&tx->tx_domain->dom_refcnt); - - free(tx); - - return 0; -} - -static int -usdf_rx_rdm_port_bind(struct usdf_rx *rx, struct fi_info *info) -{ - struct sockaddr_in *sin; - struct sockaddr_in src; - socklen_t addrlen; - int ret; - - if (info->src_addr != NULL) { - switch (info->addr_format) { - case FI_SOCKADDR: - case FI_SOCKADDR_IN: - case FI_ADDR_STR: - sin = usdf_format_to_sin(info, info->src_addr); - if (NULL == sin) { - return -FI_ENOMEM; - } - break; - default: - return -FI_EINVAL; - } - } else { - memset(&src, 0, sizeof(src)); - sin = &src; - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = - rx->rx_domain->dom_fabric->fab_dev_attrs->uda_ipaddr_be; - } - - rx->r.rdm.rx_sock = socket(AF_INET, SOCK_DGRAM, 0); - if (rx->r.rdm.rx_sock == -1) { - return -errno; - } - ret = bind(rx->r.rdm.rx_sock, (struct sockaddr *)sin, sizeof(*sin)); - if (ret 
== -1) { - return -errno; - } - - addrlen = sizeof(*sin); - ret = getsockname(rx->r.rdm.rx_sock, (struct sockaddr *)sin, &addrlen); - if (ret == -1) { - return -errno; - } - - /* This has to be here because usdf_sin_to_format will allocate - * new piece of memory if the string conversion happens. - */ - if (info->addr_format == FI_ADDR_STR) - free(info->src_addr); - - info->src_addr = usdf_sin_to_format(info, sin, &info->src_addrlen); - - return 0; -} - -static int -usdf_ep_rdm_close(fid_t fid) -{ - struct usdf_ep *ep; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - ep = ep_fidtou(fid); - - if (ofi_atomic_get32(&ep->ep_refcnt) > 0) { - return -FI_EBUSY; - } - - if (ep->ep_rx != NULL) { - ofi_atomic_dec32(&ep->ep_rx->rx_refcnt); - if (rx_utofid(ep->ep_rx)->fclass == FI_CLASS_RX_CTX) { - (void) usdf_rdm_rx_ctx_close(rx_utofid(ep->ep_rx)); - } - } - - if (ep->ep_tx != NULL) { - ofi_atomic_dec32(&ep->ep_tx->tx_refcnt); - if (tx_utofid(ep->ep_tx)->fclass == FI_CLASS_TX_CTX) { - (void) usdf_rdm_tx_ctx_close(tx_utofid(ep->ep_tx)); - } - } - - ofi_atomic_dec32(&ep->ep_domain->dom_refcnt); - if (ep->ep_eq != NULL) { - ofi_atomic_dec32(&ep->ep_eq->eq_refcnt); - } - - if (ep->e.rdm.ep_av) - ofi_atomic_dec32(&ep->e.rdm.ep_av->av_refcnt); - - free(ep); - return 0; -} - -static struct fi_ops_ep usdf_base_rdm_ops = { - .size = sizeof(struct fi_ops_ep), - .cancel = usdf_ep_rdm_cancel, - .getopt = usdf_ep_getopt_unconnected, - .setopt = usdf_ep_setopt, - .tx_ctx = fi_no_tx_ctx, - .rx_ctx = fi_no_rx_ctx, - .rx_size_left = usdf_rdm_rx_size_left, - .tx_size_left = usdf_rdm_tx_size_left, -}; - -static struct fi_ops_cm usdf_cm_rdm_ops = { - .size = sizeof(struct fi_ops_cm), - .setname = fi_no_setname, - .getname = usdf_cm_rdm_getname, - .getpeer = fi_no_getpeer, - .connect = fi_no_connect, - .listen = fi_no_listen, - .accept = fi_no_accept, - .reject = fi_no_reject, - .shutdown = fi_no_shutdown, - .join = fi_no_join, -}; - -static struct fi_ops_msg usdf_rdm_ops = { - .size = sizeof(struct fi_ops_msg), - .recv = usdf_rdm_recv, - .recvv = usdf_rdm_recvv, - .recvmsg = usdf_rdm_recvmsg, - .send = usdf_rdm_send, - .sendv = usdf_rdm_sendv, - .sendmsg = usdf_rdm_sendmsg, - .inject = usdf_rdm_inject, - .senddata = fi_no_msg_senddata, - .injectdata = fi_no_msg_injectdata, -}; - -static int usdf_ep_rdm_control(struct fid *fid, int command, void *arg) -{ - struct fid_ep *ep; - int ret; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - switch (fid->fclass) { - case FI_CLASS_EP: - ep = container_of(fid, struct fid_ep, fid); - switch (command) { - case FI_ENABLE: - ret = usdf_ep_rdm_enable(ep); - break; - default: - ret = -FI_ENOSYS; - } - break; - default: - ret = -FI_ENOSYS; - } - - return ret; -} - -static struct fi_ops usdf_ep_rdm_ops = { - .size = sizeof(struct fi_ops), - .close = usdf_ep_rdm_close, - .bind = usdf_ep_rdm_bind, - .control = usdf_ep_rdm_control, - .ops_open = fi_no_ops_open -}; - -int -usdf_ep_rdm_open(struct fid_domain *domain, struct fi_info *info, - struct fid_ep **ep_o, void *context) -{ - struct usdf_domain *udp; - struct usdf_tx *tx; - struct usdf_rx *rx; - struct usdf_ep *ep; - int ret; - uint32_t api_version; - - USDF_TRACE_SYS(EP_CTRL, "\n"); - - ep = NULL; - rx = NULL; - tx = NULL; - if ((info->caps & ~USDF_RDM_CAPS) != 0) { - return -FI_EBADFLAGS; - } - - udp = dom_ftou(domain); - api_version = udp->dom_fabric->fab_attr.fabric->api_version; - - /* allocate peer table if not done */ - if (udp->dom_peer_tab == NULL) { - udp->dom_peer_tab = calloc(USDF_MAX_PEERS, sizeof(ep)); - } - if (udp->dom_peer_tab == NULL) { - 
ret = -errno; - goto fail; - } - - ep = calloc(1, sizeof(*ep)); - if (ep == NULL) { - ret = -errno; - goto fail; - } - - ep->ep_fid.fid.fclass = FI_CLASS_EP; - ep->ep_fid.fid.context = context; - ep->ep_fid.fid.ops = &usdf_ep_rdm_ops; - ep->ep_fid.ops = &usdf_base_rdm_ops; - ep->ep_fid.cm = &usdf_cm_rdm_ops; - ep->ep_fid.msg = &usdf_rdm_ops; - ep->ep_fid.atomic = &usdf_rdm_atomic_ops; - ep->ep_domain = udp; - ep->ep_caps = info->caps; - ep->ep_mode = info->mode; - ep->ep_tx_dflt_signal_comp = 1; - ep->ep_rx_dflt_signal_comp = 1; - - /* implicitly create TX context if not to be shared */ - if (info->ep_attr == NULL || - info->ep_attr->tx_ctx_cnt != FI_SHARED_CONTEXT) { - tx = calloc(1, sizeof(*tx)); - if (tx == NULL) { - ret = -errno; - goto fail; - } - tx->tx_fid.fid.fclass = FI_CLASS_TX_CTX; - ofi_atomic_initialize32(&tx->tx_refcnt, 0); - tx->tx_domain = udp; - tx->tx_progress = usdf_rdm_tx_progress; - ofi_atomic_initialize32(&tx->t.rdm.tx_next_msg_id, 1); - ofi_atomic_inc32(&udp->dom_refcnt); - - /* info is both hints and output */ - ret = usdf_rdm_fill_tx_attr(api_version, info, info); - if (ret) - goto fail; - tx->tx_attr = *info->tx_attr; - - TAILQ_INIT(&tx->t.rdm.tx_free_wqe); - TAILQ_INIT(&tx->t.rdm.tx_rdc_ready); - TAILQ_INIT(&tx->t.rdm.tx_rdc_have_acks); - - ep->ep_tx = tx; - ofi_atomic_inc32(&tx->tx_refcnt); - } - - /* implicitly create RX context if not to be shared */ - if (info->ep_attr == NULL || - info->ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT) { - rx = calloc(1, sizeof(*rx)); - if (rx == NULL) { - ret = -errno; - goto fail; - } - - rx->rx_fid.fid.fclass = FI_CLASS_RX_CTX; - ofi_atomic_initialize32(&rx->rx_refcnt, 0); - rx->rx_domain = udp; - rx->r.rdm.rx_tx = tx; - rx->r.rdm.rx_sock = -1; - ofi_atomic_inc32(&udp->dom_refcnt); - - ret = usdf_rx_rdm_port_bind(rx, info); - if (ret) { - goto fail; - } - - /* info is both hints and output */ - ret = usdf_rdm_fill_rx_attr(api_version, info, info); - if (ret) { - goto fail; - } - rx->rx_attr = *info->rx_attr; - - TAILQ_INIT(&rx->r.rdm.rx_free_rqe); - TAILQ_INIT(&rx->r.rdm.rx_posted_rqe); - - ep->ep_rx = rx; - ofi_atomic_inc32(&rx->rx_refcnt); - } - - ofi_atomic_initialize32(&ep->ep_refcnt, 0); - ofi_atomic_inc32(&udp->dom_refcnt); - - *ep_o = ep_utof(ep); - return 0; -fail: - if (rx != NULL) { - if (rx->r.rdm.rx_sock != -1) { - close(rx->r.rdm.rx_sock); - } - free(rx); - ofi_atomic_dec32(&udp->dom_refcnt); - } - if (tx != NULL) { - free(tx); - ofi_atomic_dec32(&udp->dom_refcnt); - } - if (ep != NULL) { - free(ep); - } - return ret; -} diff --git a/prov/usnic/src/usdf_eq.c b/prov/usnic/src/usdf_eq.c index 0e14f744200..5030b73d552 100644 --- a/prov/usnic/src/usdf_eq.c +++ b/prov/usnic/src/usdf_eq.c @@ -433,7 +433,6 @@ usdf_eq_control(fid_t fid, int command, void *arg) static int usdf_eq_bind_wait(struct usdf_eq *eq) { int ret; - struct epoll_event event = {0}; struct usdf_wait *wait_priv; if (!eq->eq_attr.wait_set) { @@ -443,9 +442,6 @@ static int usdf_eq_bind_wait(struct usdf_eq *eq) wait_priv = wait_ftou(eq->eq_attr.wait_set); - event.data.ptr = eq; - event.events = EPOLLIN; - ret = fid_list_insert(&wait_priv->list, &wait_priv->lock, &eq->eq_fid.fid); if (ret) { @@ -454,8 +450,7 @@ static int usdf_eq_bind_wait(struct usdf_eq *eq) return ret; } - ret = epoll_ctl(wait_priv->object.epfd, EPOLL_CTL_ADD, eq->eq_fd, - &event); + ret = ofi_epoll_add(wait_priv->object.epfd, eq->eq_fd, OFI_EPOLL_IN, eq); if (ret) { USDF_WARN_SYS(EQ, "failed to associate FD with wait set\n"); goto err; @@ -475,7 +470,6 @@ static int 
usdf_eq_unbind_wait(struct usdf_eq *eq) { int ret; struct usdf_wait *wait_priv; - struct epoll_event event = {0}; if (!eq->eq_attr.wait_set) { USDF_DBG_SYS(EQ, "can't unbind from non-existent wait set\n"); @@ -484,12 +478,11 @@ static int usdf_eq_unbind_wait(struct usdf_eq *eq) wait_priv = wait_ftou(eq->eq_attr.wait_set); - ret = epoll_ctl(wait_priv->object.epfd, EPOLL_CTL_DEL, - eq->eq_fd, &event); + ret = ofi_epoll_del(wait_priv->object.epfd, eq->eq_fd); if (ret) { USDF_WARN_SYS(EQ, "failed to remove FD from wait set\n"); - return -errno; + return ret; } fid_list_remove(&wait_priv->list, &wait_priv->lock, &eq->eq_fid.fid); diff --git a/prov/usnic/src/usdf_ext.c b/prov/usnic/src/usdf_ext.c index a171d2a447f..eefdec67908 100644 --- a/prov/usnic/src/usdf_ext.c +++ b/prov/usnic/src/usdf_ext.c @@ -57,7 +57,8 @@ usdf_usnic_getinfo_v1(uint32_t version, struct fid_fabric *fabric, uip->ui.v1.ui_link_speed = dap->uda_bandwidth; uip->ui.v1.ui_netmask_be = dap->uda_netmask_be; - strcpy(uip->ui.v1.ui_ifname, dap->uda_ifname); + snprintf(uip->ui.v1.ui_ifname, sizeof(uip->ui.v1.ui_ifname), "%s", + dap->uda_ifname); uip->ui.v1.ui_num_vf = dap->uda_num_vf; uip->ui.v1.ui_qp_per_vf = dap->uda_qp_per_vf; uip->ui.v1.ui_cq_per_vf = dap->uda_cq_per_vf; diff --git a/prov/usnic/src/usdf_fabric.c b/prov/usnic/src/usdf_fabric.c index 79d89c0804d..65b2e5daaa2 100644 --- a/prov/usnic/src/usdf_fabric.c +++ b/prov/usnic/src/usdf_fabric.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include #include @@ -71,8 +71,6 @@ #include "usdf_progress.h" #include "usdf_timer.h" #include "usdf_dgram.h" -#include "usdf_msg.h" -#include "usdf_rdm.h" #include "usdf_cm.h" struct usdf_usnic_info *__usdf_devinfo; @@ -514,190 +512,6 @@ static int usdf_fill_info_dgram( return ret; } -static int usdf_fill_info_msg( - uint32_t version, - const struct fi_info *hints, - void *src, - void *dest, - struct usd_device_attrs *dap, - struct fi_info **fi_first, - struct fi_info **fi_last) -{ - struct fi_info *fi; - struct fi_fabric_attr *fattrp; - uint32_t addr_format; - int ret; - - fi = fi_allocinfo(); - if (fi == NULL) { - ret = -FI_ENOMEM; - goto fail; - } - - fi->caps = USDF_MSG_CAPS; - - ret = validate_modebits(version, hints, - USDF_MSG_SUPP_MODE, &fi->mode); - if (ret) - goto fail; - - if (hints != NULL) { - addr_format = hints->addr_format; - - /* check that we are capable of what's requested */ - if ((hints->caps & ~USDF_MSG_CAPS) != 0) { - ret = -FI_ENODATA; - goto fail; - } - - fi->handle = hints->handle; - } else { - addr_format = FI_FORMAT_UNSPEC; - } - - fi->ep_attr->type = FI_EP_MSG; - - ret = usdf_fill_addr_info(fi, addr_format, src, dest, dap); - if (ret != 0) { - goto fail; - } - - /* fabric attrs */ - fattrp = fi->fabric_attr; - ret = usdf_fabric_getname(version, dap, &fattrp->name); - if (ret < 0 || fattrp->name == NULL) { - ret = -FI_ENOMEM; - goto fail; - } - - ret = usdf_msg_fill_ep_attr(hints, fi, dap); - if (ret) - goto fail; - - ret = usdf_msg_fill_dom_attr(version, hints, fi, dap); - if (ret) - goto fail; - - ret = usdf_msg_fill_tx_attr(version, hints, fi); - if (ret) - goto fail; - - ret = usdf_msg_fill_rx_attr(version, hints, fi); - if (ret) - goto fail; - - ret = usdf_alloc_fid_nic(fi, dap); - if 
(ret) - goto fail; - - /* add to tail of list */ - if (*fi_first == NULL) { - *fi_first = fi; - } else { - (*fi_last)->next = fi; - } - *fi_last = fi; - - return 0; - -fail: - if (fi != NULL) { - fi_freeinfo(fi); - } - return ret; -} - -static int usdf_fill_info_rdm( - uint32_t version, - const struct fi_info *hints, - void *src, - void *dest, - struct usd_device_attrs *dap, - struct fi_info **fi_first, - struct fi_info **fi_last) -{ - struct fi_info *fi; - struct fi_fabric_attr *fattrp; - uint32_t addr_format; - int ret; - - fi = fi_allocinfo(); - if (fi == NULL) { - ret = -FI_ENOMEM; - goto fail; - } - - fi->caps = USDF_RDM_CAPS; - - ret = validate_modebits(version, hints, - USDF_RDM_SUPP_MODE, &fi->mode); - if (ret) - goto fail; - - if (hints != NULL) { - addr_format = hints->addr_format; - /* check that we are capable of what's requested */ - if ((hints->caps & ~USDF_RDM_CAPS) != 0) { - ret = -FI_ENODATA; - goto fail; - } - - fi->handle = hints->handle; - } else { - addr_format = FI_FORMAT_UNSPEC; - } - fi->ep_attr->type = FI_EP_RDM; - - ret = usdf_fill_addr_info(fi, addr_format, src, dest, dap); - if (ret != 0) { - goto fail; - } - - /* fabric attrs */ - fattrp = fi->fabric_attr; - ret = usdf_fabric_getname(version, dap, &fattrp->name); - if (ret < 0 || fattrp->name == NULL) { - ret = -FI_ENOMEM; - goto fail; - } - - ret = usdf_rdm_fill_ep_attr(hints, fi, dap); - if (ret) - goto fail; - - ret = usdf_rdm_fill_dom_attr(version, hints, fi, dap); - if (ret) - goto fail; - - ret = usdf_rdm_fill_tx_attr(version, hints, fi); - if (ret) - goto fail; - - ret = usdf_rdm_fill_rx_attr(version, hints, fi); - if (ret) - goto fail; - - ret = usdf_alloc_fid_nic(fi, dap); - if (ret) - goto fail; - - /* add to tail of list */ - if (*fi_first == NULL) { - *fi_first = fi; - } else { - (*fi_last)->next = fi; - } - *fi_last = fi; - - return 0; - -fail: - if (fi != NULL) { - fi_freeinfo(fi); - } - return ret; -} - static int usdf_get_devinfo(void) { @@ -1015,22 +829,6 @@ usdf_getinfo(uint32_t version, const char *node, const char *service, goto fail; } } - - if (ep_type == FI_EP_MSG || ep_type == FI_EP_UNSPEC) { - ret = usdf_fill_info_msg(version, hints, src, dest, - dap, &fi_first, &fi_last); - if (ret != 0 && ret != -FI_ENODATA) { - goto fail; - } - } - - if (ep_type == FI_EP_RDM || ep_type == FI_EP_UNSPEC) { - ret = usdf_fill_info_rdm(version, hints, src, dest, - dap, &fi_first, &fi_last); - if (ret != 0 && ret != -FI_ENODATA) { - goto fail; - } - } } if (fi_first != NULL) { @@ -1080,8 +878,8 @@ usdf_fabric_close(fid_t fid) pthread_join(fp->fab_thread, &rv); } usdf_timer_deinit(fp); - if (fp->fab_epollfd != -1) { - close(fp->fab_epollfd); + if (fp->fab_epollfd != OFI_EPOLL_INVALID) { + ofi_epoll_close(fp->fab_epollfd); } if (fp->fab_eventfd != -1) { close(fp->fab_eventfd); @@ -1119,7 +917,6 @@ usdf_fabric_open(struct fi_fabric_attr *fattrp, struct fid_fabric **fabric, struct usdf_fabric *fp; struct usdf_usnic_info *dp; struct usdf_dev_entry *dep; - struct epoll_event ev; struct sockaddr_in sin; int ret; int d; @@ -1146,7 +943,7 @@ usdf_fabric_open(struct fi_fabric_attr *fattrp, struct fid_fabric **fabric, USDF_INFO("unable to allocate memory for fabric\n"); return -FI_ENOMEM; } - fp->fab_epollfd = -1; + fp->fab_epollfd = OFI_EPOLL_INVALID; fp->fab_arp_sockfd = -1; LIST_INIT(&fp->fab_domain_list); @@ -1167,9 +964,8 @@ usdf_fabric_open(struct fi_fabric_attr *fattrp, struct fid_fabric **fabric, fp->fab_dev_attrs = &dep->ue_dattr; - fp->fab_epollfd = epoll_create(1024); - if (fp->fab_epollfd == -1) { - ret = 
-errno; + ret = ofi_epoll_create(&fp->fab_epollfd); + if (ret) { USDF_INFO("unable to allocate epoll fd\n"); goto fail; } @@ -1182,11 +978,9 @@ usdf_fabric_open(struct fi_fabric_attr *fattrp, struct fid_fabric **fabric, } fp->fab_poll_item.pi_rtn = usdf_fabric_progression_cb; fp->fab_poll_item.pi_context = fp; - ev.events = EPOLLIN; - ev.data.ptr = &fp->fab_poll_item; - ret = epoll_ctl(fp->fab_epollfd, EPOLL_CTL_ADD, fp->fab_eventfd, &ev); - if (ret == -1) { - ret = -errno; + ret = ofi_epoll_add(fp->fab_epollfd, fp->fab_eventfd, OFI_EPOLL_IN, + &fp->fab_poll_item); + if (ret) { USDF_INFO("unable to EPOLL_CTL_ADD\n"); goto fail; } @@ -1248,7 +1042,7 @@ static void usdf_fini(void) struct fi_provider usdf_ops = { .name = USDF_PROV_NAME, .version = USDF_PROV_VERSION, - .fi_version = FI_VERSION(1, 8), + .fi_version = OFI_VERSION_LATEST, .getinfo = usdf_getinfo, .fabric = usdf_fabric_open, .cleanup = usdf_fini diff --git a/prov/usnic/src/usdf_msg.c b/prov/usnic/src/usdf_msg.c deleted file mode 100644 index 0e499e118fa..00000000000 --- a/prov/usnic/src/usdf_msg.c +++ /dev/null @@ -1,1255 +0,0 @@ -/* - * Copyright (c) 2014-2016, Cisco Systems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include "ofi.h" - -#include "usd.h" -#include "usd_post.h" - -#include "usdf.h" -#include "usdf_rudp.h" -#include "usdf_msg.h" -#include "usdf_timer.h" -#include "usdf_progress.h" - -/* Functions to add and remove entries from the free list for the transmit and - * receive work queues. 
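(Illustration only, not part of the patch.) The hunks above replace the provider's raw epoll_create()/epoll_ctl()/close() calls with libfabric's ofi_epoll_* wrappers and the OFI_EPOLL_INVALID sentinel. A minimal sketch of the resulting setup/teardown pattern, built only from the calls and fields visible in this diff; error handling is abbreviated and the include path of the internal wrapper header is an assumption:

#include "ofi_epoll.h"   /* assumed internal libfabric header providing the ofi_epoll_* wrappers */

/* Sketch of what usdf_fabric_open()/usdf_fabric_close() now do with the wrapper API. */
static int example_epoll_setup(struct usdf_fabric *fp)
{
	int ret;

	fp->fab_epollfd = OFI_EPOLL_INVALID;        /* sentinel value instead of -1 */

	ret = ofi_epoll_create(&fp->fab_epollfd);   /* 0 on success, negative error code otherwise */
	if (ret)
		return ret;

	/* The context registered here is what ofi_epoll_wait() later hands back. */
	ret = ofi_epoll_add(fp->fab_epollfd, fp->fab_eventfd, OFI_EPOLL_IN,
			    &fp->fab_poll_item);
	if (ret) {
		ofi_epoll_close(fp->fab_epollfd);   /* replaces close(fab_epollfd) on teardown */
		fp->fab_epollfd = OFI_EPOLL_INVALID;
	}
	return ret;
}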
- */ -static struct usdf_msg_qe *usdf_msg_get_tx_wqe(struct usdf_tx *tx) -{ - struct usdf_msg_qe *entry; - - entry = TAILQ_FIRST(&tx->t.msg.tx_free_wqe); - TAILQ_REMOVE(&tx->t.msg.tx_free_wqe, entry, ms_link); - tx->t.msg.tx_num_free_wqe -= 1; - - return entry; -} - -static void usdf_msg_put_tx_wqe(struct usdf_tx *tx, struct usdf_msg_qe *wqe) -{ - TAILQ_INSERT_HEAD(&tx->t.msg.tx_free_wqe, wqe, ms_link); - tx->t.msg.tx_num_free_wqe += 1; -} - -static struct usdf_msg_qe *usdf_msg_get_rx_rqe(struct usdf_rx *rx) -{ - struct usdf_msg_qe *entry; - - entry = TAILQ_FIRST(&rx->r.msg.rx_free_rqe); - TAILQ_REMOVE(&rx->r.msg.rx_free_rqe, entry, ms_link); - rx->r.msg.rx_num_free_rqe -= 1; - - return entry; -} - -static void usdf_msg_put_rx_rqe(struct usdf_rx *rx, struct usdf_msg_qe *rqe) -{ - TAILQ_INSERT_HEAD(&rx->r.msg.rx_free_rqe, rqe, ms_link); - rx->r.msg.rx_num_free_rqe += 1; -} - -/******************************************************************************/ - -static inline void -usdf_msg_ep_ready(struct usdf_ep *ep) -{ - struct usdf_tx *tx; - - tx = ep->ep_tx; - if (!TAILQ_ON_LIST(ep, e.msg.ep_link)) { - - ep->e.msg.ep_fairness_credits = USDF_MSG_FAIRNESS_CREDITS; - TAILQ_INSERT_TAIL(&tx->t.msg.tx_ep_ready, ep, e.msg.ep_link); - - /* Make sure TX is on domain ready list */ - if (!TAILQ_ON_LIST(tx, tx_link)) { - TAILQ_INSERT_TAIL(&tx->tx_domain->dom_tx_ready, - tx, tx_link); - } - } -} - -static inline void -usdf_msg_rewind_qe(struct usdf_msg_qe *qe, size_t rewind, size_t mtu) -{ - size_t cur_resid; - size_t cur_iov; - size_t bytes; - size_t len; - - if (qe->ms_resid == 0) { - bytes = qe->ms_length % mtu; - cur_resid = 0; - } else { - bytes = mtu; - cur_resid = qe->ms_iov_resid; - } - bytes += (rewind - 1) * mtu; - qe->ms_resid += bytes; - - cur_iov = qe->ms_cur_iov; - while (bytes > 0) { - len = qe->ms_iov[cur_iov].iov_len - cur_resid; - if (len >= bytes) { - len = bytes; - cur_resid += len; - } else { - --cur_iov; - cur_resid = 0; - } - bytes -= len; - } - - qe->ms_cur_iov = cur_iov; - qe->ms_cur_ptr = ((uint8_t *)qe->ms_iov[cur_iov].iov_base) + - qe->ms_iov[cur_iov].iov_len - cur_resid; - qe->ms_iov_resid = cur_resid; -} - -/* - * semi-native rx buffer post, i want to eventually avoid using the - * vnic_*() calls - */ -static inline int -_usdf_msg_post_recv(struct usdf_rx *rx, void *buf, size_t len) -{ - struct usd_rq *rq; - struct vnic_rq *vrq; - struct rq_enet_desc *desc; - struct usd_qp_impl *qp; - - qp = to_qpi(rx->rx_qp); - rq = &qp->uq_rq; - vrq = &rq->urq_vnic_rq; - - rq->urq_context[rq->urq_post_index] = buf; - rq->urq_post_index = (rq->urq_post_index + 1) - & rq->urq_post_index_mask; - - desc = rq->urq_next_desc; - rq_enet_desc_enc(desc, (dma_addr_t) buf, - RQ_ENET_TYPE_ONLY_SOP, len); - wmb(); - iowrite32(rq->urq_post_index, &vrq->ctrl->posted_index); - - rq->urq_next_desc = (struct rq_enet_desc *) - ((uintptr_t)rq->urq_desc_ring - + ((rq->urq_post_index)<<4)); - rq->urq_recv_credits -= 1; - - return 0; -} - -/* - * Allow external access to the inline - */ -int -usdf_msg_post_recv(struct usdf_rx *rx, void *buf, size_t len) -{ - return _usdf_msg_post_recv(rx, buf, len); -} - -ssize_t -usdf_msg_recv(struct fid_ep *fep, void *buf, size_t len, - void *desc, fi_addr_t src_addr, void *context) -{ - struct usdf_ep *ep; - struct usdf_rx *rx; - struct usdf_msg_qe *rqe; - struct usdf_domain *udp; - - ep = ep_ftou(fep); - rx = ep->ep_rx; - udp = ep->ep_domain; - - if (TAILQ_EMPTY(&rx->r.msg.rx_free_rqe)) { - return -FI_EAGAIN; - } - - pthread_spin_lock(&udp->dom_progress_lock); - - rqe = 
usdf_msg_get_rx_rqe(rx); - - rqe->ms_context = context; - rqe->ms_iov[0].iov_base = buf; - rqe->ms_iov[0].iov_len = len; - rqe->ms_last_iov = 0; - - rqe->ms_cur_iov = 0; - rqe->ms_cur_ptr = buf; - rqe->ms_iov_resid = len; - rqe->ms_length = 0; - rqe->ms_resid = len; - - TAILQ_INSERT_TAIL(&rx->r.msg.rx_posted_rqe, rqe, ms_link); - - pthread_spin_unlock(&udp->dom_progress_lock); - - return 0; -} - -ssize_t -usdf_msg_recvv(struct fid_ep *fep, const struct iovec *iov, void **desc, - size_t count, fi_addr_t src_addr, void *context) -{ - struct usdf_ep *ep; - struct usdf_rx *rx; - struct usdf_msg_qe *rqe; - struct usdf_domain *udp; - size_t tot_len; - uint64_t op_flags; - uint32_t i; - - ep = ep_ftou(fep); - rx = ep->ep_rx; - udp = ep->ep_domain; - - if (TAILQ_EMPTY(&rx->r.msg.rx_free_rqe)) { - return -FI_EAGAIN; - } - - pthread_spin_lock(&udp->dom_progress_lock); - - rqe = usdf_msg_get_rx_rqe(rx); - - rqe->ms_context = context; - tot_len = 0; - for (i = 0; i < count; ++i) { - rqe->ms_iov[i].iov_base = (void *)iov[i].iov_base; - rqe->ms_iov[i].iov_len = iov[i].iov_len; - tot_len += iov[i].iov_len; - } - rqe->ms_last_iov = count - 1; - rqe->ms_cur_iov = 0; - rqe->ms_cur_ptr = iov[0].iov_base; - rqe->ms_iov_resid = iov[0].iov_len; - rqe->ms_resid = tot_len; - rqe->ms_length = 0; - - op_flags = ep->ep_rx->rx_attr.op_flags; - rqe->ms_signal_comp = ep->ep_rx_dflt_signal_comp || - (op_flags & FI_COMPLETION) ? 1 : 0; - - TAILQ_INSERT_TAIL(&rx->r.msg.rx_posted_rqe, rqe, ms_link); - pthread_spin_unlock(&udp->dom_progress_lock); - - return 0; -} - -ssize_t -usdf_msg_send(struct fid_ep *fep, const void *buf, size_t len, void *desc, - fi_addr_t dest_addr, void *context) -{ - struct usdf_ep *ep; - struct usdf_tx *tx; - struct usdf_msg_qe *wqe; - struct usdf_domain *udp; - uint64_t op_flags; - - ep = ep_ftou(fep); - tx = ep->ep_tx; - udp = ep->ep_domain; - - if (TAILQ_EMPTY(&tx->t.msg.tx_free_wqe)) { - return -FI_EAGAIN; - } - - pthread_spin_lock(&udp->dom_progress_lock); - - wqe = usdf_msg_get_tx_wqe(tx); - - wqe->ms_context = context; - wqe->ms_iov[0].iov_base = (void *)buf; - wqe->ms_iov[0].iov_len = len; - wqe->ms_last_iov = 0; - - wqe->ms_cur_iov = 0; - wqe->ms_cur_ptr = buf; - wqe->ms_iov_resid = len; - wqe->ms_resid = len; - wqe->ms_length = len; - - op_flags = ep->ep_tx->tx_attr.op_flags; - wqe->ms_signal_comp = ep->ep_tx_dflt_signal_comp || - (op_flags & FI_COMPLETION) ? 
1 : 0; - - /* add send to EP, and add EP to TX list if not present */ - TAILQ_INSERT_TAIL(&ep->e.msg.ep_posted_wqe, wqe, ms_link); - usdf_msg_ep_ready(ep); - - pthread_spin_unlock(&udp->dom_progress_lock); - - usdf_domain_progress(udp); - - return 0; -} - -ssize_t -usdf_msg_sendv(struct fid_ep *fep, const struct iovec *iov, void **desc, - size_t count, fi_addr_t dest_addr, void *context) -{ - size_t i; - struct usdf_ep *ep; - struct usdf_tx *tx; - struct usdf_msg_qe *wqe; - struct usdf_domain *udp; - size_t tot_len; - uint64_t op_flags; - - ep = ep_ftou(fep); - tx = ep->ep_tx; - udp = ep->ep_domain; - - if (TAILQ_EMPTY(&tx->t.msg.tx_free_wqe)) { - return -FI_EAGAIN; - } - - pthread_spin_lock(&udp->dom_progress_lock); - - wqe = usdf_msg_get_tx_wqe(tx); - - wqe->ms_context = context; - tot_len = 0; - for (i = 0; i < count; ++i) { - wqe->ms_iov[i].iov_base = (void *)iov[i].iov_base; - wqe->ms_iov[i].iov_len = iov[i].iov_len; - tot_len += iov[i].iov_len; - } - wqe->ms_last_iov = count - 1; - - wqe->ms_cur_iov = 0; - wqe->ms_cur_ptr = iov[0].iov_base; - wqe->ms_iov_resid = iov[0].iov_len; - wqe->ms_resid = tot_len; - wqe->ms_length = tot_len; - - op_flags = ep->ep_tx->tx_attr.op_flags; - wqe->ms_signal_comp = ep->ep_tx_dflt_signal_comp || - (op_flags & FI_COMPLETION) ? 1 : 0; - - /* add send to EP, and add EP to TX list if not present */ - TAILQ_INSERT_TAIL(&ep->e.msg.ep_posted_wqe, wqe, ms_link); - usdf_msg_ep_ready(ep); - - pthread_spin_unlock(&udp->dom_progress_lock); - - usdf_domain_progress(udp); - - return 0; -} - -ssize_t -usdf_msg_sendmsg(struct fid_ep *fep, const struct fi_msg *msg, uint64_t flags) -{ - size_t i; - struct usdf_ep *ep; - struct usdf_tx *tx; - struct usdf_msg_qe *wqe; - struct usdf_domain *udp; - size_t tot_len; - const struct iovec *iov; - - ep = ep_ftou(fep); - tx = ep->ep_tx; - udp = ep->ep_domain; - iov = msg->msg_iov; - - if (flags & ~USDF_MSG_SUPP_SENDMSG_FLAGS) { - USDF_DBG_SYS(EP_DATA, - "one or more flags in %#" PRIx64 " not supported\n", - flags); - return -FI_EOPNOTSUPP; - } - - /* check for inject overrun before acquiring lock and allocating wqe, - * easier to unwind this way */ - if (flags & FI_INJECT) { - iov = msg->msg_iov; - tot_len = 0; - for (i = 0; i < msg->iov_count; ++i) { - tot_len += iov[i].iov_len; - if (tot_len > USDF_MSG_MAX_INJECT_SIZE) { - USDF_DBG_SYS(EP_DATA, "max inject len exceeded (%zu)\n", - tot_len); - return -FI_EINVAL; - } - } - } - - if (TAILQ_EMPTY(&tx->t.msg.tx_free_wqe)) { - return -FI_EAGAIN; - } - - pthread_spin_lock(&udp->dom_progress_lock); - - wqe = usdf_msg_get_tx_wqe(tx); - - wqe->ms_context = msg->context; - if (flags & FI_INJECT) { - tot_len = 0; - for (i = 0; i < msg->iov_count; ++i) { - assert(tot_len + iov[i].iov_len <= USDF_MSG_MAX_INJECT_SIZE); - memcpy(&wqe->ms_inject_buf[tot_len], iov[i].iov_base, - iov[i].iov_len); - tot_len += iov[i].iov_len; - } - wqe->ms_iov[0].iov_base = wqe->ms_inject_buf; - wqe->ms_iov[0].iov_len = tot_len; - wqe->ms_last_iov = 0; - - } else { - tot_len = 0; - for (i = 0; i < msg->iov_count; ++i) { - wqe->ms_iov[i].iov_base = (void *)iov[i].iov_base; - wqe->ms_iov[i].iov_len = iov[i].iov_len; - tot_len += iov[i].iov_len; - } - wqe->ms_last_iov = msg->iov_count - 1; - } - - wqe->ms_cur_iov = 0; - wqe->ms_resid = tot_len; - wqe->ms_length = tot_len; - wqe->ms_cur_ptr = iov[0].iov_base; - wqe->ms_iov_resid = iov[0].iov_len; - - wqe->ms_signal_comp = ep->ep_tx_dflt_signal_comp || - (flags & FI_COMPLETION) ? 
1 : 0; - - /* add send to EP, and add EP to TX list if not present */ - TAILQ_INSERT_TAIL(&ep->e.msg.ep_posted_wqe, wqe, ms_link); - usdf_msg_ep_ready(ep); - - pthread_spin_unlock(&udp->dom_progress_lock); - - usdf_domain_progress(udp); - - return 0; -} - -ssize_t -usdf_msg_inject(struct fid_ep *fep, const void *buf, size_t len, - fi_addr_t dest_addr) -{ - struct usdf_ep *ep; - struct usdf_tx *tx; - struct usdf_msg_qe *wqe; - struct usdf_domain *udp; - - if (len > USDF_MSG_MAX_INJECT_SIZE) { - USDF_WARN_SYS(EP_DATA, - "cannot inject more than inject_size bytes\n"); - return -EINVAL; - } - - ep = ep_ftou(fep); - tx = ep->ep_tx; - udp = ep->ep_domain; - - if (TAILQ_EMPTY(&tx->t.msg.tx_free_wqe)) { - return -FI_EAGAIN; - } - - pthread_spin_lock(&udp->dom_progress_lock); - - wqe = usdf_msg_get_tx_wqe(tx); - - wqe->ms_context = NULL; - memcpy(wqe->ms_inject_buf, buf, len); - wqe->ms_iov[0].iov_base = wqe->ms_inject_buf; - wqe->ms_iov[0].iov_len = len; - wqe->ms_last_iov = 0; - - wqe->ms_cur_iov = 0; - wqe->ms_cur_ptr = buf; - wqe->ms_iov_resid = len; - wqe->ms_resid = len; - wqe->ms_length = len; - - /* fi_inject() never signals a completion */ - wqe->ms_signal_comp = 0; - - /* add send to EP, and add EP to TX list if not present */ - TAILQ_INSERT_TAIL(&ep->e.msg.ep_posted_wqe, wqe, ms_link); - usdf_msg_ep_ready(ep); - - pthread_spin_unlock(&udp->dom_progress_lock); - - usdf_domain_progress(udp); - - return 0; -} - -ssize_t -usdf_msg_recvmsg(struct fid_ep *fep, const struct fi_msg *msg, uint64_t flags) -{ - size_t i; - struct usdf_ep *ep; - struct usdf_rx *rx; - struct usdf_msg_qe *rqe; - struct usdf_domain *udp; - size_t tot_len; - const struct iovec *iov; - - ep = ep_ftou(fep); - rx = ep->ep_rx; - udp = ep->ep_domain; - iov = msg->msg_iov; - - if (TAILQ_EMPTY(&rx->r.msg.rx_free_rqe)) { - return -FI_EAGAIN; - } - - if (flags & ~USDF_MSG_SUPP_RECVMSG_FLAGS) { - USDF_DBG_SYS(EP_DATA, - "one or more flags in %#" PRIx64 " not supported\n", - flags); - return -FI_EOPNOTSUPP; - } - - pthread_spin_lock(&udp->dom_progress_lock); - - rqe = usdf_msg_get_rx_rqe(rx); - - rqe->ms_context = msg->context; - tot_len = 0; - for (i = 0; i < msg->iov_count; ++i) { - rqe->ms_iov[i].iov_base = (void *)iov[i].iov_base; - rqe->ms_iov[i].iov_len = iov[i].iov_len; - tot_len += iov[i].iov_len; - } - rqe->ms_last_iov = msg->iov_count - 1; - - rqe->ms_cur_iov = 0; - rqe->ms_resid = tot_len; - rqe->ms_length = 0; - rqe->ms_cur_ptr = iov[0].iov_base; - rqe->ms_iov_resid = iov[0].iov_len; - - rqe->ms_signal_comp = ep->ep_rx_dflt_signal_comp || - (flags & FI_COMPLETION) ? 
1 : 0; - - TAILQ_INSERT_TAIL(&rx->r.msg.rx_posted_rqe, rqe, ms_link); - - pthread_spin_unlock(&udp->dom_progress_lock); - - return 0; -} - -static void -usdf_msg_send_complete(struct usdf_ep *ep, struct usdf_msg_qe *wqe) -{ - TAILQ_REMOVE(&ep->e.msg.ep_posted_wqe, wqe, ms_link); - - wqe->ms_last_seq = ep->e.msg.ep_next_tx_seq - 1; - TAILQ_INSERT_TAIL(&ep->e.msg.ep_sent_wqe, wqe, ms_link); -} - -static inline void -usdf_msg_send_segment(struct usdf_tx *tx, struct usdf_ep *ep) -{ - struct usdf_msg_qe *msg; - struct rudp_pkt *hdr; - struct usd_wq *wq; - uint32_t index; - size_t cur_iov; - size_t cur_resid; - size_t resid; - const uint8_t *cur_ptr; - const uint8_t *send_ptr; - size_t sge_len; - uint8_t *ptr; - struct usd_wq_post_info *info; - - msg = TAILQ_FIRST(&ep->e.msg.ep_posted_wqe); - wq = &(to_qpi(tx->tx_qp)->uq_wq); - - index = wq->uwq_post_index; - hdr = (struct rudp_pkt *)(wq->uwq_copybuf + index * USD_SEND_MAX_COPY); - - memcpy(hdr, &ep->e.msg.ep_dest->ds_dest.ds_udp.u_hdr, - sizeof(struct usd_udp_hdr)); - hdr->msg.src_peer_id = htons(ep->e.msg.ep_lcl_peer_id); - - resid = msg->ms_resid; - cur_iov = msg->ms_cur_iov; - cur_ptr = msg->ms_cur_ptr; - cur_resid = msg->ms_iov_resid; - - /* save first seq for message */ - if (cur_iov == 0 && cur_resid == msg->ms_iov[0].iov_len) { - msg->ms_first_seq = ep->e.msg.ep_next_tx_seq; - } - - if (resid < USD_SEND_MAX_COPY - sizeof(*hdr)) { - hdr->msg.opcode = htons(RUDP_OP_LAST); - hdr->msg.m.rc_data.length = htons(resid); - hdr->msg.m.rc_data.seqno = htons(ep->e.msg.ep_next_tx_seq); - ++ep->e.msg.ep_next_tx_seq; - - sge_len = resid; - ptr = (uint8_t *)(hdr + 1); - while (resid > 0) { - memcpy(ptr, cur_ptr, cur_resid); - ptr += cur_resid; - resid -= cur_resid; - ++cur_iov; - cur_ptr = msg->ms_iov[cur_iov].iov_base; - cur_resid = msg->ms_iov[cur_iov].iov_len; - } - - /* add packet lengths */ - hdr->hdr.uh_ip.tot_len = htons( - sge_len + sizeof(struct rudp_pkt) - - sizeof(struct ether_header)); - hdr->hdr.uh_udp.len = htons( - (sizeof(struct rudp_pkt) - - sizeof(struct ether_header) - - sizeof(struct iphdr)) + sge_len); - - index = _usd_post_send_one(wq, hdr, - sge_len + sizeof(*hdr), 1); - } else { - struct vnic_wq *vwq; - u_int8_t offload_mode = 0, eop; - u_int16_t mss = 7, header_length = 0, vlan_tag = 0; - u_int8_t vlan_tag_insert = 0, loopback = 0, fcoe_encap = 0; - struct wq_enet_desc *desc; - size_t space; - size_t num_sge; - size_t sent; - - vwq = &wq->uwq_vnic_wq; - desc = wq->uwq_next_desc; - space = ep->ep_domain->dom_fabric->fab_dev_attrs->uda_mtu - - sizeof(*hdr); - num_sge = 1; - - /* encode header desc */ - eop = 0; - wq_enet_desc_enc(desc, (uintptr_t)hdr, sizeof(*hdr), - mss, header_length, offload_mode, eop, 0, fcoe_encap, - vlan_tag_insert, vlan_tag, loopback); - - do { - desc = (struct wq_enet_desc *) - ((uintptr_t)wq->uwq_desc_ring + (index << 4)); - index = (index + 1) & wq->uwq_post_index_mask; - - send_ptr = cur_ptr; - if (cur_resid >= space) { - sge_len = space; - eop = 1; - cur_resid -= sge_len; - cur_ptr += sge_len; - } else { - sge_len = cur_resid; - if (num_sge == USDF_MSG_MAX_SGE || - cur_resid == resid) { - eop = 1; - } - ++cur_iov; - cur_ptr = msg->ms_iov[cur_iov].iov_base; - cur_resid = msg->ms_iov[cur_iov].iov_len; - } - - wq_enet_desc_enc(desc, (uintptr_t)send_ptr, sge_len, - mss, header_length, offload_mode, eop, eop, - fcoe_encap, vlan_tag_insert, - vlan_tag, loopback); - - ++num_sge; - space -= sge_len; - resid -= sge_len; - } while (space > 0 && num_sge <= USDF_MSG_MAX_SGE && resid > 0); - - /* add packet 
lengths */ - sent = ep->ep_domain->dom_fabric->fab_dev_attrs->uda_mtu - - sizeof(*hdr) - space; - hdr->hdr.uh_ip.tot_len = htons( - sent + sizeof(struct rudp_pkt) - - sizeof(struct ether_header)); - hdr->hdr.uh_udp.len = htons( - (sizeof(struct rudp_pkt) - - sizeof(struct ether_header) - - sizeof(struct iphdr)) + sent); -#if 0 -if ((random() % 177) == 0 && resid == 0) { - hdr->hdr.uh_eth.ether_type = 0; -//printf("BORK seq %u\n", ep->e.msg.ep_next_tx_seq); -} -#endif - - if (resid == 0) { - hdr->msg.opcode = htons(RUDP_OP_LAST); - } else { - hdr->msg.opcode = htons(RUDP_OP_FIRST); - } - hdr->msg.m.rc_data.length = htons(sent); - hdr->msg.m.rc_data.seqno = htons(ep->e.msg.ep_next_tx_seq); - ++ep->e.msg.ep_next_tx_seq; - - wmb(); - iowrite64(index, &vwq->ctrl->posted_index); - - wq->uwq_next_desc = (struct wq_enet_desc *) - ((uintptr_t)wq->uwq_desc_ring + (index << 4)); - wq->uwq_post_index = (index + 1) & wq->uwq_post_index_mask; - wq->uwq_send_credits -= num_sge; - } - - info = &wq->uwq_post_info[index]; - info->wp_context = tx; - info->wp_len = sge_len; - - /* If send complete, remove from send list */ - if (resid == 0) { - usdf_msg_send_complete(ep, msg); - } else { - msg->ms_resid = resid; - msg->ms_iov_resid = cur_resid; - msg->ms_cur_iov = cur_iov; - msg->ms_cur_ptr = cur_ptr; - } - - /* set ACK timer */ - usdf_timer_set(ep->ep_domain->dom_fabric, ep->e.msg.ep_ack_timer, - USDF_RUDP_ACK_TIMEOUT); -} - -static inline void -usdf_msg_send_ack(struct usdf_tx *tx, struct usdf_ep *ep) -{ - struct rudp_pkt *hdr; - struct usd_wq *wq; - uint32_t last_post; - struct usd_wq_post_info *info; - uint16_t seq; - - wq = &(to_qpi(tx->tx_qp)->uq_wq); - - hdr = (struct rudp_pkt *) (wq->uwq_copybuf + - wq->uwq_post_index * USD_SEND_MAX_COPY); - - memcpy(hdr, &ep->e.msg.ep_dest->ds_dest.ds_udp.u_hdr, - sizeof(struct usd_udp_hdr)); - - hdr->msg.src_peer_id = htons(ep->e.msg.ep_lcl_peer_id); - if (ep->e.msg.ep_send_nak) { - hdr->msg.opcode = htons(RUDP_OP_NAK); - seq = ep->e.msg.ep_next_rx_seq; - hdr->msg.m.nak.nak_seq = htons(seq); - ep->e.msg.ep_send_nak = 0; - } else { - hdr->msg.opcode = htons(RUDP_OP_ACK); - seq = ep->e.msg.ep_next_rx_seq - 1; - hdr->msg.m.ack.ack_seq = htons(seq); - } - - /* add packet lengths */ - hdr->hdr.uh_ip.tot_len = htons( - sizeof(struct rudp_pkt) - - sizeof(struct ether_header)); - hdr->hdr.uh_udp.len = htons(sizeof(struct rudp_pkt) - - sizeof(struct ether_header) - sizeof(struct iphdr)); - - last_post = _usd_post_send_one(wq, hdr, sizeof(*hdr), 1); - - info = &wq->uwq_post_info[last_post]; - info->wp_context = tx; - info->wp_len = 0; -} - -/* - * If this TX has sends to do and is not on domain ready list, then - * this completion means we can go back on the domain ready list - */ -static void -usdf_msg_send_completion(struct usd_completion *comp) -{ - struct usdf_tx *tx; - - tx = comp->uc_context; - - if (!TAILQ_EMPTY(&tx->t.msg.tx_ep_ready) && - !TAILQ_ON_LIST(tx, tx_link)) { - TAILQ_INSERT_TAIL(&tx->tx_domain->dom_tx_ready, tx, tx_link); - } -} - -/* - * Keep progressing sends on this queue until: - * a) no more send credits on the queue (it's full) - * or - * b) all endpoints are complete or blocked awaiting ACKs - */ -void -usdf_msg_tx_progress(struct usdf_tx *tx) -{ - struct usdf_ep *ep; - struct usd_qp_impl *qp; - - qp = to_qpi(tx->tx_qp); - while (qp->uq_wq.uwq_send_credits > 1 && - !TAILQ_EMPTY(&tx->t.msg.tx_ep_have_acks)) { - ep = TAILQ_FIRST(&tx->t.msg.tx_ep_have_acks); - TAILQ_REMOVE_MARK(&tx->t.msg.tx_ep_have_acks, - ep, e.msg.ep_ack_link); - - 
usdf_msg_send_ack(tx, ep); - } - - while (qp->uq_wq.uwq_send_credits > 1 && - !TAILQ_EMPTY(&tx->t.msg.tx_ep_ready)) { - ep = TAILQ_FIRST(&tx->t.msg.tx_ep_ready); - - /* - * Send next segment on this EP. This will also remove the - * current send from the EP send list if it completes - */ - usdf_msg_send_segment(tx, ep); - - --ep->e.msg.ep_seq_credits; - if (TAILQ_EMPTY(&ep->e.msg.ep_posted_wqe)) { - TAILQ_REMOVE_MARK(&tx->t.msg.tx_ep_ready, - ep, e.msg.ep_link); - } else { - --ep->e.msg.ep_fairness_credits; - if (ep->e.msg.ep_seq_credits == 0) { - TAILQ_REMOVE_MARK(&tx->t.msg.tx_ep_ready, - ep, e.msg.ep_link); - ep->e.msg.ep_fairness_credits = - USDF_MSG_FAIRNESS_CREDITS; - - /* fairness credits exhausted, go to back of the line */ - } else if (ep->e.msg.ep_fairness_credits == 0) { - TAILQ_REMOVE(&tx->t.msg.tx_ep_ready, - ep, e.msg.ep_link); - TAILQ_INSERT_TAIL(&tx->t.msg.tx_ep_ready, - ep, e.msg.ep_link); - ep->e.msg.ep_fairness_credits = - USDF_MSG_FAIRNESS_CREDITS; - } - } - } -} - -static inline void -usdf_msg_recv_complete(struct usdf_ep *ep, struct usdf_msg_qe *rqe, int status) -{ - struct usdf_cq_hard *hcq; - struct usdf_rx *rx; - - rx = ep->ep_rx; - hcq = rx->r.msg.rx_hcq; - - hcq->cqh_post(hcq, rqe->ms_context, rqe->ms_length, status, - FI_MSG | FI_RECV); - usdf_msg_put_rx_rqe(rx, rqe); -} - -static inline void -usdf_msg_ep_has_ack(struct usdf_ep *ep) -{ - struct usdf_tx *tx; - struct usdf_domain *udp; - - if (!TAILQ_ON_LIST(ep, e.msg.ep_ack_link)) { - tx = ep->ep_tx; - udp = ep->ep_domain; - TAILQ_INSERT_TAIL(&tx->t.msg.tx_ep_have_acks, ep, - e.msg.ep_ack_link); - /* Add TX to domain list if not present */ - if (!TAILQ_ON_LIST(tx, tx_link)) { - TAILQ_INSERT_TAIL(&udp->dom_tx_ready, tx, tx_link); - } - - } -} - -static inline int -usdf_msg_check_seq(struct usdf_ep *ep, struct rudp_pkt *pkt) -{ - uint16_t seq; - int ret; - - seq = ntohs(pkt->msg.m.rc_data.seqno); - - /* Drop bad seq, send NAK if seq from the future */ - if (seq != ep->e.msg.ep_next_rx_seq) { - if (RUDP_SEQ_GT(seq, ep->e.msg.ep_next_rx_seq)) { - ep->e.msg.ep_send_nak = 1; - } - ret = -1; - } else { - ++ep->e.msg.ep_next_rx_seq; - ret = 0; - } - usdf_msg_ep_has_ack(ep); - - return ret; -} - -static inline void -usdf_msg_process_ack(struct usdf_ep *ep, uint16_t seq) -{ - struct usdf_cq_hard *hcq; - struct usdf_msg_qe *wqe; - struct usdf_tx *tx; - uint16_t max_ack; - unsigned credits; - - tx = ep->ep_tx; - - /* don't try to ACK what we don't think we've sent */ - max_ack = ep->e.msg.ep_next_tx_seq - 1; - if (RUDP_SEQ_GT(seq, max_ack)) { - seq = max_ack; - } - - hcq = tx->t.msg.tx_hcq; - while (!TAILQ_EMPTY(&ep->e.msg.ep_sent_wqe)) { - wqe = TAILQ_FIRST(&ep->e.msg.ep_sent_wqe); - if (RUDP_SEQ_LE(wqe->ms_last_seq, seq)) { - TAILQ_REMOVE(&ep->e.msg.ep_sent_wqe, wqe, ms_link); - USDF_DBG_SYS(EP_DATA, "send complete, signal_comp=%u\n", wqe->ms_signal_comp); - if (wqe->ms_signal_comp) - hcq->cqh_post(hcq, wqe->ms_context, - wqe->ms_length, FI_SUCCESS, - FI_MSG | FI_SEND); - - usdf_msg_put_tx_wqe(tx, wqe); - } else { - break; - } - } - - credits = RUDP_SEQ_DIFF(seq, ep->e.msg.ep_last_rx_ack); - if (ep->e.msg.ep_seq_credits == 0 && credits > 0 && - !TAILQ_EMPTY(&ep->e.msg.ep_posted_wqe)) { - usdf_msg_ep_ready(ep); - } - ep->e.msg.ep_seq_credits += credits; - ep->e.msg.ep_last_rx_ack = seq; - - /* If all ACKed, cancel timer, else reset it */ - if (seq == max_ack) { - usdf_timer_cancel(ep->ep_domain->dom_fabric, - ep->e.msg.ep_ack_timer); - } else { - usdf_timer_reset(ep->ep_domain->dom_fabric, - ep->e.msg.ep_ack_timer, 
USDF_RUDP_ACK_TIMEOUT); - } -} - -static inline void -usdf_process_nak(struct usdf_ep *ep, uint16_t seq) -{ - struct usdf_msg_qe *wqe; - size_t rewind; - - /* Ignore NAKs of future packets */ - if (RUDP_SEQ_GE(seq, ep->e.msg.ep_next_tx_seq)) { - return; - } - - /* - * Move any WQEs that contain NAKed sequences back to the - * posted list. We set ms_resid == 0 here because final set to zero - * is optimized out of the fastpath - */ - while (!TAILQ_EMPTY(&ep->e.msg.ep_sent_wqe)) { - wqe = TAILQ_LAST(&ep->e.msg.ep_sent_wqe, usdf_msg_qe_head); - TAILQ_REMOVE(&ep->e.msg.ep_sent_wqe, wqe, ms_link); - wqe->ms_resid = 0; - TAILQ_INSERT_HEAD(&ep->e.msg.ep_posted_wqe, wqe, ms_link); - } - wqe = TAILQ_FIRST(&ep->e.msg.ep_posted_wqe); - - /* reset WQE to old sequence # */ - if (wqe->ms_resid == 0) { - rewind = RUDP_SEQ_DIFF(wqe->ms_last_seq, seq) + 1; - } else { - rewind = RUDP_SEQ_DIFF(ep->e.msg.ep_next_tx_seq, seq); - } - if (rewind > 0) { - ep->e.msg.ep_seq_credits = USDF_RUDP_SEQ_CREDITS; - ep->e.msg.ep_next_tx_seq = seq; - - usdf_msg_rewind_qe(wqe, rewind, - ep->ep_domain->dom_fabric->fab_dev_attrs->uda_mtu - - sizeof(struct rudp_pkt)); - - usdf_msg_ep_ready(ep); - } -} - -void -usdf_msg_ep_timeout(void *vep) -{ - struct usdf_ep *ep; - struct usdf_domain *udp; - uint16_t nak; - - ep = vep; - udp = ep->ep_domain; - - pthread_spin_lock(&udp->dom_progress_lock); - nak = ep->e.msg.ep_last_rx_ack + 1; - - usdf_process_nak(ep, nak); - pthread_spin_unlock(&udp->dom_progress_lock); -} - -static inline void -usdf_msg_rx_ack(struct usdf_ep *ep, struct rudp_pkt *pkt) -{ - uint16_t seq; - seq = ntohs(pkt->msg.m.ack.ack_seq); - usdf_msg_process_ack(ep, seq); -} - -static inline void -usdf_msg_rx_nak(struct usdf_ep *ep, struct rudp_pkt *pkt) -{ - uint16_t seq; - - seq = ntohs(pkt->msg.m.nak.nak_seq); - usdf_msg_process_ack(ep, seq); - - usdf_process_nak(ep, seq); -} - -/* - * Handle a receive on a queue servicing a message endpoint - */ -static inline void -usdf_msg_handle_recv(struct usdf_domain *udp, struct usd_completion *comp) -{ - struct rudp_pkt *pkt; - struct usdf_msg_qe *rqe; - struct usdf_ep *ep; - struct usd_qp *qp; - struct usdf_rx *rx; - uint32_t peer_id; - uint32_t opcode; - uint8_t *rx_ptr; - uint8_t *rqe_ptr; - size_t cur_iov; - size_t iov_resid; - size_t ms_resid; - size_t rxlen; - size_t copylen; - int ret; - - pkt = comp->uc_context; - opcode = ntohs(pkt->msg.opcode); - peer_id = ntohs(pkt->msg.src_peer_id); - if (peer_id > USDF_MAX_PEERS) { - qp = comp->uc_qp; - rx = qp->uq_context; - goto dropit; - } - ep = udp->dom_peer_tab[peer_id]; - if (ep == NULL) { - qp = comp->uc_qp; - rx = qp->uq_context; - goto dropit; - } - rx = ep->ep_rx; - - if (comp->uc_status != USD_COMPSTAT_SUCCESS) - goto dropit; - - switch (opcode) { - case RUDP_OP_ACK: - usdf_msg_rx_ack(ep, pkt); - goto dropit; - case RUDP_OP_NAK: - usdf_msg_rx_nak(ep, pkt); - goto dropit; - case RUDP_OP_FIRST: - case RUDP_OP_LAST: - break; - default: - USDF_DBG_SYS(EP_DATA, - "encountered unexpected opcode %" PRIu32 "\n", - opcode); - goto dropit; - } - - ret = usdf_msg_check_seq(ep, pkt); - if (ret == -1) { - goto dropit; - } - - rqe = ep->e.msg.ep_cur_recv; - if (rqe == NULL) { - if (TAILQ_EMPTY(&rx->r.msg.rx_posted_rqe)) { - goto dropit; - } - rqe = TAILQ_FIRST(&rx->r.msg.rx_posted_rqe); - TAILQ_REMOVE(&rx->r.msg.rx_posted_rqe, rqe, ms_link); - ep->e.msg.ep_cur_recv = rqe; - } - - rx_ptr = (uint8_t *)(pkt + 1); - rxlen = ntohs(pkt->msg.m.rc_data.length); - rqe->ms_length += rxlen; - rqe_ptr = (uint8_t *)rqe->ms_cur_ptr; - iov_resid = 
rqe->ms_iov_resid; - cur_iov = rqe->ms_cur_iov; - ms_resid = rqe->ms_resid; - while (rxlen > 0) { - copylen = MIN(rxlen, iov_resid); - memcpy(rqe_ptr, rx_ptr, copylen); - rx_ptr += copylen; - rxlen -= copylen; - iov_resid -= copylen; - ms_resid -= copylen; - if (iov_resid == 0) { - if (cur_iov == rqe->ms_last_iov) { - break; - } - ++cur_iov; - rqe_ptr = rqe->ms_iov[cur_iov].iov_base; - iov_resid = rqe->ms_iov[cur_iov].iov_len; - } else { - rqe_ptr += copylen; - } - } - - if (opcode & RUDP_OP_LAST) { - /* - * Normally we need to store back the updated values of - * ms_resid, ms_cur_iov, ms_cur_ptr and ms_iov_resid. But - * being the last step of the process, updating these - * values are not necessary - */ - if (rxlen > 0) { - USDF_DBG_SYS(EP_DATA, "message truncated by %zu bytes", - rxlen); - rqe->ms_length -= rxlen; - usdf_msg_recv_complete(ep, rqe, FI_ETRUNC); - } else { - usdf_msg_recv_complete(ep, rqe, FI_SUCCESS); - } - - ep->e.msg.ep_cur_recv = NULL; - } else { - rqe->ms_cur_ptr = rqe_ptr; - rqe->ms_iov_resid = iov_resid; - rqe->ms_cur_iov = cur_iov; - rqe->ms_resid = ms_resid; - } - -dropit: - /* repost buffer */ - _usdf_msg_post_recv(rx, pkt, - rx->rx_domain->dom_fabric->fab_dev_attrs->uda_mtu); -} - -/* - * Process message completions - */ -void -usdf_msg_hcq_progress(struct usdf_cq_hard *hcq) -{ - struct usd_completion comp; - - while (usd_poll_cq(hcq->cqh_ucq, &comp) != -EAGAIN) { - switch (comp.uc_type) { - case USD_COMPTYPE_SEND: - usdf_msg_send_completion(&comp); - break; - case USD_COMPTYPE_RECV: - usdf_msg_handle_recv(hcq->cqh_cq->cq_domain, &comp); - break; - } - } -} - -ssize_t usdf_msg_rx_size_left(struct fid_ep *fep) -{ - struct usdf_ep *ep; - struct usdf_rx *rx; - - USDF_DBG_SYS(EP_DATA, "\n"); - - ep = ep_ftou(fep); - rx = ep->ep_rx; - - if (!(ep->flags & USDF_EP_ENABLED)) - return -FI_EOPBADSTATE; - - return rx->r.msg.rx_num_free_rqe; -} - -ssize_t usdf_msg_tx_size_left(struct fid_ep *fep) -{ - struct usdf_ep *ep; - struct usdf_tx *tx; - - USDF_DBG_SYS(EP_DATA, "\n"); - - ep = ep_ftou(fep); - tx = ep->ep_tx; - - if (!(ep->flags & USDF_EP_ENABLED)) - return -FI_EOPBADSTATE; - - return tx->t.msg.tx_num_free_wqe; -} diff --git a/prov/usnic/src/usdf_msg.h b/prov/usnic/src/usdf_msg.h deleted file mode 100644 index d1c0eece80e..00000000000 --- a/prov/usnic/src/usdf_msg.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -#ifndef _USDF_MSG_H_ -#define _USDF_MSG_H_ - -#define USDF_MSG_CAPS (FI_MSG | FI_SOURCE | FI_SEND | FI_RECV) - -#define USDF_MSG_SUPP_MODE (FI_LOCAL_MR) - -#define USDF_MSG_SUPP_SENDMSG_FLAGS \ - (FI_INJECT_COMPLETE | FI_TRANSMIT_COMPLETE | FI_INJECT | FI_COMPLETION) -#define USDF_MSG_SUPP_RECVMSG_FLAGS (FI_COMPLETION) - -#define USDF_MSG_MSG_ORDER (FI_ORDER_NONE) -#define USDF_MSG_COMP_ORDER (FI_ORDER_NONE) - -#define USDF_MSG_MAX_SGE 8 -#define USDF_MSG_DFLT_SGE 8 -#define USDF_MSG_MAX_CTX_SIZE 1024 -#define USDF_MSG_DFLT_CTX_SIZE 512 - -#define USDF_MSG_IOV_LIMIT (USDF_MSG_DFLT_SGE) -#define USDF_MSG_RMA_IOV_LIMIT 0 -#define USDF_MSG_MR_IOV_LIMIT (USDF_MR_IOV_LIMIT) -#define USDF_MSG_MR_CNT (USDF_MR_CNT) - -#define USDF_MSG_CNTR_CNT 0 - -#define USDF_MSG_MAX_MSG UINT_MAX - -#define USDF_MSG_MAX_INJECT_SIZE 64 - -#define USDF_MSG_FAIRNESS_CREDITS 16 - -#define USDF_MSG_RUDP_SEQ_CREDITS 256 - -struct usdf_msg_qe { - void *ms_context; - - struct iovec ms_iov[USDF_MSG_MAX_SGE]; - size_t ms_last_iov; - size_t ms_length; - - uint16_t ms_first_seq; - uint16_t ms_last_seq; - - size_t ms_cur_iov; - const uint8_t *ms_cur_ptr; - size_t ms_resid; /* amount remaining in entire msg */ - size_t ms_iov_resid; /* amount remaining in current iov */ - - /* points at buffer no larger than USDF_MSG_MAX_INJECT_SIZE */ - uint8_t *ms_inject_buf; - - uint8_t ms_signal_comp; - - TAILQ_ENTRY(usdf_msg_qe) ms_link; -}; - -int usdf_msg_post_recv(struct usdf_rx *rx, void *buf, size_t len); - -int usdf_msg_fill_tx_attr(uint32_t version, const struct fi_info *hints, - struct fi_info *fi); -int usdf_msg_fill_rx_attr(uint32_t version, const struct fi_info *hints, - struct fi_info *fi); -int usdf_msg_fill_ep_attr(const struct fi_info *hints, struct fi_info *fi, - struct usd_device_attrs *dap); -int usdf_msg_fill_dom_attr(uint32_t version, const struct fi_info *hints, - struct fi_info *fi, struct usd_device_attrs *dap); - -void usdf_msg_ep_timeout(void *vep); - -void usdf_msg_hcq_progress(struct usdf_cq_hard *hcq); -void usdf_msg_tx_progress(struct usdf_tx *tx); - - -/* fi_ops_cm for RC */ -int usdf_cm_msg_connect(struct fid_ep *ep, const void *addr, - const void *param, size_t paramlen); -int usdf_cm_msg_accept(struct fid_ep *fep, const void *param, size_t paramlen); - -/* fi_ops_msg for RC */ -ssize_t usdf_msg_recv(struct fid_ep *ep, void *buf, size_t len, void *desc, - fi_addr_t src_addr, void *context); -ssize_t usdf_msg_recvv(struct fid_ep *ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, void *context); -ssize_t usdf_msg_recvmsg(struct fid_ep *ep, const struct fi_msg *msg, - uint64_t flags); - -ssize_t usdf_msg_send(struct fid_ep *ep, const void *buf, size_t len, - void *desc, fi_addr_t src_addr, void *context); -ssize_t usdf_msg_sendv(struct fid_ep *ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, void *context); -ssize_t usdf_msg_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, - uint64_t flags); - -ssize_t 
usdf_msg_inject(struct fid_ep *ep, const void *buf, size_t len, - fi_addr_t src_addr); - - -ssize_t usdf_msg_rx_size_left(struct fid_ep *fep); -ssize_t usdf_msg_tx_size_left(struct fid_ep *fep); - -#endif /* _USDF_MSG_H_ */ diff --git a/prov/usnic/src/usdf_pep.c b/prov/usnic/src/usdf_pep.c index a04430815fd..0d4811e279d 100644 --- a/prov/usnic/src/usdf_pep.c +++ b/prov/usnic/src/usdf_pep.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2019, Cisco Systems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include #include @@ -65,7 +65,6 @@ #include "usdf.h" #include "usdf_endpoint.h" #include "usdf_cm.h" -#include "usdf_msg.h" static int usdf_pep_bind(fid_t fid, fid_t bfid, uint64_t flags) @@ -143,8 +142,8 @@ usdf_pep_creq_epoll_del(struct usdf_connreq *crp) pep = crp->cr_pep; if (crp->cr_pollitem.pi_rtn != NULL) { - ret = epoll_ctl(pep->pep_fabric->fab_epollfd, EPOLL_CTL_DEL, - crp->cr_sockfd, NULL); + ret = ofi_epoll_del(pep->pep_fabric->fab_epollfd, + crp->cr_sockfd); crp->cr_pollitem.pi_rtn = NULL; if (ret != 0) { ret = -errno; @@ -231,7 +230,6 @@ usdf_pep_listen_cb(void *v) struct usdf_pep *pep; struct sockaddr_in sin; struct usdf_connreq *crp; - struct epoll_event ev; socklen_t socklen; int ret; int s; @@ -267,13 +265,11 @@ usdf_pep_listen_cb(void *v) crp->cr_pollitem.pi_rtn = usdf_pep_read_connreq; crp->cr_pollitem.pi_context = crp; - ev.events = EPOLLIN; - ev.data.ptr = &crp->cr_pollitem; - ret = epoll_ctl(pep->pep_fabric->fab_epollfd, EPOLL_CTL_ADD, - crp->cr_sockfd, &ev); - if (ret == -1) { - usdf_cm_report_failure(crp, -errno, false); + ret = ofi_epoll_add(pep->pep_fabric->fab_epollfd, crp->cr_sockfd, + OFI_EPOLL_IN, &crp->cr_pollitem); + if (ret) { + usdf_cm_report_failure(crp, ret, false); return 0; } @@ -286,7 +282,6 @@ static int usdf_pep_listen(struct fid_pep *fpep) { struct usdf_pep *pep; - struct epoll_event ev; struct usdf_fabric *fp; struct sockaddr_in *sin; socklen_t socklen; @@ -360,10 +355,10 @@ usdf_pep_listen(struct fid_pep *fpep) pep->pep_pollitem.pi_rtn = usdf_pep_listen_cb; pep->pep_pollitem.pi_context = pep; - ev.events = EPOLLIN; - ev.data.ptr = &pep->pep_pollitem; - ret = epoll_ctl(fp->fab_epollfd, EPOLL_CTL_ADD, pep->pep_sock, &ev); - if (ret == -1) { + ret = ofi_epoll_add(fp->fab_epollfd, pep->pep_sock, OFI_EPOLL_IN, + &pep->pep_pollitem); + if (ret) { + errno = -ret; goto fail; } @@ -401,9 +396,6 @@ static int usdf_pep_reject_async(void *vreq) crp->cr_resid -= ret; crp->cr_ptr += ret; - if (crp->cr_resid == 0) - usdf_cm_msg_connreq_cleanup(crp); - return FI_SUCCESS; } @@ -412,7 +404,6 @@ static int usdf_pep_reject(struct fid_pep *fpep, fid_t handle, const void *param { struct usdf_pep *pep; struct usdf_connreq *crp; - struct epoll_event event; struct usdf_connreq_msg *reqp; int ret; @@ -456,16 +447,9 @@ static int usdf_pep_reject(struct fid_pep *fpep, fid_t handle, const void *param crp->cr_pollitem.pi_rtn = usdf_pep_reject_async; crp->cr_pollitem.pi_context = crp; - event.events = EPOLLOUT; - event.data.ptr = &crp->cr_pollitem; - - ret = epoll_ctl(pep->pep_fabric->fab_epollfd, EPOLL_CTL_ADD, - crp->cr_sockfd, &event); - - if (ret) - return -errno; - - return FI_SUCCESS; + ret = ofi_epoll_add(pep->pep_fabric->fab_epollfd, crp->cr_sockfd, + OFI_EPOLL_OUT, &crp->cr_pollitem); + return ret; } static void @@ 
-703,10 +687,6 @@ usdf_pep_open(struct fid_fabric *fabric, struct fi_info *info, return -FI_ENODEV; } - if ((info->caps & ~USDF_MSG_CAPS) != 0) { - return -FI_EBADF; - } - switch (info->addr_format) { case FI_SOCKADDR: case FI_SOCKADDR_IN: diff --git a/prov/usnic/src/usdf_progress.c b/prov/usnic/src/usdf_progress.c index 4d6e3186d90..47204a175da 100644 --- a/prov/usnic/src/usdf_progress.c +++ b/prov/usnic/src/usdf_progress.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include @@ -88,12 +88,12 @@ void * usdf_fabric_progression_thread(void *v) { struct usdf_fabric *fp; - struct epoll_event ev; struct usdf_poll_item *pip; struct usdf_domain *dom; int num_blocked_waiting; int sleep_time; - int epfd; + ofi_epoll_t epfd; + void *context; int ret; int n; @@ -111,14 +111,14 @@ usdf_fabric_progression_thread(void *v) sleep_time = -1; } - n = epoll_wait(epfd, &ev, 1, sleep_time); - if (fp->fab_exit || (n == -1 && errno != EINTR)) { + n = ofi_epoll_wait(epfd, &context, 1, sleep_time); + if (fp->fab_exit || (n < 0 && n != EINTR)) { pthread_exit(NULL); } /* consume event if there was one */ if (n == 1) { - pip = ev.data.ptr; + pip = context; ret = pip->pi_rtn(pip->pi_context); if (ret != 0) { pthread_exit(NULL); diff --git a/prov/usnic/src/usdf_rdm.c b/prov/usnic/src/usdf_rdm.c deleted file mode 100644 index d2eee05db3e..00000000000 --- a/prov/usnic/src/usdf_rdm.c +++ /dev/null @@ -1,1640 +0,0 @@ -/* - * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
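(Illustration only, not part of the patch.) With the usdf_progress.c conversion above, the fabric progression thread no longer reads a struct epoll_event; ofi_epoll_wait() returns the context pointer that was registered with ofi_epoll_add(). A condensed sketch of one iteration of the new consumption loop, assuming the usdf_poll_item layout used elsewhere in this diff (a pi_rtn callback plus a pi_context argument):

/* One iteration of usdf_fabric_progression_thread() after the ofi_epoll_wait() conversion. */
void *context;
struct usdf_poll_item *pip;
int n, ret;

n = ofi_epoll_wait(fp->fab_epollfd, &context, 1, sleep_time);
if (n == 1) {
	pip = context;                       /* pointer passed to ofi_epoll_add() earlier */
	ret = pip->pi_rtn(pip->pi_context);  /* dispatch to the registered handler */
	if (ret != 0)
		pthread_exit(NULL);
}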
- */ - -#include "config.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include "ofi.h" - -#include "usd.h" -#include "usd_post.h" - -#include "usdf.h" -#include "usdf_rudp.h" -#include "usdf_rdm.h" -#include "usdf_timer.h" -#include "usdf_av.h" -#include "usdf_progress.h" - -/* Functions to add and remove entries from the free list for the transmit and - * receive work queues. - */ -static struct usdf_rdm_qe *usdf_rdm_get_tx_wqe(struct usdf_tx *tx) -{ - struct usdf_rdm_qe *entry; - - entry = TAILQ_FIRST(&tx->t.rdm.tx_free_wqe); - TAILQ_REMOVE(&tx->t.rdm.tx_free_wqe, entry, rd_link); - tx->t.rdm.tx_num_free_wqe -= 1; - - return entry; -} - -static void usdf_rdm_put_tx_wqe(struct usdf_tx *tx, struct usdf_rdm_qe *wqe) -{ - TAILQ_INSERT_HEAD(&tx->t.rdm.tx_free_wqe, wqe, rd_link); - tx->t.rdm.tx_num_free_wqe += 1; -} - -static struct usdf_rdm_qe *usdf_rdm_get_rx_rqe(struct usdf_rx *rx) -{ - struct usdf_rdm_qe *entry; - - entry = TAILQ_FIRST(&rx->r.rdm.rx_free_rqe); - TAILQ_REMOVE(&rx->r.rdm.rx_free_rqe, entry, rd_link); - rx->r.rdm.rx_num_free_rqe -= 1; - - return entry; -} - -static void usdf_rdm_put_rx_rqe(struct usdf_rx *rx, struct usdf_rdm_qe *rqe) -{ - TAILQ_INSERT_HEAD(&rx->r.rdm.rx_free_rqe, rqe, rd_link); - rx->r.rdm.rx_num_free_rqe += 1; -} - -/******************************************************************************/ - -static inline void -usdf_rdm_rdc_ready(struct usdf_rdm_connection *rdc, struct usdf_tx *tx) -{ - /* skip if we have pending send messages */ - if (!TAILQ_EMPTY(&rdc->dc_wqe_sent)) { - USDF_DBG_SYS(EP_DATA, "SKIP rdc %p ready due to pending wqe\n", rdc); - return; - } - if (!TAILQ_ON_LIST(rdc, dc_tx_link)) { - rdc->dc_fairness_credits = USDF_RDM_FAIRNESS_CREDITS; - TAILQ_INSERT_TAIL(&tx->t.rdm.tx_rdc_ready, rdc, dc_tx_link); - - /* Make sure TX is on domain ready list */ - if (!TAILQ_ON_LIST(tx, tx_link)) { - TAILQ_INSERT_TAIL(&tx->tx_domain->dom_tx_ready, - tx, tx_link); - } - } - else { - USDF_DBG_SYS(EP_DATA, "RDC %p already on list\n", rdc); - } -} - -static inline uint16_t -usdf_rdm_rdc_hash_helper(uint32_t ipaddr, uint16_t port) -{ - uint16_t hash_index; - - uint16_t lower = (ipaddr & 0xFFFF); - uint16_t upper = (ipaddr >> 16); - - hash_index = lower; - hash_index ^= upper; - hash_index ^= port; - - return hash_index & USDF_RDM_HASH_MASK; -} - - -static inline uint16_t -usdf_rdm_rdc_hash_hdr(struct usd_udp_hdr *hdr) -{ - return usdf_rdm_rdc_hash_helper(hdr->uh_ip.saddr, hdr->uh_udp.source); -} - -static inline int -usdf_rdm_rdc_hdr_match(struct usdf_rdm_connection *rdc, struct usd_udp_hdr *hdr) -{ - return hdr->uh_ip.saddr == rdc->dc_hdr.uh_ip.daddr && - hdr->uh_udp.source == rdc->dc_hdr.uh_udp.dest; -} - -static inline int -usdf_rdm_rdc_addr_match(struct usdf_rdm_connection *rdc, uint32_t ipaddr, - uint16_t port) -{ - return ipaddr == rdc->dc_hdr.uh_ip.daddr && - port == rdc->dc_hdr.uh_udp.dest; -} - -/* - * Find a matching RDM connection on this domain - */ -static inline struct usdf_rdm_connection * -usdf_rdm_rdc_addr_lookup(struct usdf_domain *udp, uint32_t ipaddr, - uint16_t port) -{ - uint16_t hash_index; - struct usdf_rdm_connection *rdc; - - hash_index = usdf_rdm_rdc_hash_helper(ipaddr, port); - - rdc = udp->dom_rdc_hashtab[hash_index]; - - while (rdc != NULL) { - if (usdf_rdm_rdc_addr_match(rdc, ipaddr, port)) { - return rdc; - } - rdc = rdc->dc_hash_next; - } - - return NULL; -} - -/* - * Find a matching RDM connection on this 
domain - */ -static inline struct usdf_rdm_connection * -usdf_rdm_rdc_hdr_lookup(struct usdf_domain *udp, struct usd_udp_hdr *hdr) -{ - uint16_t hash_index; - struct usdf_rdm_connection *rdc; - - hash_index = usdf_rdm_rdc_hash_hdr(hdr); - - rdc = udp->dom_rdc_hashtab[hash_index]; - - while (rdc != NULL) { - if (usdf_rdm_rdc_hdr_match(rdc, hdr)) { - return rdc; - } - rdc = rdc->dc_hash_next; - } - - return NULL; -} - -/* - * Insert rdc into domain hash table - */ -static inline void -usdf_rdm_rdc_insert(struct usdf_domain *udp, struct usdf_rdm_connection *rdc) -{ - uint16_t hash_index; - - hash_index = usdf_rdm_rdc_hash_helper(rdc->dc_hdr.uh_ip.daddr, - rdc->dc_hdr.uh_udp.dest); - USDF_DBG_SYS(EP_DATA, "insert rdc %p at %u\n", rdc, hash_index); - - rdc->dc_hash_next = udp->dom_rdc_hashtab[hash_index]; - udp->dom_rdc_hashtab[hash_index] = rdc; -} - -static inline void -usdf_rdm_rdc_remove(struct usdf_domain *udp, struct usdf_rdm_connection *rdc) -{ - uint16_t hash_index; - struct usdf_rdm_connection *prev; - - hash_index = usdf_rdm_rdc_hash_helper(rdc->dc_hdr.uh_ip.daddr, - rdc->dc_hdr.uh_udp.dest); - USDF_DBG_SYS(EP_DATA, "remove rdc %p from %u\n", rdc, hash_index); - - if (udp->dom_rdc_hashtab[hash_index] == rdc) { - udp->dom_rdc_hashtab[hash_index] = rdc->dc_hash_next; - } else { - prev = udp->dom_rdc_hashtab[hash_index]; - while (prev->dc_hash_next != rdc) { - prev = prev->dc_hash_next; - } - prev->dc_hash_next = rdc->dc_hash_next; - } -} - -/* - * Get a new RDC from domain list. - */ -static inline struct usdf_rdm_connection * -usdf_rdc_alloc(struct usdf_domain *udp) -{ - struct usdf_rdm_connection *rdc; - - if (SLIST_EMPTY(&udp->dom_rdc_free)) { - return NULL; // XXX alloc a new batch - } else { - rdc = SLIST_FIRST(&udp->dom_rdc_free); - SLIST_REMOVE_HEAD(&udp->dom_rdc_free, dc_addr_link); - ofi_atomic_dec32(&udp->dom_rdc_free_cnt); - } - return rdc; -} - -/* - * Get an RDM connection for this send. If there is a connection for this - * TX queue already attached to this destination, use that. - * If not, check to see if one if in the connection cache (possibly put - * there by receive). If there is not one there either, grab a new one - * and put it in the cache and also attch to this dest. - */ -static inline struct usdf_rdm_connection * -usdf_rdm_rdc_tx_get(struct usdf_dest *dest, struct usdf_ep *ep) -{ - struct usdf_rdm_connection *rdc; - struct usdf_tx *tx; - struct usdf_rx *rx; - struct usd_qp_impl *qp; - struct usdf_domain *udp; - - tx = ep->ep_tx; - rx = ep->ep_rx; - - SLIST_FOREACH(rdc, &dest->ds_rdm_rdc_list, dc_addr_link) { - if (rdc->dc_tx == tx) { - return rdc; - } - } - - udp = tx->tx_domain; - rdc = usdf_rdm_rdc_addr_lookup(udp, - dest->ds_dest.ds_dest.ds_udp.u_hdr.uh_ip.daddr, - dest->ds_dest.ds_dest.ds_udp.u_hdr.uh_udp.dest); - - if (rdc == NULL) { - rdc = usdf_rdc_alloc(udp); - if (rdc == NULL) { - return NULL; - } - memcpy(&rdc->dc_hdr, - &dest->ds_dest.ds_dest.ds_udp.u_hdr, - sizeof(rdc->dc_hdr)); - - qp = to_qpi(rx->rx_qp); - rdc->dc_tx = tx; - rdc->dc_hdr.uh_udp.source = - qp->uq_attrs.uqa_local_addr.ul_addr.ul_udp.u_addr.sin_port; - - usdf_rdm_rdc_insert(udp, rdc); - - /* start eviction timer */ - usdf_timer_set(tx->tx_domain->dom_fabric, rdc->dc_timer, - USDF_RDM_RDC_TIMEOUT); - } - - /* Add to list for this dest */ - SLIST_INSERT_HEAD(&dest->ds_rdm_rdc_list, rdc, dc_addr_link); - rdc->dc_dest = dest; - rdc->dc_seq_credits = USDF_RUDP_SEQ_CREDITS; - rdc->dc_next_tx_seq = 0; - - return rdc; -} - -/* - * See if there is matching connectoin in hash table. 
If not, grab a new one. - */ -static inline struct usdf_rdm_connection * -usdf_rdm_rdc_rx_get(struct usdf_rx *rx, struct rudp_pkt *pkt) -{ - struct usdf_rdm_connection *rdc; - struct usdf_domain *udp; - struct usdf_tx *tx; - - udp = rx->rx_domain; - tx = rx->r.rdm.rx_tx; - - /* if pkt->msg.src_peer_id != 0, live connection, just look up */ - - rdc = usdf_rdm_rdc_hdr_lookup(udp, &pkt->hdr); - if (rdc == NULL) { - rdc = usdf_rdc_alloc(udp); - if (rdc == NULL) { - return NULL; - } - - memcpy(&rdc->dc_hdr, pkt, sizeof(rdc->dc_hdr)); - memcpy(rdc->dc_hdr.uh_eth.ether_shost, - pkt->hdr.uh_eth.ether_dhost, ETH_ALEN); - memcpy(rdc->dc_hdr.uh_eth.ether_dhost, - pkt->hdr.uh_eth.ether_shost, ETH_ALEN); - rdc->dc_hdr.uh_ip.saddr = pkt->hdr.uh_ip.daddr; - rdc->dc_hdr.uh_ip.daddr = pkt->hdr.uh_ip.saddr; - rdc->dc_hdr.uh_udp.dest = pkt->hdr.uh_udp.source; - rdc->dc_hdr.uh_udp.source = pkt->hdr.uh_udp.dest; - - rdc->dc_next_rx_seq = 0; - rdc->dc_tx = tx; - usdf_rdm_rdc_insert(udp, rdc); - - /* start eviction timer */ - usdf_timer_set(tx->tx_domain->dom_fabric, rdc->dc_timer, - USDF_RDM_RDC_TIMEOUT); - } - return rdc; -} - -/* - * Rewind a queue entry by "rewind" packets - */ -static inline void -usdf_rdm_rewind_qe(struct usdf_rdm_qe *qe, size_t rewind, size_t mtu) -{ - size_t cur_resid; - size_t cur_iov; - size_t bytes; - size_t len; - - if (qe->rd_resid == 0) { - bytes = qe->rd_length % mtu; - cur_resid = 0; - } else { - bytes = mtu; - cur_resid = qe->rd_iov_resid; - } - bytes += (rewind - 1) * mtu; - qe->rd_resid += bytes; - - cur_iov = qe->rd_cur_iov; - while (bytes > 0) { - len = qe->rd_iov[cur_iov].iov_len - cur_resid; - if (len >= bytes) { - len = bytes; - cur_resid += len; - } else { - --cur_iov; - cur_resid = 0; - } - bytes -= len; - } - - qe->rd_cur_iov = cur_iov; - qe->rd_cur_ptr = ((uint8_t *)qe->rd_iov[cur_iov].iov_base) + - qe->rd_iov[cur_iov].iov_len - cur_resid; - qe->rd_iov_resid = cur_resid; -} - -/* - * semi-native rx buffer post, i want to eventually avoid using the - * vnic_*() calls - */ -static inline int -_usdf_rdm_post_recv(struct usdf_rx *rx, void *buf, size_t len) -{ - struct usd_rq *rq; - struct vnic_rq *vrq; - struct rq_enet_desc *desc; - struct usd_qp_impl *qp; - - qp = to_qpi(rx->rx_qp); - rq = &qp->uq_rq; - vrq = &rq->urq_vnic_rq; - - rq->urq_context[rq->urq_post_index] = buf; - rq->urq_post_index = (rq->urq_post_index + 1) - & rq->urq_post_index_mask; - - desc = rq->urq_next_desc; - rq_enet_desc_enc(desc, (dma_addr_t) buf, - RQ_ENET_TYPE_ONLY_SOP, len); - wmb(); - iowrite32(rq->urq_post_index, &vrq->ctrl->posted_index); - - rq->urq_next_desc = (struct rq_enet_desc *) - ((uintptr_t)rq->urq_desc_ring - + ((rq->urq_post_index)<<4)); - rq->urq_recv_credits -= 1; - - return 0; -} - -/* - * Allow external access to the inline - */ -int -usdf_rdm_post_recv(struct usdf_rx *rx, void *buf, size_t len) -{ - return _usdf_rdm_post_recv(rx, buf, len); -} - -ssize_t -usdf_rdm_recv(struct fid_ep *fep, void *buf, size_t len, - void *desc, fi_addr_t src_addr, void *context) -{ - struct usdf_ep *ep; - struct usdf_rx *rx; - struct usdf_rdm_qe *rqe; - struct usdf_domain *udp; - - ep = ep_ftou(fep); - rx = ep->ep_rx; - udp = ep->ep_domain; - - if (TAILQ_EMPTY(&rx->r.rdm.rx_free_rqe)) { - return -FI_EAGAIN; - } - - pthread_spin_lock(&udp->dom_progress_lock); - - rqe = usdf_rdm_get_rx_rqe(rx); - - rqe->rd_context = context; - rqe->rd_iov[0].iov_base = buf; - rqe->rd_iov[0].iov_len = len; - rqe->rd_last_iov = 0; - - rqe->rd_cur_iov = 0; - rqe->rd_cur_ptr = buf; - rqe->rd_iov_resid = len; - 
rqe->rd_length = 0; - rqe->rd_resid = len; - USDF_DBG_SYS(EP_DATA, "RECV post rqe=%p len=%lu\n", rqe, len); - - TAILQ_INSERT_TAIL(&rx->r.rdm.rx_posted_rqe, rqe, rd_link); - - pthread_spin_unlock(&udp->dom_progress_lock); - - return 0; -} - -static inline ssize_t _usdf_rdm_recv_vector(struct fid_ep *fep, - const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, - void *context, uint64_t flags) -{ - struct usdf_ep *ep; - struct usdf_rx *rx; - struct usdf_rdm_qe *rqe; - struct usdf_domain *udp; - size_t tot_len; - size_t i; - - ep = ep_ftou(fep); - rx = ep->ep_rx; - udp = ep->ep_domain; - - if (flags & ~USDF_RDM_SUPP_RECVMSG_FLAGS) { - USDF_DBG_SYS(EP_DATA, - "one or more flags in 0x%" PRIx64 " not supported\n", - flags); - return -FI_EOPNOTSUPP; - } - - if (TAILQ_EMPTY(&rx->r.rdm.rx_free_rqe)) - return -FI_EAGAIN; - - pthread_spin_lock(&udp->dom_progress_lock); - - rqe = usdf_rdm_get_rx_rqe(rx); - - tot_len = 0; - for (i = 0; i < count; i++) { - rqe->rd_iov[i].iov_base = iov[i].iov_base; - rqe->rd_iov[i].iov_len = iov[i].iov_len; - tot_len += iov[i].iov_len; - } - - rqe->rd_context = context; - rqe->rd_cur_iov = 0; - rqe->rd_iov_resid = iov[0].iov_len; - rqe->rd_last_iov = count - 1; - rqe->rd_cur_ptr = iov[0].iov_base; - rqe->rd_resid = tot_len; - rqe->rd_length = 0; - - rqe->rd_signal_comp = ep->ep_rx_dflt_signal_comp || - (flags & FI_COMPLETION) ? 1 : 0; - - TAILQ_INSERT_TAIL(&rx->r.rdm.rx_posted_rqe, rqe, rd_link); - - pthread_spin_unlock(&udp->dom_progress_lock); - - return FI_SUCCESS; -} - -ssize_t usdf_rdm_recvv(struct fid_ep *fep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, void *context) -{ - struct usdf_ep *ep = ep_ftou(fep); - - return _usdf_rdm_recv_vector(fep, iov, desc, count, src_addr, context, - ep->ep_rx->rx_attr.op_flags); -} - -ssize_t usdf_rdm_recvmsg(struct fid_ep *fep, const struct fi_msg *msg, - uint64_t flags) -{ - return _usdf_rdm_recv_vector(fep, msg->msg_iov, msg->desc, - msg->iov_count, msg->addr, msg->context, flags); -} - -ssize_t -usdf_rdm_send(struct fid_ep *fep, const void *buf, size_t len, void *desc, - fi_addr_t dest_addr, void *context) -{ - struct usdf_ep *ep; - struct usdf_tx *tx; - struct usdf_rdm_qe *wqe; - struct usdf_domain *udp; - struct usdf_dest *dest; - struct usdf_rdm_connection *rdc; - uint32_t msg_id; - uint64_t op_flags; - - ep = ep_ftou(fep); - tx = ep->ep_tx; - udp = ep->ep_domain; - dest = (struct usdf_dest *)dest_addr; - - if (TAILQ_EMPTY(&tx->t.rdm.tx_free_wqe)) { - return -FI_EAGAIN; - } - - pthread_spin_lock(&udp->dom_progress_lock); - - rdc = usdf_rdm_rdc_tx_get(dest, ep); - if (rdc == NULL) { - pthread_spin_unlock(&udp->dom_progress_lock); - return -FI_EAGAIN; - } - - wqe = usdf_rdm_get_tx_wqe(tx); - - wqe->rd_context = context; - - msg_id = ofi_atomic_inc32(&tx->t.rdm.tx_next_msg_id); - wqe->rd_msg_id_be = htonl(msg_id); - - wqe->rd_iov[0].iov_base = (void *)buf; - wqe->rd_iov[0].iov_len = len; - wqe->rd_last_iov = 0; - - wqe->rd_cur_iov = 0; - wqe->rd_cur_ptr = buf; - wqe->rd_iov_resid = len; - wqe->rd_resid = len; - wqe->rd_length = len; - - op_flags = ep->ep_tx->tx_attr.op_flags; - wqe->rd_signal_comp = ep->ep_tx_dflt_signal_comp || - (op_flags & FI_COMPLETION); - - /* add send to TX list */ - TAILQ_INSERT_TAIL(&rdc->dc_wqe_posted, wqe, rd_link); - usdf_rdm_rdc_ready(rdc, tx); - - pthread_spin_unlock(&udp->dom_progress_lock); - USDF_DBG_SYS(EP_DATA, "SEND posted len=%lu, ID = %d\n", len, msg_id); - - usdf_domain_progress(udp); - - return 0; -} - -static inline size_t 
_usdf_iov_len(const struct iovec *iov, size_t count) -{ - size_t len; - size_t i; - - for (i = 0, len = 0; i < count; i++) - len += iov[i].iov_len; - - return len; -} - -static inline ssize_t _usdf_rdm_send_vector(struct fid_ep *fep, - const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, - void *context, uint64_t flags) -{ - struct usdf_rdm_connection *rdc; - struct usdf_rdm_qe *wqe; - struct usdf_domain *udp; - struct usdf_dest *dest; - struct usdf_ep *ep; - struct usdf_tx *tx; - uint32_t msg_id; - size_t tot_len; - size_t i; - - ep = ep_ftou(fep); - tx = ep->ep_tx; - udp = ep->ep_domain; - dest = (struct usdf_dest *) dest_addr; - - if (flags & ~USDF_RDM_SUPP_SENDMSG_FLAGS) { - USDF_DBG_SYS(EP_DATA, - "one or more flags in %#" PRIx64 " not supported\n", - flags); - return -FI_EOPNOTSUPP; - } - - if (TAILQ_EMPTY(&tx->t.rdm.tx_free_wqe)) - return -FI_EAGAIN; - - /* check for inject overrun before acquiring lock and allocating msg id, - * easier to unwind this way */ - if (flags & FI_INJECT) { - tot_len = _usdf_iov_len(iov, count); - if (tot_len > USDF_RDM_MAX_INJECT_SIZE) { - USDF_DBG_SYS(EP_DATA, "max inject len exceeded (%zu)\n", - tot_len); - return -FI_EINVAL; - } - } - - pthread_spin_lock(&udp->dom_progress_lock); - - rdc = usdf_rdm_rdc_tx_get(dest, ep); - if (rdc == NULL) { - pthread_spin_unlock(&udp->dom_progress_lock); - return -FI_EAGAIN; - } - - wqe = usdf_rdm_get_tx_wqe(tx); - - tot_len = 0; - if (flags & FI_INJECT) { - /* copy to the wqe's tiny injection buffer */ - for (i = 0; i < count; ++i) { - assert(tot_len + iov[i].iov_len <= - USDF_RDM_MAX_INJECT_SIZE); - memcpy(&wqe->rd_inject_buf[tot_len], iov[i].iov_base, - iov[i].iov_len); - tot_len += iov[i].iov_len; - } - - wqe->rd_iov[0].iov_base = wqe->rd_inject_buf; - wqe->rd_iov[0].iov_len = tot_len; - wqe->rd_last_iov = 0; - } else { - for (i = 0; i < count; ++i) { - wqe->rd_iov[i].iov_base = iov[i].iov_base; - wqe->rd_iov[i].iov_len = iov[i].iov_len; - tot_len += iov[i].iov_len; - } - wqe->rd_last_iov = count - 1; - } - - msg_id = ofi_atomic_inc32(&tx->t.rdm.tx_next_msg_id); - - wqe->rd_msg_id_be = htonl(msg_id); - wqe->rd_context = context; - wqe->rd_cur_iov = 0; - wqe->rd_cur_ptr = iov[0].iov_base; - wqe->rd_iov_resid = iov[0].iov_len; - wqe->rd_resid = tot_len; - wqe->rd_length = tot_len; - - wqe->rd_signal_comp = - ep->ep_tx_dflt_signal_comp || (flags & FI_COMPLETION); - - /* add send to TX list */ - TAILQ_INSERT_TAIL(&rdc->dc_wqe_posted, wqe, rd_link); - usdf_rdm_rdc_ready(rdc, tx); - - pthread_spin_unlock(&udp->dom_progress_lock); - USDF_DBG_SYS(EP_DATA, "posted len=%lu, ID=%d\n", tot_len, - msg_id); - - usdf_domain_progress(udp); - - return FI_SUCCESS; -} - -ssize_t usdf_rdm_sendv(struct fid_ep *fep, const struct iovec *iov, void **desc, - size_t count, fi_addr_t dest_addr, void *context) -{ - struct usdf_ep *ep = ep_ftou(fep); - - return _usdf_rdm_send_vector(fep, iov, desc, count, dest_addr, context, - ep->ep_tx->tx_attr.op_flags); -} - -ssize_t usdf_rdm_sendmsg(struct fid_ep *fep, const struct fi_msg *msg, - uint64_t flags) -{ - return _usdf_rdm_send_vector(fep, msg->msg_iov, msg->desc, - msg->iov_count, msg->addr, msg->context, flags); -} - -ssize_t -usdf_rdm_inject(struct fid_ep *fep, const void *buf, size_t len, - fi_addr_t dest_addr) -{ - struct usdf_ep *ep; - struct usdf_tx *tx; - struct usdf_rdm_qe *wqe; - struct usdf_domain *udp; - struct usdf_dest *dest; - struct usdf_rdm_connection *rdc; - uint32_t msg_id; - - ep = ep_ftou(fep); - tx = ep->ep_tx; - udp = ep->ep_domain; - dest = (struct 
usdf_dest *)dest_addr; - - if (len > USDF_RDM_MAX_INJECT_SIZE) { - USDF_DBG_SYS(EP_DATA, "max inject len exceeded (%zu)\n", len); - return -FI_EINVAL; - } - - if (TAILQ_EMPTY(&tx->t.rdm.tx_free_wqe)) { - return -FI_EAGAIN; - } - - pthread_spin_lock(&udp->dom_progress_lock); - - rdc = usdf_rdm_rdc_tx_get(dest, ep); - if (rdc == NULL) { - pthread_spin_unlock(&udp->dom_progress_lock); - return -FI_EAGAIN; - } - - wqe = usdf_rdm_get_tx_wqe(tx); - wqe->rd_context = NULL; - msg_id = ofi_atomic_inc32(&tx->t.rdm.tx_next_msg_id); - wqe->rd_msg_id_be = htonl(msg_id); - - memcpy(wqe->rd_inject_buf, buf, len); - wqe->rd_iov[0].iov_base = wqe->rd_inject_buf; - wqe->rd_iov[0].iov_len = len; - wqe->rd_last_iov = 0; - - wqe->rd_cur_iov = 0; - wqe->rd_cur_ptr = wqe->rd_inject_buf; - wqe->rd_iov_resid = len; - wqe->rd_resid = len; - wqe->rd_length = len; - - /* inject never generates a completion */ - wqe->rd_signal_comp = 0; - - /* add send to TX list */ - TAILQ_INSERT_TAIL(&rdc->dc_wqe_posted, wqe, rd_link); - usdf_rdm_rdc_ready(rdc, tx); - - pthread_spin_unlock(&udp->dom_progress_lock); - USDF_DBG_SYS(EP_DATA, "INJECT posted len=%lu, ID = %d\n", len, msg_id); - - usdf_domain_progress(udp); - - return 0; -} - -/* - * All segments send, stall this TXD until message completely ACKed - */ -static inline void -usdf_rdm_send_sent(struct usdf_tx *tx, struct usdf_rdm_connection *rdc) -{ - struct usdf_rdm_qe *wqe; - - wqe = TAILQ_FIRST(&rdc->dc_wqe_posted); - TAILQ_REMOVE(&rdc->dc_wqe_posted, wqe, rd_link); - TAILQ_INSERT_TAIL(&rdc->dc_wqe_sent, wqe, rd_link); - -#if 0 - /* remove this RDC from TX */ -if (!TAILQ_ON_LIST(rdc, dc_tx_link) abort(); - TAILQ_REMOVE_MARK(&tx->t.rdm.tx_rdc_ready, rdc, dc_tx_link); -#endif -} - -static inline void -usdf_rdm_send_segment(struct usdf_tx *tx, struct usdf_rdm_connection *rdc) -{ - struct rudp_pkt *hdr; - struct usdf_rdm_qe *wqe; - struct usd_qp_impl *qp; - struct usd_wq *wq; - uint32_t index; - size_t cur_iov; - size_t cur_resid; - size_t resid; - const uint8_t *cur_ptr; - const uint8_t *send_ptr; - size_t sent; - uint8_t *ptr; - struct usd_wq_post_info *info; - uint16_t opcode; - - wqe = TAILQ_FIRST(&rdc->dc_wqe_posted); - qp = to_qpi(tx->tx_qp); - wq = &(qp->uq_wq); - - index = wq->uwq_post_index; - hdr = (struct rudp_pkt *)(wq->uwq_copybuf + index * USD_SEND_MAX_COPY); - - memcpy(hdr, &rdc->dc_hdr, sizeof(struct usd_udp_hdr)); - - resid = wqe->rd_resid; - cur_iov = wqe->rd_cur_iov; - cur_ptr = wqe->rd_cur_ptr; - cur_resid = wqe->rd_iov_resid; - - if (cur_ptr == wqe->rd_iov[0].iov_base) { - opcode = RUDP_OP_FIRST; - } else { - opcode = RUDP_OP_MID; - } - - if (resid < USD_SEND_MAX_COPY - sizeof(*hdr)) { - opcode |= RUDP_OP_LAST; - hdr->msg.opcode = htons(opcode); - hdr->msg.msg_id = wqe->rd_msg_id_be; - hdr->msg.m.rc_data.length = htons(resid); - hdr->msg.m.rc_data.seqno = htons(rdc->dc_next_tx_seq); - ++rdc->dc_next_tx_seq; - - ptr = (uint8_t *)(hdr + 1); - sent = resid; - while (resid > 0) { - memcpy(ptr, cur_ptr, cur_resid); - ptr += cur_resid; - resid -= cur_resid; - ++cur_iov; - cur_ptr = wqe->rd_iov[cur_iov].iov_base; - cur_resid = wqe->rd_iov[cur_iov].iov_len; - } - - /* add packet lengths */ - hdr->hdr.uh_ip.tot_len = htons( - sent + sizeof(struct rudp_pkt) - - sizeof(struct ether_header)); - hdr->hdr.uh_udp.len = htons( - (sizeof(struct rudp_pkt) - - sizeof(struct ether_header) - - sizeof(struct iphdr)) + sent); - USDF_DBG_SYS(EP_DATA, "TX 1seg=%lu, s/i = %u/%u\n", sent, ntohs(hdr->msg.m.rc_data.seqno), ntohl(hdr->msg.msg_id)); - - index = _usd_post_send_one(wq, 
hdr, - sent + sizeof(*hdr), 1); - } else { - struct vnic_wq *vwq; - u_int8_t offload_mode = 0, eop; - u_int16_t mss = 7, header_length = 0, vlan_tag = 0; - u_int8_t vlan_tag_insert = 0, loopback = 0, fcoe_encap = 0; - struct wq_enet_desc *desc; - size_t space; - size_t num_sge; - size_t sge_len; - - vwq = &wq->uwq_vnic_wq; - desc = wq->uwq_next_desc; - space = tx->tx_domain->dom_fabric->fab_dev_attrs->uda_mtu - - sizeof(*hdr); - num_sge = 1; - - /* encode header desc */ - eop = 0; - wq_enet_desc_enc(desc, (uintptr_t)hdr, sizeof(*hdr), - mss, header_length, offload_mode, eop, 0, fcoe_encap, - vlan_tag_insert, vlan_tag, loopback); - - do { - desc = (struct wq_enet_desc *) - ((uintptr_t)wq->uwq_desc_ring + (index << 4)); - index = (index + 1) & wq->uwq_post_index_mask; - - send_ptr = cur_ptr; - if (cur_resid >= space) { - sge_len = space; - eop = 1; - cur_resid -= sge_len; - cur_ptr += sge_len; - } else { - sge_len = cur_resid; - if (num_sge == USDF_RDM_MAX_SGE || - cur_resid == resid) { - eop = 1; - } - ++cur_iov; - cur_ptr = wqe->rd_iov[cur_iov].iov_base; - cur_resid = wqe->rd_iov[cur_iov].iov_len; - } - - wq_enet_desc_enc(desc, (uintptr_t)send_ptr, sge_len, - mss, header_length, offload_mode, eop, eop, - fcoe_encap, vlan_tag_insert, - vlan_tag, loopback); - - ++num_sge; - space -= sge_len; - resid -= sge_len; - } while (space > 0 && num_sge <= USDF_RDM_MAX_SGE && resid > 0); - - /* add packet lengths */ - sent = tx->tx_domain->dom_fabric->fab_dev_attrs->uda_mtu - - sizeof(*hdr) - space; -//printf("SEND sent=%lu resid=%lu\n", sent, resid); - hdr->hdr.uh_ip.tot_len = htons( - sent + sizeof(struct rudp_pkt) - - sizeof(struct ether_header)); - hdr->hdr.uh_udp.len = htons( - (sizeof(struct rudp_pkt) - - sizeof(struct ether_header) - - sizeof(struct iphdr)) + sent); -#if 0 -if ((random() % 177) == 0 && resid == 0) { - hdr->hdr.uh_eth.ether_type = 0; -//printf("BORK seq %u, ID %u\n", rdc->dc_next_tx_seq, ntohl(wqe->rd_msg_id_be)); -} -#endif - - if (resid == 0) { - opcode |= RUDP_OP_LAST; - } - hdr->msg.opcode = htons(opcode); - hdr->msg.msg_id = wqe->rd_msg_id_be; - hdr->msg.m.rc_data.length = htons(sent); - hdr->msg.m.rc_data.seqno = htons(rdc->dc_next_tx_seq); - ++rdc->dc_next_tx_seq; - USDF_DBG_SYS(EP_DATA, "TX sge=%lu, s/i = %u/%u\n", sent, ntohs(hdr->msg.m.rc_data.seqno), ntohl(hdr->msg.msg_id)); - - wmb(); - iowrite64(index, &vwq->ctrl->posted_index); - - wq->uwq_next_desc = (struct wq_enet_desc *) - ((uintptr_t)wq->uwq_desc_ring + (index << 4)); - wq->uwq_post_index = (index + 1) & wq->uwq_post_index_mask; - wq->uwq_send_credits -= num_sge; - } - - info = &wq->uwq_post_info[index]; - info->wp_context = tx; - info->wp_len = sent; - - /* If send complete, wait for last ack on this message */ - if (resid == 0) { - wqe->rd_resid = 0; - usdf_rdm_send_sent(tx, rdc); - } else { - wqe->rd_resid = resid; - wqe->rd_iov_resid = cur_resid; - wqe->rd_cur_iov = cur_iov; - wqe->rd_cur_ptr = cur_ptr; - } - - /* set ack timer */ - usdf_timer_set(tx->tx_domain->dom_fabric, rdc->dc_timer, - USDF_RUDP_ACK_TIMEOUT); -} - -static inline void -usdf_rdm_send_ack(struct usdf_tx *tx, struct usdf_rdm_connection *rdc) -{ - struct rudp_pkt *hdr; - struct usd_wq *wq; - uint32_t last_post; - struct usd_wq_post_info *info; - uint16_t seq; - - wq = &(to_qpi(tx->tx_qp)->uq_wq); - hdr = (struct rudp_pkt *) (wq->uwq_copybuf + - wq->uwq_post_index * USD_SEND_MAX_COPY); - - memcpy(hdr, &rdc->dc_hdr, sizeof(struct usd_udp_hdr)); - - if (rdc->dc_send_nak) { - hdr->msg.opcode = htons(RUDP_OP_NAK); - seq = rdc->dc_ack_seq + 1; - 
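/*
 * Illustrative sketch, not part of the patch: usdf_rdm_send_segment() above
 * and usdf_rdm_send_ack() below fill the on-wire length fields the same
 * way, so the arithmetic is worth spelling out once.  The helper name is an
 * assumption and it relies on the provider's struct rudp_pkt layout
 * (Ethernet + IP + UDP + RUDP message header); payload is the RUDP payload
 * size in bytes (0 for a bare ACK/NAK).
 */
static inline void rudp_set_lengths(struct rudp_pkt *pkt, size_t payload)
{
	/* IP total length covers everything after the Ethernet header */
	pkt->hdr.uh_ip.tot_len = htons(payload + sizeof(struct rudp_pkt) -
				       sizeof(struct ether_header));
	/* UDP length covers everything after the IP header */
	pkt->hdr.uh_udp.len = htons(payload + sizeof(struct rudp_pkt) -
				    sizeof(struct ether_header) -
				    sizeof(struct iphdr));
}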
hdr->msg.m.nak.nak_seq = htons(seq); - rdc->dc_send_nak = 0; - USDF_DBG_SYS(EP_DATA, "TX NAK seq=%d\n", seq); - } else { - hdr->msg.opcode = htons(RUDP_OP_ACK); - seq = rdc->dc_ack_seq; - hdr->msg.m.ack.ack_seq = htons(seq); - USDF_DBG_SYS(EP_DATA, "TXACK seq=%u:%u\n", seq, rdc->dc_rx_msg_id); - } - hdr->msg.msg_id = htonl(rdc->dc_ack_msg_id); - - /* add packet lengths */ - hdr->hdr.uh_ip.tot_len = htons( - sizeof(struct rudp_pkt) - - sizeof(struct ether_header)); - hdr->hdr.uh_udp.len = htons(sizeof(struct rudp_pkt) - - sizeof(struct ether_header) - sizeof(struct iphdr)); - - last_post = _usd_post_send_one(wq, hdr, sizeof(*hdr), 1); - - info = &wq->uwq_post_info[last_post]; - info->wp_context = tx; - info->wp_len = 0; -} - -/* - * If this TX has sends to do and is not on domain ready list, then - * this completion means we can go back on the domain ready list - */ -static void -usdf_rdm_send_completion(struct usd_completion *comp) -{ - struct usdf_tx *tx; - - tx = comp->uc_context; - - if (!TAILQ_EMPTY(&tx->t.rdm.tx_rdc_ready) && - !TAILQ_ON_LIST(tx, tx_link)) { - TAILQ_INSERT_TAIL(&tx->tx_domain->dom_tx_ready, tx, tx_link); - } -} - -/* - * Keep progressing sends on this queue until: - * a) no more send credits on the queue (it's full) - * or - * b) all endpoints are complete or blocked awaiting ACKs - */ -void -usdf_rdm_tx_progress(struct usdf_tx *tx) -{ - struct usdf_rdm_connection *rdc; - struct usd_qp_impl *qp; - - qp = to_qpi(tx->tx_qp); - while (qp->uq_wq.uwq_send_credits > 1 && - !TAILQ_EMPTY(&tx->t.rdm.tx_rdc_have_acks)) { - rdc = TAILQ_FIRST(&tx->t.rdm.tx_rdc_have_acks); - TAILQ_REMOVE_MARK(&tx->t.rdm.tx_rdc_have_acks, - rdc, dc_ack_link); - - usdf_rdm_send_ack(tx, rdc); - } - - while (qp->uq_wq.uwq_send_credits > 1 && - !TAILQ_EMPTY(&tx->t.rdm.tx_rdc_ready)) { - rdc = TAILQ_FIRST(&tx->t.rdm.tx_rdc_ready); - - /* - * Send next segment on this connection. This will also - * remove the current WQE from the RDC list if it - * completes. 
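 * Each segment also consumes one sequence credit (the per-peer budget of
 * unacked packets, replenished as ACKs arrive); while a message still has
 * segments left it consumes a fairness credit as well, and when those run
 * out the connection is rotated to the tail of the ready list so other
 * peers get a turn.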
- */ - usdf_rdm_send_segment(tx, rdc); - - --rdc->dc_seq_credits; - if (!TAILQ_EMPTY(&rdc->dc_wqe_sent)) { - TAILQ_REMOVE_MARK(&tx->t.rdm.tx_rdc_ready, - rdc, dc_tx_link); - } else if (TAILQ_EMPTY(&rdc->dc_wqe_posted)) { - TAILQ_REMOVE_MARK(&tx->t.rdm.tx_rdc_ready, - rdc, dc_tx_link); - } else { - --rdc->dc_fairness_credits; - if (rdc->dc_seq_credits == 0) { - TAILQ_REMOVE_MARK(&tx->t.rdm.tx_rdc_ready, - rdc, dc_tx_link); - rdc->dc_fairness_credits = - USDF_RDM_FAIRNESS_CREDITS; - - /* fairness credits exhausted, go to back of the line */ - } else if (rdc->dc_fairness_credits == 0) { - TAILQ_REMOVE(&tx->t.rdm.tx_rdc_ready, - rdc, dc_tx_link); - TAILQ_INSERT_TAIL(&tx->t.rdm.tx_rdc_ready, - rdc, dc_tx_link); - rdc->dc_fairness_credits = - USDF_RDM_FAIRNESS_CREDITS; - } - } - } -} - -static inline void usdf_rdm_recv_complete(struct usdf_rx *rx, - struct usdf_rdm_connection *rdc, - struct usdf_rdm_qe *rqe, int status) -{ - struct usdf_cq_hard *hcq; - - USDF_DBG_SYS(EP_DATA, "RECV complete ID=%u len=%lu with status %d\n", - rdc->dc_rx_msg_id, rqe->rd_length, status); - hcq = rx->r.rdm.rx_hcq; - hcq->cqh_post(hcq, rqe->rd_context, rqe->rd_length, status, - FI_MSG | FI_RECV); - - usdf_rdm_put_rx_rqe(rx, rqe); - - rdc->dc_cur_rqe = NULL; -} - -static inline void -usdf_rdm_rdc_has_ack(struct usdf_rdm_connection *rdc) -{ - struct usdf_tx *tx; - struct usdf_domain *udp; - - if (!TAILQ_ON_LIST(rdc, dc_ack_link)) { - tx = rdc->dc_tx; - udp = tx->tx_domain; - TAILQ_INSERT_TAIL(&tx->t.rdm.tx_rdc_have_acks, rdc, - dc_ack_link); - /* Add TX to domain list if not present */ - if (!TAILQ_ON_LIST(tx, tx_link)) { - TAILQ_INSERT_TAIL(&udp->dom_tx_ready, tx, tx_link); - } - } -} - -static inline void -usdf_set_ack_nak(struct usdf_rdm_connection *rdc, uint32_t msg_id, - uint16_t seq, uint16_t nak) -{ - /* if newly on list or msg_id > cur, use all new values */ - if (!TAILQ_ON_LIST(rdc, dc_ack_link) || - RUDP_MSGID_GT(msg_id, rdc->dc_ack_msg_id)) { - rdc->dc_ack_msg_id = msg_id; - rdc->dc_ack_seq = seq; - rdc->dc_send_nak = nak; - - /* If same msg_id and new seq, use new seq */ - } else if (msg_id == rdc->dc_ack_msg_id && - RUDP_SEQ_GE(seq, rdc->dc_ack_seq)) { - rdc->dc_ack_seq = seq; - rdc->dc_send_nak = nak; - } - - usdf_rdm_rdc_has_ack(rdc); -} - -static inline void -usdf_set_ack(struct usdf_rdm_connection *rdc, uint32_t msg_id, uint16_t seq) -{ - usdf_set_ack_nak(rdc, msg_id, seq, 0); -} - -static inline void -usdf_set_nak(struct usdf_rdm_connection *rdc, uint32_t msg_id, uint16_t seq) -{ - usdf_set_ack_nak(rdc, msg_id, seq, 1); -} - -static inline struct usdf_rdm_qe * -usdf_rdm_check_seq_id(struct usdf_rdm_connection *rdc, struct usdf_rx *rx, - struct rudp_pkt *pkt) -{ - uint16_t seq; - uint32_t msg_id; - int32_t msg_delta; - struct usdf_rdm_qe *rqe; - - seq = ntohs(pkt->msg.m.rc_data.seqno); - msg_id = ntohl(pkt->msg.msg_id); - if (rdc->dc_flags & USDF_DCF_NEW_RX) { - msg_delta = 1; - } else { - msg_delta = RUDP_SEQ_DIFF(msg_id, rdc->dc_rx_msg_id); - } - rqe = rdc->dc_cur_rqe; - USDF_DBG_SYS(EP_DATA, "RXSEQ %u:%u, msg_delt=%d, rqe=%p\n", seq, msg_id, msg_delta, rqe); - - /* old message ID */ - if (msg_delta < 0) { - return NULL; /* just DROP */ - - /* current message ID */ - } else if (msg_delta == 0) { - if (RUDP_SEQ_LT(seq, rdc->dc_next_rx_seq)) { - USDF_DBG_SYS(EP_DATA, "old SEQ, ACK %u\n", (uint16_t)(rdc->dc_next_rx_seq)); - usdf_set_ack(rdc, msg_id, rdc->dc_next_rx_seq); - } else if (seq == rdc->dc_next_rx_seq) { - USDF_DBG_SYS(EP_DATA, "old SEQ, ACK %u\n", (uint16_t)(rdc->dc_next_rx_seq)); - 
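/* seq matches dc_next_rx_seq: the expected in-order packet for the
 * current message, so ack it and advance the receive window. */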
usdf_set_ack(rdc, msg_id, rdc->dc_next_rx_seq); - ++rdc->dc_next_rx_seq; - } else { - USDF_DBG_SYS(EP_DATA, "future SEQ, NAK %u\n", rdc->dc_next_rx_seq); - usdf_set_nak(rdc, msg_id, rdc->dc_next_rx_seq - 1); - rqe = NULL; - } - - /* future message ID */ - } else { - if (rqe != NULL) { - return NULL; /* DROP */ - } else if (seq != 0) { - usdf_set_nak(rdc, msg_id, -1); - } else if (TAILQ_EMPTY(&rx->r.rdm.rx_posted_rqe)) { - USDF_WARN_SYS(EP_DATA, "RX overrun?????\n"); /* XXX */ - usdf_set_nak(rdc, msg_id, -1); - } else { - rqe = TAILQ_FIRST(&rx->r.rdm.rx_posted_rqe); - TAILQ_REMOVE(&rx->r.rdm.rx_posted_rqe, rqe, rd_link); - rdc->dc_flags &= ~USDF_DCF_NEW_RX; - rdc->dc_cur_rqe = rqe; - rdc->dc_rx_msg_id = msg_id; - usdf_set_ack(rdc, msg_id, 0); - rdc->dc_next_rx_seq = 1; - USDF_DBG_SYS(EP_DATA, "start new msg, rqe=%p\n", rqe); - } - } - return rqe; -} - -static inline void -usdf_rdm_process_ack(struct usdf_rdm_connection *rdc, - struct usdf_tx *tx, uint16_t seq, uint32_t msg_id) -{ - struct usdf_cq_hard *hcq; - struct usdf_rdm_qe *wqe; - struct usdf_fabric *fp; - uint16_t max_ack; - unsigned credits; - - /* find assocoated send, drop if none */ - if (!TAILQ_EMPTY(&rdc->dc_wqe_sent)) { - wqe = TAILQ_FIRST(&rdc->dc_wqe_sent); - } else if (!TAILQ_EMPTY(&rdc->dc_wqe_posted)) { - wqe = TAILQ_FIRST(&rdc->dc_wqe_posted); - } else { - USDF_DBG_SYS(EP_DATA, "ACK no WQEs\n"); - return; - } - - /* drop if not for this message */ - if (msg_id != ntohl(wqe->rd_msg_id_be)) { - USDF_DBG_SYS(EP_DATA, "ACK ID %u != %u\n", msg_id, ntohl(wqe->rd_msg_id_be)); - return; - } - - /* don't try to ACK what we don't think we've sent */ - max_ack = rdc->dc_next_tx_seq - 1; - USDF_DBG_SYS(EP_DATA, "ACK %u max = %u\n", seq, max_ack); - if (RUDP_SEQ_GT(seq, max_ack)) { - seq = max_ack; - } - - credits = RUDP_SEQ_DIFF(seq, rdc->dc_last_rx_ack); - if (rdc->dc_seq_credits == 0 && credits > 0 && - !TAILQ_EMPTY(&rdc->dc_wqe_posted)) { - usdf_rdm_rdc_ready(rdc, tx); - } - rdc->dc_seq_credits += credits; - rdc->dc_last_rx_ack = seq; - - /* - * Look at the current send - if this ACK is for the last sequence we - * have sent and the message is fully sent, post a completion and move - * on to the next send. - */ - fp = tx->tx_domain->dom_fabric; - if (seq == max_ack) { - hcq = tx->t.rdm.tx_hcq; - if (!TAILQ_EMPTY(&rdc->dc_wqe_sent)) { - if (wqe->rd_resid == 0) { - TAILQ_REMOVE(&rdc->dc_wqe_sent, wqe, rd_link); - USDF_DBG_SYS(EP_DATA, "send ID=%u complete\n", msg_id); - if (wqe->rd_signal_comp) - hcq->cqh_post(hcq, wqe->rd_context, - wqe->rd_length, - FI_SUCCESS, - FI_MSG | FI_SEND); - - usdf_rdm_put_tx_wqe(tx, wqe); - - /* prepare for next message */ - rdc->dc_next_tx_seq = 0; - rdc->dc_last_rx_ack = rdc->dc_next_tx_seq - 1; - USDF_DBG_SYS(EP_DATA, "posted %s, sent %s\n", TAILQ_EMPTY(&rdc->dc_wqe_posted)?"empty":"occupied", TAILQ_EMPTY(&rdc->dc_wqe_sent)?"empty":"occupied"); - if (!TAILQ_EMPTY(&rdc->dc_wqe_posted)) { - usdf_rdm_rdc_ready(rdc, tx); - } - } - } - - /* revert to eviction timeout */ - usdf_timer_reset(fp, rdc->dc_timer, USDF_RDM_RDC_TIMEOUT); - } else { - usdf_timer_reset(fp, rdc->dc_timer, USDF_RUDP_ACK_TIMEOUT); - } -} - -static inline void -usdf_rdm_process_nak(struct usdf_rdm_connection *rdc, struct usdf_tx *tx, - uint16_t seq, uint32_t msg_id) -{ - struct usdf_rdm_qe *wqe; - struct usdf_fabric *fp; - uint32_t wqe_msg_id; - int rewind; - - /* Ignore NAKs of future packets */ - /* XXX or non-matching msg id */ - - /* In unconnected case, only one msg in flight. 
If wqe_sent != NULL, - * apply to that, else apply to wqe_posted - */ - if (!TAILQ_EMPTY(&rdc->dc_wqe_sent)) { - wqe = TAILQ_FIRST(&rdc->dc_wqe_sent); - wqe_msg_id = ntohl(wqe->rd_msg_id_be); - USDF_DBG_SYS(EP_DATA, "NAK %u:%u, next = %u:%u\n", seq, msg_id, rdc->dc_next_tx_seq, wqe_msg_id); - if (msg_id != wqe_msg_id) { - return; - } - TAILQ_REMOVE(&rdc->dc_wqe_sent, wqe, rd_link); - TAILQ_INSERT_HEAD(&rdc->dc_wqe_posted, wqe, rd_link); - } else if (!TAILQ_EMPTY(&rdc->dc_wqe_posted)) { - wqe = TAILQ_FIRST(&rdc->dc_wqe_posted); - wqe_msg_id = ntohl(wqe->rd_msg_id_be); - USDF_DBG_SYS(EP_DATA, "NAK %u:%u, next = %u:%u (posted)\n", seq, msg_id, rdc->dc_next_tx_seq, wqe_msg_id); - if (msg_id != wqe_msg_id) { - return; - } - } else { - USDF_DBG_SYS(EP_DATA, "NAK Nothing send or posted\n"); - return; - } - - /* reset WQE to old sequence # */ - rewind = RUDP_SEQ_DIFF(rdc->dc_next_tx_seq, seq); - USDF_DBG_SYS(EP_DATA, "rewind = %d\n", rewind); - if (rewind > 0) { - rdc->dc_seq_credits = USDF_RUDP_SEQ_CREDITS; - rdc->dc_next_tx_seq = seq; - - fp = rdc->dc_tx->tx_domain->dom_fabric; - usdf_rdm_rewind_qe(wqe, rewind, - fp->fab_dev_attrs->uda_mtu - sizeof(struct rudp_pkt)); - - usdf_rdm_rdc_ready(rdc, tx); - } -} - -/* - * RDC timeout could be because of needing to retransmit a packet, or it - * could be cache eviction timer - */ -void -usdf_rdm_rdc_timeout(void *vrdc) -{ - struct usdf_rdm_connection *rdc; - struct usdf_rdm_qe *wqe; - struct usdf_domain *udp; - struct usdf_dest *dest; - uint16_t nak; - - rdc = vrdc; - udp = rdc->dc_tx->tx_domain; - USDF_DBG_SYS(EP_DATA, "RDC timer fire\n"); - - pthread_spin_lock(&udp->dom_progress_lock); - - if (!TAILQ_EMPTY(&rdc->dc_wqe_sent)) { - wqe = TAILQ_FIRST(&rdc->dc_wqe_sent); - goto gotnak; - } else if (!TAILQ_EMPTY(&rdc->dc_wqe_posted)) { - wqe = TAILQ_FIRST(&rdc->dc_wqe_posted); - goto gotnak; - - /* If inactive, remove from hash list */ - } else if (rdc->dc_cur_rqe == NULL && - !TAILQ_ON_LIST(rdc, dc_tx_link) && - !TAILQ_ON_LIST(rdc, dc_ack_link)) { - - dest = rdc->dc_dest; - if (dest != NULL) { - SLIST_REMOVE(&dest->ds_rdm_rdc_list, rdc, - usdf_rdm_connection, dc_addr_link); - } - - rdc->dc_dest = NULL; - rdc->dc_flags = USDF_DCS_UNCONNECTED | USDF_DCF_NEW_RX; - rdc->dc_next_rx_seq = 0; - usdf_rdm_rdc_remove(udp, rdc); - - SLIST_INSERT_HEAD(&udp->dom_rdc_free, rdc, dc_addr_link); - ofi_atomic_inc32(&udp->dom_rdc_free_cnt); - - } else { - usdf_timer_set(udp->dom_fabric, rdc->dc_timer, - USDF_RDM_RDC_TIMEOUT); - } - goto done; - -gotnak: - /* wqe set above */ - nak = rdc->dc_last_rx_ack + 1; - USDF_DBG_SYS(EP_DATA, "TIMEOUT nak=%u:%u\n", nak, ntohl(wqe->rd_msg_id_be)); - usdf_rdm_process_nak(rdc, rdc->dc_tx, nak, ntohl(wqe->rd_msg_id_be)); - -done: - pthread_spin_unlock(&udp->dom_progress_lock); -} - -static inline void -usdf_rdm_rx_ack(struct usdf_rdm_connection *rdc, struct usdf_tx *tx, - struct rudp_pkt *pkt) -{ - uint16_t seq; - uint32_t msg_id; - - seq = ntohs(pkt->msg.m.nak.nak_seq); - msg_id = ntohl(pkt->msg.msg_id); - USDF_DBG_SYS(EP_DATA, "RXACK %u:%u\n", seq, msg_id); - usdf_rdm_process_ack(rdc, tx, seq, msg_id); -} - -static inline void -usdf_rdm_rx_nak(struct usdf_rdm_connection *rdc, struct usdf_tx *tx, - struct rudp_pkt *pkt) -{ - uint16_t seq; - uint32_t msg_id; - - seq = ntohs(pkt->msg.m.nak.nak_seq); - msg_id = ntohl(pkt->msg.msg_id); - usdf_rdm_process_ack(rdc, tx, seq - 1, msg_id); - - usdf_rdm_process_nak(rdc, tx, seq, msg_id); -} - -/* - * Handle a receive on a queue servicing a message endpoint - */ -static inline void 
-usdf_rdm_handle_recv(struct usdf_domain *udp, struct usd_completion *comp) -{ - struct rudp_pkt *pkt; - struct usdf_rdm_qe *rqe; - struct usdf_rdm_connection *rdc; - struct usd_qp *qp; - struct usdf_rx *rx; - uint32_t opcode; - uint8_t *rx_ptr; - uint8_t *rqe_ptr; - size_t cur_iov; - size_t iov_resid; - size_t rd_resid; - size_t rxlen; - size_t copylen; - - qp = comp->uc_qp; - rx = qp->uq_context; - pkt = comp->uc_context; - opcode = ntohs(pkt->msg.opcode); - - rdc = usdf_rdm_rdc_rx_get(rx, pkt); - if (rdc == NULL) { - goto repost; - } -//printf("RX opcode=%u\n", opcode); - - if (comp->uc_status != USD_COMPSTAT_SUCCESS) - goto repost; - - switch (opcode) { - case RUDP_OP_ACK: - usdf_rdm_rx_ack(rdc, rx->r.rdm.rx_tx, pkt); - goto repost; - - case RUDP_OP_NAK: - usdf_rdm_rx_nak(rdc, rx->r.rdm.rx_tx, pkt); - goto repost; - default: - break; - } - - if ((opcode & ~RUDP_OP_DATA_MASK) != 0) { - goto repost; - } - - /* check sequence # and msg_id */ - rqe = usdf_rdm_check_seq_id(rdc, rx, pkt); - if (rqe == NULL) { - goto repost; - } - - /* Consume the data in the packet */ - rxlen = ntohs(pkt->msg.m.rc_data.length); - rqe->rd_length += rxlen; - - rx_ptr = (uint8_t *)(pkt + 1); - rqe_ptr = (uint8_t *)rqe->rd_cur_ptr; - iov_resid = rqe->rd_iov_resid; - cur_iov = rqe->rd_cur_iov; - rd_resid = rqe->rd_resid; - while (rxlen > 0) { - copylen = MIN(rxlen, iov_resid); - memcpy(rqe_ptr, rx_ptr, copylen); - rx_ptr += copylen; - rxlen -= copylen; - iov_resid -= copylen; - rd_resid -= copylen; - if (iov_resid == 0) { - if (cur_iov == rqe->rd_last_iov) { - break; - } - ++cur_iov; - rqe_ptr = rqe->rd_iov[cur_iov].iov_base; - iov_resid = rqe->rd_iov[cur_iov].iov_len; - } else { - rqe_ptr += copylen; - } - } - - rqe->rd_cur_ptr = rqe_ptr; - rqe->rd_iov_resid = iov_resid; - rqe->rd_cur_iov = cur_iov; - rqe->rd_resid = rd_resid; - - if (rxlen > 0) { - USDF_DBG_SYS(EP_DATA, "RQE truncated by %zu bytes\n", rxlen); - rqe->rd_length -= rxlen; - usdf_rdm_recv_complete(rx, rdc, rqe, FI_ETRUNC); - } else if (opcode & RUDP_OP_LAST) { - usdf_rdm_recv_complete(rx, rdc, rqe, FI_SUCCESS); - } - -repost: - /* repost buffer */ - _usdf_rdm_post_recv(rx, pkt, - rx->rx_domain->dom_fabric->fab_dev_attrs->uda_mtu); -} - -/* - * Process message completions - */ -void -usdf_rdm_hcq_progress(struct usdf_cq_hard *hcq) -{ - struct usd_completion comp; - int loop; - - loop = 100; - while (loop-- > 0 && usd_poll_cq(hcq->cqh_ucq, &comp) != -EAGAIN) { - switch (comp.uc_type) { - case USD_COMPTYPE_SEND: - usdf_rdm_send_completion(&comp); - break; - case USD_COMPTYPE_RECV: - usdf_rdm_handle_recv(hcq->cqh_cq->cq_domain, &comp); - break; - } - } -} - -ssize_t usdf_rdm_rx_size_left(struct fid_ep *fep) -{ - struct usdf_ep *ep; - struct usdf_rx *rx; - - USDF_DBG_SYS(EP_DATA, "\n"); - - ep = ep_ftou(fep); - rx = ep->ep_rx; - - if (!(ep->flags & USDF_EP_ENABLED)) - return -FI_EOPBADSTATE; - - return rx->r.rdm.rx_num_free_rqe; -} - -ssize_t usdf_rdm_tx_size_left(struct fid_ep *fep) -{ - struct usdf_ep *ep; - struct usdf_tx *tx; - - USDF_DBG_SYS(EP_DATA, "\n"); - - ep = ep_ftou(fep); - tx = ep->ep_tx; - - if (!(ep->flags & USDF_EP_ENABLED)) - return -FI_EOPBADSTATE; - - return tx->t.rdm.tx_num_free_wqe; -} diff --git a/prov/usnic/src/usdf_rdm.h b/prov/usnic/src/usdf_rdm.h deleted file mode 100644 index 001ed7a7e14..00000000000 --- a/prov/usnic/src/usdf_rdm.h +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (c) 2014-2017, Cisco Systems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -#ifndef _USDF_RDM_H_ -#define _USDF_RDM_H_ - -#define USDF_RDM_CAPS (FI_MSG | FI_SOURCE | FI_SEND | FI_RECV) - -#define USDF_RDM_SUPP_MODE (FI_LOCAL_MR) - -#define USDF_RDM_SUPP_SENDMSG_FLAGS \ - (FI_INJECT_COMPLETE | FI_TRANSMIT_COMPLETE | FI_INJECT | FI_COMPLETION) -#define USDF_RDM_SUPP_RECVMSG_FLAGS (FI_COMPLETION) - -#define USDF_RDM_MAX_SGE 8 -#define USDF_RDM_DFLT_SGE 8 -#define USDF_RDM_MAX_CTX_SIZE 1024 -#define USDF_RDM_DFLT_CTX_SIZE 128 - -#define USDF_RDM_MAX_MSG UINT_MAX - -#define USDF_RDM_MAX_INJECT_SIZE 64 -#define USDF_RDM_IOV_LIMIT (USDF_RDM_DFLT_SGE) -#define USDF_RDM_RMA_IOV_LIMIT 0 -#define USDF_RDM_MR_IOV_LIMIT (USDF_MR_IOV_LIMIT) -#define USDF_RDM_MR_CNT (USDF_MR_CNT) - -#define USDF_RDM_CNTR_CNT 0 - -#define USDF_RDM_MSG_ORDER (FI_ORDER_NONE) -#define USDF_RDM_COMP_ORDER (FI_ORDER_NONE) - -#define USDF_RDM_FREE_BLOCK (16 * 1024) -#define USDF_RDM_HASH_SIZE (64 * 1024) -#define USDF_RDM_HASH_MASK (USDF_RDM_HASH_SIZE - 1) -#define USDF_RDM_FAIRNESS_CREDITS 16 - -#define USDF_RDM_RUDP_SEQ_CREDITS 256 - -#define USDF_RDM_RDC_TIMEOUT 1000 /* ms */ - -struct usdf_rdm_qe { - void *rd_context; - uint32_t rd_msg_id_be; - - struct iovec rd_iov[USDF_RDM_MAX_SGE]; - size_t rd_last_iov; - size_t rd_length; - - size_t rd_cur_iov; - const uint8_t *rd_cur_ptr; - size_t rd_resid; /* amount remaining in entire rdm */ - size_t rd_iov_resid; /* amount remaining in current iov */ - - /* points at buffer no larger than USDF_RDM_MAX_INJECT_SIZE */ - uint8_t *rd_inject_buf; - - uint8_t rd_signal_comp; - - TAILQ_ENTRY(usdf_rdm_qe) rd_link; - - struct usdf_rdm_connection *rd_conn; -}; - -/* - * RDM connection state - */ -enum { - USDF_DCS_UNCONNECTED = 0, - USDF_DCS_CONNECTING = 1, - USDF_DCS_CONNECTED = 2 -}; - -#define USDF_DCF_STATE_BITS 0x03 -#define USDF_DCF_NEW_RX 0x04 - -/* - * We're only connectionless to the app. - * This connection struct is used to manage messages in flight. 
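 * Each connection caches the resolved UDP header for its peer and tracks
 * the per-peer reliability state: TX/RX sequence numbers, the lists of
 * posted and sent-but-unacked WQEs, and any pending ACK/NAK to emit.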
- */ -struct usdf_rdm_connection { - ofi_atomic32_t dc_refcnt; - - struct usdf_tx *dc_tx; - struct usd_udp_hdr dc_hdr; - uint16_t dc_flags; - struct usdf_timer_entry *dc_timer; - - /* RX state */ - uint32_t dc_rx_msg_id; - struct usdf_rdm_qe *dc_cur_rqe; - uint16_t dc_next_rx_seq; - uint16_t dc_send_nak; - uint32_t dc_ack_msg_id; - uint16_t dc_ack_seq; - TAILQ_ENTRY(usdf_rdm_connection) dc_ack_link; - - /* TX state */ - struct usdf_dest *dc_dest; - TAILQ_HEAD(,usdf_rdm_qe) dc_wqe_posted; - TAILQ_HEAD(,usdf_rdm_qe) dc_wqe_sent; - uint16_t dc_next_tx_seq; - uint16_t dc_last_rx_ack; - size_t dc_fairness_credits; - size_t dc_seq_credits; - TAILQ_ENTRY(usdf_rdm_connection) dc_tx_link; - - SLIST_ENTRY(usdf_rdm_connection) dc_addr_link; - struct usdf_rdm_connection *dc_hash_next; -}; - -int usdf_rdm_fill_ep_attr(const struct fi_info *hints, struct fi_info *fi, - struct usd_device_attrs *dap); -int usdf_rdm_fill_dom_attr(uint32_t version, const struct fi_info *hints, - struct fi_info *fi, struct usd_device_attrs *dap); -int usdf_rdm_fill_tx_attr(uint32_t version, const struct fi_info *hints, - struct fi_info *fi); -int usdf_rdm_fill_rx_attr(uint32_t version, const struct fi_info *hints, - struct fi_info *fi); - -int usdf_rdm_post_recv(struct usdf_rx *rx, void *buf, size_t len); -int usdf_cq_rdm_poll(struct usd_cq *ucq, struct usd_completion *comp); -void usdf_rdm_rdc_timeout(void *vrdc); - -void usdf_rdm_hcq_progress(struct usdf_cq_hard *hcq); -void usdf_rdm_tx_progress(struct usdf_tx *tx); - -/* fi_ops_cm for RC */ -int usdf_cm_rdm_connect(struct fid_ep *ep, const void *addr, - const void *param, size_t paramlen); -int usdf_cm_rdm_accept(struct fid_ep *fep, const void *param, size_t paramlen); -int usdf_cm_rdm_shutdown(struct fid_ep *ep, uint64_t flags); - -/* fi_ops_rdm for RC */ -ssize_t usdf_rdm_recv(struct fid_ep *ep, void *buf, size_t len, void *desc, - fi_addr_t src_addr, void *context); -ssize_t usdf_rdm_recvv(struct fid_ep *fep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, void *context); -ssize_t usdf_rdm_recvmsg(struct fid_ep *fep, const struct fi_msg *msg, - uint64_t flags); - -ssize_t usdf_rdm_send(struct fid_ep *ep, const void *buf, size_t len, - void *desc, fi_addr_t src_addr, void *context); -ssize_t usdf_rdm_sendv(struct fid_ep *ep, const struct iovec *iov, - void **desc, size_t count, fi_addr_t src_addr, void *context); -ssize_t usdf_rdm_sendmsg(struct fid_ep *ep, const struct fi_msg *msg, - uint64_t flags); -ssize_t usdf_rdm_inject(struct fid_ep *ep, const void *buf, size_t len, - fi_addr_t src_addr); - - -ssize_t usdf_rdm_rx_size_left(struct fid_ep *fep); -ssize_t usdf_rdm_tx_size_left(struct fid_ep *fep); - -#endif /* _USDF_RDM_H_ */ diff --git a/prov/usnic/src/usdf_socket.c b/prov/usnic/src/usdf_socket.c index e3ea5dee566..2d1f7cfe438 100644 --- a/prov/usnic/src/usdf_socket.c +++ b/prov/usnic/src/usdf_socket.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include diff --git a/prov/usnic/src/usdf_wait.c b/prov/usnic/src/usdf_wait.c index 510f99edc2e..ea575e3f705 100644 --- a/prov/usnic/src/usdf_wait.c +++ b/prov/usnic/src/usdf_wait.c @@ -172,7 +172,7 @@ int usdf_wait_open(struct fid_fabric *fabric, struct fi_wait_attr *attr, { struct usdf_wait *wait_priv; struct usdf_fabric *fabric_priv; - int epfd; + ofi_epoll_t epfd; int ret; USDF_TRACE_SYS(FABRIC, "\n"); @@ -192,10 +192,9 @@ int usdf_wait_open(struct fid_fabric *fabric, struct fi_wait_attr *attr, if (ret) goto error; - epfd = epoll_create1(0); - if (epfd < 0) { + ret = 
ofi_epoll_create(&epfd); + if (ret) { USDF_WARN_SYS(FABRIC, "failed to create epoll fd[%d]\n", errno); - ret = -errno; goto error; } @@ -228,7 +227,7 @@ int usdf_wait_open(struct fid_fabric *fabric, struct fi_wait_attr *attr, return FI_SUCCESS; calloc_fail: - close(epfd); + ofi_epoll_close(epfd); error: *waitset = NULL; return ret; @@ -258,7 +257,7 @@ static int usdf_wait_close(struct fid *waitset) switch (wait_priv->wait_obj) { case FI_WAIT_UNSPEC: case FI_WAIT_FD: - close(wait_priv->object.epfd); + ofi_epoll_close(wait_priv->object.epfd); break; default: USDF_WARN_SYS(FABRIC, @@ -275,7 +274,7 @@ static int usdf_wait_close(struct fid *waitset) static int usdf_wait_wait(struct fid_wait *fwait, int timeout) { struct usdf_wait *wait; - struct epoll_event event; + void *context; int ret = FI_SUCCESS; int nevents; @@ -290,12 +289,12 @@ static int usdf_wait_wait(struct fid_wait *fwait, int timeout) return ret; } - nevents = epoll_wait(wait->object.epfd, &event, 1, timeout); + nevents = ofi_epoll_wait(wait->object.epfd, &context, 1, timeout); if (nevents == 0) { ret = -FI_ETIMEDOUT; } else if (nevents < 0) { USDF_DBG_SYS(FABRIC, "epoll wait failed\n"); - ret = -errno; + ret = nevents; } return ret; @@ -313,7 +312,11 @@ static int usdf_wait_get_wait(struct usdf_wait *wait_priv, void *arg) switch (wait_priv->wait_obj) { case FI_WAIT_UNSPEC: case FI_WAIT_FD: +#ifdef HAVE_EPOLL *(int *) arg = wait_priv->object.epfd; +#else + return -FI_ENOSYS; +#endif break; default: USDF_DBG_SYS(FABRIC, "unsupported wait type\n"); diff --git a/prov/usnic/src/usdf_wait.h b/prov/usnic/src/usdf_wait.h index 8c6cf8b822a..c1a273ddd9f 100644 --- a/prov/usnic/src/usdf_wait.h +++ b/prov/usnic/src/usdf_wait.h @@ -45,7 +45,7 @@ struct usdf_wait { enum fi_wait_obj wait_obj; union { - int epfd; + ofi_epoll_t epfd; struct fi_mutex_cond mutex_cond; } object; diff --git a/prov/usnic/src/usnic_direct/vnic_devcmd.h b/prov/usnic/src/usnic_direct/vnic_devcmd.h index 47e06095bab..90872381c1c 100644 --- a/prov/usnic/src/usnic_direct/vnic_devcmd.h +++ b/prov/usnic/src/usnic_direct/vnic_devcmd.h @@ -820,7 +820,7 @@ struct vnic_devcmd_notify { struct vnic_devcmd_provinfo { u8 oui[3]; u8 type; - u8 data[0]; + u8 data[]; }; /* @@ -1038,7 +1038,7 @@ enum { struct filter_tlv { u_int32_t type; u_int32_t length; - u_int32_t val[0]; + u_int32_t val[]; }; /* Data for CMD_ADD_FILTER is 2 TLV and filter + action structs */ @@ -1379,9 +1379,9 @@ typedef enum { * * in: (u32) a0 = RDMA_SUBCMD_GET_STATS * - * out: (u64) a0 = IG packet count + * out: (u64) a0 = IG packet count * (u64) a1 = IG byte count - * (u64) a2 = EG packet count + * (u64) a2 = EG packet count * (u64) a3 = EG byte count */ #define RDMA_SUBCMD_GET_STATS 7 diff --git a/prov/util/src/cuda_mem_monitor.c b/prov/util/src/cuda_mem_monitor.c new file mode 100644 index 00000000000..7abfe6c8aee --- /dev/null +++ b/prov/util/src/cuda_mem_monitor.c @@ -0,0 +1,151 @@ +/* + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ofi_mr.h" + +#if HAVE_LIBCUDA + +#include "ofi_hmem.h" + +static int cuda_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr, + size_t len, union ofi_mr_hmem_info *hmem_info) +{ + CUresult ret; + + ret = ofi_cuPointerGetAttribute(&hmem_info->cuda_id, + CU_POINTER_ATTRIBUTE_BUFFER_ID, + (CUdeviceptr)addr); + if (ret == CUDA_SUCCESS) { + FI_DBG(&core_prov, FI_LOG_MR, + "Assigned CUDA buffer ID %lu to buffer %p\n", + hmem_info->cuda_id, addr); + return FI_SUCCESS; + } + + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to get CUDA buffer ID for buffer %p len %lu\n" + "cuPointerGetAttribute() failed: %s:%s\n", addr, len, + ofi_cudaGetErrorName(ret), ofi_cudaGetErrorString(ret)); + + return -FI_EFAULT; +} + +static void cuda_mm_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + /* no-op */ +} + +static bool cuda_mm_valid(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + uint64_t id; + CUresult ret; + + /* CUDA buffer IDs are associated for each CUDA monitor entry. If the + * device pages backing the device virtual address change, a different + * buffer ID is associated with this mapping. 
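 * In practice this is what lets a registration cache spot stale mappings:
 * if the range is freed and later reused by a new device allocation, the
 * new allocation carries a different buffer ID and valid() reports the
 * cached entry as no longer usable.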
+ */ + ret = ofi_cuPointerGetAttribute(&id, CU_POINTER_ATTRIBUTE_BUFFER_ID, + (CUdeviceptr)addr); + if (ret == CUDA_SUCCESS && hmem_info->cuda_id == id) { + FI_DBG(&core_prov, FI_LOG_MR, + "CUDA buffer ID %lu still valid for buffer %p\n", + hmem_info->cuda_id, addr); + return true; + } else if (ret == CUDA_SUCCESS && hmem_info->cuda_id != id) { + FI_DBG(&core_prov, FI_LOG_MR, + "CUDA buffer ID %lu invalid for buffer %p\n", + hmem_info->cuda_id, addr); + } else { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to get CUDA buffer ID for buffer %p len %lu\n" + "cuPointerGetAttribute() failed: %s:%s\n", addr, len, + ofi_cudaGetErrorName(ret), ofi_cudaGetErrorString(ret)); + } + + return false; +} + +static int cuda_monitor_start(struct ofi_mem_monitor *monitor) +{ + /* no-op */ + return FI_SUCCESS; +} + +#else + +static int cuda_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr, + size_t len, union ofi_mr_hmem_info *hmem_info) +{ + return -FI_ENOSYS; +} + +static void cuda_mm_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ +} + +static bool cuda_mm_valid(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + return false; +} + +static int cuda_monitor_start(struct ofi_mem_monitor *monitor) +{ + return -FI_ENOSYS; +} + +#endif /* HAVE_LIBCUDA */ + +void cuda_monitor_stop(struct ofi_mem_monitor *monitor) +{ + /* no-op */ +} + +static struct ofi_mem_monitor cuda_mm = { + .iface = FI_HMEM_CUDA, + .init = ofi_monitor_init, + .cleanup = ofi_monitor_cleanup, + .start = cuda_monitor_start, + .stop = cuda_monitor_stop, + .subscribe = cuda_mm_subscribe, + .unsubscribe = cuda_mm_unsubscribe, + .valid = cuda_mm_valid, +}; + +struct ofi_mem_monitor *cuda_monitor = &cuda_mm; diff --git a/prov/util/src/rocr_mem_monitor.c b/prov/util/src/rocr_mem_monitor.c new file mode 100644 index 00000000000..09b8076b4ab --- /dev/null +++ b/prov/util/src/rocr_mem_monitor.c @@ -0,0 +1,405 @@ +/* + * Copyright (c) 2020 Hewlett Packard Enterprise Development LP + * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ofi_mr.h" + +#ifdef HAVE_ROCR + +#include "ofi_tree.h" +#include "ofi_iov.h" + +#include + +struct rocr_mm_entry { + struct iovec iov; + struct ofi_rbnode *node; +}; + +struct rocr_mm { + struct ofi_mem_monitor mm; + struct ofi_rbmap *dev_region_tree; +}; + +static int rocr_mm_start(struct ofi_mem_monitor *monitor); +static void rocr_mm_stop(struct ofi_mem_monitor *monitor); +static int rocr_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr, + size_t len, union ofi_mr_hmem_info *hmem_info); +static void rocr_mm_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info); +static bool rocr_mm_valid(struct ofi_mem_monitor *monitor, const void *addr, + size_t len, union ofi_mr_hmem_info *hmem_info); + +static struct rocr_mm rocr_mm = { + .mm = { + .iface = FI_HMEM_ROCR, + .init = ofi_monitor_init, + .cleanup = ofi_monitor_cleanup, + .start = rocr_mm_start, + .stop = rocr_mm_stop, + .subscribe = rocr_mm_subscribe, + .unsubscribe = rocr_mm_unsubscribe, + .valid = rocr_mm_valid, + }, +}; + +struct ofi_mem_monitor *rocr_monitor = &rocr_mm.mm; + +static int rocr_rbmap_compare(struct ofi_rbmap *map, void *key, void *data) +{ + struct rocr_mm_entry *entry = data; + struct iovec *iov = key; + + if (ofi_iov_left(&entry->iov, iov)) + return -1; + else if (ofi_iov_right(&entry->iov, iov)) + return 1; + + /* If this fails, the ROCR memory monitor failed to have a single ROCR + * memory monitor entry per user allocated ROCR memory region. + */ + assert(ofi_iov_within(iov, &entry->iov)); + + return 0; +} + +static struct rocr_mm_entry *rocr_mm_entry_get_root(void) +{ + struct ofi_rbnode *node; + + node = ofi_rbmap_get_root(rocr_mm.dev_region_tree); + if (node) + return node->data; + return NULL; +} + +/* ROCR memory monitor entry find works by finding the node within the device + * region tree which contains the address within an entry's monitored range. + * Thus, we only need an address instead of an address and length when + * searching. + */ +static struct rocr_mm_entry *rocr_mm_entry_find(const void *addr) +{ + struct ofi_rbnode *node; + struct iovec iov = { + .iov_base = (void *) addr, + .iov_len = 1, + }; + + node = ofi_rbmap_find(rocr_mm.dev_region_tree, (void *) &iov); + if (node) + return node->data; + return NULL; +} + +/* Pointer to ROCR memory monitor entry can never be returned as user data. This + * could lead to use-after-free. Instead, address and length is always returned. + * Unsubscribe will attempt to lookup the corresponding ROCR memory monitor + * entry and will free the entry if found. + */ +static void rocr_mm_dealloc_cb(void *addr, void *user_data) +{ + size_t len = (size_t) user_data; + + pthread_rwlock_rdlock(&mm_list_rwlock); + pthread_mutex_lock(&mm_lock); + ofi_monitor_unsubscribe(rocr_monitor, addr, len, NULL); + pthread_mutex_unlock(&mm_lock); + pthread_rwlock_unlock(&mm_list_rwlock); +} + +static void rocr_mm_entry_free(struct rocr_mm_entry *entry) +{ + hsa_status_t hsa_ret __attribute__((unused)); + + FI_DBG(&core_prov, FI_LOG_MR, + "ROCR buffer address %p length %lu monitor entry freed\n", + entry->iov.iov_base, entry->iov.iov_len); + + /* Two return codes are expected. HSA_STATUS_SUCCESS is returned if the + * deallocation callback was not triggered and the entry is freed. + * HSA_STATUS_ERROR_INVALID_ARGUMENT is returned if the deallocation + * callback was triggered and the entry is freed. Any other return code + * puts the monitor in an unknown state and is fatal. 
+ */ + hsa_ret = ofi_hsa_amd_dereg_dealloc_cb(entry->iov.iov_base, + rocr_mm_dealloc_cb); + assert(hsa_ret == HSA_STATUS_SUCCESS || + hsa_ret == HSA_STATUS_ERROR_INVALID_ARGUMENT); + + ofi_rbmap_delete(rocr_mm.dev_region_tree, entry->node); + free(entry); +} + +/* Each ROCR memory monitor entry is sized to the entire user-allocated ROCR + * memory region. A single deallocation callback is registered for the memory + * region. This callback is called when the user frees the ROCR memory region. + */ +static int rocr_mm_entry_alloc(const void *addr, struct rocr_mm_entry **entry) +{ + hsa_amd_pointer_info_t hsa_info = { + .size = sizeof(hsa_info), + }; + hsa_status_t hsa_ret; + int ret; + + *entry = malloc(sizeof(**entry)); + if (!*entry) { + ret = -FI_ENOMEM; + goto err; + } + + /* Determine full ROCR memory region size. */ + hsa_ret = ofi_hsa_amd_pointer_info((void *) addr, &hsa_info, NULL, NULL, + NULL); + if (hsa_ret != HSA_STATUS_SUCCESS) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to perform hsa_amd_pointer_info: %s\n", + ofi_hsa_status_to_string(hsa_ret)); + ret = -FI_EIO; + goto err_free_entry; + } + + if (hsa_info.type != HSA_EXT_POINTER_TYPE_HSA) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Cannot monitor non-HSA allocated memory\n"); + ret = -FI_EINVAL; + goto err_free_entry; + } + + (*entry)->iov.iov_base = hsa_info.agentBaseAddress; + (*entry)->iov.iov_len = hsa_info.sizeInBytes; + + ret = ofi_rbmap_insert(rocr_mm.dev_region_tree, + (void *) &(*entry)->iov, + (void *) *entry, &(*entry)->node); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to insert into RB tree: %s\n", strerror(ret)); + goto err_free_entry; + } + + /* Register a deallocation callback for this ROCR memory region. */ + hsa_ret = ofi_hsa_amd_reg_dealloc_cb(hsa_info.agentBaseAddress, + rocr_mm_dealloc_cb, + (void *) hsa_info.sizeInBytes); + if (hsa_ret != HSA_STATUS_SUCCESS) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to perform hsa_amd_register_deallocation_callback: %s\n", + ofi_hsa_status_to_string(hsa_ret)); + + ret = -FI_EIO; + goto err_rbmap_delete_entry; + } + + FI_DBG(&core_prov, FI_LOG_MR, + "ROCR buffer address %p length %lu monitor entry allocated\n", + hsa_info.agentBaseAddress, hsa_info.sizeInBytes); + + return FI_SUCCESS; + +err_rbmap_delete_entry: + ofi_rbmap_delete(rocr_mm.dev_region_tree, (*entry)->node); +err_free_entry: + free(*entry); +err: + *entry = NULL; + return ret; +} + +static int rocr_mm_start(struct ofi_mem_monitor *monitor) +{ + rocr_mm.dev_region_tree = ofi_rbmap_create(rocr_rbmap_compare); + if (!rocr_mm.dev_region_tree) + return -FI_ENOMEM; + return FI_SUCCESS; +} + +static void rocr_mm_stop(struct ofi_mem_monitor *monitor) +{ + struct rocr_mm_entry *entry; + + while ((entry = rocr_mm_entry_get_root())) + rocr_mm_entry_free(entry); + + assert(ofi_rbmap_empty(rocr_mm.dev_region_tree)); + + ofi_rbmap_destroy(rocr_mm.dev_region_tree); +} + +static void rocr_mm_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + struct rocr_mm_entry *entry; + size_t cur_len = len; + void *cur_addr = (void *) addr; + void *next_addr; + + /* The user unsubscribe region may span multiple ROCR memory regions. + * Each ROCR memory region needs to be freed and MR caches notified. 
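 * ofi_monitor_notify() below is what propagates the invalidation to any
 * registration caches attached to this monitor before the entry is freed.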
+ */ + while (cur_len) { + entry = rocr_mm_entry_find(cur_addr); + if (!entry) + break; + + ofi_monitor_notify(rocr_monitor, entry->iov.iov_base, + entry->iov.iov_len); + + FI_DBG(&core_prov, FI_LOG_MR, + "ROCR buffer address %p length %lu unsubscribed\n", + entry->iov.iov_base, entry->iov.iov_len); + + next_addr = (void *) ((uintptr_t) ofi_iov_end(&entry->iov) + 1); + + rocr_mm_entry_free(entry); + + cur_len -= MIN((uintptr_t) next_addr - (uintptr_t) cur_addr, + cur_len); + cur_addr = next_addr; + } + + if (cur_len) + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to completely unsubscribe from address %p length %lu\n", + addr, len); +} + +/* Subscribe is designed to monitor entire ROCR memory regions even if the user + * subscribe region is smaller. All ROCR memory regions are inserted into an RB + * tree for tracking. Future subscriptions will always reuse RB tree entries if + * possible. + * + * RB tree entries can be removed in two different ways: + * 1. An unsubscribe against the memory region occurs. This will occur when ROCR + * invokes the deregistration callback. + * 2. The ROCR memory monitor is stopped. + * + * Note: The ROCR memory monitor does not impose a limit on the number of ROCR + * memory regions which can be monitored. + */ +static int rocr_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr, + size_t len, union ofi_mr_hmem_info *hmem_info) +{ + struct rocr_mm_entry *entry; + int ret = FI_SUCCESS; + size_t cur_len = len; + void *cur_addr = (void *) addr; + void *next_addr; + + /* The user subscribe region may span multiple ROCR memory regions. For + * this case, each ROCR memory region needs to be monitored. This + * requires allocating a ROCR memory monitor entry for each ROCR memory + * region. + */ + while (cur_len) { + entry = rocr_mm_entry_find(cur_addr); + if (entry) { + FI_DBG(&core_prov, FI_LOG_MR, + "Reusing monitored ROCR buffer address %p length %lu\n", + entry->iov.iov_base, entry->iov.iov_len); + } else { + /* On error, previous allocated entries are not cleaned + * up. This is harmless since these entries will either + * be cleaned up when the user frees the ROCR memory + * region or the memory monitor is stopped. + */ + ret = rocr_mm_entry_alloc(cur_addr, &entry); + if (ret != FI_SUCCESS) + break; + } + + next_addr = (void *) ((uintptr_t) ofi_iov_end(&entry->iov) + 1); + cur_len -= MIN((uintptr_t) next_addr - (uintptr_t) cur_addr, + cur_len); + cur_addr = next_addr; + } + + FI_LOG(&core_prov, ret ? 
FI_LOG_WARN : FI_LOG_DEBUG, FI_LOG_MR, + "ROCR buffer address %p length %lu subscribe status: %s\n", addr, + len, fi_strerror(-ret)); + + return ret; +} + +static bool rocr_mm_valid(struct ofi_mem_monitor *monitor, const void *addr, + size_t len, union ofi_mr_hmem_info *hmem_info) +{ + /* no-op */ + return true; +} + +#else + +static int rocr_mm_start(struct ofi_mem_monitor *monitor) +{ + return -FI_ENOSYS; +} + +static void rocr_mm_stop(struct ofi_mem_monitor *monitor) +{ +} + +static int rocr_mm_subscribe(struct ofi_mem_monitor *monitor, const void *addr, + size_t len, union ofi_mr_hmem_info *hmem_info) +{ + return -FI_ENOSYS; +} + +static void rocr_mm_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ +} + +static bool rocr_mm_valid(struct ofi_mem_monitor *monitor, const void *addr, + size_t len, union ofi_mr_hmem_info *hmem_info) +{ + return false; +} + +static struct ofi_mem_monitor rocr_mm = { + .iface = FI_HMEM_ROCR, + .init = ofi_monitor_init, + .cleanup = ofi_monitor_cleanup, + .start = rocr_mm_start, + .stop = rocr_mm_stop, + .subscribe = rocr_mm_subscribe, + .unsubscribe = rocr_mm_unsubscribe, + .valid = rocr_mm_valid, +}; + +struct ofi_mem_monitor *rocr_monitor = &rocr_mm; + +#endif /* HAVE_ROCR */ diff --git a/prov/util/src/util_atomic.c b/prov/util/src/util_atomic.c index 5a032b1c8c4..a95057011ca 100644 --- a/prov/util/src/util_atomic.c +++ b/prov/util/src/util_atomic.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2013-2017 Intel Corporation. All rights reserved. * Copyright (c) 2018 Cray Inc. All rights reserved. + * Copyright (c) 2018 System Fabric Works, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -820,7 +821,7 @@ OFI_DEFINE_ALL_HANDLERS(WRITEEXT, FUNC, OFI_OP_LXOR) OFI_DEFINE_INT_HANDLERS(WRITE, FUNC, OFI_OP_BXOR) OFI_DEFINE_ALL_HANDLERS(WRITE, FUNC, OFI_OP_WRITE) -void (*ofi_atomic_write_handlers[OFI_WRITE_OP_LAST][FI_DATATYPE_LAST]) +void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST]) (void *dst, const void *src, size_t cnt) = { { OFI_DEFINE_REALNO_HANDLERS(WRITEEXT_CMP, NAME, OFI_OP_MIN) }, @@ -854,7 +855,7 @@ OFI_DEFINE_INT_HANDLERS(READWRITE, FUNC, OFI_OP_BXOR) OFI_DEFINE_ALL_HANDLERS(READ, FUNC, OFI_OP_READ) OFI_DEFINE_ALL_HANDLERS(EXCHANGE, FUNC, OFI_OP_READWRITE) -void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_LAST][FI_DATATYPE_LAST]) +void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][FI_DATATYPE_LAST]) (void *dst, const void *src, void *res, size_t cnt) = { { OFI_DEFINE_REALNO_HANDLERS(READWRITEEXT_CMP, NAME, OFI_OP_MIN) }, @@ -883,7 +884,7 @@ OFI_DEFINE_REALNO_HANDLERS(CSWAPEXT_CMP, FUNC, OFI_OP_CSWAP_GE) OFI_DEFINE_REALNO_HANDLERS(CSWAPEXT_CMP, FUNC, OFI_OP_CSWAP_GT) OFI_DEFINE_INT_HANDLERS(CSWAPEXT, FUNC, OFI_OP_MSWAP) -void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_LAST][FI_DATATYPE_LAST]) +void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][FI_DATATYPE_LAST]) (void *dst, const void *src, const void *cmp, void *res, size_t cnt) = { { OFI_DEFINE_ALL_HANDLERS(CSWAP, NAME, OFI_OP_CSWAP_EQ) }, @@ -918,7 +919,7 @@ OFI_DEFINE_ALL_HANDLERS(WRITE, FUNC, OFI_OP_LXOR) OFI_DEFINE_INT_HANDLERS(WRITE, FUNC, OFI_OP_BXOR) OFI_DEFINE_ALL_HANDLERS(WRITE, FUNC, OFI_OP_WRITE) -void (*ofi_atomic_write_handlers[OFI_WRITE_OP_LAST][FI_DATATYPE_LAST]) +void (*ofi_atomic_write_handlers[OFI_WRITE_OP_CNT][FI_DATATYPE_LAST]) (void *dst, const void *src, size_t cnt) = { { 
OFI_DEFINE_REALNO_HANDLERS(WRITE, NAME, OFI_OP_MIN) }, @@ -952,7 +953,7 @@ OFI_DEFINE_INT_HANDLERS(READWRITE, FUNC, OFI_OP_BXOR) OFI_DEFINE_ALL_HANDLERS(READ, FUNC, OFI_OP_READ) OFI_DEFINE_ALL_HANDLERS(READWRITE, FUNC, OFI_OP_WRITE) -void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_LAST][FI_DATATYPE_LAST]) +void (*ofi_atomic_readwrite_handlers[OFI_READWRITE_OP_CNT][FI_DATATYPE_LAST]) (void *dst, const void *src, void *res, size_t cnt) = { { OFI_DEFINE_REALNO_HANDLERS(READWRITE, NAME, OFI_OP_MIN) }, @@ -981,7 +982,7 @@ OFI_DEFINE_REALNO_HANDLERS(CSWAP, FUNC, OFI_OP_CSWAP_GE) OFI_DEFINE_REALNO_HANDLERS(CSWAP, FUNC, OFI_OP_CSWAP_GT) OFI_DEFINE_INT_HANDLERS(CSWAP, FUNC, OFI_OP_MSWAP) -void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_LAST][FI_DATATYPE_LAST]) +void (*ofi_atomic_swap_handlers[OFI_SWAP_OP_CNT][FI_DATATYPE_LAST]) (void *dst, const void *src, const void *cmp, void *res, size_t cnt) = { { OFI_DEFINE_ALL_HANDLERS(CSWAP, NAME, OFI_OP_CSWAP_EQ) }, @@ -1028,19 +1029,19 @@ int ofi_atomic_valid(const struct fi_provider *prov, } if (flags & FI_FETCH_ATOMIC) { - if (op >= OFI_READWRITE_OP_LAST) { + if (!ofi_atomic_isreadwrite_op(op)) { FI_INFO(prov, FI_LOG_DOMAIN, "Invalid fetch operation\n"); return -FI_EOPNOTSUPP; } have_func = ofi_atomic_readwrite_handlers[op][datatype] != NULL; } else if (flags & FI_COMPARE_ATOMIC) { - if (op < FI_CSWAP || op > FI_MSWAP) { + if (!ofi_atomic_isswap_op(op)) { FI_INFO(prov, FI_LOG_DOMAIN, "Invalid swap operation\n"); return -FI_EOPNOTSUPP; } have_func = ofi_atomic_swap_handlers[op - FI_CSWAP][datatype] != NULL; } else { - if (op >= OFI_WRITE_OP_LAST) { + if (!ofi_atomic_iswrite_op(op)) { FI_INFO(prov, FI_LOG_DOMAIN, "Invalid write operation\n"); return -FI_EOPNOTSUPP; } diff --git a/prov/util/src/util_attr.c b/prov/util/src/util_attr.c index 54d27ad8dc8..d84ddbe1eb2 100644 --- a/prov/util/src/util_attr.c +++ b/prov/util/src/util_attr.c @@ -36,8 +36,9 @@ #include #include -#define OFI_MSG_CAPS (FI_SEND | FI_RECV) -#define OFI_RMA_CAPS (FI_READ | FI_WRITE | FI_REMOTE_READ | FI_REMOTE_WRITE) +#define OFI_MSG_DIRECTION_CAPS (FI_SEND | FI_RECV) +#define OFI_RMA_DIRECTION_CAPS (FI_READ | FI_WRITE | \ + FI_REMOTE_READ | FI_REMOTE_WRITE) static int fi_valid_addr_format(uint32_t prov_format, uint32_t user_format) { @@ -93,13 +94,14 @@ char *ofi_strdup_append(const char *head, const char *tail) int ofi_exclude_prov_name(char **prov_name_list, const char *util_prov_name) { char *exclude, *name, *temp; + int length; - exclude = malloc(strlen(util_prov_name) + 2); + length = strlen(util_prov_name) + 2; + exclude = malloc(length); if (!exclude) return -FI_ENOMEM; - exclude[0] = '^'; - strcpy(&exclude[1], util_prov_name); + snprintf(exclude, length, "^%s", util_prov_name); if (!*prov_name_list) goto out; @@ -150,8 +152,30 @@ static int ofi_dup_addr(const struct fi_info *info, struct fi_info *dup) return 0; } +static int ofi_set_prov_name(const struct fi_provider *prov, + const struct fi_fabric_attr *util_hints, + const struct fi_info *base_attr, + struct fi_fabric_attr *core_hints) +{ + if (util_hints->prov_name) { + core_hints->prov_name = strdup(util_hints->prov_name); + if (!core_hints->prov_name) + return -FI_ENOMEM; + } else if (base_attr && base_attr->fabric_attr && + base_attr->fabric_attr->prov_name) { + core_hints->prov_name = strdup(base_attr->fabric_attr-> + prov_name); + if (!core_hints->prov_name) + return -FI_ENOMEM; + } + + return core_hints->prov_name ? 
+ ofi_exclude_prov_name(&core_hints->prov_name, prov->name) : 0; +} + static int ofi_info_to_core(uint32_t version, const struct fi_provider *prov, - const struct fi_info *util_info, + const struct fi_info *util_hints, + const struct fi_info *base_attr, ofi_alter_info_t info_to_core, struct fi_info **core_hints) { @@ -160,19 +184,19 @@ static int ofi_info_to_core(uint32_t version, const struct fi_provider *prov, if (!(*core_hints = fi_allocinfo())) return -FI_ENOMEM; - if (info_to_core(version, util_info, *core_hints)) + if (info_to_core(version, util_hints, base_attr, *core_hints)) goto err; - if (!util_info) + if (!util_hints) return 0; - if (ofi_dup_addr(util_info, *core_hints)) + if (ofi_dup_addr(util_hints, *core_hints)) goto err; - if (util_info->fabric_attr) { - if (util_info->fabric_attr->name) { + if (util_hints->fabric_attr) { + if (util_hints->fabric_attr->name) { (*core_hints)->fabric_attr->name = - strdup(util_info->fabric_attr->name); + strdup(util_hints->fabric_attr->name); if (!(*core_hints)->fabric_attr->name) { FI_WARN(prov, FI_LOG_FABRIC, "Unable to allocate fabric name\n"); @@ -180,25 +204,15 @@ static int ofi_info_to_core(uint32_t version, const struct fi_provider *prov, } } - if (util_info->fabric_attr->prov_name) { - (*core_hints)->fabric_attr->prov_name = - strdup(util_info->fabric_attr->prov_name); - if (!(*core_hints)->fabric_attr->prov_name) { - FI_WARN(prov, FI_LOG_FABRIC, - "Unable to alloc prov name\n"); - goto err; - } - ret = ofi_exclude_prov_name( - &(*core_hints)->fabric_attr->prov_name, - prov->name); - if (ret) - goto err; - } + ret = ofi_set_prov_name(prov, util_hints->fabric_attr, + base_attr, (*core_hints)->fabric_attr); + if (ret) + goto err; } - if (util_info->domain_attr && util_info->domain_attr->name) { + if (util_hints->domain_attr && util_hints->domain_attr->name) { (*core_hints)->domain_attr->name = - strdup(util_info->domain_attr->name); + strdup(util_hints->domain_attr->name); if (!(*core_hints)->domain_attr->name) { FI_WARN(prov, FI_LOG_FABRIC, "Unable to allocate domain name\n"); @@ -213,14 +227,14 @@ static int ofi_info_to_core(uint32_t version, const struct fi_provider *prov, } static int ofi_info_to_util(uint32_t version, const struct fi_provider *prov, - struct fi_info *core_info, + struct fi_info *core_info, const struct fi_info *base_info, ofi_alter_info_t info_to_util, struct fi_info **util_info) { if (!(*util_info = fi_allocinfo())) return -FI_ENOMEM; - if (info_to_util(version, core_info, *util_info)) + if (info_to_util(version, core_info, base_info, *util_info)) goto err; if (ofi_dup_addr(core_info, *util_info)) @@ -267,18 +281,15 @@ static int ofi_info_to_util(uint32_t version, const struct fi_provider *prov, int ofi_get_core_info(uint32_t version, const char *node, const char *service, uint64_t flags, const struct util_prov *util_prov, - const struct fi_info *util_hints, ofi_alter_info_t info_to_core, - struct fi_info **core_info) + const struct fi_info *util_hints, + const struct fi_info *base_attr, + ofi_alter_info_t info_to_core, struct fi_info **core_info) { struct fi_info *core_hints = NULL; int ret; - ret = ofi_prov_check_info(util_prov, version, util_hints); - if (ret) - return ret; - - ret = ofi_info_to_core(version, util_prov->prov, util_hints, info_to_core, - &core_hints); + ret = ofi_info_to_core(version, util_prov->prov, util_hints, base_attr, + info_to_core, &core_hints); if (ret) return ret; @@ -298,31 +309,42 @@ int ofix_getinfo(uint32_t version, const char *node, const char *service, const struct fi_info *hints, 
ofi_alter_info_t info_to_core, ofi_alter_info_t info_to_util, struct fi_info **info) { - struct fi_info *core_info, *util_info, *cur, *tail; - int ret; - - ret = ofi_get_core_info(version, node, service, flags, util_prov, - hints, info_to_core, &core_info); - if (ret) - return ret; + struct fi_info *core_info, *base_info, *util_info, *cur, *tail; + int ret = -FI_ENODATA; *info = tail = NULL; - for (cur = core_info; cur; cur = cur->next) { - ret = ofi_info_to_util(version, util_prov->prov, cur, - info_to_util, &util_info); + for (base_info = (struct fi_info *) util_prov->info; base_info; + base_info = base_info->next) { + if (ofi_check_info(util_prov, base_info, version, hints)) + continue; + + ret = ofi_get_core_info(version, node, service, flags, + util_prov, hints, base_info, + info_to_core, &core_info); if (ret) { - fi_freeinfo(*info); + if (ret == -FI_ENODATA) + continue; break; } - ofi_alter_info(util_info, hints, version); - if (!*info) - *info = util_info; - else - tail->next = util_info; - tail = util_info; + for (cur = core_info; cur; cur = cur->next) { + ret = ofi_info_to_util(version, util_prov->prov, cur, + base_info, info_to_util, + &util_info); + if (ret) { + fi_freeinfo(*info); + break; + } + + ofi_alter_info(util_info, hints, version); + if (!*info) + *info = util_info; + else + tail->next = util_info; + tail = util_info; + } + fi_freeinfo(core_info); } - fi_freeinfo(core_info); return ret; } @@ -370,7 +392,18 @@ int ofi_check_fabric_attr(const struct fi_provider *prov, const struct fi_fabric_attr *prov_attr, const struct fi_fabric_attr *user_attr) { - /* Provider names are checked by the framework */ + /* Provider names are properly checked by the framework. + * Here we only apply a simple filter. If the util provider has + * supplied a core provider name, verify that it is also in the + * user's hints, if one is specified. 
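+	 * The check is a case-insensitive substring match: for example, a
+	 * base entry naming "verbs" is kept when the hints ask for
+	 * "verbs;ofi_rxm", while hints that do not mention it cause this
+	 * entry to be skipped with -FI_ENODATA.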
+ */ + if (prov_attr->prov_name && user_attr->prov_name && + !strcasestr(user_attr->prov_name, prov_attr->prov_name)) { + FI_INFO(prov, FI_LOG_CORE, + "Requesting provider %s, skipping %s\n", + prov_attr->prov_name, user_attr->prov_name); + return -FI_ENODATA; + } if (user_attr->prov_version > prov_attr->prov_version) { FI_INFO(prov, FI_LOG_CORE, "Unsupported provider version\n"); @@ -447,8 +480,11 @@ static int fi_resource_mgmt_level(enum fi_resource_mgmt rm_model) */ static int ofi_cap_mr_mode(uint64_t info_caps, int mr_mode) { + if (!(info_caps & FI_HMEM)) + mr_mode &= ~FI_MR_HMEM; + if (!ofi_rma_target_allowed(info_caps)) { - if (!(mr_mode & FI_MR_LOCAL)) + if (!(mr_mode & (FI_MR_LOCAL | FI_MR_HMEM))) return 0; mr_mode &= ~OFI_MR_MODE_RMA_TARGET; @@ -520,13 +556,6 @@ int ofi_check_domain_attr(const struct fi_provider *prov, uint32_t api_version, { const struct fi_domain_attr *user_attr = user_info->domain_attr; - if (prov_attr->name && user_attr->name && - strcasecmp(user_attr->name, prov_attr->name)) { - FI_INFO(prov, FI_LOG_CORE, "Unknown domain name\n"); - FI_INFO_NAME(prov, prov_attr, user_attr); - return -FI_ENODATA; - } - if (fi_thread_level(user_attr->threading) < fi_thread_level(prov_attr->threading)) { FI_INFO(prov, FI_LOG_CORE, "Invalid threading model\n"); @@ -619,14 +648,14 @@ int ofi_check_domain_attr(const struct fi_provider *prov, uint32_t api_version, return 0; } -static int ofi_check_ep_type(const struct fi_provider *prov, - const struct fi_ep_attr *prov_attr, - const struct fi_ep_attr *user_attr) +int ofi_check_ep_type(const struct fi_provider *prov, + const struct fi_ep_attr *prov_attr, + const struct fi_ep_attr *user_attr) { if ((user_attr->type != FI_EP_UNSPEC) && (prov_attr->type != FI_EP_UNSPEC) && (user_attr->type != prov_attr->type)) { - FI_INFO(prov, FI_LOG_CORE, "Unsupported endpoint type\n"); + FI_INFO(prov, FI_LOG_CORE, "unsupported endpoint type\n"); FI_INFO_CHECK(prov, prov_attr, user_attr, type, FI_TYPE_EP_TYPE); return -FI_ENODATA; } @@ -735,6 +764,14 @@ int ofi_check_ep_attr(const struct util_prov *util_prov, uint32_t api_version, return -FI_ENODATA; } + if ((user_info->caps & FI_TAGGED) && user_attr->mem_tag_format && + ofi_max_tag(user_attr->mem_tag_format) > + ofi_max_tag(prov_attr->mem_tag_format)) { + FI_INFO(prov, FI_LOG_CORE, "Tag size exceeds supported size\n"); + FI_INFO_CHECK_VAL(prov, prov_attr, user_attr, mem_tag_format); + return -FI_ENODATA; + } + return 0; } @@ -745,7 +782,10 @@ int ofi_check_rx_attr(const struct fi_provider *prov, const struct fi_rx_attr *prov_attr = prov_info->rx_attr; int rm_enabled = (prov_info->domain_attr->resource_mgmt == FI_RM_ENABLED); - if (user_attr->caps & ~(prov_attr->caps)) { + if (user_attr->caps & ~OFI_IGNORED_RX_CAPS) + FI_INFO(prov, FI_LOG_CORE, "Tx only caps ignored in Rx caps\n"); + + if ((user_attr->caps & ~OFI_IGNORED_RX_CAPS) & ~(prov_attr->caps)) { FI_INFO(prov, FI_LOG_CORE, "caps not supported\n"); FI_INFO_CHECK(prov, prov_attr, user_attr, caps, FI_TYPE_CAPS); return -FI_ENODATA; @@ -810,34 +850,26 @@ int ofi_check_rx_attr(const struct fi_provider *prov, return 0; } -static uint64_t ofi_expand_caps(uint64_t base_caps) -{ - uint64_t expanded_caps = base_caps; - uint64_t msg_caps = FI_SEND | FI_RECV; - uint64_t rma_caps = FI_WRITE | FI_READ | FI_REMOTE_WRITE | FI_REMOTE_READ; - - if (base_caps & (FI_MSG | FI_TAGGED)) - if (!(base_caps & msg_caps)) - expanded_caps |= msg_caps; - - if (base_caps & (FI_RMA | FI_ATOMIC)) - if (!(base_caps & rma_caps)) - expanded_caps |= rma_caps; - - return 
expanded_caps; -} - int ofi_check_attr_subset(const struct fi_provider *prov, uint64_t base_caps, uint64_t requested_caps) { - uint64_t expanded_base_caps; + uint64_t expanded_caps; - expanded_base_caps = ofi_expand_caps(base_caps); + expanded_caps = base_caps; + if (base_caps & (FI_MSG | FI_TAGGED)) { + if (!(base_caps & OFI_MSG_DIRECTION_CAPS)) + expanded_caps |= OFI_MSG_DIRECTION_CAPS; + } + if (base_caps & (FI_RMA | FI_ATOMIC)) { + if (!(base_caps & OFI_RMA_DIRECTION_CAPS)) + expanded_caps |= OFI_RMA_DIRECTION_CAPS; + } - if (~expanded_base_caps & requested_caps) { - FI_INFO(prov, FI_LOG_CORE, "requested caps not subset of base endpoint caps\n"); - FI_INFO_FIELD(prov, expanded_base_caps, requested_caps, "Supported", - "Requested", FI_TYPE_CAPS); + if (~expanded_caps & requested_caps) { + FI_INFO(prov, FI_LOG_CORE, + "requested caps not subset of base endpoint caps\n"); + FI_INFO_FIELD(prov, expanded_caps, requested_caps, + "Supported", "Requested", FI_TYPE_CAPS); return -FI_ENODATA; } @@ -848,7 +880,10 @@ int ofi_check_tx_attr(const struct fi_provider *prov, const struct fi_tx_attr *prov_attr, const struct fi_tx_attr *user_attr, uint64_t info_mode) { - if (user_attr->caps & ~(prov_attr->caps)) { + if (user_attr->caps & ~OFI_IGNORED_TX_CAPS) + FI_INFO(prov, FI_LOG_CORE, "Rx only caps ignored in Tx caps\n"); + + if ((user_attr->caps & ~OFI_IGNORED_TX_CAPS) & ~(prov_attr->caps)) { FI_INFO(prov, FI_LOG_CORE, "caps not supported\n"); FI_INFO_CHECK(prov, prov_attr, user_attr, caps, FI_TYPE_CAPS); return -FI_ENODATA; @@ -909,7 +944,7 @@ int ofi_check_tx_attr(const struct fi_provider *prov, return 0; } -/* if there are multiple fi_info in the provider: +/* Use if there are multiple fi_info in the provider: * check provider's info */ int ofi_prov_check_info(const struct util_prov *util_prov, uint32_t api_version, @@ -932,7 +967,7 @@ int ofi_prov_check_info(const struct util_prov *util_prov, return (!success_info ? -FI_ENODATA : FI_SUCCESS); } -/* if there are multiple fi_info in the provider: +/* Use if there are multiple fi_info in the provider: * check and duplicate provider's info */ int ofi_prov_check_dup_info(const struct util_prov *util_prov, uint32_t api_version, @@ -965,7 +1000,7 @@ int ofi_prov_check_dup_info(const struct util_prov *util_prov, tail = fi; } - return (!*info ? -FI_ENODATA : FI_SUCCESS); + return !*info ? -FI_ENODATA : FI_SUCCESS; err: fi_freeinfo(*info); FI_INFO(prov, FI_LOG_CORE, @@ -973,7 +1008,7 @@ int ofi_prov_check_dup_info(const struct util_prov *util_prov, return ret; } -/* if there is only single fi_info in the provider */ +/* Use if there is only single fi_info in the provider */ int ofi_check_info(const struct util_prov *util_prov, const struct fi_info *prov_info, uint32_t api_version, const struct fi_info *user_info) @@ -1067,10 +1102,10 @@ static uint64_t ofi_get_caps(uint64_t info_caps, uint64_t hint_caps, (attr_caps & FI_SECONDARY_CAPS); } - if (caps & (FI_MSG | FI_TAGGED) && !(caps & OFI_MSG_CAPS)) - caps |= OFI_MSG_CAPS; - if (caps & (FI_RMA | FI_ATOMICS) && !(caps & OFI_RMA_CAPS)) - caps |= OFI_RMA_CAPS; + if (caps & (FI_MSG | FI_TAGGED) && !(caps & OFI_MSG_DIRECTION_CAPS)) + caps |= (attr_caps & OFI_MSG_DIRECTION_CAPS); + if (caps & (FI_RMA | FI_ATOMICS) && !(caps & OFI_RMA_DIRECTION_CAPS)) + caps |= (attr_caps & OFI_RMA_DIRECTION_CAPS); return caps; } @@ -1088,7 +1123,10 @@ static void fi_alter_domain_attr(struct fi_domain_attr *attr, attr->mr_mode = (attr->mr_mode && attr->mr_mode != FI_MR_SCALABLE) ? 
FI_MR_BASIC : FI_MR_SCALABLE; } else { - if ((hints_mr_mode & attr->mr_mode) != attr->mr_mode) { + attr->mr_mode &= ~(FI_MR_BASIC | FI_MR_SCALABLE); + + if (hints && + ((hints_mr_mode & attr->mr_mode) != attr->mr_mode)) { attr->mr_mode = ofi_cap_mr_mode(info_caps, attr->mr_mode & hints_mr_mode); } @@ -1171,7 +1209,8 @@ static uint64_t ofi_get_info_caps(const struct fi_info *prov_info, int prov_mode, user_mode; uint64_t caps; - assert(user_info); + if (!user_info) + return prov_info->caps; caps = ofi_get_caps(prov_info->caps, user_info->caps, prov_info->caps); @@ -1189,7 +1228,7 @@ static uint64_t ofi_get_info_caps(const struct fi_info *prov_info, if ((FI_VERSION_LT(api_version, FI_VERSION(1,5)) && (user_mode == FI_MR_UNSPEC)) || (user_mode == FI_MR_BASIC) || - ((user_mode & prov_mode & OFI_MR_MODE_RMA_TARGET) == + ((user_mode & prov_mode & OFI_MR_MODE_RMA_TARGET) == (prov_mode & OFI_MR_MODE_RMA_TARGET))) return caps; @@ -1205,9 +1244,6 @@ static uint64_t ofi_get_info_caps(const struct fi_info *prov_info, void ofi_alter_info(struct fi_info *info, const struct fi_info *hints, uint32_t api_version) { - if (!hints) - return; - for (; info; info = info->next) { /* This should stay before call to fi_alter_domain_attr as * the checks depend on unmodified provider mr_mode attr */ @@ -1219,12 +1255,17 @@ void ofi_alter_info(struct fi_info *info, const struct fi_info *hints, (hints->domain_attr->mr_mode & (FI_MR_BASIC | FI_MR_SCALABLE))))) info->mode |= FI_LOCAL_MR; - info->handle = hints->handle; + if (hints) + info->handle = hints->handle; - fi_alter_domain_attr(info->domain_attr, hints->domain_attr, + fi_alter_domain_attr(info->domain_attr, + hints ? hints->domain_attr : NULL, info->caps, api_version); - fi_alter_ep_attr(info->ep_attr, hints->ep_attr, info->caps); - fi_alter_rx_attr(info->rx_attr, hints->rx_attr, info->caps); - fi_alter_tx_attr(info->tx_attr, hints->tx_attr, info->caps); + fi_alter_ep_attr(info->ep_attr, hints ? hints->ep_attr : NULL, + info->caps); + fi_alter_rx_attr(info->rx_attr, hints ? hints->rx_attr : NULL, + info->caps); + fi_alter_tx_attr(info->tx_attr, hints ? hints->tx_attr : NULL, + info->caps); } } diff --git a/prov/util/src/util_av.c b/prov/util/src/util_av.c index 6d2e7589ecb..ea929ddf595 100644 --- a/prov/util/src/util_av.c +++ b/prov/util/src/util_av.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2017 Intel Corporation. All rights reserved. + * Copyright (c) 2015-2020 Intel Corporation. All rights reserved. * Copyright (c) 2017, Cisco Systems, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -53,7 +53,6 @@ enum { UTIL_NO_ENTRY = -1, - UTIL_DEFAULT_AV_SIZE = 1024, }; static int fi_get_src_sockaddr(const struct sockaddr *dest_addr, size_t dest_addrlen, @@ -245,19 +244,31 @@ void *ofi_av_get_addr(struct util_av *av, fi_addr_t fi_addr) struct util_av_entry *entry; entry = ofi_bufpool_get_ibuf(av->av_entry_pool, fi_addr); - return entry->addr; + return entry->data; } -int ofi_verify_av_insert(struct util_av *av, uint64_t flags) +int ofi_verify_av_insert(struct util_av *av, uint64_t flags, void *context) { - if ((av->flags & FI_EVENT) && !av->eq) { - FI_WARN(av->prov, FI_LOG_AV, "no EQ bound to AV\n"); - return -FI_ENOEQ; + if (av->flags & FI_EVENT) { + if (!av->eq) { + FI_WARN(av->prov, FI_LOG_AV, "no EQ bound to AV\n"); + return -FI_ENOEQ; + } + + if (flags & FI_SYNC_ERR) { + FI_WARN(av->prov, FI_LOG_AV, "invalid flag\n"); + return -FI_EBADFLAGS; + } } - if (flags & ~(FI_MORE)) { + if (flags & ~(FI_MORE | FI_SYNC_ERR)) { FI_WARN(av->prov, FI_LOG_AV, "unsupported flags\n"); - return -FI_ENOEQ; + return -FI_EBADFLAGS; + } + + if ((flags & FI_SYNC_ERR) && !context) { + FI_WARN(av->prov, FI_LOG_AV, "null context with FI_SYNC_ERR"); + return -FI_EINVAL; } return 0; @@ -278,13 +289,17 @@ int ofi_av_insert_addr(struct util_av *av, const void *addr, fi_addr_t *fi_addr) return 0; } else { entry = ofi_ibuf_alloc(av->av_entry_pool); - if (!entry) + if (!entry) { + if (fi_addr) + *fi_addr = FI_ADDR_NOTAVAIL; return -FI_ENOMEM; + } + if (fi_addr) *fi_addr = ofi_buf_index(entry); - memcpy(entry->addr, addr, av->addrlen); + memcpy(entry->data, addr, av->addrlen); ofi_atomic_initialize32(&entry->use_cnt, 1); - HASH_ADD(hh, av->hash, addr, av->addrlen, entry); + HASH_ADD(hh, av->hash, data, av->addrlen, entry); } return 0; } @@ -295,7 +310,7 @@ int ofi_av_elements_iter(struct util_av *av, ofi_av_apply_func apply, void *arg) int ret; HASH_ITER(hh, av->hash, av_entry, av_entry_tmp) { - ret = apply(av, av_entry->addr, + ret = apply(av, av_entry->data, ofi_buf_index(av_entry), arg); if (OFI_UNLIKELY(ret)) return ret; @@ -408,6 +423,13 @@ int ofi_av_close(struct util_av *av) return 0; } +size_t ofi_av_size(struct util_av *av) +{ + return av->av_entry_pool->entry_cnt ? 
+ av->av_entry_pool->entry_cnt : + av->av_entry_pool->attr.chunk_cnt; +} + static int util_verify_av_util_attr(struct util_domain *domain, const struct util_av_attr *util_attr) { @@ -423,16 +445,22 @@ static int util_av_init(struct util_av *av, const struct fi_av_attr *attr, const struct util_av_attr *util_attr) { int ret = 0; - size_t max_count; + size_t orig_size; + size_t offset; + + /* offset calculated on a 8-byte boundary */ + offset = util_attr->addrlen % 8; + if (offset != 0) + offset = 8 - offset; struct ofi_bufpool_attr pool_attr = { - .size = util_attr->addrlen + + .size = util_attr->addrlen + offset + + util_attr->context_len + sizeof(struct util_av_entry), .alignment = 16, .max_cnt = 0, /* Don't use track of buffer, because user can close * the AV without prior deletion of addresses */ - .flags = OFI_BUFPOOL_NO_TRACK | OFI_BUFPOOL_INDEXED | - OFI_BUFPOOL_HUGEPAGES, + .flags = OFI_BUFPOOL_NO_TRACK | OFI_BUFPOOL_INDEXED, }; /* TODO: Handle FI_READ */ @@ -442,23 +470,16 @@ static int util_av_init(struct util_av *av, const struct fi_av_attr *attr, if (ret) return ret; - if (attr->count) { - max_count = attr->count; - } else { - if (fi_param_get_size_t(NULL, "universe_size", &max_count)) - max_count = UTIL_DEFAULT_AV_SIZE; - } - - av->count = roundup_power_of_two(max_count ? - max_count : - UTIL_DEFAULT_AV_SIZE); - FI_INFO(av->prov, FI_LOG_AV, "AV size %zu\n", av->count); + orig_size = attr->count ? attr->count : ofi_universe_size; + orig_size = roundup_power_of_two(orig_size); + FI_INFO(av->prov, FI_LOG_AV, "AV size %zu\n", orig_size); av->addrlen = util_attr->addrlen; + av->context_offset = offset + av->addrlen; av->flags = util_attr->flags | attr->flags; av->hash = NULL; - pool_attr.chunk_cnt = av->count; + pool_attr.chunk_cnt = orig_size; return ofi_bufpool_create_attr(&pool_attr, &av->av_entry_pool); } @@ -572,40 +593,22 @@ fi_addr_t ofi_ip_av_get_fi_addr(struct util_av *av, const void *addr) return ofi_av_lookup_fi_addr(av, addr); } -static int ip_av_valid_addr(struct util_av *av, const void *addr) -{ - const struct sockaddr_in *sin = addr; - const struct sockaddr_in6 *sin6 = addr; - - switch (sin->sin_family) { - case AF_INET: - return sin->sin_port && sin->sin_addr.s_addr; - case AF_INET6: - return sin6->sin6_port && - memcmp(&in6addr_any, &sin6->sin6_addr, sizeof(in6addr_any)); - default: - return 0; - } -} - static int ip_av_insert_addr(struct util_av *av, const void *addr, fi_addr_t *fi_addr, void *context) { int ret; - fi_addr_t fi_addr_ret; - if (ip_av_valid_addr(av, addr)) { + if (ofi_valid_dest_ipaddr(addr)) { fastlock_acquire(&av->lock); - ret = ofi_av_insert_addr(av, addr, &fi_addr_ret); + ret = ofi_av_insert_addr(av, addr, fi_addr); fastlock_release(&av->lock); } else { ret = -FI_EADDRNOTAVAIL; + if (fi_addr) + *fi_addr = FI_ADDR_NOTAVAIL; FI_WARN(av->prov, FI_LOG_AV, "invalid address\n"); } - if (fi_addr) - *fi_addr = !ret ? 
fi_addr_ret : FI_ADDR_NOTAVAIL; - ofi_straddr_dbg(av->prov, FI_LOG_AV, "av_insert addr", addr); if (fi_addr) FI_DBG(av->prov, FI_LOG_AV, "av_insert fi_addr: %" PRIu64 "\n", @@ -615,12 +618,19 @@ static int ip_av_insert_addr(struct util_av *av, const void *addr, } int ofi_ip_av_insertv(struct util_av *av, const void *addr, size_t addrlen, - size_t count, fi_addr_t *fi_addr, void *context) + size_t count, fi_addr_t *fi_addr, uint64_t flags, + void *context) { int ret, success_cnt = 0; + int *sync_err = NULL; size_t i; FI_DBG(av->prov, FI_LOG_AV, "inserting %zu addresses\n", count); + if (flags & FI_SYNC_ERR) { + sync_err = context; + memset(sync_err, 0, sizeof(*sync_err) * count); + } + for (i = 0; i < count; i++) { ret = ip_av_insert_addr(av, (const char *) addr + i * addrlen, fi_addr ? &fi_addr[i] : NULL, context); @@ -628,6 +638,8 @@ int ofi_ip_av_insertv(struct util_av *av, const void *addr, size_t addrlen, success_cnt++; else if (av->eq) ofi_av_write_event(av, i, -ret, context); + else if (sync_err) + sync_err[i] = -ret; } FI_DBG(av->prov, FI_LOG_AV, "%d addresses successful\n", success_cnt); @@ -647,12 +659,12 @@ int ofi_ip_av_insert(struct fid_av *av_fid, const void *addr, size_t count, int ret; av = container_of(av_fid, struct util_av, av_fid); - ret = ofi_verify_av_insert(av, flags); + ret = ofi_verify_av_insert(av, flags, context); if (ret) return ret; return ofi_ip_av_insertv(av, addr, ofi_sizeofaddr(addr), - count, fi_addr, context); + count, fi_addr, flags, context); } static int ip_av_insertsvc(struct fid_av *av, const char *node, @@ -843,7 +855,7 @@ static int ip_av_insertsym(struct fid_av *av_fid, const char *node, int ret, count; av = container_of(av_fid, struct util_av, av_fid); - ret = ofi_verify_av_insert(av, flags); + ret = ofi_verify_av_insert(av, flags, context); if (ret) return ret; @@ -853,7 +865,7 @@ static int ip_av_insertsym(struct fid_av *av_fid, const char *node, return count; ret = ofi_ip_av_insertv(av, addr, addrlen, count, - fi_addr, context); + fi_addr, flags, context); free(addr); return ret; } @@ -896,7 +908,7 @@ int ofi_ip_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr, container_of(av_fid, struct util_av, av_fid); size_t av_addrlen; void *av_addr = ofi_av_lookup_addr(av, fi_addr, &av_addrlen); - + memcpy(addr, av_addr, MIN(*addrlen, av_addrlen)); *addrlen = av->addrlen; @@ -955,6 +967,7 @@ int ofi_ip_av_create_flags(struct fid_domain *domain_fid, struct fi_av_attr *att util_attr.addrlen = sizeof(struct sockaddr_in6); util_attr.flags = flags; + util_attr.context_len = 0; if (attr->type == FI_AV_UNSPEC) attr->type = FI_AV_MAP; diff --git a/prov/util/src/util_buf.c b/prov/util/src/util_buf.c index 2be9be877f5..9e7a4fb9390 100644 --- a/prov/util/src/util_buf.c +++ b/prov/util/src/util_buf.c @@ -67,19 +67,18 @@ int ofi_bufpool_grow(struct ofi_bufpool *pool) ret = ofi_alloc_hugepage_buf((void **) &buf_region->alloc_region, pool->alloc_size); /* If we can't allocate huge pages, fall back to normal - * allocations if this is the first allocation attempt. + * allocations for all future attempts. 
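+	 * Clearing OFI_BUFPOOL_HUGEPAGES from pool->attr.flags means every
+	 * later call to ofi_bufpool_grow() goes straight to ofi_memalign().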
*/ - if (ret && !pool->entry_cnt) { + if (ret) { pool->attr.flags &= ~OFI_BUFPOOL_HUGEPAGES; - pool->alloc_size = (pool->attr.chunk_cnt + 1) * - pool->entry_size; - pool->region_size = pool->alloc_size - pool->entry_size; goto retry; } + buf_region->flags = OFI_BUFPOOL_HUGEPAGES; } else { retry: ret = ofi_memalign((void **) &buf_region->alloc_region, - pool->attr.alignment, pool->alloc_size); + roundup_power_of_two(pool->attr.alignment), + pool->alloc_size); } if (ret) { FI_DBG(&core_prov, FI_LOG_CORE, "Allocation failed: %s\n", @@ -115,26 +114,18 @@ int ofi_bufpool_grow(struct ofi_bufpool *pool) buf_hdr = ofi_buf_hdr(buf); buf_hdr->region = buf_region; buf_hdr->index = pool->entry_cnt + i; - if (pool->attr.init_fn) { -#if ENABLE_DEBUG - if (pool->attr.flags & OFI_BUFPOOL_INDEXED) { - buf_hdr->entry.dlist.next = (void *) OFI_MAGIC_64; - buf_hdr->entry.dlist.prev = (void *) OFI_MAGIC_64; - - pool->attr.init_fn(buf_region, buf); - - assert((buf_hdr->entry.dlist.next == (void *) OFI_MAGIC_64) && - (buf_hdr->entry.dlist.prev == (void *) OFI_MAGIC_64)); - } else { - buf_hdr->entry.slist.next = (void *) OFI_MAGIC_64; + OFI_DBG_SET(buf_hdr->magic, OFI_MAGIC_SIZE_T); + OFI_DBG_SET(buf_hdr->ftr, + (struct ofi_bufpool_ftr *) ((char *) buf + + pool->entry_size - sizeof(struct ofi_bufpool_ftr))); + OFI_DBG_SET(buf_hdr->ftr->magic, OFI_MAGIC_SIZE_T); - pool->attr.init_fn(buf_region, buf); - - assert(buf_hdr->entry.slist.next == (void *) OFI_MAGIC_64); - } -#else + if (pool->attr.init_fn) { + OFI_DBG_SET(buf_hdr->entry.dlist.next, OFI_MAGIC_PTR); + OFI_DBG_SET(buf_hdr->entry.dlist.prev, OFI_MAGIC_PTR); pool->attr.init_fn(buf_region, buf); -#endif + assert((buf_hdr->entry.dlist.next == OFI_MAGIC_PTR) && + (buf_hdr->entry.dlist.prev == OFI_MAGIC_PTR)); } if (pool->attr.flags & OFI_BUFPOOL_INDEXED) { dlist_insert_tail(&buf_hdr->entry.dlist, @@ -155,7 +146,7 @@ int ofi_bufpool_grow(struct ofi_bufpool *pool) if (pool->attr.free_fn) pool->attr.free_fn(buf_region); err2: - if (pool->attr.flags & OFI_BUFPOOL_HUGEPAGES) + if (buf_region->flags & OFI_BUFPOOL_HUGEPAGES) ofi_free_hugepage_buf(buf_region->alloc_region, pool->alloc_size); else ofi_freealign(buf_region->alloc_region); @@ -178,6 +169,7 @@ int ofi_bufpool_create_attr(struct ofi_bufpool_attr *attr, pool->attr = *attr; entry_sz = (attr->size + sizeof(struct ofi_bufpool_hdr)); + OFI_DBG_ADD(entry_sz, sizeof(struct ofi_bufpool_ftr)); pool->entry_size = ofi_get_aligned_size(entry_sz, attr->alignment); if (!attr->chunk_cnt) { @@ -220,7 +212,7 @@ void ofi_bufpool_destroy(struct ofi_bufpool *pool) if (pool->attr.free_fn) pool->attr.free_fn(buf_region); - if (pool->attr.flags & OFI_BUFPOOL_HUGEPAGES) { + if (buf_region->flags & OFI_BUFPOOL_HUGEPAGES) { ret = ofi_free_hugepage_buf(buf_region->alloc_region, pool->alloc_size); if (ret) { diff --git a/prov/util/src/util_cntr.c b/prov/util/src/util_cntr.c index 4880ef07061..469451f4cf2 100644 --- a/prov/util/src/util_cntr.c +++ b/prov/util/src/util_cntr.c @@ -49,6 +49,7 @@ static int ofi_check_cntr_attr(const struct fi_provider *prov, switch (attr->wait_obj) { case FI_WAIT_NONE: + case FI_WAIT_YIELD: break; case FI_WAIT_SET: if (!attr->wait_set) { @@ -58,6 +59,7 @@ static int ofi_check_cntr_attr(const struct fi_provider *prov, /* fall through */ case FI_WAIT_UNSPEC: case FI_WAIT_FD: + case FI_WAIT_POLLFD: break; default: FI_WARN(prov, FI_LOG_CNTR, "unsupported wait object\n"); @@ -193,6 +195,17 @@ static struct fi_ops_cntr util_cntr_ops = { .wait = ofi_cntr_wait }; +static struct fi_ops_cntr util_cntr_no_wait_ops = { 
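+	/* Counter ops installed by ofi_cntr_init() for FI_WAIT_NONE:
+	 * there is no wait object, so .wait is stubbed out with
+	 * fi_no_cntr_wait. */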
+ .size = sizeof(struct fi_ops_cntr), + .read = ofi_cntr_read, + .readerr = ofi_cntr_readerr, + .add = ofi_cntr_add, + .adderr = ofi_cntr_adderr, + .set = ofi_cntr_set, + .seterr = ofi_cntr_seterr, + .wait = fi_no_cntr_wait, +}; + int ofi_cntr_cleanup(struct util_cntr *cntr) { if (ofi_atomic_get32(&cntr->ref)) @@ -223,54 +236,6 @@ static int util_cntr_close(struct fid *fid) return 0; } -static int fi_cntr_init(struct fid_domain *domain, struct fi_cntr_attr *attr, - struct util_cntr *cntr, void *context) -{ - struct fi_wait_attr wait_attr; - struct fid_wait *wait; - int ret; - - cntr->domain = container_of(domain, struct util_domain, domain_fid); - ofi_atomic_initialize32(&cntr->ref, 0); - ofi_atomic_initialize64(&cntr->cnt, 0); - ofi_atomic_initialize64(&cntr->err, 0); - dlist_init(&cntr->ep_list); - fastlock_init(&cntr->ep_list_lock); - - cntr->cntr_fid.fid.fclass = FI_CLASS_CNTR; - cntr->cntr_fid.fid.context = context; - - switch (attr->wait_obj) { - case FI_WAIT_NONE: - wait = NULL; - cntr->cntr_fid.ops->wait = fi_no_cntr_wait; - break; - case FI_WAIT_UNSPEC: - case FI_WAIT_FD: - case FI_WAIT_MUTEX_COND: - memset(&wait_attr, 0, sizeof wait_attr); - wait_attr.wait_obj = attr->wait_obj; - cntr->internal_wait = 1; - ret = fi_wait_open(&cntr->domain->fabric->fabric_fid, - &wait_attr, &wait); - if (ret) - return ret; - break; - case FI_WAIT_SET: - wait = attr->wait_set; - break; - default: - assert(0); - return -FI_EINVAL; - } - - if (wait) - cntr->wait = container_of(wait, struct util_wait, wait_fid); - - ofi_atomic_inc32(&cntr->domain->ref); - return 0; -} - void ofi_cntr_progress(struct util_cntr *cntr) { struct util_ep *ep; @@ -299,22 +264,58 @@ int ofi_cntr_init(const struct fi_provider *prov, struct fid_domain *domain, ofi_cntr_progress_func progress, void *context) { int ret; + struct fi_wait_attr wait_attr; + struct fid_wait *wait; assert(progress); ret = ofi_check_cntr_attr(prov, attr); if (ret) return ret; + cntr->progress = progress; + cntr->domain = container_of(domain, struct util_domain, domain_fid); + ofi_atomic_initialize32(&cntr->ref, 0); + ofi_atomic_initialize64(&cntr->cnt, 0); + ofi_atomic_initialize64(&cntr->err, 0); + dlist_init(&cntr->ep_list); + + cntr->cntr_fid.fid.fclass = FI_CLASS_CNTR; + cntr->cntr_fid.fid.context = context; cntr->cntr_fid.fid.ops = &util_cntr_fi_ops; cntr->cntr_fid.ops = &util_cntr_ops; - cntr->progress = progress; - ret = fi_cntr_init(domain, attr, cntr, context); - if (ret) - return ret; + switch (attr->wait_obj) { + case FI_WAIT_NONE: + wait = NULL; + cntr->cntr_fid.ops = &util_cntr_no_wait_ops; + break; + case FI_WAIT_UNSPEC: + case FI_WAIT_FD: + case FI_WAIT_POLLFD: + case FI_WAIT_MUTEX_COND: + case FI_WAIT_YIELD: + memset(&wait_attr, 0, sizeof wait_attr); + wait_attr.wait_obj = attr->wait_obj; + cntr->internal_wait = 1; + ret = fi_wait_open(&cntr->domain->fabric->fabric_fid, + &wait_attr, &wait); + if (ret) + return ret; + break; + case FI_WAIT_SET: + wait = attr->wait_set; + break; + default: + assert(0); + return -FI_EINVAL; + } + + fastlock_init(&cntr->ep_list_lock); + ofi_atomic_inc32(&cntr->domain->ref); /* CNTR must be fully operational before adding to wait set */ - if (cntr->wait) { + if (wait) { + cntr->wait = container_of(wait, struct util_wait, wait_fid); ret = fi_poll_add(&cntr->wait->pollset->poll_fid, &cntr->cntr_fid.fid, 0); if (ret) { diff --git a/prov/util/src/util_coll.c b/prov/util/src/util_coll.c new file mode 100644 index 00000000000..621d2aedd16 --- /dev/null +++ b/prov/util/src/util_coll.c @@ -0,0 +1,1472 @@ +/* + * 
Copyright (c) 2019 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if HAVE_GETIFADDRS +#include +#include +#endif + +#include + +#include +#include +#include +#include +#include +#include + +int ofi_av_set_union(struct fid_av_set *dst, const struct fid_av_set *src) +{ + struct util_av_set *src_av_set; + struct util_av_set *dst_av_set; + size_t temp_count; + int i,j; + + src_av_set = container_of(src, struct util_av_set, av_set_fid); + dst_av_set = container_of(dst, struct util_av_set, av_set_fid); + + assert(src_av_set->av == dst_av_set->av); + temp_count = dst_av_set->fi_addr_count; + + for (i = 0; i < src_av_set->fi_addr_count; i++) { + for (j = 0; j < dst_av_set->fi_addr_count; j++) { + if (dst_av_set->fi_addr_array[j] == + src_av_set->fi_addr_array[i]) + break; + } + if (j == dst_av_set->fi_addr_count) { + dst_av_set->fi_addr_array[temp_count++] = + src_av_set->fi_addr_array[i]; + } + } + + dst_av_set->fi_addr_count = temp_count; + return FI_SUCCESS; +} + +int ofi_av_set_intersect(struct fid_av_set *dst, const struct fid_av_set *src) +{ + struct util_av_set *src_av_set; + struct util_av_set *dst_av_set; + int i,j, temp; + + src_av_set = container_of(src, struct util_av_set, av_set_fid); + dst_av_set = container_of(dst, struct util_av_set, av_set_fid); + + assert(src_av_set->av == dst_av_set->av); + + temp = 0; + for (i = 0; i < src_av_set->fi_addr_count; i++) { + for (j = temp; j < dst_av_set->fi_addr_count; j++) { + if (dst_av_set->fi_addr_array[j] == + src_av_set->fi_addr_array[i]) { + dst_av_set->fi_addr_array[temp++] = + dst_av_set->fi_addr_array[j]; + break; + } + } + } + dst_av_set->fi_addr_count = temp; + return FI_SUCCESS; +} + +int ofi_av_set_diff(struct fid_av_set *dst, const struct fid_av_set *src) +{ + + struct util_av_set *src_av_set; + struct util_av_set *dst_av_set; + int i,j, temp; + + src_av_set = container_of(src, struct util_av_set, av_set_fid); + dst_av_set = container_of(dst, struct util_av_set, av_set_fid); + + assert(src_av_set->av == dst_av_set->av); + + temp = dst_av_set->fi_addr_count; + for (i = 0; i < src_av_set->fi_addr_count; i++) { + for (j = 0; j 
< temp; j++) { + if (dst_av_set->fi_addr_array[j] == + src_av_set->fi_addr_array[i]) { + dst_av_set->fi_addr_array[--temp] = + dst_av_set->fi_addr_array[j]; + break; + } + } + } + dst_av_set->fi_addr_count = temp; + return FI_SUCCESS; +} + +int ofi_av_set_insert(struct fid_av_set *set, fi_addr_t addr) +{ + struct util_av_set *av_set; + int i; + + av_set = container_of(set, struct util_av_set, av_set_fid); + + for (i = 0; i < av_set->fi_addr_count; i++) { + if (av_set->fi_addr_array[i] == addr) + return -FI_EINVAL; + } + av_set->fi_addr_array[av_set->fi_addr_count++] = addr; + return FI_SUCCESS; +} + +int ofi_av_set_remove(struct fid_av_set *set, fi_addr_t addr) + +{ + struct util_av_set *av_set; + int i; + + av_set = container_of(set, struct util_av_set, av_set_fid); + + for (i = 0; i < av_set->fi_addr_count; i++) { + if (av_set->fi_addr_array[i] == addr) { + av_set->fi_addr_array[i] = + av_set->fi_addr_array[--av_set->fi_addr_count]; + return FI_SUCCESS; + } + } + return -FI_EINVAL; +} + +static inline uint64_t util_coll_form_tag(uint32_t coll_id, uint32_t rank) +{ + uint64_t tag; + uint64_t src_rank = rank; + + tag = coll_id; + tag |= (src_rank << 32); + + return OFI_COLL_TAG_FLAG | tag; +} + +static inline uint32_t util_coll_get_next_id(struct util_coll_mc *coll_mc) +{ + uint32_t cid = coll_mc->group_id; + return cid << 16 | coll_mc->seq++; +} + +static inline int util_coll_op_create(struct util_coll_operation **coll_op, + struct util_coll_mc *coll_mc, + enum util_coll_op_type type, void *context, + util_coll_comp_fn_t comp_fn) +{ + *coll_op = calloc(1, sizeof(**coll_op)); + if (!(*coll_op)) + return -FI_ENOMEM; + + (*coll_op)->cid = util_coll_get_next_id(coll_mc); + (*coll_op)->mc = coll_mc; + (*coll_op)->type = type; + (*coll_op)->context = context; + (*coll_op)->comp_fn = comp_fn; + dlist_init(&(*coll_op)->work_queue); + + return FI_SUCCESS; +} + +static inline void util_coll_op_log_work(struct util_coll_operation *coll_op) +{ +#if ENABLE_DEBUG + struct util_coll_work_item *cur_item = NULL; + struct util_coll_xfer_item *xfer_item; + struct dlist_entry *tmp = NULL; + size_t count = 0; + FI_DBG(coll_op->mc->av_set->av->prov, FI_LOG_CQ, "Remaining Work for %s:\n", + log_util_coll_op_type[coll_op->type]); + dlist_foreach_container_safe(&coll_op->work_queue, struct util_coll_work_item, + cur_item, waiting_entry, tmp) + { + switch (cur_item->type) { + case UTIL_COLL_SEND: + xfer_item = + container_of(cur_item, struct util_coll_xfer_item, hdr); + FI_DBG(coll_op->mc->av_set->av->prov, FI_LOG_CQ, + "\t%ld: { %p [%s] SEND TO: 0x%02x FROM: 0x%02lx " + "cnt: %d typesize: %ld tag: 0x%02lx }\n", + count, cur_item, log_util_coll_state[cur_item->state], + xfer_item->remote_rank, coll_op->mc->local_rank, + xfer_item->count, ofi_datatype_size(xfer_item->datatype), + xfer_item->tag); + break; + case UTIL_COLL_RECV: + xfer_item = + container_of(cur_item, struct util_coll_xfer_item, hdr); + FI_DBG(coll_op->mc->av_set->av->prov, FI_LOG_CQ, + "\t%ld: { %p [%s] RECV FROM: 0x%02x TO: 0x%02lx " + "cnt: %d typesize: %ld tag: 0x%02lx }\n", + count, cur_item, log_util_coll_state[cur_item->state], + xfer_item->remote_rank, coll_op->mc->local_rank, + xfer_item->count, ofi_datatype_size(xfer_item->datatype), + xfer_item->tag); + break; + case UTIL_COLL_REDUCE: + //reduce_item = container_of(cur_item, struct util_coll_reduce_item, hdr); + FI_DBG(coll_op->mc->av_set->av->prov, FI_LOG_CQ, + "\t%ld: { %p [%s] REDUCTION }\n", count, cur_item, + log_util_coll_state[cur_item->state]); + break; + case UTIL_COLL_COPY: + 
FI_DBG(coll_op->mc->av_set->av->prov, FI_LOG_CQ, + "\t%ld: { %p [%s] COPY }\n", count, cur_item, + log_util_coll_state[cur_item->state]); + break; + case UTIL_COLL_COMP: + FI_DBG(coll_op->mc->av_set->av->prov, FI_LOG_CQ, + "\t%ld: { %p [%s] COMPLETION }\n", count, cur_item, + log_util_coll_state[cur_item->state]); + break; + default: + FI_DBG(coll_op->mc->av_set->av->prov, FI_LOG_CQ, + "\t%ld: { %p [%s] UNKNOWN }\n", count, cur_item, + log_util_coll_state[cur_item->state]); + break; + } + count++; + } +#endif +} + +static inline void util_coll_op_progress_work(struct util_ep *util_ep, + struct util_coll_operation *coll_op) +{ + struct util_coll_work_item *next_ready = NULL; + struct util_coll_work_item *cur_item = NULL; + struct util_coll_work_item *prev_item = NULL; + struct dlist_entry *tmp = NULL; + int previous_is_head; + + // clean up any completed items while searching for the next ready + dlist_foreach_container_safe(&coll_op->work_queue, struct util_coll_work_item, + cur_item, waiting_entry, tmp) + { + previous_is_head = cur_item->waiting_entry.prev == &cur_item->coll_op->work_queue; + if (!previous_is_head) { + prev_item = container_of(cur_item->waiting_entry.prev, + struct util_coll_work_item, + waiting_entry); + } + + if (cur_item->state == UTIL_COLL_COMPLETE) { + // if there is work before cur and cur is fencing, we can't complete + if (cur_item->fence && !previous_is_head) + continue; + + FI_DBG(coll_op->mc->av_set->av->prov, FI_LOG_CQ, + "Removing Completed Work item: %p \n", cur_item); + dlist_remove(&cur_item->waiting_entry); + free(cur_item); + + // if the work queue is empty, we're done + if (dlist_empty(&coll_op->work_queue)) { + free(coll_op); + return; + } + continue; + } + + // we can't progress if prior work is fencing + if (!previous_is_head && prev_item && prev_item->fence) { + FI_DBG(coll_op->mc->av_set->av->prov, FI_LOG_CQ, + "%p fenced by: %p \n", cur_item, prev_item); + return; + } + + // if the current item isn't waiting, it's not the next ready item + if (cur_item->state != UTIL_COLL_WAITING) { + FI_DBG(coll_op->mc->av_set->av->prov, FI_LOG_CQ, + "Work item not waiting: %p [%s]\n", cur_item, + log_util_coll_state[cur_item->state]); + continue; + } + + FI_DBG(coll_op->mc->av_set->av->prov, FI_LOG_CQ, "Ready item: %p \n", + cur_item); + next_ready = cur_item; + break; + } + + if (!next_ready) + return; + + util_coll_op_log_work(coll_op); + + next_ready->state = UTIL_COLL_PROCESSING; + slist_insert_tail(&next_ready->ready_entry, &util_ep->coll_ready_queue); +} + +static inline void util_coll_op_bind_work(struct util_coll_operation *coll_op, + struct util_coll_work_item *item) +{ + item->coll_op = coll_op; + dlist_insert_tail(&item->waiting_entry, &coll_op->work_queue); +} + +static int util_coll_sched_send(struct util_coll_operation *coll_op, uint32_t dest, + void *buf, int count, enum fi_datatype datatype, + int fence) +{ + struct util_coll_xfer_item *xfer_item; + + xfer_item = calloc(1, sizeof(*xfer_item)); + if (!xfer_item) + return -FI_ENOMEM; + + xfer_item->hdr.type = UTIL_COLL_SEND; + xfer_item->hdr.state = UTIL_COLL_WAITING; + xfer_item->hdr.fence = fence; + xfer_item->tag = util_coll_form_tag(coll_op->cid, coll_op->mc->local_rank); + xfer_item->buf = buf; + xfer_item->count = count; + xfer_item->datatype = datatype; + xfer_item->remote_rank = dest; + + util_coll_op_bind_work(coll_op, &xfer_item->hdr); + return FI_SUCCESS; +} + +static int util_coll_sched_recv(struct util_coll_operation *coll_op, uint32_t src, + void *buf, int count, enum fi_datatype 
datatype, + int fence) +{ + struct util_coll_xfer_item *xfer_item; + + xfer_item = calloc(1, sizeof(*xfer_item)); + if (!xfer_item) + return -FI_ENOMEM; + + xfer_item->hdr.type = UTIL_COLL_RECV; + xfer_item->hdr.state = UTIL_COLL_WAITING; + xfer_item->hdr.fence = fence; + xfer_item->tag = util_coll_form_tag(coll_op->cid, src); + xfer_item->buf = buf; + xfer_item->count = count; + xfer_item->datatype = datatype; + xfer_item->remote_rank = src; + + util_coll_op_bind_work(coll_op, &xfer_item->hdr); + return FI_SUCCESS; +} + +static int util_coll_sched_reduce(struct util_coll_operation *coll_op, void *in_buf, + void *inout_buf, int count, enum fi_datatype datatype, + enum fi_op op, int fence) +{ + struct util_coll_reduce_item *reduce_item; + + reduce_item = calloc(1, sizeof(*reduce_item)); + if (!reduce_item) + return -FI_ENOMEM; + + reduce_item->hdr.type = UTIL_COLL_REDUCE; + reduce_item->hdr.state = UTIL_COLL_WAITING; + reduce_item->hdr.fence = fence; + reduce_item->in_buf = in_buf; + reduce_item->inout_buf = inout_buf; + reduce_item->count = count; + reduce_item->datatype = datatype; + reduce_item->op = op; + + util_coll_op_bind_work(coll_op, &reduce_item->hdr); + return FI_SUCCESS; +} + +static int util_coll_sched_copy(struct util_coll_operation *coll_op, void *in_buf, + void *out_buf, int count, enum fi_datatype datatype, + int fence) +{ + struct util_coll_copy_item *copy_item; + + copy_item = calloc(1, sizeof(*copy_item)); + if (!copy_item) + return -FI_ENOMEM; + + copy_item->hdr.type = UTIL_COLL_COPY; + copy_item->hdr.state = UTIL_COLL_WAITING; + copy_item->hdr.fence = fence; + copy_item->in_buf = in_buf; + copy_item->out_buf = out_buf; + copy_item->count = count; + copy_item->datatype = datatype; + + util_coll_op_bind_work(coll_op, &copy_item->hdr); + return FI_SUCCESS; +} + +static int util_coll_sched_comp(struct util_coll_operation *coll_op) +{ + struct util_coll_work_item *comp_item; + + comp_item = calloc(1, sizeof(*comp_item)); + if (!comp_item) + return -FI_ENOMEM; + + comp_item->type = UTIL_COLL_COMP; + comp_item->state = UTIL_COLL_WAITING; + comp_item->fence = 1; + + util_coll_op_bind_work(coll_op, comp_item); + return FI_SUCCESS; +} + +/* TODO: when this fails, clean up the already scheduled work in this function */ +static int util_coll_allreduce(struct util_coll_operation *coll_op, const void *send_buf, + void *result, void* tmp_buf, int count, enum fi_datatype datatype, + enum fi_op op) +{ + uint64_t rem, pof2, my_new_id; + uint64_t local, remote, next_remote; + int ret; + uint64_t mask = 1; + + pof2 = rounddown_power_of_two(coll_op->mc->av_set->fi_addr_count); + rem = coll_op->mc->av_set->fi_addr_count - pof2; + local = coll_op->mc->local_rank; + + // copy initial send data to result + memcpy(result, send_buf, count * ofi_datatype_size(datatype)); + + if (local < 2 * rem) { + if (local % 2 == 0) { + ret = util_coll_sched_send(coll_op, local + 1, result, count, + datatype, 1); + if (ret) + return ret; + + my_new_id = -1; + } else { + ret = util_coll_sched_recv(coll_op, local - 1, + tmp_buf, count, datatype, 1); + if (ret) + return ret; + + my_new_id = local / 2; + + ret = util_coll_sched_reduce(coll_op, tmp_buf, result, + count, datatype, op, 1); + if (ret) + return ret; + } + } else { + my_new_id = local - rem; + } + + if (my_new_id != -1) { + while (mask < pof2) { + next_remote = my_new_id ^ mask; + remote = (next_remote < rem) ? 
next_remote * 2 + 1 : + next_remote + rem; + + // receive remote data into tmp buf + ret = util_coll_sched_recv(coll_op, remote, tmp_buf, count, + datatype, 0); + if (ret) + return ret; + + // send result buf, which has the current total + ret = util_coll_sched_send(coll_op, remote, result, count, + datatype, 1); + if (ret) + return ret; + + if (remote < local) { + // reduce received remote into result buf + ret = util_coll_sched_reduce(coll_op, tmp_buf, result, + count, datatype, op, 1); + if (ret) + return ret; + } else { + // reduce local result into received data + ret = util_coll_sched_reduce(coll_op, result, tmp_buf, + count, datatype, op, 1); + if (ret) + return ret; + + // copy total into result + ret = util_coll_sched_copy(coll_op, tmp_buf, result, + count, datatype, 1); + if (ret) + return ret; + } + mask <<= 1; + } + } + + if (local < 2 * rem) { + if (local % 2) { + ret = util_coll_sched_send(coll_op, local - 1, result, count, + datatype, 1); + if (ret) + return ret; + } else { + ret = util_coll_sched_recv(coll_op, local + 1, result, count, + datatype, 1); + if (ret) + return ret; + } + } + return FI_SUCCESS; +} + +static int util_coll_allgather(struct util_coll_operation *coll_op, const void *send_buf, + void *result, int count, enum fi_datatype datatype) +{ + // allgather implemented using ring algorithm + int64_t ret, i, cur_offset, next_offset; + size_t nbytes, numranks; + uint64_t local_rank, left_rank, right_rank; + + local_rank = coll_op->mc->local_rank; + nbytes = ofi_datatype_size(datatype) * count; + numranks = coll_op->mc->av_set->fi_addr_count; + + // copy the local value to the appropriate place in result buffer + ret = util_coll_sched_copy(coll_op, (void *) send_buf, + (char *) result + (local_rank * nbytes), count, + datatype, 1); + if (ret) + return ret; + + // send to right, recv from left + left_rank = (numranks + local_rank - 1) % numranks; + right_rank = (local_rank + 1) % numranks; + + cur_offset = local_rank; + next_offset = left_rank; + + // fill in result with data going right to left + for (i = 1; i < numranks; i++) { + ret = util_coll_sched_send(coll_op, right_rank, + (char *) result + (cur_offset * nbytes), count, + datatype, 0); + if (ret) + return ret; + + ret = util_coll_sched_recv(coll_op, left_rank, + (char *) result + (next_offset * nbytes), + count, datatype, 1); + if (ret) + return ret; + + cur_offset = next_offset; + next_offset = (numranks + next_offset - 1) % numranks; + } + + return FI_SUCCESS; +} + +static size_t util_binomial_tree_values_to_recv(uint64_t rank, size_t numranks) +{ + size_t nvalues = 0x1 << (ofi_lsb(rank) - 1); + if (numranks < rank + nvalues) + nvalues = numranks - rank; + + return nvalues; +} + +static int util_coll_scatter(struct util_coll_operation *coll_op, const void *data, + void *result, void **temp, size_t count, uint64_t root, + enum fi_datatype datatype) +{ + // scatter implemented with binomial tree algorithm + uint64_t local_rank, relative_rank; + size_t nbytes, numranks, send_cnt, cur_cnt = 0; + int ret, mask, remote_rank; + void *send_data; + + local_rank = coll_op->mc->local_rank; + numranks = coll_op->mc->av_set->fi_addr_count; + relative_rank = (local_rank >= root) ? 
local_rank - root : local_rank - root + numranks; + nbytes = count * ofi_datatype_size(datatype); + + // check if we need to participate + if (count == 0) + return FI_SUCCESS; + + // non-root even nodes get a temp buffer for receiving data + // these nodes may need to send part of what they receive + if (relative_rank && !(relative_rank % 2)) { + cur_cnt = count * util_binomial_tree_values_to_recv(relative_rank, numranks); + *temp = malloc(cur_cnt * ofi_datatype_size(datatype)); + if (!*temp) + return -FI_ENOMEM; + } + + if (local_rank == root) { + cur_cnt = count * numranks; + if (root != 0) { + // if we're root but not rank 0, we need to reorder the send buffer + // according to destination rank. if we're rank 3, data intended for + // ranks 0-2 will be moved to the end + *temp = malloc(cur_cnt * ofi_datatype_size(datatype)); + if (!*temp) + return -FI_ENOMEM; + ret = util_coll_sched_copy(coll_op, + (char *) data + nbytes * local_rank, *temp, + (numranks - local_rank) * count, datatype, + 1); + if (ret) + return ret; + + ret = util_coll_sched_copy(coll_op, (char *) data, + (char *) *temp + + (numranks - local_rank) * nbytes, + local_rank * count, datatype, 1); + if (ret) + return ret; + } + } + + // set up all receives + mask = 0x1; + while (mask < numranks) { + if (relative_rank & mask) { + remote_rank = local_rank - mask; + if (remote_rank < 0) + remote_rank += numranks; + + if (relative_rank % 2) { + // leaf node, we're receiving the actual data + ret = util_coll_sched_recv(coll_op, remote_rank, result, count, + datatype, 1); + if (ret) + return ret; + } else { + // branch node, we're receiving data which we've got to forward + ret = util_coll_sched_recv(coll_op, remote_rank, *temp, + cur_cnt, datatype, 1); + if (ret) + return ret; + } + break; + } + mask <<= 1; + } + + // set up all sends + send_data = root == local_rank && root == 0 ? 
(void *) data : *temp; + mask >>= 1; + while (mask > 0) { + if (relative_rank + mask < numranks) { + // to this point, cur_cnt has represented the number of values + // to expect to store in our data buf + // from here on, cur_cnt is the number of values we have left to + // forward from the data buf + send_cnt = cur_cnt - count * mask; + + remote_rank = local_rank + mask; + if (remote_rank >= numranks) + remote_rank -= numranks; + + FI_DBG(coll_op->mc->av_set->av->prov, FI_LOG_CQ, + "MASK: 0x%0x CUR_CNT: %ld SENDING: %ld TO: %d\n", mask, + cur_cnt, send_cnt, remote_rank); + + assert(send_cnt > 0); + + ret = util_coll_sched_send(coll_op, remote_rank, + (char *) send_data + + nbytes * mask, + send_cnt, datatype, 1); + if (ret) + return ret; + + cur_cnt -= send_cnt; + } + mask >>= 1; + } + + if (!(relative_rank % 2)) { + // for the root and all even nodes, we've got to copy + // our local data to the result buffer + ret = util_coll_sched_copy(coll_op, send_data, result, count, datatype, 1); + } + + return FI_SUCCESS; +} + +static int util_coll_close(struct fid *fid) +{ + struct util_coll_mc *coll_mc; + + coll_mc = container_of(fid, struct util_coll_mc, mc_fid.fid); + + ofi_atomic_dec32(&coll_mc->av_set->ref); + free(coll_mc); + + return FI_SUCCESS; +} + +static struct fi_ops util_coll_fi_ops = { + .size = sizeof(struct fi_ops), + .close = util_coll_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static inline void util_coll_mc_init(struct util_coll_mc *coll_mc, + struct util_av_set *av_set, + struct fid_ep *ep, void *context) +{ + coll_mc->mc_fid.fid.fclass = FI_CLASS_MC; + coll_mc->mc_fid.fid.context = context; + coll_mc->mc_fid.fid.ops = &util_coll_fi_ops; + coll_mc->mc_fid.fi_addr = (uintptr_t) coll_mc; + coll_mc->ep = ep; + assert(av_set != NULL); + ofi_atomic_inc32(&av_set->ref); + coll_mc->av_set = av_set; +} + +static int ofi_av_set_addr(struct fid_av_set *set, fi_addr_t *coll_addr) +{ + struct util_av_set *av_set; + + av_set = container_of(set, struct util_av_set, av_set_fid); + + *coll_addr = (uintptr_t) &av_set->coll_mc; + + return FI_SUCCESS; +} + +/* TODO: Figure out requirements for using collectives. + * e.g. require local address to be in AV? 
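+ * (currently the local rank is simply the index of this endpoint's
+ *  own address within the av_set's fi_addr_array, or FI_ADDR_NOTAVAIL
+ *  when that address has not been inserted into the AV)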
+ * Determine best way to handle first join request + */ +static int util_coll_find_local_rank(struct fid_ep *ep, struct util_coll_mc *coll_mc) +{ + size_t addrlen; + char *addr; + int ret; + fi_addr_t my_addr; + int i; + + addrlen = 0; + ret = fi_getname(&ep->fid, NULL, &addrlen); + if (ret != FI_SUCCESS && addrlen == 0) + return ret; + + addr = calloc(1, addrlen); + if (!addr) + return -FI_ENOMEM; + + ret = fi_getname(&ep->fid, addr, &addrlen); + if (ret) { + free(addr); + return ret; + } + my_addr = ofi_av_lookup_fi_addr(coll_mc->av_set->av, addr); + + coll_mc->local_rank = FI_ADDR_NOTAVAIL; + if (my_addr != FI_ADDR_NOTAVAIL) { + for (i=0; i<coll_mc->av_set->fi_addr_count; i++) + if (coll_mc->av_set->fi_addr_array[i] == my_addr) { + coll_mc->local_rank = i; + break; + } + } + + free(addr); + + return FI_SUCCESS; +} + +void util_coll_join_comp(struct util_coll_operation *coll_op) +{ + struct fi_eq_entry entry; + struct util_ep *ep = container_of(coll_op->mc->ep, struct util_ep, ep_fid); + + coll_op->data.join.new_mc->seq = 0; + coll_op->data.join.new_mc->group_id = ofi_bitmask_get_lsbset(coll_op->data.join.data); + // mark the local mask bit + ofi_bitmask_unset(ep->coll_cid_mask, coll_op->data.join.new_mc->group_id); + + /* write to the eq */ + memset(&entry, 0, sizeof(entry)); + entry.fid = &coll_op->mc->mc_fid.fid; + entry.context = coll_op->context; + + if (ofi_eq_write(&ep->eq->eq_fid, FI_JOIN_COMPLETE, &entry, + sizeof(struct fi_eq_entry), FI_COLLECTIVE) < 0) + FI_WARN(ep->domain->fabric->prov, FI_LOG_DOMAIN, + "join collective - eq write failed\n"); + + ofi_bitmask_free(&coll_op->data.join.data); + ofi_bitmask_free(&coll_op->data.join.tmp); +} + +void util_coll_collective_comp(struct util_coll_operation *coll_op) +{ + struct util_ep *ep; + + ep = container_of(coll_op->mc->ep, struct util_ep, ep_fid); + + if (ofi_cq_write(ep->tx_cq, coll_op->context, FI_COLLECTIVE, 0, 0, 0, 0)) + FI_WARN(ep->domain->fabric->prov, FI_LOG_DOMAIN, + "barrier collective - cq write failed\n"); + + switch (coll_op->type) { + case UTIL_COLL_ALLREDUCE_OP: + free(coll_op->data.allreduce.data); + break; + case UTIL_COLL_SCATTER_OP: + free(coll_op->data.scatter); + break; + case UTIL_COLL_BROADCAST_OP: + free(coll_op->data.broadcast.chunk); + free(coll_op->data.broadcast.scatter); + break; + case UTIL_COLL_JOIN_OP: + case UTIL_COLL_BARRIER_OP: + case UTIL_COLL_ALLGATHER_OP: + default: + //nothing to clean up + break; + } +} + +static int util_coll_proc_reduce_item(struct util_coll_reduce_item *reduce_item) +{ + if (FI_MIN <= reduce_item->op && FI_BXOR >= reduce_item->op) { + ofi_atomic_write_handler(reduce_item->op, reduce_item->datatype, + reduce_item->inout_buf, + reduce_item->in_buf, + reduce_item->count); + } else { + return -FI_ENOSYS; + } + return FI_SUCCESS; +} + +int util_coll_process_xfer_item(struct util_coll_xfer_item *item) { + struct iovec iov; + struct fi_msg_tagged msg; + struct util_coll_mc *mc = item->hdr.coll_op->mc; + int ret; + + msg.msg_iov = &iov; + msg.desc = NULL; + msg.iov_count = 1; + msg.ignore = 0; + msg.context = item; + msg.data = 0; + msg.tag = item->tag; + msg.addr = mc->av_set->fi_addr_array[item->remote_rank]; + + iov.iov_base = item->buf; + iov.iov_len = (item->count * ofi_datatype_size(item->datatype)); + + if (item->hdr.type == UTIL_COLL_SEND) { + ret = fi_tsendmsg(mc->ep, &msg, FI_COLLECTIVE); + if (!ret) + FI_DBG(mc->av_set->av->prov, FI_LOG_CQ, + "%p SEND [0x%02lx] -> [0x%02x] cnt: %d sz: %ld\n", item, + item->hdr.coll_op->mc->local_rank, item->remote_rank, + item->count, + 
item->count * ofi_datatype_size(item->datatype)); + return ret; + } else if (item->hdr.type == UTIL_COLL_RECV) { + ret = fi_trecvmsg(mc->ep, &msg, FI_COLLECTIVE); + if (!ret) + FI_DBG(mc->av_set->av->prov, FI_LOG_CQ, + "%p RECV [0x%02lx] <- [0x%02x] cnt: %d sz: %ld\n", item, + item->hdr.coll_op->mc->local_rank, item->remote_rank, + item->count, + item->count * ofi_datatype_size(item->datatype)); + return ret; + } + + return -FI_ENOSYS; +} + +int ofi_coll_ep_progress(struct fid_ep *ep) +{ + struct util_coll_work_item *work_item; + struct util_coll_reduce_item *reduce_item; + struct util_coll_copy_item *copy_item; + struct util_coll_xfer_item *xfer_item; + struct util_coll_operation *coll_op; + struct util_ep *util_ep; + int ret; + + util_ep = container_of(ep, struct util_ep, ep_fid); + + while (!slist_empty(&util_ep->coll_ready_queue)) { + slist_remove_head_container(&util_ep->coll_ready_queue, + struct util_coll_work_item, work_item, + ready_entry); + coll_op = work_item->coll_op; + switch (work_item->type) { + case UTIL_COLL_SEND: + xfer_item = container_of(work_item, struct util_coll_xfer_item, hdr); + ret = util_coll_process_xfer_item(xfer_item); + if (ret && ret == -FI_EAGAIN) { + slist_insert_tail(&work_item->ready_entry, + &util_ep->coll_ready_queue); + goto out; + } + break; + case UTIL_COLL_RECV: + xfer_item = container_of(work_item, struct util_coll_xfer_item, hdr); + ret = util_coll_process_xfer_item(xfer_item); + if (ret) + goto out; + break; + case UTIL_COLL_REDUCE: + reduce_item = container_of(work_item, struct util_coll_reduce_item, hdr); + ret = util_coll_proc_reduce_item(reduce_item); + if (ret) + goto out; + + reduce_item->hdr.state = UTIL_COLL_COMPLETE; + break; + case UTIL_COLL_COPY: + copy_item = container_of(work_item, struct util_coll_copy_item, hdr); + memcpy(copy_item->out_buf, copy_item->in_buf, + copy_item->count * ofi_datatype_size(copy_item->datatype)); + + copy_item->hdr.state = UTIL_COLL_COMPLETE; + break; + case UTIL_COLL_COMP: + if (work_item->coll_op->comp_fn) + work_item->coll_op->comp_fn(work_item->coll_op); + + work_item->state = UTIL_COLL_COMPLETE; + break; + default: + ret = FI_ENOSYS; + goto out; + } + + util_coll_op_progress_work(util_ep, coll_op); + } + + ret = FI_SUCCESS; + +out: + return ret; +} + +int ofi_join_collective(struct fid_ep *ep, fi_addr_t coll_addr, + const struct fid_av_set *set, + uint64_t flags, struct fid_mc **mc, void *context) +{ + struct util_coll_mc *new_coll_mc; + struct util_av_set *av_set; + struct util_coll_mc *coll_mc; + struct util_coll_operation *join_op; + struct util_ep *util_ep; + int ret; + + av_set = container_of(set, struct util_av_set, av_set_fid); + + if (coll_addr == FI_ADDR_NOTAVAIL) { + assert(av_set->av->coll_mc != NULL); + coll_mc = av_set->av->coll_mc; + } else { + coll_mc = (struct util_coll_mc*) ((uintptr_t) coll_addr); + } + + new_coll_mc = calloc(1, sizeof(*new_coll_mc)); + if (!new_coll_mc) + return -FI_ENOMEM; + + // set up the new mc for future collectives + util_coll_mc_init(new_coll_mc, av_set, ep, context); + + coll_mc->ep = ep; + + util_ep = container_of(ep, struct util_ep, ep_fid); + + /* get the rank */ + util_coll_find_local_rank(ep, new_coll_mc); + util_coll_find_local_rank(ep, coll_mc); + + ret = util_coll_op_create(&join_op, coll_mc, UTIL_COLL_JOIN_OP, context, + util_coll_join_comp); + if (ret) + goto err1; + + join_op->data.join.new_mc = new_coll_mc; + + ret = ofi_bitmask_create(&join_op->data.join.data, OFI_MAX_GROUP_ID); + if (ret) + goto err2; + + ret = 
ofi_bitmask_create(&join_op->data.join.tmp, OFI_MAX_GROUP_ID); + if (ret) + goto err3; + + ret = util_coll_allreduce(join_op, util_ep->coll_cid_mask->bytes, + join_op->data.join.data.bytes, + join_op->data.join.tmp.bytes, + ofi_bitmask_bytesize(util_ep->coll_cid_mask), + FI_UINT8, FI_BAND); + if (ret) + goto err4; + + ret = util_coll_sched_comp(join_op); + if (ret) + goto err4; + + util_coll_op_progress_work(util_ep, join_op); + + *mc = &new_coll_mc->mc_fid; + return FI_SUCCESS; +err4: + ofi_bitmask_free(&join_op->data.join.tmp); +err3: + ofi_bitmask_free(&join_op->data.join.data); +err2: + free(join_op); +err1: + free(new_coll_mc); + return ret; +} + +static struct fi_ops_av_set util_av_set_ops= { + .set_union = ofi_av_set_union, + .intersect = ofi_av_set_intersect, + .diff = ofi_av_set_diff, + .insert = ofi_av_set_insert, + .remove = ofi_av_set_remove, + .addr = ofi_av_set_addr +}; + +static int util_coll_copy_from_av(struct util_av *av, void *addr, + fi_addr_t fi_addr, void *arg) +{ + struct util_av_set *av_set = (struct util_av_set *) arg; + av_set->fi_addr_array[av_set->fi_addr_count++] = fi_addr; + return FI_SUCCESS; +} + +static int util_av_set_close(struct fid *fid) +{ + struct util_av_set *av_set; + + av_set = container_of(fid, struct util_av_set, av_set_fid.fid); + + // release reference taken by internal coll_mc embedded in av_set + ofi_atomic_dec32(&av_set->ref); + if (ofi_atomic_get32(&av_set->ref) > 0) + return -FI_EBUSY; + + free(av_set); + + return FI_SUCCESS; +} + +static struct fi_ops util_av_set_fi_ops = { + .size = sizeof(struct fi_ops), + .close = util_av_set_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static inline int util_av_set_init(struct util_av_set *av_set, + struct util_av *util_av, + void *context) +{ + int ret = FI_SUCCESS; + + av_set->av_set_fid.ops = &util_av_set_ops; + av_set->av_set_fid.fid.fclass = FI_CLASS_AV_SET; + av_set->av_set_fid.fid.context = context; + av_set->av_set_fid.fid.ops = &util_av_set_fi_ops; + av_set->av = util_av; + ofi_atomic_initialize32(&av_set->ref, 0); + ret = fastlock_init(&av_set->lock); + + return ret; +} + +static int util_coll_av_init(struct util_av *av) +{ + struct util_coll_mc *coll_mc; + int ret; + + assert(!av->coll_mc); + + coll_mc = calloc(1, sizeof(*coll_mc)); + if (!coll_mc) + return -FI_ENOMEM; + + coll_mc->av_set = calloc(1, sizeof(*coll_mc->av_set)); + if (!coll_mc->av_set) { + ret = -FI_ENOMEM; + goto err1; + } + ret = util_av_set_init(coll_mc->av_set, av, NULL); + if (ret) + goto err3; + + coll_mc->av_set->fi_addr_array = calloc(ofi_av_size(av), + sizeof(*coll_mc->av_set->fi_addr_array)); + if (!coll_mc->av_set->fi_addr_array) { + ret = -FI_ENOMEM; + goto err2; + } + + ret = ofi_av_elements_iter(av, util_coll_copy_from_av, + (void *)coll_mc->av_set); + if (ret) + goto err4; + + util_coll_mc_init(coll_mc, coll_mc->av_set, NULL, NULL); + + av->coll_mc = coll_mc; + return FI_SUCCESS; + +err4: + fastlock_destroy(&coll_mc->av_set->lock); +err3: + free(coll_mc->av_set->fi_addr_array); +err2: + free(coll_mc->av_set); +err1: + free(coll_mc); + return ret; +} + +int ofi_av_set(struct fid_av *av, struct fi_av_set_attr *attr, + struct fid_av_set **av_set_fid, void * context) +{ + struct util_av *util_av = container_of(av, struct util_av, av_fid); + struct util_av_set *av_set; + int ret, iter; + + if (!util_av->coll_mc) { + ret = util_coll_av_init(util_av); + if (ret) + return ret; + } + + av_set = calloc(1,sizeof(*av_set)); + if (!av_set) + return -FI_ENOMEM; + + ret = 
util_av_set_init(av_set, util_av, context); + if (ret) + goto err1; + + av_set->fi_addr_array = calloc(ofi_av_size(util_av), + sizeof(*av_set->fi_addr_array)); + if (!av_set->fi_addr_array) + goto err2; + + for (iter = 0; iter < attr->count; iter++) { + av_set->fi_addr_array[iter] = + util_av->coll_mc->av_set->fi_addr_array[iter * attr->stride]; + av_set->fi_addr_count++; + } + + util_coll_mc_init(&av_set->coll_mc, av_set, NULL, context); + + (*av_set_fid) = &av_set->av_set_fid; + return FI_SUCCESS; +err2: + fastlock_destroy(&av_set->lock); +err1: + free(av_set); + return ret; +} + +ssize_t ofi_ep_barrier(struct fid_ep *ep, fi_addr_t coll_addr, void *context) +{ + struct util_coll_mc *coll_mc; + struct util_coll_operation *barrier_op; + struct util_ep *util_ep; + uint64_t send; + int ret; + + coll_mc = (struct util_coll_mc*) ((uintptr_t) coll_addr); + + ret = util_coll_op_create(&barrier_op, coll_mc, UTIL_COLL_BARRIER_OP, context, + util_coll_collective_comp); + if (ret) + return ret; + + send = ~barrier_op->mc->local_rank; + ret = util_coll_allreduce(barrier_op, &send, &barrier_op->data.barrier.data, + &barrier_op->data.barrier.tmp, 1, FI_UINT64, FI_BAND); + if (ret) + goto err1; + + ret = util_coll_sched_comp(barrier_op); + if (ret) + goto err1; + + util_ep = container_of(ep, struct util_ep, ep_fid); + util_coll_op_progress_work(util_ep, barrier_op); + + return FI_SUCCESS; +err1: + free(barrier_op); + return ret; +} + +ssize_t ofi_ep_allreduce(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, enum fi_op op, uint64_t flags, + void *context) +{ + struct util_coll_mc *coll_mc; + struct util_coll_operation *allreduce_op; + struct util_ep *util_ep; + int ret; + + coll_mc = (struct util_coll_mc *) ((uintptr_t) coll_addr); + ret = util_coll_op_create(&allreduce_op, coll_mc, UTIL_COLL_ALLREDUCE_OP, context, + util_coll_collective_comp); + if (ret) + return ret; + + + allreduce_op->data.allreduce.size = count * ofi_datatype_size(datatype); + allreduce_op->data.allreduce.data = calloc(count, ofi_datatype_size(datatype)); + if (!allreduce_op->data.allreduce.data) + goto err1; + + ret = util_coll_allreduce(allreduce_op, buf, result, allreduce_op->data.allreduce.data, count, + datatype, op); + if (ret) + goto err2; + + ret = util_coll_sched_comp(allreduce_op); + if (ret) + goto err2; + + util_ep = container_of(ep, struct util_ep, ep_fid); + util_coll_op_progress_work(util_ep, allreduce_op); + + return FI_SUCCESS; +err2: + free(allreduce_op->data.allreduce.data); +err1: + free(allreduce_op); + return ret; +} + +ssize_t ofi_ep_allgather(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, uint64_t flags, void *context) +{ + struct util_coll_mc *coll_mc; + struct util_coll_operation *allgather_op; + struct util_ep *util_ep; + int ret; + + coll_mc = (struct util_coll_mc *) ((uintptr_t) coll_addr); + ret = util_coll_op_create(&allgather_op, coll_mc, UTIL_COLL_ALLGATHER_OP, context, + util_coll_collective_comp); + if (ret) + return ret; + + ret = util_coll_allgather(allgather_op, buf, result, count, datatype); + if (ret) + goto err; + + ret = util_coll_sched_comp(allgather_op); + if (ret) + goto err; + + util_ep = container_of(ep, struct util_ep, ep_fid); + util_coll_op_progress_work(util_ep, allgather_op); + + return FI_SUCCESS; +err: + free(allgather_op); + return ret; +} + +ssize_t ofi_ep_scatter(struct fid_ep *ep, 
const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, uint64_t flags, + void *context) +{ + struct util_coll_mc *coll_mc; + struct util_coll_operation *scatter_op; + struct util_ep *util_ep; + int ret; + + coll_mc = (struct util_coll_mc *) ((uintptr_t) coll_addr); + ret = util_coll_op_create(&scatter_op, coll_mc, UTIL_COLL_SCATTER_OP, context, + util_coll_collective_comp); + if (ret) + return ret; + + ret = util_coll_scatter(scatter_op, buf, result, &scatter_op->data.scatter, count, root_addr, datatype); + if (ret) + goto err; + + ret = util_coll_sched_comp(scatter_op); + if (ret) + goto err; + + util_ep = container_of(ep, struct util_ep, ep_fid); + util_coll_op_progress_work(util_ep, scatter_op); + + return FI_SUCCESS; +err: + free(scatter_op); + return ret; +} + +ssize_t ofi_ep_broadcast(struct fid_ep *ep, void *buf, size_t count, void *desc, + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, void *context) +{ + struct util_coll_mc *coll_mc; + struct util_coll_operation *broadcast_op; + struct util_ep *util_ep; + int ret, chunk_cnt, numranks, local; + + coll_mc = (struct util_coll_mc *) ((uintptr_t) coll_addr); + ret = util_coll_op_create(&broadcast_op, coll_mc, UTIL_COLL_BROADCAST_OP, context, + util_coll_collective_comp); + if (ret) + return ret; + + local = broadcast_op->mc->local_rank; + numranks = broadcast_op->mc->av_set->fi_addr_count; + chunk_cnt = (count + numranks - 1) / numranks; + if (chunk_cnt * local > count && chunk_cnt * local - (int) count > chunk_cnt) + chunk_cnt = 0; + + broadcast_op->data.broadcast.chunk = malloc(chunk_cnt * ofi_datatype_size(datatype)); + if (!broadcast_op->data.broadcast.chunk) { + ret = -FI_ENOMEM; + goto err1; + } + + ret = util_coll_scatter(broadcast_op, buf, broadcast_op->data.broadcast.chunk, + &broadcast_op->data.broadcast.scatter, chunk_cnt, + root_addr, datatype); + if (ret) + goto err2; + + ret = util_coll_allgather(broadcast_op, broadcast_op->data.broadcast.chunk, buf, + chunk_cnt, datatype); + if (ret) + goto err2; + + ret = util_coll_sched_comp(broadcast_op); + if (ret) + goto err2; + + util_ep = container_of(ep, struct util_ep, ep_fid); + util_coll_op_progress_work(util_ep, broadcast_op); + + return FI_SUCCESS; +err2: + free(broadcast_op->data.broadcast.chunk); +err1: + free(broadcast_op); + return ret; +} + +void ofi_coll_handle_xfer_comp(uint64_t tag, void *ctx) +{ + struct util_ep *util_ep; + struct util_coll_xfer_item *xfer_item = (struct util_coll_xfer_item *) ctx; + xfer_item->hdr.state = UTIL_COLL_COMPLETE; + + FI_DBG(xfer_item->hdr.coll_op->mc->av_set->av->prov, FI_LOG_CQ, + "\tXfer complete: { %p %s Remote: 0x%02x Local: 0x%02lx cnt: %d typesize: %ld }\n", + xfer_item, xfer_item->hdr.type == UTIL_COLL_SEND ? 
"SEND" : "RECV", + xfer_item->remote_rank, xfer_item->hdr.coll_op->mc->local_rank, + xfer_item->count, ofi_datatype_size(xfer_item->datatype)); + util_ep = container_of(xfer_item->hdr.coll_op->mc->ep, struct util_ep, ep_fid); + util_coll_op_progress_work(util_ep, xfer_item->hdr.coll_op); +} + +int ofi_query_collective(struct fid_domain *domain, enum fi_collective_op coll, + struct fi_collective_attr *attr, uint64_t flags) +{ + int ret; + + if (!attr || attr->mode != 0) + return -FI_EINVAL; + + switch (coll) { + case FI_BARRIER: + case FI_ALLGATHER: + case FI_SCATTER: + case FI_BROADCAST: + ret = FI_SUCCESS; + break; + case FI_ALLREDUCE: + if (FI_MIN <= attr->op && FI_BXOR >= attr->op) + ret = fi_query_atomic(domain, attr->datatype, attr->op, + &attr->datatype_attr, flags); + else + return -FI_ENOSYS; + break; + case FI_ALLTOALL: + case FI_REDUCE_SCATTER: + case FI_REDUCE: + case FI_GATHER: + default: + return -FI_ENOSYS; + } + + if (ret) + return ret; + + // with the currently implemented software based collective operations + // the only restriction is the number of ranks we can address, as limited + // by the size of the rank portion of the collective tag, which is 31 bits. + // future collectives may impose further restrictions which will need to update + // the calculation. For example, operations which require dedicated space in + // the recieve buffer for each rank would limit the number of members by buffer + // size and value type (8kB buffer / 64B value = 128 member max). + // hardware may impose further restrictions + attr->max_members = ~(0x80000000); + + return FI_SUCCESS; +} diff --git a/prov/util/src/util_cq.c b/prov/util/src/util_cq.c index 965f55c36aa..a0be452356a 100644 --- a/prov/util/src/util_cq.c +++ b/prov/util/src/util_cq.c @@ -38,58 +38,69 @@ #define UTIL_DEF_CQ_SIZE (1024) -/* Caller must hold `cq_lock` */ -int ofi_cq_write_overflow(struct util_cq *cq, void *context, uint64_t flags, size_t len, - void *buf, uint64_t data, uint64_t tag, fi_addr_t src) + +/* While the CQ is full, we continue to add new entries to the auxiliary + * queue. 
+ */ +static void ofi_cq_insert_aux(struct util_cq *cq, + struct util_cq_aux_entry *entry) +{ + if (!ofi_cirque_isfull(cq->cirq)) + ofi_cirque_commit(cq->cirq); + + entry->cq_slot = ofi_cirque_tail(cq->cirq); + entry->cq_slot->flags = UTIL_FLAG_AUX; + slist_insert_tail(&entry->list_entry, &cq->aux_queue); +} + +/* Caller must hold 'cq lock' */ +int ofi_cq_write_overflow(struct util_cq *cq, void *context, uint64_t flags, + size_t len, void *buf, uint64_t data, uint64_t tag, + fi_addr_t src) { - struct util_cq_oflow_err_entry *entry; + struct util_cq_aux_entry *entry; - assert(ofi_cirque_isfull(cq->cirq)); + FI_DBG(cq->domain->prov, FI_LOG_CQ, "writing to CQ overflow list\n"); + assert(ofi_cirque_freecnt(cq->cirq) <= 1); if (!(entry = calloc(1, sizeof(*entry)))) return -FI_ENOMEM; - entry->parent_comp = ofi_cirque_tail(cq->cirq); - entry->parent_comp->flags |= UTIL_FLAG_OVERFLOW; - entry->comp.op_context = context; entry->comp.flags = flags; entry->comp.len = len; entry->comp.buf = buf; entry->comp.data = data; entry->comp.tag = tag; - + entry->comp.err = 0; entry->src = src; - slist_insert_tail(&entry->list_entry, &cq->oflow_err_list); + ofi_cq_insert_aux(cq, entry); return 0; } -int ofi_cq_write_error(struct util_cq *cq, - const struct fi_cq_err_entry *err_entry) +/* Caller must hold 'cq lock' */ +int ofi_cq_insert_error(struct util_cq *cq, + const struct fi_cq_err_entry *err_entry) { - struct util_cq_oflow_err_entry *entry; - struct fi_cq_tagged_entry *comp; + struct util_cq_aux_entry *entry; assert(err_entry->err); - if (!(entry = calloc(1, sizeof(*entry)))) return -FI_ENOMEM; entry->comp = *err_entry; - cq->cq_fastlock_acquire(&cq->cq_lock); - slist_insert_tail(&entry->list_entry, &cq->oflow_err_list); + ofi_cq_insert_aux(cq, entry); + return 0; +} - if (OFI_UNLIKELY(ofi_cirque_isfull(cq->cirq))) { - comp = ofi_cirque_tail(cq->cirq); - comp->flags |= (UTIL_FLAG_ERROR | UTIL_FLAG_OVERFLOW); - entry->parent_comp = ofi_cirque_tail(cq->cirq); - } else { - comp = ofi_cirque_tail(cq->cirq); - comp->flags = UTIL_FLAG_ERROR; - ofi_cirque_commit(cq->cirq); - } +int ofi_cq_write_error(struct util_cq *cq, + const struct fi_cq_err_entry *err_entry) +{ + cq->cq_fastlock_acquire(&cq->cq_lock); + ofi_cq_insert_error(cq, err_entry); cq->cq_fastlock_release(&cq->cq_lock); + if (cq->wait) cq->wait->signal(cq->wait); return 0; @@ -142,6 +153,7 @@ int ofi_check_cq_attr(const struct fi_provider *prov, switch (attr->wait_obj) { case FI_WAIT_NONE: + case FI_WAIT_YIELD: break; case FI_WAIT_SET: if (!attr->wait_set) { @@ -151,6 +163,7 @@ int ofi_check_cq_attr(const struct fi_provider *prov, /* fall through */ case FI_WAIT_UNSPEC: case FI_WAIT_FD: + case FI_WAIT_POLLFD: switch (attr->wait_cond) { case FI_CQ_COND_NONE: case FI_CQ_COND_THRESHOLD: @@ -201,40 +214,12 @@ static void util_cq_read_tagged(void **dst, void *src) *(char **)dst += sizeof(struct fi_cq_tagged_entry); } -static inline -void util_cq_read_oflow_entry(struct util_cq *cq, - struct util_cq_oflow_err_entry *oflow_entry, - struct fi_cq_tagged_entry *cirq_entry, - void **buf, fi_addr_t *src_addr, ssize_t i) -{ - if (src_addr && cq->src) { - src_addr[i] = cq->src[ofi_cirque_rindex(cq->cirq)]; - cq->src[ofi_cirque_rindex(cq->cirq)] = oflow_entry->src; - } - cq->read_entry(buf, cirq_entry); - cirq_entry->op_context = oflow_entry->comp.op_context; - cirq_entry->flags = oflow_entry->comp.flags; - cirq_entry->len = oflow_entry->comp.len; - cirq_entry->buf = oflow_entry->comp.buf; - cirq_entry->data = oflow_entry->comp.data; - cirq_entry->tag = 
oflow_entry->comp.tag; -} - -static inline -void util_cq_read_entry(struct util_cq *cq, struct fi_cq_tagged_entry *entry, - void **buf, fi_addr_t *src_addr, ssize_t i) -{ - if (src_addr && cq->src) - src_addr[i] = cq->src[ofi_cirque_rindex(cq->cirq)]; - cq->read_entry(buf, entry); - ofi_cirque_discard(cq->cirq); -} - ssize_t ofi_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count, fi_addr_t *src_addr) { - struct util_cq *cq; struct fi_cq_tagged_entry *entry; + struct util_cq_aux_entry *aux_entry; + struct util_cq *cq; ssize_t i; cq = container_of(cq_fid, struct util_cq, cq_fid); @@ -253,54 +238,40 @@ ssize_t ofi_cq_readfrom(struct fid_cq *cq_fid, void *buf, size_t count, if (count > ofi_cirque_usedcnt(cq->cirq)) count = ofi_cirque_usedcnt(cq->cirq); - for (i = 0; i < (ssize_t)count; i++) { + for (i = 0; i < (ssize_t) count; i++) { entry = ofi_cirque_head(cq->cirq); - if (OFI_UNLIKELY(entry->flags & (UTIL_FLAG_ERROR | - UTIL_FLAG_OVERFLOW))) { - if (entry->flags & UTIL_FLAG_ERROR) { - struct util_cq_oflow_err_entry *oflow_err_entry = - container_of(cq->oflow_err_list.head, - struct util_cq_oflow_err_entry, - list_entry); - if (oflow_err_entry->comp.err) { - /* This handles case when the head of oflow_err_list is - * an error entry. - * - * NOTE: if this isn't an error entry, we have to handle - * overflow entries and then the error entries to ensure - * ordering. */ - if (!i) - i = -FI_EAVAIL; - break; - } + if (!(entry->flags & UTIL_FLAG_AUX)) { + if (src_addr && cq->src) + src_addr[i] = cq->src[ofi_cirque_rindex(cq->cirq)]; + cq->read_entry(&buf, entry); + ofi_cirque_discard(cq->cirq); + } else { + assert(!slist_empty(&cq->aux_queue)); + aux_entry = container_of(cq->aux_queue.head, + struct util_cq_aux_entry, + list_entry); + assert(aux_entry->cq_slot == entry); + if (aux_entry->comp.err) { + if (!i) + i = -FI_EAVAIL; + break; } - if (entry->flags & UTIL_FLAG_OVERFLOW) { - assert(!slist_empty(&cq->oflow_err_list)); - struct util_cq_oflow_err_entry *oflow_entry = - container_of(cq->oflow_err_list.head, - struct util_cq_oflow_err_entry, - list_entry); - if (oflow_entry->parent_comp != entry) { - /* Handle case when all overflow/error CQ entries were read - * for particular CIRQ entry */ - entry->flags &= ~(UTIL_FLAG_OVERFLOW | UTIL_FLAG_ERROR); - } else { - uint64_t service_flags = - (entry->flags & (UTIL_FLAG_OVERFLOW | UTIL_FLAG_ERROR)); - slist_remove_head(&cq->oflow_err_list); - - entry->flags &= ~(service_flags); - util_cq_read_oflow_entry(cq, oflow_entry, entry, - &buf, src_addr, i); - /* To ensure checking of overflow CQ entries once again */ - if (!slist_empty(&cq->oflow_err_list)) - entry->flags |= service_flags; - free(oflow_entry); - continue; - } + + if (src_addr && cq->src) + src_addr[i] = aux_entry->src; + cq->read_entry(&buf, &aux_entry->comp); + slist_remove_head(&cq->aux_queue); + + if (slist_empty(&cq->aux_queue)) { + ofi_cirque_discard(cq->cirq); + } else { + aux_entry = container_of(cq->aux_queue.head, + struct util_cq_aux_entry, + list_entry); + if (aux_entry->cq_slot != ofi_cirque_head(cq->cirq)) + ofi_cirque_discard(cq->cirq); } } - util_cq_read_entry(cq, entry, &buf, src_addr, i); } out: cq->cq_fastlock_release(&cq->cq_lock); @@ -315,10 +286,8 @@ ssize_t ofi_cq_read(struct fid_cq *cq_fid, void *buf, size_t count) ssize_t ofi_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *buf, uint64_t flags) { + struct util_cq_aux_entry *aux_entry; struct util_cq *cq; - struct util_cq_oflow_err_entry *err; - struct slist_entry *entry; - struct fi_cq_tagged_entry 
*cirq_entry; char *err_buf_save; size_t err_data_size; uint32_t api_version; @@ -329,47 +298,48 @@ ssize_t ofi_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *buf, cq->cq_fastlock_acquire(&cq->cq_lock); if (ofi_cirque_isempty(cq->cirq) || - !(ofi_cirque_head(cq->cirq)->flags & UTIL_FLAG_ERROR)) { + !(ofi_cirque_head(cq->cirq)->flags & UTIL_FLAG_AUX)) { ret = -FI_EAGAIN; goto unlock; } - entry = slist_remove_head(&cq->oflow_err_list); - err = container_of(entry, struct util_cq_oflow_err_entry, list_entry); - if ((FI_VERSION_GE(api_version, FI_VERSION(1, 5))) && buf->err_data_size) { - err_data_size = MIN(buf->err_data_size, err->comp.err_data_size); - memcpy(buf->err_data, err->comp.err_data, err_data_size); + assert(!slist_empty(&cq->aux_queue)); + aux_entry = container_of(cq->aux_queue.head, + struct util_cq_aux_entry, list_entry); + assert(aux_entry->cq_slot == ofi_cirque_head(cq->cirq)); + + if (!aux_entry->comp.err) { + ret = -FI_EAGAIN; + goto unlock; + } + + if ((FI_VERSION_GE(api_version, FI_VERSION(1, 5))) && + buf->err_data_size) { err_buf_save = buf->err_data; - *buf = err->comp; + err_data_size = MIN(buf->err_data_size, + aux_entry->comp.err_data_size); + + *buf = aux_entry->comp; + memcpy(err_buf_save, aux_entry->comp.err_data, err_data_size); buf->err_data = err_buf_save; buf->err_data_size = err_data_size; } else { - memcpy(buf, &err->comp, sizeof(struct fi_cq_err_entry_1_0)); + memcpy(buf, &aux_entry->comp, + sizeof(struct fi_cq_err_entry_1_0)); } - cirq_entry = ofi_cirque_head(cq->cirq); - if (!(cirq_entry->flags & UTIL_FLAG_OVERFLOW)) { + slist_remove_head(&cq->aux_queue); + free(aux_entry); + if (slist_empty(&cq->aux_queue)) { ofi_cirque_discard(cq->cirq); - } else if (!slist_empty(&cq->oflow_err_list)) { - struct util_cq_oflow_err_entry *oflow_entry = - container_of(cq->oflow_err_list.head, - struct util_cq_oflow_err_entry, - list_entry); - if (oflow_entry->parent_comp != cirq_entry) { - /* The normal CQ entry were used to report error due to - * out of space in the circular queue. 
We have to unset - * UTIL_FLAG_ERROR and UTIL_FLAG_OVERFLOW flags */ - cirq_entry->flags &= ~(UTIL_FLAG_ERROR | UTIL_FLAG_OVERFLOW); - } - /* If the next entry in the oflow_err_list use the same entry from CIRQ to - * report error/overflow, don't unset UTIL_FLAG_ERRO and UTIL_FLAG_OVERFLOW - * flags to ensure the next round of handling overflow/error entries */ } else { - cirq_entry->flags &= ~(UTIL_FLAG_ERROR | UTIL_FLAG_OVERFLOW); + aux_entry = container_of(cq->aux_queue.head, + struct util_cq_aux_entry, list_entry); + if (aux_entry->cq_slot != ofi_cirque_head(cq->cirq)) + ofi_cirque_discard(cq->cirq); } ret = 1; - free(err); unlock: cq->cq_fastlock_release(&cq->cq_lock); return ret; @@ -396,7 +366,7 @@ ssize_t ofi_cq_sreadfrom(struct fid_cq *cq_fid, void *buf, size_t count, if (ofi_atomic_get32(&cq->signaled)) { ofi_atomic_set32(&cq->signaled, 0); - return -FI_ECANCELED; + return -FI_EAGAIN; } ret = fi_wait(&cq->wait->wait_fid, timeout); @@ -438,15 +408,15 @@ static struct fi_ops_cq util_cq_ops = { int ofi_cq_cleanup(struct util_cq *cq) { - struct util_cq_oflow_err_entry *err; + struct util_cq_aux_entry *err; struct slist_entry *entry; if (ofi_atomic_get32(&cq->ref)) return -FI_EBUSY; - while (!slist_empty(&cq->oflow_err_list)) { - entry = slist_remove_head(&cq->oflow_err_list); - err = container_of(entry, struct util_cq_oflow_err_entry, list_entry); + while (!slist_empty(&cq->aux_queue)) { + entry = slist_remove_head(&cq->aux_queue); + err = container_of(entry, struct util_cq_aux_entry, list_entry); free(err); } @@ -471,9 +441,10 @@ int ofi_cq_control(struct fid *fid, int command, void *arg) switch (command) { case FI_GETWAIT: + case FI_GETWAITOBJ: if (!cq->wait) return -FI_ENODATA; - return fi_control(&cq->wait->wait_fid.fid, FI_GETWAIT, arg); + return fi_control(&cq->wait->wait_fid.fid, command, arg); default: FI_INFO(cq->wait->prov, FI_LOG_CQ, "Unsupported command\n"); return -FI_ENOSYS; @@ -524,7 +495,7 @@ static int fi_cq_init(struct fid_domain *domain, struct fi_cq_attr *attr, cq->cq_fastlock_acquire = ofi_fastlock_acquire; cq->cq_fastlock_release = ofi_fastlock_release; } - slist_init(&cq->oflow_err_list); + slist_init(&cq->aux_queue); cq->read_entry = read_entry; cq->cq_fid.fid.fclass = FI_CLASS_CQ; @@ -536,7 +507,9 @@ static int fi_cq_init(struct fid_domain *domain, struct fi_cq_attr *attr, break; case FI_WAIT_UNSPEC: case FI_WAIT_FD: + case FI_WAIT_POLLFD: case FI_WAIT_MUTEX_COND: + case FI_WAIT_YIELD: memset(&wait_attr, 0, sizeof wait_attr); wait_attr.wait_obj = attr->wait_obj; cq->internal_wait = 1; @@ -640,31 +613,27 @@ int ofi_cq_init(const struct fi_provider *prov, struct fid_domain *domain, if (cq->wait) { ret = fi_poll_add(&cq->wait->pollset->poll_fid, &cq->cq_fid.fid, 0); - if (ret) { - ofi_cq_cleanup(cq); - return ret; - } + if (ret) + goto cleanup; } cq->cirq = util_comp_cirq_create(attr->size == 0 ? 
UTIL_DEF_CQ_SIZE : attr->size); if (!cq->cirq) { ret = -FI_ENOMEM; - goto err1; + goto cleanup; } if (cq->domain->info_domain_caps & FI_SOURCE) { cq->src = calloc(cq->cirq->size, sizeof *cq->src); if (!cq->src) { ret = -FI_ENOMEM; - goto err2; + goto cleanup; } } return 0; -err2: - util_comp_cirq_free(cq->cirq); -err1: - ofi_cq_cleanup(cq); +cleanup: + (void) ofi_cq_cleanup(cq); return ret; } @@ -682,7 +651,7 @@ uint64_t ofi_rx_flags[] = { }; uint64_t ofi_tx_flags[] = { - [ofi_op_msg] = FI_SEND, + [ofi_op_msg] = FI_SEND | FI_MSG, [ofi_op_tagged] = FI_SEND | FI_TAGGED, [ofi_op_read_req] = FI_RMA | FI_READ, [ofi_op_read_rsp] = FI_RMA | FI_READ, diff --git a/prov/util/src/util_domain.c b/prov/util/src/util_domain.c index 2a7d8de11bb..334b48c645e 100644 --- a/prov/util/src/util_domain.c +++ b/prov/util/src/util_domain.c @@ -37,7 +37,7 @@ #include -int ofi_domain_bind_eq(struct util_domain *domain, struct util_eq *eq) +static int ofi_domain_bind_eq(struct util_domain *domain, struct util_eq *eq) { if (domain->eq) { FI_WARN(domain->prov, FI_LOG_DOMAIN, @@ -50,11 +50,34 @@ int ofi_domain_bind_eq(struct util_domain *domain, struct util_eq *eq) return 0; } +int ofi_domain_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +{ + struct util_domain *domain; + struct util_eq *eq; + + domain = container_of(fid, struct util_domain, domain_fid.fid); + if (flags) { + FI_WARN(domain->prov, FI_LOG_DOMAIN, + "unsupported bind flags\n"); + return -FI_EBADFLAGS; + } + + switch (bfid->fclass) { + case FI_CLASS_EQ: + eq = container_of(bfid, struct util_eq, eq_fid.fid); + return ofi_domain_bind_eq(domain, eq); + default: + return -EINVAL; + } +} + int ofi_domain_close(struct util_domain *domain) { if (ofi_atomic_get32(&domain->ref)) return -FI_EBUSY; + if (domain->eq) + ofi_atomic_dec32(&domain->eq->ref); if (domain->mr_map.rbtree) ofi_mr_map_close(&domain->mr_map); diff --git a/prov/util/src/util_ep.c b/prov/util/src/util_ep.c index c46d83703ea..fe2be36c27a 100644 --- a/prov/util/src/util_ep.c +++ b/prov/util/src/util_ep.c @@ -35,6 +35,7 @@ #include #include +#include int ofi_ep_bind_cq(struct util_ep *ep, struct util_cq *cq, uint64_t flags) { @@ -173,23 +174,37 @@ int ofi_ep_bind(struct util_ep *util_ep, struct fid *fid, uint64_t flags) return ret; switch (fid->fclass) { - case FI_CLASS_CQ: - cq = container_of(fid, struct util_cq, cq_fid.fid); - return ofi_ep_bind_cq(util_ep, cq, flags); - case FI_CLASS_EQ: - eq = container_of(fid, struct util_eq, eq_fid.fid); - return ofi_ep_bind_eq(util_ep, eq); - case FI_CLASS_AV: - av = container_of(fid, struct util_av, av_fid.fid); - return ofi_ep_bind_av(util_ep, av); - case FI_CLASS_CNTR: - cntr = container_of(fid, struct util_cntr, cntr_fid.fid); - return ofi_ep_bind_cntr(util_ep, cntr, flags); + case FI_CLASS_CQ: + cq = container_of(fid, struct util_cq, cq_fid.fid); + return ofi_ep_bind_cq(util_ep, cq, flags); + case FI_CLASS_EQ: + eq = container_of(fid, struct util_eq, eq_fid.fid); + return ofi_ep_bind_eq(util_ep, eq); + case FI_CLASS_AV: + av = container_of(fid, struct util_av, av_fid.fid); + return ofi_ep_bind_av(util_ep, av); + case FI_CLASS_CNTR: + cntr = container_of(fid, struct util_cntr, cntr_fid.fid); + return ofi_ep_bind_cntr(util_ep, cntr, flags); } return -FI_EINVAL; } +static inline int util_coll_init_cid_mask(struct bitmask *mask) +{ + int err = ofi_bitmask_create(mask, OFI_MAX_GROUP_ID); + if (err) + return err; + + ofi_bitmask_set_all(mask); + + /* reserving the first bit in context id to whole av set */ + ofi_bitmask_unset(mask, 
OFI_WORLD_GROUP_ID); + + return FI_SUCCESS; +} + int ofi_endpoint_init(struct fid_domain *domain, const struct util_prov *util_prov, struct fi_info *info, struct util_ep *ep, void *context, ofi_ep_progress_func progress) @@ -240,6 +255,15 @@ int ofi_endpoint_init(struct fid_domain *domain, const struct util_prov *util_pr ep->lock_acquire = ofi_fastlock_acquire; ep->lock_release = ofi_fastlock_release; } + if (ep->caps & FI_COLLECTIVE) { + ep->coll_cid_mask = calloc(1, sizeof(*ep->coll_cid_mask)); + if (!ep->coll_cid_mask) + return -FI_ENOMEM; + util_coll_init_cid_mask(ep->coll_cid_mask); + } else { + ep->coll_cid_mask = NULL; + } + slist_init(&ep->coll_ready_queue); return 0; } @@ -309,6 +333,11 @@ int ofi_endpoint_close(struct util_ep *util_ep) ofi_atomic_dec32(&util_ep->av->ref); } + if (util_ep->coll_cid_mask) { + ofi_bitmask_free(util_ep->coll_cid_mask); + free(util_ep->coll_cid_mask); + } + if (util_ep->eq) ofi_atomic_dec32(&util_ep->eq->ref); ofi_atomic_dec32(&util_ep->domain->ref); diff --git a/prov/util/src/util_eq.c b/prov/util/src/util_eq.c index 4aabc31397e..44dd0d8e2b8 100644 --- a/prov/util/src/util_eq.c +++ b/prov/util/src/util_eq.c @@ -41,7 +41,8 @@ void ofi_eq_handle_err_entry(uint32_t api_version, uint64_t flags, struct fi_eq_err_entry *user_err_entry) { if ((FI_VERSION_GE(api_version, FI_VERSION(1, 5))) - && user_err_entry->err_data && user_err_entry->err_data_size) { + && user_err_entry->err_data && user_err_entry->err_data_size + && err_entry->err_data && err_entry->err_data_size) { void *err_data = user_err_entry->err_data; size_t err_data_size = MIN(err_entry->err_data_size, user_err_entry->err_data_size); @@ -67,6 +68,11 @@ void ofi_eq_handle_err_entry(uint32_t api_version, uint64_t flags, } } +/* + * fi_eq_read and fi_eq_readerr share this common code path. + * If flags contains UTIL_FLAG_ERROR, then we are processing + * fi_eq_readerr. 
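+ * The flag is internal to the util code; ofi_eq_write() uses the same flag
+ * to mark an entry as an error entry when it is queued.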
+ */ ssize_t ofi_eq_read(struct fid_eq *eq_fid, uint32_t *event, void *buf, size_t len, uint64_t flags) { @@ -104,7 +110,7 @@ ssize_t ofi_eq_read(struct fid_eq *eq_fid, uint32_t *event, ofi_eq_handle_err_entry(eq->fabric->fabric_fid.api_version, flags, err_entry, buf); - ret = (ssize_t) entry->size; + ret = entry->size; if (!(flags & FI_PEEK)) eq->saved_err_data = err_entry->err_data; @@ -143,7 +149,7 @@ ssize_t ofi_eq_write(struct fid_eq *eq_fid, uint32_t event, if (!entry) return -FI_ENOMEM; - entry->size = (int) len; + entry->size = len; entry->event = event; entry->err = !!(flags & UTIL_FLAG_ERROR); memcpy(entry->data, buf, len); @@ -202,6 +208,7 @@ int ofi_eq_control(struct fid *fid, int command, void *arg) switch (command) { case FI_GETWAIT: + case FI_GETWAITOBJ: ret = fi_control(&eq->wait->wait_fid.fid, command, arg); break; default: @@ -289,7 +296,9 @@ static int util_eq_init(struct fid_fabric *fabric, struct util_eq *eq, break; case FI_WAIT_UNSPEC: case FI_WAIT_FD: + case FI_WAIT_POLLFD: case FI_WAIT_MUTEX_COND: + case FI_WAIT_YIELD: memset(&wait_attr, 0, sizeof wait_attr); wait_attr.wait_obj = attr->wait_obj; eq->internal_wait = 1; @@ -362,7 +371,9 @@ static int util_verify_eq_attr(const struct fi_provider *prov, case FI_WAIT_NONE: case FI_WAIT_UNSPEC: case FI_WAIT_FD: + case FI_WAIT_POLLFD: case FI_WAIT_MUTEX_COND: + case FI_WAIT_YIELD: break; case FI_WAIT_SET: if (!attr->wait_set) { diff --git a/prov/util/src/util_main.c b/prov/util/src/util_main.c index 69273ac3f09..6c4e77051fc 100644 --- a/prov/util/src/util_main.c +++ b/prov/util/src/util_main.c @@ -57,17 +57,6 @@ static int util_match_fabric(struct dlist_entry *item, const void *arg) !strcmp(fabric->name, fabric_info->name); } -struct util_fabric *ofi_fabric_find(struct util_fabric_info *fabric_info) -{ - struct dlist_entry *item; - - pthread_mutex_lock(&common_locks.util_fabric_lock); - item = dlist_find_first_match(&fabric_list, util_match_fabric, fabric_info); - pthread_mutex_unlock(&common_locks.util_fabric_lock); - - return item ? container_of(item, struct util_fabric, list_entry) : NULL; -} - void ofi_fabric_remove(struct util_fabric *fabric) { pthread_mutex_lock(&common_locks.util_fabric_lock); @@ -138,6 +127,11 @@ static int util_find_domain(struct dlist_entry *item, const void *arg) ((info->domain_attr->mr_mode & domain->mr_mode) == domain->mr_mode); } +/* + * Produces 1 fi_info output for each fi_info entry in the provider's base + * list (stored with util_prov), subject to the base fi_info meeting the + * user's hints. 
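+ * ofi_ip_getinfo() below builds on this helper: entries that carry neither a
+ * source nor a destination address are additionally expanded into one fi_info
+ * per usable IP interface on the system (see util_getinfo_ifs()), so a single
+ * call such as
+ *
+ *     ofi_ip_getinfo(prov, version, node, service, flags, hints, info);
+ *
+ * can return several interface-specific entries.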
+ */ int util_getinfo(const struct util_prov *util_prov, uint32_t version, const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct fi_info **info) @@ -171,8 +165,11 @@ int util_getinfo(const struct util_prov *util_prov, uint32_t version, fabric_info.name = (*info)->fabric_attr->name; fabric_info.prov = util_prov->prov; - fabric = ofi_fabric_find(&fabric_info); - if (fabric) { + pthread_mutex_lock(&common_locks.util_fabric_lock); + item = dlist_find_first_match(&fabric_list, util_match_fabric, + &fabric_info); + if (item) { + fabric = container_of(item, struct util_fabric, list_entry); FI_DBG(prov, FI_LOG_CORE, "Found opened fabric\n"); (*info)->fabric_attr->fabric = &fabric->fabric_fid; @@ -190,6 +187,7 @@ int util_getinfo(const struct util_prov *util_prov, uint32_t version, fastlock_release(&fabric->lock); } + pthread_mutex_unlock(&common_locks.util_fabric_lock); if (flags & FI_SOURCE) { ret = ofi_get_addr(&(*info)->addr_format, flags, @@ -261,3 +259,158 @@ int util_getinfo(const struct util_prov *util_prov, uint32_t version, fi_freeinfo(*info); return ret; } + +static void util_set_netif_names(struct fi_info *info, + struct ofi_addr_list_entry *addr_entry) +{ + char *name; + + name = strdup(addr_entry->net_name); + if (name) { + free(info->fabric_attr->name); + info->fabric_attr->name = name; + } + + name = strdup(addr_entry->ifa_name); + if (name) { + free(info->domain_attr->name); + info->domain_attr->name = name; + } +} + +/* + * Produces 1 fi_info output for each usable IP address in the system for the + * given fi_info input. + */ +#if HAVE_GETIFADDRS +static void util_getinfo_ifs(const struct util_prov *prov, + const struct fi_info *hints, + struct fi_info *src_info, + struct fi_info **head, struct fi_info **tail) +{ + struct fi_info *cur; + struct slist addr_list; + size_t addrlen; + uint32_t addr_format; + struct slist_entry *entry, *prev; + struct ofi_addr_list_entry *addr_entry; + + *head = *tail = NULL; + slist_init(&addr_list); + + ofi_get_list_of_addr(prov->prov, "iface", &addr_list); + + (void) prev; /* Makes compiler happy */ + slist_foreach(&addr_list, entry, prev) { + addr_entry = container_of(entry, struct ofi_addr_list_entry, entry); + + if (hints && ((hints->caps & addr_entry->comm_caps) != + (hints->caps & (FI_LOCAL_COMM | FI_REMOTE_COMM)))) + continue; + + cur = fi_dupinfo(src_info); + if (!cur) + break; + + if (!*head) { + *head = cur; + FI_INFO(prov->prov, FI_LOG_CORE, "Chosen addr for using: %s," + " speed %zu\n", addr_entry->ipstr, addr_entry->speed); + } else { + (*tail)->next = cur; + } + *tail = cur; + + switch (addr_entry->ipaddr.sin.sin_family) { + case AF_INET: + addrlen = sizeof(struct sockaddr_in); + addr_format = FI_SOCKADDR_IN; + break; + case AF_INET6: + addrlen = sizeof(struct sockaddr_in6); + addr_format = FI_SOCKADDR_IN6; + break; + default: + continue; + } + + cur->caps = (cur->caps & ~(FI_LOCAL_COMM | FI_REMOTE_COMM)) | + addr_entry->comm_caps; + cur->src_addr = mem_dup(&addr_entry->ipaddr, addrlen); + if (cur->src_addr) { + cur->src_addrlen = addrlen; + cur->addr_format = addr_format; + } + util_set_netif_names(cur, addr_entry); + } + + ofi_free_list_of_addr(&addr_list); + if (!*head) { + *head = src_info; + *tail = src_info; + } +} +#else +static void util_getinfo_ifs(const struct util_prov *prov, + const struct fi_info *hints, + struct fi_info *src_info, + struct fi_info **head, struct fi_info **tail) +{ + *head = src_info; + *tail = src_info; +} +#endif + +static int util_match_addr(struct slist_entry 
*entry, const void *addr) +{ + struct ofi_addr_list_entry *addr_entry; + + addr_entry = container_of(entry, struct ofi_addr_list_entry, entry); + return ofi_equals_ipaddr(&addr_entry->ipaddr.sa, addr); +} + +int ofi_ip_getinfo(const struct util_prov *prov, uint32_t version, + const char *node, const char *service, uint64_t flags, + const struct fi_info *hints, struct fi_info **info) +{ + struct fi_info *head, *tail, *cur, **prev; + struct ofi_addr_list_entry *addr_entry; + struct slist addr_list; + struct slist_entry *entry; + int ret; + + ret = util_getinfo(prov, version, node, service, flags, + hints, info); + if (ret) + return ret; + + prev = info; + for (cur = *info; cur; cur = cur->next) { + if (!cur->src_addr && !cur->dest_addr) { + util_getinfo_ifs(prov, hints, cur, &head, &tail); + if (head != cur) { + tail->next = (*prev)->next; + *prev = head; + + cur->next = NULL; + fi_freeinfo(cur); + cur = tail; + } + } else if (cur->src_addr) { + slist_init(&addr_list); + ofi_get_list_of_addr(prov->prov, "iface", &addr_list); + + entry = slist_find_first_match(&addr_list, util_match_addr, + (*info)->src_addr); + if (entry) { + addr_entry = container_of(entry, + struct ofi_addr_list_entry, entry); + util_set_netif_names(cur, addr_entry); + } + ofi_free_list_of_addr(&addr_list); + } + prev = &cur->next; + } + + return 0; +} diff --git a/prov/util/src/util_mem_hooks.c b/prov/util/src/util_mem_hooks.c new file mode 100644 index 00000000000..7a61948ad33 --- /dev/null +++ b/prov/util/src/util_mem_hooks.c @@ -0,0 +1,626 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009-2017 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2013-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2016-2017 Research Organization for Information Science + * and Technology (RIST). All rights reserved. + * Copyright (c) 2016-2020 IBM Corporation. All rights reserved. + * Copyright (c) 2019 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * License text from Open-MPI (www.open-mpi.org/community/license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer listed + * in this license in the documentation and/or other materials + * provided with the distribution. + * + * - Neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * The copyright holders provide no reassurances that the source code + * provided does not infringe any patent, copyright, or any other + * intellectual property rights of third parties. The copyright holders + * disclaim any liability to any recipient for claims brought against + * recipient by any third party for infringement of that parties + * intellectual property rights. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +static int ofi_memhooks_start(struct ofi_mem_monitor *monitor); +static void ofi_memhooks_stop(struct ofi_mem_monitor *monitor); + +struct ofi_memhooks memhooks = { + .monitor.iface = FI_HMEM_SYSTEM, + .monitor.init = ofi_monitor_init, + .monitor.cleanup = ofi_monitor_cleanup, + .monitor.start = ofi_memhooks_start, + .monitor.stop = ofi_memhooks_stop, +}; +struct ofi_mem_monitor *memhooks_monitor = &memhooks.monitor; + + +/* memhook support checks */ +#if HAVE_MEMHOOKS_MONITOR + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if HAVE_DECL___SYSCALL && defined(HAVE___SYSCALL) +/* calling __syscall is preferred on some systems when some arguments may be 64-bit. 
it also + * has the benefit of having an off_t return type */ +#define ofi_memhooks_syscall __syscall +#else +#define ofi_memhooks_syscall syscall +#endif + +// These op codes used to be in bits/ipc.h but were removed in glibc in 2015 +// with a comment saying they should be defined in internal headers: +// https://sourceware.org/bugzilla/show_bug.cgi?id=18560 +// and when glibc uses that syscall it seems to do so from its own definitions: +// https://github.com/bminor/glibc/search?q=IPCOP_shmat&unscoped_q=IPCOP_shmat +#if (!defined(SYS_shmat) && !defined(IPCOP_shmat)) +#define IPCOP_shmat 21 +#endif +#if (!defined(SYS_shmdt) && !defined(IPCOP_shmdt)) +#define IPCOP_shmdt 22 +#endif + +#define OFI_INTERCEPT_MAX_PATCH 32 + +struct ofi_intercept { + struct dlist_entry entry; + const char *symbol; + void *our_func; + void *orig_func; + unsigned char patch_data[OFI_INTERCEPT_MAX_PATCH]; + unsigned char patch_orig_data[OFI_INTERCEPT_MAX_PATCH]; + unsigned patch_data_size; + struct dlist_entry dl_intercept_list; +}; + +enum { + OFI_INTERCEPT_MMAP, + OFI_INTERCEPT_MUNMAP, + OFI_INTERCEPT_MREMAP, + OFI_INTERCEPT_MADVISE, + OFI_INTERCEPT_SHMAT, + OFI_INTERCEPT_SHMDT, + OFI_INTERCEPT_BRK, + OFI_INTERCEPT_MAX +}; + +static void *ofi_intercept_mmap(void *start, size_t length, + int prot, int flags, int fd, off_t offset); +static int ofi_intercept_munmap(void *start, size_t length); +static void *ofi_intercept_mremap(void *old_address, size_t old_size, + size_t new_size, int flags, void *new_address); +static int ofi_intercept_madvise(void *addr, size_t length, int advice); +static void *ofi_intercept_shmat(int shmid, const void *shmaddr, int shmflg); +static int ofi_intercept_shmdt(const void *shmaddr); +static int ofi_intercept_brk(const void *brkaddr); + +static struct ofi_intercept intercepts[] = { + [OFI_INTERCEPT_MMAP] = { .symbol = "mmap", + .our_func = ofi_intercept_mmap}, + [OFI_INTERCEPT_MUNMAP] = { .symbol = "munmap", + .our_func = ofi_intercept_munmap}, + [OFI_INTERCEPT_MREMAP] = { .symbol = "mremap", + .our_func = ofi_intercept_mremap}, + [OFI_INTERCEPT_MADVISE] = { .symbol = "madvise", + .our_func = ofi_intercept_madvise}, + [OFI_INTERCEPT_SHMAT] = { .symbol = "shmat", + .our_func = ofi_intercept_shmat}, + [OFI_INTERCEPT_SHMDT] = { .symbol = "shmdt", + .our_func = ofi_intercept_shmdt}, + [OFI_INTERCEPT_BRK] = { .symbol = "brk", + .our_func = ofi_intercept_brk}, +}; + +#ifdef HAVE___CURBRK +extern void *__curbrk; /* in libc */ +#endif + +#if HAVE___CLEAR_CACHE +/* + * Used on ARM64 platforms, see https://github.com/open-mpi/ompi/issues/5631 + */ +static inline void ofi_clear_instruction_cache(uintptr_t address, size_t data_size) +{ + /* do not allow global declaration of compiler intrinsic */ + void __clear_cache(void* beg, void* end); + + __clear_cache ((void *) address, (void *) (address + data_size)); +} +#else +static inline void ofi_clear_instruction_cache(uintptr_t address, size_t data_size) +{ + size_t i; + size_t offset_jump = 16; +#if defined(__aarch64__) + offset_jump = 32; +#endif + /* align the address */ + address &= ~(offset_jump - 1); + + for (i = 0 ; i < data_size ; i += offset_jump) { +#if (defined(__x86_64__) || defined(__amd64__)) + __asm__ volatile("mfence;clflush %0;mfence":: + "m" (*((char*) address + i))); +#elif defined(__aarch64__) + __asm__ volatile ("dc cvau, %0\n\t" + "dsb ish\n\t" + "ic ivau, %0\n\t" + "dsb ish\n\t" + "isb":: "r" (address + i)); +#endif + } +} +#endif + +static inline int ofi_write_patch(unsigned char *patch_data, void *address, + size_t 
data_size) +{ + long page_size; + void *base; + void *bound; + size_t length; + + page_size = ofi_get_page_size(); + if (page_size < 0) { + FI_WARN(&core_prov, FI_LOG_MR, + "failed to get page size: %s\n", fi_strerror(-page_size)); + return page_size; + } + + base = ofi_get_page_start(address, page_size); + bound = ofi_get_page_end(address, page_size); + length = (uintptr_t) bound - (uintptr_t) base; + + if (mprotect(base, length, PROT_EXEC|PROT_READ|PROT_WRITE)) { + FI_WARN(&core_prov, FI_LOG_MR, + "mprotect to set PROT_WRITE on %p len %lu failed: %s\n", + (void *) base, length, strerror(errno)); + return -errno; + } + + memcpy(address, patch_data, data_size); + + ofi_clear_instruction_cache((uintptr_t) address, data_size); + + /* + * Nothing we can do here if this fails so ignore the return code. It + * shouldn't due to alignment since the parameters are the same as + * before. + */ + if (mprotect(base, length, PROT_EXEC|PROT_READ)) + FI_WARN(&core_prov, FI_LOG_MR, + "mprotect to drop PROT_WRITE on %p len %lu failed: %s\n", + base, length, strerror(errno)); + + return 0; +} + +static int ofi_apply_patch(struct ofi_intercept *intercept) +{ + memcpy(intercept->patch_orig_data, intercept->orig_func, + intercept->patch_data_size); + return ofi_write_patch(intercept->patch_data, intercept->orig_func, + intercept->patch_data_size); +} + +static int ofi_remove_patch(struct ofi_intercept *intercept) +{ + return ofi_write_patch(intercept->patch_orig_data, intercept->orig_func, + intercept->patch_data_size); +} + +static void ofi_restore_intercepts(void) +{ + struct ofi_intercept *intercept; + + dlist_foreach_container(&memhooks.intercept_list, struct ofi_intercept, + intercept, entry) + ofi_remove_patch(intercept); +} + +#if (defined(__x86_64___) || defined(__amd64__)) +static int ofi_patch_function(struct ofi_intercept *intercept) +{ + intercept->patch_data_size = 13; + *(unsigned short*)(intercept->patch_data + 0) = 0xbb49; + *(unsigned long* )(intercept->patch_data + 2) = + (unsigned long) intercept->our_func; + *(unsigned char*) (intercept->patch_data +10) = 0x41; + *(unsigned char*) (intercept->patch_data +11) = 0xff; + *(unsigned char*) (intercept->patch_data +12) = 0xe3; + + return ofi_apply_patch(intercept); +} +#elif defined(__aarch64__) +/** + * @brief Generate a mov immediate instruction + * + * @param[in] reg register number (0-31) + * @param[in] shift shift amount (0-3) * 16-bits + * @param[in] value immediate value + */ +static uint32_t mov(unsigned int reg, uint16_t shift, uint16_t value) +{ + return (0x1a5 << 23) + ((uint32_t) shift << 21) + ((uint32_t) value << 5) + reg; +} + +/** + * @brief Generate a mov immediate with keep instruction + * + * @param[in] reg register number (0-31) + * @param[in] shift shift amount (0-3) * 16-bits + * @param[in] value immediate value + */ +static uint32_t movk(unsigned int reg, uint16_t shift, uint16_t value) +{ + return (0x1e5 << 23) + ((uint32_t) shift << 21) + ((uint32_t) value << 5) + reg; +} + +/** + * @brief Generate a branch to register instruction + * + * @param[in] reg register number (0-31) + */ +static uint32_t br(unsigned int reg) +{ + return (0xd61f << 16) + (reg << 5); +} + +static int ofi_patch_function(struct ofi_intercept *intercept) +{ + /* + * r15 is the highest numbered temporary register. I am + * assuming this one is safe to use. 
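+ * The code below materializes the 64-bit address of the intercept function
+ * in this register with one mov and three movk instructions and then
+ * branches through it.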
+ */ + const unsigned int gr = 15; + uintptr_t addr = (uintptr_t) intercept->patch_data; + uintptr_t value = (uintptr_t) intercept->our_func; + + *(uint32_t *) (addr + 0) = mov(gr, 3, value >> 48); + *(uint32_t *) (addr + 4) = movk(gr, 2, value >> 32); + *(uint32_t *) (addr + 8) = movk(gr, 1, value >> 16); + *(uint32_t *) (addr + 12) = movk(gr, 0, value); + intercept->patch_data_size = 16; + + *(uint32_t *) ((uintptr_t) intercept->patch_data + + intercept->patch_data_size) = br(gr); + intercept->patch_data_size = intercept->patch_data_size + 4; + + return ofi_apply_patch(intercept); +} +#endif + +/* + * This implementation intercepts syscalls by overwriting the beginning of + * glibc's functions with a jump to our intercept function. After notifying the + * cache we will make the syscall directly. We store the original instructions + * and restore them when memhooks is unloaded. + */ +static int ofi_intercept_symbol(struct ofi_intercept *intercept) +{ + void *func_addr; + int ret; + + FI_DBG(&core_prov, FI_LOG_MR, + "overwriting function %s\n", intercept->symbol); + + func_addr = dlsym(RTLD_NEXT, intercept->symbol); + if (!func_addr) { + func_addr = dlsym(RTLD_DEFAULT, intercept->symbol); + if (!func_addr) { + FI_DBG(&core_prov, FI_LOG_MR, + "could not find symbol %s\n", intercept->symbol); + ret = -FI_ENOMEM; + return ret; + } + } + + intercept->orig_func = func_addr; + + ret = ofi_patch_function(intercept); + + if (!ret) + dlist_insert_tail(&intercept->entry, &memhooks.intercept_list); + + return ret; +} + +void ofi_intercept_handler(const void *addr, size_t len) +{ + pthread_rwlock_rdlock(&mm_list_rwlock); + pthread_mutex_lock(&mm_lock); + ofi_monitor_notify(memhooks_monitor, addr, len); + pthread_mutex_unlock(&mm_lock); + pthread_rwlock_unlock(&mm_list_rwlock); +} + +static void *ofi_intercept_mmap(void *start, size_t length, + int prot, int flags, int fd, off_t offset) +{ + if ((flags & MAP_FIXED) && start) + ofi_intercept_handler(start, length); + + return (void *)(intptr_t) ofi_memhooks_syscall(SYS_mmap, start, length, + prot, flags, fd, offset); +} + +static int ofi_intercept_munmap(void *start, size_t length) +{ + ofi_intercept_handler(start, length); + + return ofi_memhooks_syscall(SYS_munmap, start, length); +} + +static void *ofi_intercept_mremap(void *old_address, size_t old_size, + size_t new_size, int flags, void *new_address) +{ + ofi_intercept_handler(old_address, old_size); + +#ifdef MREMAP_FIXED + /* + * new_address is an optional argument. Explicitly set it to NULL + * if it is not applicable. + */ + if (!(flags & MREMAP_FIXED)) + new_address = NULL; +#endif + + return (void *)(intptr_t) ofi_memhooks_syscall(SYS_mremap, old_address, + old_size, new_size, + flags, new_address); +} + +static int ofi_intercept_madvise(void *addr, size_t length, int advice) +{ + if (advice == MADV_DONTNEED || +#ifdef MADV_FREE + advice == MADV_FREE || +#endif +#ifdef MADV_REMOVE + advice == MADV_REMOVE || +#endif + advice == POSIX_MADV_DONTNEED) { + ofi_intercept_handler(addr, length); + } + + return ofi_memhooks_syscall(SYS_madvise, addr, length, advice); +} + +static void *ofi_intercept_shmat(int shmid, const void *shmaddr, int shmflg) +{ + struct shmid_ds ds; + const void *start; + void *result; + size_t len; + int ret; + + if (shmaddr && (shmflg & SHM_REMAP)) { + ret = shmctl(shmid, IPC_STAT, &ds); + len = (ret < 0) ? 
0 : ds.shm_segsz; + + if (shmflg & SHM_RND) { + start = (char *) shmaddr - ((uintptr_t) shmaddr) % SHMLBA; + len += ((uintptr_t) shmaddr) % SHMLBA; + } else { + start = shmaddr; + } + + ofi_intercept_handler(start, len); + } + +#ifdef SYS_shmat + result = (void *) ofi_memhooks_syscall(SYS_shmat, shmid, shmaddr, shmflg); +#else // IPCOP_shmat + unsigned long sysret; + sysret = ofi_memhooks_syscall(SYS_ipc, IPCOP_shmat, + shmid, shmflg, &shmaddr, shmaddr); + result = (sysret > -(unsigned long)SHMLBA) ? (void *)sysret : + (void *)shmaddr; +#endif + return result; +} + +static int ofi_intercept_shmdt(const void *shmaddr) +{ + int ret; + + /* + * Overly aggressive, but simple. Invalidate everything after shmaddr. + * We could choose to find the shared memory segment size in /proc but + * that seems like a great way to deadlock ourselves. + */ + ofi_intercept_handler(shmaddr, SIZE_MAX - (uintptr_t) shmaddr); + +#ifdef SYS_shmdt + ret = ofi_memhooks_syscall(SYS_shmdt, shmaddr); +#else // IPCOP_shmdt + ret = ofi_memhooks_syscall(SYS_ipc, IPCOP_shmdt, 0, 0, 0, shmaddr); +#endif + return ret; +} + +static int ofi_intercept_brk(const void *brkaddr) +{ + void *old_addr, *new_addr; + +#ifdef HAVE___CURBRK + old_addr = __curbrk; +#else + old_addr = sbrk(0); +#endif + new_addr = (void *) (intptr_t) ofi_memhooks_syscall(SYS_brk, brkaddr); + +#ifdef HAVE___CURBRK + /* + * Note: if we were using glibc brk/sbrk, their __curbrk would get + * updated, but since we're going straight to the syscall, we have + * to update __curbrk or else glibc won't see it. + */ + __curbrk = new_addr; +#endif + + if (new_addr < brkaddr) { + errno = ENOMEM; + return -1; + } else if (new_addr < old_addr) { + ofi_intercept_handler(new_addr, (intptr_t) old_addr - + (intptr_t) new_addr); + } + + return 0; +} + +static int ofi_memhooks_subscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + /* no-op */ + return FI_SUCCESS; +} + +static void ofi_memhooks_unsubscribe(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + /* no-op */ +} + +static bool ofi_memhooks_valid(struct ofi_mem_monitor *monitor, + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) +{ + /* no-op */ + return true; +} + +static int ofi_memhooks_start(struct ofi_mem_monitor *monitor) +{ + int i, ret; + + if (memhooks_monitor->subscribe == ofi_memhooks_subscribe) + return 0; + + memhooks_monitor->subscribe = ofi_memhooks_subscribe; + memhooks_monitor->unsubscribe = ofi_memhooks_unsubscribe; + memhooks_monitor->valid = ofi_memhooks_valid; + dlist_init(&memhooks.intercept_list); + + for (i = 0; i < OFI_INTERCEPT_MAX; ++i) + dlist_init(&intercepts[i].dl_intercept_list); + + ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MMAP]); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "intercept mmap failed %d %s\n", ret, fi_strerror(ret)); + return ret; + } + + ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MUNMAP]); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "intercept munmap failed %d %s\n", ret, fi_strerror(ret)); + return ret; + } + + ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MREMAP]); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "intercept mremap failed %d %s\n", ret, fi_strerror(ret)); + return ret; + } + + ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MADVISE]); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "intercept madvise failed %d %s\n", ret, fi_strerror(ret)); + return ret; + } + + ret = 
ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_SHMAT]); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "intercept shmat failed %d %s\n", ret, fi_strerror(ret)); + return ret; + } + + ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_SHMDT]); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "intercept shmdt failed %d %s\n", ret, fi_strerror(ret)); + return ret; + } + + ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_BRK]); + if (ret) { + FI_WARN(&core_prov, FI_LOG_MR, + "intercept brk failed %d %s\n", ret, fi_strerror(ret)); + return ret; + } + + return 0; +} + +static void ofi_memhooks_stop(struct ofi_mem_monitor *monitor) +{ + ofi_restore_intercepts(); + memhooks_monitor->subscribe = NULL; + memhooks_monitor->unsubscribe = NULL; +} + +#else + +static int ofi_memhooks_start(struct ofi_mem_monitor *monitor) +{ + return -FI_ENOSYS; +} + +static void ofi_memhooks_stop(struct ofi_mem_monitor *monitor) +{ +} + +#endif /* memhook support checks */ diff --git a/prov/util/src/util_mem_monitor.c b/prov/util/src/util_mem_monitor.c index 5899d20079b..126aa231811 100644 --- a/prov/util/src/util_mem_monitor.c +++ b/prov/util/src/util_mem_monitor.c @@ -1,7 +1,9 @@ /* * Copyright (c) 2017 Cray Inc. All rights reserved. * Copyright (c) 2017-2019 Intel Inc. All rights reserved. - * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright (c) 2019-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -33,112 +35,294 @@ */ #include +#include +pthread_mutex_t mm_lock = PTHREAD_MUTEX_INITIALIZER; +pthread_rwlock_t mm_list_rwlock = PTHREAD_RWLOCK_INITIALIZER; -static struct ofi_uffd uffd; +static int ofi_uffd_start(struct ofi_mem_monitor *monitor); +static void ofi_uffd_stop(struct ofi_mem_monitor *monitor); + +static struct ofi_uffd uffd = { + .monitor.iface = FI_HMEM_SYSTEM, + .monitor.init = ofi_monitor_init, + .monitor.cleanup = ofi_monitor_cleanup, + .monitor.start = ofi_uffd_start, + .monitor.stop = ofi_uffd_stop, +}; struct ofi_mem_monitor *uffd_monitor = &uffd.monitor; +struct ofi_mem_monitor *default_monitor; +struct ofi_mem_monitor *default_cuda_monitor; +struct ofi_mem_monitor *default_rocr_monitor; + +static size_t ofi_default_cache_size(void) +{ + long cpu_cnt; + size_t cache_size; + + cpu_cnt = ofi_sysconf(_SC_NPROCESSORS_ONLN); + /* disable cache on error */ + if (cpu_cnt <= 0) + return 0; + + cache_size = ofi_get_mem_size() / (size_t) cpu_cnt / 2; + FI_INFO(&core_prov, FI_LOG_MR, + "default cache size=%zu\n", cache_size); + return cache_size; +} + + +void ofi_monitor_init(struct ofi_mem_monitor *monitor) +{ + dlist_init(&monitor->list); +} + +void ofi_monitor_cleanup(struct ofi_mem_monitor *monitor) +{ + assert(dlist_empty(&monitor->list)); +} /* * Initialize all available memory monitors */ -void ofi_monitor_init(void) +void ofi_monitors_init(void) { - fastlock_init(&uffd_monitor->lock); - dlist_init(&uffd_monitor->list); + uffd_monitor->init(uffd_monitor); + memhooks_monitor->init(memhooks_monitor); + cuda_monitor->init(cuda_monitor); + rocr_monitor->init(rocr_monitor); + +#if HAVE_MEMHOOKS_MONITOR + default_monitor = memhooks_monitor; +#elif HAVE_UFFD_MONITOR + default_monitor = uffd_monitor; +#else + default_monitor = NULL; +#endif fi_param_define(NULL, "mr_cache_max_size", FI_PARAM_SIZE_T, "Defines the total number of bytes for all 
memory" " regions that may be tracked by the MR cache." " Setting this will reduce the amount of memory" " not actively in use that may be registered." - " (default: 0 no limit is enforced)"); + " (default: total memory / number of cpu cores / 2)"); fi_param_define(NULL, "mr_cache_max_count", FI_PARAM_SIZE_T, "Defines the total number of memory regions that" " may be store in the cache. Setting this will" " reduce the number of registered regions, regardless" " of their size, stored in the cache. Setting this" " to zero will disable MR caching. (default: 1024)"); - fi_param_define(NULL, "mr_cache_merge_regions", FI_PARAM_BOOL, - "If set to true, overlapping or adjacent memory" - " regions will be combined into a single, larger" - " region. Merging regions can reduce the cache" - " memory footprint, but can negatively impact" - " performance in some situations. (default: false)"); + fi_param_define(NULL, "mr_cache_monitor", FI_PARAM_STRING, + "Define a default memory registration monitor." + " The monitor checks for virtual to physical memory" + " address changes. Options are: userfaultfd, memhooks" + " and disabled. Userfaultfd is a Linux kernel feature." + " Memhooks operates by intercepting memory allocation" + " and free calls. Userfaultfd is the default if" + " available on the system. 'disabled' option disables" + " memory caching."); + fi_param_define(NULL, "mr_cuda_cache_monitor_enabled", FI_PARAM_BOOL, + "Enable or disable the CUDA cache memory monitor." + "Monitor is enabled by default."); + fi_param_define(NULL, "mr_rocr_cache_monitor_enabled", FI_PARAM_BOOL, + "Enable or disable the ROCR cache memory monitor. " + "Monitor is enabled by default."); fi_param_get_size_t(NULL, "mr_cache_max_size", &cache_params.max_size); fi_param_get_size_t(NULL, "mr_cache_max_count", &cache_params.max_cnt); - fi_param_get_bool(NULL, "mr_cache_merge_regions", - &cache_params.merge_regions); + fi_param_get_str(NULL, "mr_cache_monitor", &cache_params.monitor); + fi_param_get_bool(NULL, "mr_cuda_cache_monitor_enabled", + &cache_params.cuda_monitor_enabled); + fi_param_get_bool(NULL, "mr_rocr_cache_monitor_enabled", + &cache_params.rocr_monitor_enabled); if (!cache_params.max_size) - cache_params.max_size = SIZE_MAX; + cache_params.max_size = ofi_default_cache_size(); + + if (cache_params.monitor != NULL) { + if (!strcmp(cache_params.monitor, "userfaultfd")) { +#if HAVE_UFFD_MONITOR + default_monitor = uffd_monitor; +#else + FI_WARN(&core_prov, FI_LOG_MR, "userfaultfd monitor not available\n"); + default_monitor = NULL; +#endif + } else if (!strcmp(cache_params.monitor, "memhooks")) { +#if HAVE_MEMHOOKS_MONITOR + default_monitor = memhooks_monitor; +#else + FI_WARN(&core_prov, FI_LOG_MR, "memhooks monitor not available\n"); + default_monitor = NULL; +#endif + } else if (!strcmp(cache_params.monitor, "disabled")) { + default_monitor = NULL; + } + } + + if (cache_params.cuda_monitor_enabled) + default_cuda_monitor = cuda_monitor; + else + default_cuda_monitor = NULL; + + if (cache_params.rocr_monitor_enabled) + default_rocr_monitor = rocr_monitor; + else + default_rocr_monitor = NULL; } -void ofi_monitor_cleanup(void) +void ofi_monitors_cleanup(void) { - assert(dlist_empty(&uffd_monitor->list)); - fastlock_destroy(&uffd_monitor->lock); + uffd_monitor->cleanup(uffd_monitor); + memhooks_monitor->cleanup(memhooks_monitor); + cuda_monitor->cleanup(cuda_monitor); + rocr_monitor->cleanup(rocr_monitor); } -int ofi_monitor_add_cache(struct ofi_mem_monitor *monitor, - struct ofi_mr_cache *cache) +/* Monitors 
array must be of size OFI_HMEM_MAX. */ +int ofi_monitors_add_cache(struct ofi_mem_monitor **monitors, + struct ofi_mr_cache *cache) { int ret = 0; + enum fi_hmem_iface iface; + struct ofi_mem_monitor *monitor; + unsigned int success_count = 0; + + if (!monitors) { + for (iface = FI_HMEM_SYSTEM; iface < OFI_HMEM_MAX; iface++) + cache->monitors[iface] = NULL; + return -FI_ENOSYS; + } - fastlock_acquire(&monitor->lock); - if (dlist_empty(&monitor->list)) { - if (monitor == uffd_monitor) - ret = ofi_uffd_init(); - else - ret = -FI_ENOSYS; + /* Loops until there are no readers or writers holding the lock */ + do { + ret = pthread_rwlock_trywrlock(&mm_list_rwlock); + if (ret && ret != EBUSY) { + FI_WARN(&core_prov, FI_LOG_MR, + "add_cache cannot obtain write lock, %d\n", + ret); + return ret; + } + } while (ret); + + for (iface = FI_HMEM_SYSTEM; iface < OFI_HMEM_MAX; iface++) { + cache->monitors[iface] = NULL; - if (ret) - goto out; + monitor = monitors[iface]; + if (!monitor) { + FI_DBG(&core_prov, FI_LOG_MR, + "MR cache disabled for %s memory\n", + fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); + continue; + } + + if (dlist_empty(&monitor->list)) { + ret = monitor->start(monitor); + if (ret == -FI_ENOSYS) + continue; + else if (ret) + goto err; + } + + success_count++; + cache->monitors[iface] = monitor; + dlist_insert_tail(&cache->notify_entries[iface], + &monitor->list); } - cache->monitor = monitor; - dlist_insert_tail(&cache->notify_entry, &monitor->list); -out: - fastlock_release(&monitor->lock); + pthread_rwlock_unlock(&mm_list_rwlock); + return success_count ? FI_SUCCESS : -FI_ENOSYS; + +err: + pthread_rwlock_unlock(&mm_list_rwlock); + + FI_WARN(&core_prov, FI_LOG_MR, + "Failed to start %s memory monitor: %s\n", + fi_tostr(&iface, FI_TYPE_HMEM_IFACE), fi_strerror(-ret)); + ofi_monitors_del_cache(cache); + return ret; } -void ofi_monitor_del_cache(struct ofi_mr_cache *cache) +void ofi_monitors_del_cache(struct ofi_mr_cache *cache) { - struct ofi_mem_monitor *monitor = cache->monitor; + struct ofi_mem_monitor *monitor; + enum fi_hmem_iface iface; + int ret; + + /* Loops until there are no readers or writers holding the lock */ + do { + ret = pthread_rwlock_trywrlock(&mm_list_rwlock); + if (ret && ret != EBUSY) { + FI_WARN(&core_prov, FI_LOG_MR, + "del_cache cannot obtain write lock, %d\n", + ret); + return; + } + } while (ret); + + for (iface = 0; iface < OFI_HMEM_MAX; iface++) { + monitor = cache->monitors[iface]; + if (!monitor) + continue; - if (!monitor) - return; + dlist_remove(&cache->notify_entries[iface]); - fastlock_acquire(&monitor->lock); - dlist_remove(&cache->notify_entry); + if (dlist_empty(&monitor->list)) + monitor->stop(monitor); - if (dlist_empty(&monitor->list) && (monitor == uffd_monitor)) - ofi_uffd_cleanup(); - fastlock_release(&monitor->lock); + cache->monitors[iface] = NULL; + } + + pthread_rwlock_unlock(&mm_list_rwlock); } -/* Must be called holding monitor lock */ +/* Must be called with locks in place like following + * pthread_rwlock_rdlock(&mm_list_rwlock); + * pthread_mutex_lock(&mm_lock); + * ofi_monitor_notify(); + * pthread_mutex_unlock(&mm_lock); + * pthread_rwlock_unlock(&mm_list_rwlock); + */ void ofi_monitor_notify(struct ofi_mem_monitor *monitor, const void *addr, size_t len) { struct ofi_mr_cache *cache; dlist_foreach_container(&monitor->list, struct ofi_mr_cache, - cache, notify_entry) { + cache, notify_entries[monitor->iface]) { ofi_mr_cache_notify(cache, addr, len); } } +/* Must be called with locks in place like following + * 
pthread_rwlock_rdlock(&mm_list_rwlock); + * pthread_mutex_lock(&mm_lock); + * ofi_monitor_flush(); + * pthread_mutex_unlock(&mm_lock); + * pthread_rwlock_unlock(&mm_list_rwlock); + */ +void ofi_monitor_flush(struct ofi_mem_monitor *monitor) +{ + struct ofi_mr_cache *cache; + + dlist_foreach_container(&monitor->list, struct ofi_mr_cache, + cache, notify_entries[monitor->iface]) { + pthread_mutex_unlock(&mm_lock); + ofi_mr_cache_flush(cache, false); + pthread_mutex_lock(&mm_lock); + } +} + int ofi_monitor_subscribe(struct ofi_mem_monitor *monitor, - const void *addr, size_t len) + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) { int ret; FI_DBG(&core_prov, FI_LOG_MR, "subscribing addr=%p len=%zu\n", addr, len); - ret = monitor->subscribe(monitor, addr, len); + ret = monitor->subscribe(monitor, addr, len, hmem_info); if (OFI_UNLIKELY(ret)) { FI_WARN(&core_prov, FI_LOG_MR, "Failed (ret = %d) to monitor addr=%p len=%zu\n", @@ -148,21 +332,30 @@ int ofi_monitor_subscribe(struct ofi_mem_monitor *monitor, } void ofi_monitor_unsubscribe(struct ofi_mem_monitor *monitor, - const void *addr, size_t len) + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) { FI_DBG(&core_prov, FI_LOG_MR, "unsubscribing addr=%p len=%zu\n", addr, len); - monitor->unsubscribe(monitor, addr, len); + monitor->unsubscribe(monitor, addr, len, hmem_info); } -#if HAVE_UFFD_UNMAP +#if HAVE_UFFD_MONITOR #include #include #include #include - +/* The userfault fd monitor requires for events that could + * trigger it to be handled outside of the monitor functions + * itself. When a fault occurs on a monitored region, the + * faulting thread is put to sleep until the event is read + * via the userfault file descriptor. If this fault occurs + * within the userfault handling thread, no threads will + * read this event and our threads cannot progress, resulting + * in a hang. 
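+ * For example, if this handler thread itself freed memory and the + * allocator returned it to the kernel with munmap(), the resulting + * uffd event could only be read by this same thread, and the process + * would deadlock.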
+ */ static void *ofi_uffd_handler(void *arg) { struct uffd_msg msg; @@ -176,10 +369,12 @@ static void *ofi_uffd_handler(void *arg) if (ret != 1) break; - fastlock_acquire(&uffd.monitor.lock); + pthread_rwlock_rdlock(&mm_list_rwlock); + pthread_mutex_lock(&mm_lock); ret = read(uffd.fd, &msg, sizeof(msg)); if (ret != sizeof(msg)) { - fastlock_release(&uffd.monitor.lock); + pthread_mutex_unlock(&mm_lock); + pthread_rwlock_unlock(&mm_list_rwlock); if (errno != EAGAIN) break; continue; @@ -187,6 +382,11 @@ static void *ofi_uffd_handler(void *arg) switch (msg.event) { case UFFD_EVENT_REMOVE: + ofi_monitor_unsubscribe(&uffd.monitor, + (void *) (uintptr_t) msg.arg.remove.start, + (size_t) (msg.arg.remove.end - + msg.arg.remove.start), NULL); + /* fall through */ case UFFD_EVENT_UNMAP: ofi_monitor_notify(&uffd.monitor, (void *) (uintptr_t) msg.arg.remove.start, @@ -203,7 +403,8 @@ static void *ofi_uffd_handler(void *arg) "Unhandled uffd event %d\n", msg.event); break; } - fastlock_release(&uffd.monitor.lock); + pthread_mutex_unlock(&mm_lock); + pthread_rwlock_unlock(&mm_list_rwlock); } return NULL; } @@ -229,7 +430,8 @@ static int ofi_uffd_register(const void *addr, size_t len, size_t page_size) } static int ofi_uffd_subscribe(struct ofi_mem_monitor *monitor, - const void *addr, size_t len) + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) { int i; @@ -262,7 +464,8 @@ static int ofi_uffd_unregister(const void *addr, size_t len, size_t page_size) /* May be called from mr cache notifier callback */ static void ofi_uffd_unsubscribe(struct ofi_mem_monitor *monitor, - const void *addr, size_t len) + const void *addr, size_t len, + union ofi_mr_hmem_info *hmem_info) { int i; @@ -273,13 +476,21 @@ static void ofi_uffd_unsubscribe(struct ofi_mem_monitor *monitor, } } -int ofi_uffd_init(void) +static bool ofi_uffd_valid(struct ofi_mem_monitor *monitor, const void *addr, + size_t len, union ofi_mr_hmem_info *hmem_info) +{ + /* no-op */ + return true; +} + +static int ofi_uffd_start(struct ofi_mem_monitor *monitor) { struct uffdio_api api; int ret; uffd.monitor.subscribe = ofi_uffd_subscribe; uffd.monitor.unsubscribe = ofi_uffd_unsubscribe; + uffd.monitor.valid = ofi_uffd_valid; if (!num_page_sizes) return -FI_ENODATA; @@ -322,22 +533,22 @@ int ofi_uffd_init(void) return ret; } -void ofi_uffd_cleanup(void) +static void ofi_uffd_stop(struct ofi_mem_monitor *monitor) { pthread_cancel(uffd.thread); pthread_join(uffd.thread, NULL); close(uffd.fd); } -#else /* HAVE_UFFD_UNMAP */ +#else /* HAVE_UFFD_MONITOR */ -int ofi_uffd_init(void) +static int ofi_uffd_start(struct ofi_mem_monitor *monitor) { return -FI_ENOSYS; } -void ofi_uffd_cleanup(void) +static void ofi_uffd_stop(struct ofi_mem_monitor *monitor) { } -#endif /* HAVE_UFFD_UNMAP */ +#endif /* HAVE_UFFD_MONITOR */ diff --git a/prov/util/src/util_mr_cache.c b/prov/util/src/util_mr_cache.c index 939766b8dcb..9c051364a13 100644 --- a/prov/util/src/util_mr_cache.c +++ b/prov/util/src/util_mr_cache.c @@ -1,6 +1,9 @@ /* * Copyright (c) 2016-2017 Cray Inc. All rights reserved. * Copyright (c) 2017-2019 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -42,6 +45,8 @@ struct ofi_mr_cache_params cache_params = { .max_cnt = 1024, + .cuda_monitor_enabled = true, + .rocr_monitor_enabled = true, }; static int util_mr_find_within(struct ofi_rbmap *map, void *key, void *data) @@ -70,34 +75,53 @@ static int util_mr_find_overlap(struct ofi_rbmap *map, void *key, void *data) return 0; } +static struct ofi_mr_entry *util_mr_entry_alloc(struct ofi_mr_cache *cache) +{ + struct ofi_mr_entry *entry; + + pthread_mutex_lock(&cache->lock); + entry = ofi_buf_alloc(cache->entry_pool); + pthread_mutex_unlock(&cache->lock); + return entry; +} + +static void util_mr_entry_free(struct ofi_mr_cache *cache, + struct ofi_mr_entry *entry) +{ + pthread_mutex_lock(&cache->lock); + ofi_buf_free(entry); + pthread_mutex_unlock(&cache->lock); +} + +/* We cannot hold the monitor lock when freeing an entry. This call + * will result in freeing memory, which can generate a uffd event + * (e.g. UNMAP). If we hold the monitor lock, the uffd thread will + * hang trying to acquire it in order to read the event, and this thread + * will itself be blocked until the uffd event is read. + */ static void util_mr_free_entry(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) { - FI_DBG(cache->domain->prov, FI_LOG_MR, "free %p (len: %" PRIu64 ")\n", + FI_DBG(cache->domain->prov, FI_LOG_MR, "free %p (len: %zu)\n", entry->info.iov.iov_base, entry->info.iov.iov_len); - assert(!entry->cached); - /* If regions are not being merged, then we can't safely - * unsubscribe this region from the monitor. Otherwise, we - * might unsubscribe an address range in use by another region. - * As a result, we remain subscribed. This may result in extra - * notification events, but is harmless to correct operation. - */ - if (entry->subscribed && cache_params.merge_regions) { - ofi_monitor_unsubscribe(cache->monitor, entry->info.iov.iov_base, - entry->info.iov.iov_len); - entry->subscribed = 0; - } + assert(!entry->node); cache->delete_region(cache, entry); - ofi_buf_free(entry); + util_mr_entry_free(cache, entry); } static void util_mr_uncache_entry_storage(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) { - assert(entry->cached); - cache->storage.erase(&cache->storage, entry); - entry->cached = 0; + /* Without subscription context, we might unsubscribe from + * an address range in use by another region. As a result, + * we remain subscribed. This may result in extra + * notification events, but is harmless to correct operation. 
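+ * An extra notification only uncaches the overlapping entries; they + * are rebuilt on the next cache search, so correctness is preserved.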
+ */ + + ofi_rbmap_delete(&cache->tree, entry->node); + entry->node = NULL; + cache->cached_cnt--; cache->cached_size -= entry->info.iov.iov_len; } @@ -108,15 +132,40 @@ static void util_mr_uncache_entry(struct ofi_mr_cache *cache, util_mr_uncache_entry_storage(cache, entry); if (entry->use_cnt == 0) { - dlist_remove_init(&entry->lru_entry); - util_mr_free_entry(cache, entry); + dlist_remove(&entry->list_entry); + dlist_insert_tail(&entry->list_entry, &cache->flush_list); } else { cache->uncached_cnt++; cache->uncached_size += entry->info.iov.iov_len; } } -/* Caller must hold ofi_mem_monitor lock */ +static struct ofi_mr_entry *ofi_mr_rbt_find(struct ofi_rbmap *tree, + const struct ofi_mr_info *key) +{ + struct ofi_rbnode *node; + + node = ofi_rbmap_find(tree, (void *) key); + if (!node) + return NULL; + + return node->data; +} + +static struct ofi_mr_entry *ofi_mr_rbt_overlap(struct ofi_rbmap *tree, + const struct iovec *key) +{ + struct ofi_rbnode *node; + + node = ofi_rbmap_search(tree, (void *) key, + util_mr_find_overlap); + if (!node) + return NULL; + + return node->data; +} + +/* Caller must hold ofi_mem_monitor lock as well as unsubscribe from the region */ void ofi_mr_cache_notify(struct ofi_mr_cache *cache, const void *addr, size_t len) { struct ofi_mr_entry *entry; @@ -126,203 +175,287 @@ void ofi_mr_cache_notify(struct ofi_mr_cache *cache, const void *addr, size_t le iov.iov_base = (void *) addr; iov.iov_len = len; - for (entry = cache->storage.overlap(&cache->storage, &iov); entry; - entry = cache->storage.overlap(&cache->storage, &iov)) + for (entry = ofi_mr_rbt_overlap(&cache->tree, &iov); entry; + entry = ofi_mr_rbt_overlap(&cache->tree, &iov)) util_mr_uncache_entry(cache, entry); - - /* See comment in util_mr_free_entry. If we're not merging address - * ranges, we can only safely unsubscribe for the reported range. 
- */ - if (!cache_params.merge_regions) - ofi_monitor_unsubscribe(cache->monitor, addr, len); } -static bool mr_cache_flush(struct ofi_mr_cache *cache) +bool ofi_mr_cache_flush(struct ofi_mr_cache *cache, bool flush_lru) { struct ofi_mr_entry *entry; - if (dlist_empty(&cache->lru_list)) + pthread_mutex_lock(&mm_lock); + while (!dlist_empty(&cache->flush_list)) { + dlist_pop_front(&cache->flush_list, struct ofi_mr_entry, + entry, list_entry); + FI_DBG(cache->domain->prov, FI_LOG_MR, "flush %p (len: %zu)\n", + entry->info.iov.iov_base, entry->info.iov.iov_len); + pthread_mutex_unlock(&mm_lock); + + util_mr_free_entry(cache, entry); + pthread_mutex_lock(&mm_lock); + } + + if (!flush_lru || dlist_empty(&cache->lru_list)) { + pthread_mutex_unlock(&mm_lock); return false; + } - dlist_pop_front(&cache->lru_list, struct ofi_mr_entry, - entry, lru_entry); - dlist_init(&entry->lru_entry); - FI_DBG(cache->domain->prov, FI_LOG_MR, "flush %p (len: %" PRIu64 ")\n", - entry->info.iov.iov_base, entry->info.iov.iov_len); + do { + dlist_pop_front(&cache->lru_list, struct ofi_mr_entry, + entry, list_entry); + dlist_init(&entry->list_entry); + FI_DBG(cache->domain->prov, FI_LOG_MR, "flush %p (len: %zu)\n", + entry->info.iov.iov_base, entry->info.iov.iov_len); - util_mr_uncache_entry_storage(cache, entry); - util_mr_free_entry(cache, entry); - return true; -} + util_mr_uncache_entry_storage(cache, entry); + pthread_mutex_unlock(&mm_lock); -bool ofi_mr_cache_flush(struct ofi_mr_cache *cache) -{ - bool empty; + util_mr_free_entry(cache, entry); + pthread_mutex_lock(&mm_lock); + + } while (!dlist_empty(&cache->lru_list) && + ((cache->cached_cnt >= cache_params.max_cnt) || + (cache->cached_size >= cache_params.max_size))); + pthread_mutex_unlock(&mm_lock); - fastlock_acquire(&cache->monitor->lock); - empty = mr_cache_flush(cache); - fastlock_release(&cache->monitor->lock); - return empty; + return true; } void ofi_mr_cache_delete(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) { - FI_DBG(cache->domain->prov, FI_LOG_MR, "delete %p (len: %" PRIu64 ")\n", + FI_DBG(cache->domain->prov, FI_LOG_MR, "delete %p (len: %zu)\n", entry->info.iov.iov_base, entry->info.iov.iov_len); - fastlock_acquire(&cache->monitor->lock); + pthread_mutex_lock(&mm_lock); cache->delete_cnt++; if (--entry->use_cnt == 0) { - if (entry->cached) { - dlist_insert_tail(&entry->lru_entry, &cache->lru_list); - } else { + if (!entry->node) { cache->uncached_cnt--; cache->uncached_size -= entry->info.iov.iov_len; + pthread_mutex_unlock(&mm_lock); util_mr_free_entry(cache, entry); + return; } + dlist_insert_tail(&entry->list_entry, &cache->lru_list); } - fastlock_release(&cache->monitor->lock); + pthread_mutex_unlock(&mm_lock); } +/* + * We cannot hold the monitor lock when allocating and registering the + * mr_entry without creating a potential deadlock situation with the + * memory monitor needing to acquire the same lock. The underlying + * calls may allocate memory, which can result in the monitor needing + * to handle address mapping changes. To handle this, we build the + * new entry, then check under lock that a conflict with another thread + * hasn't occurred. If a conflict occurred, we return -EAGAIN and + * restart the entire operation. 
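+ * ofi_mr_cache_search() below implements the restart: it loops while + * -FI_EAGAIN is returned, flushing the cache after other failures and + * retrying as long as the flush was able to reclaim entries.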
+ */ static int -util_mr_cache_create(struct ofi_mr_cache *cache, const struct iovec *iov, - uint64_t access, struct ofi_mr_entry **entry) +util_mr_cache_create(struct ofi_mr_cache *cache, const struct ofi_mr_info *info, + struct ofi_mr_entry **entry) { + struct ofi_mr_entry *cur; int ret; + struct ofi_mem_monitor *monitor = cache->monitors[info->iface]; + + assert(monitor); - FI_DBG(cache->domain->prov, FI_LOG_MR, "create %p (len: %" PRIu64 ")\n", - iov->iov_base, iov->iov_len); + FI_DBG(cache->domain->prov, FI_LOG_MR, "create %p (len: %zu)\n", + info->iov.iov_base, info->iov.iov_len); - *entry = ofi_buf_alloc(cache->entry_pool); - if (OFI_UNLIKELY(!*entry)) + *entry = util_mr_entry_alloc(cache); + if (!*entry) return -FI_ENOMEM; - (*entry)->info.iov = *iov; + (*entry)->node = NULL; + (*entry)->info = *info; (*entry)->use_cnt = 1; ret = cache->add_region(cache, *entry); - if (ret) { - while (ret && mr_cache_flush(cache)) { - ret = cache->add_region(cache, *entry); - } - if (ret) { - assert(!mr_cache_flush(cache)); - ofi_buf_free(*entry); - return ret; - } + if (ret) + goto free; + + pthread_mutex_lock(&mm_lock); + cur = ofi_mr_rbt_find(&cache->tree, info); + if (cur) { + ret = -FI_EAGAIN; + goto unlock; } - if ((cache->cached_cnt > cache_params.max_cnt) || - (cache->cached_size > cache_params.max_size)) { - (*entry)->cached = 0; + if ((cache->cached_cnt >= cache_params.max_cnt) || + (cache->cached_size >= cache_params.max_size)) { cache->uncached_cnt++; - cache->uncached_size += iov->iov_len; + cache->uncached_size += info->iov.iov_len; } else { - if (cache->storage.insert(&cache->storage, - &(*entry)->info, *entry)) { + if (ofi_rbmap_insert(&cache->tree, (void *) &(*entry)->info, + (void *) *entry, &(*entry)->node)) { ret = -FI_ENOMEM; - goto err; + goto unlock; } - (*entry)->cached = 1; cache->cached_cnt++; - cache->cached_size += iov->iov_len; + cache->cached_size += info->iov.iov_len; - ret = ofi_monitor_subscribe(cache->monitor, iov->iov_base, - iov->iov_len); - if (ret) - util_mr_uncache_entry(cache, *entry); - else - (*entry)->subscribed = 1; + ret = ofi_monitor_subscribe(monitor, info->iov.iov_base, + info->iov.iov_len, + &(*entry)->hmem_info); + if (ret) { + util_mr_uncache_entry_storage(cache, *entry); + cache->uncached_cnt++; + cache->uncached_size += (*entry)->info.iov.iov_len; + } } - + pthread_mutex_unlock(&mm_lock); return 0; -err: +unlock: + pthread_mutex_unlock(&mm_lock); +free: util_mr_free_entry(cache, *entry); return ret; } -static int -util_mr_cache_merge(struct ofi_mr_cache *cache, const struct fi_mr_attr *attr, - struct ofi_mr_entry *old_entry, struct ofi_mr_entry **entry) +int ofi_mr_cache_search(struct ofi_mr_cache *cache, const struct fi_mr_attr *attr, + struct ofi_mr_entry **entry) { - struct ofi_mr_info info, *old_info; + struct ofi_mr_info info; + int ret; + struct ofi_mem_monitor *monitor = cache->monitors[attr->iface]; + + if (!monitor) { + FI_WARN(&core_prov, FI_LOG_MR, + "MR cache disabled for %s memory\n", + fi_tostr(&attr->iface, FI_TYPE_HMEM_IFACE)); + return -FI_ENOSYS; + } + + assert(attr->iov_count == 1); + FI_DBG(cache->domain->prov, FI_LOG_MR, "search %p (len: %zu)\n", + attr->mr_iov->iov_base, attr->mr_iov->iov_len); info.iov = *attr->mr_iov; + info.iface = attr->iface; + info.device = attr->device.reserved; + do { - FI_DBG(cache->domain->prov, FI_LOG_MR, - "merging %p (len: %" PRIu64 ") with %p (len: %" PRIu64 ")\n", - info.iov.iov_base, info.iov.iov_len, - old_entry->info.iov.iov_base, old_entry->info.iov.iov_len); - old_info = &old_entry->info; 
+ pthread_mutex_lock(&mm_lock); - info.iov.iov_len = ((uintptr_t) - MAX(ofi_iov_end(&info.iov), ofi_iov_end(&old_info->iov))) + 1 - - ((uintptr_t) MIN(info.iov.iov_base, old_info->iov.iov_base)); - info.iov.iov_base = MIN(info.iov.iov_base, old_info->iov.iov_base); - FI_DBG(cache->domain->prov, FI_LOG_MR, "merged %p (len: %" PRIu64 ")\n", - info.iov.iov_base, info.iov.iov_len); + if ((cache->cached_cnt >= cache_params.max_cnt) || + (cache->cached_size >= cache_params.max_size)) { + pthread_mutex_unlock(&mm_lock); + ofi_mr_cache_flush(cache, true); + pthread_mutex_lock(&mm_lock); + } - /* New entry will expand range of subscription */ - old_entry->subscribed = 0; + cache->search_cnt++; + *entry = ofi_mr_rbt_find(&cache->tree, &info); - util_mr_uncache_entry(cache, old_entry); + if (*entry && + ofi_iov_within(attr->mr_iov, &(*entry)->info.iov) && + monitor->valid(monitor, + (const void *)(*entry)->info.iov.iov_base, + (*entry)->info.iov.iov_len, + &(*entry)->hmem_info)) + goto hit; - } while ((old_entry = cache->storage.find(&cache->storage, &info))); + /* Purge regions that overlap with new region */ + while (*entry) { + util_mr_uncache_entry(cache, *entry); + *entry = ofi_mr_rbt_find(&cache->tree, &info); + } + pthread_mutex_unlock(&mm_lock); - return util_mr_cache_create(cache, &info.iov, attr->access, entry); + ret = util_mr_cache_create(cache, &info, entry); + if (ret && ret != -FI_EAGAIN) { + if (ofi_mr_cache_flush(cache, true)) + ret = -FI_EAGAIN; + } + } while (ret == -FI_EAGAIN); + + return ret; + +hit: + cache->hit_cnt++; + if ((*entry)->use_cnt++ == 0) + dlist_remove_init(&(*entry)->list_entry); + pthread_mutex_unlock(&mm_lock); + return 0; } -int ofi_mr_cache_search(struct ofi_mr_cache *cache, const struct fi_mr_attr *attr, - struct ofi_mr_entry **entry) +struct ofi_mr_entry *ofi_mr_cache_find(struct ofi_mr_cache *cache, + const struct fi_mr_attr *attr) { struct ofi_mr_info info; - int ret = 0; + struct ofi_mr_entry *entry; assert(attr->iov_count == 1); - FI_DBG(cache->domain->prov, FI_LOG_MR, "search %p (len: %" PRIu64 ")\n", + FI_DBG(cache->domain->prov, FI_LOG_MR, "find %p (len: %zu)\n", attr->mr_iov->iov_base, attr->mr_iov->iov_len); - fastlock_acquire(&cache->monitor->lock); + pthread_mutex_lock(&mm_lock); cache->search_cnt++; - while (((cache->cached_cnt >= cache_params.max_cnt) || - (cache->cached_size >= cache_params.max_size)) && - mr_cache_flush(cache)) - ; - info.iov = *attr->mr_iov; - *entry = cache->storage.find(&cache->storage, &info); - if (!*entry) { - ret = util_mr_cache_create(cache, attr->mr_iov, - attr->access, entry); + entry = ofi_mr_rbt_find(&cache->tree, &info); + if (!entry) { goto unlock; } - /* This branch may be taken even if user hasn't enabled merging regions. - * e.g. a new region encloses previously cached smaller region. Cache - * find function (util_mr_find_within) would match the enclosed region. 
- */ - if (!ofi_iov_within(attr->mr_iov, &(*entry)->info.iov)) { - ret = util_mr_cache_merge(cache, attr, *entry, entry); + if (!ofi_iov_within(attr->mr_iov, &entry->info.iov)) { + entry = NULL; goto unlock; } cache->hit_cnt++; - if ((*entry)->use_cnt++ == 0) - dlist_remove_init(&(*entry)->lru_entry); + if ((entry)->use_cnt++ == 0) + dlist_remove_init(&(entry)->list_entry); unlock: - fastlock_release(&cache->monitor->lock); + pthread_mutex_unlock(&mm_lock); + return entry; +} + +int ofi_mr_cache_reg(struct ofi_mr_cache *cache, const struct fi_mr_attr *attr, + struct ofi_mr_entry **entry) +{ + int ret; + + assert(attr->iov_count == 1); + FI_DBG(cache->domain->prov, FI_LOG_MR, "reg %p (len: %zu)\n", + attr->mr_iov->iov_base, attr->mr_iov->iov_len); + + *entry = util_mr_entry_alloc(cache); + if (!*entry) + return -FI_ENOMEM; + + pthread_mutex_lock(&mm_lock); + cache->uncached_cnt++; + cache->uncached_size += attr->mr_iov->iov_len; + pthread_mutex_unlock(&mm_lock); + + (*entry)->info.iov = *attr->mr_iov; + (*entry)->use_cnt = 1; + (*entry)->node = NULL; + + ret = cache->add_region(cache, *entry); + if (ret) + goto buf_free; + + return 0; + +buf_free: + util_mr_entry_free(cache, *entry); + pthread_mutex_lock(&mm_lock); + cache->uncached_cnt--; + cache->uncached_size -= attr->mr_iov->iov_len; + pthread_mutex_unlock(&mm_lock); return ret; } void ofi_mr_cache_cleanup(struct ofi_mr_cache *cache) { - struct ofi_mr_entry *entry; - struct dlist_entry *tmp; - /* If we don't have a domain, initialization failed */ if (!cache->domain) return; @@ -332,16 +465,12 @@ void ofi_mr_cache_cleanup(struct ofi_mr_cache *cache) cache->search_cnt, cache->delete_cnt, cache->hit_cnt, cache->notify_cnt); - fastlock_acquire(&cache->monitor->lock); - dlist_foreach_container_safe(&cache->lru_list, struct ofi_mr_entry, - entry, lru_entry, tmp) { - assert(entry->use_cnt == 0); - util_mr_uncache_entry(cache, entry); - } - fastlock_release(&cache->monitor->lock); + while (ofi_mr_cache_flush(cache, true)) + ; - ofi_monitor_del_cache(cache); - cache->storage.destroy(&cache->storage); + pthread_mutex_destroy(&cache->lock); + ofi_monitors_del_cache(cache); + ofi_rbmap_cleanup(&cache->tree); ofi_atomic_dec32(&cache->domain->ref); ofi_bufpool_destroy(cache->entry_pool); assert(cache->cached_cnt == 0); @@ -350,105 +479,20 @@ void ofi_mr_cache_cleanup(struct ofi_mr_cache *cache) assert(cache->uncached_size == 0); } -static void ofi_mr_rbt_destroy(struct ofi_mr_storage *storage) -{ - ofi_rbmap_destroy(storage->storage); -} - -static struct ofi_mr_entry *ofi_mr_rbt_find(struct ofi_mr_storage *storage, - const struct ofi_mr_info *key) -{ - struct ofi_rbnode *node; - - node = ofi_rbmap_find(storage->storage, (void *) key); - if (!node) - return NULL; - - return node->data; -} - -static struct ofi_mr_entry *ofi_mr_rbt_overlap(struct ofi_mr_storage *storage, - const struct iovec *key) -{ - struct ofi_rbnode *node; - - node = ofi_rbmap_search(storage->storage, (void *) key, - util_mr_find_overlap); - if (!node) - return NULL; - - return node->data; -} - -static int ofi_mr_rbt_insert(struct ofi_mr_storage *storage, - struct ofi_mr_info *key, - struct ofi_mr_entry *entry) -{ - return ofi_rbmap_insert(storage->storage, (void *) key, (void *) entry, - NULL); -} - -static int ofi_mr_rbt_erase(struct ofi_mr_storage *storage, - struct ofi_mr_entry *entry) -{ - struct ofi_rbnode *node; - - node = ofi_rbmap_find(storage->storage, &entry->info); - assert(node); - ofi_rbmap_delete(storage->storage, node); - return 0; -} - -static int 
ofi_mr_cache_init_rbt(struct ofi_mr_cache *cache) -{ - cache->storage.storage = ofi_rbmap_create(cache_params.merge_regions ? - util_mr_find_overlap : - util_mr_find_within); - if (!cache->storage.storage) - return -FI_ENOMEM; - - cache->storage.overlap = ofi_mr_rbt_overlap; - cache->storage.destroy = ofi_mr_rbt_destroy; - cache->storage.find = ofi_mr_rbt_find; - cache->storage.insert = ofi_mr_rbt_insert; - cache->storage.erase = ofi_mr_rbt_erase; - return 0; -} - -static int ofi_mr_cache_init_storage(struct ofi_mr_cache *cache) -{ - int ret; - - switch (cache->storage.type) { - case OFI_MR_STORAGE_DEFAULT: - case OFI_MR_STORAGE_RBT: - ret = ofi_mr_cache_init_rbt(cache); - break; - case OFI_MR_STORAGE_USER: - ret = (cache->storage.storage && cache->storage.overlap && - cache->storage.destroy && cache->storage.find && - cache->storage.insert && cache->storage.erase) ? - 0 : -FI_EINVAL; - break; - default: - ret = -FI_EINVAL; - break; - } - - return ret; -} - +/* Monitors array must be of size OFI_HMEM_MAX. */ int ofi_mr_cache_init(struct util_domain *domain, - struct ofi_mem_monitor *monitor, + struct ofi_mem_monitor **monitors, struct ofi_mr_cache *cache) { int ret; assert(cache->add_region && cache->delete_region); - if (!cache_params.max_cnt) + if (!cache_params.max_cnt || !cache_params.max_size) return -FI_ENOSPC; + pthread_mutex_init(&cache->lock, NULL); dlist_init(&cache->lru_list); + dlist_init(&cache->flush_list); cache->cached_cnt = 0; cache->cached_size = 0; cache->uncached_cnt = 0; @@ -460,11 +504,8 @@ int ofi_mr_cache_init(struct util_domain *domain, cache->domain = domain; ofi_atomic_inc32(&domain->ref); - ret = ofi_mr_cache_init_storage(cache); - if (ret) - goto dec; - - ret = ofi_monitor_add_cache(monitor, cache); + ofi_rbmap_init(&cache->tree, util_mr_find_within); + ret = ofi_monitors_add_cache(monitors, cache); if (ret) goto destroy; @@ -477,11 +518,11 @@ int ofi_mr_cache_init(struct util_domain *domain, return 0; del: - ofi_monitor_del_cache(cache); + ofi_monitors_del_cache(cache); destroy: - cache->storage.destroy(&cache->storage); -dec: + ofi_rbmap_cleanup(&cache->tree); ofi_atomic_dec32(&cache->domain->ref); + pthread_mutex_destroy(&cache->lock); cache->domain = NULL; return ret; } diff --git a/prov/util/src/util_mr_map.c b/prov/util/src/util_mr_map.c index 2157b702b42..78e6459f5c8 100644 --- a/prov/util/src/util_mr_map.c +++ b/prov/util/src/util_mr_map.c @@ -219,10 +219,39 @@ static struct fi_ops ofi_mr_fi_ops = { .ops_open = fi_no_ops_open }; +void ofi_mr_update_attr(uint32_t user_version, uint64_t caps, + const struct fi_mr_attr *user_attr, + struct fi_mr_attr *cur_abi_attr) +{ + cur_abi_attr->mr_iov = (struct iovec *) user_attr->mr_iov; + cur_abi_attr->iov_count = user_attr->iov_count; + cur_abi_attr->access = user_attr->access; + cur_abi_attr->offset = user_attr->offset; + cur_abi_attr->requested_key = user_attr->requested_key; + cur_abi_attr->context = user_attr->context; + + if (FI_VERSION_GE(user_version, FI_VERSION(1, 5))) { + cur_abi_attr->auth_key_size = user_attr->auth_key_size; + cur_abi_attr->auth_key = user_attr->auth_key; + } else { + cur_abi_attr->auth_key_size = 0; + cur_abi_attr->auth_key = NULL; + } + + if (caps & FI_HMEM) { + cur_abi_attr->iface = user_attr->iface; + cur_abi_attr->device = user_attr->device; + } else { + cur_abi_attr->iface = FI_HMEM_SYSTEM; + cur_abi_attr->device.reserved = 0; + } +} + int ofi_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, uint64_t flags, struct fid_mr **mr_fid) { struct util_domain *domain; + struct 
fi_mr_attr cur_abi_attr; struct ofi_mr *mr; uint64_t key; int ret = 0; @@ -235,6 +264,8 @@ int ofi_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, if (!mr) return -FI_ENOMEM; + ofi_mr_update_attr(domain->fabric->fabric_fid.api_version, + domain->info_domain_caps, attr, &cur_abi_attr); fastlock_acquire(&domain->lock); mr->mr_fid.fid.fclass = FI_CLASS_MR; @@ -242,15 +273,17 @@ int ofi_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, mr->mr_fid.fid.ops = &ofi_mr_fi_ops; mr->domain = domain; mr->flags = flags; + mr->iface = cur_abi_attr.iface; + mr->device = cur_abi_attr.device.reserved; - ret = ofi_mr_map_insert(&domain->mr_map, attr, &key, mr); + ret = ofi_mr_map_insert(&domain->mr_map, &cur_abi_attr, &key, mr); if (ret) { free(mr); goto out; } mr->mr_fid.key = mr->key = key; - mr->mr_fid.mem_desc = (void *) (uintptr_t) key; + mr->mr_fid.mem_desc = (void *) mr; *mr_fid = &mr->mr_fid; ofi_atomic_inc32(&domain->ref); @@ -273,6 +306,9 @@ int ofi_mr_regv(struct fid *fid, const struct iovec *iov, attr.offset = offset; attr.requested_key = requested_key; attr.context = context; + attr.iface = FI_HMEM_SYSTEM; + attr.device.reserved = 0; + return ofi_mr_regattr(fid, &attr, flags, mr_fid); } diff --git a/prov/util/src/util_poll.c b/prov/util/src/util_poll.c index 30059503dad..23b75d83ec3 100644 --- a/prov/util/src/util_poll.c +++ b/prov/util/src/util_poll.c @@ -141,6 +141,9 @@ static int util_poll_close(struct fid *fid) if (pollset->domain) ofi_atomic_dec32(&pollset->domain->ref); + + fastlock_destroy(&pollset->lock); + free(pollset); return 0; } diff --git a/prov/util/src/util_shm.c b/prov/util/src/util_shm.c index 7508471f6db..9426d767df9 100644 --- a/prov/util/src/util_shm.c +++ b/prov/util/src/util_shm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017 Intel Corporation. All rights reserved. + * Copyright (c) 2016-2021 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -39,90 +39,266 @@ #include #include #include +#include #include +struct dlist_entry ep_name_list; +DEFINE_LIST(ep_name_list); +pthread_mutex_t ep_list_lock = PTHREAD_MUTEX_INITIALIZER; + +void smr_cleanup(void) +{ + struct smr_ep_name *ep_name; + struct dlist_entry *tmp; + + pthread_mutex_lock(&ep_list_lock); + dlist_foreach_container_safe(&ep_name_list, struct smr_ep_name, + ep_name, entry, tmp) + free(ep_name); + pthread_mutex_unlock(&ep_list_lock); +} static void smr_peer_addr_init(struct smr_addr *peer) { - memset(peer->name, 0, SMR_NAME_SIZE); - peer->addr = FI_ADDR_UNSPEC; + memset(peer->name, 0, SMR_NAME_MAX); + peer->id = -1; +} + +void smr_cma_check(struct smr_region *smr, struct smr_region *peer_smr) +{ + struct iovec local_iov, remote_iov; + int remote_pid; + int ret; + + if (smr != peer_smr && peer_smr->cma_cap_peer != SMR_CMA_CAP_NA) { + smr->cma_cap_peer = peer_smr->cma_cap_peer; + return; + } + remote_pid = peer_smr->pid; + local_iov.iov_base = &remote_pid; + local_iov.iov_len = sizeof(remote_pid); + remote_iov.iov_base = (char *)peer_smr->base_addr + + ((char *)&peer_smr->pid - (char *)peer_smr); + remote_iov.iov_len = sizeof(peer_smr->pid); + ret = ofi_process_vm_writev(peer_smr->pid, &local_iov, 1, + &remote_iov, 1, 0); + assert(remote_pid == peer_smr->pid); + + if (smr == peer_smr) { + smr->cma_cap_self = (ret == -1) ? SMR_CMA_CAP_OFF : SMR_CMA_CAP_ON; + } else { + smr->cma_cap_peer = (ret == -1) ? 
SMR_CMA_CAP_OFF : SMR_CMA_CAP_ON; + peer_smr->cma_cap_peer = smr->cma_cap_peer; + } +} + +size_t smr_calculate_size_offsets(size_t tx_count, size_t rx_count, + size_t *cmd_offset, size_t *resp_offset, + size_t *inject_offset, size_t *sar_offset, + size_t *peer_offset, size_t *name_offset, + size_t *sock_offset) +{ + size_t cmd_queue_offset, resp_queue_offset, inject_pool_offset; + size_t sar_pool_offset, peer_data_offset, ep_name_offset; + size_t tx_size, rx_size, total_size, sock_name_offset; + + tx_size = roundup_power_of_two(tx_count); + rx_size = roundup_power_of_two(rx_count); + + cmd_queue_offset = sizeof(struct smr_region); + resp_queue_offset = cmd_queue_offset + sizeof(struct smr_cmd_queue) + + sizeof(struct smr_cmd) * rx_size; + inject_pool_offset = resp_queue_offset + sizeof(struct smr_resp_queue) + + sizeof(struct smr_resp) * tx_size; + sar_pool_offset = inject_pool_offset + sizeof(struct smr_inject_pool) + + sizeof(struct smr_inject_pool_entry) * rx_size; + peer_data_offset = sar_pool_offset + sizeof(struct smr_sar_pool) + + sizeof(struct smr_sar_pool_entry) * SMR_MAX_PEERS; + ep_name_offset = peer_data_offset + sizeof(struct smr_peer_data) * SMR_MAX_PEERS; + + sock_name_offset = ep_name_offset + SMR_NAME_MAX; + + if (cmd_offset) + *cmd_offset = cmd_queue_offset; + if (resp_offset) + *resp_offset = resp_queue_offset; + if (inject_offset) + *inject_offset = inject_pool_offset; + if (sar_offset) + *sar_offset = sar_pool_offset; + if (peer_offset) + *peer_offset = peer_data_offset; + if (name_offset) + *name_offset = ep_name_offset; + if (sock_offset) + *sock_offset = sock_name_offset; + + total_size = sock_name_offset + SMR_SOCK_NAME_MAX; + + /* + * Revisit later to see if we really need the size adjustment, or + * at most align to a multiple of a page size. 
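+ * roundup_power_of_two() can nearly double the mapping in the worst + * case (e.g. a layout just over 1 MB becomes a 2 MB region), whereas + * page alignment would add at most one extra page.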
+ */ + total_size = roundup_power_of_two(total_size); + + return total_size; +} + +static int smr_retry_map(const char *name, int *fd) +{ + char tmp[NAME_MAX]; + struct smr_region *old_shm; + struct stat sts; + int shm_pid; + + *fd = shm_open(name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (*fd < 0) + return -errno; + + old_shm = mmap(NULL, sizeof(*old_shm), PROT_READ | PROT_WRITE, + MAP_SHARED, *fd, 0); + if (old_shm == MAP_FAILED) + goto err; + + if (old_shm->version > SMR_VERSION) { + munmap(old_shm, sizeof(*old_shm)); + goto err; + } + shm_pid = old_shm->pid; + munmap(old_shm, sizeof(*old_shm)); + + if (!shm_pid) + return FI_SUCCESS; + + memset(tmp, 0, sizeof(tmp)); + snprintf(tmp, sizeof(tmp), "/proc/%d", shm_pid); + + if (stat(tmp, &sts) == -1 && errno == ENOENT) + return FI_SUCCESS; + +err: + close(*fd); + shm_unlink(name); + return -FI_EBUSY; } /* TODO: Determine if aligning SMR data helps performance */ int smr_create(const struct fi_provider *prov, struct smr_map *map, - const struct smr_attr *attr, struct smr_region **smr) + const struct smr_attr *attr, struct smr_region *volatile *smr) { - size_t total_size, cmd_queue_offset, peer_addr_offset; + struct smr_ep_name *ep_name; + size_t total_size, cmd_queue_offset, peer_data_offset; size_t resp_queue_offset, inject_pool_offset, name_offset; + size_t sar_pool_offset, sock_name_offset; int fd, ret, i; void *mapped_addr; + size_t tx_size, rx_size; - cmd_queue_offset = sizeof(**smr); - resp_queue_offset = cmd_queue_offset + sizeof(struct smr_cmd_queue) + - sizeof(struct smr_cmd) * attr->rx_count; - inject_pool_offset = resp_queue_offset + sizeof(struct smr_resp_queue) + - sizeof(struct smr_resp) * attr->tx_count; - peer_addr_offset = inject_pool_offset + sizeof(struct smr_inject_pool) + - sizeof(struct smr_inject_pool_entry) * attr->rx_count; - name_offset = peer_addr_offset + sizeof(struct smr_addr) * SMR_MAX_PEERS; - total_size = name_offset + strlen(attr->name) + 1; - total_size = roundup_power_of_two(total_size); + tx_size = roundup_power_of_two(attr->tx_count); + rx_size = roundup_power_of_two(attr->rx_count); + total_size = smr_calculate_size_offsets(tx_size, rx_size, &cmd_queue_offset, + &resp_queue_offset, &inject_pool_offset, + &sar_pool_offset, &peer_data_offset, + &name_offset, &sock_name_offset); - fd = shm_open(attr->name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + fd = shm_open(attr->name, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); if (fd < 0) { - FI_WARN(prov, FI_LOG_EP_CTRL, "shm_open error\n"); - goto err1; + if (errno != EEXIST) { + FI_WARN(prov, FI_LOG_EP_CTRL, + "shm_open error (%s): %s\n", + attr->name, strerror(errno)); + return -errno; + } + + ret = smr_retry_map(attr->name, &fd); + if (ret) { + FI_WARN(prov, FI_LOG_EP_CTRL, "shm file in use (%s)\n", + attr->name); + return ret; + } + FI_WARN(prov, FI_LOG_EP_CTRL, + "Overwriting shm from dead process (%s)\n", attr->name); + } + + ep_name = calloc(1, sizeof(*ep_name)); + if (!ep_name) { + FI_WARN(prov, FI_LOG_EP_CTRL, "calloc error\n"); + ret = -FI_ENOMEM; + goto close; } + strncpy(ep_name->name, (char *)attr->name, SMR_NAME_MAX - 1); + ep_name->name[SMR_NAME_MAX - 1] = '\0'; + + pthread_mutex_lock(&ep_list_lock); + dlist_insert_tail(&ep_name->entry, &ep_name_list); ret = ftruncate(fd, total_size); if (ret < 0) { FI_WARN(prov, FI_LOG_EP_CTRL, "ftruncate error\n"); - goto err2; + ret = -errno; + goto remove; } mapped_addr = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (mapped_addr == MAP_FAILED) { FI_WARN(prov, FI_LOG_EP_CTRL, "mmap 
error\n"); - goto err2; + ret = -errno; + goto remove; } close(fd); + ep_name->region = mapped_addr; + pthread_mutex_unlock(&ep_list_lock); + *smr = mapped_addr; fastlock_init(&(*smr)->lock); - fastlock_acquire(&(*smr)->lock); (*smr)->map = map; (*smr)->version = SMR_VERSION; (*smr)->flags = SMR_FLAG_ATOMIC | SMR_FLAG_DEBUG; - (*smr)->pid = getpid(); + (*smr)->cma_cap_peer = SMR_CMA_CAP_NA; + (*smr)->cma_cap_self = SMR_CMA_CAP_NA; + (*smr)->base_addr = *smr; (*smr)->total_size = total_size; (*smr)->cmd_queue_offset = cmd_queue_offset; (*smr)->resp_queue_offset = resp_queue_offset; (*smr)->inject_pool_offset = inject_pool_offset; - (*smr)->peer_addr_offset = peer_addr_offset; + (*smr)->sar_pool_offset = sar_pool_offset; + (*smr)->peer_data_offset = peer_data_offset; (*smr)->name_offset = name_offset; - (*smr)->cmd_cnt = attr->rx_count; - - smr_cmd_queue_init(smr_cmd_queue(*smr), attr->rx_count); - smr_resp_queue_init(smr_resp_queue(*smr), attr->tx_count); - smr_inject_pool_init(smr_inject_pool(*smr), attr->rx_count); - for (i = 0; i < SMR_MAX_PEERS; i++) - smr_peer_addr_init(&smr_peer_addr(*smr)[i]); + (*smr)->sock_name_offset = sock_name_offset; + (*smr)->cmd_cnt = rx_size; + /* Limit of 1 outstanding SAR message per peer */ + (*smr)->sar_cnt = SMR_MAX_PEERS; + + smr_cmd_queue_init(smr_cmd_queue(*smr), rx_size); + smr_resp_queue_init(smr_resp_queue(*smr), tx_size); + smr_inject_pool_init(smr_inject_pool(*smr), rx_size); + smr_sar_pool_init(smr_sar_pool(*smr), SMR_MAX_PEERS); + for (i = 0; i < SMR_MAX_PEERS; i++) { + smr_peer_addr_init(&smr_peer_data(*smr)[i].addr); + smr_peer_data(*smr)[i].sar_status = 0; + smr_peer_data(*smr)[i].name_sent = 0; + } strncpy((char *) smr_name(*smr), attr->name, total_size - name_offset); - fastlock_release(&(*smr)->lock); + /* Must be set last to signal full initialization to peers */ + (*smr)->pid = getpid(); return 0; -err2: - shm_unlink(attr->name); +remove: + dlist_remove(&ep_name->entry); + pthread_mutex_unlock(&ep_list_lock); + free(ep_name); +close: close(fd); -err1: - return -errno; + shm_unlink(attr->name); + return ret; } void smr_free(struct smr_region *smr) @@ -131,6 +307,16 @@ void smr_free(struct smr_region *smr) munmap(smr, smr->total_size); } +static int smr_name_compare(struct ofi_rbmap *map, void *key, void *data) +{ + struct smr_map *smr_map; + + smr_map = container_of(map, struct smr_map, rbmap); + + return strncmp(smr_map->peers[(int64_t) data].peer.name, + (char *) key, SMR_NAME_MAX); +} + int smr_map_create(const struct fi_provider *prov, int peer_count, struct smr_map **map) { @@ -142,23 +328,44 @@ int smr_map_create(const struct fi_provider *prov, int peer_count, return -FI_ENOMEM; } - for (i = 0; i < peer_count; i++) + for (i = 0; i < peer_count; i++) { smr_peer_addr_init(&(*map)->peers[i].peer); + (*map)->peers[i].fiaddr = FI_ADDR_UNSPEC; + } + ofi_rbmap_init(&(*map)->rbmap, smr_name_compare); fastlock_init(&(*map)->lock); return 0; } +static int smr_match_name(struct dlist_entry *item, const void *args) +{ + return !strcmp(container_of(item, struct smr_ep_name, entry)->name, + (char *) args); +} + int smr_map_to_region(const struct fi_provider *prov, struct smr_peer *peer_buf) { struct smr_region *peer; size_t size; int fd, ret = 0; + struct dlist_entry *entry; + + pthread_mutex_lock(&ep_list_lock); + entry = dlist_find_first_match(&ep_name_list, smr_match_name, + peer_buf->peer.name); + if (entry) { + peer_buf->region = container_of(entry, struct smr_ep_name, + entry)->region; + pthread_mutex_unlock(&ep_list_lock); + return 
FI_SUCCESS; + } + pthread_mutex_unlock(&ep_list_lock); fd = shm_open(peer_buf->peer.name, O_RDWR, S_IRUSR | S_IWUSR); if (fd < 0) { - FI_WARN(prov, FI_LOG_AV, "shm_open error\n"); + FI_WARN_ONCE(prov, FI_LOG_AV, "shm_open error\n"); return -errno; } @@ -188,97 +395,127 @@ int smr_map_to_region(const struct fi_provider *prov, struct smr_peer *peer_buf) return ret; } -void smr_map_to_endpoint(struct smr_region *region, int index) +void smr_map_to_endpoint(struct smr_region *region, int64_t id) { struct smr_region *peer_smr; - struct smr_addr *local_peers, *peer_peers; - int peer_index; + struct smr_peer_data *local_peers; - local_peers = smr_peer_addr(region); - - strncpy(smr_peer_addr(region)[index].name, - region->map->peers[index].peer.name, SMR_NAME_SIZE); - smr_peer_addr(region)[index].name[SMR_NAME_SIZE - 1] = '\0'; - if (region->map->peers[index].peer.addr == FI_ADDR_UNSPEC) + if (region->map->peers[id].peer.id < 0) return; - peer_smr = smr_peer_region(region, index); - peer_peers = smr_peer_addr(peer_smr); + local_peers = smr_peer_data(region); - for (peer_index = 0; peer_index < SMR_MAX_PEERS; peer_index++) { - if (!strncmp(smr_name(region), - peer_peers[peer_index].name, SMR_NAME_SIZE)) - break; - } - if (peer_index != SMR_MAX_PEERS) { - peer_peers[peer_index].addr = index; - local_peers[index].addr = peer_index; - } + strncpy(local_peers[id].addr.name, + region->map->peers[id].peer.name, SMR_NAME_MAX - 1); + local_peers[id].addr.name[SMR_NAME_MAX - 1] = '\0'; + + peer_smr = smr_peer_region(region, id); + + if ((region != peer_smr && region->cma_cap_peer == SMR_CMA_CAP_NA) || + (region == peer_smr && region->cma_cap_self == SMR_CMA_CAP_NA)) + smr_cma_check(region, peer_smr); } -void smr_unmap_from_endpoint(struct smr_region *region, int index) +void smr_unmap_from_endpoint(struct smr_region *region, int64_t id) { struct smr_region *peer_smr; - struct smr_addr *local_peers, *peer_peers; - int peer_index; + struct smr_peer_data *local_peers, *peer_peers; + int64_t peer_id; - local_peers = smr_peer_addr(region); + local_peers = smr_peer_data(region); - memset(local_peers[index].name, 0, SMR_NAME_SIZE); - peer_index = region->map->peers[index].peer.addr; - if (peer_index == FI_ADDR_UNSPEC) + memset(local_peers[id].addr.name, 0, SMR_NAME_MAX); + peer_id = region->map->peers[id].peer.id; + if (peer_id < 0) return; - peer_smr = smr_peer_region(region, index); - peer_peers = smr_peer_addr(peer_smr); + peer_smr = smr_peer_region(region, id); + peer_peers = smr_peer_data(peer_smr); - peer_peers[peer_index].addr = FI_ADDR_UNSPEC; + peer_peers[peer_id].addr.id = -1; + peer_peers[peer_id].name_sent = 0; } void smr_exchange_all_peers(struct smr_region *region) { - int i; + int64_t i; for (i = 0; i < SMR_MAX_PEERS; i++) smr_map_to_endpoint(region, i); } int smr_map_add(const struct fi_provider *prov, struct smr_map *map, - const char *name, int id) + const char *name, int64_t *id) { - int ret = 0; + struct ofi_rbnode *node; + int tries = 0, ret = 0; fastlock_acquire(&map->lock); - strncpy(map->peers[id].peer.name, name, SMR_NAME_SIZE); - map->peers[id].peer.name[SMR_NAME_SIZE - 1] = '\0'; - ret = smr_map_to_region(prov, &map->peers[id]); + ret = ofi_rbmap_insert(&map->rbmap, (void *) name, (void *) *id, &node); + if (ret) { + assert(ret == -FI_EALREADY); + *id = (int64_t) node->data; + fastlock_release(&map->lock); + return 0; + } + + while (map->peers[map->cur_id].peer.id != -1 && + tries < SMR_MAX_PEERS) { + if (++map->cur_id == SMR_MAX_PEERS) + map->cur_id = 0; + tries++; + } + + 
assert(map->cur_id < SMR_MAX_PEERS && tries < SMR_MAX_PEERS); + *id = map->cur_id; + node->data = (void *) *id; + strncpy(map->peers[*id].peer.name, name, SMR_NAME_MAX); + map->peers[*id].peer.name[SMR_NAME_MAX - 1] = '\0'; + + ret = smr_map_to_region(prov, &map->peers[*id]); if (!ret) - map->peers[id].peer.addr = id; - fastlock_release(&map->lock); + map->peers[*id].peer.id = *id; + fastlock_release(&map->lock); return ret == -ENOENT ? 0 : ret; } -void smr_map_del(struct smr_map *map, int id) +void smr_map_del(struct smr_map *map, int64_t id) { - if (id >= SMR_MAX_PEERS || id < 0 || - map->peers[id].peer.addr == FI_ADDR_UNSPEC) + struct dlist_entry *entry; + + if (id >= SMR_MAX_PEERS || id < 0 || map->peers[id].peer.id < 0) return; - munmap(map->peers[id].region, map->peers[id].region->total_size); - map->peers[id].peer.addr = FI_ADDR_UNSPEC; + pthread_mutex_lock(&ep_list_lock); + entry = dlist_find_first_match(&ep_name_list, smr_match_name, + map->peers[id].peer.name); + pthread_mutex_unlock(&ep_list_lock); + + fastlock_acquire(&map->lock); + if (!entry) + munmap(map->peers[id].region, map->peers[id].region->total_size); + + (void) ofi_rbmap_find_delete(&map->rbmap, + (void *) map->peers[id].peer.name); + + map->peers[id].fiaddr = FI_ADDR_UNSPEC; + map->peers[id].peer.id = -1; + + fastlock_release(&map->lock); } void smr_map_free(struct smr_map *map) { - int i; + int64_t i; for (i = 0; i < SMR_MAX_PEERS; i++) smr_map_del(map, i); + ofi_rbmap_cleanup(&map->rbmap); free(map); } -struct smr_region *smr_map_get(struct smr_map *map, int id) +struct smr_region *smr_map_get(struct smr_map *map, int64_t id) { if (id < 0 || id >= SMR_MAX_PEERS) return NULL; diff --git a/prov/util/src/util_wait.c b/prov/util/src/util_wait.c index 521cef42cda..07d6a7a6373 100644 --- a/prov/util/src/util_wait.c +++ b/prov/util/src/util_wait.c @@ -36,8 +36,20 @@ #include #include +#include +static uint32_t ofi_poll_to_epoll(uint32_t events) +{ + uint32_t epoll_events = 0; + + if (events & POLLIN) + epoll_events |= OFI_EPOLL_IN; + if (events & POLLOUT) + epoll_events |= OFI_EPOLL_OUT; + return epoll_events; +} + int ofi_trywait(struct fid_fabric *fabric, struct fid **fids, int count) { struct util_cq *cq; @@ -80,7 +92,9 @@ int ofi_check_wait_attr(const struct fi_provider *prov, switch (attr->wait_obj) { case FI_WAIT_UNSPEC: case FI_WAIT_FD: + case FI_WAIT_POLLFD: case FI_WAIT_MUTEX_COND: + case FI_WAIT_YIELD: break; default: FI_WARN(prov, FI_LOG_FABRIC, "invalid wait object type\n"); @@ -97,6 +111,7 @@ int ofi_check_wait_attr(const struct fi_provider *prov, int fi_wait_cleanup(struct util_wait *wait) { + struct ofi_wait_fid_entry *fid_entry; int ret; if (ofi_atomic_get32(&wait->ref)) @@ -106,12 +121,20 @@ int fi_wait_cleanup(struct util_wait *wait) if (ret) return ret; + while (!dlist_empty(&wait->fid_list)) { + dlist_pop_front(&wait->fid_list, struct ofi_wait_fid_entry, + fid_entry, entry); + free(fid_entry->pollfds.fd); + free(fid_entry); + } + + fastlock_destroy(&wait->lock); ofi_atomic_dec32(&wait->fabric->ref); return 0; } -int fi_wait_init(struct util_fabric *fabric, struct fi_wait_attr *attr, - struct util_wait *wait) +int ofi_wait_init(struct util_fabric *fabric, struct fi_wait_attr *attr, + struct util_wait *wait) { struct fid_poll *poll_fid; struct fi_poll_attr poll_attr; @@ -123,11 +146,13 @@ int fi_wait_init(struct util_fabric *fabric, struct fi_wait_attr *attr, switch (attr->wait_obj) { case FI_WAIT_UNSPEC: - case FI_WAIT_FD: wait->wait_obj = FI_WAIT_FD; break; + case FI_WAIT_FD: + case FI_WAIT_POLLFD: 
case FI_WAIT_MUTEX_COND: - wait->wait_obj = FI_WAIT_MUTEX_COND; + case FI_WAIT_YIELD: + wait->wait_obj = attr->wait_obj; break; default: assert(0); @@ -140,29 +165,55 @@ int fi_wait_init(struct util_fabric *fabric, struct fi_wait_attr *attr, return ret; wait->pollset = container_of(poll_fid, struct util_poll, poll_fid); + fastlock_init(&wait->lock); + dlist_init(&wait->fid_list); wait->fabric = fabric; ofi_atomic_inc32(&fabric->ref); return 0; } -static int ofi_wait_fd_match(struct dlist_entry *item, const void *arg) +static int ofi_wait_match_fd(struct dlist_entry *item, const void *arg) { struct ofi_wait_fd_entry *fd_entry; fd_entry = container_of(item, struct ofi_wait_fd_entry, entry); - return fd_entry->fd == *(int *)arg; + return fd_entry->fd == *(int *) arg; } -int ofi_wait_fd_del(struct util_wait *wait, int fd) +int ofi_wait_fdset_del(struct util_wait_fd *wait_fd, int fd) +{ + wait_fd->change_index++; + + return (wait_fd->util_wait.wait_obj == FI_WAIT_FD) ? + ofi_epoll_del(wait_fd->epoll_fd, fd) : + ofi_pollfds_del(wait_fd->pollfds, fd); +} + +static int ofi_wait_fdset_add(struct util_wait_fd *wait_fd, int fd, + uint32_t events, void *context) +{ + int ret; + + wait_fd->change_index++; + if (wait_fd->util_wait.wait_obj == FI_WAIT_FD) { + ret = ofi_epoll_add(wait_fd->epoll_fd, fd, + ofi_poll_to_epoll(events), context); + } else { + ret = ofi_pollfds_add(wait_fd->pollfds, fd, events, context); + } + return ret; +} + +int ofi_wait_del_fd(struct util_wait *wait, int fd) { - int ret = 0; struct ofi_wait_fd_entry *fd_entry; struct dlist_entry *entry; - struct util_wait_fd *wait_fd = container_of(wait, struct util_wait_fd, - util_wait); + struct util_wait_fd *wait_fd; + int ret = 0; - fastlock_acquire(&wait_fd->lock); - entry = dlist_find_first_match(&wait_fd->fd_list, ofi_wait_fd_match, &fd); + wait_fd = container_of(wait, struct util_wait_fd, util_wait); + fastlock_acquire(&wait->lock); + entry = dlist_find_first_match(&wait_fd->fd_list, ofi_wait_match_fd, &fd); if (!entry) { FI_INFO(wait->prov, FI_LOG_FABRIC, "Given fd (%d) not found in wait list - %p\n", @@ -170,28 +221,30 @@ int ofi_wait_fd_del(struct util_wait *wait, int fd) ret = -FI_EINVAL; goto out; } + fd_entry = container_of(entry, struct ofi_wait_fd_entry, entry); if (ofi_atomic_dec32(&fd_entry->ref)) goto out; + dlist_remove(&fd_entry->entry); - fi_epoll_del(wait_fd->epoll_fd, fd_entry->fd); + ofi_wait_fdset_del(wait_fd, fd_entry->fd); free(fd_entry); out: - fastlock_release(&wait_fd->lock); + fastlock_release(&wait->lock); return ret; } -int ofi_wait_fd_add(struct util_wait *wait, int fd, uint32_t events, - ofi_wait_fd_try_func wait_try, void *arg, void *context) +int ofi_wait_add_fd(struct util_wait *wait, int fd, uint32_t events, + ofi_wait_try_func wait_try, void *arg, void *context) { struct ofi_wait_fd_entry *fd_entry; struct dlist_entry *entry; - struct util_wait_fd *wait_fd = container_of(wait, struct util_wait_fd, - util_wait); + struct util_wait_fd *wait_fd; int ret = 0; - fastlock_acquire(&wait_fd->lock); - entry = dlist_find_first_match(&wait_fd->fd_list, ofi_wait_fd_match, &fd); + wait_fd = container_of(wait, struct util_wait_fd, util_wait); + fastlock_acquire(&wait->lock); + entry = dlist_find_first_match(&wait_fd->fd_list, ofi_wait_match_fd, &fd); if (entry) { FI_DBG(wait->prov, FI_LOG_EP_CTRL, "Given fd (%d) already added to wait list - %p \n", @@ -201,18 +254,20 @@ int ofi_wait_fd_add(struct util_wait *wait, int fd, uint32_t events, goto out; } - ret = fi_epoll_add(wait_fd->epoll_fd, fd, events, context); + 
ret = ofi_wait_fdset_add(wait_fd, fd, events, context); if (ret) { - FI_WARN(wait->prov, FI_LOG_FABRIC, "Unable to add fd to epoll\n"); + FI_WARN(wait->prov, FI_LOG_FABRIC, + "Unable to add fd to epoll\n"); goto out; } fd_entry = calloc(1, sizeof *fd_entry); if (!fd_entry) { ret = -FI_ENOMEM; - fi_epoll_del(wait_fd->epoll_fd, fd); + ofi_wait_fdset_del(wait_fd, fd); goto out; } + fd_entry->fd = fd; fd_entry->wait_try = wait_try; fd_entry->arg = arg; @@ -220,7 +275,7 @@ int ofi_wait_fd_add(struct util_wait *wait, int fd, uint32_t events, dlist_insert_tail(&fd_entry->entry, &wait_fd->fd_list); out: - fastlock_release(&wait_fd->lock); + fastlock_release(&wait->lock); return ret; } @@ -231,8 +286,64 @@ static void util_wait_fd_signal(struct util_wait *util_wait) fd_signal_set(&wait->signal); } +static int util_wait_update_pollfd(struct util_wait_fd *wait_fd, + struct ofi_wait_fid_entry *fid_entry) +{ + struct fi_wait_pollfd pollfds = { 0 }; + struct pollfd *fds; + size_t i; + int ret; + + ret = fi_control(fid_entry->fid, FI_GETWAIT, &pollfds); + if (ret != FI_ETOOSMALL) + return ret; + + if (pollfds.change_index == fid_entry->pollfds.change_index) + return 0; + + fds = fid_entry->pollfds.fd; + for (i = 0; i < fid_entry->pollfds.nfds; i++) { + ret = ofi_wait_fdset_del(wait_fd, fds->fd); + if (ret) { + FI_WARN(wait_fd->util_wait.prov, FI_LOG_EP_CTRL, + "epoll_del failed %s\n", fi_strerror(ret)); + } + } + + if (fid_entry->pollfds.nfds < pollfds.nfds) { + fds = calloc(pollfds.nfds, sizeof(*fds)); + if (!fds) + return -FI_ENOMEM; + + free(fid_entry->pollfds.fd); + fid_entry->pollfds.fd = fds; + fid_entry->pollfds.nfds = pollfds.nfds; + } + + ret = fi_control(fid_entry->fid, FI_GETWAIT, &fid_entry->pollfds); + if (ret) { + FI_WARN(wait_fd->util_wait.prov, FI_LOG_EP_CTRL, + "unable to get wait pollfd %s\n", fi_strerror(ret)); + return ret; + } + + fds = fid_entry->pollfds.fd; + for (i = 0; i < fid_entry->pollfds.nfds; i++) { + ret = ofi_wait_fdset_add(wait_fd, fds[i].fd, fds[i].events, + fid_entry->fid->context); + if (ret) { + FI_WARN(wait_fd->util_wait.prov, FI_LOG_EP_CTRL, + "unable to add fd %s\n", fi_strerror(ret)); + return ret; + } + } + + return -FI_EAGAIN; +} + static int util_wait_fd_try(struct util_wait *wait) { + struct ofi_wait_fid_entry *fid_entry; struct ofi_wait_fd_entry *fd_entry; struct util_wait_fd *wait_fd; void *context; @@ -240,18 +351,34 @@ static int util_wait_fd_try(struct util_wait *wait) wait_fd = container_of(wait, struct util_wait_fd, util_wait); fd_signal_reset(&wait_fd->signal); - fastlock_acquire(&wait_fd->lock); + fastlock_acquire(&wait->lock); dlist_foreach_container(&wait_fd->fd_list, struct ofi_wait_fd_entry, fd_entry, entry) { ret = fd_entry->wait_try(fd_entry->arg); - if (ret != FI_SUCCESS) { - fastlock_release(&wait_fd->lock); - return ret; + if (ret != FI_SUCCESS) + goto release; + } + + dlist_foreach_container(&wait->fid_list, + struct ofi_wait_fid_entry, fid_entry, entry) { + if (fid_entry->wait_obj == FI_WAIT_POLLFD) { + ret = util_wait_update_pollfd(wait_fd, fid_entry); + if (ret) + goto release; } + + ret = fid_entry->wait_try(fid_entry->fid); + if (ret != FI_SUCCESS) + goto release; } - fastlock_release(&wait_fd->lock); + + fastlock_release(&wait->lock); ret = fi_poll(&wait->pollset->poll_fid, &context, 1); return (ret > 0) ? -FI_EAGAIN : (ret == -FI_EAGAIN) ? 
FI_SUCCESS : ret; + +release: + fastlock_release(&wait->lock); + return ret; } static int util_wait_fd_run(struct fid_wait *wait_fid, int timeout) @@ -272,7 +399,9 @@ static int util_wait_fd_run(struct fid_wait *wait_fid, int timeout) if (ofi_adjust_timeout(endtime, &timeout)) return -FI_ETIMEDOUT; - ret = fi_epoll_wait(wait->epoll_fd, ep_context, 1, timeout); + ret = (wait->util_wait.wait_obj == FI_WAIT_FD) ? + ofi_epoll_wait(wait->epoll_fd, ep_context, 1, timeout) : + ofi_pollfds_wait(wait->pollfds, ep_context, 1, timeout); if (ret > 0) return FI_SUCCESS; @@ -287,17 +416,37 @@ static int util_wait_fd_run(struct fid_wait *wait_fid, int timeout) static int util_wait_fd_control(struct fid *fid, int command, void *arg) { struct util_wait_fd *wait; + struct fi_wait_pollfd *pollfd; int ret; wait = container_of(fid, struct util_wait_fd, util_wait.wait_fid.fid); switch (command) { case FI_GETWAIT: + if (wait->util_wait.wait_obj == FI_WAIT_FD) { #ifdef HAVE_EPOLL - *(int *) arg = wait->epoll_fd; - ret = 0; + *(int *) arg = wait->epoll_fd; + return 0; #else - ret = -FI_ENOSYS; + return -FI_ENODATA; #endif + } + + pollfd = arg; + fastlock_acquire(&wait->util_wait.lock); + if (pollfd->nfds >= wait->pollfds->nfds) { + memcpy(pollfd->fd, &wait->pollfds->fds[0], + wait->pollfds->nfds * sizeof(*wait->pollfds->fds)); + ret = 0; + } else { + ret = -FI_ETOOSMALL; + } + pollfd->change_index = wait->change_index; + pollfd->nfds = wait->pollfds->nfds; + fastlock_release(&wait->util_wait.lock); + break; + case FI_GETWAITOBJ: + *(enum fi_wait_obj *) arg = wait->util_wait.wait_obj; + ret = 0; break; default: FI_INFO(wait->util_wait.prov, FI_LOG_FABRIC, @@ -315,23 +464,27 @@ static int util_wait_fd_close(struct fid *fid) int ret; wait = container_of(fid, struct util_wait_fd, util_wait.wait_fid.fid); - ret = fi_wait_cleanup(&wait->util_wait); - if (ret) - return ret; - fastlock_acquire(&wait->lock); + fastlock_acquire(&wait->util_wait.lock); while (!dlist_empty(&wait->fd_list)) { dlist_pop_front(&wait->fd_list, struct ofi_wait_fd_entry, fd_entry, entry); - fi_epoll_del(wait->epoll_fd, fd_entry->fd); + ofi_wait_fdset_del(wait, fd_entry->fd); free(fd_entry); } - fastlock_release(&wait->lock); + fastlock_release(&wait->util_wait.lock); + + ret = fi_wait_cleanup(&wait->util_wait); + if (ret) + return ret; - fi_epoll_del(wait->epoll_fd, wait->signal.fd[FI_READ_FD]); + ofi_wait_fdset_del(wait, wait->signal.fd[FI_READ_FD]); fd_signal_free(&wait->signal); - fi_epoll_close(wait->epoll_fd); - fastlock_destroy(&wait->lock); + + if (wait->util_wait.wait_obj == FI_WAIT_FD) + ofi_epoll_close(wait->epoll_fd); + else + ofi_pollfds_close(wait->pollfds); free(wait); return 0; } @@ -361,6 +514,7 @@ static int util_verify_wait_fd_attr(const struct fi_provider *prov, switch (attr->wait_obj) { case FI_WAIT_UNSPEC: case FI_WAIT_FD: + case FI_WAIT_POLLFD: break; default: FI_WARN(prov, FI_LOG_FABRIC, "unsupported wait object\n"); @@ -386,7 +540,7 @@ int ofi_wait_fd_open(struct fid_fabric *fabric_fid, struct fi_wait_attr *attr, if (!wait) return -FI_ENOMEM; - ret = fi_wait_init(fabric, attr, &wait->util_wait); + ret = ofi_wait_init(fabric, attr, &wait->util_wait); if (ret) goto err1; @@ -396,12 +550,14 @@ int ofi_wait_fd_open(struct fid_fabric *fabric_fid, struct fi_wait_attr *attr, if (ret) goto err2; - ret = fi_epoll_create(&wait->epoll_fd); + ret = (wait->util_wait.wait_obj == FI_WAIT_FD) ? 
+ ofi_epoll_create(&wait->epoll_fd) : + ofi_pollfds_create(&wait->pollfds); if (ret) goto err3; - ret = fi_epoll_add(wait->epoll_fd, wait->signal.fd[FI_READ_FD], - FI_EPOLL_IN, &wait->util_wait.wait_fid.fid); + ret = ofi_wait_fdset_add(wait, wait->signal.fd[FI_READ_FD], + POLLIN, &wait->util_wait.wait_fid.fid); if (ret) goto err4; @@ -409,13 +565,15 @@ int ofi_wait_fd_open(struct fid_fabric *fabric_fid, struct fi_wait_attr *attr, wait->util_wait.wait_fid.ops = &util_wait_fd_ops; dlist_init(&wait->fd_list); - fastlock_init(&wait->lock); *waitset = &wait->util_wait.wait_fid; return 0; err4: - fi_epoll_close(wait->epoll_fd); + if (wait->util_wait.wait_obj == FI_WAIT_FD) + ofi_epoll_close(wait->epoll_fd); + else + ofi_pollfds_close(wait->pollfds); err3: fd_signal_free(&wait->signal); err2: @@ -424,3 +582,290 @@ int ofi_wait_fd_open(struct fid_fabric *fabric_fid, struct fi_wait_attr *attr, free(wait); return ret; } + +static void util_wait_yield_signal(struct util_wait *util_wait) +{ + struct util_wait_yield *wait_yield; + + wait_yield = container_of(util_wait, struct util_wait_yield, util_wait); + + fastlock_acquire(&wait_yield->signal_lock); + wait_yield->signal = 1; + fastlock_release(&wait_yield->signal_lock); +} + +static int util_wait_yield_run(struct fid_wait *wait_fid, int timeout) +{ + struct util_wait_yield *wait; + struct ofi_wait_fid_entry *fid_entry; + int ret = 0; + + wait = container_of(wait_fid, struct util_wait_yield, util_wait.wait_fid); + while (!wait->signal) { + fastlock_acquire(&wait->util_wait.lock); + dlist_foreach_container(&wait->util_wait.fid_list, + struct ofi_wait_fid_entry, + fid_entry, entry) { + ret = fid_entry->wait_try(fid_entry->fid); + if (ret) { + fastlock_release(&wait->util_wait.lock); + return ret; + } + } + fastlock_release(&wait->util_wait.lock); + pthread_yield(); + } + + fastlock_acquire(&wait->signal_lock); + wait->signal = 0; + fastlock_release(&wait->signal_lock); + + return FI_SUCCESS; +} + +static int util_wait_yield_close(struct fid *fid) +{ + struct util_wait_yield *wait; + int ret; + + wait = container_of(fid, struct util_wait_yield, util_wait.wait_fid.fid); + ret = fi_wait_cleanup(&wait->util_wait); + if (ret) + return ret; + + fastlock_destroy(&wait->signal_lock); + free(wait); + return 0; +} + +static struct fi_ops_wait util_wait_yield_ops = { + .size = sizeof(struct fi_ops_wait), + .wait = util_wait_yield_run, +}; + +static struct fi_ops util_wait_yield_fi_ops = { + .size = sizeof(struct fi_ops), + .close = util_wait_yield_close, + .bind = fi_no_bind, + .control = fi_no_control, + .ops_open = fi_no_ops_open, +}; + +static int util_verify_wait_yield_attr(const struct fi_provider *prov, + const struct fi_wait_attr *attr) +{ + int ret; + + ret = ofi_check_wait_attr(prov, attr); + if (ret) + return ret; + + switch (attr->wait_obj) { + case FI_WAIT_UNSPEC: + case FI_WAIT_YIELD: + break; + default: + FI_WARN(prov, FI_LOG_FABRIC, "unsupported wait object\n"); + return -FI_EINVAL; + } + + return 0; +} + +int ofi_wait_yield_open(struct fid_fabric *fabric_fid, struct fi_wait_attr *attr, + struct fid_wait **waitset) +{ + struct util_fabric *fabric; + struct util_wait_yield *wait; + int ret; + + fabric = container_of(fabric_fid, struct util_fabric, fabric_fid); + ret = util_verify_wait_yield_attr(fabric->prov, attr); + if (ret) + return ret; + + attr->wait_obj = FI_WAIT_YIELD; + wait = calloc(1, sizeof(*wait)); + if (!wait) + return -FI_ENOMEM; + + ret = ofi_wait_init(fabric, attr, &wait->util_wait); + if (ret) { + free(wait); + return ret; + } + + 
wait->util_wait.signal = util_wait_yield_signal; + wait->signal = 0; + + wait->util_wait.wait_fid.fid.ops = &util_wait_yield_fi_ops; + wait->util_wait.wait_fid.ops = &util_wait_yield_ops; + + fastlock_init(&wait->signal_lock); + + *waitset = &wait->util_wait.wait_fid; + + return 0; +} + +static int ofi_wait_match_fid(struct dlist_entry *item, const void *arg) +{ + struct ofi_wait_fid_entry *fid_entry; + + fid_entry = container_of(item, struct ofi_wait_fid_entry, entry); + return fid_entry->fid == arg; +} + +int ofi_wait_del_fid(struct util_wait *wait, fid_t fid) +{ + struct ofi_wait_fid_entry *fid_entry; + struct util_wait_fd *wait_fd; + struct dlist_entry *entry; + struct pollfd *fds; + size_t i; + int ret = 0; + + fastlock_acquire(&wait->lock); + entry = dlist_find_first_match(&wait->fid_list, + ofi_wait_match_fid, fid); + if (!entry) { + FI_INFO(wait->prov, FI_LOG_EP_CTRL, + "Given fid (%p) not found in wait list - %p\n", + fid, wait); + ret = -FI_EINVAL; + goto out; + } + + fid_entry = container_of(entry, struct ofi_wait_fid_entry, entry); + if (ofi_atomic_dec32(&fid_entry->ref)) + goto out; + + wait_fd = container_of(wait, struct util_wait_fd, util_wait); + fds = fid_entry->pollfds.fd; + for (i = 0; i < fid_entry->pollfds.nfds; i++) { + assert(fds); + ret = ofi_wait_fdset_del(wait_fd, fds->fd); + if (ret) { + FI_WARN(wait->prov, FI_LOG_EP_CTRL, + "epoll_del failed %s\n", fi_strerror(ret)); + } + } + + dlist_remove(&fid_entry->entry); + free(fid_entry->pollfds.fd); + free(fid_entry); +out: + fastlock_release(&wait->lock); + return ret; +} + +static int ofi_wait_get_fd(struct util_wait_fd *wait_fd, + struct ofi_wait_fid_entry *fid_entry) +{ + struct pollfd *fds; + int ret; + + fds = calloc(1, sizeof(*fds)); + if (!fds) + return -FI_ENOMEM; + + ret = fi_control(fid_entry->fid, FI_GETWAIT, &fds->fd); + if (ret) { + FI_WARN(wait_fd->util_wait.prov, FI_LOG_EP_CTRL, + "unable to get wait fd %s\n", fi_strerror(ret)); + goto free; + } + + fds->events = fid_entry->events; + fid_entry->pollfds.fd = fds; + fid_entry->pollfds.nfds = 1; + return 0; + +free: + free(fds); + return ret; +} + +static int ofi_wait_get_fid_fds(struct util_wait *wait, + struct ofi_wait_fid_entry *fid_entry) +{ + struct util_wait_fd *wait_fd; + struct pollfd *fds; + size_t i; + int ret; + + ret = fi_control(fid_entry->fid, FI_GETWAITOBJ, + &fid_entry->wait_obj); + if ((fid_entry->wait_obj != FI_WAIT_FD) && + (fid_entry->wait_obj != FI_WAIT_POLLFD)) { + FI_WARN(wait->prov, FI_LOG_EP_CTRL, + "unsupported wait object %d (ret: %s)\n", + fid_entry->wait_obj, fi_strerror(ret)); + return ret; + } + + /* pollfd is updated during trywait */ + if (fid_entry->wait_obj == FI_WAIT_POLLFD) + return 0; + + wait_fd = container_of(wait, struct util_wait_fd, util_wait); + ret = ofi_wait_get_fd(wait_fd, fid_entry); + if (ret) + return ret; + + fds = fid_entry->pollfds.fd; + for (i = 0; i < fid_entry->pollfds.nfds; i++) { + ret = ofi_wait_fdset_add(wait_fd, fds[i].fd, fds[i].events, + fid_entry->fid->context); + if (ret) { + FI_WARN(wait->prov, FI_LOG_EP_CTRL, + "unable to add fd %s\n", fi_strerror(ret)); + return ret; + } + } + + return 0; +} + +int ofi_wait_add_fid(struct util_wait *wait, fid_t fid, uint32_t events, + ofi_wait_try_func wait_try) +{ + struct ofi_wait_fid_entry *fid_entry; + struct dlist_entry *entry; + int ret = 0; + + fastlock_acquire(&wait->lock); + entry = dlist_find_first_match(&wait->fid_list, + ofi_wait_match_fid, fid); + if (entry) { + FI_DBG(wait->prov, FI_LOG_EP_CTRL, + "Given fid (%p) already added to wait list - 
%p \n", + fid, wait); + fid_entry = container_of(entry, struct ofi_wait_fid_entry, entry); + ofi_atomic_inc32(&fid_entry->ref); + goto out; + } + + fid_entry = calloc(1, sizeof *fid_entry); + if (!fid_entry) { + ret = -FI_ENOMEM; + goto out; + } + + fid_entry->fid = fid; + fid_entry->wait_try = wait_try; + fid_entry->events = events; + ofi_atomic_initialize32(&fid_entry->ref, 1); + + if (wait->wait_obj == FI_WAIT_FD || wait->wait_obj == FI_WAIT_POLLFD) { + ret = ofi_wait_get_fid_fds(wait, fid_entry); + if (ret) { + free(fid_entry); + goto out; + } + } + dlist_insert_tail(&fid_entry->entry, &wait->fid_list); +out: + fastlock_release(&wait->lock); + return ret; +} diff --git a/prov/verbs/configure.m4 b/prov/verbs/configure.m4 index 3ae5c6c4f23..2d51072c6f1 100644 --- a/prov/verbs/configure.m4 +++ b/prov/verbs/configure.m4 @@ -47,6 +47,16 @@ AC_DEFUN([FI_VERBS_CONFIGURE],[ AS_IF([test $verbs_ibverbs_happy -eq 1 && \ test $verbs_rdmacm_happy -eq 1], [$1], [$2]) + #See if we have extended verbs calls + VERBS_HAVE_QUERY_EX=0 + AS_IF([test $verbs_ibverbs_happy -eq 1],[ + AC_CHECK_DECL([ibv_query_device_ex], + [VERBS_HAVE_QUERY_EX=1],[], + [#include ]) + ]) + AC_DEFINE_UNQUOTED([VERBS_HAVE_QUERY_EX],[$VERBS_HAVE_QUERY_EX], + [Whether infiniband/verbs.h has ibv_query_device_ex() support or not]) + #See if we have XRC support VERBS_HAVE_XRC=0 AS_IF([test $verbs_ibverbs_happy -eq 1 && \ @@ -58,6 +68,17 @@ AC_DEFUN([FI_VERBS_CONFIGURE],[ AC_DEFINE_UNQUOTED([VERBS_HAVE_XRC],[$VERBS_HAVE_XRC], [Whether infiniband/verbs.h has XRC support or not]) + #See if we have rdma-core rdma_establish support + VERBS_HAVE_RDMA_ESTABLISH=0 + AS_IF([test $verbs_ibverbs_happy -eq 1 && \ + test $verbs_rdmacm_ex_happy -eq 1],[ + AC_CHECK_DECL([rdma_establish], + [VERBS_HAVE_RDMA_ESTABLISH=1],[], + [#include ]) + ]) + AC_DEFINE_UNQUOTED([VERBS_HAVE_RDMA_ESTABLISH],[$VERBS_HAVE_RDMA_ESTABLISH], + [Whether rdma/rdma_cma.h has rdma_establish() support or not]) + # Technically, verbs_ibverbs_CPPFLAGS and # verbs_rdmacm_CPPFLAGS could be different, but it is highly # unlikely that they ever will be. 
So only list diff --git a/prov/verbs/src/fi_verbs.c b/prov/verbs/src/fi_verbs.c index 15c55f2fd23..665b22d69fe 100644 --- a/prov/verbs/src/fi_verbs.c +++ b/prov/verbs/src/fi_verbs.c @@ -35,20 +35,22 @@ #include #include "fi_verbs.h" +#include "ofi_hmem.h" -static void fi_ibv_fini(void); +static void vrb_fini(void); static const char *local_node = "localhost"; #define VERBS_DEFAULT_MIN_RNR_TIMER 12 -struct fi_ibv_gl_data fi_ibv_gl_data = { +struct vrb_gl_data vrb_gl_data = { .def_tx_size = 384, .def_rx_size = 384, .def_tx_iov_limit = 4, .def_rx_iov_limit = 4, .def_inline_size = 256, .min_rnr_timer = VERBS_DEFAULT_MIN_RNR_TIMER, + .use_odp = 0, .cqread_bunch_size = 8, .iface = NULL, .gid_idx = 0, @@ -65,7 +67,7 @@ struct fi_ibv_gl_data fi_ibv_gl_data = { }, }; -struct fi_ibv_dev_preset { +struct vrb_dev_preset { int max_inline_data; const char *dev_name_prefix; } verbs_dev_presets[] = { @@ -75,24 +77,24 @@ struct fi_ibv_dev_preset { }, }; -struct fi_provider fi_ibv_prov = { +struct fi_provider vrb_prov = { .name = VERBS_PROV_NAME, - .version = VERBS_PROV_VERS, - .fi_version = FI_VERSION(1, 8), - .getinfo = fi_ibv_getinfo, - .fabric = fi_ibv_fabric, - .cleanup = fi_ibv_fini + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST, + .getinfo = vrb_getinfo, + .fabric = vrb_fabric, + .cleanup = vrb_fini }; -struct util_prov fi_ibv_util_prov = { - .prov = &fi_ibv_prov, +struct util_prov vrb_util_prov = { + .prov = &vrb_prov, .info = NULL, /* The support of the shared recieve contexts * is dynamically calculated */ .flags = 0, }; -int fi_ibv_sockaddr_len(struct sockaddr *addr) +int vrb_sockaddr_len(struct sockaddr *addr) { if (addr->sa_family == AF_IB) return sizeof(struct sockaddr_ib); @@ -100,13 +102,17 @@ int fi_ibv_sockaddr_len(struct sockaddr *addr) return ofi_sizeofaddr(addr); } -int fi_ibv_get_rdma_rai(const char *node, const char *service, uint64_t flags, - const struct fi_info *hints, struct rdma_addrinfo **rai) +static int +vrb_get_rdmacm_rai(const char *node, const char *service, uint64_t flags, + uint32_t addr_format, void *src_addr, size_t src_addrlen, + void *dest_addr, size_t dest_addrlen, struct rdma_addrinfo **rai) { struct rdma_addrinfo rai_hints, *_rai; - struct rdma_addrinfo **rai_current; - int ret = fi_ibv_fi_to_rai(hints, flags, &rai_hints); + struct rdma_addrinfo **cur, *next; + int ret; + ret = vrb_set_rai(addr_format, src_addr, src_addrlen, dest_addr, + dest_addrlen, flags, &rai_hints); if (ret) goto out; @@ -116,37 +122,32 @@ int fi_ibv_get_rdma_rai(const char *node, const char *service, uint64_t flags, rai_hints.ai_flags |= RAI_PASSIVE; } - ret = rdma_getaddrinfo((char *) node, (char *) service, - &rai_hints, &_rai); + ret = rdma_getaddrinfo(node, service, &rai_hints, &_rai); if (ret) { VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_getaddrinfo", errno); - if (errno) { + if (errno) ret = -errno; - } goto out; } /* - * If caller requested rai, remove ib_rai entries added by IBACM to + * Remove ib_rai entries added by IBACM to * prevent wrong ib_connect_hdr from being sent in connect request. 
*/ - if (rai && hints && (hints->addr_format != FI_SOCKADDR_IB)) { - for (rai_current = &_rai; *rai_current;) { - struct rdma_addrinfo *rai_next; - if ((*rai_current)->ai_family == AF_IB) { - rai_next = (*rai_current)->ai_next; - (*rai_current)->ai_next = NULL; - rdma_freeaddrinfo(*rai_current); - *rai_current = rai_next; - continue; + if (addr_format && (addr_format != FI_SOCKADDR_IB)) { + for (cur = &_rai; *cur; ) { + if ((*cur)->ai_family == AF_IB) { + next = (*cur)->ai_next; + (*cur)->ai_next = NULL; + rdma_freeaddrinfo(*cur); + *cur = next; + } else { + cur = &(*cur)->ai_next; } - rai_current = &(*rai_current)->ai_next; } } - if (rai) - *rai = _rai; - + *rai = _rai; out: if (rai_hints.ai_src_addr) free(rai_hints.ai_src_addr); @@ -155,18 +156,123 @@ int fi_ibv_get_rdma_rai(const char *node, const char *service, uint64_t flags, return ret; } -int fi_ibv_get_rai_id(const char *node, const char *service, uint64_t flags, +static int +vrb_get_sib_rai(const char *node, const char *service, uint64_t flags, + uint32_t addr_format, void *src_addr, size_t src_addrlen, + void *dest_addr, size_t dest_addrlen, struct rdma_addrinfo **rai) +{ + struct sockaddr_ib *sib; + size_t sib_len; + char *straddr; + uint32_t fmt; + int ret; + bool has_prefix; + const char *prefix = "fi_sockaddr_ib://"; + + *rai = calloc(1, sizeof(struct rdma_addrinfo)); + if (*rai == NULL) + return -FI_ENOMEM; + + ret = vrb_set_rai(addr_format, src_addr, src_addrlen, dest_addr, + dest_addrlen, flags, *rai); + if (ret) + return ret; + + if (node) { + fmt = ofi_addr_format(node); + if (fmt == FI_SOCKADDR_IB) + has_prefix = true; + else if (fmt == FI_FORMAT_UNSPEC) + has_prefix = false; + else + return -FI_EINVAL; + + if (service) { + ret = asprintf(&straddr, "%s%s:%s", has_prefix ? "" : prefix, + node, service); + } else { + ret = asprintf(&straddr, "%s%s", has_prefix ? 
"" : prefix, node); + } + + if (ret == -1) + return -FI_ENOMEM; + + ret = ofi_str_toaddr(straddr, &fmt, (void **)&sib, &sib_len); + free(straddr); + + if (ret || fmt != FI_SOCKADDR_IB) { + return -FI_EINVAL; + } + + if (flags & FI_SOURCE) { + (*rai)->ai_flags |= RAI_PASSIVE; + if ((*rai)->ai_src_addr) + free((*rai)->ai_src_addr); + (*rai)->ai_src_addr = (void *)sib; + (*rai)->ai_src_len = sizeof(struct sockaddr_ib); + } else { + if ((*rai)->ai_dst_addr) + free((*rai)->ai_dst_addr); + (*rai)->ai_dst_addr = (void *)sib; + (*rai)->ai_dst_len = sizeof(struct sockaddr_ib); + } + + } else if (service) { + if ((flags & FI_SOURCE) && (*rai)->ai_src_addr) { + if ((*rai)->ai_src_len < sizeof(struct sockaddr_ib)) + return -FI_EINVAL; + + (*rai)->ai_src_len = sizeof(struct sockaddr_ib); + sib = (struct sockaddr_ib *)(*rai)->ai_src_addr; + } else { + if ((*rai)->ai_dst_len < sizeof(struct sockaddr_ib)) + return -FI_EINVAL; + + (*rai)->ai_dst_len = sizeof(struct sockaddr_ib); + sib = (struct sockaddr_ib *)(*rai)->ai_dst_addr; + } + + sib->sib_sid = htonll(((uint64_t) RDMA_PS_IB << 16) + (uint16_t)atoi(service)); + sib->sib_sid_mask = htonll(OFI_IB_IP_PS_MASK | OFI_IB_IP_PORT_MASK); + } + + return 0; +} + +static int +vrb_get_rdma_rai(const char *node, const char *service, uint32_t addr_format, + void *src_addr, size_t src_addrlen, void *dest_addr, + size_t dest_addrlen, uint64_t flags, struct rdma_addrinfo **rai) +{ + if (addr_format == FI_SOCKADDR_IB && (node || src_addr || dest_addr)) { + return vrb_get_sib_rai(node, service, flags, addr_format, src_addr, + src_addrlen, dest_addr, dest_addrlen, rai); + } + + return vrb_get_rdmacm_rai(node, service, flags, addr_format, src_addr, + src_addrlen, dest_addr, dest_addrlen, rai); +} + +int vrb_get_rai_id(const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct rdma_addrinfo **rai, struct rdma_cm_id **id) { int ret; // TODO create a similar function that won't require pruning ib_rai - ret = fi_ibv_get_rdma_rai(node, service, flags, hints, rai); + if (hints) { + ret = vrb_get_rdma_rai(node, service, hints->addr_format, hints->src_addr, + hints->src_addrlen, hints->dest_addr, + hints->dest_addrlen, flags, rai); + } else { + ret = vrb_get_rdma_rai(node, service, FI_FORMAT_UNSPEC, NULL, 0, NULL, + 0, flags, rai); + } if (ret) return ret; - ret = rdma_create_id(NULL, id, NULL, RDMA_PS_TCP); + ret = rdma_create_id(NULL, id, NULL, vrb_get_port_space(hints ? 
hints->addr_format: + FI_FORMAT_UNSPEC)); if (ret) { VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_create_id", errno); ret = -errno; @@ -177,6 +283,8 @@ int fi_ibv_get_rai_id(const char *node, const char *service, uint64_t flags, ret = rdma_bind_addr(*id, (*rai)->ai_src_addr); if (ret) { VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_bind_addr", errno); + ofi_straddr_log(&vrb_prov, FI_LOG_INFO, FI_LOG_FABRIC, + "bind addr", (*rai)->ai_src_addr); ret = -errno; goto err2; } @@ -184,9 +292,13 @@ int fi_ibv_get_rai_id(const char *node, const char *service, uint64_t flags, } ret = rdma_resolve_addr(*id, (*rai)->ai_src_addr, - (*rai)->ai_dst_addr, 2000); + (*rai)->ai_dst_addr, VERBS_RESOLVE_TIMEOUT); if (ret) { VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_resolve_addr", errno); + ofi_straddr_log(&vrb_prov, FI_LOG_INFO, FI_LOG_FABRIC, + "src addr", (*rai)->ai_src_addr); + ofi_straddr_log(&vrb_prov, FI_LOG_INFO, FI_LOG_FABRIC, + "dst addr", (*rai)->ai_dst_addr); ret = -errno; goto err2; } @@ -199,45 +311,57 @@ int fi_ibv_get_rai_id(const char *node, const char *service, uint64_t flags, return ret; } -int fi_ibv_create_ep(const char *node, const char *service, - uint64_t flags, const struct fi_info *hints, - struct rdma_addrinfo **rai, struct rdma_cm_id **id) +int vrb_create_ep(struct vrb_ep *ep, enum rdma_port_space ps, + struct rdma_cm_id **id) { - struct rdma_addrinfo *_rai = NULL; + struct rdma_addrinfo *rai = NULL; int ret; - ret = fi_ibv_get_rdma_rai(node, service, flags, hints, &_rai); + ret = vrb_get_rdma_rai(NULL, NULL, ep->info_attr.addr_format, + ep->info_attr.src_addr, ep->info_attr.src_addrlen, + ep->info_attr.dest_addr, ep->info_attr.dest_addrlen, + 0, &rai); if (ret) { return ret; } - ret = rdma_create_ep(id, _rai, NULL, NULL); - if (ret) { - VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_create_ep", errno); + if (rdma_create_id(NULL, id, NULL, ps)) { ret = -errno; + FI_WARN(&vrb_prov, FI_LOG_FABRIC, "rdma_create_id failed: " + "%s (%d)\n", strerror(-ret), -ret); goto err1; } - if (rai) { - *rai = _rai; - } else { - rdma_freeaddrinfo(_rai); + /* TODO convert this call to non-blocking (use event channel) as well: + * This may likely be needed for better scaling when running large + * MPI jobs. + * Making this non-blocking would mean we can't create QP at EP enable + * time. We need to wait for RDMA_CM_EVENT_ADDR_RESOLVED event before + * creating the QP using rdma_create_qp. It would also require a SW + * receive queue to store recvs posted by app after enabling the EP. 
+ */ + if (rdma_resolve_addr(*id, rai->ai_src_addr, rai->ai_dst_addr, + VERBS_RESOLVE_TIMEOUT)) { + ret = -errno; + FI_WARN(&vrb_prov, FI_LOG_EP_CTRL, "rdma_resolve_addr failed: " + "%s (%d)\n", strerror(-ret), -ret); + ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, + "src addr", rai->ai_src_addr); + ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, + "dst addr", rai->ai_dst_addr); + goto err2; } + rdma_freeaddrinfo(rai); + return 0; - return ret; +err2: + rdma_destroy_id(*id); err1: - rdma_freeaddrinfo(_rai); - - return ret; -} - -void fi_ibv_destroy_ep(struct rdma_addrinfo *rai, struct rdma_cm_id **id) -{ rdma_freeaddrinfo(rai); - rdma_destroy_ep(*id); + return ret; } -static int fi_ibv_param_define(const char *param_name, const char *param_str, +static int vrb_param_define(const char *param_name, const char *param_str, enum fi_param_type type, void *param_default) { char *param_help, param_default_str[256] = { 0 }; @@ -292,7 +416,7 @@ static int fi_ibv_param_define(const char *param_name, const char *param_str, param_help[len - 1] = '\0'; - fi_param_define(&fi_ibv_prov, param_name, type, param_help); + fi_param_define(&vrb_prov, param_name, type, param_help); free(param_help); fn: @@ -300,7 +424,7 @@ static int fi_ibv_param_define(const char *param_name, const char *param_str, } #if ENABLE_DEBUG -static int fi_ibv_dbg_query_qp_attr(struct ibv_qp *qp) +static int vrb_dbg_query_qp_attr(struct ibv_qp *qp) { struct ibv_qp_init_attr attr = { 0 }; struct ibv_qp_attr qp_attr = { 0 }; @@ -312,7 +436,7 @@ static int fi_ibv_dbg_query_qp_attr(struct ibv_qp *qp) VERBS_WARN(FI_LOG_EP_CTRL, "Unable to query QP\n"); return ret; } - FI_DBG(&fi_ibv_prov, FI_LOG_EP_CTRL, "QP attributes: " + FI_DBG(&vrb_prov, FI_LOG_EP_CTRL, "QP attributes: " "min_rnr_timer" ": %" PRIu8 ", " "timeout" ": %" PRIu8 ", " "retry_cnt" ": %" PRIu8 ", " @@ -322,24 +446,24 @@ static int fi_ibv_dbg_query_qp_attr(struct ibv_qp *qp) return 0; } #else -static int fi_ibv_dbg_query_qp_attr(struct ibv_qp *qp) +static int vrb_dbg_query_qp_attr(struct ibv_qp *qp) { return 0; } #endif -int fi_ibv_set_rnr_timer(struct ibv_qp *qp) +int vrb_set_rnr_timer(struct ibv_qp *qp) { struct ibv_qp_attr attr = { 0 }; int ret; - if (fi_ibv_gl_data.min_rnr_timer > 31) { + if (vrb_gl_data.min_rnr_timer > 31) { VERBS_WARN(FI_LOG_EQ, "min_rnr_timer value out of valid range; " "using default value of %d\n", VERBS_DEFAULT_MIN_RNR_TIMER); attr.min_rnr_timer = VERBS_DEFAULT_MIN_RNR_TIMER; } else { - attr.min_rnr_timer = fi_ibv_gl_data.min_rnr_timer; + attr.min_rnr_timer = vrb_gl_data.min_rnr_timer; } /* XRC initiator QP do not have responder logic */ @@ -351,13 +475,13 @@ int fi_ibv_set_rnr_timer(struct ibv_qp *qp) VERBS_WARN(FI_LOG_EQ, "Unable to modify QP attribute\n"); return ret; } - ret = fi_ibv_dbg_query_qp_attr(qp); + ret = vrb_dbg_query_qp_attr(qp); if (ret) return ret; return 0; } -int fi_ibv_find_max_inline(struct ibv_pd *pd, struct ibv_context *context, +int vrb_find_max_inline(struct ibv_pd *pd, struct ibv_context *context, enum ibv_qp_type qp_type) { struct ibv_qp_init_attr qp_attr; @@ -382,7 +506,7 @@ int fi_ibv_find_max_inline(struct ibv_pd *pd, struct ibv_context *context, qp_attr.qp_type = qp_type; qp_attr.cap.max_send_wr = 1; qp_attr.cap.max_send_sge = 1; - if (!fi_ibv_is_xrc_send_qp(qp_type)) { + if (qp_type != IBV_QPT_XRC_SEND) { qp_attr.recv_cq = cq; qp_attr.cap.max_recv_wr = 1; qp_attr.cap.max_recv_sge = 1; @@ -443,37 +567,37 @@ int fi_ibv_find_max_inline(struct ibv_pd *pd, struct ibv_context *context, return rst; } -static 
int fi_ibv_get_param_int(const char *param_name, +static int vrb_get_param_int(const char *param_name, const char *param_str, int *param_default) { int param, ret; - ret = fi_ibv_param_define(param_name, param_str, + ret = vrb_param_define(param_name, param_str, FI_PARAM_INT, param_default); if (ret) return ret; - if (!fi_param_get_int(&fi_ibv_prov, param_name, ¶m)) + if (!fi_param_get_int(&vrb_prov, param_name, ¶m)) *param_default = param; return 0; } -static int fi_ibv_get_param_bool(const char *param_name, +static int vrb_get_param_bool(const char *param_name, const char *param_str, int *param_default) { int param, ret; - ret = fi_ibv_param_define(param_name, param_str, + ret = vrb_param_define(param_name, param_str, FI_PARAM_BOOL, param_default); if (ret) return ret; - if (!fi_param_get_bool(&fi_ibv_prov, param_name, ¶m)) { + if (!fi_param_get_bool(&vrb_prov, param_name, ¶m)) { *param_default = param; if ((*param_default != 1) && (*param_default != 0)) return -FI_EINVAL; @@ -482,156 +606,203 @@ static int fi_ibv_get_param_bool(const char *param_name, return 0; } -static int fi_ibv_get_param_str(const char *param_name, +static int vrb_get_param_str(const char *param_name, const char *param_str, char **param_default) { char *param; int ret; - ret = fi_ibv_param_define(param_name, param_str, + ret = vrb_param_define(param_name, param_str, FI_PARAM_STRING, param_default); if (ret) return ret; - if (!fi_param_get_str(&fi_ibv_prov, param_name, ¶m)) + if (!fi_param_get_str(&vrb_prov, param_name, ¶m)) *param_default = param; return 0; } -static int fi_ibv_read_params(void) +static int vrb_read_params(void) { /* Common parameters */ - if (fi_ibv_get_param_int("tx_size", "Default maximum tx context size", - &fi_ibv_gl_data.def_tx_size) || - (fi_ibv_gl_data.def_tx_size < 0)) { - VERBS_WARN(FI_LOG_CORE, - "Invalid value of tx_size\n"); + if (vrb_get_param_int("tx_size", "Default maximum tx context size", + &vrb_gl_data.def_tx_size) || + (vrb_gl_data.def_tx_size < 0)) { + VERBS_WARN(FI_LOG_CORE, "Invalid value of tx_size\n"); return -FI_EINVAL; } - if (fi_ibv_get_param_int("rx_size", "Default maximum rx context size", - &fi_ibv_gl_data.def_rx_size) || - (fi_ibv_gl_data.def_rx_size < 0)) { - VERBS_WARN(FI_LOG_CORE, - "Invalid value of rx_size\n"); + if (vrb_get_param_int("rx_size", "Default maximum rx context size", + &vrb_gl_data.def_rx_size) || + (vrb_gl_data.def_rx_size < 0)) { + VERBS_WARN(FI_LOG_CORE, "Invalid value of rx_size\n"); return -FI_EINVAL; } - if (fi_ibv_get_param_int("tx_iov_limit", "Default maximum tx iov_limit", - &fi_ibv_gl_data.def_tx_iov_limit) || - (fi_ibv_gl_data.def_tx_iov_limit < 0)) { - VERBS_WARN(FI_LOG_CORE, - "Invalid value of tx_iov_limit\n"); + if (vrb_get_param_int("tx_iov_limit", "Default maximum tx iov_limit", + &vrb_gl_data.def_tx_iov_limit) || + (vrb_gl_data.def_tx_iov_limit < 0)) { + VERBS_WARN(FI_LOG_CORE, "Invalid value of tx_iov_limit\n"); return -FI_EINVAL; } - if (fi_ibv_get_param_int("rx_iov_limit", "Default maximum rx iov_limit", - &fi_ibv_gl_data.def_rx_iov_limit) || - (fi_ibv_gl_data.def_rx_iov_limit < 0)) { - VERBS_WARN(FI_LOG_CORE, - "Invalid value of rx_iov_limit\n"); + if (vrb_get_param_int("rx_iov_limit", "Default maximum rx iov_limit", + &vrb_gl_data.def_rx_iov_limit) || + (vrb_gl_data.def_rx_iov_limit < 0)) { + VERBS_WARN(FI_LOG_CORE, "Invalid value of rx_iov_limit\n"); return -FI_EINVAL; } - if (fi_ibv_get_param_int("inline_size", "Default maximum inline size. 
" - "Actual inject size returned in fi_info may be " - "greater", &fi_ibv_gl_data.def_inline_size) || - (fi_ibv_gl_data.def_inline_size < 0)) { - VERBS_WARN(FI_LOG_CORE, - "Invalid value of inline_size\n"); + if (vrb_get_param_int("inline_size", "Default maximum inline size. " + "Actual inject size returned in fi_info may be " + "greater", &vrb_gl_data.def_inline_size) || + (vrb_gl_data.def_inline_size < 0)) { + VERBS_WARN(FI_LOG_CORE, "Invalid value of inline_size\n"); return -FI_EINVAL; } - if (fi_ibv_get_param_int("min_rnr_timer", "Set min_rnr_timer QP " - "attribute (0 - 31)", - &fi_ibv_gl_data.min_rnr_timer) || - ((fi_ibv_gl_data.min_rnr_timer < 0) || - (fi_ibv_gl_data.min_rnr_timer > 31))) { - VERBS_WARN(FI_LOG_CORE, - "Invalid value of min_rnr_timer\n"); + if (vrb_get_param_int("min_rnr_timer", "Set min_rnr_timer QP " + "attribute (0 - 31)", + &vrb_gl_data.min_rnr_timer) || + ((vrb_gl_data.min_rnr_timer < 0) || + (vrb_gl_data.min_rnr_timer > 31))) { + VERBS_WARN(FI_LOG_CORE, "Invalid value of min_rnr_timer\n"); return -FI_EINVAL; } - if (fi_ibv_get_param_bool("prefer_xrc", "Order XRC transport fi_infos" - "ahead of RC. Default orders RC first.", - &fi_ibv_gl_data.msg.prefer_xrc)) { - VERBS_WARN(FI_LOG_CORE, - "Invalid value of prefer_xrc\n"); + if (vrb_get_param_bool("use_odp", "Enable on-demand paging memory " + "registrations, if supported. This is " + "currently required to register DAX file system " + "mmapped memory.", &vrb_gl_data.use_odp)) { + VERBS_WARN(FI_LOG_CORE, "Invalid value of use_odp\n"); return -FI_EINVAL; } - if (fi_ibv_get_param_str("xrcd_filename", "A file to " - "associate with the XRC domain.", - &fi_ibv_gl_data.msg.xrcd_filename)) { - VERBS_WARN(FI_LOG_CORE, - "Invalid value of xrcd_filename\n"); + if (vrb_get_param_bool("prefer_xrc", "Order XRC transport fi_infos " + "ahead of RC. Default orders RC first. This " + "setting must usually be combined with setting " + "FI_OFI_RXM_USE_SRX. 
See fi_verbs.7 man page.", + &vrb_gl_data.msg.prefer_xrc)) { + VERBS_WARN(FI_LOG_CORE, "Invalid value of prefer_xrc\n"); return -FI_EINVAL; } - if (fi_ibv_get_param_int("cqread_bunch_size", "The number of entries to " - "be read from the verbs completion queue at a time", - &fi_ibv_gl_data.cqread_bunch_size) || - (fi_ibv_gl_data.cqread_bunch_size <= 0)) { - VERBS_WARN(FI_LOG_CORE, - "Invalid value of cqread_bunch_size\n"); + + if (vrb_get_param_str("xrcd_filename", "A file to " + "associate with the XRC domain.", + &vrb_gl_data.msg.xrcd_filename)) { + VERBS_WARN(FI_LOG_CORE, "Invalid value of xrcd_filename\n"); return -FI_EINVAL; } - if (fi_ibv_get_param_str("iface", "The prefix or the full name of the " - "network interface associated with the verbs device", - &fi_ibv_gl_data.iface)) { - VERBS_WARN(FI_LOG_CORE, - "Invalid value of iface\n"); + if (vrb_get_param_int("cqread_bunch_size", "The number of entries to " + "be read from the verbs completion queue at a time", + &vrb_gl_data.cqread_bunch_size) || + (vrb_gl_data.cqread_bunch_size <= 0)) { + VERBS_WARN(FI_LOG_CORE, "Invalid value of cqread_bunch_size\n"); + return -FI_EINVAL; + } + if (vrb_get_param_int("gid_idx", "Set which gid index to use " + "attribute (0 - 255)", &vrb_gl_data.gid_idx) || + (vrb_gl_data.gid_idx < 0 || vrb_gl_data.gid_idx > 255)) { + VERBS_WARN(FI_LOG_CORE, "Invalid value of gid index\n"); return -FI_EINVAL; } - /* DGRAM-specific parameters */ - if (getenv("OMPI_COMM_WORLD_RANK") || getenv("PMI_RANK")) - fi_ibv_gl_data.dgram.use_name_server = 0; - if (fi_ibv_get_param_bool("dgram_use_name_server", "The option that " - "enables/disables OFI Name Server thread that is used " - "to resolve IP-addresses to provider specific " - "addresses. If MPI is used, the NS is disabled " - "by default.", &fi_ibv_gl_data.dgram.use_name_server)) { - VERBS_WARN(FI_LOG_CORE, - "Invalid value of dgram_use_name_server\n"); + if (vrb_get_param_str("device_name", "The prefix or the full name of the " + "verbs device to use", &vrb_gl_data.device_name)) { + VERBS_WARN(FI_LOG_CORE, "Invalid value of device_name\n"); return -FI_EINVAL; } - if (fi_ibv_get_param_int("dgram_name_server_port", "The port on which Name Server " - "thread listens incoming connections and requestes.", - &fi_ibv_gl_data.dgram.name_server_port) || - (fi_ibv_gl_data.dgram.name_server_port < 0 || - fi_ibv_gl_data.dgram.name_server_port > 65535)) { - VERBS_WARN(FI_LOG_CORE, - "Invalid value of dgram_name_server_port\n"); + + /* MSG-specific parameter */ + if (vrb_get_param_str("iface", "The prefix or the full name of the " + "network interface associated with the verbs " + "device", &vrb_gl_data.iface)) { + VERBS_WARN(FI_LOG_CORE, "Invalid value of iface\n"); + return -FI_EINVAL; + } + + /* DGRAM-specific parameters */ + if (getenv("OMPI_COMM_WORLD_RANK") || getenv("PMI_RANK")) + vrb_gl_data.dgram.use_name_server = 0; + if (vrb_get_param_bool("dgram_use_name_server", "The option that " + "enables/disables OFI Name Server thread used " + "to resolve IP-addresses to provider specific " + "addresses. 
If MPI is used, the NS is disabled " + "by default.", &vrb_gl_data.dgram.use_name_server)) { + VERBS_WARN(FI_LOG_CORE, "Invalid dgram_use_name_server\n"); return -FI_EINVAL; } - if (fi_ibv_get_param_int("gid_idx", "Set which gid index to use " - "attribute (0 - 255)", - &fi_ibv_gl_data.gid_idx) || - (fi_ibv_gl_data.gid_idx < 0 || - fi_ibv_gl_data.gid_idx > 255)) { - VERBS_WARN(FI_LOG_CORE, - "Invalid value of gid index\n"); + if (vrb_get_param_int("dgram_name_server_port", "The port on which " + "the name server thread listens incoming " + "requests.", &vrb_gl_data.dgram.name_server_port) || + (vrb_gl_data.dgram.name_server_port < 0 || + vrb_gl_data.dgram.name_server_port > 65535)) { + VERBS_WARN(FI_LOG_CORE, "Invalid dgram_name_server_port\n"); return -FI_EINVAL; } return FI_SUCCESS; } -static void fi_ibv_fini(void) +static void verbs_devs_free(void) +{ + struct verbs_dev_info *dev; + struct verbs_addr *addr; + + while (!dlist_empty(&verbs_devs)) { + dlist_pop_front(&verbs_devs, struct verbs_dev_info, dev, entry); + while (!dlist_empty(&dev->addrs)) { + dlist_pop_front(&dev->addrs, struct verbs_addr, addr, entry); + rdma_freeaddrinfo(addr->rai); + free(addr); + } + free(dev->name); + free(dev); + } +} + +static void vrb_set_peer_mem_support(void) +{ + char *line = NULL; + size_t line_size = 0; + ssize_t bytes; + FILE *kallsyms_fd; + + kallsyms_fd = fopen("/proc/kallsyms", "r"); + if (!kallsyms_fd) + return; + + while ((bytes = getline(&line, &line_size, kallsyms_fd)) != -1) { + if (strstr(line, "ib_register_peer_memory_client")) { + vrb_gl_data.peer_mem_support = true; + break; + } + } + + free(line); + fclose(kallsyms_fd); +} + +static void vrb_fini(void) { #if HAVE_VERBS_DL - ofi_monitor_cleanup(); + ofi_monitors_cleanup(); + ofi_hmem_cleanup(); ofi_mem_fini(); #endif - fi_freeinfo((void *)fi_ibv_util_prov.info); - fi_ibv_util_prov.info = NULL; + fi_freeinfo((void *)vrb_util_prov.info); + verbs_devs_free(); + vrb_util_prov.info = NULL; } VERBS_INI { #if HAVE_VERBS_DL ofi_mem_init(); - ofi_monitor_init(); + ofi_hmem_init(); + ofi_monitors_init(); #endif - if (fi_ibv_read_params()|| fi_ibv_init_info(&fi_ibv_util_prov.info)) + vrb_set_peer_mem_support(); + + if (vrb_read_params()|| vrb_init_info(&vrb_util_prov.info)) return NULL; - return &fi_ibv_prov; + return &vrb_prov; } diff --git a/prov/verbs/src/fi_verbs.h b/prov/verbs/src/fi_verbs.h index c7098db3b93..9310639c15c 100644 --- a/prov/verbs/src/fi_verbs.h +++ b/prov/verbs/src/fi_verbs.h @@ -1,6 +1,9 @@ /* * Copyright (c) 2013-2018 Intel Corporation, Inc. All rights reserved. * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2018-2019 Cray Inc. All rights reserved. + * Copyright (c) 2018-2019 System Fabric Works, Inc. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -48,7 +51,7 @@ #include #include #include -#include +#include #include #include @@ -71,6 +74,8 @@ #include "ofi_util.h" #include "ofi_tree.h" #include "ofi_indexer.h" +#include "ofi_iov.h" +#include "ofi_hmem.h" #include "ofi_verbs_priv.h" @@ -83,25 +88,29 @@ #define RAI_FAMILY 0x00000008 #endif +#define VERBS_RESOLVE_TIMEOUT 2000 // ms + #define VERBS_PROV_NAME "verbs" -#define VERBS_PROV_VERS FI_VERSION(1,0) -#define VERBS_DBG(subsys, ...) FI_DBG(&fi_ibv_prov, subsys, __VA_ARGS__) -#define VERBS_INFO(subsys, ...) 
FI_INFO(&fi_ibv_prov, subsys, __VA_ARGS__) +#define VERBS_DBG(subsys, ...) FI_DBG(&vrb_prov, subsys, __VA_ARGS__) +#define VERBS_INFO(subsys, ...) FI_INFO(&vrb_prov, subsys, __VA_ARGS__) #define VERBS_INFO_ERRNO(subsys, fn, errno) VERBS_INFO(subsys, fn ": %s(%d)\n", \ strerror(errno), errno) -#define VERBS_WARN(subsys, ...) FI_WARN(&fi_ibv_prov, subsys, __VA_ARGS__) +#define VERBS_WARN(subsys, ...) FI_WARN(&vrb_prov, subsys, __VA_ARGS__) -#define VERBS_INJECT_FLAGS(ep, len, flags) ((((flags) & FI_INJECT) || \ - len <= (ep)->inject_limit) ? IBV_SEND_INLINE : 0) -#define VERBS_INJECT(ep, len) VERBS_INJECT_FLAGS(ep, len, (ep)->info->tx_attr->op_flags) +#define VERBS_INJECT_FLAGS(ep, len, flags, desc) \ + (((flags) & FI_INJECT) || !(desc) || \ + ((((struct vrb_mem_desc *) (desc))->info.iface == FI_HMEM_SYSTEM) && \ + ((len) <= (ep)->info_attr.inject_size))) ? IBV_SEND_INLINE : 0 +#define VERBS_INJECT(ep, len, desc) \ + VERBS_INJECT_FLAGS(ep, len, (ep)->util_ep.tx_op_flags, desc) #define VERBS_COMP_FLAGS(ep, flags, context) \ (((ep)->util_ep.tx_op_flags | (flags)) & \ FI_COMPLETION ? context : VERBS_NO_COMP_FLAG) #define VERBS_COMP(ep, context) \ - VERBS_COMP_FLAGS((ep), (ep)->info->tx_attr->op_flags, context) + VERBS_COMP_FLAGS((ep), (ep)->util_ep.tx_op_flags, context) #define VERBS_WCE_CNT 1024 #define VERBS_WRE_CNT 1024 @@ -111,31 +120,33 @@ #define VERBS_NO_COMP_FLAG ((uint64_t)-1) -#define FI_IBV_CM_DATA_SIZE (56) -#define VERBS_CM_DATA_SIZE (FI_IBV_CM_DATA_SIZE - \ - sizeof(struct fi_ibv_cm_data_hdr)) +#define VRB_CM_DATA_SIZE (56) +#define VERBS_CM_DATA_SIZE (VRB_CM_DATA_SIZE - \ + sizeof(struct vrb_cm_data_hdr)) -#define FI_IBV_CM_REJ_CONSUMER_DEFINED 28 +#define VRB_CM_REJ_CONSUMER_DEFINED 28 +#define VRB_CM_REJ_SIDR_CONSUMER_DEFINED 2 -#define VERBS_DGRAM_MSG_PREFIX_SIZE (40) +#define VERBS_DGRAM_MSG_PREFIX_SIZE (40) -#define FI_IBV_EP_TYPE(info) \ +#define VRB_EP_TYPE(info) \ ((info && info->ep_attr) ? info->ep_attr->type : FI_EP_MSG) -#define FI_IBV_EP_PROTO(info) \ +#define VRB_EP_PROTO(info) \ (((info) && (info)->ep_attr) ? 
(info)->ep_attr->protocol : \ FI_PROTO_UNSPEC) -#define FI_IBV_MEM_ALIGNMENT (64) -#define FI_IBV_BUF_ALIGNMENT (4096) /* TODO: Page or MTU size */ -#define FI_IBV_POOL_BUF_CNT (100) +#define VRB_MEM_ALIGNMENT (64) +#define VRB_BUF_ALIGNMENT (4096) /* TODO: Page or MTU size */ +#define VRB_POOL_BUF_CNT (100) #define VERBS_ANY_DOMAIN "verbs_any_domain" #define VERBS_ANY_FABRIC "verbs_any_fabric" -extern struct fi_provider fi_ibv_prov; -extern struct util_prov fi_ibv_util_prov; +extern struct fi_provider vrb_prov; +extern struct util_prov vrb_util_prov; +extern struct dlist_entry verbs_devs; -extern struct fi_ibv_gl_data { +extern struct vrb_gl_data { int def_tx_size; int def_rx_size; int def_tx_iov_limit; @@ -143,8 +154,10 @@ extern struct fi_ibv_gl_data { int def_inline_size; int min_rnr_timer; int cqread_bunch_size; + int use_odp; char *iface; int gid_idx; + char *device_name; struct { int buffer_num; @@ -164,7 +177,9 @@ extern struct fi_ibv_gl_data { int prefer_xrc; char *xrcd_filename; } msg; -} fi_ibv_gl_data; + + bool peer_mem_support; +} vrb_gl_data; struct verbs_addr { struct dlist_entry entry; @@ -202,19 +217,20 @@ struct ofi_ib_ud_ep_name { #define VERBS_IB_UD_NS_ANY_SERVICE 0 static inline -int fi_ibv_dgram_ns_is_service_wildcard(void *svc) +int vrb_dgram_ns_is_service_wildcard(void *svc) { - return (*(int *)svc == VERBS_IB_UD_NS_ANY_SERVICE); + return (*(int *) svc == VERBS_IB_UD_NS_ANY_SERVICE); } static inline -int fi_ibv_dgram_ns_service_cmp(void *svc1, void *svc2) +int vrb_dgram_ns_service_cmp(void *svc1, void *svc2) { - int service1 = *(int *)svc1, service2 = *(int *)svc2; + int service1 = *(int *) svc1, service2 = *(int *) svc2; - if (fi_ibv_dgram_ns_is_service_wildcard(svc1) || - fi_ibv_dgram_ns_is_service_wildcard(svc2)) + if (vrb_dgram_ns_is_service_wildcard(svc1) || + vrb_dgram_ns_is_service_wildcard(svc2)) return 0; + return (service1 < service2) ? -1 : (service1 > service2); } @@ -225,24 +241,28 @@ struct verbs_dev_info { }; -struct fi_ibv_fabric { +struct vrb_fabric { struct util_fabric util_fabric; const struct fi_info *info; struct util_ns name_server; }; -int fi_ibv_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, +int vrb_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, void *context); -int fi_ibv_find_fabric(const struct fi_fabric_attr *attr); +int vrb_find_fabric(const struct fi_fabric_attr *attr); -struct fi_ibv_eq_entry { +struct vrb_eq_entry { struct dlist_entry item; uint32_t event; size_t len; - char eq_entry[0]; + union { + struct fi_eq_entry *eq_entry; + struct fi_eq_cm_entry *cm_entry; + uint8_t data[0]; + }; }; -typedef int (*fi_ibv_trywait_func)(struct fid *fid); +typedef int (*vrb_trywait_func)(struct fid *fid); /* An OFI indexer is used to maintain a unique connection request to * endpoint mapping. 
The key is a 32-bit value (referred to as a @@ -257,15 +277,17 @@ typedef int (*fi_ibv_trywait_func)(struct fid *fid); #define VERBS_CONN_TAG_INDEX_BITS 18 #define VERBS_CONN_TAG_INVALID 0xFFFFFFFF /* Key is not valid */ -struct fi_ibv_eq { +struct vrb_eq { struct fid_eq eq_fid; - struct fi_ibv_fabric *fab; + struct vrb_fabric *fab; fastlock_t lock; struct dlistfd_head list_head; struct rdma_event_channel *channel; uint64_t flags; struct fi_eq_err_entry err; - int epfd; + + ofi_epoll_t epollfd; + enum fi_wait_obj wait_obj; struct { /* The connection key map is used during the XRC connection @@ -280,83 +302,113 @@ struct fi_ibv_eq { * consider using an internal PEP listener for handling the * internally processed reciprocal connections. */ uint16_t pep_port; + + /* SIDR request/responses are a two-way handshake; therefore, + * we maintain an RB tree of SIDR accept responses, so that if + * a response is lost, the subsequent retried request can be + * detected and the original accept response resent. Note, that + * rejected requests can be passed to RXM and will be rejected + * a second time. */ + struct ofi_rbmap sidr_conn_rbmap; } xrc; }; -int fi_ibv_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, +int vrb_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, struct fid_eq **eq, void *context); -int fi_ibv_eq_trywait(struct fi_ibv_eq *eq); +int vrb_eq_trywait(struct vrb_eq *eq); +void vrb_eq_remove_events(struct vrb_eq *eq, struct fid *fid); -int fi_ibv_av_open(struct fid_domain *domain, struct fi_av_attr *attr, +int vrb_av_open(struct fid_domain *domain, struct fi_av_attr *attr, struct fid_av **av, void *context); -struct fi_ibv_pep { +struct vrb_pep { struct fid_pep pep_fid; - struct fi_ibv_eq *eq; + struct vrb_eq *eq; struct rdma_cm_id *id; + + /* XRC uses SIDR based RDMA CM exchanges for setting up + * shared QP connections. This ID is bound to the same + * port number as "id", but the RDMA_PS_UDP port space. */ + struct rdma_cm_id *xrc_ps_udp_id; + int backlog; int bound; size_t src_addrlen; struct fi_info *info; }; -struct fi_ops_cm *fi_ibv_pep_ops_cm(struct fi_ibv_pep *pep); +struct fi_ops_cm *vrb_pep_ops_cm(struct vrb_pep *pep); + + +#if VERBS_HAVE_QUERY_EX +#define VRB_ACCESS_ON_DEMAND IBV_ACCESS_ON_DEMAND +#else +#define VRB_ACCESS_ON_DEMAND 0 +#endif + +enum { + VRB_USE_XRC = BIT(0), + VRB_USE_ODP = BIT(1), +}; -struct fi_ibv_domain { +struct vrb_domain { struct util_domain util_domain; struct ibv_context *verbs; struct ibv_pd *pd; enum fi_ep_type ep_type; struct fi_info *info; + /* The EQ is utilized by verbs/MSG */ - struct fi_ibv_eq *eq; + struct vrb_eq *eq; uint64_t eq_flags; + ssize_t (*send_credits)(struct fid_ep *ep, uint64_t credits); + /* Indicates that MSG endpoints should use the XRC transport. * TODO: Move selection of XRC/RC to endpoint info from domain */ - int use_xrc; + int flags; struct { int xrcd_fd; struct ibv_xrcd *xrcd; - /* The domain maintains a RBTree for mapping an endpoint - * destination addresses to physical XRC INI QP connected - * to that host. */ - fastlock_t ini_mgmt_lock; + /* XRC INI QP connections can be shared between endpoint + * within the same domain. The domain maintains an RBTree + * for mapping endpoint destination addresses to the + * physical XRC INI connection to the associated node. The + * map and XRC INI connection object state information are + * protected via the ini_lock. 
*/ + fastlock_t ini_lock; + ofi_fastlock_acquire_t lock_acquire; + ofi_fastlock_release_t lock_release; struct ofi_rbmap *ini_conn_rbmap; - } xrc ; + } xrc; /* MR stuff */ struct ofi_mr_cache cache; - int (*post_send)(struct ibv_qp *qp, - struct ibv_send_wr *wr, - struct ibv_send_wr **bad_wr); - int (*poll_cq)(struct ibv_cq *cq, - int num_entries, - struct ibv_wc *wc); }; -struct fi_ibv_cq; -typedef void (*fi_ibv_cq_read_entry)(struct ibv_wc *wc, void *buf); +struct vrb_cq; +typedef void (*vrb_cq_read_entry)(struct ibv_wc *wc, void *buf); -struct fi_ibv_wce { +struct vrb_wc_entry { struct slist_entry entry; struct ibv_wc wc; }; -struct fi_ibv_srq_ep; -struct fi_ibv_cq { +struct vrb_srq_ep; +struct vrb_cq { struct util_cq util_cq; struct ibv_comp_channel *channel; struct ibv_cq *cq; size_t entry_size; uint64_t flags; + enum fi_wait_obj wait_obj; enum fi_cq_wait_cond wait_cond; struct ibv_wc wc; int signal_fd[2]; - fi_ibv_cq_read_entry read_entry; - struct slist wcq; + vrb_cq_read_entry read_entry; + struct slist saved_wc_list; ofi_atomic32_t nevents; struct ofi_bufpool *wce_pool; @@ -365,31 +417,33 @@ struct fi_ibv_cq { fastlock_t srq_list_lock; struct dlist_entry srq_list; } xrc; - /* Track tx credits for verbs devices that can free-up send queue - * space after processing WRs even if the app hasn't read the CQ. - * Without this tracking we might overrun the CQ */ - ofi_atomic32_t credits; + + size_t credits; + /* As a future optimization, we can use the app's context + * if they set FI_CONTEXT. + */ + struct ofi_bufpool *ctx_pool; }; -int fi_ibv_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, +int vrb_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq, void *context); -int fi_ibv_cq_trywait(struct fi_ibv_cq *cq); +int vrb_cq_trywait(struct vrb_cq *cq); -struct fi_ibv_mem_desc { +struct vrb_mem_desc { struct fid_mr mr_fid; struct ibv_mr *mr; - struct fi_ibv_domain *domain; - size_t len; + struct vrb_domain *domain; /* this field is used only by MR cache operations */ struct ofi_mr_entry *entry; + struct ofi_mr_info info; + uint32_t lkey; }; -extern struct fi_ops_mr fi_ibv_mr_ops; -extern struct fi_ops_mr fi_ibv_mr_cache_ops; +extern struct fi_ops_mr vrb_mr_ops; -int fi_ibv_mr_cache_add_region(struct ofi_mr_cache *cache, +int vrb_mr_cache_add_region(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry); -void fi_ibv_mr_cache_delete_region(struct ofi_mr_cache *cache, +void vrb_mr_cache_delete_region(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry); /* @@ -397,7 +451,7 @@ void fi_ibv_mr_cache_delete_region(struct ofi_mr_cache *cache, * maintain a list of validated pre-posted receives to post once * the SRQ is created. */ -struct fi_ibv_xrc_srx_prepost { +struct vrb_xrc_srx_prepost { struct slist_entry prepost_entry; void *buf; void *desc; @@ -406,10 +460,12 @@ struct fi_ibv_xrc_srx_prepost { fi_addr_t src_addr; }; -struct fi_ibv_srq_ep { +struct vrb_srq_ep { struct fid_ep ep_fid; struct ibv_srq *srq; - struct fi_ibv_domain *domain; + struct vrb_domain *domain; + struct ofi_bufpool *ctx_pool; + fastlock_t ctx_lock; /* For XRC SRQ only */ struct { @@ -423,54 +479,46 @@ struct fi_ibv_srq_ep { /* The RX CQ associated with this XRC SRQ. This field * and the srq_entry should only be modified while holding * the associted cq::xrc.srq_list_lock. 
*/ - struct fi_ibv_cq *cq; + struct vrb_cq *cq; /* The CQ maintains a list of XRC SRQ associated with it */ struct dlist_entry srq_entry; } xrc; }; -int fi_ibv_srq_context(struct fid_domain *domain, struct fi_rx_attr *attr, +int vrb_srq_context(struct fid_domain *domain, struct fi_rx_attr *attr, struct fid_ep **rx_ep, void *context); -static inline int fi_ibv_is_xrc(struct fi_info *info) -{ - return (FI_IBV_EP_TYPE(info) == FI_EP_MSG) && - (FI_IBV_EP_PROTO(info) == FI_PROTO_RDMA_CM_IB_XRC); -} +int vrb_domain_xrc_init(struct vrb_domain *domain); +int vrb_domain_xrc_cleanup(struct vrb_domain *domain); -static inline int fi_ibv_is_xrc_send_qp(enum ibv_qp_type qp_type) -{ - return qp_type == IBV_QPT_XRC_SEND; -} - -int fi_ibv_domain_xrc_init(struct fi_ibv_domain *domain); -int fi_ibv_domain_xrc_cleanup(struct fi_ibv_domain *domain); - -enum fi_ibv_ini_qp_state { - FI_IBV_INI_QP_UNCONNECTED, - FI_IBV_INI_QP_CONNECTING, - FI_IBV_INI_QP_CONNECTED +enum vrb_ini_qp_state { + VRB_INI_QP_UNCONNECTED, + VRB_INI_QP_CONNECTING, + VRB_INI_QP_CONNECTED }; -#define FI_IBV_NO_INI_TGT_QPNUM 0 -#define FI_IBV_RECIP_CONN 1 +#define VRB_NO_INI_TGT_QPNUM 0 +#define VRB_RECIP_CONN 1 /* * An XRC transport INI QP connection can be shared within a process to * communicate with all the ranks on the same remote node. This structure is * only accessed during connection setup and tear down and should be - * done while holding the domain:xrc:ini_mgmt_lock. + * done while holding the domain:eq:lock. */ -struct fi_ibv_ini_shared_conn { +struct vrb_ini_shared_conn { /* To share, EP must have same remote peer host addr and TX CQ */ struct sockaddr *peer_addr; - struct fi_ibv_cq *tx_cq; + struct vrb_cq *tx_cq; /* The physical INI/TGT QPN connection. Virtual connections to the * same remote peer and TGT QPN will share this connection, with - * the remote end opening the specified XRC TGT QPN for sharing. */ - enum fi_ibv_ini_qp_state state; + * the remote end opening the specified XRC TGT QPN for sharing + * During the physical connection setup, phys_conn_id identifies + * the RDMA CM ID (and MSG_EP) associated with the operation. */ + enum vrb_ini_qp_state state; + struct rdma_cm_id *phys_conn_id; struct ibv_qp *ini_qp; uint32_t tgt_qpn; @@ -481,12 +529,13 @@ struct fi_ibv_ini_shared_conn { ofi_atomic32_t ref_cnt; }; -enum fi_ibv_xrc_ep_conn_state { - FI_IBV_XRC_UNCONNECTED, - FI_IBV_XRC_ORIG_CONNECTING, - FI_IBV_XRC_ORIG_CONNECTED, - FI_IBV_XRC_RECIP_CONNECTING, - FI_IBV_XRC_CONNECTED +enum vrb_xrc_ep_conn_state { + VRB_XRC_UNCONNECTED, + VRB_XRC_ORIG_CONNECTING, + VRB_XRC_ORIG_CONNECTED, + VRB_XRC_RECIP_CONNECTING, + VRB_XRC_CONNECTED, + VRB_XRC_ERROR }; /* @@ -494,224 +543,291 @@ enum fi_ibv_xrc_ep_conn_state { * establishment and can be freed once bidirectional connectivity * is established. */ -struct fi_ibv_xrc_ep_conn_setup { +#define VRB_MAX_XRC_CONNECT_RETRIES 16 + +struct vrb_xrc_ep_conn_setup { + int retry_count; + /* The connection tag is used to associate the reciprocal * XRC INI/TGT QP connection request in the reverse direction * with the original request. The tag is created by the * original active side. */ uint32_t conn_tag; - bool created_conn_tag; - - /* IB CM message stale/duplicate detection processing requires - * that shared INI/TGT connections use unique QP numbers during - * RDMA CM connection setup. To avoid conflicts with actual HCA - * QP number space, we allocate minimal QP that are left in the - * reset state and closed once the setup process completes. 
*/ - struct ibv_qp *rsvd_ini_qpn; - struct ibv_qp *rsvd_tgt_qpn; - - /* Temporary flags to indicate if the INI QP setup and the - * TGT QP setup have completed. */ - bool ini_connected; - bool tgt_connected; + uint32_t remote_conn_tag; /* Delivery of the FI_CONNECTED event is delayed until * bidirectional connectivity is established. */ size_t event_len; - uint8_t event_data[FI_IBV_CM_DATA_SIZE]; + uint8_t event_data[VRB_CM_DATA_SIZE]; /* Connection request may have to queue waiting for the * physical XRC INI/TGT QP connection to complete. */ int pending_recip; size_t pending_paramlen; - uint8_t pending_param[FI_IBV_CM_DATA_SIZE]; + uint8_t pending_param[VRB_CM_DATA_SIZE]; }; -struct fi_ibv_ep { +struct vrb_ep { struct util_ep util_ep; struct ibv_qp *ibv_qp; + + /* Protected by send CQ lock */ + uint64_t sq_credits; + uint64_t peer_rq_credits; + /* Protected by recv CQ lock */ + int64_t rq_credits_avail; + int64_t threshold; + union { - struct rdma_cm_id *id; + struct rdma_cm_id *id; struct { struct ofi_ib_ud_ep_name ep_name; int service; }; }; - size_t inject_limit; - - struct fi_ibv_eq *eq; - struct fi_ibv_srq_ep *srq_ep; - struct fi_info *info; + struct { + size_t inject_size; + size_t tx_size; + size_t tx_iov_limit; + size_t rx_size; + size_t rx_iov_limit; + uint32_t protocol; + uint32_t addr_format; + size_t src_addrlen; + size_t dest_addrlen; + void *src_addr; + void *dest_addr; + void *handle; + } info_attr; + struct vrb_eq *eq; + struct vrb_srq_ep *srq_ep; struct { struct ibv_send_wr rma_wr; struct ibv_send_wr msg_wr; struct ibv_sge sge; } *wrs; - size_t rx_size; + size_t rx_cq_size; + struct rdma_conn_param conn_param; + struct vrb_cm_data_hdr *cm_hdr; + void *cm_priv_data; + bool hmem_enabled; }; + +/* Must be cast-able to struct fi_context */ +struct vrb_context { + struct vrb_ep *ep; + struct vrb_srq_ep *srx; + void *user_ctx; + uint32_t flags; +}; + + #define VERBS_XRC_EP_MAGIC 0x1F3D5B79 -struct fi_ibv_xrc_ep { +struct vrb_xrc_ep { /* Must be first */ - struct fi_ibv_ep base_ep; + struct vrb_ep base_ep; /* XRC only fields */ struct rdma_cm_id *tgt_id; struct ibv_qp *tgt_ibv_qp; - enum fi_ibv_xrc_ep_conn_state conn_state; + enum vrb_xrc_ep_conn_state conn_state; + bool recip_req_received; uint32_t magic; uint32_t srqn; uint32_t peer_srqn; /* A reference is held to a shared physical XRC INI/TGT QP connecting * to the destination node. */ - struct fi_ibv_ini_shared_conn *ini_conn; + struct vrb_ini_shared_conn *ini_conn; struct dlist_entry ini_conn_entry; + /* The following is used for resending lost SIDR accept response + * messages when a retransmit SIDR connect request is received. */ + void *accept_param_data; + size_t accept_param_len; + uint16_t remote_pep_port; + bool recip_accept; + struct ofi_rbnode *conn_map_node; + /* The following state is allocated during XRC bidirectional setup and * freed once the connection is established. 
*/ - struct fi_ibv_xrc_ep_conn_setup *conn_setup; + struct vrb_xrc_ep_conn_setup *conn_setup; }; -int fi_ibv_open_ep(struct fid_domain *domain, struct fi_info *info, +static inline int vrb_is_xrc_info(struct fi_info *info) +{ + return (VRB_EP_TYPE(info) == FI_EP_MSG) && + (VRB_EP_PROTO(info) == FI_PROTO_RDMA_CM_IB_XRC); +} + +static inline int vrb_is_xrc_ep(struct vrb_ep *ep) +{ + return (ep->util_ep.type == FI_EP_MSG) && + (ep->info_attr.protocol == FI_PROTO_RDMA_CM_IB_XRC); +} + +int vrb_open_ep(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep, void *context); -int fi_ibv_passive_ep(struct fid_fabric *fabric, struct fi_info *info, +int vrb_passive_ep(struct fid_fabric *fabric, struct fi_info *info, struct fid_pep **pep, void *context); -int fi_ibv_create_ep(const char *node, const char *service, - uint64_t flags, const struct fi_info *hints, - struct rdma_addrinfo **rai, struct rdma_cm_id **id); -void fi_ibv_destroy_ep(struct rdma_addrinfo *rai, struct rdma_cm_id **id); -int fi_ibv_dgram_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, +int vrb_create_ep(struct vrb_ep *ep, enum rdma_port_space ps, + struct rdma_cm_id **id); +int vrb_dgram_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **av_fid, void *context); static inline -struct fi_ibv_domain *fi_ibv_ep_to_domain(struct fi_ibv_ep *ep) +struct vrb_domain *vrb_ep_to_domain(struct vrb_ep *ep) { - return container_of(ep->util_ep.domain, struct fi_ibv_domain, + return container_of(ep->util_ep.domain, struct vrb_domain, util_domain); } -struct fi_ops_atomic fi_ibv_msg_ep_atomic_ops; -struct fi_ops_atomic fi_ibv_msg_xrc_ep_atomic_ops; -struct fi_ops_cm fi_ibv_msg_ep_cm_ops; -struct fi_ops_cm fi_ibv_msg_xrc_ep_cm_ops; -const struct fi_ops_msg fi_ibv_msg_ep_msg_ops_ts; -const struct fi_ops_msg fi_ibv_msg_ep_msg_ops; -const struct fi_ops_msg fi_ibv_dgram_msg_ops_ts; -const struct fi_ops_msg fi_ibv_dgram_msg_ops; -const struct fi_ops_msg fi_ibv_msg_xrc_ep_msg_ops; -const struct fi_ops_msg fi_ibv_msg_xrc_ep_msg_ops_ts; -const struct fi_ops_msg fi_ibv_msg_srq_xrc_ep_msg_ops; -struct fi_ops_rma fi_ibv_msg_ep_rma_ops_ts; -struct fi_ops_rma fi_ibv_msg_ep_rma_ops; -struct fi_ops_rma fi_ibv_msg_xrc_ep_rma_ops_ts; -struct fi_ops_rma fi_ibv_msg_xrc_ep_rma_ops; - -#define FI_IBV_XRC_VERSION 1 - -struct fi_ibv_xrc_cm_data { +extern struct fi_ops_atomic vrb_msg_ep_atomic_ops; +extern struct fi_ops_atomic vrb_msg_xrc_ep_atomic_ops; +extern struct fi_ops_cm vrb_msg_ep_cm_ops; +extern struct fi_ops_cm vrb_msg_xrc_ep_cm_ops; +extern const struct fi_ops_msg vrb_msg_ep_msg_ops_ts; +extern const struct fi_ops_msg vrb_msg_ep_msg_ops; +extern const struct fi_ops_msg vrb_dgram_msg_ops_ts; +extern const struct fi_ops_msg vrb_dgram_msg_ops; +extern const struct fi_ops_msg vrb_msg_xrc_ep_msg_ops; +extern const struct fi_ops_msg vrb_msg_xrc_ep_msg_ops_ts; +extern const struct fi_ops_msg vrb_msg_srq_xrc_ep_msg_ops; +extern struct fi_ops_rma vrb_msg_ep_rma_ops_ts; +extern struct fi_ops_rma vrb_msg_ep_rma_ops; +extern struct fi_ops_rma vrb_msg_xrc_ep_rma_ops_ts; +extern struct fi_ops_rma vrb_msg_xrc_ep_rma_ops; + +#define VRB_XRC_VERSION 2 + +struct vrb_xrc_cm_data { uint8_t version; uint8_t reciprocal; uint16_t port; - uint32_t param; + uint32_t tgt_qpn; + uint32_t srqn; uint32_t conn_tag; }; -struct fi_ibv_xrc_conn_info { +struct vrb_xrc_conn_info { uint32_t conn_tag; uint32_t is_reciprocal; uint32_t ini_qpn; - uint32_t conn_data; + uint32_t tgt_qpn; + uint32_t peer_srqn; uint16_t port; struct 
rdma_conn_param conn_param; }; -struct fi_ibv_connreq { +struct vrb_connreq { struct fid handle; struct rdma_cm_id *id; /* Support for XRC bidirectional connections, and * non-RDMA CM managed QP. */ int is_xrc; - struct fi_ibv_xrc_conn_info xrc; + struct vrb_xrc_conn_info xrc; +}; + +/* Structure below is a copy of the RDMA CM header (structure ib_connect_hdr in + * file librdmacm/cma.h) + * DO NOT MODIFY! */ +struct vrb_rdma_cm_hdr { + uint8_t cma_version; /* Set by the kernel */ + uint8_t ip_version; /* IP version: 7:4 */ + uint16_t port; + uint32_t src_addr[4]; + uint32_t dst_addr[4]; }; -struct fi_ibv_cm_data_hdr { +struct vrb_cm_data_hdr { uint8_t size; char data[]; }; -void fi_ibv_msg_ep_get_qp_attr(struct fi_ibv_ep *ep, +int vrb_eq_add_sidr_conn(struct vrb_xrc_ep *ep, + void *param_data, size_t param_len); +void vrb_eq_remove_sidr_conn(struct vrb_xrc_ep *ep); +struct vrb_xrc_ep *vrb_eq_get_sidr_conn(struct vrb_eq *eq, + struct sockaddr *peer, + uint16_t pep_port, bool recip); + +void vrb_msg_ep_get_qp_attr(struct vrb_ep *ep, struct ibv_qp_init_attr *attr); -int fi_ibv_process_xrc_connreq(struct fi_ibv_ep *ep, - struct fi_ibv_connreq *connreq); - -void fi_ibv_next_xrc_conn_state(struct fi_ibv_xrc_ep *ep); -void fi_ibv_prev_xrc_conn_state(struct fi_ibv_xrc_ep *ep); -void fi_ibv_eq_set_xrc_conn_tag(struct fi_ibv_xrc_ep *ep); -void fi_ibv_eq_clear_xrc_conn_tag(struct fi_ibv_xrc_ep *ep); -struct fi_ibv_xrc_ep *fi_ibv_eq_xrc_conn_tag2ep(struct fi_ibv_eq *eq, +int vrb_process_xrc_connreq(struct vrb_ep *ep, + struct vrb_connreq *connreq); + +void vrb_next_xrc_conn_state(struct vrb_xrc_ep *ep); +void vrb_prev_xrc_conn_state(struct vrb_xrc_ep *ep); +void vrb_eq_set_xrc_conn_tag(struct vrb_xrc_ep *ep); +void vrb_eq_clear_xrc_conn_tag(struct vrb_xrc_ep *ep); +struct vrb_xrc_ep *vrb_eq_xrc_conn_tag2ep(struct vrb_eq *eq, uint32_t conn_tag); -void fi_ibv_set_xrc_cm_data(struct fi_ibv_xrc_cm_data *local, int reciprocal, - uint32_t conn_tag, uint16_t port, uint32_t param); -int fi_ibv_verify_xrc_cm_data(struct fi_ibv_xrc_cm_data *remote, +void vrb_set_xrc_cm_data(struct vrb_xrc_cm_data *local, int reciprocal, + uint32_t conn_tag, uint16_t port, uint32_t tgt_qpn, + uint32_t srqn); +int vrb_verify_xrc_cm_data(struct vrb_xrc_cm_data *remote, int private_data_len); -int fi_ibv_connect_xrc(struct fi_ibv_xrc_ep *ep, struct sockaddr *addr, +int vrb_connect_xrc(struct vrb_xrc_ep *ep, struct sockaddr *addr, int reciprocal, void *param, size_t paramlen); -int fi_ibv_accept_xrc(struct fi_ibv_xrc_ep *ep, int reciprocal, +int vrb_accept_xrc(struct vrb_xrc_ep *ep, int reciprocal, void *param, size_t paramlen); -void fi_ibv_free_xrc_conn_setup(struct fi_ibv_xrc_ep *ep, int disconnect); -void fi_ibv_add_pending_ini_conn(struct fi_ibv_xrc_ep *ep, int reciprocal, +int vrb_resend_shared_accept_xrc(struct vrb_xrc_ep *ep, + struct vrb_connreq *connreq, + struct rdma_cm_id *id); +void vrb_free_xrc_conn_setup(struct vrb_xrc_ep *ep, int disconnect); +void vrb_add_pending_ini_conn(struct vrb_xrc_ep *ep, int reciprocal, void *conn_param, size_t conn_paramlen); -void fi_ibv_sched_ini_conn(struct fi_ibv_ini_shared_conn *ini_conn); -int fi_ibv_get_shared_ini_conn(struct fi_ibv_xrc_ep *ep, - struct fi_ibv_ini_shared_conn **ini_conn); -void fi_ibv_put_shared_ini_conn(struct fi_ibv_xrc_ep *ep); -int fi_ibv_reserve_qpn(struct fi_ibv_xrc_ep *ep, struct ibv_qp **qp); +void vrb_sched_ini_conn(struct vrb_ini_shared_conn *ini_conn); +int vrb_get_shared_ini_conn(struct vrb_xrc_ep *ep, + struct vrb_ini_shared_conn **ini_conn); +void 
vrb_put_shared_ini_conn(struct vrb_xrc_ep *ep); -void fi_ibv_save_priv_data(struct fi_ibv_xrc_ep *ep, const void *data, +void vrb_save_priv_data(struct vrb_xrc_ep *ep, const void *data, size_t len); -int fi_ibv_ep_create_ini_qp(struct fi_ibv_xrc_ep *ep, void *dst_addr, +int vrb_ep_create_ini_qp(struct vrb_xrc_ep *ep, void *dst_addr, uint32_t *peer_tgt_qpn); -void fi_ibv_ep_ini_conn_done(struct fi_ibv_xrc_ep *ep, uint32_t peer_srqn, - uint32_t peer_tgt_qpn); -void fi_ibv_ep_ini_conn_rejected(struct fi_ibv_xrc_ep *ep); -int fi_ibv_ep_create_tgt_qp(struct fi_ibv_xrc_ep *ep, uint32_t tgt_qpn); -void fi_ibv_ep_tgt_conn_done(struct fi_ibv_xrc_ep *qp); -int fi_ibv_ep_destroy_xrc_qp(struct fi_ibv_xrc_ep *ep); - -int fi_ibv_xrc_close_srq(struct fi_ibv_srq_ep *srq_ep); -int fi_ibv_sockaddr_len(struct sockaddr *addr); +void vrb_ep_ini_conn_done(struct vrb_xrc_ep *ep, uint32_t peer_tgt_qpn); +void vrb_ep_ini_conn_rejected(struct vrb_xrc_ep *ep); +int vrb_ep_create_tgt_qp(struct vrb_xrc_ep *ep, uint32_t tgt_qpn); +void vrb_ep_tgt_conn_done(struct vrb_xrc_ep *qp); +int vrb_ep_destroy_xrc_qp(struct vrb_xrc_ep *ep); +int vrb_xrc_close_srq(struct vrb_srq_ep *srq_ep); -int fi_ibv_init_info(const struct fi_info **all_infos); -int fi_ibv_getinfo(uint32_t version, const char *node, const char *service, +int vrb_init_info(const struct fi_info **all_infos); +int vrb_getinfo(uint32_t version, const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct fi_info **info); -const struct fi_info *fi_ibv_get_verbs_info(const struct fi_info *ilist, +const struct fi_info *vrb_get_verbs_info(const struct fi_info *ilist, const char *domain_name); -int fi_ibv_fi_to_rai(const struct fi_info *fi, uint64_t flags, - struct rdma_addrinfo *rai); -int fi_ibv_get_rdma_rai(const char *node, const char *service, uint64_t flags, - const struct fi_info *hints, struct rdma_addrinfo **rai); +int vrb_set_rai(uint32_t addr_format, void *src_addr, size_t src_addrlen, + void *dest_addr, size_t dest_addrlen, uint64_t flags, + struct rdma_addrinfo *rai); +int vrb_get_matching_info(uint32_t version, const struct fi_info *hints, + struct fi_info **info, const struct fi_info *verbs_info, + uint8_t passive); +int vrb_get_port_space(uint32_t addr_format); +void vrb_alter_info(const struct fi_info *hints, struct fi_info *info); + struct verbs_ep_domain { char *suffix; enum fi_ep_type type; uint32_t protocol; - uint64_t caps; }; extern const struct verbs_ep_domain verbs_dgram_domain; extern const struct verbs_ep_domain verbs_msg_xrc_domain; -int fi_ibv_check_ep_attr(const struct fi_info *hints, +int vrb_check_ep_attr(const struct fi_info *hints, const struct fi_info *info); -int fi_ibv_check_rx_attr(const struct fi_rx_attr *attr, +int vrb_check_rx_attr(const struct fi_rx_attr *attr, const struct fi_info *hints, const struct fi_info *info); -static inline int fi_ibv_cmp_xrc_domain_name(const char *domain_name, +static inline int vrb_cmp_xrc_domain_name(const char *domain_name, const char *rdma_name) { size_t domain_len = strlen(domain_name); @@ -721,34 +837,36 @@ static inline int fi_ibv_cmp_xrc_domain_name(const char *domain_name, domain_len - suffix_len) : -1; } -int fi_ibv_cq_signal(struct fid_cq *cq); +int vrb_cq_signal(struct fid_cq *cq); -ssize_t fi_ibv_eq_write_event(struct fi_ibv_eq *eq, uint32_t event, +struct vrb_eq_entry *vrb_eq_alloc_entry(uint32_t event, + const void *buf, size_t len); +ssize_t vrb_eq_write_event(struct vrb_eq *eq, uint32_t event, const void *buf, size_t len); -int fi_ibv_query_atomic(struct 
fid_domain *domain_fid, enum fi_datatype datatype, +int vrb_query_atomic(struct fid_domain *domain_fid, enum fi_datatype datatype, enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags); -int fi_ibv_set_rnr_timer(struct ibv_qp *qp); -void fi_ibv_cleanup_cq(struct fi_ibv_ep *cur_ep); -int fi_ibv_find_max_inline(struct ibv_pd *pd, struct ibv_context *context, +int vrb_set_rnr_timer(struct ibv_qp *qp); +void vrb_cleanup_cq(struct vrb_ep *cur_ep); +int vrb_find_max_inline(struct ibv_pd *pd, struct ibv_context *context, enum ibv_qp_type qp_type); -struct fi_ibv_dgram_av { +struct vrb_dgram_av { struct util_av util_av; struct dlist_entry av_entry_list; }; -struct fi_ibv_dgram_av_entry { +struct vrb_dgram_av_entry { struct dlist_entry list_entry; struct ofi_ib_ud_ep_name addr; struct ibv_ah *ah; }; -static inline struct fi_ibv_dgram_av_entry* -fi_ibv_dgram_av_lookup_av_entry(fi_addr_t fi_addr) +static inline struct vrb_dgram_av_entry* +vrb_dgram_av_lookup_av_entry(fi_addr_t fi_addr) { - return (struct fi_ibv_dgram_av_entry *) (uintptr_t) fi_addr; + return (struct vrb_dgram_av_entry *) (uintptr_t) fi_addr; } /* NOTE: @@ -756,226 +874,63 @@ fi_ibv_dgram_av_lookup_av_entry(fi_addr_t fi_addr) * Deal with non-compliant libibverbs drivers which set errno * instead of directly returning the error value */ -static inline ssize_t fi_ibv_handle_post(int ret) +static inline ssize_t vrb_convert_ret(int ret) { - switch (ret) { - case -ENOMEM: - case ENOMEM: - ret = -FI_EAGAIN; - break; - case -1: - ret = (errno == ENOMEM) ? -FI_EAGAIN : - -errno; - break; - default: - ret = -abs(ret); - break; - } - return ret; -} - -/* Returns 0 if it processes WR entry for which user - * doesn't request the completion */ -static inline int -fi_ibv_process_wc(struct fi_ibv_cq *cq, struct ibv_wc *wc) -{ - return (wc->wr_id == VERBS_NO_COMP_FLAG) ? 0 : 1; -} - -/* Returns 0 and tries read new completions if it processes - * WR entry for which user doesn't request the completion */ -static inline int -fi_ibv_process_wc_poll_new(struct fi_ibv_cq *cq, struct ibv_wc *wc) -{ - struct fi_ibv_domain *domain = container_of(cq->util_cq.domain, - struct fi_ibv_domain, - util_domain); - if (wc->wr_id == VERBS_NO_COMP_FLAG) { - int ret; - - while ((ret = domain->poll_cq(cq->cq, 1, wc)) > 0) { - if (wc->wr_id != VERBS_NO_COMP_FLAG) - return 1; - } - return ret; - } - return 1; + if (!ret) + return 0; + else if (ret == -ENOMEM || ret == ENOMEM) + return -FI_EAGAIN; + else if (ret == -1) + return (errno == ENOMEM) ? -FI_EAGAIN : -errno; + else + return -abs(ret); } -static inline int fi_ibv_wc_2_wce(struct fi_ibv_cq *cq, - struct ibv_wc *wc, - struct fi_ibv_wce **wce) -{ - *wce = ofi_buf_alloc(cq->wce_pool); - if (OFI_UNLIKELY(!*wce)) - return -FI_ENOMEM; - memset(*wce, 0, sizeof(**wce)); - (*wce)->wc = *wc; +int vrb_poll_cq(struct vrb_cq *cq, struct ibv_wc *wc); +int vrb_save_wc(struct vrb_cq *cq, struct ibv_wc *wc); - return FI_SUCCESS; -} +#define vrb_init_sge(buf, len, desc) (struct ibv_sge) \ + { .addr = (uintptr_t) buf, \ + .length = (uint32_t) len, \ + .lkey = (desc) ? 
((struct vrb_mem_desc *) (desc))->lkey : 0 } -#define fi_ibv_init_sge(buf, len, desc) (struct ibv_sge) \ - { .addr = (uintptr_t)buf, \ - .length = (uint32_t)len, \ - .lkey = (uint32_t)(uintptr_t)desc } - -#define fi_ibv_set_sge_iov(sg_list, iov, count, desc) \ -({ \ - size_t i; \ - sg_list = alloca(sizeof(*sg_list) * count); \ - for (i = 0; i < count; i++) { \ - sg_list[i] = fi_ibv_init_sge( \ - iov[i].iov_base, \ - iov[i].iov_len, \ - desc[i]); \ - } \ -}) - -#define fi_ibv_set_sge_iov_count_len(sg_list, iov, count, desc, len) \ -({ \ - size_t i; \ - sg_list = alloca(sizeof(*sg_list) * count); \ - for (i = 0; i < count; i++) { \ - sg_list[i] = fi_ibv_init_sge( \ - iov[i].iov_base, \ - iov[i].iov_len, \ - desc[i]); \ - len += iov[i].iov_len; \ - } \ -}) - -#define fi_ibv_init_sge_inline(buf, len) fi_ibv_init_sge(buf, len, NULL) - -#define fi_ibv_set_sge_iov_inline(sg_list, iov, count, len) \ -({ \ +#define vrb_iov_dupa(dst, iov, desc, count) \ +do { \ size_t i; \ - sg_list = alloca(sizeof(*sg_list) * count); \ + dst = alloca(sizeof(*dst) * count); \ for (i = 0; i < count; i++) { \ - sg_list[i] = fi_ibv_init_sge_inline( \ - iov[i].iov_base, \ - iov[i].iov_len); \ - len += iov[i].iov_len; \ + dst[i] = vrb_init_sge(iov[i].iov_base, \ + iov[i].iov_len, desc[i]); \ } \ -}) - -#define fi_ibv_send_iov(ep, wr, iov, desc, count) \ - fi_ibv_send_iov_flags(ep, wr, iov, desc, count, \ - (ep)->info->tx_attr->op_flags) - -#define fi_ibv_send_msg(ep, wr, msg, flags) \ - fi_ibv_send_iov_flags(ep, wr, (msg)->msg_iov, (msg)->desc, \ - (msg)->iov_count, flags) +} while (0) +#define vrb_wr_consumes_recv(wr) \ + ( wr->opcode == IBV_WR_SEND || wr->opcode == IBV_WR_SEND_WITH_IMM \ + || wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM ) -static inline int fi_ibv_poll_reap_unsig_cq(struct fi_ibv_ep *ep) -{ - struct fi_ibv_wce *wce; - struct ibv_wc wc[10]; - int ret, i; - struct fi_ibv_cq *cq = - container_of(ep->util_ep.tx_cq, struct fi_ibv_cq, util_cq); - struct fi_ibv_domain *domain = container_of(cq->util_cq.domain, - struct fi_ibv_domain, - util_domain); - - cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock); - /* TODO: retrieve WCs as much as possible in a single - * ibv_poll_cq call */ - while (1) { - ret = domain->poll_cq(cq->cq, 10, wc); - if (ret <= 0) { - cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); - return ret; - } - for (i = 0; i < ret; i++) { - if (!fi_ibv_process_wc(cq, &wc[i])) - continue; - if (OFI_LIKELY(!fi_ibv_wc_2_wce(cq, &wc[i], &wce))) - slist_insert_tail(&wce->entry, &cq->wcq); - } - } - - cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); - return FI_SUCCESS; -} - -/* WR must be filled out by now except for context */ -static inline ssize_t -fi_ibv_send_poll_cq_if_needed(struct fi_ibv_ep *ep, struct ibv_send_wr *wr) -{ - struct ibv_send_wr *bad_wr; - struct fi_ibv_domain *domain = - container_of(ep->util_ep.domain, struct fi_ibv_domain, util_domain); - int ret; - - ret = domain->post_send(ep->ibv_qp, wr, &bad_wr); - if (OFI_UNLIKELY(ret)) { - ret = fi_ibv_handle_post(ret); - if (OFI_LIKELY(ret == -FI_EAGAIN)) { - ret = fi_ibv_poll_reap_unsig_cq(ep); - if (OFI_UNLIKELY(ret)) - return -FI_EAGAIN; - /* Try again and return control to a caller */ - ret = fi_ibv_handle_post( - domain->post_send(ep->ibv_qp, wr, &bad_wr)); - } - } - return ret; -} +ssize_t vrb_post_send(struct vrb_ep *ep, struct ibv_send_wr *wr, uint64_t flags); +ssize_t vrb_post_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr); static inline ssize_t -fi_ibv_send_buf(struct fi_ibv_ep *ep, struct ibv_send_wr *wr, 
+vrb_send_buf(struct vrb_ep *ep, struct ibv_send_wr *wr, const void *buf, size_t len, void *desc) { - struct ibv_sge sge = fi_ibv_init_sge(buf, len, desc); - - assert(wr->wr_id != VERBS_NO_COMP_FLAG); + struct ibv_sge sge = vrb_init_sge(buf, len, desc); wr->sg_list = &sge; wr->num_sge = 1; - return fi_ibv_send_poll_cq_if_needed(ep, wr); + return vrb_post_send(ep, wr, 0); } -static inline ssize_t -fi_ibv_send_buf_inline(struct fi_ibv_ep *ep, struct ibv_send_wr *wr, - const void *buf, size_t len) -{ - struct ibv_sge sge = fi_ibv_init_sge_inline(buf, len); - - assert(wr->wr_id == VERBS_NO_COMP_FLAG); - - wr->sg_list = &sge; - wr->num_sge = 1; +ssize_t vrb_send_iov(struct vrb_ep *ep, struct ibv_send_wr *wr, + const struct iovec *iov, void **desc, int count, + uint64_t flags); - return fi_ibv_send_poll_cq_if_needed(ep, wr); -} - -static inline ssize_t -fi_ibv_send_iov_flags(struct fi_ibv_ep *ep, struct ibv_send_wr *wr, - const struct iovec *iov, void **desc, int count, - uint64_t flags) -{ - size_t len = 0; - - if (!desc) - fi_ibv_set_sge_iov_inline(wr->sg_list, iov, count, len); - else - fi_ibv_set_sge_iov_count_len(wr->sg_list, iov, count, desc, len); - - wr->num_sge = count; - wr->send_flags = VERBS_INJECT_FLAGS(ep, len, flags); - wr->wr_id = VERBS_COMP_FLAGS(ep, flags, wr->wr_id); - - if (flags & FI_FENCE) - wr->send_flags |= IBV_SEND_FENCE; - - return fi_ibv_send_poll_cq_if_needed(ep, wr); -} +void vrb_add_credits(struct fid_ep *ep, size_t credits); -int fi_ibv_get_rai_id(const char *node, const char *service, uint64_t flags, +int vrb_get_rai_id(const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct rdma_addrinfo **rai, struct rdma_cm_id **id); diff --git a/prov/verbs/src/ofi_verbs_priv.h b/prov/verbs/src/ofi_verbs_priv.h index c19a1f16e91..88b5c113145 100644 --- a/prov/verbs/src/ofi_verbs_priv.h +++ b/prov/verbs/src/ofi_verbs_priv.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2018 Cray Inc. All rights reserved. + * Copyright (c) 2019 System Fabric Works, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -45,14 +46,14 @@ #define IBV_SRQ_INIT_ATTR_CQ 3ull #define IBV_SRQT_XRC 1ull -#define FI_IBV_SET_REMOTE_SRQN(var, val) do { } while (0) +#define VRB_SET_REMOTE_SRQN(var, val) do { } while (0) #define FI_VERBS_XRC_ONLY __attribute__((unused)) #define ibv_get_srq_num(srq, srqn) do { } while (0) #define ibv_create_srq_ex(context, attr) (NULL) #else /* !VERBS_HAVE_XRC */ -#define FI_IBV_SET_REMOTE_SRQN(var, val) \ +#define VRB_SET_REMOTE_SRQN(var, val) \ do { \ (var).qp_type.xrc.remote_srqn = (val); \ } while (0) @@ -60,4 +61,9 @@ #define FI_VERBS_XRC_ONLY #endif /* VERBS_HAVE_XRC */ +#if !VERBS_HAVE_RDMA_ESTABLISH +/* If older rdma-core this function does not exist/is not needed */ +#define rdma_establish(id) do { } while (0) +#endif + #endif /* OFI_VERBS_PRIV_H */ diff --git a/prov/verbs/src/verbs_cm.c b/prov/verbs/src/verbs_cm.c index 7e871408709..534f5e0fcff 100644 --- a/prov/verbs/src/verbs_cm.c +++ b/prov/verbs/src/verbs_cm.c @@ -35,9 +35,9 @@ #include "fi_verbs.h" -static int fi_ibv_copy_addr(void *dst_addr, size_t *dst_addrlen, void *src_addr) +static int vrb_copy_addr(void *dst_addr, size_t *dst_addrlen, void *src_addr) { - size_t src_addrlen = fi_ibv_sockaddr_len(src_addr); + size_t src_addrlen = ofi_sizeofaddr(src_addr); if (*dst_addrlen == 0) { *dst_addrlen = src_addrlen; @@ -53,31 +53,32 @@ static int fi_ibv_copy_addr(void *dst_addr, size_t *dst_addrlen, void *src_addr) return 0; } -static int fi_ibv_msg_ep_setname(fid_t ep_fid, void *addr, size_t addrlen) +static int vrb_msg_ep_setname(fid_t ep_fid, void *addr, size_t addrlen) { void *save_addr; struct rdma_cm_id *id; int ret; - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); - if (addrlen != ep->info->src_addrlen) { + if (addrlen != ep->info_attr.src_addrlen) { VERBS_INFO(FI_LOG_EP_CTRL,"addrlen expected: %zu, got: %zu.\n", - ep->info->src_addrlen, addrlen); + ep->info_attr.src_addrlen, addrlen); return -FI_EINVAL; } - save_addr = ep->info->src_addr; + save_addr = ep->info_attr.src_addr; - ep->info->src_addr = malloc(ep->info->src_addrlen); - if (!ep->info->src_addr) { + ep->info_attr.src_addr = malloc(ep->info_attr.src_addrlen); + if (!ep->info_attr.src_addr) { + VERBS_WARN(FI_LOG_EP_CTRL, "memory allocation failure\n"); ret = -FI_ENOMEM; goto err1; } - memcpy(ep->info->src_addr, addr, ep->info->src_addrlen); + memcpy(ep->info_attr.src_addr, addr, ep->info_attr.src_addrlen); - ret = fi_ibv_create_ep(NULL, NULL, 0, ep->info, NULL, &id); + ret = vrb_create_ep(ep, RDMA_PS_TCP, &id); if (ret) goto err2; @@ -90,106 +91,128 @@ static int fi_ibv_msg_ep_setname(fid_t ep_fid, void *addr, size_t addrlen) return 0; err2: - free(ep->info->src_addr); + free(ep->info_attr.src_addr); err1: - ep->info->src_addr = save_addr; + ep->info_attr.src_addr = save_addr; return ret; } -static int fi_ibv_msg_ep_getname(fid_t ep, void *addr, size_t *addrlen) +static int vrb_msg_ep_getname(fid_t ep, void *addr, size_t *addrlen) { struct sockaddr *sa; - struct fi_ibv_ep *_ep = - container_of(ep, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *_ep = container_of(ep, struct vrb_ep, util_ep.ep_fid); sa = rdma_get_local_addr(_ep->id); - return fi_ibv_copy_addr(addr, addrlen, sa); + return vrb_copy_addr(addr, addrlen, sa); } -static int fi_ibv_msg_ep_getpeer(struct fid_ep *ep, void *addr, size_t *addrlen) +static int vrb_msg_ep_getpeer(struct fid_ep *ep, void *addr, size_t *addrlen) { struct 
sockaddr *sa; - struct fi_ibv_ep *_ep = - container_of(ep, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *_ep = container_of(ep, struct vrb_ep, util_ep.ep_fid); sa = rdma_get_peer_addr(_ep->id); - return fi_ibv_copy_addr(addr, addrlen, sa); + return vrb_copy_addr(addr, addrlen, sa); } static inline void -fi_ibv_msg_ep_prepare_cm_data(const void *param, size_t param_size, - struct fi_ibv_cm_data_hdr *cm_hdr) +vrb_msg_ep_prepare_cm_data(const void *param, size_t param_size, + struct vrb_cm_data_hdr *cm_hdr) { cm_hdr->size = (uint8_t)param_size; memcpy(cm_hdr->data, param, cm_hdr->size); } static inline void -fi_ibv_ep_prepare_rdma_cm_param(struct rdma_conn_param *conn_param, - struct fi_ibv_cm_data_hdr *cm_hdr, - size_t cm_hdr_data_size) +vrb_ep_prepare_rdma_cm_param(struct rdma_conn_param *conn_param, + void *priv_data, size_t priv_data_size) { - conn_param->private_data = cm_hdr; - conn_param->private_data_len = (uint8_t)cm_hdr_data_size; + conn_param->private_data = priv_data; + conn_param->private_data_len = (uint8_t)priv_data_size; conn_param->responder_resources = RDMA_MAX_RESP_RES; conn_param->initiator_depth = RDMA_MAX_INIT_DEPTH; conn_param->flow_control = 1; conn_param->rnr_retry_count = 7; } +static void +vrb_msg_ep_prepare_rdma_cm_hdr(void *priv_data, + const struct rdma_cm_id *id) +{ + struct vrb_rdma_cm_hdr *rdma_cm_hdr = priv_data; + + /* ip_version=6 would require IPoIB to be installed and the IP link + * to be UP, which we don't want. As a work-around, we set ip_version to 0, + * which lets the CMA kernel code skip any requirement for IPoIB. */ + rdma_cm_hdr->ip_version = 0; + rdma_cm_hdr->port = htons(ofi_addr_get_port(&id->route.addr.src_addr)); + + /* Record the GIDs */ + memcpy(rdma_cm_hdr->src_addr, + &((struct ofi_sockaddr_ib *)&id->route.addr.src_addr)->sib_addr, 16); + memcpy(rdma_cm_hdr->dst_addr, + &((struct ofi_sockaddr_ib *)&id->route.addr.dst_addr)->sib_addr, 16); +} + static int -fi_ibv_msg_ep_connect(struct fid_ep *ep, const void *addr, +vrb_msg_ep_connect(struct fid_ep *ep_fid, const void *addr, const void *param, size_t paramlen) { - struct rdma_conn_param conn_param = { 0 }; - struct sockaddr *src_addr, *dst_addr; + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); + size_t priv_data_len; + struct vrb_cm_data_hdr *cm_hdr; + off_t rdma_cm_hdr_len = 0; int ret; - struct fi_ibv_cm_data_hdr *cm_hdr; - struct fi_ibv_ep *_ep = - container_of(ep, struct fi_ibv_ep, util_ep.ep_fid); if (OFI_UNLIKELY(paramlen > VERBS_CM_DATA_SIZE)) return -FI_EINVAL; - if (!_ep->id->qp) { - ret = fi_control(&ep->fid, FI_ENABLE, NULL); + if (!ep->id->qp) { + ret = fi_control(&ep_fid->fid, FI_ENABLE, NULL); if (ret) return ret; } - cm_hdr = alloca(sizeof(*cm_hdr) + paramlen); - fi_ibv_msg_ep_prepare_cm_data(param, paramlen, cm_hdr); - fi_ibv_ep_prepare_rdma_cm_param(&conn_param, cm_hdr, - sizeof(*cm_hdr) + paramlen); - conn_param.retry_count = 15; - - if (_ep->srq_ep) - conn_param.srq = 1; + if (ep->id->route.addr.src_addr.sa_family == AF_IB) + rdma_cm_hdr_len = sizeof(struct vrb_rdma_cm_hdr); - src_addr = rdma_get_local_addr(_ep->id); - if (src_addr) { - VERBS_INFO(FI_LOG_CORE, "src_addr: %s:%d\n", - inet_ntoa(((struct sockaddr_in *)src_addr)->sin_addr), - ntohs(((struct sockaddr_in *)src_addr)->sin_port)); - } + priv_data_len = sizeof(*cm_hdr) + paramlen + rdma_cm_hdr_len; + ep->cm_priv_data = malloc(priv_data_len); + if (!ep->cm_priv_data) + return -FI_ENOMEM; - dst_addr = rdma_get_peer_addr(_ep->id); - if (dst_addr) { - VERBS_INFO(FI_LOG_CORE, 
"dst_addr: %s:%d\n", - inet_ntoa(((struct sockaddr_in *)dst_addr)->sin_addr), - ntohs(((struct sockaddr_in *)dst_addr)->sin_port)); + if (rdma_cm_hdr_len) + vrb_msg_ep_prepare_rdma_cm_hdr(ep->cm_priv_data, ep->id); + + cm_hdr = (void *)((char *)ep->cm_priv_data + rdma_cm_hdr_len); + vrb_msg_ep_prepare_cm_data(param, paramlen, cm_hdr); + vrb_ep_prepare_rdma_cm_param(&ep->conn_param, ep->cm_priv_data, + priv_data_len); + ep->conn_param.retry_count = 15; + + if (ep->srq_ep) + ep->conn_param.srq = 1; + + if (rdma_resolve_route(ep->id, VERBS_RESOLVE_TIMEOUT)) { + ret = -errno; + FI_WARN(&vrb_prov, FI_LOG_EP_CTRL, + "rdma_resolve_route failed: %s (%d)\n", + strerror(-ret), -ret); + free(ep->cm_priv_data); + ep->cm_priv_data = NULL; + return ret; } - - return rdma_connect(_ep->id, &conn_param) ? -errno : 0; + return 0; } static int -fi_ibv_msg_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen) +vrb_msg_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen) { struct rdma_conn_param conn_param; - struct fi_ibv_connreq *connreq; + struct vrb_connreq *connreq; int ret; - struct fi_ibv_cm_data_hdr *cm_hdr; - struct fi_ibv_ep *_ep = - container_of(ep, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_cm_data_hdr *cm_hdr; + struct vrb_ep *_ep = + container_of(ep, struct vrb_ep, util_ep.ep_fid); if (OFI_UNLIKELY(paramlen > VERBS_CM_DATA_SIZE)) return -FI_EINVAL; @@ -201,8 +224,8 @@ fi_ibv_msg_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen) } cm_hdr = alloca(sizeof(*cm_hdr) + paramlen); - fi_ibv_msg_ep_prepare_cm_data(param, paramlen, cm_hdr); - fi_ibv_ep_prepare_rdma_cm_param(&conn_param, cm_hdr, + vrb_msg_ep_prepare_cm_data(param, paramlen, cm_hdr); + vrb_ep_prepare_rdma_cm_param(&conn_param, cm_hdr, sizeof(*cm_hdr) + paramlen); if (_ep->srq_ep) @@ -212,21 +235,21 @@ fi_ibv_msg_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen) if (ret) return -errno; - connreq = container_of(_ep->info->handle, struct fi_ibv_connreq, handle); + connreq = container_of(_ep->info_attr.handle, struct vrb_connreq, handle); free(connreq); return 0; } -static int fi_ibv_msg_alloc_xrc_params(void **adjusted_param, +static int vrb_msg_alloc_xrc_params(void **adjusted_param, const void *param, size_t *paramlen) { - struct fi_ibv_xrc_cm_data *cm_data; + struct vrb_xrc_cm_data *cm_data; size_t cm_datalen = sizeof(*cm_data) + *paramlen; *adjusted_param = NULL; - if (cm_datalen > FI_IBV_CM_DATA_SIZE) { + if (cm_datalen > VRB_CM_DATA_SIZE) { VERBS_WARN(FI_LOG_EP_CTRL, "XRC CM data overflow %zu\n", cm_datalen); return -FI_EINVAL; @@ -247,18 +270,18 @@ static int fi_ibv_msg_alloc_xrc_params(void **adjusted_param, } static int -fi_ibv_msg_xrc_ep_reject(struct fi_ibv_connreq *connreq, +vrb_msg_xrc_ep_reject(struct vrb_connreq *connreq, const void *param, size_t paramlen) { - struct fi_ibv_xrc_cm_data *cm_data; + struct vrb_xrc_cm_data *cm_data; int ret; - ret = fi_ibv_msg_alloc_xrc_params((void **)&cm_data, param, ¶mlen); + ret = vrb_msg_alloc_xrc_params((void **)&cm_data, param, ¶mlen); if (ret) return ret; - fi_ibv_set_xrc_cm_data(cm_data, connreq->xrc.is_reciprocal, - connreq->xrc.conn_tag, connreq->xrc.port, 0); + vrb_set_xrc_cm_data(cm_data, connreq->xrc.is_reciprocal, + connreq->xrc.conn_tag, connreq->xrc.port, 0, 0); ret = rdma_reject(connreq->id, cm_data, (uint8_t) paramlen) ? 
-errno : 0; free(cm_data); @@ -266,59 +289,63 @@ fi_ibv_msg_xrc_ep_reject(struct fi_ibv_connreq *connreq, } static int -fi_ibv_msg_ep_reject(struct fid_pep *pep, fid_t handle, +vrb_msg_ep_reject(struct fid_pep *pep, fid_t handle, const void *param, size_t paramlen) { - struct fi_ibv_connreq *connreq = - container_of(handle, struct fi_ibv_connreq, handle); - struct fi_ibv_cm_data_hdr *cm_hdr; + struct vrb_connreq *connreq = + container_of(handle, struct vrb_connreq, handle); + struct vrb_cm_data_hdr *cm_hdr; + struct vrb_pep *_pep = container_of(pep, struct vrb_pep, + pep_fid); int ret; if (OFI_UNLIKELY(paramlen > VERBS_CM_DATA_SIZE)) return -FI_EINVAL; cm_hdr = alloca(sizeof(*cm_hdr) + paramlen); - fi_ibv_msg_ep_prepare_cm_data(param, paramlen, cm_hdr); + vrb_msg_ep_prepare_cm_data(param, paramlen, cm_hdr); + fastlock_acquire(&_pep->eq->lock); if (connreq->is_xrc) - ret = fi_ibv_msg_xrc_ep_reject(connreq, cm_hdr, + ret = vrb_msg_xrc_ep_reject(connreq, cm_hdr, (uint8_t)(sizeof(*cm_hdr) + paramlen)); - else ret = rdma_reject(connreq->id, cm_hdr, (uint8_t)(sizeof(*cm_hdr) + paramlen)) ? -errno : 0; + fastlock_release(&_pep->eq->lock); + free(connreq); return ret; } -static int fi_ibv_msg_ep_shutdown(struct fid_ep *ep, uint64_t flags) +static int vrb_msg_ep_shutdown(struct fid_ep *ep, uint64_t flags) { - struct fi_ibv_ep *_ep = - container_of(ep, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *_ep = + container_of(ep, struct vrb_ep, util_ep.ep_fid); if (_ep->id) return rdma_disconnect(_ep->id) ? -errno : 0; return 0; } -struct fi_ops_cm fi_ibv_msg_ep_cm_ops = { +struct fi_ops_cm vrb_msg_ep_cm_ops = { .size = sizeof(struct fi_ops_cm), - .setname = fi_ibv_msg_ep_setname, - .getname = fi_ibv_msg_ep_getname, - .getpeer = fi_ibv_msg_ep_getpeer, - .connect = fi_ibv_msg_ep_connect, + .setname = vrb_msg_ep_setname, + .getname = vrb_msg_ep_getname, + .getpeer = vrb_msg_ep_getpeer, + .connect = vrb_msg_ep_connect, .listen = fi_no_listen, - .accept = fi_ibv_msg_ep_accept, + .accept = vrb_msg_ep_accept, .reject = fi_no_reject, - .shutdown = fi_ibv_msg_ep_shutdown, + .shutdown = vrb_msg_ep_shutdown, .join = fi_no_join, }; static int -fi_ibv_msg_xrc_cm_common_verify(struct fi_ibv_xrc_ep *ep, size_t paramlen) +vrb_msg_xrc_cm_common_verify(struct vrb_xrc_ep *ep, size_t paramlen) { int ret; - if (!fi_ibv_is_xrc(ep->base_ep.info)) { + if (!vrb_is_xrc_ep(&ep->base_ep)) { VERBS_WARN(FI_LOG_EP_CTRL, "EP is not using XRC\n"); return -FI_EINVAL; } @@ -331,101 +358,110 @@ fi_ibv_msg_xrc_cm_common_verify(struct fi_ibv_xrc_ep *ep, size_t paramlen) } if (OFI_UNLIKELY(paramlen > VERBS_CM_DATA_SIZE - - sizeof(struct fi_ibv_xrc_cm_data))) + sizeof(struct vrb_xrc_cm_data))) return -FI_EINVAL; return FI_SUCCESS; } static int -fi_ibv_msg_xrc_ep_connect(struct fid_ep *ep, const void *addr, +vrb_msg_xrc_ep_connect(struct fid_ep *ep, const void *addr, const void *param, size_t paramlen) { - struct sockaddr *dst_addr; void *adjusted_param; - struct fi_ibv_ep *_ep = container_of(ep, struct fi_ibv_ep, + struct vrb_ep *_ep = container_of(ep, struct vrb_ep, util_ep.ep_fid); - struct fi_ibv_xrc_ep *xrc_ep = container_of(_ep, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *xrc_ep = container_of(_ep, struct vrb_xrc_ep, base_ep); int ret; - struct fi_ibv_cm_data_hdr *cm_hdr; + struct vrb_cm_data_hdr *cm_hdr; - ret = fi_ibv_msg_xrc_cm_common_verify(xrc_ep, paramlen); + ret = vrb_msg_xrc_cm_common_verify(xrc_ep, paramlen); if (ret) return ret; - cm_hdr = alloca(sizeof(*cm_hdr) + paramlen); - fi_ibv_msg_ep_prepare_cm_data(param, paramlen, 
cm_hdr); + cm_hdr = malloc(sizeof(*cm_hdr) + paramlen); + if (!cm_hdr) + return -FI_ENOMEM; + + vrb_msg_ep_prepare_cm_data(param, paramlen, cm_hdr); paramlen += sizeof(*cm_hdr); - ret = fi_ibv_msg_alloc_xrc_params(&adjusted_param, cm_hdr, ¶mlen); - if (ret) + ret = vrb_msg_alloc_xrc_params(&adjusted_param, cm_hdr, ¶mlen); + if (ret) { + free(cm_hdr); return ret; + } xrc_ep->conn_setup = calloc(1, sizeof(*xrc_ep->conn_setup)); if (!xrc_ep->conn_setup) { + VERBS_WARN(FI_LOG_EP_CTRL, + "Unable to allocate connection setup memory\n"); free(adjusted_param); + free(cm_hdr); return -FI_ENOMEM; } + xrc_ep->conn_setup->conn_tag = VERBS_CONN_TAG_INVALID; fastlock_acquire(&xrc_ep->base_ep.eq->lock); - xrc_ep->conn_setup->conn_tag = VERBS_CONN_TAG_INVALID; - fi_ibv_eq_set_xrc_conn_tag(xrc_ep); + ret = vrb_connect_xrc(xrc_ep, NULL, 0, adjusted_param, paramlen); fastlock_release(&xrc_ep->base_ep.eq->lock); - dst_addr = rdma_get_peer_addr(_ep->id); - ret = fi_ibv_connect_xrc(xrc_ep, dst_addr, 0, adjusted_param, paramlen); free(adjusted_param); + free(cm_hdr); return ret; } static int -fi_ibv_msg_xrc_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen) +vrb_msg_xrc_ep_accept(struct fid_ep *ep, const void *param, size_t paramlen) { void *adjusted_param; - struct fi_ibv_ep *_ep = - container_of(ep, struct fi_ibv_ep, util_ep.ep_fid); - struct fi_ibv_xrc_ep *xrc_ep = container_of(_ep, struct fi_ibv_xrc_ep, + struct vrb_ep *_ep = + container_of(ep, struct vrb_ep, util_ep.ep_fid); + struct vrb_xrc_ep *xrc_ep = container_of(_ep, struct vrb_xrc_ep, base_ep); int ret; - struct fi_ibv_cm_data_hdr *cm_hdr; + struct vrb_cm_data_hdr *cm_hdr; - ret = fi_ibv_msg_xrc_cm_common_verify(xrc_ep, paramlen); + ret = vrb_msg_xrc_cm_common_verify(xrc_ep, paramlen); if (ret) return ret; cm_hdr = alloca(sizeof(*cm_hdr) + paramlen); - fi_ibv_msg_ep_prepare_cm_data(param, paramlen, cm_hdr); + vrb_msg_ep_prepare_cm_data(param, paramlen, cm_hdr); paramlen += sizeof(*cm_hdr); - ret = fi_ibv_msg_alloc_xrc_params(&adjusted_param, cm_hdr, ¶mlen); + ret = vrb_msg_alloc_xrc_params(&adjusted_param, cm_hdr, ¶mlen); if (ret) return ret; - ret = fi_ibv_accept_xrc(xrc_ep, 0, adjusted_param, paramlen); + fastlock_acquire(&xrc_ep->base_ep.eq->lock); + ret = vrb_accept_xrc(xrc_ep, 0, adjusted_param, paramlen); + fastlock_release(&xrc_ep->base_ep.eq->lock); + free(adjusted_param); return ret; } -struct fi_ops_cm fi_ibv_msg_xrc_ep_cm_ops = { +struct fi_ops_cm vrb_msg_xrc_ep_cm_ops = { .size = sizeof(struct fi_ops_cm), - .setname = fi_ibv_msg_ep_setname, - .getname = fi_ibv_msg_ep_getname, - .getpeer = fi_ibv_msg_ep_getpeer, - .connect = fi_ibv_msg_xrc_ep_connect, + .setname = vrb_msg_ep_setname, + .getname = vrb_msg_ep_getname, + .getpeer = vrb_msg_ep_getpeer, + .connect = vrb_msg_xrc_ep_connect, .listen = fi_no_listen, - .accept = fi_ibv_msg_xrc_ep_accept, + .accept = vrb_msg_xrc_ep_accept, .reject = fi_no_reject, - .shutdown = fi_ibv_msg_ep_shutdown, + .shutdown = vrb_msg_ep_shutdown, .join = fi_no_join, }; -static int fi_ibv_pep_setname(fid_t pep_fid, void *addr, size_t addrlen) +static int vrb_pep_setname(fid_t pep_fid, void *addr, size_t addrlen) { - struct fi_ibv_pep *pep; + struct vrb_pep *pep; int ret; - pep = container_of(pep_fid, struct fi_ibv_pep, pep_fid); + pep = container_of(pep_fid, struct vrb_pep, pep_fid); if (pep->src_addrlen && (addrlen != pep->src_addrlen)) { VERBS_INFO(FI_LOG_FABRIC, "addrlen expected: %zu, got: %zu.\n", @@ -459,47 +495,54 @@ static int fi_ibv_pep_setname(fid_t pep_fid, void *addr, size_t addrlen) 
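Note on the XRC CM private data handled above: both vrb_msg_xrc_ep_connect() and vrb_msg_xrc_ep_accept() first wrap the caller's data in a struct vrb_cm_data_hdr and then hand it to vrb_msg_alloc_xrc_params(), which prepends a struct vrb_xrc_cm_data and rejects anything that no longer fits the CM private data. A minimal sketch of that size accounting follows, for orientation only; the helper name is illustrative and not part of this patch.

/* Layout built above: [vrb_xrc_cm_data][vrb_cm_data_hdr][user bytes] */
static int xrc_cm_data_fits(size_t user_paramlen)
{
	size_t hdr_len = sizeof(struct vrb_cm_data_hdr) + user_paramlen;
	size_t total = sizeof(struct vrb_xrc_cm_data) + hdr_len;

	/* vrb_msg_alloc_xrc_params() fails with -FI_EINVAL on overflow */
	return (total <= VRB_CM_DATA_SIZE) ? 0 : -FI_EINVAL;
}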
return 0; } -static int fi_ibv_pep_getname(fid_t pep, void *addr, size_t *addrlen) +static int vrb_pep_getname(fid_t pep, void *addr, size_t *addrlen) { - struct fi_ibv_pep *_pep; + struct vrb_pep *_pep; struct sockaddr *sa; - _pep = container_of(pep, struct fi_ibv_pep, pep_fid); + _pep = container_of(pep, struct vrb_pep, pep_fid); sa = rdma_get_local_addr(_pep->id); - return fi_ibv_copy_addr(addr, addrlen, sa); + return vrb_copy_addr(addr, addrlen, sa); } -static int fi_ibv_pep_listen(struct fid_pep *pep_fid) +static int vrb_pep_listen(struct fid_pep *pep_fid) { - struct fi_ibv_pep *pep; + struct vrb_pep *pep; struct sockaddr *addr; + int ret; - pep = container_of(pep_fid, struct fi_ibv_pep, pep_fid); + pep = container_of(pep_fid, struct vrb_pep, pep_fid); addr = rdma_get_local_addr(pep->id); - if (addr) { - VERBS_INFO(FI_LOG_CORE, "Listening on %s:%d\n", - inet_ntoa(((struct sockaddr_in *)addr)->sin_addr), - ntohs(((struct sockaddr_in *)addr)->sin_port)); - } + ofi_straddr_log(&vrb_prov, FI_LOG_INFO, + FI_LOG_EP_CTRL, "listening on", addr); - return rdma_listen(pep->id, pep->backlog) ? -errno : 0; + ret = rdma_listen(pep->id, pep->backlog); + if (ret) + return -errno; + + if (vrb_is_xrc_info(pep->info)) { + ret = rdma_listen(pep->xrc_ps_udp_id, pep->backlog); + if (ret) + ret = -errno; + } + return ret; } -static struct fi_ops_cm fi_ibv_pep_cm_ops = { +static struct fi_ops_cm vrb_pep_cm_ops = { .size = sizeof(struct fi_ops_cm), - .setname = fi_ibv_pep_setname, - .getname = fi_ibv_pep_getname, + .setname = vrb_pep_setname, + .getname = vrb_pep_getname, .getpeer = fi_no_getpeer, .connect = fi_no_connect, - .listen = fi_ibv_pep_listen, + .listen = vrb_pep_listen, .accept = fi_no_accept, - .reject = fi_ibv_msg_ep_reject, + .reject = vrb_msg_ep_reject, .shutdown = fi_no_shutdown, .join = fi_no_join, }; -struct fi_ops_cm *fi_ibv_pep_ops_cm(struct fi_ibv_pep *pep) +struct fi_ops_cm *vrb_pep_ops_cm(struct vrb_pep *pep) { - return &fi_ibv_pep_cm_ops; + return &vrb_pep_cm_ops; } diff --git a/prov/verbs/src/verbs_cm_xrc.c b/prov/verbs/src/verbs_cm_xrc.c index 94f6198fa02..d8313075768 100644 --- a/prov/verbs/src/verbs_cm_xrc.c +++ b/prov/verbs/src/verbs_cm_xrc.c @@ -1,5 +1,6 @@ /* - * Copyright (c) 2018 Cray Inc. All rights reserved. + * Copyright (c) 2018-2019 Cray Inc. All rights reserved. + * Copyright (c) 2018-2019 System Fabric Works, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -33,22 +34,23 @@ #include "config.h" #include "fi_verbs.h" -void fi_ibv_next_xrc_conn_state(struct fi_ibv_xrc_ep *ep) +void vrb_next_xrc_conn_state(struct vrb_xrc_ep *ep) { switch (ep->conn_state) { - case FI_IBV_XRC_UNCONNECTED: - ep->conn_state = FI_IBV_XRC_ORIG_CONNECTING; + case VRB_XRC_UNCONNECTED: + ep->conn_state = VRB_XRC_ORIG_CONNECTING; break; - case FI_IBV_XRC_ORIG_CONNECTING: - ep->conn_state = FI_IBV_XRC_ORIG_CONNECTED; + case VRB_XRC_ORIG_CONNECTING: + ep->conn_state = VRB_XRC_ORIG_CONNECTED; break; - case FI_IBV_XRC_ORIG_CONNECTED: - ep->conn_state = FI_IBV_XRC_RECIP_CONNECTING; + case VRB_XRC_ORIG_CONNECTED: + ep->conn_state = VRB_XRC_RECIP_CONNECTING; break; - case FI_IBV_XRC_RECIP_CONNECTING: - ep->conn_state = FI_IBV_XRC_CONNECTED; + case VRB_XRC_RECIP_CONNECTING: + ep->conn_state = VRB_XRC_CONNECTED; break; - case FI_IBV_XRC_CONNECTED: + case VRB_XRC_CONNECTED: + case VRB_XRC_ERROR: break; default: assert(0); @@ -57,22 +59,24 @@ void fi_ibv_next_xrc_conn_state(struct fi_ibv_xrc_ep *ep) } } -void fi_ibv_prev_xrc_conn_state(struct fi_ibv_xrc_ep *ep) +void vrb_prev_xrc_conn_state(struct vrb_xrc_ep *ep) { switch (ep->conn_state) { - case FI_IBV_XRC_UNCONNECTED: + case VRB_XRC_UNCONNECTED: break; - case FI_IBV_XRC_ORIG_CONNECTING: - ep->conn_state = FI_IBV_XRC_UNCONNECTED; + case VRB_XRC_ORIG_CONNECTING: + ep->conn_state = VRB_XRC_UNCONNECTED; break; - case FI_IBV_XRC_ORIG_CONNECTED: - ep->conn_state = FI_IBV_XRC_ORIG_CONNECTING; + case VRB_XRC_ORIG_CONNECTED: + ep->conn_state = VRB_XRC_ORIG_CONNECTING; break; - case FI_IBV_XRC_RECIP_CONNECTING: - ep->conn_state = FI_IBV_XRC_ORIG_CONNECTED; + case VRB_XRC_RECIP_CONNECTING: + ep->conn_state = VRB_XRC_ORIG_CONNECTED; break; - case FI_IBV_XRC_CONNECTED: - ep->conn_state = FI_IBV_XRC_RECIP_CONNECTING; + case VRB_XRC_CONNECTED: + ep->conn_state = VRB_XRC_RECIP_CONNECTING; + break; + case VRB_XRC_ERROR: break; default: assert(0); @@ -81,7 +85,7 @@ void fi_ibv_prev_xrc_conn_state(struct fi_ibv_xrc_ep *ep) } } -void fi_ibv_save_priv_data(struct fi_ibv_xrc_ep *ep, const void *data, +void vrb_save_priv_data(struct vrb_xrc_ep *ep, const void *data, size_t len) { ep->conn_setup->event_len = MIN(sizeof(ep->conn_setup->event_data), @@ -89,17 +93,19 @@ void fi_ibv_save_priv_data(struct fi_ibv_xrc_ep *ep, const void *data, memcpy(ep->conn_setup->event_data, data, ep->conn_setup->event_len); } -void fi_ibv_set_xrc_cm_data(struct fi_ibv_xrc_cm_data *local, int reciprocal, - uint32_t conn_tag, uint16_t port, uint32_t param) +void vrb_set_xrc_cm_data(struct vrb_xrc_cm_data *local, int reciprocal, + uint32_t conn_tag, uint16_t port, uint32_t tgt_qpn, + uint32_t srqn) { - local->version = FI_IBV_XRC_VERSION; + local->version = VRB_XRC_VERSION; local->reciprocal = reciprocal ? 
1 : 0; local->port = htons(port); local->conn_tag = htonl(conn_tag); - local->param = htonl(param); + local->tgt_qpn = htonl(tgt_qpn); + local->srqn = htonl(srqn); } -int fi_ibv_verify_xrc_cm_data(struct fi_ibv_xrc_cm_data *remote, +int vrb_verify_xrc_cm_data(struct vrb_xrc_cm_data *remote, int private_data_len) { if (sizeof(*remote) > private_data_len) { @@ -108,157 +114,140 @@ int fi_ibv_verify_xrc_cm_data(struct fi_ibv_xrc_cm_data *remote, return -FI_EINVAL; } - if (remote->version != FI_IBV_XRC_VERSION) { + if (remote->version != VRB_XRC_VERSION) { VERBS_WARN(FI_LOG_EP_CTRL, "XRC MSG EP connection protocol mismatch " "(local %"PRIu8", remote %"PRIu8")\n", - FI_IBV_XRC_VERSION, remote->version); + VRB_XRC_VERSION, remote->version); return -FI_EINVAL; } return FI_SUCCESS; } -void fi_ibv_log_ep_conn(struct fi_ibv_xrc_ep *ep, char *desc) +static void vrb_log_ep_conn(struct vrb_xrc_ep *ep, char *desc) { struct sockaddr *addr; char buf[OFI_ADDRSTRLEN]; - size_t len = sizeof(buf); + size_t len; - if (!fi_log_enabled(&fi_ibv_prov, FI_LOG_INFO, FI_LOG_FABRIC)) + if (!fi_log_enabled(&vrb_prov, FI_LOG_INFO, FI_LOG_EP_CTRL)) return; - VERBS_INFO(FI_LOG_FABRIC, "EP %p, %s\n", ep, desc); - VERBS_INFO(FI_LOG_FABRIC, + VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, %s\n", (void *) ep, desc); + VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, CM ID %p, TGT CM ID %p, SRQN %d Peer SRQN %d\n", - ep, ep->base_ep.id, ep->tgt_id, ep->srqn, ep->peer_srqn); + (void*) ep, (void *) ep->base_ep.id, (void *) ep->tgt_id, + ep->srqn, ep->peer_srqn); - assert(ep->base_ep.id); - addr = rdma_get_local_addr(ep->base_ep.id); - if (addr) { - ofi_straddr(buf, &len, ep->base_ep.info->addr_format, addr); - VERBS_INFO(FI_LOG_FABRIC, "EP %p src_addr: %s\n", ep, buf); - } - addr = rdma_get_peer_addr(ep->base_ep.id); - if (addr) { + if (ep->base_ep.id) { + addr = rdma_get_local_addr(ep->base_ep.id); + len = sizeof(buf); + ofi_straddr(buf, &len, ep->base_ep.info_attr.addr_format, addr); + VERBS_INFO(FI_LOG_EP_CTRL, "EP %p src_addr: %s\n", + (void *) ep, buf); + + addr = rdma_get_peer_addr(ep->base_ep.id); len = sizeof(buf); - ofi_straddr(buf, &len, ep->base_ep.info->addr_format, addr); - VERBS_INFO(FI_LOG_FABRIC, "EP %p dst_addr: %s\n", ep, buf); + ofi_straddr(buf, &len, ep->base_ep.info_attr.addr_format, addr); + VERBS_INFO(FI_LOG_EP_CTRL, "EP %p dst_addr: %s\n", + (void *) ep, buf); } if (ep->base_ep.ibv_qp) { - VERBS_INFO(FI_LOG_FABRIC, "EP %p, INI QP Num %d\n", - ep, ep->base_ep.ibv_qp->qp_num); - VERBS_INFO(FI_LOG_FABRIC, "EP %p, Remote TGT QP Num %d\n", ep, - ep->ini_conn->tgt_qpn); + VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, INI QP Num %d\n", + (void *) ep, ep->base_ep.ibv_qp->qp_num); + VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, Remote TGT QP Num %d\n", + (void *) ep, ep->ini_conn->tgt_qpn); } if (ep->tgt_ibv_qp) - VERBS_INFO(FI_LOG_FABRIC, "EP %p, TGT QP Num %d\n", - ep, ep->tgt_ibv_qp->qp_num); - if (ep->conn_setup && ep->conn_setup->rsvd_ini_qpn) - VERBS_INFO(FI_LOG_FABRIC, "EP %p, Reserved INI QPN %d\n", - ep, ep->conn_setup->rsvd_ini_qpn->qp_num); - if (ep->conn_setup && ep->conn_setup->rsvd_tgt_qpn) - VERBS_INFO(FI_LOG_FABRIC, "EP %p, Reserved TGT QPN %d\n", - ep, ep->conn_setup->rsvd_tgt_qpn->qp_num); + VERBS_INFO(FI_LOG_EP_CTRL, "EP %p, TGT QP Num %d\n", + (void *) ep, ep->tgt_ibv_qp->qp_num); } /* Caller must hold eq:lock */ -void fi_ibv_free_xrc_conn_setup(struct fi_ibv_xrc_ep *ep, int disconnect) +void vrb_free_xrc_conn_setup(struct vrb_xrc_ep *ep, int disconnect) { assert(ep->conn_setup); - /* Free shared connection reserved QP number resources. 
If - * a disconnect is requested and required then initiate a - * disconnect sequence (the XRC INI QP side disconnect is - * initiated when the remote target disconnect is received). - * If disconnecting, the QP resources will be destroyed when - * the timewait state has been exited or the EP is closed. */ - if (ep->conn_setup->rsvd_ini_qpn && !disconnect) { - assert(ep->base_ep.id); - assert(!ep->base_ep.id->qp); - - ibv_destroy_qp(ep->conn_setup->rsvd_ini_qpn); - ep->conn_setup->rsvd_ini_qpn = NULL; - } - - if (ep->conn_setup->rsvd_tgt_qpn) { + /* If a disconnect is requested then the XRC bidirectional connection + * has completed and a disconnect sequence is started (the XRC INI QP + * side disconnect is initiated when the remote target disconnect is + * received). */ + if (disconnect) { assert(ep->tgt_id); assert(!ep->tgt_id->qp); - if (disconnect && ep->conn_setup->tgt_connected) { - rdma_disconnect(ep->tgt_id); - ep->conn_setup->tgt_connected = 0; + if (ep->tgt_id->ps == RDMA_PS_UDP) { + rdma_destroy_id(ep->tgt_id); + ep->tgt_id = NULL; } else { - ibv_destroy_qp(ep->conn_setup->rsvd_tgt_qpn); - ep->conn_setup->rsvd_tgt_qpn = NULL; + rdma_disconnect(ep->tgt_id); } - } - if (ep->conn_setup->conn_tag != VERBS_CONN_TAG_INVALID) - fi_ibv_eq_clear_xrc_conn_tag(ep); + if (ep->base_ep.id->ps == RDMA_PS_UDP) { + rdma_destroy_id(ep->base_ep.id); + ep->base_ep.id = NULL; + } + } + vrb_eq_clear_xrc_conn_tag(ep); if (!disconnect) { free(ep->conn_setup); ep->conn_setup = NULL; + free(ep->base_ep.info_attr.src_addr); + ep->base_ep.info_attr.src_addr = NULL; + ep->base_ep.info_attr.src_addrlen = 0; } } -int fi_ibv_connect_xrc(struct fi_ibv_xrc_ep *ep, struct sockaddr *addr, +/* Caller must hold the eq:lock */ +int vrb_connect_xrc(struct vrb_xrc_ep *ep, struct sockaddr *addr, int reciprocal, void *param, size_t paramlen) { - struct fi_ibv_domain *domain = fi_ibv_ep_to_domain(&ep->base_ep); - struct sockaddr *peer_addr; + struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep); int ret; - assert(ep->base_ep.id && !ep->base_ep.ibv_qp && !ep->ini_conn); - - peer_addr = rdma_get_local_addr(ep->base_ep.id); - if (peer_addr) - ofi_straddr_dbg(&fi_ibv_prov, FI_LOG_FABRIC, - "XRC connect src_addr", peer_addr); + assert(!ep->base_ep.id && !ep->base_ep.ibv_qp && !ep->ini_conn); - peer_addr = rdma_get_peer_addr(ep->base_ep.id); - if (peer_addr) - ofi_straddr_dbg(&fi_ibv_prov, FI_LOG_FABRIC, - "XRC connect dest_addr", peer_addr); - - fastlock_acquire(&domain->xrc.ini_mgmt_lock); - ret = fi_ibv_get_shared_ini_conn(ep, &ep->ini_conn); + domain->xrc.lock_acquire(&domain->xrc.ini_lock); + ret = vrb_get_shared_ini_conn(ep, &ep->ini_conn); if (ret) { VERBS_WARN(FI_LOG_EP_CTRL, "Get of shared XRC INI connection failed %d\n", ret); - fastlock_release(&domain->xrc.ini_mgmt_lock); if (!reciprocal) { free(ep->conn_setup); ep->conn_setup = NULL; } + domain->xrc.lock_release(&domain->xrc.ini_lock); return ret; } - fi_ibv_add_pending_ini_conn(ep, reciprocal, param, paramlen); - fi_ibv_sched_ini_conn(ep->ini_conn); - fastlock_release(&domain->xrc.ini_mgmt_lock); + + vrb_eq_set_xrc_conn_tag(ep); + vrb_add_pending_ini_conn(ep, reciprocal, param, paramlen); + vrb_sched_ini_conn(ep->ini_conn); + domain->xrc.lock_release(&domain->xrc.ini_lock); return FI_SUCCESS; } -void fi_ibv_ep_ini_conn_done(struct fi_ibv_xrc_ep *ep, uint32_t peer_srqn, - uint32_t tgt_qpn) +/* Caller must hold the eq:lock */ +void vrb_ep_ini_conn_done(struct vrb_xrc_ep *ep, uint32_t tgt_qpn) { - struct fi_ibv_domain *domain = 
fi_ibv_ep_to_domain(&ep->base_ep); + struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep); assert(ep->base_ep.id && ep->ini_conn); - fastlock_acquire(&domain->xrc.ini_mgmt_lock); - - assert(ep->ini_conn->state == FI_IBV_INI_QP_CONNECTING || - ep->ini_conn->state == FI_IBV_INI_QP_CONNECTED); + domain->xrc.lock_acquire(&domain->xrc.ini_lock); + assert(ep->ini_conn->state == VRB_INI_QP_CONNECTING || + ep->ini_conn->state == VRB_INI_QP_CONNECTED); /* If this was a physical INI/TGT QP connection, remove the QP * from control of the RDMA CM. We don't want the shared INI QP * to be destroyed if this endpoint closes. */ - if (ep->base_ep.id->qp) { - ep->ini_conn->state = FI_IBV_INI_QP_CONNECTED; + if (ep->base_ep.id == ep->ini_conn->phys_conn_id) { + ep->ini_conn->phys_conn_id = NULL; + ep->ini_conn->state = VRB_INI_QP_CONNECTED; ep->ini_conn->tgt_qpn = tgt_qpn; ep->base_ep.id->qp = NULL; VERBS_DBG(FI_LOG_EP_CTRL, @@ -267,64 +256,91 @@ void fi_ibv_ep_ini_conn_done(struct fi_ibv_xrc_ep *ep, uint32_t peer_srqn, ep->ini_conn->tgt_qpn); } - ep->conn_setup->ini_connected = 1; - fi_ibv_log_ep_conn(ep, "INI Connection Done"); - fi_ibv_sched_ini_conn(ep->ini_conn); - fastlock_release(&domain->xrc.ini_mgmt_lock); + vrb_log_ep_conn(ep, "INI Connection Done"); + vrb_sched_ini_conn(ep->ini_conn); + domain->xrc.lock_release(&domain->xrc.ini_lock); } -void fi_ibv_ep_ini_conn_rejected(struct fi_ibv_xrc_ep *ep) +/* Caller must hold the eq:lock */ +void vrb_ep_ini_conn_rejected(struct vrb_xrc_ep *ep) { - struct fi_ibv_domain *domain = fi_ibv_ep_to_domain(&ep->base_ep); - assert(ep->base_ep.id && ep->ini_conn); - fastlock_acquire(&domain->xrc.ini_mgmt_lock); - fi_ibv_log_ep_conn(ep, "INI Connection Rejected"); - - if (ep->ini_conn->state == FI_IBV_INI_QP_CONNECTING) - ep->ini_conn->state = FI_IBV_INI_QP_UNCONNECTED; - fi_ibv_put_shared_ini_conn(ep); - fastlock_release(&domain->xrc.ini_mgmt_lock); + vrb_log_ep_conn(ep, "INI Connection Rejected"); + vrb_put_shared_ini_conn(ep); + ep->conn_state = VRB_XRC_ERROR; } -void fi_ibv_ep_tgt_conn_done(struct fi_ibv_xrc_ep *ep) +void vrb_ep_tgt_conn_done(struct vrb_xrc_ep *ep) { - fi_ibv_log_ep_conn(ep, "TGT Connection Done\n"); + vrb_log_ep_conn(ep, "TGT Connection Done\n"); if (ep->tgt_id->qp) { assert(ep->tgt_ibv_qp == ep->tgt_id->qp); ep->tgt_id->qp = NULL; } - ep->conn_setup->tgt_connected = 1; } -int fi_ibv_accept_xrc(struct fi_ibv_xrc_ep *ep, int reciprocal, +/* Caller must hold the eq:lock */ +int vrb_resend_shared_accept_xrc(struct vrb_xrc_ep *ep, + struct vrb_connreq *connreq, + struct rdma_cm_id *id) +{ + struct rdma_conn_param conn_param = { 0 }; + struct vrb_xrc_cm_data *cm_data = ep->accept_param_data; + + assert(cm_data && ep->tgt_ibv_qp); + assert(ep->tgt_ibv_qp->qp_num == connreq->xrc.tgt_qpn); + assert(ep->peer_srqn == connreq->xrc.peer_srqn); + + vrb_set_xrc_cm_data(cm_data, connreq->xrc.is_reciprocal, + connreq->xrc.conn_tag, connreq->xrc.port, + 0, ep->srqn); + conn_param.private_data = cm_data; + conn_param.private_data_len = ep->accept_param_len; + + conn_param.responder_resources = RDMA_MAX_RESP_RES; + conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH; + conn_param.flow_control = 1; + conn_param.rnr_retry_count = 7; + if (ep->base_ep.srq_ep) + conn_param.srq = 1; + conn_param.qp_num = ep->tgt_ibv_qp->qp_num; + + return rdma_accept(id, &conn_param); +} + +/* Caller must hold the eq:lock */ +int vrb_accept_xrc(struct vrb_xrc_ep *ep, int reciprocal, void *param, size_t paramlen) { struct sockaddr *addr; - struct fi_ibv_connreq *connreq; + struct 
vrb_connreq *connreq; struct rdma_conn_param conn_param = { 0 }; - struct fi_ibv_xrc_cm_data *cm_data = param; + struct vrb_xrc_cm_data *cm_data = param; + struct vrb_xrc_cm_data connect_cm_data; int ret; addr = rdma_get_local_addr(ep->tgt_id); if (addr) - ofi_straddr_dbg(&fi_ibv_prov, FI_LOG_CORE, "src_addr", addr); + ofi_straddr_dbg(&vrb_prov, FI_LOG_CORE, "src_addr", addr); addr = rdma_get_peer_addr(ep->tgt_id); if (addr) - ofi_straddr_dbg(&fi_ibv_prov, FI_LOG_CORE, "dest_addr", addr); + ofi_straddr_dbg(&vrb_prov, FI_LOG_CORE, "dest_addr", addr); - connreq = container_of(ep->base_ep.info->handle, - struct fi_ibv_connreq, handle); - ret = fi_ibv_ep_create_tgt_qp(ep, connreq->xrc.conn_data); + connreq = container_of(ep->base_ep.info_attr.handle, + struct vrb_connreq, handle); + ret = vrb_ep_create_tgt_qp(ep, connreq->xrc.tgt_qpn); if (ret) return ret; - fi_ibv_set_xrc_cm_data(cm_data, connreq->xrc.is_reciprocal, + ep->peer_srqn = connreq->xrc.peer_srqn; + ep->remote_pep_port = connreq->xrc.port; + ep->recip_accept = connreq->xrc.is_reciprocal; + vrb_set_xrc_cm_data(cm_data, connreq->xrc.is_reciprocal, connreq->xrc.conn_tag, connreq->xrc.port, - ep->srqn); + 0, ep->srqn); conn_param.private_data = cm_data; conn_param.private_data_len = paramlen; conn_param.responder_resources = RDMA_MAX_RESP_RES; @@ -334,62 +350,73 @@ int fi_ibv_accept_xrc(struct fi_ibv_xrc_ep *ep, int reciprocal, if (ep->base_ep.srq_ep) conn_param.srq = 1; - /* Shared INI/TGT QP connection use a temporarily reserved QP number - * avoid the appearance of being a stale/duplicate IB CM message */ if (!ep->tgt_id->qp) - conn_param.qp_num = ep->conn_setup->rsvd_tgt_qpn->qp_num; + conn_param.qp_num = ep->tgt_ibv_qp->qp_num; - if (!connreq->xrc.is_reciprocal) - ep->conn_setup->conn_tag = connreq->xrc.conn_tag; + ep->conn_setup->remote_conn_tag = connreq->xrc.conn_tag; - assert(ep->conn_state == FI_IBV_XRC_UNCONNECTED || - ep->conn_state == FI_IBV_XRC_ORIG_CONNECTED); - fi_ibv_next_xrc_conn_state(ep); + assert(ep->conn_state == VRB_XRC_UNCONNECTED || + ep->conn_state == VRB_XRC_ORIG_CONNECTED); + vrb_next_xrc_conn_state(ep); ret = rdma_accept(ep->tgt_id, &conn_param); if (OFI_UNLIKELY(ret)) { ret = -errno; VERBS_WARN(FI_LOG_EP_CTRL, "XRC TGT, rdma_accept error %d\n", ret); - fi_ibv_prev_xrc_conn_state(ep); - } else - free(connreq); + vrb_prev_xrc_conn_state(ep); + return ret; + } + free(connreq); + + if (ep->tgt_id->ps == RDMA_PS_UDP && + vrb_eq_add_sidr_conn(ep, cm_data, paramlen)) + VERBS_WARN(FI_LOG_EP_CTRL, + "SIDR connection accept not added to map\n"); + + /* The passive side of the initial shared connection using + * SIDR is complete, initiate reciprocal connection */ + if (ep->tgt_id->ps == RDMA_PS_UDP && !reciprocal) { + vrb_next_xrc_conn_state(ep); + vrb_ep_tgt_conn_done(ep); + ret = vrb_connect_xrc(ep, NULL, VRB_RECIP_CONN, + &connect_cm_data, + sizeof(connect_cm_data)); + if (ret) { + VERBS_WARN(FI_LOG_EP_CTRL, + "XRC reciprocal connect error %d\n", ret); + vrb_prev_xrc_conn_state(ep); + ep->tgt_id->qp = NULL; + } + } return ret; } -int fi_ibv_process_xrc_connreq(struct fi_ibv_ep *ep, - struct fi_ibv_connreq *connreq) +int vrb_process_xrc_connreq(struct vrb_ep *ep, + struct vrb_connreq *connreq) { - struct fi_ibv_xrc_ep *xrc_ep = container_of(ep, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *xrc_ep = container_of(ep, struct vrb_xrc_ep, base_ep); - int ret; - assert(ep->info->src_addr); - assert(ep->info->dest_addr); + assert(ep->info_attr.src_addr); + assert(ep->info_attr.dest_addr); xrc_ep->conn_setup = calloc(1, 
sizeof(*xrc_ep->conn_setup)); - if (!xrc_ep->conn_setup) + if (!xrc_ep->conn_setup) { + VERBS_WARN(FI_LOG_EP_CTRL, + "Unable to allocate connection setup memory\n"); return -FI_ENOMEM; + } + xrc_ep->conn_setup->conn_tag = VERBS_CONN_TAG_INVALID; /* This endpoint was created on the passive side of a connection * request. The reciprocal connection request will go back to the * passive port indicated by the active side */ - ofi_addr_set_port(ep->info->src_addr, 0); - ofi_addr_set_port(ep->info->dest_addr, connreq->xrc.port); - - ret = fi_ibv_create_ep(NULL, NULL, 0, ep->info, NULL, &ep->id); - if (ret) { - VERBS_WARN(FI_LOG_EP_CTRL, - "Creation of INI cm_id failed %d\n", ret); - goto create_err; - } + ofi_addr_set_port(ep->info_attr.src_addr, 0); + ofi_addr_set_port(ep->info_attr.dest_addr, connreq->xrc.port); xrc_ep->tgt_id = connreq->id; xrc_ep->tgt_id->context = &ep->util_ep.ep_fid.fid; return FI_SUCCESS; - -create_err: - free(xrc_ep->conn_setup); - return ret; } diff --git a/prov/verbs/src/verbs_cq.c b/prov/verbs/src/verbs_cq.c index 27dde9bb296..14d91bab908 100644 --- a/prov/verbs/src/verbs_cq.c +++ b/prov/verbs/src/verbs_cq.c @@ -37,73 +37,86 @@ #include "fi_verbs.h" -static inline void fi_ibv_handle_wc(struct ibv_wc *wc, uint64_t *flags, - size_t *len, uint64_t *data) +static void vrb_cq_read_context_entry(struct ibv_wc *wc, void *buf) { + struct fi_cq_entry *entry = buf; + + entry->op_context = (void *) (uintptr_t) wc->wr_id; +} + +static void vrb_cq_read_msg_entry(struct ibv_wc *wc, void *buf) +{ + struct fi_cq_msg_entry *entry = buf; + + entry->op_context = (void *) (uintptr_t) wc->wr_id; + switch (wc->opcode) { case IBV_WC_SEND: - *flags = (FI_SEND | FI_MSG); + entry->flags = (FI_SEND | FI_MSG); break; case IBV_WC_RDMA_WRITE: - *flags = (FI_RMA | FI_WRITE); + entry->flags = (FI_RMA | FI_WRITE); break; case IBV_WC_RDMA_READ: - *flags = (FI_RMA | FI_READ); + entry->flags = (FI_RMA | FI_READ); break; case IBV_WC_COMP_SWAP: - *flags = FI_ATOMIC; + entry->flags = FI_ATOMIC; break; case IBV_WC_FETCH_ADD: - *flags = FI_ATOMIC; + entry->flags = FI_ATOMIC; break; case IBV_WC_RECV: - *len = wc->byte_len; - *flags = (FI_RECV | FI_MSG); - if (wc->wc_flags & IBV_WC_WITH_IMM) { - if (data) - *data = ntohl(wc->imm_data); - *flags |= FI_REMOTE_CQ_DATA; - } + entry->len = wc->byte_len; + entry->flags = (FI_RECV | FI_MSG); break; case IBV_WC_RECV_RDMA_WITH_IMM: - *len = wc->byte_len; - *flags = (FI_RMA | FI_REMOTE_WRITE); - if (wc->wc_flags & IBV_WC_WITH_IMM) { - if (data) - *data = ntohl(wc->imm_data); - *flags |= FI_REMOTE_CQ_DATA; - } + entry->len = wc->byte_len; + entry->flags = (FI_RMA | FI_REMOTE_WRITE); break; default: break; } } +static void vrb_cq_read_data_entry(struct ibv_wc *wc, void *buf) +{ + struct fi_cq_data_entry *entry = buf; + + /* fi_cq_data_entry can cast to fi_cq_msg_entry */ + vrb_cq_read_msg_entry(wc, buf); + if ((wc->wc_flags & IBV_WC_WITH_IMM) && + (wc->opcode & IBV_WC_RECV)) { + entry->data = ntohl(wc->imm_data); + entry->flags |= FI_REMOTE_CQ_DATA; + } +} + static ssize_t -fi_ibv_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry, +vrb_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry, uint64_t flags) { - struct fi_ibv_cq *cq; - struct fi_ibv_wce *wce; + struct vrb_cq *cq; + struct vrb_wc_entry *wce; struct slist_entry *slist_entry; uint32_t api_version; - cq = container_of(cq_fid, struct fi_ibv_cq, util_cq.cq_fid); + cq = container_of(cq_fid, struct vrb_cq, util_cq.cq_fid); cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock); - if 
(slist_empty(&cq->wcq)) + if (slist_empty(&cq->saved_wc_list)) goto err; - wce = container_of(cq->wcq.head, struct fi_ibv_wce, entry); + wce = container_of(cq->saved_wc_list.head, struct vrb_wc_entry, entry); if (!wce->wc.status) goto err; api_version = cq->util_cq.domain->fabric->fabric_fid.api_version; - slist_entry = slist_remove_head(&cq->wcq); + slist_entry = slist_remove_head(&cq->saved_wc_list); cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); - wce = container_of(slist_entry, struct fi_ibv_wce, entry); + wce = container_of(slist_entry, struct vrb_wc_entry, entry); entry->op_context = (void *)(uintptr_t)wce->wc.wr_id; entry->prov_errno = wce->wc.status; @@ -111,7 +124,9 @@ fi_ibv_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry, entry->err = FI_ECANCELED; else entry->err = EIO; - fi_ibv_handle_wc(&wce->wc, &entry->flags, &entry->len, &entry->data); + + /* fi_cq_err_entry can cast to fi_cq_data_entry */ + vrb_cq_read_data_entry(&wce->wc, (void *) entry); if ((FI_VERSION_GE(api_version, FI_VERSION(1, 5))) && entry->err_data && entry->err_data_size) { @@ -131,7 +146,7 @@ fi_ibv_cq_readerr(struct fid_cq *cq_fid, struct fi_cq_err_entry *entry, } static inline int -fi_ibv_poll_events(struct fi_ibv_cq *_cq, int timeout) +vrb_poll_events(struct vrb_cq *_cq, int timeout) { int ret, rc; void *context; @@ -173,16 +188,16 @@ fi_ibv_poll_events(struct fi_ibv_cq *_cq, int timeout) } static ssize_t -fi_ibv_cq_sread(struct fid_cq *cq, void *buf, size_t count, const void *cond, +vrb_cq_sread(struct fid_cq *cq, void *buf, size_t count, const void *cond, int timeout) { ssize_t ret = 0, cur; ssize_t threshold; - struct fi_ibv_cq *_cq; + struct vrb_cq *_cq; uint8_t *p; p = buf; - _cq = container_of(cq, struct fi_ibv_cq, util_cq.cq_fid); + _cq = container_of(cq, struct vrb_cq, util_cq.cq_fid); if (!_cq->channel) return -FI_ENOSYS; @@ -191,8 +206,8 @@ fi_ibv_cq_sread(struct fid_cq *cq, void *buf, size_t count, const void *cond, MIN((ssize_t) cond, count) : 1; for (cur = 0; cur < threshold; ) { - if (fi_ibv_cq_trywait(_cq) == FI_SUCCESS) { - ret = fi_ibv_poll_events(_cq, timeout); + if (vrb_cq_trywait(_cq) == FI_SUCCESS) { + ret = vrb_poll_events(_cq, timeout); if (ret) break; } @@ -211,129 +226,121 @@ fi_ibv_cq_sread(struct fid_cq *cq, void *buf, size_t count, const void *cond, return cur ? cur : ret; } -static void fi_ibv_cq_read_context_entry(struct ibv_wc *wc, void *buf) +/* Must be called with CQ lock held. 
*/ +int vrb_poll_cq(struct vrb_cq *cq, struct ibv_wc *wc) { - struct fi_cq_entry *entry = buf; + struct vrb_context *ctx; + int ret; - entry->op_context = (void *)(uintptr_t)wc->wr_id; -} + do { + ret = ibv_poll_cq(cq->cq, 1, wc); + if (ret <= 0) + break; -static void fi_ibv_cq_read_msg_entry(struct ibv_wc *wc, void *buf) -{ - struct fi_cq_msg_entry *entry = buf; + ctx = (struct vrb_context *) (uintptr_t) wc->wr_id; + wc->wr_id = (uintptr_t) ctx->user_ctx; + if (ctx->flags & FI_TRANSMIT) { + cq->credits++; + ctx->ep->sq_credits++; + } - entry->op_context = (void *)(uintptr_t)wc->wr_id; - fi_ibv_handle_wc(wc, &entry->flags, &entry->len, NULL); -} + if (wc->status) { + if (ctx->flags & FI_RECV) + wc->opcode |= IBV_WC_RECV; + else + wc->opcode &= ~IBV_WC_RECV; + } + if (ctx->srx) { + fastlock_acquire(&ctx->srx->ctx_lock); + ofi_buf_free(ctx); + fastlock_release(&ctx->srx->ctx_lock); + } else { + ofi_buf_free(ctx); + } -static void fi_ibv_cq_read_data_entry(struct ibv_wc *wc, void *buf) -{ - struct fi_cq_data_entry *entry = buf; + } while (wc->wr_id == VERBS_NO_COMP_FLAG); - entry->op_context = (void *)(uintptr_t)wc->wr_id; - fi_ibv_handle_wc(wc, &entry->flags, &entry->len, &entry->data); + return ret; } -/* Must call with cq->lock held */ -static inline int fi_ibv_poll_outstanding_cq(struct fi_ibv_ep *ep, - struct fi_ibv_cq *cq) +/* Must be called with CQ lock held. */ +int vrb_save_wc(struct vrb_cq *cq, struct ibv_wc *wc) { - struct fi_ibv_domain *domain = container_of(cq->util_cq.domain, - struct fi_ibv_domain, - util_domain); - struct fi_ibv_wce *wce; - struct ibv_wc wc; - ssize_t ret; - - ret = domain->poll_cq(cq->cq, 1, &wc); - if (ret <= 0) - return ret; - - /* Handle WR entry when user doesn't request the completion */ - if (wc.wr_id == VERBS_NO_COMP_FLAG) { - /* To ensure the new iteration */ - return 1; - } + struct vrb_wc_entry *wce; - ret = fi_ibv_wc_2_wce(cq, &wc, &wce); - if (OFI_UNLIKELY(ret)) { - ret = -FI_EAGAIN; - goto fn; + wce = ofi_buf_alloc(cq->wce_pool); + if (!wce) { + FI_WARN(&vrb_prov, FI_LOG_CQ, + "Unable to save completion, completion lost!\n"); + return -FI_ENOMEM; } - slist_insert_tail(&wce->entry, &cq->wcq); - ret = 1; -fn: - return ret; + wce->wc = *wc; + slist_insert_tail(&wce->entry, &cq->saved_wc_list); + return FI_SUCCESS; } -void fi_ibv_cleanup_cq(struct fi_ibv_ep *ep) +static void vrb_flush_cq(struct vrb_cq *cq) { - int ret; + struct ibv_wc wc; + ssize_t ret; - ep->util_ep.rx_cq->cq_fastlock_acquire(&ep->util_ep.rx_cq->cq_lock); - do { - ret = fi_ibv_poll_outstanding_cq(ep, container_of(ep->util_ep.rx_cq, - struct fi_ibv_cq, util_cq)); - } while (ret > 0); - ep->util_ep.rx_cq->cq_fastlock_release(&ep->util_ep.rx_cq->cq_lock); + cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock); + while (1) { + ret = vrb_poll_cq(cq, &wc); + if (ret <= 0) + break; - ep->util_ep.tx_cq->cq_fastlock_acquire(&ep->util_ep.tx_cq->cq_lock); - do { - ret = fi_ibv_poll_outstanding_cq(ep, container_of(ep->util_ep.tx_cq, - struct fi_ibv_cq, util_cq)); - } while (ret > 0); - ep->util_ep.tx_cq->cq_fastlock_release(&ep->util_ep.tx_cq->cq_lock); + vrb_save_wc(cq, &wc); + }; + + cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); } -/* Must call with cq->lock held */ -static inline -ssize_t fi_ibv_poll_cq_process_wc(struct fi_ibv_cq *cq, struct ibv_wc *wc) +void vrb_cleanup_cq(struct vrb_ep *ep) { - struct fi_ibv_domain *domain = container_of(cq->util_cq.domain, - struct fi_ibv_domain, - util_domain); - ssize_t ret; - - ret = domain->poll_cq(cq->cq, 1, wc); - if (ret <= 0) - return 
ret; - - return fi_ibv_process_wc_poll_new(cq, wc); + if (ep->util_ep.rx_cq) { + vrb_flush_cq(container_of(ep->util_ep.rx_cq, + struct vrb_cq, util_cq)); + } + if (ep->util_ep.tx_cq) { + vrb_flush_cq(container_of(ep->util_ep.tx_cq, + struct vrb_cq, util_cq)); + } } -static ssize_t fi_ibv_cq_read(struct fid_cq *cq_fid, void *buf, size_t count) +static ssize_t vrb_cq_read(struct fid_cq *cq_fid, void *buf, size_t count) { - struct fi_ibv_cq *cq; - struct fi_ibv_wce *wce; + struct vrb_cq *cq; + struct vrb_wc_entry *wce; struct slist_entry *entry; struct ibv_wc wc; ssize_t ret = 0, i; - cq = container_of(cq_fid, struct fi_ibv_cq, util_cq.cq_fid); + cq = container_of(cq_fid, struct vrb_cq, util_cq.cq_fid); cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock); for (i = 0; i < count; i++) { - if (!slist_empty(&cq->wcq)) { - wce = container_of(cq->wcq.head, struct fi_ibv_wce, entry); + if (!slist_empty(&cq->saved_wc_list)) { + wce = container_of(cq->saved_wc_list.head, + struct vrb_wc_entry, entry); if (wce->wc.status) { ret = -FI_EAVAIL; break; } - entry = slist_remove_head(&cq->wcq); - wce = container_of(entry, struct fi_ibv_wce, entry); - cq->read_entry(&wce->wc, (char *)buf + i * cq->entry_size); + entry = slist_remove_head(&cq->saved_wc_list); + wce = container_of(entry, struct vrb_wc_entry, entry); + cq->read_entry(&wce->wc, (char *) buf + i * cq->entry_size); ofi_buf_free(wce); continue; } - ret = fi_ibv_poll_cq_process_wc(cq, &wc); + ret = vrb_poll_cq(cq, &wc); if (ret <= 0) break; - /* Insert error entry into wcq */ - if (OFI_UNLIKELY(wc.status)) { + if (wc.status) { wce = ofi_buf_alloc(cq->wce_pool); if (!wce) { cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); @@ -341,7 +348,7 @@ static ssize_t fi_ibv_cq_read(struct fid_cq *cq_fid, void *buf, size_t count) } memset(wce, 0, sizeof(*wce)); memcpy(&wce->wc, &wc, sizeof wc); - slist_insert_tail(&wce->entry, &cq->wcq); + slist_insert_tail(&wce->entry, &cq->saved_wc_list); ret = -FI_EAVAIL; break; } @@ -350,11 +357,11 @@ static ssize_t fi_ibv_cq_read(struct fid_cq *cq_fid, void *buf, size_t count) } cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); - return i ? i : (ret ? ret : -FI_EAGAIN); + return i ? i : (ret < 0 ? 
ret : -FI_EAGAIN); } static const char * -fi_ibv_cq_strerror(struct fid_cq *eq, int prov_errno, const void *err_data, +vrb_cq_strerror(struct fid_cq *eq, int prov_errno, const void *err_data, char *buf, size_t len) { if (buf && len) @@ -362,12 +369,12 @@ fi_ibv_cq_strerror(struct fid_cq *eq, int prov_errno, const void *err_data, return ibv_wc_status_str(prov_errno); } -int fi_ibv_cq_signal(struct fid_cq *cq) +int vrb_cq_signal(struct fid_cq *cq) { - struct fi_ibv_cq *_cq; + struct vrb_cq *_cq; char data = '0'; - _cq = container_of(cq, struct fi_ibv_cq, util_cq.cq_fid); + _cq = container_of(cq, struct vrb_cq, util_cq.cq_fid); if (write(_cq->signal_fd[1], &data, 1) != 1) { VERBS_WARN(FI_LOG_CQ, "Error signalling CQ\n"); @@ -377,9 +384,9 @@ int fi_ibv_cq_signal(struct fid_cq *cq) return 0; } -int fi_ibv_cq_trywait(struct fi_ibv_cq *cq) +int vrb_cq_trywait(struct vrb_cq *cq) { - struct fi_ibv_wce *wce; + struct ibv_wc wc; void *context; int ret = -FI_EAGAIN, rc; @@ -389,22 +396,14 @@ int fi_ibv_cq_trywait(struct fi_ibv_cq *cq) } cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock); - if (!slist_empty(&cq->wcq)) - goto out; - - wce = ofi_buf_alloc(cq->wce_pool); - if (!wce) { - ret = -FI_ENOMEM; + if (!slist_empty(&cq->saved_wc_list)) goto out; - } - memset(wce, 0, sizeof(*wce)); - rc = fi_ibv_poll_cq_process_wc(cq, &wce->wc); - if (rc > 0) { - slist_insert_tail(&wce->entry, &cq->wcq); + rc = vrb_poll_cq(cq, &wc); + if (rc) { + if (rc > 0) + vrb_save_wc(cq, &wc); goto out; - } else if (rc < 0) { - goto err; } while (!ibv_get_cq_event(cq->channel, &cq->cq, &context)) @@ -414,51 +413,67 @@ int fi_ibv_cq_trywait(struct fi_ibv_cq *cq) if (rc) { VERBS_WARN(FI_LOG_CQ, "ibv_req_notify_cq error: %d\n", ret); ret = -errno; - goto err; + goto out; } /* Read again to fetch any completions that we might have missed * while rearming */ - rc = fi_ibv_poll_cq_process_wc(cq, &wce->wc); - if (rc > 0) { - slist_insert_tail(&wce->entry, &cq->wcq); + rc = vrb_poll_cq(cq, &wc); + if (rc) { + if (rc > 0) + vrb_save_wc(cq, &wc); goto out; - } else if (rc < 0) { - goto err; } ret = FI_SUCCESS; -err: - ofi_buf_free(wce); out: cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); return ret; } -static struct fi_ops_cq fi_ibv_cq_ops = { +static struct fi_ops_cq vrb_cq_ops = { .size = sizeof(struct fi_ops_cq), - .read = fi_ibv_cq_read, + .read = vrb_cq_read, .readfrom = fi_no_cq_readfrom, - .readerr = fi_ibv_cq_readerr, - .sread = fi_ibv_cq_sread, + .readerr = vrb_cq_readerr, + .sread = vrb_cq_sread, .sreadfrom = fi_no_cq_sreadfrom, - .signal = fi_ibv_cq_signal, - .strerror = fi_ibv_cq_strerror + .signal = vrb_cq_signal, + .strerror = vrb_cq_strerror }; -static int fi_ibv_cq_control(fid_t fid, int command, void *arg) +static int vrb_cq_control(fid_t fid, int command, void *arg) { - struct fi_ibv_cq *cq; - int ret = 0; + struct fi_wait_pollfd *pollfd; + struct vrb_cq *cq; + int ret; - cq = container_of(fid, struct fi_ibv_cq, util_cq.cq_fid); + cq = container_of(fid, struct vrb_cq, util_cq.cq_fid); switch(command) { case FI_GETWAIT: if (!cq->channel) { ret = -FI_ENODATA; break; } - *(int *) arg = cq->channel->fd; + + if (cq->wait_obj == FI_WAIT_FD) { + *(int *) arg = cq->channel->fd; + return 0; + } + + pollfd = arg; + if (pollfd->nfds >= 1) { + pollfd->fd[0].fd = cq->channel->fd; + pollfd->fd[0].events = POLLIN; + ret = 0; + } else { + ret = -FI_ETOOSMALL; + } + pollfd->nfds = 1; + break; + case FI_GETWAITOBJ: + *(enum fi_wait_obj *) arg = cq->wait_obj; + ret = 0; break; default: ret = -FI_ENOSYS; @@ -468,14 +483,14 @@ 
static int fi_ibv_cq_control(fid_t fid, int command, void *arg) return ret; } -static int fi_ibv_cq_close(fid_t fid) +static int vrb_cq_close(fid_t fid) { - struct fi_ibv_wce *wce; + struct vrb_wc_entry *wce; struct slist_entry *entry; int ret; - struct fi_ibv_cq *cq = - container_of(fid, struct fi_ibv_cq, util_cq.cq_fid); - struct fi_ibv_srq_ep *srq_ep; + struct vrb_cq *cq = + container_of(fid, struct vrb_cq, util_cq.cq_fid); + struct vrb_srq_ep *srq_ep; struct dlist_entry *srq_ep_temp; if (ofi_atomic_get32(&cq->nevents)) @@ -485,9 +500,9 @@ static int fi_ibv_cq_close(fid_t fid) * and the XRC SRQ references the RX CQ, we must destroy any * XRC SRQ using this CQ before destroying the CQ. */ fastlock_acquire(&cq->xrc.srq_list_lock); - dlist_foreach_container_safe(&cq->xrc.srq_list, struct fi_ibv_srq_ep, + dlist_foreach_container_safe(&cq->xrc.srq_list, struct vrb_srq_ep, srq_ep, xrc.srq_entry, srq_ep_temp) { - ret = fi_ibv_xrc_close_srq(srq_ep); + ret = vrb_xrc_close_srq(srq_ep); if (ret) { fastlock_release(&cq->xrc.srq_list_lock); return -ret; @@ -496,14 +511,15 @@ static int fi_ibv_cq_close(fid_t fid) fastlock_release(&cq->xrc.srq_list_lock); cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock); - while (!slist_empty(&cq->wcq)) { - entry = slist_remove_head(&cq->wcq); - wce = container_of(entry, struct fi_ibv_wce, entry); + while (!slist_empty(&cq->saved_wc_list)) { + entry = slist_remove_head(&cq->saved_wc_list); + wce = container_of(entry, struct vrb_wc_entry, entry); ofi_buf_free(wce); } cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); ofi_bufpool_destroy(cq->wce_pool); + ofi_bufpool_destroy(cq->ctx_pool); if (cq->cq) { ret = ibv_destroy_cq(cq->cq); @@ -528,26 +544,26 @@ static int fi_ibv_cq_close(fid_t fid) return 0; } -static struct fi_ops fi_ibv_cq_fi_ops = { +static struct fi_ops vrb_cq_fi_ops = { .size = sizeof(struct fi_ops), - .close = fi_ibv_cq_close, + .close = vrb_cq_close, .bind = fi_no_bind, - .control = fi_ibv_cq_control, + .control = vrb_cq_control, .ops_open = fi_no_ops_open, }; -static void fi_ibv_util_cq_progress_noop(struct util_cq *cq) +static void vrb_util_cq_progress_noop(struct util_cq *cq) { /* This routine shouldn't be called */ assert(0); } -int fi_ibv_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, +int vrb_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, struct fid_cq **cq_fid, void *context) { - struct fi_ibv_cq *cq; - struct fi_ibv_domain *domain = - container_of(domain_fid, struct fi_ibv_domain, + struct vrb_cq *cq; + struct vrb_domain *domain = + container_of(domain_fid, struct vrb_domain, util_domain.domain_fid); size_t size; int ret; @@ -559,14 +575,26 @@ int fi_ibv_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, /* verbs uses its own implementation of wait objects for CQ */ tmp_attr.wait_obj = FI_WAIT_NONE; - ret = ofi_cq_init(&fi_ibv_prov, domain_fid, &tmp_attr, &cq->util_cq, - fi_ibv_util_cq_progress_noop, context); + ret = ofi_cq_init(&vrb_prov, domain_fid, &tmp_attr, &cq->util_cq, + vrb_util_cq_progress_noop, context); if (ret) goto err1; switch (attr->wait_obj) { case FI_WAIT_UNSPEC: + cq->wait_obj = FI_WAIT_FD; + break; case FI_WAIT_FD: + case FI_WAIT_POLLFD: + case FI_WAIT_NONE: + cq->wait_obj = attr->wait_obj; + break; + default: + ret = -FI_ENOSYS; + goto err4; + } + + if (cq->wait_obj != FI_WAIT_NONE) { cq->channel = ibv_create_comp_channel(domain->verbs); if (!cq->channel) { ret = -errno; @@ -587,13 +615,6 @@ int fi_ibv_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, ret = 
fi_fd_nonblock(cq->signal_fd[0]); if (ret) goto err4; - - break; - case FI_WAIT_NONE: - break; - default: - ret = -FI_ENOSYS; - goto err4; } size = attr->size ? attr->size : VERBS_DEF_CQ_SIZE; @@ -624,7 +645,7 @@ int fi_ibv_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, } } - ret = ofi_bufpool_create(&cq->wce_pool, sizeof(struct fi_ibv_wce), + ret = ofi_bufpool_create(&cq->wce_pool, sizeof(struct vrb_wc_entry), 16, 0, VERBS_WCE_CNT, 0); if (ret) { VERBS_WARN(FI_LOG_CQ, "Failed to create wce_pool\n"); @@ -634,21 +655,21 @@ int fi_ibv_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, cq->flags |= attr->flags; cq->wait_cond = attr->wait_cond; /* verbs uses its own ops for CQ */ - cq->util_cq.cq_fid.fid.ops = &fi_ibv_cq_fi_ops; - cq->util_cq.cq_fid.ops = &fi_ibv_cq_ops; + cq->util_cq.cq_fid.fid.ops = &vrb_cq_fi_ops; + cq->util_cq.cq_fid.ops = &vrb_cq_ops; switch (attr->format) { case FI_CQ_FORMAT_UNSPEC: case FI_CQ_FORMAT_CONTEXT: - cq->read_entry = fi_ibv_cq_read_context_entry; + cq->read_entry = vrb_cq_read_context_entry; cq->entry_size = sizeof(struct fi_cq_entry); break; case FI_CQ_FORMAT_MSG: - cq->read_entry = fi_ibv_cq_read_msg_entry; + cq->read_entry = vrb_cq_read_msg_entry; cq->entry_size = sizeof(struct fi_cq_msg_entry); break; case FI_CQ_FORMAT_DATA: - cq->read_entry = fi_ibv_cq_read_data_entry; + cq->read_entry = vrb_cq_read_data_entry; cq->entry_size = sizeof(struct fi_cq_data_entry); break; case FI_CQ_FORMAT_TAGGED: @@ -657,14 +678,18 @@ int fi_ibv_cq_open(struct fid_domain *domain_fid, struct fi_cq_attr *attr, goto err6; } - slist_init(&cq->wcq); + ret = ofi_bufpool_create(&cq->ctx_pool, sizeof(struct fi_context), + 16, 0, size, OFI_BUFPOOL_NO_TRACK); + if (ret) + goto err6; + + slist_init(&cq->saved_wc_list); dlist_init(&cq->xrc.srq_list); fastlock_init(&cq->xrc.srq_list_lock); ofi_atomic_initialize32(&cq->nevents, 0); - assert(size < INT32_MAX); - ofi_atomic_initialize32(&cq->credits, size); + cq->credits = size; *cq_fid = &cq->util_cq.cq_fid; return 0; diff --git a/prov/verbs/src/verbs_dgram_av.c b/prov/verbs/src/verbs_dgram_av.c index 10523257896..ce0f710f33d 100644 --- a/prov/verbs/src/verbs_dgram_av.c +++ b/prov/verbs/src/verbs_dgram_av.c @@ -32,7 +32,7 @@ #include "fi_verbs.h" -static inline int fi_ibv_dgram_av_is_addr_valid(struct fi_ibv_dgram_av *av, +static inline int vrb_dgram_av_is_addr_valid(struct vrb_dgram_av *av, const void *addr) { const struct ofi_ib_ud_ep_name *check_name = addr; @@ -40,7 +40,7 @@ static inline int fi_ibv_dgram_av_is_addr_valid(struct fi_ibv_dgram_av *av, } static inline int -fi_ibv_dgram_verify_av_flags(struct util_av *av, uint64_t flags) +vrb_dgram_verify_av_flags(struct util_av *av, uint64_t flags) { if ((av->flags & FI_EVENT) && !av->eq) { VERBS_WARN(FI_LOG_AV, "No EQ bound to AV\n"); @@ -56,13 +56,13 @@ fi_ibv_dgram_verify_av_flags(struct util_av *av, uint64_t flags) } static int -fi_ibv_dgram_av_insert_addr(struct fi_ibv_dgram_av *av, const void *addr, +vrb_dgram_av_insert_addr(struct vrb_dgram_av *av, const void *addr, fi_addr_t *fi_addr, void *context) { int ret; - struct fi_ibv_dgram_av_entry *av_entry; - struct fi_ibv_domain *domain = - container_of(av->util_av.domain, struct fi_ibv_domain, util_domain); + struct vrb_dgram_av_entry *av_entry; + struct vrb_domain *domain = + container_of(av->util_av.domain, struct vrb_domain, util_domain); struct ibv_ah_attr ah_attr = { .is_global = 0, @@ -76,8 +76,8 @@ fi_ibv_dgram_av_insert_addr(struct fi_ibv_dgram_av *av, const void *addr, ah_attr.is_global = 1; 
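The surrounding hunk builds an ibv_ah_attr from the caller-supplied ofi_ib_ud_ep_name (LID, plus a GID and GRH fields when the destination is routed) and turns it into a verbs address handle for the datagram AV. The following is a minimal standalone sketch of that pattern, not the provider's code: the helper name, the peer_lid/peer_gid parameters, and the explicit gid_index argument (which the provider instead reads from vrb_gl_data.gid_idx) are illustrative assumptions.

/* Illustrative only: create a UD address handle from a LID/GID pair,
 * mirroring the ah_attr setup done in the AV insert path above. */
#include <infiniband/verbs.h>
#include <stdint.h>
#include <string.h>

static struct ibv_ah *create_ud_ah(struct ibv_pd *pd, uint8_t port_num,
				   uint16_t peer_lid,
				   const union ibv_gid *peer_gid,
				   uint8_t gid_index)
{
	struct ibv_ah_attr ah_attr;

	memset(&ah_attr, 0, sizeof(ah_attr));
	ah_attr.dlid = peer_lid;
	ah_attr.port_num = port_num;

	if (peer_gid) {
		/* Routed destinations (RoCE, cross-subnet IB) need a GRH. */
		ah_attr.is_global = 1;
		ah_attr.grh.hop_limit = 64;
		ah_attr.grh.dgid = *peer_gid;
		ah_attr.grh.sgid_index = gid_index;
	}

	return ibv_create_ah(pd, &ah_attr);	/* NULL on failure, errno set */
}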
ah_attr.grh.hop_limit = 64; ah_attr.grh.dgid = ((struct ofi_ib_ud_ep_name *)addr)->gid; - ah_attr.grh.sgid_index = fi_ibv_gl_data.gid_idx; - } else if (OFI_UNLIKELY(!fi_ibv_dgram_av_is_addr_valid(av, addr))) { + ah_attr.grh.sgid_index = vrb_gl_data.gid_idx; + } else if (OFI_UNLIKELY(!vrb_dgram_av_is_addr_valid(av, addr))) { ret = -FI_EADDRNOTAVAIL; VERBS_WARN(FI_LOG_AV, "Invalid address\n"); goto fn1; @@ -111,22 +111,22 @@ fi_ibv_dgram_av_insert_addr(struct fi_ibv_dgram_av *av, const void *addr, return ret; } -static int fi_ibv_dgram_av_insert(struct fid_av *av_fid, const void *addr, +static int vrb_dgram_av_insert(struct fid_av *av_fid, const void *addr, size_t count, fi_addr_t *fi_addr, uint64_t flags, void *context) { int ret, success_cnt = 0; size_t i; - struct fi_ibv_dgram_av *av = - container_of(av_fid, struct fi_ibv_dgram_av, util_av.av_fid); + struct vrb_dgram_av *av = + container_of(av_fid, struct vrb_dgram_av, util_av.av_fid); - ret = fi_ibv_dgram_verify_av_flags(&av->util_av, flags); + ret = vrb_dgram_verify_av_flags(&av->util_av, flags); if (ret) return ret; VERBS_DBG(FI_LOG_AV, "Inserting %"PRIu64" addresses\n", count); for (i = 0; i < count; i++) { - ret = fi_ibv_dgram_av_insert_addr( + ret = vrb_dgram_av_insert_addr( av, (struct ofi_ib_ud_ep_name *)addr + i, fi_addr ? &fi_addr[i] : NULL, context); if (!ret) @@ -139,7 +139,7 @@ static int fi_ibv_dgram_av_insert(struct fid_av *av_fid, const void *addr, } static inline void -fi_ibv_dgram_av_remove_addr(struct fi_ibv_dgram_av_entry *av_entry) +vrb_dgram_av_remove_addr(struct vrb_dgram_av_entry *av_entry) { int ret = ibv_destroy_ah(av_entry->ah); if (ret) @@ -150,32 +150,32 @@ fi_ibv_dgram_av_remove_addr(struct fi_ibv_dgram_av_entry *av_entry) free(av_entry); } -static int fi_ibv_dgram_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, +static int vrb_dgram_av_remove(struct fid_av *av_fid, fi_addr_t *fi_addr, size_t count, uint64_t flags) { int i, ret; - struct fi_ibv_dgram_av *av = - container_of(av_fid, struct fi_ibv_dgram_av, util_av.av_fid); + struct vrb_dgram_av *av = + container_of(av_fid, struct vrb_dgram_av, util_av.av_fid); - ret = fi_ibv_dgram_verify_av_flags(&av->util_av, flags); + ret = vrb_dgram_verify_av_flags(&av->util_av, flags); if (ret) return ret; for (i = count - 1; i >= 0; i--) { - struct fi_ibv_dgram_av_entry *av_entry = - (struct fi_ibv_dgram_av_entry *) (uintptr_t) fi_addr[i]; - fi_ibv_dgram_av_remove_addr(av_entry); + struct vrb_dgram_av_entry *av_entry = + (struct vrb_dgram_av_entry *) (uintptr_t) fi_addr[i]; + vrb_dgram_av_remove_addr(av_entry); } return FI_SUCCESS; } static inline -int fi_ibv_dgram_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr, +int vrb_dgram_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr, void *addr, size_t *addrlen) { - struct fi_ibv_dgram_av_entry *av_entry; + struct vrb_dgram_av_entry *av_entry; - av_entry = fi_ibv_dgram_av_lookup_av_entry(fi_addr); + av_entry = vrb_dgram_av_lookup_av_entry(fi_addr); if (!av_entry) return -FI_ENOENT; @@ -185,56 +185,56 @@ int fi_ibv_dgram_av_lookup(struct fid_av *av_fid, fi_addr_t fi_addr, } static inline const char * -fi_ibv_dgram_av_straddr(struct fid_av *av, const void *addr, char *buf, size_t *len) +vrb_dgram_av_straddr(struct fid_av *av, const void *addr, char *buf, size_t *len) { return ofi_straddr(buf, len, FI_ADDR_IB_UD, addr); } -static int fi_ibv_dgram_av_close(struct fid *av_fid) +static int vrb_dgram_av_close(struct fid *av_fid) { - struct fi_ibv_dgram_av_entry *av_entry; - struct fi_ibv_dgram_av *av = - container_of(av_fid, 
struct fi_ibv_dgram_av, util_av.av_fid.fid); + struct vrb_dgram_av_entry *av_entry; + struct vrb_dgram_av *av = + container_of(av_fid, struct vrb_dgram_av, util_av.av_fid.fid); int ret = ofi_av_close_lightweight(&av->util_av); if (ret) return ret; while (!dlist_empty(&av->av_entry_list)) { av_entry = container_of(av->av_entry_list.next, - struct fi_ibv_dgram_av_entry, + struct vrb_dgram_av_entry, list_entry); - fi_ibv_dgram_av_remove_addr(av_entry); + vrb_dgram_av_remove_addr(av_entry); } free(av); return FI_SUCCESS; } -static struct fi_ops fi_ibv_dgram_fi_ops = { - .size = sizeof(fi_ibv_dgram_fi_ops), - .close = fi_ibv_dgram_av_close, +static struct fi_ops vrb_dgram_fi_ops = { + .size = sizeof(vrb_dgram_fi_ops), + .close = vrb_dgram_av_close, .bind = ofi_av_bind, .control = fi_no_control, .ops_open = fi_no_ops_open, }; -static struct fi_ops_av fi_ibv_dgram_av_ops = { - .size = sizeof(fi_ibv_dgram_av_ops), - .insert = fi_ibv_dgram_av_insert, +static struct fi_ops_av vrb_dgram_av_ops = { + .size = sizeof(vrb_dgram_av_ops), + .insert = vrb_dgram_av_insert, .insertsvc = fi_no_av_insertsvc, .insertsym = fi_no_av_insertsym, - .remove = fi_ibv_dgram_av_remove, - .lookup = fi_ibv_dgram_av_lookup, - .straddr = fi_ibv_dgram_av_straddr, + .remove = vrb_dgram_av_remove, + .lookup = vrb_dgram_av_lookup, + .straddr = vrb_dgram_av_straddr, }; -int fi_ibv_dgram_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, +int vrb_dgram_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, struct fid_av **av_fid, void *context) { - struct fi_ibv_domain *domain = - container_of(domain_fid, struct fi_ibv_domain, + struct vrb_domain *domain = + container_of(domain_fid, struct vrb_domain, util_domain.domain_fid); - struct fi_ibv_dgram_av *av; + struct vrb_dgram_av *av; int ret; av = calloc(1, sizeof(*av)); @@ -250,8 +250,8 @@ int fi_ibv_dgram_av_open(struct fid_domain *domain_fid, struct fi_av_attr *attr, goto err_av_init; dlist_init(&av->av_entry_list); - av->util_av.av_fid.fid.ops = &fi_ibv_dgram_fi_ops; - av->util_av.av_fid.ops = &fi_ibv_dgram_av_ops; + av->util_av.av_fid.fid.ops = &vrb_dgram_fi_ops; + av->util_av.av_fid.ops = &vrb_dgram_av_ops; *av_fid = &av->util_av.av_fid; return FI_SUCCESS; diff --git a/prov/verbs/src/verbs_dgram_ep_msg.c b/prov/verbs/src/verbs_dgram_ep_msg.c index 36ad816e57b..9a9cb62b37e 100644 --- a/prov/verbs/src/verbs_dgram_ep_msg.c +++ b/prov/verbs/src/verbs_dgram_ep_msg.c @@ -33,11 +33,11 @@ #include "fi_verbs.h" static inline int -fi_ibv_dgram_ep_set_addr(struct fi_ibv_ep *ep, fi_addr_t addr, +vrb_dgram_ep_set_addr(struct vrb_ep *ep, fi_addr_t addr, struct ibv_send_wr *wr) { - struct fi_ibv_dgram_av_entry *av_entry = - fi_ibv_dgram_av_lookup_av_entry(addr); + struct vrb_dgram_av_entry *av_entry = + vrb_dgram_av_lookup_av_entry(addr); if (OFI_UNLIKELY(!av_entry)) return -FI_ENOENT; wr->wr.ud.ah = av_entry->ah; @@ -48,27 +48,23 @@ fi_ibv_dgram_ep_set_addr(struct fi_ibv_ep *ep, fi_addr_t addr, } static inline ssize_t -fi_ibv_dgram_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, +vrb_dgram_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_recv_wr wr = { .wr_id = (uintptr_t)msg->context, .num_sge = msg->iov_count, .next = NULL, }; - struct ibv_recv_wr *bad_wr; - assert(ep->util_ep.rx_cq); - - fi_ibv_set_sge_iov(wr.sg_list, msg->msg_iov, msg->iov_count, 
msg->desc); - - return fi_ibv_handle_post(ibv_post_recv(ep->ibv_qp, &wr, &bad_wr)); + vrb_iov_dupa(wr.sg_list, msg->msg_iov, msg->desc, msg->iov_count); + return vrb_post_recv(ep, &wr); } static inline ssize_t -fi_ibv_dgram_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, +vrb_dgram_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, void *context) { struct fi_msg msg = { @@ -79,11 +75,11 @@ fi_ibv_dgram_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **des .context = context, }; - return fi_ibv_dgram_ep_recvmsg(ep_fid, &msg, 0); + return vrb_dgram_ep_recvmsg(ep_fid, &msg, 0); } static inline ssize_t -fi_ibv_dgram_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, +vrb_dgram_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context) { struct iovec iov = { @@ -91,16 +87,16 @@ fi_ibv_dgram_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, .iov_len = len, }; - return fi_ibv_dgram_ep_recvv(ep_fid, &iov, &desc, + return vrb_dgram_ep_recvv(ep_fid, &iov, &desc, 1, src_addr, context); } static ssize_t -fi_ibv_dgram_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, +vrb_dgram_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = (uintptr_t)msg->context, }; @@ -112,74 +108,76 @@ fi_ibv_dgram_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, wr.opcode = IBV_WR_SEND; } - if (fi_ibv_dgram_ep_set_addr(ep, msg->addr, &wr)) + if (vrb_dgram_ep_set_addr(ep, msg->addr, &wr)) return -FI_ENOENT; - return fi_ibv_send_msg(ep, &wr, msg, flags); + return vrb_send_iov(ep, &wr, msg->msg_iov, msg->desc, + msg->iov_count, flags); } static inline ssize_t -fi_ibv_dgram_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, +vrb_dgram_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, void *context) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = (uintptr_t)context, .opcode = IBV_WR_SEND, }; - if (fi_ibv_dgram_ep_set_addr(ep, dest_addr, &wr)) + if (vrb_dgram_ep_set_addr(ep, dest_addr, &wr)) return -FI_ENOENT; - return fi_ibv_send_iov(ep, &wr, iov, desc, count); + return vrb_send_iov(ep, &wr, iov, desc, count, + ep->util_ep.tx_op_flags); } static ssize_t -fi_ibv_dgram_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_dgram_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, void *context) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(ep, (uintptr_t)context), .opcode = IBV_WR_SEND, - .send_flags = VERBS_INJECT(ep, len), + .send_flags = VERBS_INJECT(ep, len, desc), }; - if (fi_ibv_dgram_ep_set_addr(ep, dest_addr, &wr)) + if (vrb_dgram_ep_set_addr(ep, dest_addr, &wr)) return -FI_ENOENT; - return fi_ibv_send_buf(ep, &wr, buf, len, desc); + return vrb_send_buf(ep, &wr, buf, len, desc); } static inline ssize_t -fi_ibv_dgram_ep_senddata(struct fid_ep *ep_fid, const void *buf, +vrb_dgram_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t 
len, void *desc, uint64_t data, fi_addr_t dest_addr, void *context) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(ep, (uintptr_t)context), .opcode = IBV_WR_SEND_WITH_IMM, .imm_data = htonl((uint32_t)data), - .send_flags = VERBS_INJECT(ep, len), + .send_flags = VERBS_INJECT(ep, len, desc), }; - if (fi_ibv_dgram_ep_set_addr(ep, dest_addr, &wr)) + if (vrb_dgram_ep_set_addr(ep, dest_addr, &wr)) return -FI_ENOENT; - return fi_ibv_send_buf(ep, &wr, buf, len, desc); + return vrb_send_buf(ep, &wr, buf, len, desc); } static ssize_t -fi_ibv_dgram_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_dgram_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_NO_COMP_FLAG, .opcode = IBV_WR_SEND_WITH_IMM, @@ -187,19 +185,19 @@ fi_ibv_dgram_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len, .send_flags = IBV_SEND_INLINE, }; - if (fi_ibv_dgram_ep_set_addr(ep, dest_addr, &wr)) + if (vrb_dgram_ep_set_addr(ep, dest_addr, &wr)) return -FI_ENOENT; - return fi_ibv_send_buf_inline(ep, &wr, buf, len); + return vrb_send_buf(ep, &wr, buf, len, NULL); } static ssize_t -fi_ibv_dgram_ep_injectdata_fast(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_dgram_ep_injectdata_fast(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr) { ssize_t ret; - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); ep->wrs->msg_wr.imm_data = htonl((uint32_t)data); ep->wrs->msg_wr.opcode = IBV_WR_SEND_WITH_IMM; @@ -207,70 +205,70 @@ fi_ibv_dgram_ep_injectdata_fast(struct fid_ep *ep_fid, const void *buf, size_t l ep->wrs->sge.addr = (uintptr_t) buf; ep->wrs->sge.length = (uint32_t) len; - if (fi_ibv_dgram_ep_set_addr(ep, dest_addr, &ep->wrs->msg_wr)) + if (vrb_dgram_ep_set_addr(ep, dest_addr, &ep->wrs->msg_wr)) return -FI_ENOENT; - ret = fi_ibv_send_poll_cq_if_needed(ep, &ep->wrs->msg_wr); + ret = vrb_post_send(ep, &ep->wrs->msg_wr, 0); ep->wrs->msg_wr.opcode = IBV_WR_SEND; return ret; } static ssize_t -fi_ibv_dgram_ep_inject(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_dgram_ep_inject(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_NO_COMP_FLAG, .opcode = IBV_WR_SEND, .send_flags = IBV_SEND_INLINE, }; - if (fi_ibv_dgram_ep_set_addr(ep, dest_addr, &wr)) + if (vrb_dgram_ep_set_addr(ep, dest_addr, &wr)) return -FI_ENOENT; - return fi_ibv_send_buf_inline(ep, &wr, buf, len); + return vrb_send_buf(ep, &wr, buf, len, NULL); } static ssize_t -fi_ibv_dgram_ep_inject_fast(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_dgram_ep_inject_fast(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); ep->wrs->sge.addr = (uintptr_t) buf; 
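The fast-inject paths above keep a preformatted ibv_send_wr/ibv_sge pair on the endpoint (ep->wrs) and only patch the buffer address, length, and address handle before posting, relying on IBV_SEND_INLINE so the user buffer may be reused as soon as ibv_post_send returns. A reduced sketch of that idea follows; the inject_tmpl struct and function names are assumptions for illustration, not the provider's types.

/* Illustrative only: reuse a prebuilt inline send WR for small UD injects.
 * 'tmpl' would be initialized once, e.g. at endpoint creation time. */
#include <infiniband/verbs.h>
#include <stdint.h>
#include <string.h>

struct inject_tmpl {
	struct ibv_send_wr wr;
	struct ibv_sge sge;
};

static void inject_tmpl_init(struct inject_tmpl *tmpl, uint32_t remote_qpn,
			     uint32_t remote_qkey)
{
	memset(tmpl, 0, sizeof(*tmpl));
	tmpl->wr.opcode = IBV_WR_SEND;
	tmpl->wr.send_flags = IBV_SEND_INLINE;	/* data copied into the WQE */
	tmpl->wr.sg_list = &tmpl->sge;
	tmpl->wr.num_sge = 1;
	tmpl->wr.wr.ud.remote_qpn = remote_qpn;
	tmpl->wr.wr.ud.remote_qkey = remote_qkey;
}

static int inject_fast(struct ibv_qp *qp, struct inject_tmpl *tmpl,
		       struct ibv_ah *ah, const void *buf, size_t len)
{
	struct ibv_send_wr *bad_wr;

	/* Only the per-call fields are touched; the rest is reused. */
	tmpl->wr.wr.ud.ah = ah;
	tmpl->sge.addr = (uintptr_t) buf;
	tmpl->sge.length = (uint32_t) len;

	return ibv_post_send(qp, &tmpl->wr, &bad_wr);	/* 0 on success */
}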
ep->wrs->sge.length = (uint32_t) len; - if (fi_ibv_dgram_ep_set_addr(ep, dest_addr, &ep->wrs->msg_wr)) + if (vrb_dgram_ep_set_addr(ep, dest_addr, &ep->wrs->msg_wr)) return -FI_ENOENT; - return fi_ibv_send_poll_cq_if_needed(ep, &ep->wrs->msg_wr); + return vrb_post_send(ep, &ep->wrs->msg_wr, 0); } -const struct fi_ops_msg fi_ibv_dgram_msg_ops = { - .size = sizeof(fi_ibv_dgram_msg_ops), - .recv = fi_ibv_dgram_ep_recv, - .recvv = fi_ibv_dgram_ep_recvv, - .recvmsg = fi_ibv_dgram_ep_recvmsg, - .send = fi_ibv_dgram_ep_send, - .sendv = fi_ibv_dgram_ep_sendv, - .sendmsg = fi_ibv_dgram_ep_sendmsg, - .inject = fi_ibv_dgram_ep_inject_fast, - .senddata = fi_ibv_dgram_ep_senddata, - .injectdata = fi_ibv_dgram_ep_injectdata_fast, +const struct fi_ops_msg vrb_dgram_msg_ops = { + .size = sizeof(vrb_dgram_msg_ops), + .recv = vrb_dgram_ep_recv, + .recvv = vrb_dgram_ep_recvv, + .recvmsg = vrb_dgram_ep_recvmsg, + .send = vrb_dgram_ep_send, + .sendv = vrb_dgram_ep_sendv, + .sendmsg = vrb_dgram_ep_sendmsg, + .inject = vrb_dgram_ep_inject_fast, + .senddata = vrb_dgram_ep_senddata, + .injectdata = vrb_dgram_ep_injectdata_fast, }; -const struct fi_ops_msg fi_ibv_dgram_msg_ops_ts = { - .size = sizeof(fi_ibv_dgram_msg_ops), - .recv = fi_ibv_dgram_ep_recv, - .recvv = fi_ibv_dgram_ep_recvv, - .recvmsg = fi_ibv_dgram_ep_recvmsg, - .send = fi_ibv_dgram_ep_send, - .sendv = fi_ibv_dgram_ep_sendv, - .sendmsg = fi_ibv_dgram_ep_sendmsg, - .inject = fi_ibv_dgram_ep_inject, - .senddata = fi_ibv_dgram_ep_senddata, - .injectdata = fi_ibv_dgram_ep_injectdata, +const struct fi_ops_msg vrb_dgram_msg_ops_ts = { + .size = sizeof(vrb_dgram_msg_ops), + .recv = vrb_dgram_ep_recv, + .recvv = vrb_dgram_ep_recvv, + .recvmsg = vrb_dgram_ep_recvmsg, + .send = vrb_dgram_ep_send, + .sendv = vrb_dgram_ep_sendv, + .sendmsg = vrb_dgram_ep_sendmsg, + .inject = vrb_dgram_ep_inject, + .senddata = vrb_dgram_ep_senddata, + .injectdata = vrb_dgram_ep_injectdata, }; diff --git a/prov/verbs/src/verbs_domain.c b/prov/verbs/src/verbs_domain.c index 6245bdebabc..e62d868a0f6 100644 --- a/prov/verbs/src/verbs_domain.c +++ b/prov/verbs/src/verbs_domain.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -38,19 +39,97 @@ #include -static int fi_ibv_domain_bind(struct fid *fid, struct fid *bfid, uint64_t flags) + +static void vrb_set_threshold(struct fid_ep *ep_fid, size_t threshold) +{ + struct vrb_ep *ep = container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); + ep->threshold = threshold; +} + +static void vrb_set_credit_handler(struct fid_domain *domain_fid, + ssize_t (*credit_handler)(struct fid_ep *ep, size_t credits)) +{ + struct vrb_domain *domain; + + domain = container_of(domain_fid, struct vrb_domain, + util_domain.domain_fid.fid); + domain->send_credits = credit_handler; +} + +static int vrb_enable_ep_flow_ctrl(struct fid_ep *ep_fid) +{ + struct vrb_ep *ep = container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); + // only enable if we are not using SRQ + if (!ep->srq_ep && ep->ibv_qp && ep->ibv_qp->qp_type == IBV_QPT_RC) { + ep->peer_rq_credits = 1; + return FI_SUCCESS; + } + + return -FI_ENOSYS; +} + +struct ofi_ops_flow_ctrl vrb_ops_flow_ctrl = { + .size = sizeof(struct ofi_ops_flow_ctrl), + .set_threshold = vrb_set_threshold, + .add_credits = vrb_add_credits, + .enable = vrb_enable_ep_flow_ctrl, + .set_send_handler = vrb_set_credit_handler, +}; + +static int +vrb_domain_ops_open(struct fid *fid, const char *name, uint64_t flags, + void **ops, void *context) +{ + if (flags) + return -FI_EBADFLAGS; + + if (!strcasecmp(name, OFI_OPS_FLOW_CTRL)) { + *ops = &vrb_ops_flow_ctrl; + return 0; + } + + return -FI_ENOSYS; +} + + +#if VERBS_HAVE_QUERY_EX +static int vrb_odp_flag(struct ibv_context *verbs) +{ + struct ibv_query_device_ex_input input = {0}; + struct ibv_device_attr_ex attr; + int ret; + + if (!vrb_gl_data.use_odp) + return 0; + + ret = ibv_query_device_ex(verbs, &input, &attr); + if (ret) + return 0; + + return attr.odp_caps.general_caps & IBV_ODP_SUPPORT ? + VRB_USE_ODP : 0; +} +#else +static int vrb_odp_flag(struct ibv_context *verbs) +{ + return 0; +} +#endif /* VERBS_HAVE_QUERY_EX */ + + +static int vrb_domain_bind(struct fid *fid, struct fid *bfid, uint64_t flags) { - struct fi_ibv_domain *domain; - struct fi_ibv_eq *eq; + struct vrb_domain *domain; + struct vrb_eq *eq; - domain = container_of(fid, struct fi_ibv_domain, + domain = container_of(fid, struct vrb_domain, util_domain.domain_fid.fid); switch (bfid->fclass) { case FI_CLASS_EQ: switch (domain->ep_type) { case FI_EP_MSG: - eq = container_of(bfid, struct fi_ibv_eq, eq_fid); + eq = container_of(bfid, struct vrb_eq, eq_fid); domain->eq = eq; domain->eq_flags = flags; break; @@ -70,28 +149,28 @@ static int fi_ibv_domain_bind(struct fid *fid, struct fid *bfid, uint64_t flags) return 0; } -static int fi_ibv_domain_close(fid_t fid) +static int vrb_domain_close(fid_t fid) { int ret; - struct fi_ibv_fabric *fab; - struct fi_ibv_domain *domain = - container_of(fid, struct fi_ibv_domain, + struct vrb_fabric *fab; + struct vrb_domain *domain = + container_of(fid, struct vrb_domain, util_domain.domain_fid.fid); switch (domain->ep_type) { case FI_EP_DGRAM: fab = container_of(&domain->util_domain.fabric->fabric_fid, - struct fi_ibv_fabric, + struct vrb_fabric, util_fabric.fabric_fid.fid); /* Even if it's invoked not for the first time * (e.g. 
multiple domains per fabric), it's safe */ - if (fi_ibv_gl_data.dgram.use_name_server) + if (vrb_gl_data.dgram.use_name_server) ofi_ns_stop_server(&fab->name_server); break; case FI_EP_MSG: - if (domain->use_xrc) { - ret = fi_ibv_domain_xrc_cleanup(domain); + if (domain->flags & VRB_USE_XRC) { + ret = vrb_domain_xrc_cleanup(domain); if (ret) return ret; } @@ -120,7 +199,7 @@ static int fi_ibv_domain_close(fid_t fid) return 0; } -static int fi_ibv_open_device_by_name(struct fi_ibv_domain *domain, const char *name) +static int vrb_open_device_by_name(struct vrb_domain *domain, const char *name) { struct ibv_context **dev_list; int i, ret = -FI_ENODEV; @@ -136,8 +215,8 @@ static int fi_ibv_open_device_by_name(struct fi_ibv_domain *domain, const char * const char *rdma_name = ibv_get_device_name(dev_list[i]->device); switch (domain->ep_type) { case FI_EP_MSG: - ret = domain->use_xrc ? - fi_ibv_cmp_xrc_domain_name(name, rdma_name) : + ret = domain->flags & VRB_USE_XRC ? + vrb_cmp_xrc_domain_name(name, rdma_name) : strcmp(name, rdma_name); break; case FI_EP_DGRAM: @@ -160,92 +239,63 @@ static int fi_ibv_open_device_by_name(struct fi_ibv_domain *domain, const char * return ret; } -static struct fi_ops fi_ibv_fid_ops = { +static struct fi_ops vrb_fid_ops = { .size = sizeof(struct fi_ops), - .close = fi_ibv_domain_close, - .bind = fi_ibv_domain_bind, + .close = vrb_domain_close, + .bind = vrb_domain_bind, .control = fi_no_control, - .ops_open = fi_no_ops_open, + .ops_open = vrb_domain_ops_open, }; -static struct fi_ops_domain fi_ibv_msg_domain_ops = { +static struct fi_ops_domain vrb_msg_domain_ops = { .size = sizeof(struct fi_ops_domain), .av_open = fi_no_av_open, - .cq_open = fi_ibv_cq_open, - .endpoint = fi_ibv_open_ep, + .cq_open = vrb_cq_open, + .endpoint = vrb_open_ep, .scalable_ep = fi_no_scalable_ep, .cntr_open = fi_no_cntr_open, .poll_open = fi_no_poll_open, .stx_ctx = fi_no_stx_context, - .srx_ctx = fi_ibv_srq_context, - .query_atomic = fi_ibv_query_atomic, + .srx_ctx = vrb_srq_context, + .query_atomic = vrb_query_atomic, + .query_collective = fi_no_query_collective, }; -static struct fi_ops_domain fi_ibv_dgram_domain_ops = { +static struct fi_ops_domain vrb_dgram_domain_ops = { .size = sizeof(struct fi_ops_domain), - .av_open = fi_ibv_dgram_av_open, - .cq_open = fi_ibv_cq_open, - .endpoint = fi_ibv_open_ep, + .av_open = vrb_dgram_av_open, + .cq_open = vrb_cq_open, + .endpoint = vrb_open_ep, .scalable_ep = fi_no_scalable_ep, .poll_open = fi_no_poll_open, .stx_ctx = fi_no_stx_context, .srx_ctx = fi_no_srx_context, .query_atomic = fi_no_query_atomic, + .query_collective = fi_no_query_collective, }; -static int -fi_ibv_post_send_track_credits(struct ibv_qp *qp, struct ibv_send_wr *wr, - struct ibv_send_wr **bad_wr) -{ - struct fi_ibv_cq *cq = - container_of(((struct fi_ibv_ep *)qp->qp_context)->util_ep.tx_cq, - struct fi_ibv_cq, util_cq); - int credits = (int)ofi_atomic_dec32(&cq->credits); - int ret; - - if (credits < 0) { - FI_DBG(&fi_ibv_prov, FI_LOG_EP_DATA, "CQ credits not available," - " retry later\n"); - ofi_atomic_inc32(&cq->credits); - return ENOMEM; - } - ret = ibv_post_send(qp, wr, bad_wr); - if (ret) - ofi_atomic_inc32(&cq->credits); - return ret; -} - -static int -fi_ibv_poll_cq_track_credits(struct ibv_cq *cq, int num_entries, - struct ibv_wc *wc) -{ - struct fi_ibv_cq *verbs_cq = (struct fi_ibv_cq *)cq->cq_context; - int i, ret; - - ret = ibv_poll_cq(cq, num_entries, wc); - for (i = 0; i < ret; i++) { - if (!(wc[i].opcode & IBV_WC_RECV)) - 
ofi_atomic_inc32(&verbs_cq->credits); - } - return ret; -} - static int -fi_ibv_domain(struct fid_fabric *fabric, struct fi_info *info, +vrb_domain(struct fid_fabric *fabric, struct fi_info *info, struct fid_domain **domain, void *context) { - struct fi_ibv_domain *_domain; + struct ofi_mem_monitor *memory_monitors[OFI_HMEM_MAX] = { + [FI_HMEM_SYSTEM] = default_monitor, + [FI_HMEM_CUDA] = default_cuda_monitor, + [FI_HMEM_ROCR] = default_rocr_monitor, + }; + enum fi_hmem_iface iface; + struct vrb_domain *_domain; int ret; - struct fi_ibv_fabric *fab = - container_of(fabric, struct fi_ibv_fabric, + struct vrb_fabric *fab = + container_of(fabric, struct vrb_fabric, util_fabric.fabric_fid); - const struct fi_info *fi = fi_ibv_get_verbs_info(fi_ibv_util_prov.info, + const struct fi_info *fi = vrb_get_verbs_info(vrb_util_prov.info, info->domain_attr->name); if (!fi) return -FI_EINVAL; - ret = ofi_check_domain_attr(&fi_ibv_prov, fabric->api_version, + ret = ofi_check_domain_attr(&vrb_prov, fabric->api_version, fi->domain_attr, info); if (ret) return ret; @@ -262,10 +312,10 @@ fi_ibv_domain(struct fid_fabric *fabric, struct fi_info *info, if (!_domain->info) goto err2; - _domain->ep_type = FI_IBV_EP_TYPE(info); - _domain->use_xrc = fi_ibv_is_xrc(info); + _domain->ep_type = VRB_EP_TYPE(info); + _domain->flags |= vrb_is_xrc_info(info) ? VRB_USE_XRC : 0; - ret = fi_ibv_open_device_by_name(_domain, info->domain_attr->name); + ret = vrb_open_device_by_name(_domain, info->domain_attr->name); if (ret) goto err3; @@ -275,46 +325,56 @@ fi_ibv_domain(struct fid_fabric *fabric, struct fi_info *info, goto err3; } + _domain->flags |= vrb_odp_flag(_domain->verbs); _domain->util_domain.domain_fid.fid.fclass = FI_CLASS_DOMAIN; _domain->util_domain.domain_fid.fid.context = context; - _domain->util_domain.domain_fid.fid.ops = &fi_ibv_fid_ops; + _domain->util_domain.domain_fid.fid.ops = &vrb_fid_ops; + _domain->util_domain.domain_fid.mr = &vrb_mr_ops; - _domain->cache.entry_data_size = sizeof(struct fi_ibv_mem_desc); - _domain->cache.add_region = fi_ibv_mr_cache_add_region; - _domain->cache.delete_region = fi_ibv_mr_cache_delete_region; - ret = ofi_mr_cache_init(&_domain->util_domain, uffd_monitor, + _domain->cache.entry_data_size = sizeof(struct vrb_mem_desc); + _domain->cache.add_region = vrb_mr_cache_add_region; + _domain->cache.delete_region = vrb_mr_cache_delete_region; + ret = ofi_mr_cache_init(&_domain->util_domain, memory_monitors, &_domain->cache); - if (!ret) - _domain->util_domain.domain_fid.mr = &fi_ibv_mr_cache_ops; - else - _domain->util_domain.domain_fid.mr = &fi_ibv_mr_ops; + if (ret) { + VERBS_INFO(FI_LOG_MR, + "MR cache init failed: %s. MR caching disabled.\n", + fi_strerror(-ret)); + } else { + for (iface = 0; iface < OFI_HMEM_MAX; iface++) { + if (_domain->cache.monitors[iface]) + VERBS_INFO(FI_LOG_MR, + "MR cache enabled for %s memory\n", + fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); + } + } switch (_domain->ep_type) { case FI_EP_DGRAM: - if (fi_ibv_gl_data.dgram.use_name_server) { + if (vrb_gl_data.dgram.use_name_server) { /* Even if it's invoked not for the first time * (e.g. 
multiple domains per fabric), it's safe */ fab->name_server.port = - fi_ibv_gl_data.dgram.name_server_port; + vrb_gl_data.dgram.name_server_port; fab->name_server.name_len = sizeof(struct ofi_ib_ud_ep_name); fab->name_server.service_len = sizeof(int); - fab->name_server.service_cmp = fi_ibv_dgram_ns_service_cmp; + fab->name_server.service_cmp = vrb_dgram_ns_service_cmp; fab->name_server.is_service_wildcard = - fi_ibv_dgram_ns_is_service_wildcard; + vrb_dgram_ns_is_service_wildcard; ofi_ns_init(&fab->name_server); ofi_ns_start_server(&fab->name_server); } - _domain->util_domain.domain_fid.ops = &fi_ibv_dgram_domain_ops; + _domain->util_domain.domain_fid.ops = &vrb_dgram_domain_ops; break; case FI_EP_MSG: - if (_domain->use_xrc) { - ret = fi_ibv_domain_xrc_init(_domain); + if (_domain->flags & VRB_USE_XRC) { + ret = vrb_domain_xrc_init(_domain); if (ret) goto err4; } - _domain->util_domain.domain_fid.ops = &fi_ibv_msg_domain_ops; + _domain->util_domain.domain_fid.ops = &vrb_msg_domain_ops; break; default: VERBS_INFO(FI_LOG_DOMAIN, "Ivalid EP type is provided, " @@ -323,15 +383,6 @@ fi_ibv_domain(struct fid_fabric *fabric, struct fi_info *info, goto err4; } - if (!strncmp(info->domain_attr->name, "hfi1", strlen("hfi1")) || - !strncmp(info->domain_attr->name, "qib", strlen("qib"))) { - _domain->post_send = fi_ibv_post_send_track_credits; - _domain->poll_cq = fi_ibv_poll_cq_track_credits; - } else { - _domain->post_send = ibv_post_send; - _domain->poll_cq = ibv_poll_cq; - } - *domain = &_domain->util_domain.domain_fid; return FI_SUCCESS; err4: @@ -350,23 +401,23 @@ fi_ibv_domain(struct fid_fabric *fabric, struct fi_info *info, return ret; } -static int fi_ibv_trywait(struct fid_fabric *fabric, struct fid **fids, int count) +static int vrb_trywait(struct fid_fabric *fabric, struct fid **fids, int count) { - struct fi_ibv_cq *cq; - struct fi_ibv_eq *eq; + struct vrb_cq *cq; + struct vrb_eq *eq; int ret, i; for (i = 0; i < count; i++) { switch (fids[i]->fclass) { case FI_CLASS_CQ: - cq = container_of(fids[i], struct fi_ibv_cq, util_cq.cq_fid.fid); - ret = fi_ibv_cq_trywait(cq); + cq = container_of(fids[i], struct vrb_cq, util_cq.cq_fid.fid); + ret = vrb_cq_trywait(cq); if (ret) return ret; break; case FI_CLASS_EQ: - eq = container_of(fids[i], struct fi_ibv_eq, eq_fid.fid); - ret = fi_ibv_eq_trywait(eq); + eq = container_of(fids[i], struct vrb_eq, eq_fid.fid); + ret = vrb_eq_trywait(eq); if (ret) return ret; break; @@ -381,12 +432,12 @@ static int fi_ibv_trywait(struct fid_fabric *fabric, struct fid **fids, int coun return FI_SUCCESS; } -static int fi_ibv_fabric_close(fid_t fid) +static int vrb_fabric_close(fid_t fid) { - struct fi_ibv_fabric *fab; + struct vrb_fabric *fab; int ret; - fab = container_of(fid, struct fi_ibv_fabric, util_fabric.fabric_fid.fid); + fab = container_of(fid, struct vrb_fabric, util_fabric.fabric_fid.fid); ret = ofi_fabric_close(&fab->util_fabric); if (ret) return ret; @@ -395,28 +446,28 @@ static int fi_ibv_fabric_close(fid_t fid) return 0; } -static struct fi_ops fi_ibv_fi_ops = { +static struct fi_ops vrb_fi_ops = { .size = sizeof(struct fi_ops), - .close = fi_ibv_fabric_close, + .close = vrb_fabric_close, .bind = fi_no_bind, .control = fi_no_control, .ops_open = fi_no_ops_open, }; -static struct fi_ops_fabric fi_ibv_ops_fabric = { +static struct fi_ops_fabric vrb_ops_fabric = { .size = sizeof(struct fi_ops_fabric), - .domain = fi_ibv_domain, - .passive_ep = fi_ibv_passive_ep, - .eq_open = fi_ibv_eq_open, + .domain = vrb_domain, + .passive_ep = vrb_passive_ep, + .eq_open 
= vrb_eq_open, .wait_open = fi_no_wait_open, - .trywait = fi_ibv_trywait + .trywait = vrb_trywait }; -int fi_ibv_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, +int vrb_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, void *context) { - struct fi_ibv_fabric *fab; - const struct fi_info *cur, *info = fi_ibv_util_prov.info; + struct vrb_fabric *fab; + const struct fi_info *cur, *info = vrb_util_prov.info; int ret = FI_SUCCESS; fab = calloc(1, sizeof(*fab)); @@ -424,7 +475,7 @@ int fi_ibv_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, return -FI_ENOMEM; for (cur = info; cur; cur = info->next) { - ret = ofi_fabric_init(&fi_ibv_prov, cur->fabric_attr, attr, + ret = ofi_fabric_init(&vrb_prov, cur->fabric_attr, attr, &fab->util_fabric, context); if (ret != -FI_ENODATA) break; @@ -438,8 +489,8 @@ int fi_ibv_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, *fabric = &fab->util_fabric.fabric_fid; (*fabric)->fid.fclass = FI_CLASS_FABRIC; - (*fabric)->fid.ops = &fi_ibv_fi_ops; - (*fabric)->ops = &fi_ibv_ops_fabric; + (*fabric)->fid.ops = &vrb_fi_ops; + (*fabric)->ops = &vrb_ops_fabric; return 0; } diff --git a/prov/verbs/src/verbs_domain_xrc.c b/prov/verbs/src/verbs_domain_xrc.c index c453f1a1dc0..553dde170bf 100644 --- a/prov/verbs/src/verbs_domain_xrc.c +++ b/prov/verbs/src/verbs_domain_xrc.c @@ -1,5 +1,6 @@ /* - * Copyright (c) 2018 Cray Inc. All rights reserved. + * Copyright (c) 2018-2019 Cray Inc. All rights reserved. + * Copyright (c) 2018-2019 System Fabric Works, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -36,54 +37,22 @@ /* Domain XRC INI QP RBTree key */ -struct fi_ibv_ini_conn_key { +struct vrb_ini_conn_key { struct sockaddr *addr; - struct fi_ibv_cq *tx_cq; + struct vrb_cq *tx_cq; }; -static int fi_ibv_process_ini_conn(struct fi_ibv_xrc_ep *ep,int reciprocal, +static int vrb_process_ini_conn(struct vrb_xrc_ep *ep,int reciprocal, void *param, size_t paramlen); -/* - * This routine is a work around that creates a QP for the only purpose of - * reserving the QP number. The QP is not transitioned out of the RESET state. 
- */ -int fi_ibv_reserve_qpn(struct fi_ibv_xrc_ep *ep, struct ibv_qp **qp) -{ - struct fi_ibv_domain *domain = fi_ibv_ep_to_domain(&ep->base_ep); - struct fi_ibv_cq *cq = container_of(ep->base_ep.util_ep.tx_cq, - struct fi_ibv_cq, util_cq); - struct ibv_qp_init_attr attr = { 0 }; - int ret; - - /* Limit library allocated resources and do not INIT QP */ - attr.cap.max_send_wr = 1; - attr.cap.max_send_sge = 1; - attr.cap.max_recv_wr = 0; - attr.cap.max_recv_sge = 0; - attr.cap.max_inline_data = 0; - attr.send_cq = cq->cq; - attr.recv_cq = cq->cq; - attr.qp_type = IBV_QPT_RC; - - *qp = ibv_create_qp(domain->pd, &attr); - if (OFI_UNLIKELY(!*qp)) { - ret = -errno; - VERBS_WARN(FI_LOG_EP_CTRL, - "Reservation QP create failed %d\n", -ret); - return ret; - } - return FI_SUCCESS; -} - -static int fi_ibv_create_ini_qp(struct fi_ibv_xrc_ep *ep) +static int vrb_create_ini_qp(struct vrb_xrc_ep *ep) { #if VERBS_HAVE_XRC struct ibv_qp_init_attr_ex attr_ex; - struct fi_ibv_domain *domain = fi_ibv_ep_to_domain(&ep->base_ep); + struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep); int ret; - fi_ibv_msg_ep_get_qp_attr(&ep->base_ep, + vrb_msg_ep_get_qp_attr(&ep->base_ep, (struct ibv_qp_init_attr *)&attr_ex); attr_ex.qp_type = IBV_QPT_XRC_SEND; attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; @@ -103,25 +72,24 @@ static int fi_ibv_create_ini_qp(struct fi_ibv_xrc_ep *ep) #endif /* !VERBS_HAVE_XRC */ } -static inline void fi_ibv_set_ini_conn_key(struct fi_ibv_xrc_ep *ep, - struct fi_ibv_ini_conn_key *key) +static inline void vrb_set_ini_conn_key(struct vrb_xrc_ep *ep, + struct vrb_ini_conn_key *key) { - key->addr = ep->base_ep.info->dest_addr; + key->addr = ep->base_ep.info_attr.dest_addr; key->tx_cq = container_of(ep->base_ep.util_ep.tx_cq, - struct fi_ibv_cq, util_cq); + struct vrb_cq, util_cq); } -/* Caller must hold domain:xrc:ini_mgmt_lock */ -int fi_ibv_get_shared_ini_conn(struct fi_ibv_xrc_ep *ep, - struct fi_ibv_ini_shared_conn **ini_conn) { - struct fi_ibv_domain *domain = fi_ibv_ep_to_domain(&ep->base_ep); - struct fi_ibv_ini_conn_key key; - struct fi_ibv_ini_shared_conn *conn; +/* Caller must hold domain:xrc.ini_lock */ +int vrb_get_shared_ini_conn(struct vrb_xrc_ep *ep, + struct vrb_ini_shared_conn **ini_conn) { + struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep); + struct vrb_ini_conn_key key; + struct vrb_ini_shared_conn *conn; struct ofi_rbnode *node; int ret; - assert(ep->base_ep.id); - fi_ibv_set_ini_conn_key(ep, &key); + vrb_set_ini_conn_key(ep, &key); node = ofi_rbmap_find(domain->xrc.ini_conn_rbmap, &key); if (node) { *ini_conn = node->data; @@ -131,17 +99,22 @@ int fi_ibv_get_shared_ini_conn(struct fi_ibv_xrc_ep *ep, *ini_conn = NULL; conn = calloc(1, sizeof(*conn)); - if (!conn) + if (!conn) { + VERBS_WARN(FI_LOG_EP_CTRL, + "Unable to allocate INI connection memory\n"); return -FI_ENOMEM; + } - conn->tgt_qpn = FI_IBV_NO_INI_TGT_QPNUM; + conn->tgt_qpn = VRB_NO_INI_TGT_QPNUM; conn->peer_addr = mem_dup(key.addr, ofi_sizeofaddr(key.addr)); if (!conn->peer_addr) { + VERBS_WARN(FI_LOG_EP_CTRL, + "mem_dup of peer address failed\n"); free(conn); return -FI_ENOMEM; } conn->tx_cq = container_of(ep->base_ep.util_ep.tx_cq, - struct fi_ibv_cq, util_cq); + struct vrb_cq, util_cq); dlist_init(&conn->pending_list); dlist_init(&conn->active_list); ofi_atomic_initialize32(&conn->ref_cnt, 1); @@ -163,26 +136,34 @@ int fi_ibv_get_shared_ini_conn(struct fi_ibv_xrc_ep *ep, return ret; } -/* Caller must hold domain:xrc:ini_mgmt_lock */ -void fi_ibv_put_shared_ini_conn(struct fi_ibv_xrc_ep *ep) +/* Caller 
must hold domain:xrc.ini_lock */ +void _vrb_put_shared_ini_conn(struct vrb_xrc_ep *ep) { - struct fi_ibv_domain *domain = fi_ibv_ep_to_domain(&ep->base_ep); - struct fi_ibv_ini_shared_conn *ini_conn; - struct fi_ibv_ini_conn_key key; - struct ofi_rbnode *node; + struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep); + struct vrb_ini_shared_conn *ini_conn; + struct vrb_ini_conn_key key; if (!ep->ini_conn) return; /* remove from pending or active connection list */ dlist_remove(&ep->ini_conn_entry); - ep->conn_state = FI_IBV_XRC_UNCONNECTED; + ep->conn_state = VRB_XRC_UNCONNECTED; ini_conn = ep->ini_conn; ep->ini_conn = NULL; ep->base_ep.ibv_qp = NULL; if (ep->base_ep.id) ep->base_ep.id->qp = NULL; + /* If XRC physical QP connection was not completed, make sure + * any pending connection to that destination will get scheduled. */ + if (ep->base_ep.id && ep->base_ep.id == ini_conn->phys_conn_id) { + if (ini_conn->state == VRB_INI_QP_CONNECTING) + ini_conn->state = VRB_INI_QP_UNCONNECTED; + + ini_conn->phys_conn_id = NULL; + } + /* Tear down physical INI/TGT when no longer being used */ if (!ofi_atomic_dec32(&ini_conn->ref_cnt)) { if (ini_conn->ini_qp && ibv_destroy_qp(ini_conn->ini_qp)) @@ -190,19 +171,27 @@ void fi_ibv_put_shared_ini_conn(struct fi_ibv_xrc_ep *ep) "Destroy of XRC physical INI QP failed %d\n", errno); - fi_ibv_set_ini_conn_key(ep, &key); - node = ofi_rbmap_find(domain->xrc.ini_conn_rbmap, &key); - assert(node); - ofi_rbmap_delete(domain->xrc.ini_conn_rbmap, node); + assert(dlist_empty(&ini_conn->pending_list)); + vrb_set_ini_conn_key(ep, &key); + ofi_rbmap_find_delete(domain->xrc.ini_conn_rbmap, &key); free(ini_conn->peer_addr); free(ini_conn); } else { - fi_ibv_sched_ini_conn(ini_conn); + vrb_sched_ini_conn(ini_conn); } } -/* Caller must hold domain:xrc:ini_mgmt_lock */ -void fi_ibv_add_pending_ini_conn(struct fi_ibv_xrc_ep *ep, int reciprocal, +void vrb_put_shared_ini_conn(struct vrb_xrc_ep *ep) +{ + struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep); + + domain->xrc.lock_acquire(&domain->xrc.ini_lock); + _vrb_put_shared_ini_conn(ep); + domain->xrc.lock_release(&domain->xrc.ini_lock); +} + +/* Caller must hold domain:xrc.ini_lock */ +void vrb_add_pending_ini_conn(struct vrb_xrc_ep *ep, int reciprocal, void *conn_param, size_t conn_paramlen) { ep->conn_setup->pending_recip = reciprocal; @@ -213,21 +202,24 @@ void fi_ibv_add_pending_ini_conn(struct fi_ibv_xrc_ep *ep, int reciprocal, dlist_insert_tail(&ep->ini_conn_entry, &ep->ini_conn->pending_list); } -static void fi_ibv_create_shutdown_event(struct fi_ibv_xrc_ep *ep) +/* Caller must hold domain:eq:lock */ +static void vrb_create_shutdown_event(struct vrb_xrc_ep *ep) { struct fi_eq_cm_entry entry = { .fid = &ep->base_ep.util_ep.ep_fid.fid, }; + struct vrb_eq_entry *eq_entry; - fi_ibv_eq_write_event(ep->base_ep.eq, FI_SHUTDOWN, - &entry, sizeof(entry)); + eq_entry = vrb_eq_alloc_entry(FI_SHUTDOWN, &entry, sizeof(entry)); + if (eq_entry) + dlistfd_insert_tail(&eq_entry->item, &ep->base_ep.eq->list_head); } -/* Caller must hold domain:xrc:ini_mgmt_lock */ -void fi_ibv_sched_ini_conn(struct fi_ibv_ini_shared_conn *ini_conn) +/* Caller must hold domain:xrc.ini_lock */ +void vrb_sched_ini_conn(struct vrb_ini_shared_conn *ini_conn) { - struct fi_ibv_xrc_ep *ep; - enum fi_ibv_ini_qp_state last_state; + struct vrb_xrc_ep *ep; + enum vrb_ini_qp_state last_state; int ret; /* Continue to schedule shared connections if the physical connection @@ -236,108 +228,132 @@ void fi_ibv_sched_ini_conn(struct fi_ibv_ini_shared_conn 
*ini_conn) * limit the number of outstanding connections. */ while (1) { if (dlist_empty(&ini_conn->pending_list) || - ini_conn->state == FI_IBV_INI_QP_CONNECTING) + ini_conn->state == VRB_INI_QP_CONNECTING) return; dlist_pop_front(&ini_conn->pending_list, - struct fi_ibv_xrc_ep, ep, ini_conn_entry); + struct vrb_xrc_ep, ep, ini_conn_entry); dlist_insert_tail(&ep->ini_conn_entry, &ep->ini_conn->active_list); last_state = ep->ini_conn->state; - if (last_state == FI_IBV_INI_QP_UNCONNECTED) { + + ret = vrb_create_ep(&ep->base_ep, + last_state == VRB_INI_QP_UNCONNECTED ? + RDMA_PS_TCP : RDMA_PS_UDP, + &ep->base_ep.id); + if (ret) { + VERBS_WARN(FI_LOG_EP_CTRL, + "Failed to create active CM ID %d\n", + ret); + goto err; + } + + if (last_state == VRB_INI_QP_UNCONNECTED) { + assert(!ep->ini_conn->phys_conn_id && ep->base_ep.id); + if (ep->ini_conn->ini_qp && ibv_destroy_qp(ep->ini_conn->ini_qp)) { VERBS_WARN(FI_LOG_EP_CTRL, "Failed to destroy " "physical INI QP %d\n", errno); } - ret = fi_ibv_create_ini_qp(ep); + ret = vrb_create_ini_qp(ep); if (ret) { VERBS_WARN(FI_LOG_EP_CTRL, "Failed to create " "physical INI QP %d\n", ret); goto err; } ep->ini_conn->ini_qp = ep->base_ep.id->qp; - ep->ini_conn->state = FI_IBV_INI_QP_CONNECTING; + ep->ini_conn->state = VRB_INI_QP_CONNECTING; + ep->ini_conn->phys_conn_id = ep->base_ep.id; } else { - if (!ep->base_ep.id->qp) { - ret = fi_ibv_reserve_qpn(ep, - &ep->conn_setup->rsvd_ini_qpn); - if (ret) { - VERBS_WARN(FI_LOG_EP_CTRL, - "Failed to create rsvd INI " - "QP %d\n", ret); - goto err; - } - } + assert(!ep->base_ep.id->qp); + VERBS_DBG(FI_LOG_EP_CTRL, "Sharing XRC INI QPN %d\n", + ep->ini_conn->ini_qp->qp_num); } assert(ep->ini_conn->ini_qp); + ep->base_ep.id->context = &ep->base_ep.util_ep.ep_fid.fid; + ret = rdma_migrate_id(ep->base_ep.id, + ep->base_ep.eq->channel); + if (ret) { + VERBS_WARN(FI_LOG_EP_CTRL, + "Failed to migrate active CM ID %d\n", ret); + goto err; + } + + ofi_straddr_dbg(&vrb_prov, FI_LOG_EP_CTRL, "XRC connect src_addr", + rdma_get_local_addr(ep->base_ep.id)); + ofi_straddr_dbg(&vrb_prov, FI_LOG_EP_CTRL, "XRC connect dest_addr", + rdma_get_peer_addr(ep->base_ep.id)); ep->base_ep.ibv_qp = ep->ini_conn->ini_qp; - ret = fi_ibv_process_ini_conn(ep, ep->conn_setup->pending_recip, + ret = vrb_process_ini_conn(ep, ep->conn_setup->pending_recip, ep->conn_setup->pending_param, ep->conn_setup->pending_paramlen); err: if (ret) { ep->ini_conn->state = last_state; - fi_ibv_put_shared_ini_conn(ep); + _vrb_put_shared_ini_conn(ep); /* We need to let the application know that the * connect request has failed. */ - fi_ibv_create_shutdown_event(ep); + vrb_create_shutdown_event(ep); break; } } } -/* Caller must hold domain:xrc:ini_mgmt_lock */ -int fi_ibv_process_ini_conn(struct fi_ibv_xrc_ep *ep,int reciprocal, +/* Caller must hold domain:xrc:eq:lock */ +int vrb_process_ini_conn(struct vrb_xrc_ep *ep,int reciprocal, void *param, size_t paramlen) { - struct fi_ibv_xrc_cm_data *cm_data = param; - struct rdma_conn_param conn_param = { 0 }; + struct vrb_xrc_cm_data *cm_data = param; int ret; assert(ep->base_ep.ibv_qp); - fi_ibv_set_xrc_cm_data(cm_data, reciprocal, ep->conn_setup->conn_tag, + vrb_set_xrc_cm_data(cm_data, reciprocal, reciprocal ? 
+ ep->conn_setup->remote_conn_tag : + ep->conn_setup->conn_tag, ep->base_ep.eq->xrc.pep_port, - ep->ini_conn->tgt_qpn); - conn_param.private_data = cm_data; - conn_param.private_data_len = paramlen; - conn_param.responder_resources = RDMA_MAX_RESP_RES; - conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH; - conn_param.flow_control = 1; - conn_param.retry_count = 15; - conn_param.rnr_retry_count = 7; - conn_param.srq = 1; - - /* Shared connections use reserved temporary QP numbers to - * avoid the appearance of stale/duplicate CM messages */ + ep->ini_conn->tgt_qpn, ep->srqn); + + ep->base_ep.conn_param.private_data = cm_data; + ep->base_ep.conn_param.private_data_len = paramlen; + ep->base_ep.conn_param.responder_resources = RDMA_MAX_RESP_RES; + ep->base_ep.conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH; + ep->base_ep.conn_param.flow_control = 1; + ep->base_ep.conn_param.retry_count = 15; + ep->base_ep.conn_param.rnr_retry_count = 7; + ep->base_ep.conn_param.srq = 1; + if (!ep->base_ep.id->qp) - conn_param.qp_num = ep->conn_setup->rsvd_ini_qpn->qp_num; + ep->base_ep.conn_param.qp_num = + ep->ini_conn->ini_qp->qp_num; - assert(ep->conn_state == FI_IBV_XRC_UNCONNECTED || - ep->conn_state == FI_IBV_XRC_ORIG_CONNECTED); - fi_ibv_next_xrc_conn_state(ep); + assert(ep->conn_state == VRB_XRC_UNCONNECTED || + ep->conn_state == VRB_XRC_ORIG_CONNECTED); + vrb_next_xrc_conn_state(ep); - ret = rdma_connect(ep->base_ep.id, &conn_param) ? -errno : 0; + ret = rdma_resolve_route(ep->base_ep.id, VERBS_RESOLVE_TIMEOUT); if (ret) { ret = -errno; - VERBS_WARN(FI_LOG_EP_CTRL, "rdma_connect failed %d\n", -ret); - fi_ibv_prev_xrc_conn_state(ep); + VERBS_WARN(FI_LOG_EP_CTRL, + "rdma_resolve_route failed %s (%d)\n", + strerror(-ret), -ret); + vrb_prev_xrc_conn_state(ep); } + return ret; } -int fi_ibv_ep_create_tgt_qp(struct fi_ibv_xrc_ep *ep, uint32_t tgt_qpn) +int vrb_ep_create_tgt_qp(struct vrb_xrc_ep *ep, uint32_t tgt_qpn) { #if VERBS_HAVE_XRC struct ibv_qp_open_attr open_attr; struct ibv_qp_init_attr_ex attr_ex; - struct fi_ibv_domain *domain = fi_ibv_ep_to_domain(&ep->base_ep); - struct ibv_qp *rsvd_qpn; + struct vrb_domain *domain = vrb_ep_to_domain(&ep->base_ep); int ret; assert(ep->tgt_id && !ep->tgt_id->qp); @@ -345,14 +361,6 @@ int fi_ibv_ep_create_tgt_qp(struct fi_ibv_xrc_ep *ep, uint32_t tgt_qpn) /* If a target QP number was specified then open that existing * QP for sharing. */ if (tgt_qpn) { - ret = fi_ibv_reserve_qpn(ep, &rsvd_qpn); - if (!rsvd_qpn) { - VERBS_WARN(FI_LOG_EP_CTRL, - "Create of XRC reserved QPN failed %d\n", - ret); - return ret; - } - memset(&open_attr, 0, sizeof(open_attr)); open_attr.qp_num = tgt_qpn; open_attr.comp_mask = IBV_QP_OPEN_ATTR_NUM | @@ -367,16 +375,14 @@ int fi_ibv_ep_create_tgt_qp(struct fi_ibv_xrc_ep *ep, uint32_t tgt_qpn) ret = -errno; VERBS_WARN(FI_LOG_EP_CTRL, "XRC TGT QP ibv_open_qp failed %d\n", -ret); - ibv_destroy_qp(rsvd_qpn); return ret; } - ep->conn_setup->rsvd_tgt_qpn = rsvd_qpn; return FI_SUCCESS; } /* An existing XRC target was not specified, create XRC TGT * side of new physical connection. 
*/ - fi_ibv_msg_ep_get_qp_attr(&ep->base_ep, + vrb_msg_ep_get_qp_attr(&ep->base_ep, (struct ibv_qp_init_attr *)&attr_ex); attr_ex.qp_type = IBV_QPT_XRC_RECV; attr_ex.qp_context = ep; @@ -398,7 +404,7 @@ int fi_ibv_ep_create_tgt_qp(struct fi_ibv_xrc_ep *ep, uint32_t tgt_qpn) #endif /* !VERBS_HAVE_XRC */ } -static int fi_ibv_put_tgt_qp(struct fi_ibv_xrc_ep *ep) +static int vrb_put_tgt_qp(struct vrb_xrc_ep *ep) { int ret; @@ -422,21 +428,17 @@ static int fi_ibv_put_tgt_qp(struct fi_ibv_xrc_ep *ep) return FI_SUCCESS; } -int fi_ibv_ep_destroy_xrc_qp(struct fi_ibv_xrc_ep *ep) +/* Caller must hold eq:lock */ +int vrb_ep_destroy_xrc_qp(struct vrb_xrc_ep *ep) { - struct fi_ibv_domain *domain = fi_ibv_ep_to_domain(&ep->base_ep); + vrb_put_shared_ini_conn(ep); - if (ep->base_ep.ibv_qp) { - fastlock_acquire(&domain->xrc.ini_mgmt_lock); - fi_ibv_put_shared_ini_conn(ep); - fastlock_release(&domain->xrc.ini_mgmt_lock); - } if (ep->base_ep.id) { rdma_destroy_id(ep->base_ep.id); ep->base_ep.id = NULL; } if (ep->tgt_ibv_qp) - fi_ibv_put_tgt_qp(ep); + vrb_put_tgt_qp(ep); if (ep->tgt_id) { rdma_destroy_id(ep->tgt_id); @@ -446,10 +448,10 @@ int fi_ibv_ep_destroy_xrc_qp(struct fi_ibv_xrc_ep *ep) } FI_VERBS_XRC_ONLY -static int fi_ibv_ini_conn_compare(struct ofi_rbmap *map, void *key, void *data) +static int vrb_ini_conn_compare(struct ofi_rbmap *map, void *key, void *data) { - struct fi_ibv_ini_shared_conn *ini_conn = data; - struct fi_ibv_ini_conn_key *_key = key; + struct vrb_ini_shared_conn *ini_conn = data; + struct vrb_ini_conn_key *_key = key; int ret; assert(_key->addr->sa_family == ini_conn->peer_addr->sa_family); @@ -479,7 +481,7 @@ static int fi_ibv_ini_conn_compare(struct ofi_rbmap *map, void *key, void *data) } FI_VERBS_XRC_ONLY -static int fi_ibv_domain_xrc_validate_hw(struct fi_ibv_domain *domain) +static int vrb_domain_xrc_validate_hw(struct vrb_domain *domain) { struct ibv_device_attr attr; int ret; @@ -492,19 +494,19 @@ static int fi_ibv_domain_xrc_validate_hw(struct fi_ibv_domain *domain) return FI_SUCCESS; } -int fi_ibv_domain_xrc_init(struct fi_ibv_domain *domain) +int vrb_domain_xrc_init(struct vrb_domain *domain) { #if VERBS_HAVE_XRC struct ibv_xrcd_init_attr attr; int ret; - ret = fi_ibv_domain_xrc_validate_hw(domain); + ret = vrb_domain_xrc_validate_hw(domain); if (ret) return ret; domain->xrc.xrcd_fd = -1; - if (fi_ibv_gl_data.msg.xrcd_filename) { - domain->xrc.xrcd_fd = open(fi_ibv_gl_data.msg.xrcd_filename, + if (vrb_gl_data.msg.xrcd_filename) { + domain->xrc.xrcd_fd = open(vrb_gl_data.msg.xrcd_filename, O_CREAT, S_IWUSR | S_IRUSR); if (domain->xrc.xrcd_fd < 0) { VERBS_WARN(FI_LOG_DOMAIN, @@ -523,16 +525,22 @@ int fi_ibv_domain_xrc_init(struct fi_ibv_domain *domain) goto xrcd_err; } - fastlock_init(&domain->xrc.ini_mgmt_lock); - - domain->xrc.ini_conn_rbmap = ofi_rbmap_create(fi_ibv_ini_conn_compare); + domain->xrc.ini_conn_rbmap = ofi_rbmap_create(vrb_ini_conn_compare); if (!domain->xrc.ini_conn_rbmap) { ret = -ENOMEM; VERBS_INFO_ERRNO(FI_LOG_DOMAIN, "XRC INI QP RB Tree", -ret); goto rbmap_err; } - domain->use_xrc = 1; + fastlock_init(&domain->xrc.ini_lock); + if (domain->util_domain.threading == FI_THREAD_DOMAIN) { + domain->xrc.lock_acquire = ofi_fastlock_acquire_noop; + domain->xrc.lock_release = ofi_fastlock_release_noop; + } else { + domain->xrc.lock_acquire = ofi_fastlock_acquire; + domain->xrc.lock_release = ofi_fastlock_release; + } + domain->flags |= VRB_USE_XRC; return FI_SUCCESS; rbmap_err: @@ -548,13 +556,12 @@ int fi_ibv_domain_xrc_init(struct fi_ibv_domain *domain) 
#endif /* !VERBS_HAVE_XRC */ } -int fi_ibv_domain_xrc_cleanup(struct fi_ibv_domain *domain) +int vrb_domain_xrc_cleanup(struct vrb_domain *domain) { #if VERBS_HAVE_XRC int ret; assert(domain->xrc.xrcd); - /* All endpoint and hence XRC INI QP should be closed */ if (!ofi_rbmap_empty(domain->xrc.ini_conn_rbmap)) { VERBS_WARN(FI_LOG_DOMAIN, "XRC domain busy\n"); @@ -572,7 +579,7 @@ int fi_ibv_domain_xrc_cleanup(struct fi_ibv_domain *domain) } ofi_rbmap_destroy(domain->xrc.ini_conn_rbmap); - fastlock_destroy(&domain->xrc.ini_mgmt_lock); + fastlock_destroy(&domain->xrc.ini_lock); #endif /* VERBS_HAVE_XRC */ return 0; } diff --git a/prov/verbs/src/verbs_ep.c b/prov/verbs/src/verbs_ep.c index 4ce3abf3757..2898d769731 100644 --- a/prov/verbs/src/verbs_ep.c +++ b/prov/verbs/src/verbs_ep.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2013-2018 Intel Corporation, Inc. All rights reserved. + * Copyright (c) 2019 System Fabric Works, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -34,35 +35,235 @@ #include "fi_verbs.h" -#define VERBS_RESOLVE_TIMEOUT 2000 // ms +static struct fi_ops_msg vrb_srq_msg_ops; -static struct fi_ops_msg fi_ibv_srq_msg_ops; -static inline int fi_ibv_msg_ep_cmdata_size(fid_t fid) +void vrb_add_credits(struct fid_ep *ep_fid, size_t credits) { - struct fi_ibv_pep *pep; - struct fi_ibv_ep *ep; - struct fi_info *info; + struct vrb_ep *ep; + struct util_cq *cq; + + ep = container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); + cq = ep->util_ep.tx_cq; + + cq->cq_fastlock_acquire(&cq->cq_lock); + ep->peer_rq_credits += credits; + cq->cq_fastlock_release(&cq->cq_lock); +} + +/* Receive CQ credits are pre-allocated */ +ssize_t vrb_post_recv(struct vrb_ep *ep, struct ibv_recv_wr *wr) +{ + struct vrb_domain *domain; + struct vrb_context *ctx; + struct vrb_cq *cq; + struct ibv_recv_wr *bad_wr; + uint64_t credits_to_give; + int ret; + + cq = container_of(ep->util_ep.rx_cq, struct vrb_cq, util_cq); + domain = vrb_ep_to_domain(ep); + + cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock); + ctx = ofi_buf_alloc(cq->ctx_pool); + if (!ctx) + goto unlock; + + ctx->ep = ep; + ctx->user_ctx = (void *) (uintptr_t) wr->wr_id; + ctx->flags = FI_RECV; + wr->wr_id = (uintptr_t) ctx; + + ret = ibv_post_recv(ep->ibv_qp, wr, &bad_wr); + wr->wr_id = (uintptr_t) ctx->user_ctx; + if (ret) + goto freebuf; + + if (++ep->rq_credits_avail >= ep->threshold) { + credits_to_give = ep->rq_credits_avail; + ep->rq_credits_avail = 0; + } else { + credits_to_give = 0; + } + cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); + + if (credits_to_give && + domain->send_credits(&ep->util_ep.ep_fid, credits_to_give)) { + cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock); + ep->rq_credits_avail += credits_to_give; + cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); + } + + return 0; + +freebuf: + ofi_buf_free(ctx); +unlock: + cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); + return -FI_EAGAIN; +} + +ssize_t vrb_post_send(struct vrb_ep *ep, struct ibv_send_wr *wr, uint64_t flags) +{ + struct vrb_context *ctx; + struct vrb_domain *domain; + struct vrb_cq *cq; + struct vrb_cq *cq_rx; + struct ibv_send_wr *bad_wr; + struct ibv_wc wc; + size_t credits_to_give = 0; + int ret; + + cq = container_of(ep->util_ep.tx_cq, struct vrb_cq, util_cq); + domain = vrb_ep_to_domain(ep); + cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock); + ctx = ofi_buf_alloc(cq->ctx_pool); + if (!ctx) + goto unlock; + + if (!cq->credits 
|| !ep->sq_credits || !ep->peer_rq_credits) { + ret = vrb_poll_cq(cq, &wc); + if (ret > 0) + vrb_save_wc(cq, &wc); + + if (!cq->credits || !ep->sq_credits || !ep->peer_rq_credits) { + goto freebuf; + } + } + + if (vrb_wr_consumes_recv(wr) && !--ep->peer_rq_credits && + !(flags & FI_PRIORITY)) { + /* Last credit is reserved for credit update */ + ep->peer_rq_credits++; + goto freebuf; + } + + cq->credits--; + ep->sq_credits--; + + ctx->ep = ep; + ctx->user_ctx = (void *) (uintptr_t) wr->wr_id; + ctx->flags = FI_TRANSMIT | flags; + wr->wr_id = (uintptr_t) ctx; + + ret = ibv_post_send(ep->ibv_qp, wr, &bad_wr); + wr->wr_id = (uintptr_t) ctx->user_ctx; + if (ret) { + VERBS_WARN(FI_LOG_EP_DATA, "Post send failed - %zd\n", + vrb_convert_ret(ret)); + goto credits; + } + cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); + + return 0; + +credits: + if (vrb_wr_consumes_recv(wr)) + ep->peer_rq_credits++; + cq->credits++; + ep->sq_credits++; +freebuf: + ofi_buf_free(ctx); +unlock: + cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); + cq_rx = container_of(ep->util_ep.rx_cq, struct vrb_cq, util_cq); + cq_rx->util_cq.cq_fastlock_acquire(&cq_rx->util_cq.cq_lock); + if (ep->rq_credits_avail >= ep->threshold) { + credits_to_give = ep->rq_credits_avail; + ep->rq_credits_avail = 0; + } + cq_rx->util_cq.cq_fastlock_release(&cq_rx->util_cq.cq_lock); + if (credits_to_give && + domain->send_credits(&ep->util_ep.ep_fid, credits_to_give)) { + cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock); + ep->rq_credits_avail += credits_to_give; + cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); + } + return -FI_EAGAIN; +} + +ssize_t vrb_send_iov(struct vrb_ep *ep, struct ibv_send_wr *wr, + const struct iovec *iov, void **desc, int count, + uint64_t flags) +{ + enum fi_hmem_iface iface; + uint64_t device; + void *bounce_buf; + void *send_desc; + size_t i, len = 0; + ssize_t ret; + + wr->sg_list = alloca(sizeof(*wr->sg_list) * count); + for (i = 0; i < count; i++) { + wr->sg_list[i].addr = (uintptr_t) iov[i].iov_base; + wr->sg_list[i].length = iov[i].iov_len; + wr->sg_list[i].lkey = + desc ? 
((struct vrb_mem_desc *) desc[i])->lkey : 0; + len += iov[i].iov_len; + } + + if (desc) { + iface = ((struct vrb_mem_desc *) desc[0])->info.iface; + device = ((struct vrb_mem_desc *) desc[0])->info.device; + send_desc = desc[0]; + + wr->send_flags = VERBS_INJECT_FLAGS(ep, len, flags, send_desc); + } else { + iface = FI_HMEM_SYSTEM; + device = 0; + send_desc = NULL; + + wr->send_flags = IBV_SEND_INLINE; + } + + if (wr->send_flags & IBV_SEND_INLINE) { + bounce_buf = alloca(len); + ret = ofi_copy_from_hmem_iov(bounce_buf, len, iface, device, + iov, count, 0); + if (ret != len) { + VERBS_WARN(FI_LOG_EP_DATA, "hmem copy error"); + return -FI_EIO; + } + + wr->sg_list[0] = vrb_init_sge(bounce_buf, len, NULL); + wr->num_sge = 1; + } else { + wr->num_sge = count; + } + + wr->wr_id = VERBS_COMP_FLAGS(ep, flags, wr->wr_id); + if (flags & FI_FENCE) + wr->send_flags |= IBV_SEND_FENCE; + + ret = vrb_post_send(ep, wr, flags); + return ret; +} + +static inline int vrb_msg_ep_cmdata_size(fid_t fid) +{ + struct vrb_pep *pep; + struct vrb_ep *ep; + bool is_xrc; switch (fid->fclass) { case FI_CLASS_PEP: - pep = container_of(fid, struct fi_ibv_pep, pep_fid.fid); - info = pep->info; + pep = container_of(fid, struct vrb_pep, pep_fid.fid); + is_xrc = vrb_is_xrc_info(pep->info); break; case FI_CLASS_EP: - ep = container_of(fid, struct fi_ibv_ep, util_ep.ep_fid.fid); - info = ep->info; + ep = container_of(fid, struct vrb_ep, util_ep.ep_fid.fid); + is_xrc = vrb_is_xrc_ep(ep); break; default: - info = NULL; + is_xrc = 0; }; - if (fi_ibv_is_xrc(info)) - return VERBS_CM_DATA_SIZE - sizeof(struct fi_ibv_xrc_cm_data); + if (is_xrc) + return VERBS_CM_DATA_SIZE - sizeof(struct vrb_xrc_cm_data); else return VERBS_CM_DATA_SIZE; } -static int fi_ibv_ep_getopt(fid_t fid, int level, int optname, +static int vrb_ep_getopt(fid_t fid, int level, int optname, void *optval, size_t *optlen) { switch (level) { @@ -71,7 +272,7 @@ static int fi_ibv_ep_getopt(fid_t fid, int level, int optname, case FI_OPT_CM_DATA_SIZE: if (*optlen < sizeof(size_t)) return -FI_ETOOSMALL; - *((size_t *) optval) = fi_ibv_msg_ep_cmdata_size(fid); + *((size_t *) optval) = vrb_msg_ep_cmdata_size(fid); *optlen = sizeof(size_t); return 0; default: @@ -83,7 +284,7 @@ static int fi_ibv_ep_getopt(fid_t fid, int level, int optname, return 0; } -static int fi_ibv_ep_setopt(fid_t fid, int level, int optname, +static int vrb_ep_setopt(fid_t fid, int level, int optname, const void *optval, size_t optlen) { switch (level) { @@ -95,18 +296,18 @@ static int fi_ibv_ep_setopt(fid_t fid, int level, int optname, return 0; } -static struct fi_ops_ep fi_ibv_ep_base_ops = { +static struct fi_ops_ep vrb_ep_base_ops = { .size = sizeof(struct fi_ops_ep), .cancel = fi_no_cancel, - .getopt = fi_ibv_ep_getopt, - .setopt = fi_ibv_ep_setopt, + .getopt = vrb_ep_getopt, + .setopt = vrb_ep_setopt, .tx_ctx = fi_no_tx_ctx, .rx_ctx = fi_no_rx_ctx, .rx_size_left = fi_no_rx_size_left, .tx_size_left = fi_no_tx_size_left, }; -static struct fi_ops_rma fi_ibv_dgram_rma_ops = { +static struct fi_ops_rma vrb_dgram_rma_ops = { .size = sizeof(struct fi_ops_rma), .read = fi_no_rma_read, .readv = fi_no_rma_readv, @@ -119,7 +320,7 @@ static struct fi_ops_rma fi_ibv_dgram_rma_ops = { .injectdata = fi_no_rma_injectdata, }; -static int fi_ibv_alloc_wrs(struct fi_ibv_ep *ep) +static int vrb_alloc_wrs(struct vrb_ep *ep) { ep->wrs = calloc(1, sizeof(*ep->wrs)); if (!ep->wrs) @@ -140,26 +341,26 @@ static int fi_ibv_alloc_wrs(struct fi_ibv_ep *ep) return FI_SUCCESS; } -static void fi_ibv_free_wrs(struct fi_ibv_ep 
*ep) +static void vrb_free_wrs(struct vrb_ep *ep) { free(ep->wrs); } -static void fi_ibv_util_ep_progress_noop(struct util_ep *util_ep) +static void vrb_util_ep_progress_noop(struct util_ep *util_ep) { /* This routine shouldn't be called */ assert(0); } -static struct fi_ibv_ep * -fi_ibv_alloc_init_ep(struct fi_info *info, struct fi_ibv_domain *domain, +static struct vrb_ep * +vrb_alloc_init_ep(struct fi_info *info, struct vrb_domain *domain, void *context) { - struct fi_ibv_ep *ep; - struct fi_ibv_xrc_ep *xrc_ep; + struct vrb_ep *ep; + struct vrb_xrc_ep *xrc_ep; int ret; - if (fi_ibv_is_xrc(info)) { + if (vrb_is_xrc_info(info)) { xrc_ep = calloc(1, sizeof(*xrc_ep)); if (!xrc_ep) return NULL; @@ -171,93 +372,116 @@ fi_ibv_alloc_init_ep(struct fi_info *info, struct fi_ibv_domain *domain, return NULL; } - ep->info = fi_dupinfo(info); - if (!ep->info) - goto err1; + // When we are enabling flow control, we artificially inject + // a credit so that the credit messaging itself is not blocked + // by a lack of credits. To counter this, we will adjust the number + // of credits we send the first time by initializing to -1. + ep->rq_credits_avail = -1; if (domain->util_domain.threading != FI_THREAD_SAFE) { - if (fi_ibv_alloc_wrs(ep)) - goto err2; + if (vrb_alloc_wrs(ep)) + goto err1; } - ret = ofi_endpoint_init(&domain->util_domain.domain_fid, &fi_ibv_util_prov, info, - &ep->util_ep, context, fi_ibv_util_ep_progress_noop); + ret = ofi_endpoint_init(&domain->util_domain.domain_fid, &vrb_util_prov, info, + &ep->util_ep, context, vrb_util_ep_progress_noop); if (ret) { VERBS_WARN(FI_LOG_EP_CTRL, "Unable to initialize EP, error - %d\n", ret); - goto err3; + goto err2; } ep->util_ep.ep_fid.msg = calloc(1, sizeof(*ep->util_ep.ep_fid.msg)); if (!ep->util_ep.ep_fid.msg) - goto err4; + goto err3; return ep; -err4: - (void) ofi_endpoint_close(&ep->util_ep); err3: - fi_ibv_free_wrs(ep); + (void) ofi_endpoint_close(&ep->util_ep); err2: - fi_freeinfo(ep->info); + vrb_free_wrs(ep); err1: free(ep); return NULL; } -static int fi_ibv_close_free_ep(struct fi_ibv_ep *ep) +static int vrb_close_free_ep(struct vrb_ep *ep) { + struct vrb_cq *cq; int ret; free(ep->util_ep.ep_fid.msg); ep->util_ep.ep_fid.msg = NULL; + free(ep->cm_priv_data); + if (ep->util_ep.rx_cq) { + cq = container_of(ep->util_ep.rx_cq, struct vrb_cq, util_cq); + cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock); + cq->credits += ep->rx_cq_size; + cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); + } ret = ofi_endpoint_close(&ep->util_ep); if (ret) return ret; - fi_ibv_free_wrs(ep); - fi_freeinfo(ep->info); + vrb_free_wrs(ep); + free(ep->info_attr.src_addr); + free(ep->info_attr.dest_addr); free(ep); return 0; } /* Caller must hold eq:lock */ -static inline void fi_ibv_ep_xrc_close(struct fi_ibv_ep *ep) +static inline void vrb_ep_xrc_close(struct vrb_ep *ep) { - struct fi_ibv_xrc_ep *xrc_ep = container_of(ep, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *xrc_ep = container_of(ep, struct vrb_xrc_ep, base_ep); if (xrc_ep->conn_setup) - fi_ibv_free_xrc_conn_setup(xrc_ep, 0); - fi_ibv_ep_destroy_xrc_qp(xrc_ep); + vrb_free_xrc_conn_setup(xrc_ep, 0); + + if (xrc_ep->conn_map_node) + vrb_eq_remove_sidr_conn(xrc_ep); + vrb_ep_destroy_xrc_qp(xrc_ep); xrc_ep->magic = 0; } -static int fi_ibv_ep_close(fid_t fid) +static int vrb_ep_close(fid_t fid) { int ret; - struct fi_ibv_fabric *fab; - struct fi_ibv_ep *ep = - container_of(fid, struct fi_ibv_ep, util_ep.ep_fid.fid); + struct vrb_fabric *fab; + struct vrb_ep *ep = + container_of(fid, struct vrb_ep,
util_ep.ep_fid.fid); switch (ep->util_ep.type) { case FI_EP_MSG: - if (ep->eq) + if (ep->eq) { fastlock_acquire(&ep->eq->lock); + if (ep->eq->err.err && ep->eq->err.fid == fid) { + if (ep->eq->err.err_data) { + free(ep->eq->err.err_data); + ep->eq->err.err_data = NULL; + ep->eq->err.err_data_size = 0; + } + ep->eq->err.err = 0; + ep->eq->err.prov_errno = 0; + } + vrb_eq_remove_events(ep->eq, fid); + } - if (fi_ibv_is_xrc(ep->info)) - fi_ibv_ep_xrc_close(ep); + if (vrb_is_xrc_ep(ep)) + vrb_ep_xrc_close(ep); else rdma_destroy_ep(ep->id); if (ep->eq) fastlock_release(&ep->eq->lock); - fi_ibv_cleanup_cq(ep); + vrb_cleanup_cq(ep); break; case FI_EP_DGRAM: fab = container_of(&ep->util_ep.domain->fabric->fabric_fid, - struct fi_ibv_fabric, util_fabric.fabric_fid.fid); + struct vrb_fabric, util_fabric.fabric_fid.fid); ofi_ns_del_local_name(&fab->name_server, &ep->service, &ep->ep_name); ret = ibv_destroy_qp(ep->ibv_qp); @@ -266,7 +490,7 @@ static int fi_ibv_ep_close(fid_t fid) "Unable to destroy QP (errno = %d)\n", errno); return -errno; } - fi_ibv_cleanup_cq(ep); + vrb_cleanup_cq(ep); break; default: VERBS_INFO(FI_LOG_DOMAIN, "Unknown EP type\n"); @@ -276,7 +500,7 @@ static int fi_ibv_ep_close(fid_t fid) VERBS_INFO(FI_LOG_DOMAIN, "EP %p is being closed\n", ep); - ret = fi_ibv_close_free_ep(ep); + ret = vrb_close_free_ep(ep); if (ret) { VERBS_WARN(FI_LOG_DOMAIN, "Unable to close EP (%p), error - %d\n", ep, ret); @@ -286,9 +510,9 @@ static int fi_ibv_ep_close(fid_t fid) return 0; } -static inline int fi_ibv_ep_xrc_set_tgt_chan(struct fi_ibv_ep *ep) +static inline int vrb_ep_xrc_set_tgt_chan(struct vrb_ep *ep) { - struct fi_ibv_xrc_ep *xrc_ep = container_of(ep, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *xrc_ep = container_of(ep, struct vrb_xrc_ep, base_ep); if (xrc_ep->tgt_id) return rdma_migrate_id(xrc_ep->tgt_id, ep->eq->channel); @@ -296,78 +520,82 @@ static inline int fi_ibv_ep_xrc_set_tgt_chan(struct fi_ibv_ep *ep) return FI_SUCCESS; } -static int fi_ibv_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) +static int vrb_ep_bind(struct fid *fid, struct fid *bfid, uint64_t flags) { - struct fi_ibv_ep *ep; - struct fi_ibv_cq *cq = - container_of(bfid, struct fi_ibv_cq, util_cq.cq_fid.fid); - struct fi_ibv_dgram_av *av; + struct vrb_ep *ep; + struct vrb_cq *cq = + container_of(bfid, struct vrb_cq, util_cq.cq_fid.fid); + struct vrb_dgram_av *av; int ret; - ep = container_of(fid, struct fi_ibv_ep, util_ep.ep_fid.fid); - ret = ofi_ep_bind_valid(&fi_ibv_prov, bfid, flags); + ep = container_of(fid, struct vrb_ep, util_ep.ep_fid.fid); + ret = ofi_ep_bind_valid(&vrb_prov, bfid, flags); if (ret) return ret; - switch (ep->util_ep.type) { - case FI_EP_MSG: - switch (bfid->fclass) { - case FI_CLASS_CQ: - ret = ofi_ep_bind_cq(&ep->util_ep, &cq->util_cq, flags); - if (ret) - return ret; - break; - case FI_CLASS_EQ: - ep->eq = container_of(bfid, struct fi_ibv_eq, eq_fid.fid); - ret = rdma_migrate_id(ep->id, ep->eq->channel); - if (ret) - return -errno; - if (fi_ibv_is_xrc(ep->info)) { - ret = fi_ibv_ep_xrc_set_tgt_chan(ep); - if (ret) - return -errno; + switch (bfid->fclass) { + case FI_CLASS_CQ: + /* Reserve space for receives */ + if (flags & FI_RECV) { + cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock); + if (cq->credits < ep->rx_cq_size) { + VERBS_WARN(FI_LOG_DOMAIN, + "Rx CQ is fully reserved\n"); + ep->rx_cq_size = 0; } - break; - case FI_CLASS_SRX_CTX: - ep->srq_ep = container_of(bfid, struct fi_ibv_srq_ep, ep_fid.fid); - break; - default: - return -FI_EINVAL; + cq->credits -= ep->rx_cq_size; 
+ cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); + } + + ret = ofi_ep_bind_cq(&ep->util_ep, &cq->util_cq, flags); + if (ret) { + cq->util_cq.cq_fastlock_acquire(&cq->util_cq.cq_lock); + cq->credits += ep->rx_cq_size; + cq->util_cq.cq_fastlock_release(&cq->util_cq.cq_lock); + return ret; } break; - case FI_EP_DGRAM: - switch (bfid->fclass) { - case FI_CLASS_CQ: - ret = ofi_ep_bind_cq(&ep->util_ep, &cq->util_cq, flags); - if (ret) - return ret; - break; - case FI_CLASS_AV: - av = container_of(bfid, struct fi_ibv_dgram_av, - util_av.av_fid.fid); - return ofi_ep_bind_av(&ep->util_ep, &av->util_av); - default: + case FI_CLASS_EQ: + if (ep->util_ep.type != FI_EP_MSG) return -FI_EINVAL; - } + + ep->eq = container_of(bfid, struct vrb_eq, eq_fid.fid); + + /* Make sure EQ channel is not polled during migrate */ + fastlock_acquire(&ep->eq->lock); + if (vrb_is_xrc_ep(ep)) + ret = vrb_ep_xrc_set_tgt_chan(ep); + else + ret = rdma_migrate_id(ep->id, ep->eq->channel); + fastlock_release(&ep->eq->lock); + if (ret) + return -errno; + + break; + case FI_CLASS_SRX_CTX: + if (ep->util_ep.type != FI_EP_MSG) + return -FI_EINVAL; + + ep->srq_ep = container_of(bfid, struct vrb_srq_ep, ep_fid.fid); break; + case FI_CLASS_AV: + if (ep->util_ep.type != FI_EP_DGRAM) + return -FI_EINVAL; + + av = container_of(bfid, struct vrb_dgram_av, + util_av.av_fid.fid); + return ofi_ep_bind_av(&ep->util_ep, &av->util_av); default: - VERBS_INFO(FI_LOG_DOMAIN, "Unknown EP type\n"); - assert(0); return -FI_EINVAL; } - /* Reserve space for receives */ - if ((bfid->fclass == FI_CLASS_CQ) && (flags & FI_RECV)) { - assert(ep->rx_size < INT32_MAX); - ofi_atomic_sub32(&cq->credits, (int32_t)ep->rx_size); - } return 0; } -static int fi_ibv_create_dgram_ep(struct fi_ibv_domain *domain, struct fi_ibv_ep *ep, +static int vrb_create_dgram_ep(struct vrb_domain *domain, struct vrb_ep *ep, struct ibv_qp_init_attr *init_attr) { - struct fi_ibv_fabric *fab; + struct vrb_fabric *fab; struct ibv_qp_attr attr = { .qp_state = IBV_QPS_INIT, .pkey_index = 0, @@ -423,7 +651,7 @@ static int fi_ibv_create_dgram_ep(struct fi_ibv_domain *domain, struct fi_ibv_ep } } - if (ibv_query_gid(domain->verbs, 1, fi_ibv_gl_data.gid_idx, &gid)) { + if (ibv_query_gid(domain->verbs, 1, vrb_gl_data.gid_idx, &gid)) { VERBS_WARN(FI_LOG_EP_CTRL, "Unable to query GID, errno = %d", errno); @@ -451,7 +679,7 @@ static int fi_ibv_create_dgram_ep(struct fi_ibv_domain *domain, struct fi_ibv_ep ep->ep_name.pkey = p_key; fab = container_of(ep->util_ep.domain->fabric, - struct fi_ibv_fabric, util_fabric); + struct vrb_fabric, util_fabric); ofi_ns_add_local_name(&fab->name_server, &ep->service, &ep->ep_name); @@ -459,11 +687,11 @@ static int fi_ibv_create_dgram_ep(struct fi_ibv_domain *domain, struct fi_ibv_ep return 0; } -/* fi_ibv_srq_ep::xrc.prepost_lock must be held */ +/* vrb_srq_ep::xrc.prepost_lock must be held */ FI_VERBS_XRC_ONLY -static int fi_ibv_process_xrc_preposted(struct fi_ibv_srq_ep *srq_ep) +static int vrb_process_xrc_preposted(struct vrb_srq_ep *srq_ep) { - struct fi_ibv_xrc_srx_prepost *recv; + struct vrb_xrc_srx_prepost *recv; struct slist_entry *entry; int ret; @@ -471,7 +699,7 @@ static int fi_ibv_process_xrc_preposted(struct fi_ibv_srq_ep *srq_ep) * posting here results in adding the RX entries to the SRQ */ while (!slist_empty(&srq_ep->xrc.prepost_list)) { entry = slist_remove_head(&srq_ep->xrc.prepost_list); - recv = container_of(entry, struct fi_ibv_xrc_srx_prepost, + recv = container_of(entry, struct vrb_xrc_srx_prepost, prepost_entry); ret = 
fi_recv(&srq_ep->ep_fid, recv->buf, recv->len, recv->desc, recv->src_addr, recv->context); @@ -484,22 +712,22 @@ static int fi_ibv_process_xrc_preposted(struct fi_ibv_srq_ep *srq_ep) return FI_SUCCESS; } -static int fi_ibv_ep_enable_xrc(struct fi_ibv_ep *ep) +static int vrb_ep_enable_xrc(struct vrb_ep *ep) { #if VERBS_HAVE_XRC - struct fi_ibv_xrc_ep *xrc_ep = container_of(ep, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *xrc_ep = container_of(ep, struct vrb_xrc_ep, base_ep); - struct fi_ibv_srq_ep *srq_ep = ep->srq_ep; - struct fi_ibv_domain *domain = container_of(ep->util_ep.rx_cq->domain, - struct fi_ibv_domain, util_domain); - struct fi_ibv_cq *cq = container_of(ep->util_ep.rx_cq, - struct fi_ibv_cq, util_cq); + struct vrb_srq_ep *srq_ep = ep->srq_ep; + struct vrb_domain *domain = container_of(ep->util_ep.rx_cq->domain, + struct vrb_domain, util_domain); + struct vrb_cq *cq = container_of(ep->util_ep.rx_cq, + struct vrb_cq, util_cq); struct ibv_srq_init_attr_ex attr; ssize_t ret; /* XRC EP additional initialization */ dlist_init(&xrc_ep->ini_conn_entry); - xrc_ep->conn_state = FI_IBV_XRC_UNCONNECTED; + xrc_ep->conn_state = VRB_XRC_UNCONNECTED; fastlock_acquire(&srq_ep->xrc.prepost_lock); if (srq_ep->srq) { @@ -517,6 +745,14 @@ static int fi_ibv_ep_enable_xrc(struct fi_ibv_ep *ep) goto done; } + if (cq->credits < srq_ep->xrc.max_recv_wr) { + VERBS_WARN(FI_LOG_EP_CTRL, + "CQ credits %" PRId64 " insufficient\n", + cq->credits); + ret = -FI_EINVAL; + goto done; + } + memset(&attr, 0, sizeof(attr)); attr.attr.max_wr = srq_ep->xrc.max_recv_wr; attr.attr.max_sge = srq_ep->xrc.max_sge; @@ -538,13 +774,14 @@ static int fi_ibv_ep_enable_xrc(struct fi_ibv_ep *ep) cq->util_cq.cq_fastlock_acquire(&cq->xrc.srq_list_lock); dlist_insert_tail(&srq_ep->xrc.srq_entry, &cq->xrc.srq_list); srq_ep->xrc.cq = cq; + cq->credits -= srq_ep->xrc.max_recv_wr; cq->util_cq.cq_fastlock_release(&cq->xrc.srq_list_lock); ibv_get_srq_num(srq_ep->srq, &xrc_ep->srqn); /* Swap functions since locking is no longer required */ - srq_ep->ep_fid.msg = &fi_ibv_srq_msg_ops; - ret = fi_ibv_process_xrc_preposted(srq_ep); + srq_ep->ep_fid.msg = &vrb_srq_msg_ops; + ret = vrb_process_xrc_preposted(srq_ep); done: fastlock_release(&srq_ep->xrc.prepost_lock); @@ -554,39 +791,39 @@ static int fi_ibv_ep_enable_xrc(struct fi_ibv_ep *ep) #endif /* !VERBS_HAVE_XRC */ } -void fi_ibv_msg_ep_get_qp_attr(struct fi_ibv_ep *ep, - struct ibv_qp_init_attr *attr) +void vrb_msg_ep_get_qp_attr(struct vrb_ep *ep, + struct ibv_qp_init_attr *attr) { attr->qp_context = ep; if (ep->util_ep.tx_cq) { - struct fi_ibv_cq *cq = container_of(ep->util_ep.tx_cq, - struct fi_ibv_cq, util_cq); + struct vrb_cq *cq = container_of(ep->util_ep.tx_cq, + struct vrb_cq, util_cq); - attr->cap.max_send_wr = ep->info->tx_attr->size; - attr->cap.max_send_sge = ep->info->tx_attr->iov_limit; + attr->cap.max_send_wr = ep->info_attr.tx_size; + attr->cap.max_send_sge = ep->info_attr.tx_iov_limit; attr->send_cq = cq->cq; } else { - struct fi_ibv_cq *cq = - container_of(ep->util_ep.rx_cq, struct fi_ibv_cq, util_cq); + struct vrb_cq *cq = + container_of(ep->util_ep.rx_cq, struct vrb_cq, util_cq); attr->send_cq = cq->cq; } if (ep->util_ep.rx_cq) { - struct fi_ibv_cq *cq = - container_of(ep->util_ep.rx_cq, struct fi_ibv_cq, util_cq); + struct vrb_cq *cq = + container_of(ep->util_ep.rx_cq, struct vrb_cq, util_cq); - attr->cap.max_recv_wr = ep->info->rx_attr->size; - attr->cap.max_recv_sge = ep->info->rx_attr->iov_limit; + attr->cap.max_recv_wr = ep->info_attr.rx_size; + attr->cap.max_recv_sge 
= ep->info_attr.rx_iov_limit; attr->recv_cq = cq->cq; } else { - struct fi_ibv_cq *cq = - container_of(ep->util_ep.tx_cq, struct fi_ibv_cq, util_cq); + struct vrb_cq *cq = + container_of(ep->util_ep.tx_cq, struct vrb_cq, util_cq); attr->recv_cq = cq->cq; } - attr->cap.max_inline_data = ep->info->tx_attr->inject_size; + attr->cap.max_inline_data = ep->info_attr.inject_size; attr->qp_type = IBV_QPT_RC; attr->sq_sig_all = 1; @@ -598,12 +835,12 @@ void fi_ibv_msg_ep_get_qp_attr(struct fi_ibv_ep *ep, } -static int fi_ibv_ep_enable(struct fid_ep *ep_fid) +static int vrb_ep_enable(struct fid_ep *ep_fid) { struct ibv_qp_init_attr attr = { 0 }; - struct fi_ibv_ep *ep = container_of(ep_fid, struct fi_ibv_ep, + struct vrb_ep *ep = container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); - struct fi_ibv_domain *domain = fi_ibv_ep_to_domain(ep); + struct vrb_domain *domain = vrb_ep_to_domain(ep); int ret; if (!ep->eq && (ep->util_ep.type == FI_EP_MSG)) { @@ -618,36 +855,35 @@ static int fi_ibv_ep_enable(struct fid_ep *ep_fid) return -FI_ENOCQ; } - if (!ep->util_ep.tx_cq && (ofi_send_allowed(ep->util_ep.caps) || - ofi_rma_initiate_allowed(ep->util_ep.caps))) { + if (!ep->util_ep.tx_cq && (ofi_needs_tx(ep->util_ep.caps))) { VERBS_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to " "a send completion queue when it has transmit " "capabilities enabled (FI_SEND | FI_RMA).\n"); return -FI_ENOCQ; } - if (!ep->util_ep.rx_cq && ofi_recv_allowed(ep->util_ep.caps)) { + if (!ep->util_ep.rx_cq && ofi_needs_rx(ep->util_ep.caps)) { VERBS_WARN(FI_LOG_EP_CTRL, "Endpoint is not bound to " "a receive completion queue when it has receive " "capabilities enabled. (FI_RECV)\n"); return -FI_ENOCQ; } - fi_ibv_msg_ep_get_qp_attr(ep, &attr); + vrb_msg_ep_get_qp_attr(ep, &attr); switch (ep->util_ep.type) { case FI_EP_MSG: if (ep->srq_ep) { /* Override receive function pointers to prevent the user from * posting Receive WRs to a QP where a SRQ is attached to it */ - if (domain->use_xrc) { - *ep->util_ep.ep_fid.msg = fi_ibv_msg_srq_xrc_ep_msg_ops; - return fi_ibv_ep_enable_xrc(ep); + if (domain->flags & VRB_USE_XRC) { + *ep->util_ep.ep_fid.msg = vrb_msg_srq_xrc_ep_msg_ops; + return vrb_ep_enable_xrc(ep); } else { ep->util_ep.ep_fid.msg->recv = fi_no_msg_recv; ep->util_ep.ep_fid.msg->recvv = fi_no_msg_recvv; ep->util_ep.ep_fid.msg->recvmsg = fi_no_msg_recvmsg; } - } else if (domain->use_xrc) { + } else if (domain->flags & VRB_USE_XRC) { VERBS_WARN(FI_LOG_EP_CTRL, "XRC EP_MSG not bound " "to srx_context\n"); return -FI_EINVAL; @@ -669,7 +905,7 @@ static int fi_ibv_ep_enable(struct fid_ep *ep_fid) case FI_EP_DGRAM: assert(domain); attr.sq_sig_all = 1; - ret = fi_ibv_create_dgram_ep(domain, ep, &attr); + ret = vrb_create_dgram_ep(domain, ep, &attr); if (ret) { VERBS_WARN(FI_LOG_EP_CTRL, "Unable to create dgram EP: %s (%d)\n", fi_strerror(-ret), -ret); @@ -684,7 +920,7 @@ static int fi_ibv_ep_enable(struct fid_ep *ep_fid) return 0; } -static int fi_ibv_ep_control(struct fid *fid, int command, void *arg) +static int vrb_ep_control(struct fid *fid, int command, void *arg) { struct fid_ep *ep; @@ -693,7 +929,7 @@ static int fi_ibv_ep_control(struct fid *fid, int command, void *arg) ep = container_of(fid, struct fid_ep, fid); switch (command) { case FI_ENABLE: - return fi_ibv_ep_enable(ep); + return vrb_ep_enable(ep); break; default: return -FI_ENOSYS; @@ -704,57 +940,45 @@ static int fi_ibv_ep_control(struct fid *fid, int command, void *arg) } } -static int fi_ibv_dgram_ep_setname(fid_t ep_fid, void *addr, size_t addrlen) +static int 
vrb_dgram_ep_setname(fid_t ep_fid, void *addr, size_t addrlen) { - struct fi_ibv_ep *ep; + struct vrb_ep *ep; void *save_addr; int ret = FI_SUCCESS; - if (ep_fid->fclass != FI_CLASS_EP) - return -FI_EINVAL; - - ep = container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid.fid); - if (!ep) - return -FI_EINVAL; - - if (addrlen < ep->info->src_addrlen) { + ep = container_of(ep_fid, struct vrb_ep, util_ep.ep_fid.fid); + if (addrlen < ep->info_attr.src_addrlen) { VERBS_INFO(FI_LOG_EP_CTRL, "addrlen expected: %zu, got: %zu\n", - ep->info->src_addrlen, addrlen); + ep->info_attr.src_addrlen, addrlen); return -FI_ETOOSMALL; } /* * save previous address to be able make * a roll back on the previous one */ - save_addr = ep->info->src_addr; + save_addr = ep->info_attr.src_addr; - ep->info->src_addr = calloc(1, ep->info->src_addrlen); - if (!ep->info->src_addr) { - ep->info->src_addr = save_addr; + ep->info_attr.src_addr = calloc(1, ep->info_attr.src_addrlen); + if (!ep->info_attr.src_addr) { + ep->info_attr.src_addr = save_addr; ret = -FI_ENOMEM; goto err; } - memcpy(ep->info->src_addr, addr, ep->info->src_addrlen); - memcpy(&ep->ep_name, addr, ep->info->src_addrlen); + memcpy(ep->info_attr.src_addr, addr, ep->info_attr.src_addrlen); + memcpy(&ep->ep_name, addr, ep->info_attr.src_addrlen); err: - ep->info->src_addr = save_addr; + ep->info_attr.src_addr = save_addr; return ret; } -static int fi_ibv_dgram_ep_getname(fid_t ep_fid, void *addr, size_t *addrlen) +static int vrb_dgram_ep_getname(fid_t ep_fid, void *addr, size_t *addrlen) { - struct fi_ibv_ep *ep; - - if (ep_fid->fclass != FI_CLASS_EP) - return -FI_EINVAL; - - ep = container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid.fid); - if (!ep) - return -FI_EINVAL; + struct vrb_ep *ep; + ep = container_of(ep_fid, struct vrb_ep, util_ep.ep_fid.fid); if (*addrlen < sizeof(ep->ep_name)) { *addrlen = sizeof(ep->ep_name); VERBS_INFO(FI_LOG_EP_CTRL, @@ -770,18 +994,18 @@ static int fi_ibv_dgram_ep_getname(fid_t ep_fid, void *addr, size_t *addrlen) return FI_SUCCESS; } -static struct fi_ops fi_ibv_ep_ops = { +static struct fi_ops vrb_ep_ops = { .size = sizeof(struct fi_ops), - .close = fi_ibv_ep_close, - .bind = fi_ibv_ep_bind, - .control = fi_ibv_ep_control, + .close = vrb_ep_close, + .bind = vrb_ep_bind, + .control = vrb_ep_control, .ops_open = fi_no_ops_open, }; -static struct fi_ops_cm fi_ibv_dgram_cm_ops = { - .size = sizeof(fi_ibv_dgram_cm_ops), - .setname = fi_ibv_dgram_ep_setname, - .getname = fi_ibv_dgram_ep_getname, +static struct fi_ops_cm vrb_dgram_cm_ops = { + .size = sizeof(vrb_dgram_cm_ops), + .setname = vrb_dgram_ep_setname, + .getname = vrb_dgram_ep_getname, .getpeer = fi_no_getpeer, .connect = fi_no_connect, .listen = fi_no_listen, @@ -791,24 +1015,57 @@ static struct fi_ops_cm fi_ibv_dgram_cm_ops = { .join = fi_no_join, }; -int fi_ibv_open_ep(struct fid_domain *domain, struct fi_info *info, +static int vrb_ep_save_info_attr(struct vrb_ep *ep, struct fi_info *info) +{ + ep->info_attr.protocol = info->ep_attr ? 
info->ep_attr->protocol: + FI_PROTO_UNSPEC; + ep->info_attr.inject_size = info->tx_attr->inject_size; + ep->info_attr.tx_size = info->tx_attr->size; + ep->info_attr.tx_iov_limit = info->tx_attr->iov_limit; + ep->info_attr.rx_size = info->rx_attr->size; + ep->info_attr.rx_iov_limit = info->rx_attr->iov_limit; + ep->info_attr.addr_format = info->addr_format; + ep->info_attr.handle = info->handle; + + if (info->src_addr) { + ep->info_attr.src_addr = mem_dup(info->src_addr, info->src_addrlen); + if (ep->info_attr.src_addr == NULL) { + VERBS_WARN(FI_LOG_EP_CTRL, "Memory error save src addr\n"); + return -FI_ENOMEM; + } + ep->info_attr.src_addrlen = info->src_addrlen; + } + if (info->dest_addr) { + ep->info_attr.dest_addr = mem_dup(info->dest_addr, info->dest_addrlen); + if (ep->info_attr.dest_addr == NULL) { + VERBS_WARN(FI_LOG_EP_CTRL, "Memory error save dest addr\n"); + free(ep->info_attr.src_addr); + ep->info_attr.src_addr = NULL; + return -FI_ENOMEM; + } + ep->info_attr.dest_addrlen = info->dest_addrlen; + } + return FI_SUCCESS; +} + +int vrb_open_ep(struct fid_domain *domain, struct fi_info *info, struct fid_ep **ep_fid, void *context) { - struct fi_ibv_domain *dom; - struct fi_ibv_ep *ep; - struct fi_ibv_connreq *connreq; - struct fi_ibv_pep *pep; + struct vrb_domain *dom; + struct vrb_ep *ep; + struct vrb_connreq *connreq; + struct vrb_pep *pep; struct fi_info *fi; int ret; if (info->src_addr) - ofi_straddr_dbg(&fi_ibv_prov, FI_LOG_FABRIC, + ofi_straddr_dbg(&vrb_prov, FI_LOG_FABRIC, "open_ep src addr", info->src_addr); if (info->dest_addr) - ofi_straddr_dbg(&fi_ibv_prov, FI_LOG_FABRIC, + ofi_straddr_dbg(&vrb_prov, FI_LOG_FABRIC, "open_ep dest addr", info->dest_addr); - dom = container_of(domain, struct fi_ibv_domain, + dom = container_of(domain, struct vrb_domain, util_domain.domain_fid); /* strncmp is used here, because the function is used * to allocate DGRAM (has prefix -dgram) and MSG EPs */ @@ -823,66 +1080,80 @@ int fi_ibv_open_ep(struct fid_domain *domain, struct fi_info *info, fi = dom->info; if (info->ep_attr) { - ret = fi_ibv_check_ep_attr(info, fi); + ret = vrb_check_ep_attr(info, fi); if (ret) return ret; } if (info->tx_attr) { - ret = ofi_check_tx_attr(&fi_ibv_prov, fi->tx_attr, + ret = ofi_check_tx_attr(&vrb_prov, fi->tx_attr, info->tx_attr, info->mode); if (ret) return ret; } if (info->rx_attr) { - ret = fi_ibv_check_rx_attr(info->rx_attr, info, fi); + ret = vrb_check_rx_attr(info->rx_attr, info, fi); if (ret) return ret; } - ep = fi_ibv_alloc_init_ep(info, dom, context); - if (!ep) + ep = vrb_alloc_init_ep(info, dom, context); + if (!ep) { + VERBS_WARN(FI_LOG_EP_CTRL, + "Unable to allocate/init EP memory\n"); return -FI_ENOMEM; + } - ep->inject_limit = ep->info->tx_attr->inject_size; + ep->peer_rq_credits = UINT64_MAX; + ep->threshold = INT64_MAX; /* disables RQ flow control */ + ep->hmem_enabled = !!(ep->util_ep.caps & FI_HMEM); + + ret = vrb_ep_save_info_attr(ep, info); + if (ret) + goto err1; switch (info->ep_attr->type) { case FI_EP_MSG: - if (dom->use_xrc) { + if (dom->flags & VRB_USE_XRC) { if (dom->util_domain.threading == FI_THREAD_SAFE) { - *ep->util_ep.ep_fid.msg = fi_ibv_msg_xrc_ep_msg_ops_ts; - ep->util_ep.ep_fid.rma = &fi_ibv_msg_xrc_ep_rma_ops_ts; + *ep->util_ep.ep_fid.msg = vrb_msg_xrc_ep_msg_ops_ts; + ep->util_ep.ep_fid.rma = &vrb_msg_xrc_ep_rma_ops_ts; } else { - *ep->util_ep.ep_fid.msg = fi_ibv_msg_xrc_ep_msg_ops; - ep->util_ep.ep_fid.rma = &fi_ibv_msg_xrc_ep_rma_ops; + *ep->util_ep.ep_fid.msg = vrb_msg_xrc_ep_msg_ops; + ep->util_ep.ep_fid.rma = 
&vrb_msg_xrc_ep_rma_ops; } - ep->util_ep.ep_fid.cm = &fi_ibv_msg_xrc_ep_cm_ops; - ep->util_ep.ep_fid.atomic = &fi_ibv_msg_xrc_ep_atomic_ops; + ep->util_ep.ep_fid.cm = &vrb_msg_xrc_ep_cm_ops; + ep->util_ep.ep_fid.atomic = &vrb_msg_xrc_ep_atomic_ops; } else { if (dom->util_domain.threading == FI_THREAD_SAFE) { - *ep->util_ep.ep_fid.msg = fi_ibv_msg_ep_msg_ops_ts; - ep->util_ep.ep_fid.rma = &fi_ibv_msg_ep_rma_ops_ts; + *ep->util_ep.ep_fid.msg = vrb_msg_ep_msg_ops_ts; + ep->util_ep.ep_fid.rma = &vrb_msg_ep_rma_ops_ts; } else { - *ep->util_ep.ep_fid.msg = fi_ibv_msg_ep_msg_ops; - ep->util_ep.ep_fid.rma = &fi_ibv_msg_ep_rma_ops; + *ep->util_ep.ep_fid.msg = vrb_msg_ep_msg_ops; + ep->util_ep.ep_fid.rma = &vrb_msg_ep_rma_ops; } - ep->util_ep.ep_fid.cm = &fi_ibv_msg_ep_cm_ops; - ep->util_ep.ep_fid.atomic = &fi_ibv_msg_ep_atomic_ops; + ep->util_ep.ep_fid.cm = &vrb_msg_ep_cm_ops; + ep->util_ep.ep_fid.atomic = &vrb_msg_ep_atomic_ops; } if (!info->handle) { - ret = fi_ibv_create_ep(NULL, NULL, 0, info, NULL, &ep->id); - if (ret) - goto err1; + /* Only RC, XRC active RDMA CM ID is created at connect */ + if (!(dom->flags & VRB_USE_XRC)) { + ret = vrb_create_ep(ep, + vrb_get_port_space(info->addr_format), &ep->id); + if (ret) + goto err1; + ep->id->context = &ep->util_ep.ep_fid.fid; + } } else if (info->handle->fclass == FI_CLASS_CONNREQ) { connreq = container_of(info->handle, - struct fi_ibv_connreq, handle); - if (dom->use_xrc) { + struct vrb_connreq, handle); + if (dom->flags & VRB_USE_XRC) { assert(connreq->is_xrc); if (!connreq->xrc.is_reciprocal) { - ret = fi_ibv_process_xrc_connreq(ep, + ret = vrb_process_xrc_connreq(ep, connreq); if (ret) goto err1; @@ -890,9 +1161,10 @@ int fi_ibv_open_ep(struct fid_domain *domain, struct fi_info *info, } else { ep->id = connreq->id; ep->ibv_qp = ep->id->qp; + ep->id->context = &ep->util_ep.ep_fid.fid; } } else if (info->handle->fclass == FI_CLASS_PEP) { - pep = container_of(info->handle, struct fi_ibv_pep, pep_fid.fid); + pep = container_of(info->handle, struct vrb_pep, pep_fid.fid); ep->id = pep->id; ep->ibv_qp = ep->id->qp; pep->id = NULL; @@ -903,17 +1175,11 @@ int fi_ibv_open_ep(struct fid_domain *domain, struct fi_info *info, VERBS_INFO(FI_LOG_DOMAIN, "Unable to rdma_resolve_addr\n"); goto err2; } - - if (rdma_resolve_route(ep->id, VERBS_RESOLVE_TIMEOUT)) { - ret = -errno; - VERBS_INFO(FI_LOG_DOMAIN, "Unable to rdma_resolve_route\n"); - goto err2; - } + ep->id->context = &ep->util_ep.ep_fid.fid; } else { ret = -FI_ENOSYS; goto err1; } - ep->id->context = &ep->util_ep.ep_fid.fid; break; case FI_EP_DGRAM: ep->service = (info->src_addr) ? @@ -921,12 +1187,12 @@ int fi_ibv_open_ep(struct fid_domain *domain, struct fi_info *info, (((getpid() & 0x7FFF) << 16) + ((uintptr_t)ep & 0xFFFF)); if (dom->util_domain.threading == FI_THREAD_SAFE) { - *ep->util_ep.ep_fid.msg = fi_ibv_dgram_msg_ops_ts; + *ep->util_ep.ep_fid.msg = vrb_dgram_msg_ops_ts; } else { - *ep->util_ep.ep_fid.msg = fi_ibv_dgram_msg_ops; + *ep->util_ep.ep_fid.msg = vrb_dgram_msg_ops; } - ep->util_ep.ep_fid.rma = &fi_ibv_dgram_rma_ops; - ep->util_ep.ep_fid.cm = &fi_ibv_dgram_cm_ops; + ep->util_ep.ep_fid.rma = &vrb_dgram_rma_ops; + ep->util_ep.ep_fid.cm = &vrb_dgram_cm_ops; break; default: VERBS_INFO(FI_LOG_DOMAIN, "Unknown EP type\n"); @@ -935,31 +1201,42 @@ int fi_ibv_open_ep(struct fid_domain *domain, struct fi_info *info, goto err1; } - ep->rx_size = info->rx_attr->size; + if (info->ep_attr->rx_ctx_cnt == 0 || + info->ep_attr->rx_ctx_cnt == 1) { + ep->rx_cq_size = info->rx_attr ? 
info->rx_attr->size : + fi->rx_attr->size; + } + + if (info->ep_attr->tx_ctx_cnt == 0 || + info->ep_attr->tx_ctx_cnt == 1) { + ep->sq_credits = info->tx_attr ? info->tx_attr->size : + fi->tx_attr->size; + } *ep_fid = &ep->util_ep.ep_fid; - ep->util_ep.ep_fid.fid.ops = &fi_ibv_ep_ops; - ep->util_ep.ep_fid.ops = &fi_ibv_ep_base_ops; + ep->util_ep.ep_fid.fid.ops = &vrb_ep_ops; + ep->util_ep.ep_fid.ops = &vrb_ep_base_ops; return FI_SUCCESS; err2: ep->ibv_qp = NULL; - rdma_destroy_ep(ep->id); + if (ep->id) + rdma_destroy_ep(ep->id); err1: - fi_ibv_close_free_ep(ep); + vrb_close_free_ep(ep); return ret; } -static int fi_ibv_pep_bind(fid_t fid, struct fid *bfid, uint64_t flags) +static int vrb_pep_bind(fid_t fid, struct fid *bfid, uint64_t flags) { - struct fi_ibv_pep *pep; + struct vrb_pep *pep; int ret; - pep = container_of(fid, struct fi_ibv_pep, pep_fid.fid); + pep = container_of(fid, struct vrb_pep, pep_fid.fid); if (bfid->fclass != FI_CLASS_EQ) return -FI_EINVAL; - pep->eq = container_of(bfid, struct fi_ibv_eq, eq_fid.fid); + pep->eq = container_of(bfid, struct vrb_eq, eq_fid.fid); /* * This is a restrictive solution that enables an XRC EP to * inform it's peer the port that should be used in making the @@ -967,30 +1244,35 @@ static int fi_ibv_pep_bind(fid_t fid, struct fid *bfid, uint64_t flags) * it limits an EQ to a single passive endpoint. TODO: implement * a more general solution. */ - if (fi_ibv_is_xrc(pep->info)) { - if (pep->eq->xrc.pep_port) { + if (vrb_is_xrc_info(pep->info)) { + if (pep->eq->xrc.pep_port) { VERBS_WARN(FI_LOG_EP_CTRL, "XRC limits EQ binding to a single PEP\n"); return -FI_EINVAL; - } - pep->eq->xrc.pep_port = ntohs(rdma_get_src_port(pep->id)); + } + pep->eq->xrc.pep_port = ntohs(rdma_get_src_port(pep->id)); } ret = rdma_migrate_id(pep->id, pep->eq->channel); if (ret) return -errno; - return 0; + if (vrb_is_xrc_info(pep->info)) { + ret = rdma_migrate_id(pep->xrc_ps_udp_id, pep->eq->channel); + if (ret) + return -errno; + } + return FI_SUCCESS; } -static int fi_ibv_pep_control(struct fid *fid, int command, void *arg) +static int vrb_pep_control(struct fid *fid, int command, void *arg) { - struct fi_ibv_pep *pep; + struct vrb_pep *pep; int ret = 0; switch (fid->fclass) { case FI_CLASS_PEP: - pep = container_of(fid, struct fi_ibv_pep, pep_fid.fid); + pep = container_of(fid, struct vrb_pep, pep_fid.fid); switch (command) { case FI_BACKLOG: if (!arg) @@ -1010,30 +1292,32 @@ static int fi_ibv_pep_control(struct fid *fid, int command, void *arg) return ret; } -static int fi_ibv_pep_close(fid_t fid) +static int vrb_pep_close(fid_t fid) { - struct fi_ibv_pep *pep; + struct vrb_pep *pep; - pep = container_of(fid, struct fi_ibv_pep, pep_fid.fid); + pep = container_of(fid, struct vrb_pep, pep_fid.fid); if (pep->id) rdma_destroy_ep(pep->id); + if (pep->xrc_ps_udp_id) + rdma_destroy_ep(pep->xrc_ps_udp_id); fi_freeinfo(pep->info); free(pep); return 0; } -static struct fi_ops fi_ibv_pep_fi_ops = { +static struct fi_ops vrb_pep_fi_ops = { .size = sizeof(struct fi_ops), - .close = fi_ibv_pep_close, - .bind = fi_ibv_pep_bind, - .control = fi_ibv_pep_control, + .close = vrb_pep_close, + .bind = vrb_pep_bind, + .control = vrb_pep_control, .ops_open = fi_no_ops_open, }; -static struct fi_ops_ep fi_ibv_pep_ops = { +static struct fi_ops_ep vrb_pep_ops = { .size = sizeof(struct fi_ops_ep), - .getopt = fi_ibv_ep_getopt, + .getopt = vrb_ep_getopt, .setopt = fi_no_setopt, .tx_ctx = fi_no_tx_ctx, .rx_ctx = fi_no_rx_ctx, @@ -1041,10 +1325,10 @@ static struct fi_ops_ep fi_ibv_pep_ops = { 
.tx_size_left = fi_no_tx_size_left, }; -int fi_ibv_passive_ep(struct fid_fabric *fabric, struct fi_info *info, +int vrb_passive_ep(struct fid_fabric *fabric, struct fi_info *info, struct fid_pep **pep, void *context) { - struct fi_ibv_pep *_pep; + struct vrb_pep *_pep; int ret; _pep = calloc(1, sizeof *_pep); @@ -1062,9 +1346,10 @@ int fi_ibv_passive_ep(struct fid_fabric *fabric, struct fi_info *info, _pep->info->dest_addrlen = 0; } - ret = rdma_create_id(NULL, &_pep->id, &_pep->pep_fid.fid, RDMA_PS_TCP); + ret = rdma_create_id(NULL, &_pep->id, &_pep->pep_fid.fid, + vrb_get_port_space(_pep->info->addr_format)); if (ret) { - VERBS_INFO(FI_LOG_DOMAIN, "Unable to create rdma_cm_id\n"); + VERBS_INFO(FI_LOG_DOMAIN, "Unable to create PEP rdma_cm_id\n"); goto err2; } @@ -1077,17 +1362,41 @@ int fi_ibv_passive_ep(struct fid_fabric *fabric, struct fi_info *info, _pep->bound = 1; } + /* XRC listens on both RDMA_PS_TCP and RDMA_PS_UDP */ + if (vrb_is_xrc_info(info)) { + ret = rdma_create_id(NULL, &_pep->xrc_ps_udp_id, + &_pep->pep_fid.fid, RDMA_PS_UDP); + if (ret) { + VERBS_INFO(FI_LOG_DOMAIN, + "Unable to create PEP PS_UDP rdma_cm_id\n"); + goto err3; + } + /* Currently both listens must be bound to same port number */ + ofi_addr_set_port(_pep->info->src_addr, + ntohs(rdma_get_src_port(_pep->id))); + ret = rdma_bind_addr(_pep->xrc_ps_udp_id, + (struct sockaddr *)_pep->info->src_addr); + if (ret) { + VERBS_INFO(FI_LOG_DOMAIN, + "Unable to bind address to PS_UDP rdma_cm_id\n"); + goto err4; + } + } + _pep->pep_fid.fid.fclass = FI_CLASS_PEP; _pep->pep_fid.fid.context = context; - _pep->pep_fid.fid.ops = &fi_ibv_pep_fi_ops; - _pep->pep_fid.ops = &fi_ibv_pep_ops; - _pep->pep_fid.cm = fi_ibv_pep_ops_cm(_pep); + _pep->pep_fid.fid.ops = &vrb_pep_fi_ops; + _pep->pep_fid.ops = &vrb_pep_ops; + _pep->pep_fid.cm = vrb_pep_ops_cm(_pep); _pep->src_addrlen = info->src_addrlen; *pep = &_pep->pep_fid; return 0; +err4: + /* Only possible for XRC code path */ + rdma_destroy_id(_pep->xrc_ps_udp_id); err3: rdma_destroy_id(_pep->id); err2: @@ -1097,7 +1406,7 @@ int fi_ibv_passive_ep(struct fid_fabric *fabric, struct fi_info *info, return ret; } -static struct fi_ops_ep fi_ibv_srq_ep_base_ops = { +static struct fi_ops_ep vrb_srq_ep_base_ops = { .size = sizeof(struct fi_ops_ep), .cancel = fi_no_cancel, .getopt = fi_no_getopt, @@ -1108,7 +1417,7 @@ static struct fi_ops_ep fi_ibv_srq_ep_base_ops = { .tx_size_left = fi_no_tx_size_left, }; -static struct fi_ops_cm fi_ibv_srq_cm_ops = { +static struct fi_ops_cm vrb_srq_cm_ops = { .size = sizeof(struct fi_ops_cm), .setname = fi_no_setname, .getname = fi_no_getname, @@ -1121,7 +1430,7 @@ static struct fi_ops_cm fi_ibv_srq_cm_ops = { .join = fi_no_join, }; -static struct fi_ops_rma fi_ibv_srq_rma_ops = { +static struct fi_ops_rma vrb_srq_rma_ops = { .size = sizeof(struct fi_ops_rma), .read = fi_no_rma_read, .readv = fi_no_rma_readv, @@ -1134,7 +1443,7 @@ static struct fi_ops_rma fi_ibv_srq_rma_ops = { .injectdata = fi_no_rma_injectdata, }; -static struct fi_ops_atomic fi_ibv_srq_atomic_ops = { +static struct fi_ops_atomic vrb_srq_atomic_ops = { .size = sizeof(struct fi_ops_atomic), .write = fi_no_atomic_write, .writev = fi_no_atomic_writev, @@ -1151,45 +1460,69 @@ static struct fi_ops_atomic fi_ibv_srq_atomic_ops = { .compwritevalid = fi_no_atomic_compwritevalid, }; +/* Receive CQ credits are pre-allocated */ +ssize_t vrb_post_srq(struct vrb_srq_ep *ep, struct ibv_recv_wr *wr) +{ + struct vrb_context *ctx; + struct ibv_recv_wr *bad_wr; + int ret; + + 
fastlock_acquire(&ep->ctx_lock); + ctx = ofi_buf_alloc(ep->ctx_pool); + if (!ctx) + goto unlock; + + ctx->srx = ep; + ctx->user_ctx = (void *) (uintptr_t) wr->wr_id; + ctx->flags = FI_RECV; + wr->wr_id = (uintptr_t) ctx; + + ret = ibv_post_srq_recv(ep->srq, wr, &bad_wr); + wr->wr_id = (uintptr_t) ctx->user_ctx; + if (ret) + goto freebuf; + fastlock_release(&ep->ctx_lock); + return 0; + +freebuf: + ofi_buf_free(ctx); +unlock: + fastlock_release(&ep->ctx_lock); + return -FI_EAGAIN; +} + static inline ssize_t -fi_ibv_srq_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) +vrb_srq_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) { - struct fi_ibv_srq_ep *ep = - container_of(ep_fid, struct fi_ibv_srq_ep, ep_fid); + struct vrb_srq_ep *ep = container_of(ep_fid, struct vrb_srq_ep, ep_fid); struct ibv_recv_wr wr = { - .wr_id = (uintptr_t)msg->context, + .wr_id = (uintptr_t )msg->context, .num_sge = msg->iov_count, .next = NULL, }; - struct ibv_recv_wr *bad_wr; - assert(ep->srq); - - fi_ibv_set_sge_iov(wr.sg_list, msg->msg_iov, msg->iov_count, msg->desc); - - return fi_ibv_handle_post(ibv_post_srq_recv(ep->srq, &wr, &bad_wr)); + vrb_iov_dupa(wr.sg_list, msg->msg_iov, msg->desc, msg->iov_count); + return vrb_post_srq(ep, &wr); } static ssize_t -fi_ibv_srq_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, +vrb_srq_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context) { - struct fi_ibv_srq_ep *ep = - container_of(ep_fid, struct fi_ibv_srq_ep, ep_fid); - struct ibv_sge sge = fi_ibv_init_sge(buf, len, desc); + struct vrb_srq_ep *ep = container_of(ep_fid, struct vrb_srq_ep, ep_fid); + struct ibv_sge sge = vrb_init_sge(buf, len, desc); struct ibv_recv_wr wr = { - .wr_id = (uintptr_t)context, + .wr_id = (uintptr_t) context, .num_sge = 1, .sg_list = &sge, .next = NULL, }; - struct ibv_recv_wr *bad_wr; - return fi_ibv_handle_post(ibv_post_srq_recv(ep->srq, &wr, &bad_wr)); + return vrb_post_srq(ep, &wr); } static ssize_t -fi_ibv_srq_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, +vrb_srq_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, void *context) { struct fi_msg msg = { @@ -1200,14 +1533,14 @@ fi_ibv_srq_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, .context = context, }; - return fi_ibv_srq_ep_recvmsg(ep_fid, &msg, 0); + return vrb_srq_ep_recvmsg(ep_fid, &msg, 0); } -static struct fi_ops_msg fi_ibv_srq_msg_ops = { +static struct fi_ops_msg vrb_srq_msg_ops = { .size = sizeof(struct fi_ops_msg), - .recv = fi_ibv_srq_ep_recv, - .recvv = fi_ibv_srq_ep_recvv, - .recvmsg = fi_ibv_srq_ep_recvmsg, + .recv = vrb_srq_ep_recv, + .recvv = vrb_srq_ep_recvv, + .recvmsg = vrb_srq_ep_recvmsg, .send = fi_no_msg_send, .sendv = fi_no_msg_sendv, .sendmsg = fi_no_msg_sendmsg, @@ -1224,12 +1557,12 @@ static struct fi_ops_msg fi_ibv_srq_msg_ops = { * to the shared receive context is enabled. 
*/ static ssize_t -fi_ibv_xrc_srq_ep_prepost_recv(struct fid_ep *ep_fid, void *buf, size_t len, +vrb_xrc_srq_ep_prepost_recv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context) { - struct fi_ibv_srq_ep *ep = - container_of(ep_fid, struct fi_ibv_srq_ep, ep_fid); - struct fi_ibv_xrc_srx_prepost *recv; + struct vrb_srq_ep *ep = + container_of(ep_fid, struct vrb_srq_ep, ep_fid); + struct vrb_xrc_srx_prepost *recv; ssize_t ret; fastlock_acquire(&ep->xrc.prepost_lock); @@ -1238,7 +1571,7 @@ fi_ibv_xrc_srq_ep_prepost_recv(struct fid_ep *ep_fid, void *buf, size_t len, * receive message function is swapped out. */ if (ep->srq) { fastlock_release(&ep->xrc.prepost_lock); - return fi_ibv_handle_post(fi_recv(ep_fid, buf, len, desc, + return vrb_convert_ret(fi_recv(ep_fid, buf, len, desc, src_addr, context)); } @@ -1267,9 +1600,9 @@ fi_ibv_xrc_srq_ep_prepost_recv(struct fid_ep *ep_fid, void *buf, size_t len, return ret; } -static struct fi_ops_msg fi_ibv_xrc_srq_msg_ops = { +static struct fi_ops_msg vrb_xrc_srq_msg_ops = { .size = sizeof(struct fi_ops_msg), - .recv = fi_ibv_xrc_srq_ep_prepost_recv, + .recv = vrb_xrc_srq_ep_prepost_recv, .recvv = fi_no_msg_recvv, /* Not used by RXM */ .recvmsg = fi_no_msg_recvmsg, /* Not used by RXM */ .send = fi_no_msg_send, @@ -1280,25 +1613,25 @@ static struct fi_ops_msg fi_ibv_xrc_srq_msg_ops = { .injectdata = fi_no_msg_injectdata, }; -static void fi_ibv_cleanup_prepost_bufs(struct fi_ibv_srq_ep *srq_ep) +static void vrb_cleanup_prepost_bufs(struct vrb_srq_ep *srq_ep) { - struct fi_ibv_xrc_srx_prepost *recv; + struct vrb_xrc_srx_prepost *recv; struct slist_entry *entry; while (!slist_empty(&srq_ep->xrc.prepost_list)) { entry = slist_remove_head(&srq_ep->xrc.prepost_list); - recv = container_of(entry, struct fi_ibv_xrc_srx_prepost, + recv = container_of(entry, struct vrb_xrc_srx_prepost, prepost_entry); free(recv); } } /* Must hold the associated CQ lock cq::xrc.srq_list_lock */ -int fi_ibv_xrc_close_srq(struct fi_ibv_srq_ep *srq_ep) +int vrb_xrc_close_srq(struct vrb_srq_ep *srq_ep) { int ret; - assert(srq_ep->domain->use_xrc); + assert(srq_ep->domain->flags & VRB_USE_XRC); if (!srq_ep->xrc.cq || !srq_ep->srq) return FI_SUCCESS; @@ -1307,25 +1640,27 @@ int fi_ibv_xrc_close_srq(struct fi_ibv_srq_ep *srq_ep) VERBS_WARN(FI_LOG_EP_CTRL, "Cannot destroy SRQ rc=%d\n", ret); return -ret; } + srq_ep->xrc.cq->credits += srq_ep->xrc.max_recv_wr; srq_ep->srq = NULL; srq_ep->xrc.cq = NULL; dlist_remove(&srq_ep->xrc.srq_entry); - fi_ibv_cleanup_prepost_bufs(srq_ep); + vrb_cleanup_prepost_bufs(srq_ep); return FI_SUCCESS; } -static int fi_ibv_srq_close(fid_t fid) +static int vrb_srq_close(fid_t fid) { - struct fi_ibv_srq_ep *srq_ep = container_of(fid, struct fi_ibv_srq_ep, - ep_fid.fid); + struct vrb_srq_ep *srq_ep = container_of(fid, struct vrb_srq_ep, + ep_fid.fid); + struct vrb_cq *cq = srq_ep->xrc.cq; int ret; - if (srq_ep->domain->use_xrc) { - if (srq_ep->xrc.cq) { - fastlock_acquire(&srq_ep->xrc.cq->xrc.srq_list_lock); - ret = fi_ibv_xrc_close_srq(srq_ep); - fastlock_release(&srq_ep->xrc.cq->xrc.srq_list_lock); + if (srq_ep->domain->flags & VRB_USE_XRC) { + if (cq) { + fastlock_acquire(&cq->xrc.srq_list_lock); + ret = vrb_xrc_close_srq(srq_ep); + fastlock_release(&cq->xrc.srq_list_lock); if (ret) goto err; } @@ -1335,6 +1670,9 @@ static int fi_ibv_srq_close(fid_t fid) if (ret) goto err; } + + ofi_bufpool_destroy(srq_ep->ctx_pool); + fastlock_destroy(&srq_ep->ctx_lock); free(srq_ep); return FI_SUCCESS; @@ -1343,56 +1681,60 @@ static int 
fi_ibv_srq_close(fid_t fid) return ret; } -static struct fi_ops fi_ibv_srq_ep_ops = { +static struct fi_ops vrb_srq_ep_ops = { .size = sizeof(struct fi_ops), - .close = fi_ibv_srq_close, + .close = vrb_srq_close, .bind = fi_no_bind, .control = fi_no_control, .ops_open = fi_no_ops_open, }; -int fi_ibv_srq_context(struct fid_domain *domain, struct fi_rx_attr *attr, +int vrb_srq_context(struct fid_domain *domain, struct fi_rx_attr *attr, struct fid_ep **srq_ep_fid, void *context) { struct ibv_srq_init_attr srq_init_attr = { 0 }; - struct fi_ibv_domain *dom; - struct fi_ibv_srq_ep *srq_ep; + struct vrb_domain *dom; + struct vrb_srq_ep *srq_ep; int ret; if (!domain) return -FI_EINVAL; srq_ep = calloc(1, sizeof(*srq_ep)); - if (!srq_ep) { - ret = -FI_ENOMEM; - goto err1; - } + if (!srq_ep) + return -FI_ENOMEM; - dom = container_of(domain, struct fi_ibv_domain, + fastlock_init(&srq_ep->ctx_lock); + ret = ofi_bufpool_create(&srq_ep->ctx_pool, sizeof(struct fi_context), + 16, attr->size, 1024, OFI_BUFPOOL_NO_TRACK); + if (ret) + goto free_ep; + + dom = container_of(domain, struct vrb_domain, util_domain.domain_fid); srq_ep->ep_fid.fid.fclass = FI_CLASS_SRX_CTX; srq_ep->ep_fid.fid.context = context; - srq_ep->ep_fid.fid.ops = &fi_ibv_srq_ep_ops; - srq_ep->ep_fid.ops = &fi_ibv_srq_ep_base_ops; - srq_ep->ep_fid.cm = &fi_ibv_srq_cm_ops; - srq_ep->ep_fid.rma = &fi_ibv_srq_rma_ops; - srq_ep->ep_fid.atomic = &fi_ibv_srq_atomic_ops; + srq_ep->ep_fid.fid.ops = &vrb_srq_ep_ops; + srq_ep->ep_fid.ops = &vrb_srq_ep_base_ops; + srq_ep->ep_fid.cm = &vrb_srq_cm_ops; + srq_ep->ep_fid.rma = &vrb_srq_rma_ops; + srq_ep->ep_fid.atomic = &vrb_srq_atomic_ops; srq_ep->domain = dom; /* XRC SRQ creation is delayed until the first endpoint it is bound * to is enabled.*/ - if (dom->use_xrc) { + if (dom->flags & VRB_USE_XRC) { fastlock_init(&srq_ep->xrc.prepost_lock); slist_init(&srq_ep->xrc.prepost_list); dlist_init(&srq_ep->xrc.srq_entry); srq_ep->xrc.max_recv_wr = attr->size; srq_ep->xrc.max_sge = attr->iov_limit; - srq_ep->ep_fid.msg = &fi_ibv_xrc_srq_msg_ops; + srq_ep->ep_fid.msg = &vrb_xrc_srq_msg_ops; goto done; } - srq_ep->ep_fid.msg = &fi_ibv_srq_msg_ops; + srq_ep->ep_fid.msg = &vrb_srq_msg_ops; srq_init_attr.attr.max_wr = attr->size; srq_init_attr.attr.max_sge = attr->iov_limit; @@ -1400,49 +1742,49 @@ int fi_ibv_srq_context(struct fid_domain *domain, struct fi_rx_attr *attr, if (!srq_ep->srq) { VERBS_INFO_ERRNO(FI_LOG_DOMAIN, "ibv_create_srq", errno); ret = -errno; - goto err2; + goto free_bufs; } done: *srq_ep_fid = &srq_ep->ep_fid; - return FI_SUCCESS; -err2: - /* Only basic SRQ can take this path */ +free_bufs: + ofi_bufpool_destroy(srq_ep->ctx_pool); +free_ep: + fastlock_destroy(&srq_ep->ctx_lock); free(srq_ep); -err1: return ret; } -#define fi_ibv_atomicvalid(name, flags) \ -static int fi_ibv_msg_ep_atomic_ ## name(struct fid_ep *ep_fid, \ +#define VRB_DEF_ATOMICVALID(name, flags) \ +static int vrb_msg_ep_atomic_ ## name(struct fid_ep *ep_fid, \ enum fi_datatype datatype, \ enum fi_op op, size_t *count) \ { \ - struct fi_ibv_ep *ep = container_of(ep_fid, struct fi_ibv_ep, \ + struct vrb_ep *ep = container_of(ep_fid, struct vrb_ep, \ util_ep.ep_fid); \ struct fi_atomic_attr attr; \ int ret; \ \ - ret = fi_ibv_query_atomic(&ep->util_ep.domain->domain_fid, \ + ret = vrb_query_atomic(&ep->util_ep.domain->domain_fid, \ datatype, op, &attr, flags); \ if (!ret) \ *count = attr.count; \ return ret; \ } -fi_ibv_atomicvalid(writevalid, 0); -fi_ibv_atomicvalid(readwritevalid, FI_FETCH_ATOMIC); 
-fi_ibv_atomicvalid(compwritevalid, FI_COMPARE_ATOMIC); +VRB_DEF_ATOMICVALID(writevalid, 0) +VRB_DEF_ATOMICVALID(readwritevalid, FI_FETCH_ATOMIC) +VRB_DEF_ATOMICVALID(compwritevalid, FI_COMPARE_ATOMIC) -int fi_ibv_query_atomic(struct fid_domain *domain_fid, enum fi_datatype datatype, +int vrb_query_atomic(struct fid_domain *domain_fid, enum fi_datatype datatype, enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags) { - struct fi_ibv_domain *domain = container_of(domain_fid, - struct fi_ibv_domain, + struct vrb_domain *domain = container_of(domain_fid, + struct vrb_domain, util_domain.domain_fid); char *log_str_fetch = "fi_fetch_atomic with FI_SUM op"; char *log_str_comp = "fi_compare_atomic"; @@ -1507,18 +1849,18 @@ int fi_ibv_query_atomic(struct fid_domain *domain_fid, enum fi_datatype datatype } static ssize_t -fi_ibv_msg_ep_atomic_write(struct fid_ep *ep_fid, const void *buf, size_t count, +vrb_msg_ep_atomic_write(struct fid_ep *ep_fid, const void *buf, size_t count, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(ep, (uintptr_t)context), .opcode = IBV_WR_RDMA_WRITE, .wr.rdma.remote_addr = addr, .wr.rdma.rkey = (uint32_t)(uintptr_t)key, - .send_flags = VERBS_INJECT(ep, sizeof(uint64_t)) | + .send_flags = VERBS_INJECT(ep, sizeof(uint64_t), desc) | IBV_SEND_FENCE, }; size_t count_copy; @@ -1532,15 +1874,15 @@ fi_ibv_msg_ep_atomic_write(struct fid_ep *ep_fid, const void *buf, size_t count, count_copy = count; - ret = fi_ibv_msg_ep_atomic_writevalid(ep_fid, datatype, op, &count_copy); + ret = vrb_msg_ep_atomic_writevalid(ep_fid, datatype, op, &count_copy); if (ret) return ret; - return fi_ibv_send_buf(ep, &wr, buf, sizeof(uint64_t), desc); + return vrb_send_buf(ep, &wr, buf, sizeof(uint64_t), desc); } static ssize_t -fi_ibv_msg_ep_atomic_writev(struct fid_ep *ep, +vrb_msg_ep_atomic_writev(struct fid_ep *ep, const struct fi_ioc *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) @@ -1548,21 +1890,21 @@ fi_ibv_msg_ep_atomic_writev(struct fid_ep *ep, if (OFI_UNLIKELY(iov->count != 1)) return -FI_E2BIG; - return fi_ibv_msg_ep_atomic_write(ep, iov->addr, count, desc[0], + return vrb_msg_ep_atomic_write(ep, iov->addr, count, desc[0], dest_addr, addr, key, datatype, op, context); } static ssize_t -fi_ibv_msg_ep_atomic_writemsg(struct fid_ep *ep_fid, +vrb_msg_ep_atomic_writemsg(struct fid_ep *ep_fid, const struct fi_msg_atomic *msg, uint64_t flags) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP_FLAGS(ep, flags, (uintptr_t)msg->context), .wr.rdma.remote_addr = msg->rma_iov->addr, .wr.rdma.rkey = (uint32_t)(uintptr_t)msg->rma_iov->key, - .send_flags = VERBS_INJECT_FLAGS(ep, sizeof(uint64_t), flags) | + .send_flags = VERBS_INJECT_FLAGS(ep, sizeof(uint64_t), flags, msg->desc[0]) | IBV_SEND_FENCE, }; size_t count_copy; @@ -1576,7 +1918,7 @@ fi_ibv_msg_ep_atomic_writemsg(struct fid_ep *ep_fid, count_copy = msg->iov_count; - ret = fi_ibv_msg_ep_atomic_writevalid(ep_fid, msg->datatype, msg->op, + ret = vrb_msg_ep_atomic_writevalid(ep_fid, msg->datatype, msg->op, &count_copy); if 
(ret) return ret; @@ -1588,19 +1930,19 @@ fi_ibv_msg_ep_atomic_writemsg(struct fid_ep *ep_fid, wr.opcode = IBV_WR_RDMA_WRITE; } - return fi_ibv_send_buf(ep, &wr, msg->msg_iov->addr, sizeof(uint64_t), + return vrb_send_buf(ep, &wr, msg->msg_iov->addr, sizeof(uint64_t), msg->desc[0]); } static ssize_t -fi_ibv_msg_ep_atomic_readwrite(struct fid_ep *ep_fid, const void *buf, size_t count, +vrb_msg_ep_atomic_readwrite(struct fid_ep *ep_fid, const void *buf, size_t count, void *desc, void *result, void *result_desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(ep, (uintptr_t)context), .send_flags = IBV_SEND_FENCE, @@ -1613,7 +1955,7 @@ fi_ibv_msg_ep_atomic_readwrite(struct fid_ep *ep_fid, const void *buf, size_t co count_copy = count; - ret = fi_ibv_msg_ep_atomic_readwritevalid(ep_fid, datatype, op, + ret = vrb_msg_ep_atomic_readwritevalid(ep_fid, datatype, op, &count_copy); if (ret) return ret; @@ -1635,11 +1977,11 @@ fi_ibv_msg_ep_atomic_readwrite(struct fid_ep *ep_fid, const void *buf, size_t co return -FI_ENOSYS; } - return fi_ibv_send_buf(ep, &wr, result, sizeof(uint64_t), result_desc); + return vrb_send_buf(ep, &wr, result, sizeof(uint64_t), result_desc); } static ssize_t -fi_ibv_msg_ep_atomic_readwritev(struct fid_ep *ep, const struct fi_ioc *iov, +vrb_msg_ep_atomic_readwritev(struct fid_ep *ep, const struct fi_ioc *iov, void **desc, size_t count, struct fi_ioc *resultv, void **result_desc, size_t result_count, fi_addr_t dest_addr, uint64_t addr, @@ -1649,19 +1991,19 @@ fi_ibv_msg_ep_atomic_readwritev(struct fid_ep *ep, const struct fi_ioc *iov, if (OFI_UNLIKELY(iov->count != 1)) return -FI_E2BIG; - return fi_ibv_msg_ep_atomic_readwrite(ep, iov->addr, count, + return vrb_msg_ep_atomic_readwrite(ep, iov->addr, count, desc[0], resultv->addr, result_desc[0], dest_addr, addr, key, datatype, op, context); } static ssize_t -fi_ibv_msg_ep_atomic_readwritemsg(struct fid_ep *ep_fid, +vrb_msg_ep_atomic_readwritemsg(struct fid_ep *ep_fid, const struct fi_msg_atomic *msg, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP_FLAGS(ep, flags, (uintptr_t)msg->context), .send_flags = IBV_SEND_FENCE, @@ -1674,7 +2016,7 @@ fi_ibv_msg_ep_atomic_readwritemsg(struct fid_ep *ep_fid, count_copy = msg->iov_count; - ret = fi_ibv_msg_ep_atomic_readwritevalid(ep_fid, msg->datatype, msg->op, + ret = vrb_msg_ep_atomic_readwritevalid(ep_fid, msg->datatype, msg->op, &count_copy); if (ret) return ret; @@ -1699,12 +2041,12 @@ fi_ibv_msg_ep_atomic_readwritemsg(struct fid_ep *ep_fid, if (flags & FI_REMOTE_CQ_DATA) wr.imm_data = htonl((uint32_t) msg->data); - return fi_ibv_send_buf(ep, &wr, resultv->addr, + return vrb_send_buf(ep, &wr, resultv->addr, sizeof(uint64_t), result_desc[0]); } static ssize_t -fi_ibv_msg_ep_atomic_compwrite(struct fid_ep *ep_fid, const void *buf, size_t count, +vrb_msg_ep_atomic_compwrite(struct fid_ep *ep_fid, const void *buf, size_t count, void *desc, const void *compare, void *compare_desc, void *result, void *result_desc, @@ -1712,8 +2054,8 @@ fi_ibv_msg_ep_atomic_compwrite(struct fid_ep *ep_fid, 
const void *buf, size_t co enum fi_datatype datatype, enum fi_op op, void *context) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(ep, (uintptr_t)context), .opcode = IBV_WR_ATOMIC_CMP_AND_SWP, @@ -1731,15 +2073,15 @@ fi_ibv_msg_ep_atomic_compwrite(struct fid_ep *ep_fid, const void *buf, size_t co count_copy = count; - ret = fi_ibv_msg_ep_atomic_compwritevalid(ep_fid, datatype, op, &count_copy); + ret = vrb_msg_ep_atomic_compwritevalid(ep_fid, datatype, op, &count_copy); if (ret) return ret; - return fi_ibv_send_buf(ep, &wr, result, sizeof(uint64_t), result_desc); + return vrb_send_buf(ep, &wr, result, sizeof(uint64_t), result_desc); } static ssize_t -fi_ibv_msg_ep_atomic_compwritev(struct fid_ep *ep, const struct fi_ioc *iov, +vrb_msg_ep_atomic_compwritev(struct fid_ep *ep, const struct fi_ioc *iov, void **desc, size_t count, const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, @@ -1752,14 +2094,14 @@ fi_ibv_msg_ep_atomic_compwritev(struct fid_ep *ep, const struct fi_ioc *iov, if (OFI_UNLIKELY(iov->count != 1)) return -FI_E2BIG; - return fi_ibv_msg_ep_atomic_compwrite(ep, iov->addr, count, desc[0], + return vrb_msg_ep_atomic_compwrite(ep, iov->addr, count, desc[0], comparev->addr, compare_desc[0], resultv->addr, result_desc[0], dest_addr, addr, key, datatype, op, context); } static ssize_t -fi_ibv_msg_ep_atomic_compwritemsg(struct fid_ep *ep_fid, +vrb_msg_ep_atomic_compwritemsg(struct fid_ep *ep_fid, const struct fi_msg_atomic *msg, const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, @@ -1767,8 +2109,8 @@ fi_ibv_msg_ep_atomic_compwritemsg(struct fid_ep *ep_fid, void **result_desc, size_t result_count, uint64_t flags) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP_FLAGS(ep, flags, (uintptr_t)msg->context), .opcode = IBV_WR_ATOMIC_CMP_AND_SWP, @@ -1786,7 +2128,7 @@ fi_ibv_msg_ep_atomic_compwritemsg(struct fid_ep *ep_fid, count_copy = msg->iov_count; - ret = fi_ibv_msg_ep_atomic_compwritevalid(ep_fid, msg->datatype, msg->op, + ret = vrb_msg_ep_atomic_compwritevalid(ep_fid, msg->datatype, msg->op, &count_copy); if (ret) return ret; @@ -1794,41 +2136,41 @@ fi_ibv_msg_ep_atomic_compwritemsg(struct fid_ep *ep_fid, if (flags & FI_REMOTE_CQ_DATA) wr.imm_data = htonl((uint32_t) msg->data); - return fi_ibv_send_buf(ep, &wr, resultv->addr, sizeof(uint64_t), + return vrb_send_buf(ep, &wr, resultv->addr, sizeof(uint64_t), result_desc[0]); } -struct fi_ops_atomic fi_ibv_msg_ep_atomic_ops = { +struct fi_ops_atomic vrb_msg_ep_atomic_ops = { .size = sizeof(struct fi_ops_atomic), - .write = fi_ibv_msg_ep_atomic_write, - .writev = fi_ibv_msg_ep_atomic_writev, - .writemsg = fi_ibv_msg_ep_atomic_writemsg, + .write = vrb_msg_ep_atomic_write, + .writev = vrb_msg_ep_atomic_writev, + .writemsg = vrb_msg_ep_atomic_writemsg, .inject = fi_no_atomic_inject, - .readwrite = fi_ibv_msg_ep_atomic_readwrite, - .readwritev = fi_ibv_msg_ep_atomic_readwritev, - .readwritemsg = fi_ibv_msg_ep_atomic_readwritemsg, - .compwrite = fi_ibv_msg_ep_atomic_compwrite, - .compwritev = fi_ibv_msg_ep_atomic_compwritev, - .compwritemsg = fi_ibv_msg_ep_atomic_compwritemsg, - .writevalid = fi_ibv_msg_ep_atomic_writevalid, - .readwritevalid = fi_ibv_msg_ep_atomic_readwritevalid, - 
.compwritevalid = fi_ibv_msg_ep_atomic_compwritevalid + .readwrite = vrb_msg_ep_atomic_readwrite, + .readwritev = vrb_msg_ep_atomic_readwritev, + .readwritemsg = vrb_msg_ep_atomic_readwritemsg, + .compwrite = vrb_msg_ep_atomic_compwrite, + .compwritev = vrb_msg_ep_atomic_compwritev, + .compwritemsg = vrb_msg_ep_atomic_compwritemsg, + .writevalid = vrb_msg_ep_atomic_writevalid, + .readwritevalid = vrb_msg_ep_atomic_readwritevalid, + .compwritevalid = vrb_msg_ep_atomic_compwritevalid }; static ssize_t -fi_ibv_msg_xrc_ep_atomic_write(struct fid_ep *ep_fid, const void *buf, +vrb_msg_xrc_ep_atomic_write(struct fid_ep *ep_fid, const void *buf, size_t count, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(&ep->base_ep, (uintptr_t)context), .opcode = IBV_WR_RDMA_WRITE, .wr.rdma.remote_addr = addr, .wr.rdma.rkey = (uint32_t)(uintptr_t)key, - .send_flags = VERBS_INJECT(&ep->base_ep, sizeof(uint64_t)) | + .send_flags = VERBS_INJECT(&ep->base_ep, sizeof(uint64_t), desc) | IBV_SEND_FENCE, }; size_t count_copy; @@ -1840,22 +2182,22 @@ fi_ibv_msg_xrc_ep_atomic_write(struct fid_ep *ep_fid, const void *buf, if (OFI_UNLIKELY(op != FI_ATOMIC_WRITE)) return -FI_ENOSYS; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); count_copy = count; - ret = fi_ibv_msg_ep_atomic_writevalid(ep_fid, datatype, op, &count_copy); + ret = vrb_msg_ep_atomic_writevalid(ep_fid, datatype, op, &count_copy); if (ret) return ret; - return fi_ibv_send_buf(&ep->base_ep, &wr, buf, sizeof(uint64_t), desc); + return vrb_send_buf(&ep->base_ep, &wr, buf, sizeof(uint64_t), desc); } static ssize_t -fi_ibv_msg_xrc_ep_atomic_writemsg(struct fid_ep *ep_fid, +vrb_msg_xrc_ep_atomic_writemsg(struct fid_ep *ep_fid, const struct fi_msg_atomic *msg, uint64_t flags) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP_FLAGS(&ep->base_ep, flags, @@ -1863,7 +2205,7 @@ fi_ibv_msg_xrc_ep_atomic_writemsg(struct fid_ep *ep_fid, .wr.rdma.remote_addr = msg->rma_iov->addr, .wr.rdma.rkey = (uint32_t)(uintptr_t)msg->rma_iov->key, .send_flags = VERBS_INJECT_FLAGS(&ep->base_ep, - sizeof(uint64_t), flags) | IBV_SEND_FENCE, + sizeof(uint64_t), flags, msg->desc[0]) | IBV_SEND_FENCE, }; size_t count_copy; int ret; @@ -1874,10 +2216,10 @@ fi_ibv_msg_xrc_ep_atomic_writemsg(struct fid_ep *ep_fid, if (OFI_UNLIKELY(msg->op != FI_ATOMIC_WRITE)) return -FI_ENOSYS; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); count_copy = msg->iov_count; - ret = fi_ibv_msg_ep_atomic_writevalid(ep_fid, msg->datatype, msg->op, + ret = vrb_msg_ep_atomic_writevalid(ep_fid, msg->datatype, msg->op, &count_copy); if (ret) return ret; @@ -1889,17 +2231,17 @@ fi_ibv_msg_xrc_ep_atomic_writemsg(struct fid_ep *ep_fid, wr.opcode = IBV_WR_RDMA_WRITE; } - return fi_ibv_send_buf(&ep->base_ep, &wr, msg->msg_iov->addr, + return vrb_send_buf(&ep->base_ep, &wr, msg->msg_iov->addr, sizeof(uint64_t), msg->desc[0]); } static ssize_t -fi_ibv_msg_xrc_ep_atomic_readwrite(struct fid_ep *ep_fid, const void *buf, +vrb_msg_xrc_ep_atomic_readwrite(struct fid_ep *ep_fid, const void *buf, size_t count, void *desc, 
void *result, void *result_desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, enum fi_datatype datatype, enum fi_op op, void *context) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(&ep->base_ep, (uintptr_t)context), @@ -1911,10 +2253,10 @@ fi_ibv_msg_xrc_ep_atomic_readwrite(struct fid_ep *ep_fid, const void *buf, if (OFI_UNLIKELY(count != 1)) return -FI_E2BIG; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); count_copy = count; - ret = fi_ibv_msg_ep_atomic_readwritevalid(ep_fid, datatype, op, + ret = vrb_msg_ep_atomic_readwritevalid(ep_fid, datatype, op, &count_copy); if (ret) return ret; @@ -1936,17 +2278,17 @@ fi_ibv_msg_xrc_ep_atomic_readwrite(struct fid_ep *ep_fid, const void *buf, return -FI_ENOSYS; } - return fi_ibv_send_buf(&ep->base_ep, &wr, result, + return vrb_send_buf(&ep->base_ep, &wr, result, sizeof(uint64_t), result_desc); } static ssize_t -fi_ibv_msg_xrc_ep_atomic_readwritemsg(struct fid_ep *ep_fid, +vrb_msg_xrc_ep_atomic_readwritemsg(struct fid_ep *ep_fid, const struct fi_msg_atomic *msg, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP_FLAGS(&ep->base_ep, flags, @@ -1959,10 +2301,10 @@ fi_ibv_msg_xrc_ep_atomic_readwritemsg(struct fid_ep *ep_fid, if (OFI_UNLIKELY(msg->iov_count != 1 || msg->msg_iov->count != 1)) return -FI_E2BIG; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); count_copy = msg->iov_count; - ret = fi_ibv_msg_ep_atomic_readwritevalid(ep_fid, msg->datatype, msg->op, + ret = vrb_msg_ep_atomic_readwritevalid(ep_fid, msg->datatype, msg->op, &count_copy); if (ret) return ret; @@ -1987,12 +2329,12 @@ fi_ibv_msg_xrc_ep_atomic_readwritemsg(struct fid_ep *ep_fid, if (flags & FI_REMOTE_CQ_DATA) wr.imm_data = htonl((uint32_t) msg->data); - return fi_ibv_send_buf(&ep->base_ep, &wr, resultv->addr, + return vrb_send_buf(&ep->base_ep, &wr, resultv->addr, sizeof(uint64_t), result_desc[0]); } static ssize_t -fi_ibv_msg_xrc_ep_atomic_compwrite(struct fid_ep *ep_fid, const void *buf, size_t count, +vrb_msg_xrc_ep_atomic_compwrite(struct fid_ep *ep_fid, const void *buf, size_t count, void *desc, const void *compare, void *compare_desc, void *result, void *result_desc, @@ -2000,7 +2342,7 @@ fi_ibv_msg_xrc_ep_atomic_compwrite(struct fid_ep *ep_fid, const void *buf, size_ enum fi_datatype datatype, enum fi_op op, void *context) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(&ep->base_ep, (uintptr_t)context), @@ -2017,19 +2359,19 @@ fi_ibv_msg_xrc_ep_atomic_compwrite(struct fid_ep *ep_fid, const void *buf, size_ if (OFI_UNLIKELY(count != 1)) return -FI_E2BIG; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); count_copy = count; - ret = fi_ibv_msg_ep_atomic_compwritevalid(ep_fid, datatype, op, &count_copy); + ret = vrb_msg_ep_atomic_compwritevalid(ep_fid, datatype, op, &count_copy); if (ret) return ret; - return fi_ibv_send_buf(&ep->base_ep, &wr, result, + return vrb_send_buf(&ep->base_ep, &wr, result, 
sizeof(uint64_t), result_desc); } static ssize_t -fi_ibv_msg_xrc_ep_atomic_compwritemsg(struct fid_ep *ep_fid, +vrb_msg_xrc_ep_atomic_compwritemsg(struct fid_ep *ep_fid, const struct fi_msg_atomic *msg, const struct fi_ioc *comparev, void **compare_desc, size_t compare_count, @@ -2037,7 +2379,7 @@ fi_ibv_msg_xrc_ep_atomic_compwritemsg(struct fid_ep *ep_fid, void **result_desc, size_t result_count, uint64_t flags) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP_FLAGS(&ep->base_ep, flags, @@ -2055,10 +2397,10 @@ fi_ibv_msg_xrc_ep_atomic_compwritemsg(struct fid_ep *ep_fid, if (OFI_UNLIKELY(msg->iov_count != 1 || msg->msg_iov->count != 1)) return -FI_E2BIG; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); count_copy = msg->iov_count; - ret = fi_ibv_msg_ep_atomic_compwritevalid(ep_fid, msg->datatype, msg->op, + ret = vrb_msg_ep_atomic_compwritevalid(ep_fid, msg->datatype, msg->op, &count_copy); if (ret) return ret; @@ -2066,23 +2408,23 @@ fi_ibv_msg_xrc_ep_atomic_compwritemsg(struct fid_ep *ep_fid, if (flags & FI_REMOTE_CQ_DATA) wr.imm_data = htonl((uint32_t) msg->data); - return fi_ibv_send_buf(&ep->base_ep, &wr, resultv->addr, + return vrb_send_buf(&ep->base_ep, &wr, resultv->addr, sizeof(uint64_t), result_desc[0]); } -struct fi_ops_atomic fi_ibv_msg_xrc_ep_atomic_ops = { +struct fi_ops_atomic vrb_msg_xrc_ep_atomic_ops = { .size = sizeof(struct fi_ops_atomic), - .write = fi_ibv_msg_xrc_ep_atomic_write, - .writev = fi_ibv_msg_ep_atomic_writev, - .writemsg = fi_ibv_msg_xrc_ep_atomic_writemsg, + .write = vrb_msg_xrc_ep_atomic_write, + .writev = vrb_msg_ep_atomic_writev, + .writemsg = vrb_msg_xrc_ep_atomic_writemsg, .inject = fi_no_atomic_inject, - .readwrite = fi_ibv_msg_xrc_ep_atomic_readwrite, - .readwritev = fi_ibv_msg_ep_atomic_readwritev, - .readwritemsg = fi_ibv_msg_xrc_ep_atomic_readwritemsg, - .compwrite = fi_ibv_msg_xrc_ep_atomic_compwrite, - .compwritev = fi_ibv_msg_ep_atomic_compwritev, - .compwritemsg = fi_ibv_msg_xrc_ep_atomic_compwritemsg, - .writevalid = fi_ibv_msg_ep_atomic_writevalid, - .readwritevalid = fi_ibv_msg_ep_atomic_readwritevalid, - .compwritevalid = fi_ibv_msg_ep_atomic_compwritevalid + .readwrite = vrb_msg_xrc_ep_atomic_readwrite, + .readwritev = vrb_msg_ep_atomic_readwritev, + .readwritemsg = vrb_msg_xrc_ep_atomic_readwritemsg, + .compwrite = vrb_msg_xrc_ep_atomic_compwrite, + .compwritev = vrb_msg_ep_atomic_compwritev, + .compwritemsg = vrb_msg_xrc_ep_atomic_compwritemsg, + .writevalid = vrb_msg_ep_atomic_writevalid, + .readwritevalid = vrb_msg_ep_atomic_readwritevalid, + .compwritevalid = vrb_msg_ep_atomic_compwritevalid }; diff --git a/prov/verbs/src/verbs_eq.c b/prov/verbs/src/verbs_eq.c index c691cfee653..33473b63124 100644 --- a/prov/verbs/src/verbs_eq.c +++ b/prov/verbs/src/verbs_eq.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. - * Copyright (c) 2018 Cray Inc. All rights reserved. + * Copyright (c) 2018-2019 Cray Inc. All rights reserved. + * Copyright (c) 2018-2019 System Fabric Works, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -36,8 +37,15 @@ #include #include "fi_verbs.h" +/* XRC SIDR connection map RBTree key */ +struct vrb_sidr_conn_key { + struct sockaddr *addr; + uint16_t pep_port; + bool recip; +}; + const struct fi_info * -fi_ibv_get_verbs_info(const struct fi_info *ilist, const char *domain_name) +vrb_get_verbs_info(const struct fi_info *ilist, const char *domain_name) { const struct fi_info *fi; @@ -50,11 +58,11 @@ fi_ibv_get_verbs_info(const struct fi_info *ilist, const char *domain_name) } static ssize_t -fi_ibv_eq_readerr(struct fid_eq *eq, struct fi_eq_err_entry *entry, +vrb_eq_readerr(struct fid_eq *eq, struct fi_eq_err_entry *entry, uint64_t flags) { - struct fi_ibv_eq *_eq = - container_of(eq, struct fi_ibv_eq, eq_fid.fid); + struct vrb_eq *_eq = + container_of(eq, struct vrb_eq, eq_fid.fid); ssize_t rd = -FI_EAGAIN; fastlock_acquire(&_eq->lock); if (!_eq->err.err) @@ -69,26 +77,25 @@ fi_ibv_eq_readerr(struct fid_eq *eq, struct fi_eq_err_entry *entry, } /* Caller must hold eq:lock */ -void fi_ibv_eq_set_xrc_conn_tag(struct fi_ibv_xrc_ep *ep) +void vrb_eq_set_xrc_conn_tag(struct vrb_xrc_ep *ep) { - struct fi_ibv_eq *eq = ep->base_ep.eq; + struct vrb_eq *eq = ep->base_ep.eq; assert(ep->conn_setup); assert(ep->conn_setup->conn_tag == VERBS_CONN_TAG_INVALID); ep->conn_setup->conn_tag = (uint32_t)ofi_idx2key(&eq->xrc.conn_key_idx, ofi_idx_insert(eq->xrc.conn_key_map, ep)); - ep->conn_setup->created_conn_tag = true; } /* Caller must hold eq:lock */ -void fi_ibv_eq_clear_xrc_conn_tag(struct fi_ibv_xrc_ep *ep) +void vrb_eq_clear_xrc_conn_tag(struct vrb_xrc_ep *ep) { - struct fi_ibv_eq *eq = ep->base_ep.eq; + struct vrb_eq *eq = ep->base_ep.eq; int index; assert(ep->conn_setup); - if (!ep->conn_setup->created_conn_tag) + if (ep->conn_setup->conn_tag == VERBS_CONN_TAG_INVALID) return; index = ofi_key2idx(&eq->xrc.conn_key_idx, @@ -102,33 +109,42 @@ void fi_ibv_eq_clear_xrc_conn_tag(struct fi_ibv_xrc_ep *ep) } /* Caller must hold eq:lock */ -struct fi_ibv_xrc_ep *fi_ibv_eq_xrc_conn_tag2ep(struct fi_ibv_eq *eq, +struct vrb_xrc_ep *vrb_eq_xrc_conn_tag2ep(struct vrb_eq *eq, uint32_t conn_tag) { - struct fi_ibv_xrc_ep *ep; + struct vrb_xrc_ep *ep; int index; index = ofi_key2idx(&eq->xrc.conn_key_idx, (uint64_t)conn_tag); ep = ofi_idx_lookup(eq->xrc.conn_key_map, index); - if (!ep || !ep->conn_setup || (ep->conn_setup->conn_tag != conn_tag)) { + if (!ep || ep->magic != VERBS_XRC_EP_MAGIC) { + VERBS_WARN(FI_LOG_EP_CTRL, "XRC EP is not valid\n"); + return NULL; + } + if (!ep->conn_setup) { VERBS_WARN(FI_LOG_EP_CTRL, - "Invalid/stale XRC connection tag\n"); - return ep; + "Bad state, no connection data\n"); + return NULL; + } + if (ep->conn_setup->conn_tag != conn_tag) { + VERBS_WARN(FI_LOG_EP_CTRL, "Connection tag mismatch\n"); + return NULL; } + ofi_idx_remove(eq->xrc.conn_key_map, index); ep->conn_setup->conn_tag = VERBS_CONN_TAG_INVALID; return ep; } -static int fi_ibv_eq_set_xrc_info(struct rdma_cm_event *event, - struct fi_ibv_xrc_conn_info *info) +static int vrb_eq_set_xrc_info(struct rdma_cm_event *event, + struct vrb_xrc_conn_info *info) { - struct fi_ibv_xrc_cm_data *remote = (struct fi_ibv_xrc_cm_data *) + struct vrb_xrc_cm_data *remote = (struct vrb_xrc_cm_data *) event->param.conn.private_data; int ret; - ret = fi_ibv_verify_xrc_cm_data(remote, + ret = vrb_verify_xrc_cm_data(remote, event->param.conn.private_data_len); if (ret) return ret; @@ -136,7 +152,8 @@ static int fi_ibv_eq_set_xrc_info(struct rdma_cm_event *event, info->is_reciprocal = 
remote->reciprocal; info->conn_tag = ntohl(remote->conn_tag); info->port = ntohs(remote->port); - info->conn_data = ntohl(remote->param); + info->tgt_qpn = ntohl(remote->tgt_qpn); + info->peer_srqn = ntohl(remote->srqn); info->conn_param = event->param.conn; info->conn_param.private_data = NULL; info->conn_param.private_data_len = 0; @@ -145,12 +162,12 @@ static int fi_ibv_eq_set_xrc_info(struct rdma_cm_event *event, } static int -fi_ibv_pep_dev_domain_match(struct fi_info *hints, const char *devname) +vrb_pep_dev_domain_match(struct fi_info *hints, const char *devname) { int ret; - if ((FI_IBV_EP_PROTO(hints)) == FI_PROTO_RDMA_CM_IB_XRC) - ret = fi_ibv_cmp_xrc_domain_name(hints->domain_attr->name, + if ((VRB_EP_PROTO(hints)) == FI_PROTO_RDMA_CM_IB_XRC) + ret = vrb_cmp_xrc_domain_name(hints->domain_attr->name, devname); else ret = strcmp(hints->domain_attr->name, devname); @@ -159,16 +176,18 @@ fi_ibv_pep_dev_domain_match(struct fi_info *hints, const char *devname) } static int -fi_ibv_eq_cm_getinfo(struct rdma_cm_event *event, struct fi_info *pep_info, +vrb_eq_cm_getinfo(struct rdma_cm_event *event, struct fi_info *pep_info, struct fi_info **info) { struct fi_info *hints; - struct fi_ibv_connreq *connreq; + struct vrb_connreq *connreq; const char *devname = ibv_get_device_name(event->id->verbs->device); int ret = -FI_ENOMEM; - if (!(hints = fi_dupinfo(pep_info))) + if (!(hints = fi_dupinfo(pep_info))) { + VERBS_WARN(FI_LOG_EP_CTRL, "dupinfo failure\n"); return -FI_ENOMEM; + } /* Free src_addr info from pep to avoid addr reuse errors */ free(hints->src_addr); @@ -180,8 +199,8 @@ fi_ibv_eq_cm_getinfo(struct rdma_cm_event *event, struct fi_info *pep_info, if (!(hints->domain_attr->name = strdup(devname))) goto err1; } else { - if (fi_ibv_pep_dev_domain_match(hints, devname)) { - VERBS_WARN(FI_LOG_EQ, "Passive endpoint domain: %s does" + if (vrb_pep_dev_domain_match(hints, devname)) { + VERBS_WARN(FI_LOG_EQ, "passive endpoint domain: %s does" " not match device: %s where we got a " "connection request\n", hints->domain_attr->name, devname); @@ -195,37 +214,46 @@ fi_ibv_eq_cm_getinfo(struct rdma_cm_event *event, struct fi_info *pep_info, hints->fabric_attr->name = NULL; } - if (fi_ibv_getinfo(hints->fabric_attr->api_version, NULL, NULL, 0, - hints, info)) + ret = vrb_get_matching_info(hints->fabric_attr->api_version, hints, + info, vrb_util_prov.info, 0); + if (ret) goto err1; assert(!(*info)->dest_addr); + ofi_alter_info(*info, hints, hints->fabric_attr->api_version); + vrb_alter_info(hints, *info); + free((*info)->src_addr); - (*info)->src_addrlen = fi_ibv_sockaddr_len(rdma_get_local_addr(event->id)); - if (!((*info)->src_addr = malloc((*info)->src_addrlen))) + (*info)->src_addrlen = ofi_sizeofaddr(rdma_get_local_addr(event->id)); + (*info)->src_addr = malloc((*info)->src_addrlen); + if (!((*info)->src_addr)) goto err2; memcpy((*info)->src_addr, rdma_get_local_addr(event->id), (*info)->src_addrlen); - (*info)->dest_addrlen = fi_ibv_sockaddr_len(rdma_get_peer_addr(event->id)); - if (!((*info)->dest_addr = malloc((*info)->dest_addrlen))) + (*info)->dest_addrlen = ofi_sizeofaddr(rdma_get_peer_addr(event->id)); + (*info)->dest_addr = malloc((*info)->dest_addrlen); + if (!((*info)->dest_addr)) goto err2; memcpy((*info)->dest_addr, rdma_get_peer_addr(event->id), (*info)->dest_addrlen); - ofi_straddr_dbg(&fi_ibv_prov, FI_LOG_EQ, "src", (*info)->src_addr); - ofi_straddr_dbg(&fi_ibv_prov, FI_LOG_EQ, "dst", (*info)->dest_addr); + ofi_straddr_dbg(&vrb_prov, FI_LOG_EQ, "src", (*info)->src_addr); + 
ofi_straddr_dbg(&vrb_prov, FI_LOG_EQ, "dst", (*info)->dest_addr); connreq = calloc(1, sizeof *connreq); - if (!connreq) + if (!connreq) { + VERBS_WARN(FI_LOG_EP_CTRL, + "Unable to allocate connreq memory\n"); goto err2; + } connreq->handle.fclass = FI_CLASS_CONNREQ; connreq->id = event->id; - if (fi_ibv_is_xrc(*info)) { + if (vrb_is_xrc_info(*info)) { connreq->is_xrc = 1; - ret = fi_ibv_eq_set_xrc_info(event, &connreq->xrc); + ret = vrb_eq_set_xrc_info(event, &connreq->xrc); if (ret) goto err3; } @@ -243,11 +271,11 @@ fi_ibv_eq_cm_getinfo(struct rdma_cm_event *event, struct fi_info *pep_info, return ret; } -static inline int fi_ibv_eq_copy_event_data(struct fi_eq_cm_entry *entry, +static inline int vrb_eq_copy_event_data(struct fi_eq_cm_entry *entry, size_t max_dest_len, const void *priv_data, size_t priv_datalen) { - const struct fi_ibv_cm_data_hdr *cm_hdr = priv_data; + const struct vrb_cm_data_hdr *cm_hdr = priv_data; size_t datalen = MIN(max_dest_len - sizeof(*entry), cm_hdr->size); if (datalen) @@ -256,10 +284,21 @@ static inline int fi_ibv_eq_copy_event_data(struct fi_eq_cm_entry *entry, return datalen; } -static void fi_ibv_eq_skip_xrc_cm_data(const void **priv_data, +static void vrb_eq_skip_rdma_cm_hdr(const void **priv_data, + size_t *priv_data_len) +{ + size_t rdma_cm_hdr_len = sizeof(struct vrb_rdma_cm_hdr); + + if (*priv_data_len > rdma_cm_hdr_len) { + *priv_data = (void*)((char *)*priv_data + rdma_cm_hdr_len); + *priv_data_len -= rdma_cm_hdr_len; + } +} + +static void vrb_eq_skip_xrc_cm_data(const void **priv_data, size_t *priv_data_len) { - const struct fi_ibv_xrc_cm_data *cm_data = *priv_data; + const struct vrb_xrc_cm_data *cm_data = *priv_data; if (*priv_data_len > sizeof(*cm_data)) { *priv_data = (cm_data + 1); @@ -267,37 +306,221 @@ static void fi_ibv_eq_skip_xrc_cm_data(const void **priv_data, } } +static inline void vrb_set_sidr_conn_key(struct sockaddr *addr, + uint16_t pep_port, bool recip, + struct vrb_sidr_conn_key *key) +{ + key->addr = addr; + key->pep_port = pep_port; + key->recip = recip; +} + +static int vrb_sidr_conn_compare(struct ofi_rbmap *map, + void *key, void *data) +{ + struct vrb_sidr_conn_key *_key = key; + struct vrb_xrc_ep *ep = data; + int ret; + + assert(_key->addr->sa_family == + ofi_sa_family(ep->base_ep.info_attr.dest_addr)); + + /* The interface address and the passive endpoint port define + * the unique connection to a peer */ + switch(_key->addr->sa_family) { + case AF_INET: + ret = memcmp(&ofi_sin_addr(_key->addr), + &ofi_sin_addr(ep->base_ep.info_attr.dest_addr), + sizeof(ofi_sin_addr(_key->addr))); + break; + case AF_INET6: + ret = memcmp(&ofi_sin6_addr(_key->addr), + &ofi_sin6_addr(ep->base_ep.info_attr.dest_addr), + sizeof(ofi_sin6_addr(_key->addr))); + break; + default: + VERBS_WARN(FI_LOG_EP_CTRL, "Unsupported address format\n"); + assert(0); + ret = -FI_EINVAL; + } + + if (ret) + return ret; + + if (_key->pep_port != ep->remote_pep_port) + return _key->pep_port < ep->remote_pep_port ? -1 : 1; + + return _key->recip < ep->recip_accept ?
+ -1 : _key->recip > ep->recip_accept; +} + +/* Caller must hold eq:lock */ +struct vrb_xrc_ep *vrb_eq_get_sidr_conn(struct vrb_eq *eq, + struct sockaddr *peer, + uint16_t pep_port, bool recip) +{ + struct ofi_rbnode *node; + struct vrb_sidr_conn_key key; + + vrb_set_sidr_conn_key(peer, pep_port, recip, &key); + node = ofi_rbmap_find(&eq->xrc.sidr_conn_rbmap, &key); + if (OFI_LIKELY(!node)) + return NULL; + + return (struct vrb_xrc_ep *) node->data; +} + +/* Caller must hold eq:lock */ +int vrb_eq_add_sidr_conn(struct vrb_xrc_ep *ep, + void *param_data, size_t param_len) +{ + int ret; + struct vrb_sidr_conn_key key; + + assert(!ep->accept_param_data); + assert(param_len); + assert(ep->tgt_id && ep->tgt_id->ps == RDMA_PS_UDP); + + vrb_set_sidr_conn_key(ep->base_ep.info_attr.dest_addr, + ep->remote_pep_port, ep->recip_accept, &key); + ep->accept_param_data = calloc(1, param_len); + if (!ep->accept_param_data) { + VERBS_WARN(FI_LOG_EP_CTRL, + "SIDR alloc conn param memory failure\n"); + return -FI_ENOMEM; + } + memcpy(ep->accept_param_data, param_data, param_len); + ep->accept_param_len = param_len; + + ret = ofi_rbmap_insert(&ep->base_ep.eq->xrc.sidr_conn_rbmap, + &key, (void *) ep, &ep->conn_map_node); + assert(ret != -FI_EALREADY); + if (OFI_UNLIKELY(ret)) { + VERBS_WARN(FI_LOG_EP_CTRL, + "SIDR conn map entry insert error %d\n", ret); + free(ep->accept_param_data); + ep->accept_param_data = NULL; + return ret; + } + + return FI_SUCCESS; +} + +/* Caller must hold eq:lock */ +void vrb_eq_remove_sidr_conn(struct vrb_xrc_ep *ep) +{ + assert(ep->conn_map_node); + + ofi_rbmap_delete(&ep->base_ep.eq->xrc.sidr_conn_rbmap, + ep->conn_map_node); + ep->conn_map_node = NULL; + free(ep->accept_param_data); + ep->accept_param_data = NULL; +} + static int -fi_ibv_eq_xrc_connreq_event(struct fi_ibv_eq *eq, struct fi_eq_cm_entry *entry, +vrb_eq_accept_recip_conn(struct vrb_xrc_ep *ep, + struct fi_eq_cm_entry *entry, size_t len, + uint32_t *event, struct rdma_cm_event *cma_event, + int *acked) +{ + struct vrb_xrc_cm_data cm_data; + int ret; + + assert(ep->conn_state == VRB_XRC_ORIG_CONNECTED); + + ret = vrb_accept_xrc(ep, VRB_RECIP_CONN, &cm_data, + sizeof(cm_data)); + if (ret) { + VERBS_WARN(FI_LOG_EP_CTRL, + "Reciprocal XRC Accept failed %d\n", ret); + return ret; + } + + /* SIDR based shared reciprocal connections are complete at + * this point, generate the connection established event. 
*/ + if (ep->tgt_id->ps == RDMA_PS_UDP) { + vrb_next_xrc_conn_state(ep); + vrb_ep_tgt_conn_done(ep); + entry->fid = &ep->base_ep.util_ep.ep_fid.fid; + *event = FI_CONNECTED; + len = vrb_eq_copy_event_data(entry, len, + ep->conn_setup->event_data, + ep->conn_setup->event_len); + *acked = 1; + rdma_ack_cm_event(cma_event); + vrb_free_xrc_conn_setup(ep, 1); + + return sizeof(*entry) + len; + } + + /* Event is handled internally and not passed to the application */ + return -FI_EAGAIN; +} + +static int +vrb_eq_xrc_connreq_event(struct vrb_eq *eq, struct fi_eq_cm_entry *entry, + size_t len, uint32_t *event, + struct rdma_cm_event *cma_event, int *acked, const void **priv_data, size_t *priv_datalen) { - struct fi_ibv_connreq *connreq = container_of(entry->info->handle, - struct fi_ibv_connreq, handle); - struct fi_ibv_xrc_ep *ep; - struct fi_ibv_xrc_cm_data cm_data; + struct vrb_connreq *connreq = container_of(entry->info->handle, + struct vrb_connreq, handle); + struct vrb_xrc_ep *ep; int ret; + /* + * If this is a retransmitted SIDR request for a previously accepted + * connection then the shared SIDR response message was lost and must + * be retransmitted. Note that a lost SIDR reject response message will + * be rejected again by the application. + */ + assert(entry->info->dest_addr); + if (cma_event->id->ps == RDMA_PS_UDP) { + ep = vrb_eq_get_sidr_conn(eq, entry->info->dest_addr, + connreq->xrc.port, + connreq->xrc.is_reciprocal); + if (ep) { + VERBS_DBG(FI_LOG_EP_CTRL, + "SIDR %s request retry received\n", + connreq->xrc.is_reciprocal ? + "reciprocal" : "original"); + ret = vrb_resend_shared_accept_xrc(ep, connreq, + cma_event->id); + if (ret) + VERBS_WARN(FI_LOG_EP_CTRL, + "SIDR accept resend failure %d\n", + -errno); + rdma_destroy_id(cma_event->id); + return -FI_EAGAIN; + } + } + if (!connreq->xrc.is_reciprocal) { - fi_ibv_eq_skip_xrc_cm_data(priv_data, priv_datalen); + vrb_eq_skip_xrc_cm_data(priv_data, priv_datalen); return FI_SUCCESS; } - fastlock_acquire(&eq->lock); /* * Reciprocal connections are initiated and handled internally by * the provider, get the endpoint that issued the original connection * request. 
*/ - ep = fi_ibv_eq_xrc_conn_tag2ep(eq, connreq->xrc.conn_tag); + ep = vrb_eq_xrc_conn_tag2ep(eq, connreq->xrc.conn_tag); if (!ep) { VERBS_WARN(FI_LOG_EP_CTRL, - "Reciprocal XRC connection tag not found\n"); - goto done; + "Reciprocal XRC connection tag 0x%x not found\n", + connreq->xrc.conn_tag); + return -FI_EAGAIN; } + ep->recip_req_received = 1; + + assert(ep->conn_state == VRB_XRC_ORIG_CONNECTED || + ep->conn_state == VRB_XRC_ORIG_CONNECTING); ep->tgt_id = connreq->id; ep->tgt_id->context = &ep->base_ep.util_ep.ep_fid.fid; - ep->base_ep.info->handle = entry->info->handle; + ep->base_ep.info_attr.handle = entry->info->handle; ret = rdma_migrate_id(ep->tgt_id, ep->base_ep.eq->channel); if (ret) { @@ -305,41 +528,46 @@ fi_ibv_eq_xrc_connreq_event(struct fi_ibv_eq *eq, struct fi_eq_cm_entry *entry, goto send_reject; } - ret = fi_ibv_accept_xrc(ep, FI_IBV_RECIP_CONN, &cm_data, - sizeof(cm_data)); - if (ret) { - VERBS_WARN(FI_LOG_EP_CTRL, - "Reciprocal XRC Accept failed %d\n", ret); - goto send_reject; - } -done: - fastlock_release(&eq->lock); + /* If the initial connection has completed proceed with accepting + * the reciprocal; otherwise wait until it has before proceeding */ + if (ep->conn_state == VRB_XRC_ORIG_CONNECTED) + return vrb_eq_accept_recip_conn(ep, entry, len, event, + cma_event, acked); - /* Event is handled internally and not passed to the application */ return -FI_EAGAIN; send_reject: if (rdma_reject(connreq->id, *priv_data, *priv_datalen)) VERBS_WARN(FI_LOG_EP_CTRL, "rdma_reject %d\n", -errno); - fastlock_release(&eq->lock); return -FI_EAGAIN; } +static void +vrb_eq_xrc_establish(struct rdma_cm_event *cma_event) +{ + /* For newer rdma-core, active side must complete the + * connect if rdma_cm is not managing the QP */ + if (cma_event->event == RDMA_CM_EVENT_CONNECT_RESPONSE && + !cma_event->id->qp) + rdma_establish(cma_event->id); +} + static int -fi_ibv_eq_xrc_conn_event(struct fi_ibv_xrc_ep *ep, - struct rdma_cm_event *cma_event, - struct fi_eq_cm_entry *entry) +vrb_eq_xrc_conn_event(struct vrb_xrc_ep *ep, + struct rdma_cm_event *cma_event, int *acked, + struct fi_eq_cm_entry *entry, size_t len, + uint32_t *event) { - struct fi_ibv_xrc_conn_info xrc_info; - struct fi_ibv_xrc_cm_data cm_data; + struct vrb_xrc_conn_info xrc_info; + struct vrb_xrc_cm_data cm_data; const void *priv_data = cma_event->param.conn.private_data; size_t priv_datalen = cma_event->param.conn.private_data_len; int ret; - VERBS_DBG(FI_LOG_FABRIC, "EP %p INITIAL CONNECTION DONE state %d\n", - ep, ep->conn_state); - fi_ibv_next_xrc_conn_state(ep); + VERBS_DBG(FI_LOG_EP_CTRL, "EP %p INITIAL CONNECTION DONE state %d, ps %d\n", + ep, ep->conn_state, cma_event->id->ps); + vrb_next_xrc_conn_state(ep); /* * Original application initiated connect is done, if the passive @@ -347,23 +575,31 @@ fi_ibv_eq_xrc_conn_event(struct fi_ibv_xrc_ep *ep, * to create bidirectional connectivity. 
*/ if (priv_data) { - ret = fi_ibv_eq_set_xrc_info(cma_event, &xrc_info); + ret = vrb_eq_set_xrc_info(cma_event, &xrc_info); if (ret) { - fi_ibv_prev_xrc_conn_state(ep); + vrb_prev_xrc_conn_state(ep); rdma_disconnect(ep->base_ep.id); goto err; } - ep->peer_srqn = xrc_info.conn_data; - fi_ibv_ep_ini_conn_done(ep, xrc_info.conn_data, - xrc_info.conn_param.qp_num); - fi_ibv_eq_skip_xrc_cm_data(&priv_data, &priv_datalen); - fi_ibv_save_priv_data(ep, priv_data, priv_datalen); + ep->peer_srqn = xrc_info.peer_srqn; + vrb_eq_skip_xrc_cm_data(&priv_data, &priv_datalen); + vrb_save_priv_data(ep, priv_data, priv_datalen); + vrb_ep_ini_conn_done(ep, xrc_info.conn_param.qp_num); + vrb_eq_xrc_establish(cma_event); + + /* If we have received the reciprocal connect request, + * process it now */ + if (ep->recip_req_received) + return vrb_eq_accept_recip_conn(ep, entry, + len, event, + cma_event, acked); } else { - fi_ibv_ep_tgt_conn_done(ep); - ret = fi_ibv_connect_xrc(ep, NULL, FI_IBV_RECIP_CONN, &cm_data, + vrb_ep_tgt_conn_done(ep); + ret = vrb_connect_xrc(ep, NULL, VRB_RECIP_CONN, &cm_data, sizeof(cm_data)); if (ret) { - fi_ibv_prev_xrc_conn_state(ep); + vrb_prev_xrc_conn_state(ep); + ep->tgt_id->qp = NULL; rdma_disconnect(ep->tgt_id); goto err; } @@ -375,22 +611,22 @@ fi_ibv_eq_xrc_conn_event(struct fi_ibv_xrc_ep *ep, } static size_t -fi_ibv_eq_xrc_recip_conn_event(struct fi_ibv_eq *eq, - struct fi_ibv_xrc_ep *ep, +vrb_eq_xrc_recip_conn_event(struct vrb_eq *eq, + struct vrb_xrc_ep *ep, struct rdma_cm_event *cma_event, struct fi_eq_cm_entry *entry, size_t len) { fid_t fid = cma_event->id->context; - struct fi_ibv_xrc_conn_info xrc_info; + struct vrb_xrc_conn_info xrc_info; int ret; - fi_ibv_next_xrc_conn_state(ep); - VERBS_DBG(FI_LOG_FABRIC, "EP %p RECIPROCAL CONNECTION DONE state %d\n", + vrb_next_xrc_conn_state(ep); + VERBS_DBG(FI_LOG_EP_CTRL, "EP %p RECIPROCAL CONNECTION DONE state %d\n", ep, ep->conn_state); /* If this is the reciprocal active side notification */ if (cma_event->param.conn.private_data) { - ret = fi_ibv_eq_set_xrc_info(cma_event, &xrc_info); + ret = vrb_eq_set_xrc_info(cma_event, &xrc_info); if (ret) { VERBS_WARN(FI_LOG_EP_CTRL, "Reciprocal connection protocol mismatch\n"); @@ -400,18 +636,18 @@ fi_ibv_eq_xrc_recip_conn_event(struct fi_ibv_eq *eq, return -FI_EAVAIL; } - ep->peer_srqn = xrc_info.conn_data; - fi_ibv_ep_ini_conn_done(ep, xrc_info.conn_data, - xrc_info.conn_param.qp_num); + ep->peer_srqn = xrc_info.peer_srqn; + vrb_ep_ini_conn_done(ep, xrc_info.conn_param.qp_num); + vrb_eq_xrc_establish(cma_event); } else { - fi_ibv_ep_tgt_conn_done(ep); + vrb_ep_tgt_conn_done(ep); } /* The internal reciprocal XRC connection has completed. Return the * CONNECTED event application data associated with the original * connection. 
*/ entry->fid = fid; - len = fi_ibv_eq_copy_event_data(entry, len, + len = vrb_eq_copy_event_data(entry, len, ep->conn_setup->event_data, ep->conn_setup->event_len); entry->info = NULL; @@ -420,32 +656,40 @@ fi_ibv_eq_xrc_recip_conn_event(struct fi_ibv_eq *eq, /* Caller must hold eq:lock */ static int -fi_ibv_eq_xrc_rej_event(struct fi_ibv_eq *eq, struct rdma_cm_event *cma_event) +vrb_eq_xrc_rej_event(struct vrb_eq *eq, struct rdma_cm_event *cma_event) { - struct fi_ibv_xrc_ep *ep; + struct vrb_xrc_ep *ep; fid_t fid = cma_event->id->context; - struct fi_ibv_xrc_conn_info xrc_info; - enum fi_ibv_xrc_ep_conn_state state; + struct vrb_xrc_conn_info xrc_info; + enum vrb_xrc_ep_conn_state state; - ep = container_of(fid, struct fi_ibv_xrc_ep, base_ep.util_ep.ep_fid); - state = ep->conn_state; + ep = container_of(fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); + if (ep->magic != VERBS_XRC_EP_MAGIC) { + VERBS_WARN(FI_LOG_EP_CTRL, + "CM ID context not valid\n"); + return -FI_EAGAIN; + } - if (ep->base_ep.id != cma_event->id || state == FI_IBV_XRC_CONNECTED) { + state = ep->conn_state; + if (ep->base_ep.id != cma_event->id || + (state != VRB_XRC_ORIG_CONNECTING && + state != VRB_XRC_RECIP_CONNECTING)) { VERBS_WARN(FI_LOG_EP_CTRL, - "Stale CM Reject %d received\n", cma_event->status); + "Stale/invalid CM reject %d received\n", cma_event->status); return -FI_EAGAIN; } /* If reject comes from remote provider peer */ - if (cma_event->status == FI_IBV_CM_REJ_CONSUMER_DEFINED) { + if (cma_event->status == VRB_CM_REJ_CONSUMER_DEFINED || + cma_event->status == VRB_CM_REJ_SIDR_CONSUMER_DEFINED) { if (cma_event->param.conn.private_data_len && - fi_ibv_eq_set_xrc_info(cma_event, &xrc_info)) { + vrb_eq_set_xrc_info(cma_event, &xrc_info)) { VERBS_WARN(FI_LOG_EP_CTRL, "CM REJ private data not valid\n"); return -FI_EAGAIN; } - fi_ibv_ep_ini_conn_rejected(ep); + vrb_ep_ini_conn_rejected(ep); return FI_SUCCESS; } @@ -454,186 +698,293 @@ fi_ibv_eq_xrc_rej_event(struct fi_ibv_eq *eq, struct rdma_cm_event *cma_event) if (cma_event->param.conn.private_data_len) VERBS_WARN(FI_LOG_EP_CTRL, "Unexpected CM Reject priv_data\n"); - fi_ibv_ep_ini_conn_rejected(ep); + vrb_ep_ini_conn_rejected(ep); + + return state == VRB_XRC_ORIG_CONNECTING ? 
FI_SUCCESS : -FI_EAGAIN; +} + +/* Caller must hold eq:lock */ +static inline int +vrb_eq_xrc_connect_retry(struct vrb_xrc_ep *ep, + struct rdma_cm_event *cma_event, int *acked) +{ + if (ep->base_ep.info_attr.src_addr) + ofi_straddr_dbg(&vrb_prov, FI_LOG_EP_CTRL, + "Connect retry src ", + ep->base_ep.info_attr.src_addr); + if (ep->base_ep.info_attr.dest_addr) + ofi_straddr_dbg(&vrb_prov, FI_LOG_EP_CTRL, + "Connect retry dest ", + ep->base_ep.info_attr.dest_addr); + + *acked = 1; + rdma_ack_cm_event(cma_event); + rdma_destroy_id(ep->base_ep.id); + ep->base_ep.id = NULL; + vrb_eq_clear_xrc_conn_tag(ep); + ep->conn_setup->retry_count++; + return vrb_connect_xrc(ep, NULL, ep->conn_setup->pending_recip, + ep->conn_setup->pending_param, + ep->conn_setup->pending_paramlen); +} + +/* Caller must hold eq:lock */ +static inline int +vrb_eq_xrc_cm_err_event(struct vrb_eq *eq, + struct rdma_cm_event *cma_event, int *acked) +{ + struct vrb_xrc_ep *ep; + fid_t fid = cma_event->id->context; + int ret; + + ep = container_of(fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); + if (ep->magic != VERBS_XRC_EP_MAGIC) { + VERBS_WARN(FI_LOG_EP_CTRL, "CM ID context invalid\n"); + return -FI_EAGAIN; + } + + /* Connect errors can be reported on active or passive side, all other + * errors considered are reported on the active side only */ + if ((ep->base_ep.id != cma_event->id) && + (cma_event->event == RDMA_CM_EVENT_CONNECT_ERROR && + ep->tgt_id != cma_event->id)) { + VERBS_WARN(FI_LOG_EP_CTRL, "CM error not valid for EP\n"); + return -FI_EAGAIN; + } + + if (ep->base_ep.id == cma_event->id) { + vrb_put_shared_ini_conn(ep); - return state == FI_IBV_XRC_ORIG_CONNECTING ? FI_SUCCESS : -FI_EAGAIN; + /* Active side connect errors are retried */ + if (ep->conn_setup && (ep->conn_setup->retry_count < + VRB_MAX_XRC_CONNECT_RETRIES)) { + ret = vrb_eq_xrc_connect_retry(ep, cma_event, acked); + if (!ret) + return -FI_EAGAIN; + } + } + + VERBS_WARN(FI_LOG_EP_CTRL, "CM error event %s, status %d\n", + rdma_event_str(cma_event->event), cma_event->status); + if (ep->base_ep.info_attr.src_addr) + ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, + "Src ", ep->base_ep.info_attr.src_addr); + if (ep->base_ep.info_attr.dest_addr) + ofi_straddr_log(&vrb_prov, FI_LOG_WARN, FI_LOG_EP_CTRL, + "Dest ", ep->base_ep.info_attr.dest_addr); + ep->conn_state = VRB_XRC_ERROR; + return FI_SUCCESS; } /* Caller must hold eq:lock */ static inline int -fi_ibv_eq_xrc_connected_event(struct fi_ibv_eq *eq, - struct rdma_cm_event *cma_event, +vrb_eq_xrc_connected_event(struct vrb_eq *eq, + struct rdma_cm_event *cma_event, int *acked, struct fi_eq_cm_entry *entry, size_t len, - int *acked) + uint32_t *event) { - struct fi_ibv_xrc_ep *ep; + struct vrb_xrc_ep *ep; fid_t fid = cma_event->id->context; int ret; - ep = container_of(fid, struct fi_ibv_xrc_ep, base_ep.util_ep.ep_fid); + ep = container_of(fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); - assert(ep->conn_state == FI_IBV_XRC_ORIG_CONNECTING || - ep->conn_state == FI_IBV_XRC_RECIP_CONNECTING); + assert(ep->conn_state == VRB_XRC_ORIG_CONNECTING || + ep->conn_state == VRB_XRC_RECIP_CONNECTING); - if (ep->conn_state == FI_IBV_XRC_ORIG_CONNECTING) - return fi_ibv_eq_xrc_conn_event(ep, cma_event, entry); + if (ep->conn_state == VRB_XRC_ORIG_CONNECTING) + return vrb_eq_xrc_conn_event(ep, cma_event, acked, + entry, len, event); - ret = fi_ibv_eq_xrc_recip_conn_event(eq, ep, cma_event, entry, len); + ret = vrb_eq_xrc_recip_conn_event(eq, ep, cma_event, entry, len); - /* Bidirectional connection setup is 
complete, disconnect RDMA CM - * ID(s) and release shared QP reservations/hardware resources - * that were needed for shared connection setup only. */ + /* Bidirectional connection setup is complete, release RDMA CM ID + * resources. */ *acked = 1; rdma_ack_cm_event(cma_event); - fi_ibv_free_xrc_conn_setup(ep, 1); + vrb_free_xrc_conn_setup(ep, 1); return ret; } /* Caller must hold eq:lock */ static inline void -fi_ibv_eq_xrc_timewait_event(struct fi_ibv_eq *eq, +vrb_eq_xrc_timewait_event(struct vrb_eq *eq, struct rdma_cm_event *cma_event, int *acked) { fid_t fid = cma_event->id->context; - struct fi_ibv_xrc_ep *ep = container_of(fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); assert(ep->magic == VERBS_XRC_EP_MAGIC); assert(ep->conn_setup); - if (cma_event->id == ep->tgt_id && ep->conn_setup->rsvd_tgt_qpn) { + if (cma_event->id == ep->tgt_id) { *acked = 1; rdma_ack_cm_event(cma_event); - ibv_destroy_qp(ep->conn_setup->rsvd_tgt_qpn); - ep->conn_setup->rsvd_tgt_qpn = NULL; rdma_destroy_id(ep->tgt_id); ep->tgt_id = NULL; - } else if (cma_event->id == ep->base_ep.id && - ep->conn_setup->rsvd_ini_qpn) { + } else if (cma_event->id == ep->base_ep.id) { *acked = 1; rdma_ack_cm_event(cma_event); - ibv_destroy_qp(ep->conn_setup->rsvd_ini_qpn); - ep->conn_setup->rsvd_ini_qpn = NULL; rdma_destroy_id(ep->base_ep.id); ep->base_ep.id = NULL; } - if (!ep->conn_setup->rsvd_ini_qpn && !ep->conn_setup->rsvd_tgt_qpn) - fi_ibv_free_xrc_conn_setup(ep, 0); + if (!ep->base_ep.id && !ep->tgt_id) + vrb_free_xrc_conn_setup(ep, 0); } /* Caller must hold eq:lock */ static inline void -fi_ibv_eq_xrc_disconnect_event(struct fi_ibv_eq *eq, +vrb_eq_xrc_disconnect_event(struct vrb_eq *eq, struct rdma_cm_event *cma_event, int *acked) { fid_t fid = cma_event->id->context; - struct fi_ibv_xrc_ep *ep = container_of(fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); assert(ep->magic == VERBS_XRC_EP_MAGIC); - if (ep->conn_setup && cma_event->id == ep->base_ep.id && - ep->conn_setup->rsvd_ini_qpn) { + if (ep->conn_setup && cma_event->id == ep->base_ep.id) { *acked = 1; rdma_ack_cm_event(cma_event); rdma_disconnect(ep->base_ep.id); - ep->conn_setup->ini_connected = 0; } } static ssize_t -fi_ibv_eq_cm_process_event(struct fi_ibv_eq *eq, +vrb_eq_cm_process_event(struct vrb_eq *eq, struct rdma_cm_event *cma_event, uint32_t *event, - struct fi_eq_cm_entry *entry, size_t len, int *acked) + struct fi_eq_cm_entry *entry, size_t len) { - const struct fi_ibv_cm_data_hdr *cm_hdr; + const struct vrb_cm_data_hdr *cm_hdr; size_t datalen = 0; size_t priv_datalen = cma_event->param.conn.private_data_len; const void *priv_data = cma_event->param.conn.private_data; - int ret; + int ret, acked = 0; fid_t fid = cma_event->id->context; - struct fi_ibv_pep *pep = - container_of(fid, struct fi_ibv_pep, pep_fid); - struct fi_ibv_ep *ep; - - *acked = 0; + struct vrb_pep *pep = + container_of(fid, struct vrb_pep, pep_fid); + struct vrb_ep *ep; + struct vrb_xrc_ep *xrc_ep; switch (cma_event->event) { + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ep = container_of(fid, struct vrb_ep, util_ep.ep_fid); + if (rdma_connect(ep->id, &ep->conn_param)) { + ret = -errno; + FI_WARN(&vrb_prov, FI_LOG_EP_CTRL, + "rdma_connect failed: %s (%d)\n", + strerror(-ret), -ret); + if (vrb_is_xrc_ep(ep)) { + xrc_ep = container_of(fid, struct vrb_xrc_ep, + base_ep.util_ep.ep_fid); + vrb_put_shared_ini_conn(xrc_ep); + } + } else { + ret = -FI_EAGAIN; + }
goto ack; case RDMA_CM_EVENT_CONNECT_REQUEST: *event = FI_CONNREQ; - ret = fi_ibv_eq_cm_getinfo(cma_event, pep->info, &entry->info); + ret = vrb_eq_cm_getinfo(cma_event, pep->info, &entry->info); if (ret) { - fastlock_acquire(&eq->lock); + VERBS_WARN(FI_LOG_EP_CTRL, + "CM getinfo error %d\n", ret); rdma_destroy_id(cma_event->id); eq->err.err = -ret; eq->err.prov_errno = ret; goto err; } - if (fi_ibv_is_xrc(entry->info)) { - ret = fi_ibv_eq_xrc_connreq_event(eq, entry, &priv_data, - &priv_datalen); - if (ret == -FI_EAGAIN) - return ret; + if (vrb_is_xrc_info(entry->info)) { + ret = vrb_eq_xrc_connreq_event(eq, entry, len, event, + cma_event, &acked, + &priv_data, &priv_datalen); + if (ret == -FI_EAGAIN) { + fi_freeinfo(entry->info); + entry->info = NULL; + goto ack; + } + if (*event == FI_CONNECTED) + goto ack; + } else if (cma_event->id->route.addr.src_addr.sa_family == AF_IB) { + vrb_eq_skip_rdma_cm_hdr(&priv_data, &priv_datalen); } break; + case RDMA_CM_EVENT_CONNECT_RESPONSE: case RDMA_CM_EVENT_ESTABLISHED: *event = FI_CONNECTED; if (cma_event->id->qp && cma_event->id->qp->context->device->transport_type != IBV_TRANSPORT_IWARP) { - ret = fi_ibv_set_rnr_timer(cma_event->id->qp); + ret = vrb_set_rnr_timer(cma_event->id->qp); if (ret) - return ret; + goto ack; } - ep = container_of(fid, struct fi_ibv_ep, util_ep.ep_fid); - if (fi_ibv_is_xrc(ep->info)) { - fastlock_acquire(&eq->lock); - ret = fi_ibv_eq_xrc_connected_event(eq, cma_event, - entry, len, acked); - fastlock_release(&eq->lock); - return ret; + ep = container_of(fid, struct vrb_ep, util_ep.ep_fid); + if (vrb_is_xrc_ep(ep)) { + ret = vrb_eq_xrc_connected_event(eq, cma_event, + &acked, entry, len, + event); + goto ack; } entry->info = NULL; break; case RDMA_CM_EVENT_DISCONNECTED: - ep = container_of(fid, struct fi_ibv_ep, util_ep.ep_fid); - if (fi_ibv_is_xrc(ep->info)) { - fastlock_acquire(&eq->lock); - fi_ibv_eq_xrc_disconnect_event(eq, cma_event, acked); - fastlock_release(&eq->lock); - return -FI_EAGAIN; + ep = container_of(fid, struct vrb_ep, util_ep.ep_fid); + if (vrb_is_xrc_ep(ep)) { + vrb_eq_xrc_disconnect_event(eq, cma_event, &acked); + ret = -FI_EAGAIN; + goto ack; } *event = FI_SHUTDOWN; entry->info = NULL; break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: - ep = container_of(fid, struct fi_ibv_ep, util_ep.ep_fid); - if (fi_ibv_is_xrc(ep->info)) { - fastlock_acquire(&eq->lock); - fi_ibv_eq_xrc_timewait_event(eq, cma_event, acked); - fastlock_release(&eq->lock); - } - return -FI_EAGAIN; + ep = container_of(fid, struct vrb_ep, util_ep.ep_fid); + if (vrb_is_xrc_ep(ep)) + vrb_eq_xrc_timewait_event(eq, cma_event, &acked); + ret = -FI_EAGAIN; + goto ack; case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: - fastlock_acquire(&eq->lock); - eq->err.err = -cma_event->status; + ep = container_of(fid, struct vrb_ep, util_ep.ep_fid); + if (vrb_is_xrc_ep(ep)) { + /* SIDR Reject is reported as UNREACHABLE unless + * status is negative */ + if (cma_event->id->ps == RDMA_PS_UDP && + (cma_event->event == RDMA_CM_EVENT_UNREACHABLE && + cma_event->status >= 0)) + goto xrc_shared_reject; + + ret = vrb_eq_xrc_cm_err_event(eq, cma_event, &acked); + if (ret == -FI_EAGAIN) + goto ack; + + *event = FI_SHUTDOWN; + entry->info = NULL; + break; + } + eq->err.err = ETIMEDOUT; + eq->err.prov_errno = -cma_event->status; + if (eq->err.err_data) { + free(eq->err.err_data); + eq->err.err_data = NULL; + eq->err.err_data_size = 0; + } goto err; case RDMA_CM_EVENT_REJECTED: - ep = 
container_of(fid, struct fi_ibv_ep, util_ep.ep_fid); - if (fi_ibv_is_xrc(ep->info)) { - fastlock_acquire(&eq->lock); - ret = fi_ibv_eq_xrc_rej_event(eq, cma_event); - fastlock_release(&eq->lock); + ep = container_of(fid, struct vrb_ep, util_ep.ep_fid); + if (vrb_is_xrc_ep(ep)) { +xrc_shared_reject: + ret = vrb_eq_xrc_rej_event(eq, cma_event); if (ret == -FI_EAGAIN) - return ret; - fi_ibv_eq_skip_xrc_cm_data(&priv_data, &priv_datalen); + goto ack; + vrb_eq_skip_xrc_cm_data(&priv_data, &priv_datalen); } - fastlock_acquire(&eq->lock); eq->err.err = ECONNREFUSED; eq->err.prov_errno = -cma_event->status; if (eq->err.err_data) { @@ -651,31 +1002,37 @@ fi_ibv_eq_cm_process_event(struct fi_ibv_eq *eq, } goto err; case RDMA_CM_EVENT_DEVICE_REMOVAL: - fastlock_acquire(&eq->lock); eq->err.err = ENODEV; goto err; case RDMA_CM_EVENT_ADDR_CHANGE: - fastlock_acquire(&eq->lock); eq->err.err = EADDRNOTAVAIL; goto err; default: - return 0; + VERBS_WARN(FI_LOG_EP_CTRL, "unknown rdmacm event received: %d\n", + cma_event->event); + ret = -FI_EAGAIN; + goto ack; } entry->fid = fid; /* rdmacm has no way to track how much data is sent by peer */ if (priv_datalen) - datalen = fi_ibv_eq_copy_event_data(entry, len, priv_data, + datalen = vrb_eq_copy_event_data(entry, len, priv_data, priv_datalen); + if (!acked) + rdma_ack_cm_event(cma_event); return sizeof(*entry) + datalen; err: + ret = -FI_EAVAIL; eq->err.fid = fid; - fastlock_release(&eq->lock); - return -FI_EAVAIL; +ack: + if (!acked) + rdma_ack_cm_event(cma_event); + return ret; } -int fi_ibv_eq_trywait(struct fi_ibv_eq *eq) +int vrb_eq_trywait(struct vrb_eq *eq) { int ret; fastlock_acquire(&eq->lock); @@ -684,18 +1041,66 @@ int fi_ibv_eq_trywait(struct fi_ibv_eq *eq) return ret ? 0 : -FI_EAGAIN; } -ssize_t fi_ibv_eq_write_event(struct fi_ibv_eq *eq, uint32_t event, - const void *buf, size_t len) +int vrb_eq_match_event(struct dlist_entry *item, const void *arg) { - struct fi_ibv_eq_entry *entry; + struct vrb_eq_entry *entry; + const struct fid *fid = arg; + + entry = container_of(item, struct vrb_eq_entry, item); + switch (entry->event) { + case FI_CONNREQ: + case FI_CONNECTED: + case FI_SHUTDOWN: + return entry->cm_entry->fid == fid; + case FI_MR_COMPLETE: + case FI_AV_COMPLETE: + case FI_JOIN_COMPLETE: + return entry->eq_entry->fid == fid; + default: + return 0; + } +} - entry = calloc(1, sizeof(struct fi_ibv_eq_entry) + len); +/* Caller must hold eq->lock */ +void vrb_eq_remove_events(struct vrb_eq *eq, struct fid *fid) +{ + struct dlist_entry *item; + struct vrb_eq_entry *entry; + + while ((item = + dlistfd_remove_first_match(&eq->list_head, + vrb_eq_match_event, fid))) { + entry = container_of(item, struct vrb_eq_entry, item); + if (entry->event == FI_CONNREQ) + fi_freeinfo(entry->cm_entry->info); + free(entry); + } +} + +struct vrb_eq_entry * +vrb_eq_alloc_entry(uint32_t event, const void *buf, size_t len) +{ + struct vrb_eq_entry *entry; + + entry = calloc(1, sizeof(struct vrb_eq_entry) + len); if (!entry) - return -FI_ENOMEM; + return NULL; entry->event = event; entry->len = len; - memcpy(entry->eq_entry, buf, len); + memcpy(entry->data, buf, len); + + return entry; +} + +ssize_t vrb_eq_write_event(struct vrb_eq *eq, uint32_t event, + const void *buf, size_t len) +{ + struct vrb_eq_entry *entry; + + entry = vrb_eq_alloc_entry(event, buf, len); + if (!entry) + return -FI_ENOMEM; fastlock_acquire(&eq->lock); dlistfd_insert_tail(&entry->item, &eq->list_head); @@ -704,22 +1109,22 @@ ssize_t fi_ibv_eq_write_event(struct fi_ibv_eq *eq, uint32_t event, return 
len; } -static ssize_t fi_ibv_eq_write(struct fid_eq *eq_fid, uint32_t event, +static ssize_t vrb_eq_write(struct fid_eq *eq_fid, uint32_t event, const void *buf, size_t len, uint64_t flags) { - struct fi_ibv_eq *eq; + struct vrb_eq *eq; - eq = container_of(eq_fid, struct fi_ibv_eq, eq_fid.fid); + eq = container_of(eq_fid, struct vrb_eq, eq_fid.fid); if (!(eq->flags & FI_WRITE)) return -FI_EINVAL; - return fi_ibv_eq_write_event(eq, event, buf, len); + return vrb_eq_write_event(eq, event, buf, len); } -static size_t fi_ibv_eq_read_event(struct fi_ibv_eq *eq, uint32_t *event, +static size_t vrb_eq_read_event(struct vrb_eq *eq, uint32_t *event, void *buf, size_t len, uint64_t flags) { - struct fi_ibv_eq_entry *entry; + struct vrb_eq_entry *entry; ssize_t ret = 0; fastlock_acquire(&eq->lock); @@ -732,7 +1137,7 @@ static size_t fi_ibv_eq_read_event(struct fi_ibv_eq *eq, uint32_t *event, if (dlistfd_empty(&eq->list_head)) goto out; - entry = container_of(eq->list_head.list.next, struct fi_ibv_eq_entry, item); + entry = container_of(eq->list_head.list.next, struct vrb_eq_entry, item); if (entry->len > len) { ret = -FI_ETOOSMALL; goto out; @@ -740,7 +1145,7 @@ static size_t fi_ibv_eq_read_event(struct fi_ibv_eq *eq, uint32_t *event, ret = entry->len; *event = entry->event; - memcpy(buf, entry->eq_entry, entry->len); + memcpy(buf, entry->data, entry->len); if (!(flags & FI_PEEK)) { dlistfd_remove(eq->list_head.list.next, &eq->list_head); @@ -753,43 +1158,41 @@ static size_t fi_ibv_eq_read_event(struct fi_ibv_eq *eq, uint32_t *event, } static ssize_t -fi_ibv_eq_read(struct fid_eq *eq_fid, uint32_t *event, +vrb_eq_read(struct fid_eq *eq_fid, uint32_t *event, void *buf, size_t len, uint64_t flags) { - struct fi_ibv_eq *eq; + struct vrb_eq *eq; struct rdma_cm_event *cma_event; ssize_t ret = 0; - int acked; - eq = container_of(eq_fid, struct fi_ibv_eq, eq_fid.fid); + if (len < sizeof(struct fi_eq_cm_entry)) + return -FI_ETOOSMALL; - if ((ret = fi_ibv_eq_read_event(eq, event, buf, len, flags))) + eq = container_of(eq_fid, struct vrb_eq, eq_fid.fid); + + if ((ret = vrb_eq_read_event(eq, event, buf, len, flags))) return ret; if (eq->channel) { +next_event: fastlock_acquire(&eq->lock); ret = rdma_get_cm_event(eq->channel, &cma_event); - fastlock_release(&eq->lock); - - if (ret) + if (ret) { + fastlock_release(&eq->lock); return -errno; - - acked = 0; - if (len < sizeof(struct fi_eq_cm_entry)) { - ret = -FI_ETOOSMALL; - goto ack; } - ret = fi_ibv_eq_cm_process_event(eq, cma_event, event, - (struct fi_eq_cm_entry *)buf, len, &acked); - if (ret < 0) - goto ack; + ret = vrb_eq_cm_process_event(eq, cma_event, event, + (struct fi_eq_cm_entry *)buf, + len); + fastlock_release(&eq->lock); + /* If the CM event was handled internally (e.g. XRC), continue + * to process events. 
*/ + if (ret == -FI_EAGAIN) + goto next_event; if (flags & FI_PEEK) - ret = fi_ibv_eq_write_event(eq, *event, buf, ret); -ack: - if (!acked) - rdma_ack_cm_event(cma_event); + ret = vrb_eq_write_event(eq, *event, buf, ret); return ret; } @@ -798,21 +1201,21 @@ fi_ibv_eq_read(struct fid_eq *eq_fid, uint32_t *event, } static ssize_t -fi_ibv_eq_sread(struct fid_eq *eq_fid, uint32_t *event, +vrb_eq_sread(struct fid_eq *eq_fid, uint32_t *event, void *buf, size_t len, int timeout, uint64_t flags) { - struct fi_ibv_eq *eq; - struct epoll_event events[2]; + struct vrb_eq *eq; + void *contexts; ssize_t ret; - eq = container_of(eq_fid, struct fi_ibv_eq, eq_fid.fid); + eq = container_of(eq_fid, struct vrb_eq, eq_fid.fid); while (1) { - ret = fi_ibv_eq_read(eq_fid, event, buf, len, flags); + ret = vrb_eq_read(eq_fid, event, buf, len, flags); if (ret && (ret != -FI_EAGAIN)) return ret; - ret = epoll_wait(eq->epfd, events, 2, timeout); + ret = ofi_epoll_wait(eq->epollfd, &contexts, 1, timeout); if (ret == 0) return -FI_EAGAIN; else if (ret < 0) @@ -821,7 +1224,7 @@ fi_ibv_eq_sread(struct fid_eq *eq_fid, uint32_t *event, } static const char * -fi_ibv_eq_strerror(struct fid_eq *eq, int prov_errno, const void *err_data, +vrb_eq_strerror(struct fid_eq *eq, int prov_errno, const void *err_data, char *buf, size_t len) { if (buf && len) @@ -829,28 +1232,48 @@ fi_ibv_eq_strerror(struct fid_eq *eq, int prov_errno, const void *err_data, return strerror(prov_errno); } -static struct fi_ops_eq fi_ibv_eq_ops = { +static struct fi_ops_eq vrb_eq_ops = { .size = sizeof(struct fi_ops_eq), - .read = fi_ibv_eq_read, - .readerr = fi_ibv_eq_readerr, - .write = fi_ibv_eq_write, - .sread = fi_ibv_eq_sread, - .strerror = fi_ibv_eq_strerror + .read = vrb_eq_read, + .readerr = vrb_eq_readerr, + .write = vrb_eq_write, + .sread = vrb_eq_sread, + .strerror = vrb_eq_strerror }; -static int fi_ibv_eq_control(fid_t fid, int command, void *arg) +static int vrb_eq_control(fid_t fid, int command, void *arg) { - struct fi_ibv_eq *eq; - int ret = 0; + struct fi_wait_pollfd *pollfd; + struct vrb_eq *eq; + int ret; - eq = container_of(fid, struct fi_ibv_eq, eq_fid.fid); + eq = container_of(fid, struct vrb_eq, eq_fid.fid); switch (command) { case FI_GETWAIT: - if (!eq->epfd) { - ret = -FI_ENODATA; - break; +#ifndef HAVE_EPOLL + /* We expect verbs to only run on systems with epoll */ + return -FI_ENOSYS; +#else + if (eq->wait_obj == FI_WAIT_FD) { + *(int *) arg = eq->epollfd; + return 0; } - *(int *) arg = eq->epfd; + + pollfd = arg; + if (pollfd->nfds >= 1) { + pollfd->fd[0].fd = eq->epollfd; + pollfd->fd[0].events = POLLIN; + ret = 0; + } else { + ret = -FI_ETOOSMALL; + } + pollfd->change_index = 1; + pollfd->nfds = 1; +#endif + break; + case FI_GETWAITOBJ: + *(enum fi_wait_obj *) arg = eq->wait_obj; + ret = 0; break; default: ret = -FI_ENOSYS; @@ -860,30 +1283,34 @@ static int fi_ibv_eq_control(fid_t fid, int command, void *arg) return ret; } -static int fi_ibv_eq_close(fid_t fid) +static int vrb_eq_close(fid_t fid) { - struct fi_ibv_eq *eq; - struct fi_ibv_eq_entry *entry; + struct vrb_eq *eq; + struct vrb_eq_entry *entry; - eq = container_of(fid, struct fi_ibv_eq, eq_fid.fid); + eq = container_of(fid, struct vrb_eq, eq_fid.fid); /* TODO: use util code, if possible, and add ref counting */ + if (!ofi_rbmap_empty(&eq->xrc.sidr_conn_rbmap)) + VERBS_WARN(FI_LOG_EP_CTRL, "SIDR connection RBmap not empty\n"); + free(eq->err.err_data); if (eq->channel) rdma_destroy_event_channel(eq->channel); - close(eq->epfd); + ofi_epoll_close(eq->epollfd); 
while (!dlistfd_empty(&eq->list_head)) { entry = container_of(eq->list_head.list.next, - struct fi_ibv_eq_entry, item); + struct vrb_eq_entry, item); dlistfd_remove(eq->list_head.list.next, &eq->list_head); free(entry); } dlistfd_head_free(&eq->list_head); + ofi_rbmap_cleanup(&eq->xrc.sidr_conn_rbmap); ofi_idx_reset(eq->xrc.conn_key_map); free(eq->xrc.conn_key_map); fastlock_destroy(&eq->lock); @@ -892,26 +1319,25 @@ static int fi_ibv_eq_close(fid_t fid) return 0; } -static struct fi_ops fi_ibv_eq_fi_ops = { +static struct fi_ops vrb_eq_fi_ops = { .size = sizeof(struct fi_ops), - .close = fi_ibv_eq_close, + .close = vrb_eq_close, .bind = fi_no_bind, - .control = fi_ibv_eq_control, + .control = vrb_eq_control, .ops_open = fi_no_ops_open, }; -int fi_ibv_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, +int vrb_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, struct fid_eq **eq, void *context) { - struct fi_ibv_eq *_eq; - struct epoll_event event; + struct vrb_eq *_eq; int ret; _eq = calloc(1, sizeof *_eq); if (!_eq) return -ENOMEM; - _eq->fab = container_of(fabric, struct fi_ibv_fabric, + _eq->fab = container_of(fabric, struct vrb_fabric, util_fabric.fabric_fid); ofi_key_idx_init(&_eq->xrc.conn_key_idx, VERBS_CONN_TAG_INDEX_BITS); @@ -920,6 +1346,8 @@ int fi_ibv_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, ret = -ENOMEM; goto err0; } + ofi_rbmap_init(&_eq->xrc.sidr_conn_rbmap, vrb_sidr_conn_compare); + fastlock_init(&_eq->lock); ret = dlistfd_head_init(&_eq->list_head); if (ret) { @@ -927,17 +1355,12 @@ int fi_ibv_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, goto err1; } - _eq->epfd = epoll_create1(0); - if (_eq->epfd < 0) { - ret = -errno; + ret = ofi_epoll_create(&_eq->epollfd); + if (ret) goto err2; - } - - memset(&event, 0, sizeof(event)); - event.events = EPOLLIN; - if (epoll_ctl(_eq->epfd, EPOLL_CTL_ADD, - _eq->list_head.signal.fd[FI_READ_FD], &event)) { + if (ofi_epoll_add(_eq->epollfd, _eq->list_head.signal.fd[FI_READ_FD], + OFI_EPOLL_IN, NULL)) { ret = -errno; goto err3; } @@ -946,32 +1369,36 @@ int fi_ibv_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, case FI_WAIT_NONE: case FI_WAIT_UNSPEC: case FI_WAIT_FD: - _eq->channel = rdma_create_event_channel(); - if (!_eq->channel) { - ret = -errno; - goto err3; - } - - ret = fi_fd_nonblock(_eq->channel->fd); - if (ret) - goto err4; - - if (epoll_ctl(_eq->epfd, EPOLL_CTL_ADD, _eq->channel->fd, &event)) { - ret = -errno; - goto err4; - } - + _eq->wait_obj = FI_WAIT_FD; + break; + case FI_WAIT_POLLFD: + _eq->wait_obj = FI_WAIT_POLLFD; break; default: ret = -FI_ENOSYS; goto err1; } + _eq->channel = rdma_create_event_channel(); + if (!_eq->channel) { + ret = -errno; + goto err3; + } + + ret = fi_fd_nonblock(_eq->channel->fd); + if (ret) + goto err4; + + if (ofi_epoll_add(_eq->epollfd, _eq->channel->fd, OFI_EPOLL_IN, NULL)) { + ret = -errno; + goto err4; + } + _eq->flags = attr->flags; _eq->eq_fid.fid.fclass = FI_CLASS_EQ; _eq->eq_fid.fid.context = context; - _eq->eq_fid.fid.ops = &fi_ibv_eq_fi_ops; - _eq->eq_fid.ops = &fi_ibv_eq_ops; + _eq->eq_fid.fid.ops = &vrb_eq_fi_ops; + _eq->eq_fid.ops = &vrb_eq_ops; *eq = &_eq->eq_fid; return 0; @@ -979,7 +1406,7 @@ int fi_ibv_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, if (_eq->channel) rdma_destroy_event_channel(_eq->channel); err3: - close(_eq->epfd); + ofi_epoll_close(_eq->epollfd); err2: dlistfd_head_free(&_eq->list_head); err1: diff --git a/prov/verbs/src/verbs_info.c b/prov/verbs/src/verbs_info.c index 
8c283d9e568..616d3b7eded 100644 --- a/prov/verbs/src/verbs_info.c +++ b/prov/verbs/src/verbs_info.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2013-2015 Intel Corporation, Inc. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -35,6 +36,7 @@ #include #include #include +#include #include "fi_verbs.h" @@ -44,10 +46,13 @@ #define VERBS_DOMAIN_CAPS (FI_LOCAL_COMM | FI_REMOTE_COMM) -#define VERBS_MSG_CAPS (FI_MSG | FI_RMA | FI_ATOMICS | FI_READ | FI_WRITE | \ - FI_SEND | FI_RECV | FI_REMOTE_READ | FI_REMOTE_WRITE | \ - VERBS_DOMAIN_CAPS) -#define VERBS_DGRAM_CAPS (FI_MSG | FI_RECV | FI_SEND | VERBS_DOMAIN_CAPS) +#define VERBS_MSG_TX_CAPS (OFI_TX_MSG_CAPS | OFI_TX_RMA_CAPS | FI_ATOMICS) +#define VERBS_MSG_RX_CAPS (OFI_RX_MSG_CAPS | OFI_RX_RMA_CAPS | FI_ATOMICS) +#define VERBS_MSG_CAPS (VERBS_MSG_TX_CAPS | VERBS_MSG_RX_CAPS | VERBS_DOMAIN_CAPS) +#define VERBS_DGRAM_TX_CAPS (OFI_TX_MSG_CAPS) +#define VERBS_DGRAM_RX_CAPS (OFI_RX_MSG_CAPS) +#define VERBS_DGRAM_CAPS (VERBS_DGRAM_TX_CAPS | VERBS_DGRAM_RX_CAPS | \ + VERBS_DOMAIN_CAPS) #define VERBS_DGRAM_RX_MODE (FI_MSG_PREFIX) @@ -67,7 +72,7 @@ (ib_ud_addr)->lid, (ib_ud_addr)->service) const struct fi_fabric_attr verbs_fabric_attr = { - .prov_version = VERBS_PROV_VERS, + .prov_version = OFI_VERSION_DEF_PROV, }; const struct fi_domain_attr verbs_domain_attr = { @@ -99,6 +104,7 @@ const struct fi_ep_attr verbs_ep_attr = { }; const struct fi_rx_attr verbs_rx_attr = { + .caps = VERBS_MSG_RX_CAPS, .mode = VERBS_RX_MODE, .op_flags = FI_COMPLETION, .msg_order = VERBS_MSG_ORDER, @@ -107,6 +113,7 @@ const struct fi_rx_attr verbs_rx_attr = { }; const struct fi_rx_attr verbs_dgram_rx_attr = { + .caps = VERBS_DGRAM_RX_CAPS, .mode = VERBS_DGRAM_RX_MODE | VERBS_RX_MODE, .op_flags = FI_COMPLETION, .msg_order = VERBS_MSG_ORDER, @@ -115,6 +122,7 @@ const struct fi_rx_attr verbs_dgram_rx_attr = { }; const struct fi_tx_attr verbs_tx_attr = { + .caps = VERBS_MSG_TX_CAPS, .mode = 0, .op_flags = VERBS_TX_OP_FLAGS, .msg_order = VERBS_MSG_ORDER, @@ -124,6 +132,7 @@ const struct fi_tx_attr verbs_tx_attr = { }; const struct fi_tx_attr verbs_dgram_tx_attr = { + .caps = VERBS_DGRAM_TX_CAPS, .mode = 0, .op_flags = VERBS_TX_OP_FLAGS, .msg_order = VERBS_MSG_ORDER, @@ -136,29 +145,29 @@ const struct verbs_ep_domain verbs_msg_domain = { .suffix = "", .type = FI_EP_MSG, .protocol = FI_PROTO_UNSPEC, - .caps = VERBS_MSG_CAPS, }; const struct verbs_ep_domain verbs_msg_xrc_domain = { .suffix = "-xrc", .type = FI_EP_MSG, .protocol = FI_PROTO_RDMA_CM_IB_XRC, - .caps = VERBS_MSG_CAPS, }; const struct verbs_ep_domain verbs_dgram_domain = { .suffix = "-dgram", .type = FI_EP_DGRAM, .protocol = FI_PROTO_UNSPEC, - .caps = VERBS_DGRAM_CAPS, }; -int fi_ibv_check_ep_attr(const struct fi_info *hints, +/* The list (not thread safe) is populated once when the provider is initialized */ +DEFINE_LIST(verbs_devs); + +int vrb_check_ep_attr(const struct fi_info *hints, const struct fi_info *info) { struct fi_info *user_hints; struct util_prov tmp_util_prov = { - .prov = &fi_ibv_prov, + .prov = &vrb_prov, .info = NULL, .flags = (info->domain_attr->max_ep_srx_ctx && info->ep_attr->type == FI_EP_MSG) ? 
@@ -197,7 +206,7 @@ int fi_ibv_check_ep_attr(const struct fi_info *hints, return ret; } -int fi_ibv_check_rx_attr(const struct fi_rx_attr *attr, +int vrb_check_rx_attr(const struct fi_rx_attr *attr, const struct fi_info *hints, const struct fi_info *info) { @@ -207,21 +216,21 @@ int fi_ibv_check_rx_attr(const struct fi_rx_attr *attr, if ((hints->domain_attr && hints->domain_attr->cq_data_size) || (hints->rx_attr && hints->rx_attr->mode & FI_RX_CQ_DATA) || hints->mode & FI_RX_CQ_DATA) { - ret = ofi_check_rx_attr(&fi_ibv_prov, info, attr, hints->mode); + ret = ofi_check_rx_attr(&vrb_prov, info, attr, hints->mode); } else { dup_info = fi_dupinfo(info); if (!dup_info) return -FI_ENOMEM; dup_info->rx_attr->mode &= ~FI_RX_CQ_DATA; - ret = ofi_check_rx_attr(&fi_ibv_prov, dup_info, attr, + ret = ofi_check_rx_attr(&vrb_prov, dup_info, attr, hints->mode); fi_freeinfo(dup_info); } return ret; } -static int fi_ibv_check_hints(uint32_t version, const struct fi_info *hints, +static int vrb_check_hints(uint32_t version, const struct fi_info *hints, const struct fi_info *info) { int ret; @@ -229,7 +238,7 @@ static int fi_ibv_check_hints(uint32_t version, const struct fi_info *hints, if (hints->caps & ~(info->caps)) { VERBS_INFO(FI_LOG_CORE, "Unsupported capabilities\n"); - FI_INFO_CHECK(&fi_ibv_prov, info, hints, caps, FI_TYPE_CAPS); + FI_INFO_CHECK(&vrb_prov, info, hints, caps, FI_TYPE_CAPS); return -FI_ENODATA; } @@ -237,19 +246,26 @@ static int fi_ibv_check_hints(uint32_t version, const struct fi_info *hints, if ((hints->mode & prov_mode) != prov_mode) { VERBS_INFO(FI_LOG_CORE, "needed mode not set\n"); - FI_INFO_MODE(&fi_ibv_prov, prov_mode, hints->mode); + FI_INFO_MODE(&vrb_prov, prov_mode, hints->mode); return -FI_ENODATA; } if (hints->fabric_attr) { - ret = ofi_check_fabric_attr(&fi_ibv_prov, info->fabric_attr, + ret = ofi_check_fabric_attr(&vrb_prov, info->fabric_attr, hints->fabric_attr); if (ret) return ret; } if (hints->domain_attr) { - ret = ofi_check_domain_attr(&fi_ibv_prov, version, + if (hints->domain_attr->name && + strcasecmp(hints->domain_attr->name, info->domain_attr->name)) { + VERBS_INFO(FI_LOG_CORE, "skipping device %s (want %s)\n", + info->domain_attr->name, hints->domain_attr->name); + return -FI_ENODATA; + } + + ret = ofi_check_domain_attr(&vrb_prov, version, info->domain_attr, hints); if (ret) @@ -257,19 +273,19 @@ static int fi_ibv_check_hints(uint32_t version, const struct fi_info *hints, } if (hints->ep_attr) { - ret = fi_ibv_check_ep_attr(hints, info); + ret = vrb_check_ep_attr(hints, info); if (ret) return ret; } if (hints->rx_attr) { - ret = fi_ibv_check_rx_attr(hints->rx_attr, hints, info); + ret = vrb_check_rx_attr(hints->rx_attr, hints, info); if (ret) return ret; } if (hints->tx_attr) { - ret = ofi_check_tx_attr(&fi_ibv_prov, info->tx_attr, + ret = ofi_check_tx_attr(&vrb_prov, info->tx_attr, hints->tx_attr, hints->mode); if (ret) return ret; @@ -278,8 +294,9 @@ static int fi_ibv_check_hints(uint32_t version, const struct fi_info *hints, return FI_SUCCESS; } -int fi_ibv_fi_to_rai(const struct fi_info *fi, uint64_t flags, - struct rdma_addrinfo *rai) +int vrb_set_rai(uint32_t addr_format, void *src_addr, size_t src_addrlen, + void *dest_addr, size_t dest_addrlen, uint64_t flags, + struct rdma_addrinfo *rai) { memset(rai, 0, sizeof *rai); if (flags & FI_SOURCE) @@ -288,56 +305,56 @@ int fi_ibv_fi_to_rai(const struct fi_info *fi, uint64_t flags, rai->ai_flags |= RAI_NUMERICHOST; rai->ai_qp_type = IBV_QPT_RC; - rai->ai_port_space = RDMA_PS_TCP; - - if (!fi) - return 0; - 
switch(fi->addr_format) { + switch(addr_format) { case FI_SOCKADDR_IN: case FI_FORMAT_UNSPEC: + rai->ai_port_space = RDMA_PS_TCP; rai->ai_family = AF_INET; rai->ai_flags |= RAI_FAMILY; break; case FI_SOCKADDR_IN6: + rai->ai_port_space = RDMA_PS_TCP; rai->ai_family = AF_INET6; rai->ai_flags |= RAI_FAMILY; break; case FI_SOCKADDR_IB: + rai->ai_port_space = RDMA_PS_IB; rai->ai_family = AF_IB; rai->ai_flags |= RAI_FAMILY; break; case FI_SOCKADDR: - if (fi->src_addrlen) { - rai->ai_family = ((struct sockaddr *)fi->src_addr)->sa_family; + rai->ai_port_space = RDMA_PS_TCP; + if (src_addrlen) { + rai->ai_family = ((struct sockaddr *)src_addr)->sa_family; rai->ai_flags |= RAI_FAMILY; - } else if (fi->dest_addrlen) { - rai->ai_family = ((struct sockaddr *)fi->dest_addr)->sa_family; + } else if (dest_addrlen) { + rai->ai_family = ((struct sockaddr *)dest_addr)->sa_family; rai->ai_flags |= RAI_FAMILY; } break; default: - VERBS_INFO(FI_LOG_FABRIC, "Unknown fi->addr_format\n"); + VERBS_INFO(FI_LOG_FABRIC, "Unknown addr_format\n"); } - if (fi->src_addrlen) { - if (!(rai->ai_src_addr = malloc(fi->src_addrlen))) + if (src_addrlen) { + if (!(rai->ai_src_addr = malloc(src_addrlen))) return -FI_ENOMEM; - memcpy(rai->ai_src_addr, fi->src_addr, fi->src_addrlen); - rai->ai_src_len = fi->src_addrlen; + memcpy(rai->ai_src_addr, src_addr, src_addrlen); + rai->ai_src_len = src_addrlen; } - if (fi->dest_addrlen) { - if (!(rai->ai_dst_addr = malloc(fi->dest_addrlen))) + if (dest_addrlen) { + if (!(rai->ai_dst_addr = malloc(dest_addrlen))) return -FI_ENOMEM; - memcpy(rai->ai_dst_addr, fi->dest_addr, fi->dest_addrlen); - rai->ai_dst_len = fi->dest_addrlen; + memcpy(rai->ai_dst_addr, dest_addr, dest_addrlen); + rai->ai_dst_len = dest_addrlen; } return 0; } static inline -void *fi_ibv_dgram_ep_name_to_string(const struct ofi_ib_ud_ep_name *name, +void *vrb_dgram_ep_name_to_string(const struct ofi_ib_ud_ep_name *name, size_t *len) { char *str; @@ -358,11 +375,11 @@ void *fi_ibv_dgram_ep_name_to_string(const struct ofi_ib_ud_ep_name *name, return str; } -static int fi_ibv_fill_addr_by_ep_name(struct ofi_ib_ud_ep_name *ep_name, +static int vrb_fill_addr_by_ep_name(struct ofi_ib_ud_ep_name *ep_name, uint32_t fmt, void **addr, size_t *addrlen) { if (fmt == FI_ADDR_STR) { - *addr = fi_ibv_dgram_ep_name_to_string(ep_name, addrlen); + *addr = vrb_dgram_ep_name_to_string(ep_name, addrlen); if (!*addr) return -FI_ENOMEM; } else { @@ -376,7 +393,7 @@ static int fi_ibv_fill_addr_by_ep_name(struct ofi_ib_ud_ep_name *ep_name, return FI_SUCCESS; } -static int fi_ibv_rai_to_fi(struct rdma_addrinfo *rai, struct fi_info *fi) +static int vrb_rai_to_fi(struct rdma_addrinfo *rai, struct fi_info *fi) { if (!rai) return FI_SUCCESS; @@ -405,7 +422,7 @@ static int fi_ibv_rai_to_fi(struct rdma_addrinfo *rai, struct fi_info *fi) return FI_SUCCESS; } -static inline int fi_ibv_get_qp_cap(struct ibv_context *ctx, +static inline int vrb_get_qp_cap(struct ibv_context *ctx, struct fi_info *info, uint32_t protocol) { struct ibv_pd *pd; @@ -442,19 +459,19 @@ static inline int fi_ibv_get_qp_cap(struct ibv_context *ctx, info->rx_attr->size && info->rx_attr->iov_limit); - init_attr.cap.max_send_wr = MIN(fi_ibv_gl_data.def_tx_size, + init_attr.cap.max_send_wr = MIN(vrb_gl_data.def_tx_size, info->tx_attr->size); - init_attr.cap.max_send_sge = MIN(fi_ibv_gl_data.def_tx_iov_limit, + init_attr.cap.max_send_sge = MIN(vrb_gl_data.def_tx_iov_limit, info->tx_attr->iov_limit); - if (!fi_ibv_is_xrc_send_qp(qp_type)) { + if (qp_type != IBV_QPT_XRC_SEND) { 
init_attr.recv_cq = cq; - init_attr.cap.max_recv_wr = MIN(fi_ibv_gl_data.def_rx_size, + init_attr.cap.max_recv_wr = MIN(vrb_gl_data.def_rx_size, info->rx_attr->size); - init_attr.cap.max_recv_sge = MIN(fi_ibv_gl_data.def_rx_iov_limit, + init_attr.cap.max_recv_sge = MIN(vrb_gl_data.def_rx_iov_limit, info->rx_attr->iov_limit); } - init_attr.cap.max_inline_data = fi_ibv_find_max_inline(pd, ctx, qp_type); + init_attr.cap.max_inline_data = vrb_find_max_inline(pd, ctx, qp_type); init_attr.qp_type = qp_type; qp = ibv_create_qp(pd, &init_attr); @@ -475,7 +492,7 @@ static inline int fi_ibv_get_qp_cap(struct ibv_context *ctx, return ret; } -static int fi_ibv_mtu_type_to_len(enum ibv_mtu mtu_type) +static int vrb_mtu_type_to_len(enum ibv_mtu mtu_type) { switch (mtu_type) { case IBV_MTU_256: @@ -493,7 +510,7 @@ static int fi_ibv_mtu_type_to_len(enum ibv_mtu mtu_type) } } -static enum fi_link_state fi_ibv_pstate_2_lstate(enum ibv_port_state pstate) +static enum fi_link_state vrb_pstate_2_lstate(enum ibv_port_state pstate) { switch (pstate) { case IBV_PORT_DOWN: @@ -507,7 +524,7 @@ static enum fi_link_state fi_ibv_pstate_2_lstate(enum ibv_port_state pstate) } } -static const char *fi_ibv_link_layer_str(uint8_t link_layer) +static const char *vrb_link_layer_str(uint8_t link_layer) { switch (link_layer) { case IBV_LINK_LAYER_UNSPECIFIED: @@ -520,56 +537,7 @@ static const char *fi_ibv_link_layer_str(uint8_t link_layer) } } -static size_t fi_ibv_speed(uint8_t speed, uint8_t width) -{ - const size_t gbit_2_bit_coef = 1024 * 1024; - size_t width_val, speed_val; - - switch (speed) { - case 1: - speed_val = (size_t) (2.5 * (float) gbit_2_bit_coef); - break; - case 2: - speed_val = 5 * gbit_2_bit_coef; - break; - case 4: - case 8: - speed_val = 8 * gbit_2_bit_coef; - break; - case 16: - speed_val = 14 * gbit_2_bit_coef; - break; - case 32: - speed_val = 25 * gbit_2_bit_coef; - break; - default: - speed_val = 0; - break; - } - - switch (width) { - case 1: - width_val = 1; - break; - case 2: - width_val = 4; - break; - case 4: - width_val = 8; - break; - case 8: - width_val = 12; - break; - default: - width_val = 0; - break; - } - - return width_val * speed_val; -} - - -static int fi_ibv_get_device_attrs(struct ibv_context *ctx, +static int vrb_get_device_attrs(struct ibv_context *ctx, struct fi_info *info, uint32_t protocol) { struct ibv_device_attr device_attr; @@ -577,6 +545,9 @@ static int fi_ibv_get_device_attrs(struct ibv_context *ctx, size_t max_sup_size; int ret = 0, mtu_size; uint8_t port_num; + enum fi_log_level level = + vrb_gl_data.msg.prefer_xrc ? 
FI_LOG_WARN : FI_LOG_INFO; + const char *dev_name = ibv_get_device_name(ctx->device); ret = ibv_query_device(ctx, &device_attr); if (ret) { @@ -587,7 +558,9 @@ static int fi_ibv_get_device_attrs(struct ibv_context *ctx, if (protocol == FI_PROTO_RDMA_CM_IB_XRC) { if (!(device_attr.device_cap_flags & IBV_DEVICE_XRC)) { - VERBS_WARN(FI_LOG_FABRIC, "XRC not supported\n"); + FI_LOG(&vrb_prov, level, FI_LOG_FABRIC, + "XRC support unavailable in device: %s\n", + dev_name); return -FI_EINVAL; } } @@ -621,7 +594,7 @@ static int fi_ibv_get_device_attrs(struct ibv_context *ctx, info->ep_attr->rx_ctx_cnt = FI_SHARED_CONTEXT; } - ret = fi_ibv_get_qp_cap(ctx, info, protocol); + ret = vrb_get_qp_cap(ctx, info, protocol); if (ret) return ret; @@ -637,21 +610,20 @@ static int fi_ibv_get_device_attrs(struct ibv_context *ctx, } if (port_num == device_attr.phys_port_cnt + 1) { - VERBS_INFO(FI_LOG_FABRIC, "There are no active ports\n"); + FI_WARN(&vrb_prov, FI_LOG_FABRIC, "device %s: there are no " + "active ports\n", dev_name); return -FI_ENODATA; } else { - VERBS_INFO(FI_LOG_FABRIC, - "The first found active port is %"PRIu8"\n", - port_num); + VERBS_INFO(FI_LOG_FABRIC, "device %s: first found active port " + "is %"PRIu8"\n", dev_name, port_num); } if (info->ep_attr->type == FI_EP_DGRAM) { - ret = fi_ibv_mtu_type_to_len(port_attr.active_mtu); + ret = vrb_mtu_type_to_len(port_attr.active_mtu); if (ret < 0) { - VERBS_WARN(FI_LOG_FABRIC, "Device %s (port: %d) reports" + VERBS_WARN(FI_LOG_FABRIC, "device %s (port: %d) reports" " an unrecognized MTU (%d) \n", - ibv_get_device_name(ctx->device), port_num, - port_attr.active_mtu); + dev_name, port_num, port_attr.active_mtu); return ret; } max_sup_size = MIN(ret, port_attr.max_msg_sz); @@ -663,7 +635,7 @@ static int fi_ibv_get_device_attrs(struct ibv_context *ctx, info->ep_attr->max_order_raw_size = max_sup_size; info->ep_attr->max_order_waw_size = max_sup_size; - ret = asprintf(&info->nic->device_attr->device_id, "%"PRIu32, + ret = asprintf(&info->nic->device_attr->device_id, "0x%04x", device_attr.vendor_part_id); if (ret < 0) { info->nic->device_attr->device_id = NULL; @@ -672,7 +644,7 @@ static int fi_ibv_get_device_attrs(struct ibv_context *ctx, return -FI_ENOMEM; } - ret = asprintf(&info->nic->device_attr->vendor_id, "%"PRIu32, + ret = asprintf(&info->nic->device_attr->vendor_id, "0x%04x", device_attr.vendor_id); if (ret < 0) { info->nic->device_attr->vendor_id = NULL; @@ -697,14 +669,14 @@ static int fi_ibv_get_device_attrs(struct ibv_context *ctx, return -FI_ENOMEM; } - mtu_size = fi_ibv_mtu_type_to_len(port_attr.active_mtu); + mtu_size = vrb_mtu_type_to_len(port_attr.active_mtu); info->nic->link_attr->mtu = (size_t) (mtu_size > 0 ? mtu_size : 0); - info->nic->link_attr->speed = fi_ibv_speed(port_attr.active_speed, - port_attr.active_width); + info->nic->link_attr->speed = ofi_vrb_speed(port_attr.active_speed, + port_attr.active_width); info->nic->link_attr->state = - fi_ibv_pstate_2_lstate(port_attr.state); + vrb_pstate_2_lstate(port_attr.state); info->nic->link_attr->network_type = - strdup(fi_ibv_link_layer_str(port_attr.link_layer)); + strdup(vrb_link_layer_str(port_attr.link_layer)); if (!info->nic->link_attr->network_type) { VERBS_WARN(FI_LOG_FABRIC, "Unable to allocate memory for link_attr::network_type\n"); @@ -720,10 +692,12 @@ static int fi_ibv_get_device_attrs(struct ibv_context *ctx, * This avoids the lower libraries (libibverbs and librdmacm) from * reporting error messages to stderr. 
*/ -static int fi_ibv_have_device(void) +static int vrb_have_device(void) { struct ibv_device **devs; struct ibv_context *verbs; + struct ibv_device_attr attr; + const int AWS_VENDOR_ID = 0x1d0f; int i, ret = 0; devs = ibv_get_device_list(NULL); @@ -733,9 +707,21 @@ static int fi_ibv_have_device(void) for (i = 0; devs[i]; i++) { verbs = ibv_open_device(devs[i]); if (verbs) { + ret = ibv_query_device(verbs, &attr); ibv_close_device(verbs); - ret = 1; - break; + /* + * According to the librdmacm library interface, + * rdma_get_devices() in vrb_init_info leaves devices + * open even after rdma_free_devices() is called, + * causing failure in efa provider. + * Also, efa and verb devices are not expected to + * co-exist on a system. If its an efa device, then it + * should be handled by the efa provider. + */ + if (!ret && (attr.vendor_id != AWS_VENDOR_ID)) { + ret = 1; + break; + } } } @@ -743,12 +729,21 @@ static int fi_ibv_have_device(void) return ret; } -static int fi_ibv_alloc_info(struct ibv_context *ctx, struct fi_info **info, +static bool vrb_hmem_supported(const char *dev_name) +{ + if (vrb_gl_data.peer_mem_support && strstr(dev_name, "mlx")) + return true; + + return false; +} + +static int vrb_alloc_info(struct ibv_context *ctx, struct fi_info **info, const struct verbs_ep_domain *ep_dom) { struct fi_info *fi; union ibv_gid gid; size_t name_len; + const char *dev_name = ibv_get_device_name(ctx->device); int ret; if ((ctx->device->transport_type != IBV_TRANSPORT_IB) && @@ -760,17 +755,18 @@ static int fi_ibv_alloc_info(struct ibv_context *ctx, struct fi_info **info, if (!fi) return -FI_ENOMEM; - fi->caps = ep_dom->caps; fi->handle = NULL; *(fi->ep_attr) = verbs_ep_attr; *(fi->domain_attr) = verbs_domain_attr; switch (ep_dom->type) { case FI_EP_MSG: + fi->caps = VERBS_MSG_CAPS; *(fi->tx_attr) = verbs_tx_attr; *(fi->rx_attr) = verbs_rx_attr; break; case FI_EP_DGRAM: + fi->caps = VERBS_DGRAM_CAPS; fi->mode = VERBS_DGRAM_RX_MODE; *(fi->tx_attr) = verbs_dgram_tx_attr; *(fi->rx_attr) = verbs_dgram_rx_attr; @@ -780,13 +776,11 @@ static int fi_ibv_alloc_info(struct ibv_context *ctx, struct fi_info **info, assert(0); return -FI_EINVAL; } - + *(fi->fabric_attr) = verbs_fabric_attr; fi->ep_attr->type = ep_dom->type; - fi->tx_attr->caps = ep_dom->caps; - fi->rx_attr->caps = ep_dom->caps; fi->nic = ofi_nic_dup(NULL); if (!fi->nic) { @@ -794,19 +788,19 @@ static int fi_ibv_alloc_info(struct ibv_context *ctx, struct fi_info **info, goto err; } - fi->nic->device_attr->name = strdup(ibv_get_device_name(ctx->device)); + fi->nic->device_attr->name = strdup(dev_name); if (!fi->nic->device_attr->name) { ret = -FI_ENOMEM; goto err; } - ret = fi_ibv_get_device_attrs(ctx, fi, ep_dom->protocol); + ret = vrb_get_device_attrs(ctx, fi, ep_dom->protocol); if (ret) goto err; switch (ctx->device->transport_type) { case IBV_TRANSPORT_IB: - if (ibv_query_gid(ctx, 1, 0, &gid)) { + if (ibv_query_gid(ctx, 1, vrb_gl_data.gid_idx, &gid)) { VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_gid", errno); ret = -errno; @@ -857,7 +851,7 @@ static int fi_ibv_alloc_info(struct ibv_context *ctx, struct fi_info **info, goto err; } - name_len = strlen(ibv_get_device_name(ctx->device)) + strlen(ep_dom->suffix); + name_len = strlen(dev_name) + strlen(ep_dom->suffix); fi->domain_attr->name = calloc(1, name_len + 2); if (!fi->domain_attr->name) { ret = -FI_ENOMEM; @@ -875,67 +869,249 @@ static int fi_ibv_alloc_info(struct ibv_context *ctx, struct fi_info **info, return ret; } -static void fi_ibv_verbs_devs_free(struct dlist_entry 
*verbs_devs) +static void verbs_devs_print(void) { struct verbs_dev_info *dev; struct verbs_addr *addr; - - while (!dlist_empty(verbs_devs)) { - dlist_pop_front(verbs_devs, struct verbs_dev_info, dev, entry); - while (!dlist_empty(&dev->addrs)) { - dlist_pop_front(&dev->addrs, struct verbs_addr, addr, entry); - rdma_freeaddrinfo(addr->rai); - free(addr); + char addr_str[INET6_ADDRSTRLEN]; + int i = 0; + + FI_INFO(&vrb_prov, FI_LOG_FABRIC, + "list of verbs devices found for FI_EP_MSG:\n"); + dlist_foreach_container(&verbs_devs, struct verbs_dev_info, + dev, entry) { + FI_INFO(&vrb_prov, FI_LOG_FABRIC, + "#%d %s - IPoIB addresses:\n", ++i, dev->name); + dlist_foreach_container(&dev->addrs, struct verbs_addr, + addr, entry) { + if (!inet_ntop(addr->rai->ai_family, + ofi_get_ipaddr(addr->rai->ai_src_addr), + addr_str, INET6_ADDRSTRLEN)) + FI_INFO(&vrb_prov, FI_LOG_FABRIC, + "unable to convert address to string\n"); + else + FI_INFO(&vrb_prov, FI_LOG_FABRIC, + "\t%s\n", addr_str); } - free(dev->name); - free(dev); } } -static int fi_ibv_add_rai(struct dlist_entry *verbs_devs, struct rdma_cm_id *id, - struct rdma_addrinfo *rai) +static int verbs_devs_add(struct dlist_entry *verbs_devs, char *dev_name, + struct rdma_addrinfo *rai) { struct verbs_dev_info *dev; struct verbs_addr *addr; - const char *dev_name; if (!(addr = malloc(sizeof(*addr)))) return -FI_ENOMEM; addr->rai = rai; - dev_name = ibv_get_device_name(id->verbs->device); dlist_foreach_container(verbs_devs, struct verbs_dev_info, dev, entry) - if (!strcmp(dev_name, dev->name)) + if (!strcmp(dev_name, dev->name)) { + free(dev_name); goto add_rai; + } if (!(dev = malloc(sizeof(*dev)))) goto err1; - if (!(dev->name = strdup(dev_name))) - goto err2; - + dev->name = dev_name; dlist_init(&dev->addrs); dlist_insert_tail(&dev->entry, verbs_devs); add_rai: dlist_insert_tail(&addr->entry, &dev->addrs); return 0; -err2: - free(dev); err1: free(addr); return -FI_ENOMEM; } +#define IPV6_LINK_LOCAL_ADDR_PREFIX_STR "fe80" + +static int vrb_ifa_rdma_info(const struct ifaddrs *ifa, char **dev_name, + struct rdma_addrinfo **rai) +{ + char name[INET6_ADDRSTRLEN + 16]; + struct rdma_addrinfo rai_hints = { + .ai_flags = RAI_PASSIVE | RAI_NUMERICHOST, + }, *rai_; + struct rdma_cm_id *id; + int ret; + + if (!inet_ntop(ifa->ifa_addr->sa_family, ofi_get_ipaddr(ifa->ifa_addr), + name, INET6_ADDRSTRLEN)) + return -errno; + + ret = rdma_create_id(NULL, &id, NULL, RDMA_PS_TCP); + if (ret) + return ret; + + /* Detect if the IPv6 address is link local. + * TODO should we do something similar for IPv4? 
*/ + if (!strncmp(name, IPV6_LINK_LOCAL_ADDR_PREFIX_STR, + strlen(IPV6_LINK_LOCAL_ADDR_PREFIX_STR))) { + strncat(name, "%", sizeof(name) - strlen(name) - 1); + strncat(name, ifa->ifa_name, sizeof(name) - strlen(name) - 1); + } + + ret = rdma_getaddrinfo((char *) name, NULL, &rai_hints, &rai_); + if (ret) { + ret = -errno; + FI_DBG(&vrb_prov, FI_LOG_FABRIC, "rdma_getaddrinfo failed " + "with error code: %d (%s) for interface %s with address:" + " %s\n", -ret, strerror(-ret), ifa->ifa_name, name); + goto err1; + } + + ret = rdma_bind_addr(id, rai_->ai_src_addr); + if (ret) { + ret = -errno; + FI_DBG(&vrb_prov, FI_LOG_FABRIC, "rdma_bind_addr failed " + "with error code: %d (%s) for interface %s with address:" + " %s\n", -ret, strerror(-ret), ifa->ifa_name, name); + goto err2; + } + + if (!id->verbs) { + ret = -FI_EINVAL; + goto err2; + } + + *dev_name = strdup(ibv_get_device_name(id->verbs->device)); + if (!(*dev_name)) { + ret = -FI_ENOMEM; + goto err2; + } + + rdma_destroy_id(id); + *rai = rai_; + return 0; +err2: + rdma_freeaddrinfo(rai_); +err1: + rdma_destroy_id(id); + return ret; +} + +int vrb_get_port_space(uint32_t addr_format) +{ + if (addr_format == FI_SOCKADDR_IB) + return RDMA_PS_IB; + else + return RDMA_PS_TCP; +} + +static struct rdma_addrinfo *vrb_alloc_ib_addrinfo(uint8_t port_num, + const union ibv_gid *gid, uint16_t pkey) +{ + struct rdma_addrinfo *rai; + struct sockaddr_ib *sib; + + rai = calloc(1, sizeof(struct rdma_addrinfo)); + if (!rai) + return NULL; + + rai->ai_flags = RAI_PASSIVE | RAI_NUMERICHOST | RAI_FAMILY; + rai->ai_family = AF_IB; + rai->ai_port_space = RDMA_PS_IB; + + sib = calloc(1, sizeof(struct sockaddr_ib)); + if (!sib) { + free(rai); + return NULL; + } + rai->ai_src_addr = (struct sockaddr *) sib; + rai->ai_src_len = sizeof(struct sockaddr_ib); + + sib->sib_family = AF_IB; + memcpy(&sib->sib_addr.sib_raw, &gid->raw, sizeof(*gid)); + sib->sib_pkey = pkey; + sib->sib_scope_id = port_num; + + ofi_addr_set_port((struct sockaddr *)sib, 0); + + return rai; +} + +static int vrb_get_sib(struct dlist_entry *verbs_devs) +{ + struct rdma_addrinfo *rai = NULL; + struct ibv_device **devices; + char *dev_name = NULL; + int num_devices; + struct ibv_context *context; + int ret, num_verbs_ifs = 0; + struct ibv_device_attr device_attr; + struct ibv_port_attr port_attr; + union ibv_gid gid; + uint16_t pkey; + + devices = ibv_get_device_list(&num_devices); + if (!devices) + return -errno; + + for (int dev = 0; dev < num_devices; dev++) { + if (!devices[dev]) + continue; + + context = ibv_open_device(devices[dev]); + if (!context) + continue; + + ret = ibv_query_device(context, &device_attr); + if (ret) + continue; + + for (int port = 1; port <= device_attr.phys_port_cnt; port++) { + ret = ibv_query_port(context, port, &port_attr); + if (ret) + continue; + + for (int gidx = 0; gidx < port_attr.gid_tbl_len; gidx++) { + /* gid_tbl_len may contain GID entries that are NULL (fe80::), + * so we need to filter them out */ + ret = ibv_query_gid(context, port, gidx, &gid); + if (ret || !gid.global.interface_id || !gid.global.subnet_prefix) + continue; + + for (int pidx = 0; pidx < port_attr.pkey_tbl_len; pidx++) { + ret = ibv_query_pkey(context, port, pidx, &pkey); + if (ret || !pkey) + continue; + + rai = vrb_alloc_ib_addrinfo(port, &gid, pkey); + if (!rai) + continue; + + dev_name = strdup(ibv_get_device_name(context->device)); + if (!dev_name) + return -FI_ENOMEM; + + ret = verbs_devs_add(verbs_devs, dev_name, rai); + if (ret) { + free(dev_name); + rdma_freeaddrinfo(rai); + 
continue; + } + + num_verbs_ifs++; + } + } + } + } + + ibv_free_device_list(devices); + return num_verbs_ifs ? 0 : -FI_ENODATA; +} + /* Builds a list of interfaces that correspond to active verbs devices */ -static int fi_ibv_getifaddrs(struct dlist_entry *verbs_devs) +static int vrb_getifaddrs(struct dlist_entry *verbs_devs) { struct ifaddrs *ifaddr, *ifa; - char name[INET6_ADDRSTRLEN]; - struct rdma_addrinfo *rai; - struct rdma_cm_id *id; - const char *ret_ptr; - char *iface = fi_ibv_gl_data.iface; + struct rdma_addrinfo *rai = NULL; + char *dev_name = NULL; + char *iface = vrb_gl_data.iface; int ret, num_verbs_ifs = 0; size_t iface_len = 0; int exact_match = 0; @@ -943,7 +1119,7 @@ static int fi_ibv_getifaddrs(struct dlist_entry *verbs_devs) ret = ofi_getifaddrs(&ifaddr); if (ret) { VERBS_WARN(FI_LOG_FABRIC, - "Unable to get interface addresses\n"); + "unable to get interface addresses\n"); return ret; } @@ -951,9 +1127,8 @@ static int fi_ibv_getifaddrs(struct dlist_entry *verbs_devs) if (iface) { iface_len = strlen(iface); if (iface_len > IFNAMSIZ) { - VERBS_INFO(FI_LOG_EP_CTRL, - "Too long iface name: %s, max: %d\n", - iface, IFNAMSIZ); + VERBS_INFO(FI_LOG_FABRIC, "iface name: %s, too long " + "max: %d\n", iface, IFNAMSIZ); } for (ifa = ifaddr; ifa && !exact_match; ifa = ifa->ifa_next) @@ -967,162 +1142,99 @@ static int fi_ibv_getifaddrs(struct dlist_entry *verbs_devs) if (iface) { if (exact_match) { - if (strcmp(ifa->ifa_name, iface)) + if (strcmp(ifa->ifa_name, iface)) { + FI_INFO(&vrb_prov, FI_LOG_FABRIC, + "skipping interface: %s for FI_EP_MSG" + " as it doesn't match filter: %s\n", + ifa->ifa_name, iface); continue; + } } else { - if (strncmp(ifa->ifa_name, iface, iface_len)) + if (strncmp(ifa->ifa_name, iface, iface_len)) { + FI_INFO(&vrb_prov, FI_LOG_FABRIC, + "skipping interface: %s for FI_EP_MSG" + " as it doesn't match filter: %s\n", + ifa->ifa_name, iface); continue; + } } } - switch (ifa->ifa_addr->sa_family) { - case AF_INET: - ret_ptr = inet_ntop(AF_INET, &ofi_sin_addr(ifa->ifa_addr), - name, INET6_ADDRSTRLEN); - break; - case AF_INET6: - ret_ptr = inet_ntop(AF_INET6, &ofi_sin6_addr(ifa->ifa_addr), - name, INET6_ADDRSTRLEN); - break; - default: - continue; - } - if (!ret_ptr) { - VERBS_WARN(FI_LOG_FABRIC, - "inet_ntop failed: %s(%d)\n", - strerror(errno), errno); - ret = -errno; - goto err1; - } - - ret = fi_ibv_get_rai_id(name, NULL, FI_NUMERICHOST | FI_SOURCE, - NULL, &rai, &id); + ret = vrb_ifa_rdma_info(ifa, &dev_name, &rai); if (ret) continue; - ret = fi_ibv_add_rai(verbs_devs, id, rai); + ret = verbs_devs_add(verbs_devs, dev_name, rai); if (ret) { + free(dev_name); rdma_freeaddrinfo(rai); - rdma_destroy_id(id); - goto err1; + continue; } - VERBS_DBG(FI_LOG_FABRIC, "Found active interface for verbs device: " - "%s with address: %s\n", - ibv_get_device_name(id->verbs->device), name); - rdma_destroy_id(id); num_verbs_ifs++; } + + verbs_devs_print(); + freeifaddrs(ifaddr); return num_verbs_ifs ? 
0 : -FI_ENODATA; -err1: - fi_ibv_verbs_devs_free(verbs_devs); - freeifaddrs(ifaddr); - return ret; } -static int fi_ibv_get_srcaddr_devs(struct fi_info **info) +static int +vrb_info_add_dev_addr(struct fi_info **info, struct verbs_dev_info *dev) { - struct fi_info *fi, *add_info; - struct fi_info *fi_unconf = NULL, *fi_prev = NULL; - struct verbs_dev_info *dev; + struct fi_info *add_info; struct verbs_addr *addr; - int ret = 0; - - DEFINE_LIST(verbs_devs); + int ret; - ret = fi_ibv_getifaddrs(&verbs_devs); - if (ret) - return ret; + dlist_foreach_container(&dev->addrs, struct verbs_addr, addr, entry) { + /* When a device has multiple interfaces/addresses configured + * duplicate fi_info and add the address info. fi->src_addr + * would have been set in the previous iteration */ + if ((*info)->src_addr) { + if (!(add_info = fi_dupinfo(*info))) + return -FI_ENOMEM; + + add_info->next = (*info)->next; + (*info)->next = add_info; + *info = add_info; + } - if (dlist_empty(&verbs_devs)) { - VERBS_WARN(FI_LOG_CORE, "No interface address found\n"); - return 0; + ret = vrb_rai_to_fi(addr->rai, *info); + if (ret) + return ret; } + return 0; +} + +static int vrb_get_srcaddr_devs(struct fi_info **info) +{ + struct verbs_dev_info *dev; + struct fi_info *fi; + int ret; for (fi = *info; fi; fi = fi->next) { if (fi->ep_attr->type == FI_EP_DGRAM) continue; - dlist_foreach_container(&verbs_devs, struct verbs_dev_info, dev, entry) - if (!strncmp(fi->domain_attr->name, dev->name, strlen(dev->name))) { - dlist_foreach_container(&dev->addrs, struct verbs_addr, addr, entry) { - /* When a device has multiple interfaces/addresses configured - * duplicate fi_info and add the address info. fi->src_addr - * would have been set in the previous iteration */ - if (fi->src_addr) { - if (!(add_info = fi_dupinfo(fi))) { - ret = -FI_ENOMEM; - goto out; - } - - add_info->next = fi->next; - fi->next = add_info; - fi = add_info; - } - - ret = fi_ibv_rai_to_fi(addr->rai, fi); - if (ret) - goto out; - } + dlist_foreach_container(&verbs_devs, struct verbs_dev_info, + dev, entry) { + /* strncmp because we want to process XRC fi_info as + * well which have a "-xrc" suffix in domain name */ + if (!strncmp(fi->domain_attr->name, dev->name, + strlen(dev->name))) { + ret = vrb_info_add_dev_addr(&fi, dev); + if (ret) + return ret; break; } - } - - /* re-order info: move info without src_addr to tail */ - for (fi = *info; fi;) { - if (!fi->src_addr) { - /* re-link list - exclude current element */ - if (fi == *info) { - *info = fi->next; - fi->next = fi_unconf; - fi_unconf = fi; - fi = *info; - } else { - assert(fi_prev); - fi_prev->next = fi->next; - fi->next = fi_unconf; - fi_unconf = fi; - fi = fi_prev->next; - } - } else { - fi_prev = fi; - fi = fi->next; } } - - /* append excluded elements to tail of list */ - if (fi_unconf) { - if (fi_prev) { - assert(!fi_prev->next); - fi_prev->next = fi_unconf; - } else if (*info) { - assert(!(*info)->next); - (*info)->next = fi_unconf; - } else /* !(*info) */ { - (*info) = fi_unconf; - } - } - -out: - fi_ibv_verbs_devs_free(&verbs_devs); - return ret; -} - -static void fi_ibv_sockaddr_set_port(struct sockaddr *sa, uint16_t port) -{ - switch(sa->sa_family) { - case AF_INET: - ((struct sockaddr_in *)sa)->sin_port = port; - break; - case AF_INET6: - ((struct sockaddr_in6 *)sa)->sin6_port = port; - break; - } + return 0; } /* the `rai` parameter is used for the MSG EP type */ /* the `fmt`, `[src | dest]_addr` parameters are used for the DGRAM EP type */ /* if the `fmt` parameter isn't used, pass 
FI_FORMAT_UNSPEC */ -static int fi_ibv_set_info_addrs(struct fi_info *info, +static int vrb_set_info_addrs(struct fi_info *info, struct rdma_addrinfo *rai, uint32_t fmt, struct ofi_ib_ud_ep_name *src_addr, @@ -1133,19 +1245,19 @@ static int fi_ibv_set_info_addrs(struct fi_info *info, for (; iter_info; iter_info = iter_info->next) { if (iter_info->ep_attr->type != FI_EP_DGRAM) { - ret = fi_ibv_rai_to_fi(rai, iter_info); + ret = vrb_rai_to_fi(rai, iter_info); if (ret) return ret; } else { if (src_addr) { - ret = fi_ibv_fill_addr_by_ep_name(src_addr, fmt, + ret = vrb_fill_addr_by_ep_name(src_addr, fmt, &iter_info->src_addr, &iter_info->src_addrlen); if (ret) return ret; } if (dest_addr) { - ret = fi_ibv_fill_addr_by_ep_name(dest_addr, fmt, + ret = vrb_fill_addr_by_ep_name(dest_addr, fmt, &iter_info->dest_addr, &iter_info->dest_addrlen); if (ret) @@ -1158,7 +1270,7 @@ static int fi_ibv_set_info_addrs(struct fi_info *info, return FI_SUCCESS; } -static int fi_ibv_fill_addr(struct rdma_addrinfo *rai, struct fi_info **info, +static int vrb_fill_addr(struct rdma_addrinfo *rai, struct fi_info **info, struct rdma_cm_id *id) { struct sockaddr *local_addr; @@ -1172,60 +1284,83 @@ static int fi_ibv_fill_addr(struct rdma_addrinfo *rai, struct fi_info **info, goto rai_to_fi; if (!id->verbs) - return fi_ibv_get_srcaddr_devs(info); + return vrb_get_srcaddr_devs(info); /* Handle the case when rdma_cm doesn't fill src address even * though it fills the destination address (presence of id->verbs * corresponds to a valid dest addr) */ local_addr = rdma_get_local_addr(id); - if (!local_addr) { - VERBS_WARN(FI_LOG_CORE, - "Unable to get local address\n"); - return -FI_ENODATA; - } - rai->ai_src_len = fi_ibv_sockaddr_len(local_addr); - if (!(rai->ai_src_addr = malloc(rai->ai_src_len))) + rai->ai_src_len = ofi_sizeofaddr(local_addr); + rai->ai_src_addr = malloc(rai->ai_src_len); + if (!rai->ai_src_addr) return -FI_ENOMEM; memcpy(rai->ai_src_addr, local_addr, rai->ai_src_len); /* User didn't specify a port. 
Zero out the random port * assigned by rdmamcm so that this rai/fi_info can be * used multiple times to create rdma endpoints.*/ - fi_ibv_sockaddr_set_port(rai->ai_src_addr, 0); + ofi_addr_set_port(rai->ai_src_addr, 0); rai_to_fi: - return fi_ibv_set_info_addrs(*info, rai, FI_FORMAT_UNSPEC, + return vrb_set_info_addrs(*info, rai, FI_FORMAT_UNSPEC, NULL, NULL); } +static int vrb_device_has_ipoib_addr(const char *dev_name) +{ + struct verbs_dev_info *dev; + + dlist_foreach_container(&verbs_devs, struct verbs_dev_info, dev, entry) { + if (!strcmp(dev_name, dev->name)) + return 1; + } + return 0; +} + #define VERBS_NUM_DOMAIN_TYPES 3 -int fi_ibv_init_info(const struct fi_info **all_infos) +int vrb_init_info(const struct fi_info **all_infos) { struct ibv_context **ctx_list; struct fi_info *fi = NULL, *tail = NULL; const struct verbs_ep_domain *ep_type[VERBS_NUM_DOMAIN_TYPES]; - int ret = 0, i, j, num_devices; + int ret = 0, i, j, num_devices, dom_count = 0; *all_infos = NULL; - /* List XRC MSG_EP domain before default RC MSG_EP if requested */ - if (fi_ibv_gl_data.msg.prefer_xrc) { - ep_type[0] = &verbs_msg_xrc_domain; - ep_type[1] = &verbs_msg_domain; - } else { - ep_type[0] = &verbs_msg_domain; - ep_type[1] = &verbs_msg_xrc_domain; - } - ep_type[2] = &verbs_dgram_domain; - - if (!fi_ibv_have_device()) { - VERBS_INFO(FI_LOG_FABRIC, "No RDMA devices found\n"); + if (!vrb_have_device()) { + VERBS_INFO(FI_LOG_FABRIC, "no RDMA devices found\n"); ret = -FI_ENODATA; goto done; } + /* List XRC MSG_EP domain before default RC MSG_EP if requested */ + if (vrb_gl_data.msg.prefer_xrc) { + if (VERBS_HAVE_XRC) + ep_type[dom_count++] = &verbs_msg_xrc_domain; + else + FI_WARN(&vrb_prov, FI_LOG_FABRIC, + "XRC not built into provider, skip allocating " + "fi_info for XRC FI_EP_MSG endpoints\n"); + } + + vrb_getifaddrs(&verbs_devs); + if (!vrb_gl_data.iface) + vrb_get_sib(&verbs_devs); + + if (dlist_empty(&verbs_devs)) + FI_WARN(&vrb_prov, FI_LOG_FABRIC, + "no valid IPoIB interfaces found, FI_EP_MSG endpoint " + "type would not be available\n"); + else + ep_type[dom_count++] = &verbs_msg_domain; + + if (!vrb_gl_data.msg.prefer_xrc && VERBS_HAVE_XRC) + ep_type[dom_count++] = &verbs_msg_xrc_domain; + + ep_type[dom_count++] = &verbs_dgram_domain; + ctx_list = rdma_get_devices(&num_devices); if (!num_devices) { VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_get_devices", errno); @@ -1234,13 +1369,55 @@ int fi_ibv_init_info(const struct fi_info **all_infos) } for (i = 0; i < num_devices; i++) { - for (j = 0; j < VERBS_NUM_DOMAIN_TYPES; j++) { - ret = fi_ibv_alloc_info(ctx_list[i], &fi, ep_type[j]); - if (!ret) { - if (!*all_infos) - *all_infos = fi; - else - tail->next = fi; + if (!ctx_list[i]) { + FI_INFO(&vrb_prov, FI_LOG_FABRIC, + "skipping device: %d, " + "the interface may be down, faulty or disabled\n", + i); + continue; + } + + for (j = 0; j < dom_count; j++) { + if (ep_type[j]->type == FI_EP_MSG && + !vrb_device_has_ipoib_addr(ctx_list[i]->device->name)) { + FI_INFO(&vrb_prov, FI_LOG_FABRIC, + "skipping device: %s for FI_EP_MSG, " + "it may have a filtered IPoIB interface" + " (FI_VERBS_IFACE) or it may not have a" + " valid IP address configured\n", + ctx_list[i]->device->name); + continue; + } + if (vrb_gl_data.device_name && + strncasecmp(ctx_list[i]->device->name, + vrb_gl_data.device_name, + strlen(vrb_gl_data.device_name))) + continue; + + ret = vrb_alloc_info(ctx_list[i], &fi, ep_type[j]); + if (ret) + continue; + + if (!*all_infos) + *all_infos = fi; + else + tail->next = fi; + tail = fi; + + /* If verbs HMEM 
is supported, duplicate previously + * allocated fi_info and apply HMEM flags. + */ + if (vrb_hmem_supported(ctx_list[i]->device->name)) { + fi = fi_dupinfo(fi); + if (!fi) + continue; + + fi->caps |= FI_HMEM; + fi->tx_attr->caps |= FI_HMEM; + fi->rx_attr->caps |= FI_HMEM; + fi->domain_attr->mr_mode |= FI_MR_HMEM; + + tail->next = fi; tail = fi; } } @@ -1254,61 +1431,36 @@ int fi_ibv_init_info(const struct fi_info **all_infos) return ret; } -static int fi_ibv_set_default_attr(struct fi_info *info, size_t *attr, - size_t default_attr, char *attr_str) +static void vrb_set_default_attr(size_t *attr, size_t default_attr) { - if (default_attr > *attr) { - VERBS_INFO(FI_LOG_FABRIC, "Ignoring provider default value " - "for %s as it is greater than the value supported " - "by domain: %s\n", attr_str, info->domain_attr->name); - } else { + if (default_attr <= *attr) *attr = default_attr; - } - return 0; } /* Set default values for attributes. ofi_alter_info would change them if the * user has asked for a different value in hints */ -static int fi_ibv_set_default_info(struct fi_info *info) +static void vrb_set_default_info(struct fi_info *info) { - int ret; - - ret = fi_ibv_set_default_attr(info, &info->tx_attr->size, - fi_ibv_gl_data.def_tx_size, - "tx context size"); - if (ret) - return ret; + vrb_set_default_attr(&info->tx_attr->size, + vrb_gl_data.def_tx_size); - ret = fi_ibv_set_default_attr(info, &info->rx_attr->size, - fi_ibv_gl_data.def_rx_size, - "rx context size"); - if (ret) - return ret; - ret = fi_ibv_set_default_attr(info, &info->tx_attr->iov_limit, - fi_ibv_gl_data.def_tx_iov_limit, - "tx iov_limit"); - if (ret) - return ret; + vrb_set_default_attr(&info->rx_attr->size, + vrb_gl_data.def_rx_size); - ret = fi_ibv_set_default_attr(info, &info->rx_attr->iov_limit, - fi_ibv_gl_data.def_rx_iov_limit, - "rx iov_limit"); - if (ret) - return ret; + vrb_set_default_attr(&info->tx_attr->iov_limit, + vrb_gl_data.def_tx_iov_limit); + vrb_set_default_attr(&info->rx_attr->iov_limit, + vrb_gl_data.def_rx_iov_limit); if (info->ep_attr->type == FI_EP_MSG) { /* For verbs iov limit is same for * both regular messages and RMA */ - ret = fi_ibv_set_default_attr(info, &info->tx_attr->rma_iov_limit, - fi_ibv_gl_data.def_tx_iov_limit, - "tx rma_iov_limit"); - if (ret) - return ret; + vrb_set_default_attr(&info->tx_attr->rma_iov_limit, + vrb_gl_data.def_tx_iov_limit); } - return 0; } -static struct fi_info *fi_ibv_get_passive_info(const struct fi_info *prov_info, +static struct fi_info *vrb_get_passive_info(const struct fi_info *prov_info, const struct fi_info *hints) { struct fi_info *info; @@ -1337,54 +1489,55 @@ static struct fi_info *fi_ibv_get_passive_info(const struct fi_info *prov_info, return info; } -static int fi_ibv_get_matching_info(uint32_t version, - const struct fi_info *hints, - struct fi_info **info, - const struct fi_info *verbs_info, - uint8_t passive) +int vrb_get_matching_info(uint32_t version, const struct fi_info *hints, + struct fi_info **info, const struct fi_info *verbs_info, + uint8_t passive) { const struct fi_info *check_info = verbs_info; struct fi_info *fi, *tail; - int ret; + int ret, i; uint8_t got_passive_info = 0; + enum fi_log_level level = + vrb_gl_data.msg.prefer_xrc ? 
FI_LOG_WARN : FI_LOG_INFO; *info = tail = NULL; - for ( ; check_info; check_info = check_info->next) { - VERBS_DBG(FI_LOG_FABRIC, "Checking domain: %s\n", - check_info->domain_attr->name); - + for (i = 1; check_info; check_info = check_info->next, i++) { if (hints) { + FI_INFO(&vrb_prov, FI_LOG_FABRIC, + "checking domain: #%d %s\n", + i, check_info->domain_attr->name); + + if (hints->ep_attr) { + /* check EP type first to avoid other unnecessary checks */ + ret = ofi_check_ep_type( + &vrb_prov, check_info->ep_attr, hints->ep_attr); + if (ret) + continue; + } + + ret = vrb_check_hints(version, hints, + check_info); + if (ret) + continue; + if ((check_info->ep_attr->protocol == FI_PROTO_RDMA_CM_IB_XRC) && (!hints->ep_attr || (hints->ep_attr->rx_ctx_cnt != FI_SHARED_CONTEXT))) { - VERBS_INFO(FI_LOG_FABRIC, - "hints->ep_attr->rx_ctx_cnt != " - "FI_SHARED_CONTEXT. Skipping " - "XRC FI_EP_MSG endpoints\n"); + FI_LOG(&vrb_prov, level, FI_LOG_FABRIC, + "hints->ep_attr->rx_ctx_cnt != " + "FI_SHARED_CONTEXT. Skipping " + "XRC FI_EP_MSG endpoints\n"); continue; } - if ((check_info->ep_attr->protocol == - FI_PROTO_RDMA_CM_IB_XRC) && !VERBS_HAVE_XRC) { - VERBS_INFO(FI_LOG_FABRIC, - "XRC not built into provider, " - "skipping XRC FI_EP_MSG " - "endpoints\n"); - continue; - } - - ret = fi_ibv_check_hints(version, hints, - check_info); - if (ret) - continue; } if ((check_info->ep_attr->type == FI_EP_MSG) && passive) { if (got_passive_info) continue; - if (!(fi = fi_ibv_get_passive_info(check_info, hints))) { + if (!(fi = vrb_get_passive_info(check_info, hints))) { ret = -FI_ENOMEM; goto err; } @@ -1394,15 +1547,11 @@ static int fi_ibv_get_matching_info(uint32_t version, ret = -FI_ENOMEM; goto err; } - ret = fi_ibv_set_default_info(fi); - if (ret) { - fi_freeinfo(fi); - continue; - } + vrb_set_default_info(fi); } - VERBS_DBG(FI_LOG_FABRIC, "Adding fi_info for domain: %s\n", - fi->domain_attr->name); + FI_INFO(&vrb_prov, FI_LOG_FABRIC, + "adding fi_info for domain: %s\n", fi->domain_attr->name); if (!*info) *info = fi; else @@ -1419,7 +1568,7 @@ static int fi_ibv_get_matching_info(uint32_t version, return ret; } -static int fi_ibv_del_info_not_belong_to_dev(const char *dev_name, struct fi_info **info) +static int vrb_del_info_not_belong_to_dev(const char *dev_name, struct fi_info **info) { struct fi_info *check_info = *info; struct fi_info *cur, *prev = NULL; @@ -1454,16 +1603,16 @@ static int fi_ibv_del_info_not_belong_to_dev(const char *dev_name, struct fi_inf return FI_SUCCESS; } -static int fi_ibv_resolve_ib_ud_dest_addr(const char *node, const char *service, +static int vrb_resolve_ib_ud_dest_addr(const char *node, const char *service, struct ofi_ib_ud_ep_name **dest_addr) { int svc = VERBS_IB_UD_NS_ANY_SERVICE; struct util_ns ns = { - .port = fi_ibv_gl_data.dgram.name_server_port, + .port = vrb_gl_data.dgram.name_server_port, .name_len = sizeof(**dest_addr), .service_len = sizeof(svc), - .service_cmp = fi_ibv_dgram_ns_service_cmp, - .is_service_wildcard = fi_ibv_dgram_ns_is_service_wildcard, + .service_cmp = vrb_dgram_ns_service_cmp, + .is_service_wildcard = vrb_dgram_ns_is_service_wildcard, }; ofi_ns_init(&ns); @@ -1483,7 +1632,32 @@ static int fi_ibv_resolve_ib_ud_dest_addr(const char *node, const char *service, return 0; } -static int fi_ibv_handle_ib_ud_addr(const char *node, const char *service, +static void vrb_delete_dgram_infos(struct fi_info **info) +{ + struct fi_info *check_info = *info; + struct fi_info *cur, *prev = NULL; + + *info = NULL; + + while (check_info) { + if 
(check_info->ep_attr->type == FI_EP_DGRAM) { + cur = check_info; + if (prev) + prev->next = check_info->next; + check_info = check_info->next; + + cur->next = NULL; + fi_freeinfo(cur); + } else { + prev = check_info; + if (!*info) + *info = check_info; + check_info = check_info->next; + } + } +} + +static int vrb_handle_ib_ud_addr(const char *node, const char *service, uint64_t flags, struct fi_info **info) { struct ofi_ib_ud_ep_name *dest_addr = NULL; @@ -1512,7 +1686,8 @@ static int fi_ibv_handle_ib_ud_addr(const char *node, const char *service, if (!src_addr) { VERBS_INFO(FI_LOG_CORE, "failed to allocate src addr.\n"); - return -FI_ENODATA; + ret = -FI_ENODATA; + goto err; } if (flags & FI_SOURCE) { @@ -1521,7 +1696,7 @@ static int fi_ibv_handle_ib_ud_addr(const char *node, const char *service, &src_addr->service); if (ret != 1) { ret = -errno; - goto fn2; + goto err; } } @@ -1532,18 +1707,19 @@ static int fi_ibv_handle_ib_ud_addr(const char *node, const char *service, } if (!dest_addr && node && !(flags & FI_SOURCE)) { - ret = fi_ibv_resolve_ib_ud_dest_addr(node, service, &dest_addr); + ret = vrb_resolve_ib_ud_dest_addr(node, service, &dest_addr); if (ret) - goto fn2; /* Here possible that `src_addr` isn't a NULL */ + goto err; /* Here possible that `src_addr` isn't a NULL */ } - ret = fi_ibv_set_info_addrs(*info, NULL, fmt, src_addr, dest_addr); - if (ret) - goto fn2; - + ret = vrb_set_info_addrs(*info, NULL, fmt, src_addr, dest_addr); + if (!ret) + goto out; +err: + vrb_delete_dgram_infos(info); /* `fi_info::src_addr` and `fi_info::dest_addr` is freed * in the `fi_freeinfo` function in case of failure */ -fn2: +out: if (src_addr) free(src_addr); if (dest_addr) @@ -1551,26 +1727,7 @@ static int fi_ibv_handle_ib_ud_addr(const char *node, const char *service, return ret; } -static void fi_ibv_remove_nosrc_info(struct fi_info **info) -{ - struct fi_info **fi = info, *next; - while (*fi && ((*fi)->ep_attr->type == FI_EP_MSG)) { - if (!(*fi)->src_addr) { - VERBS_INFO(FI_LOG_FABRIC, "Not reporting fi_info " - "corresponding to domain: %s as it has no IP" - "address configured\n", - (*fi)->domain_attr->name); - next = (*fi)->next; - (*fi)->next = NULL; - fi_freeinfo(*fi); - *fi = next; - } else { - fi = &(*fi)->next; - } - } -} - -static int fi_ibv_handle_sock_addr(const char *node, const char *service, +static int vrb_handle_sock_addr(const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct fi_info **info) { @@ -1579,18 +1736,17 @@ static int fi_ibv_handle_sock_addr(const char *node, const char *service, const char *dev_name = NULL; int ret; - ret = fi_ibv_get_rai_id(node, service, flags, hints, &rai, &id); + ret = vrb_get_rai_id(node, service, flags, hints, &rai, &id); if (ret) return ret; if (id->verbs) { dev_name = ibv_get_device_name(id->verbs->device); - ret = fi_ibv_del_info_not_belong_to_dev(dev_name, info); + ret = vrb_del_info_not_belong_to_dev(dev_name, info); if (ret) goto out; } - ret = fi_ibv_fill_addr(rai, info, id); - fi_ibv_remove_nosrc_info(info); + ret = vrb_fill_addr(rai, info, id); out: rdma_freeaddrinfo(rai); if (rdma_destroy_id(id)) @@ -1598,83 +1754,43 @@ static int fi_ibv_handle_sock_addr(const char *node, const char *service, return ret; } -static inline int -fi_ibv_hints_match_dgram_ep(const struct fi_info *hints) -{ - return (hints && ((hints->addr_format == FI_ADDR_IB_UD) || - (hints->ep_attr && (hints->ep_attr->type == FI_EP_DGRAM)))); -} - -static inline int -fi_ibv_hints_match_msg_ep(const struct fi_info *hints) -{ - return 
(hints && ((hints->addr_format == FI_SOCKADDR) || - (hints->addr_format == FI_SOCKADDR_IN) || - (hints->addr_format == FI_SOCKADDR_IN6) || - (hints->addr_format == FI_SOCKADDR_IB) || - (hints->ep_attr && (hints->ep_attr->type == FI_EP_MSG)))); -} - -static int fi_ibv_get_match_infos(uint32_t version, const char *node, +static int vrb_get_match_infos(uint32_t version, const char *node, const char *service, uint64_t flags, const struct fi_info *hints, const struct fi_info **raw_info, struct fi_info **info) { - int ret, ret_sock_addr, ret_ib_ud_addr; + int ret, ret_sock_addr = -FI_ENODATA, ret_ib_ud_addr = -FI_ENODATA; // TODO check for AF_IB addr - ret = fi_ibv_get_matching_info(version, hints, info, *raw_info, + ret = vrb_get_matching_info(version, hints, info, *raw_info, ofi_is_wildcard_listen_addr(node, service, flags, hints)); if (ret) return ret; - /* Check if the user requested to support DGRAM EP type only */ - if (fi_ibv_hints_match_dgram_ep(hints)) { - /* This is case when only IB UD addresses are passed */ - ret = fi_ibv_handle_ib_ud_addr(node, service, flags, info); - if (ret) { - VERBS_INFO(FI_LOG_CORE, - "Handling of the IB UD address fails - %d, " - "support of this was requested thru the passed hints\n", - ret); - fi_freeinfo(*info); - } - return ret; - } - - /* Check if the user requested to support MSG EP type only */ - if (fi_ibv_hints_match_msg_ep(hints)) { - ret = fi_ibv_handle_sock_addr(node, service, flags, hints, info); - if (ret) { - VERBS_INFO(FI_LOG_CORE, - "Handling of the socket address fails - %d, but the " - "support of this was requested thru the passed hints\n", - ret); - if (*info) - fi_freeinfo(*info); + if (!hints || !hints->ep_attr || hints->ep_attr->type == FI_EP_MSG || + hints->ep_attr->type == FI_EP_UNSPEC) { + ret_sock_addr = vrb_handle_sock_addr(node, service, flags, hints, info); + if (ret_sock_addr) { + VERBS_INFO(FI_LOG_FABRIC, + "handling of the socket address fails - %d\n", + ret_sock_addr); } else { if (!*info) return -FI_ENODATA; } - return ret; } - ret_sock_addr = fi_ibv_handle_sock_addr(node, service, flags, hints, info); - if (ret_sock_addr) { - VERBS_INFO(FI_LOG_CORE, "Handling of the socket address fails - %d\n", - ret_sock_addr); - } else { - if (!*info) - return -FI_ENODATA; + if (!hints || !hints->ep_attr || hints->ep_attr->type == FI_EP_DGRAM || + hints->ep_attr->type == FI_EP_UNSPEC) { + ret_ib_ud_addr = vrb_handle_ib_ud_addr(node, service, flags, info); + if (ret_ib_ud_addr) + VERBS_INFO(FI_LOG_FABRIC, + "handling of the IB ID address fails - %d\n", + ret_ib_ud_addr); } - ret_ib_ud_addr = fi_ibv_handle_ib_ud_addr(node, service, flags, info); - if (ret_ib_ud_addr) - VERBS_INFO(FI_LOG_CORE, "Handling of the IB ID address fails - %d\n", - ret_ib_ud_addr); - if (ret_sock_addr && ret_ib_ud_addr) { /* neither the sockaddr nor the ib_ud address wasn't * handled to satisfy the selection procedure */ @@ -1687,7 +1803,7 @@ static int fi_ibv_get_match_infos(uint32_t version, const char *node, return FI_SUCCESS; } -static void fi_ibv_alter_info(const struct fi_info *hints, struct fi_info *info) +void vrb_alter_info(const struct fi_info *hints, struct fi_info *info) { struct fi_info *cur; @@ -1711,26 +1827,26 @@ static void fi_ibv_alter_info(const struct fi_info *hints, struct fi_info *info) * This is to avoid drop in throughput */ cur->tx_attr->inject_size = MIN(cur->tx_attr->inject_size, - fi_ibv_gl_data.def_inline_size); + vrb_gl_data.def_inline_size); } } } -int fi_ibv_getinfo(uint32_t version, const char *node, const char *service, +int 
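[Editor's illustration, not part of the patch] With vrb_get_match_infos() restructured as above, unspecified hints can yield both FI_EP_MSG (sockaddr) and FI_EP_DGRAM (IB UD) entries in a single list. A small sketch of walking that list; with NULL hints other providers may appear too, so filter on fabric_attr->prov_name if needed:

    #include <stdio.h>
    #include <rdma/fabric.h>

    static void dump_domains(void)
    {
        struct fi_info *info, *cur;

        if (fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, NULL, &info))
            return;

        for (cur = info; cur; cur = cur->next)
            printf("domain %-20s ep_type %s\n",
                   cur->domain_attr->name,
                   cur->ep_attr->type == FI_EP_DGRAM ? "dgram" : "msg");

        fi_freeinfo(info);
    }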
vrb_getinfo(uint32_t version, const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct fi_info **info) { int ret; - ret = fi_ibv_get_match_infos(version, node, service, + ret = vrb_get_match_infos(version, node, service, flags, hints, - &fi_ibv_util_prov.info, info); + &vrb_util_prov.info, info); if (ret) goto out; ofi_alter_info(*info, hints, version); - fi_ibv_alter_info(hints, *info); + vrb_alter_info(hints, *info); out: if (!ret || ret == -FI_ENOMEM || ret == -FI_ENODEV) return ret; diff --git a/prov/verbs/src/verbs_mr.c b/prov/verbs/src/verbs_mr.c index 6a88d711692..215d412995e 100644 --- a/prov/verbs/src/verbs_mr.c +++ b/prov/verbs/src/verbs_mr.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2017-2019 Intel Corporation, Inc. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -33,39 +34,12 @@ #include #include "fi_verbs.h" - -static int -fi_ibv_mr_regv(struct fid *fid, const struct iovec *iov, - size_t count, uint64_t access, uint64_t offset, - uint64_t requested_key, uint64_t flags, - struct fid_mr **mr, void *context) -{ - struct fid_domain *domain = container_of(fid, struct fid_domain, fid); - - if (OFI_UNLIKELY(count > 1)) - return -FI_EINVAL; - - return count ? fi_mr_reg(domain, (const void *) iov->iov_base, - iov->iov_len, access, offset, requested_key, - flags, mr, context) : - fi_mr_reg(domain, NULL, 0, access, offset, requested_key, - flags, mr, context); -} - -static int fi_ibv_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, - uint64_t flags, struct fid_mr **mr) -{ - return fi_ibv_mr_regv(fid, attr->mr_iov, attr->iov_count, attr->access, - attr->offset, attr->requested_key, flags, mr, - attr->context); -} - -static int fi_ibv_mr_close(fid_t fid) +static int vrb_mr_close(fid_t fid) { - struct fi_ibv_mem_desc *mr; + struct vrb_mem_desc *mr; int ret; - mr = container_of(fid, struct fi_ibv_mem_desc, mr_fid.fid); + mr = container_of(fid, struct vrb_mem_desc, mr_fid.fid); if (!mr->mr) return 0; @@ -75,23 +49,31 @@ static int fi_ibv_mr_close(fid_t fid) return ret; } -static struct fi_ops fi_ibv_mr_fi_ops = { +static struct fi_ops vrb_mr_fi_ops = { .size = sizeof(struct fi_ops), - .close = fi_ibv_mr_close, + .close = vrb_mr_close, .bind = fi_no_bind, .control = fi_no_control, .ops_open = fi_no_ops_open, }; static inline -int fi_ibv_mr_reg_common(struct fi_ibv_mem_desc *md, int fi_ibv_access, - const void *buf, size_t len, void *context) +int vrb_mr_reg_common(struct vrb_mem_desc *md, int vrb_access, const void *buf, + size_t len, void *context, enum fi_hmem_iface iface, + uint64_t device) { /* ops should be set in special functions */ md->mr_fid.fid.fclass = FI_CLASS_MR; md->mr_fid.fid.context = context; + md->info.iface = iface; + md->info.device = device; + md->info.iov.iov_base = (void *) buf; + md->info.iov.iov_len = len; - md->mr = ibv_reg_mr(md->domain->pd, (void *) buf, len, fi_ibv_access); + if (md->domain->flags & VRB_USE_ODP && iface == FI_HMEM_SYSTEM) + vrb_access |= VRB_ACCESS_ON_DEMAND; + + md->mr = ibv_reg_mr(md->domain->pd, (void *) buf, len, vrb_access); if (!md->mr) { if (len) return -errno; @@ -99,8 +81,9 @@ int fi_ibv_mr_reg_common(struct fi_ibv_mem_desc *md, int fi_ibv_access, /* Ignore failure for zero length memory registration */ assert(errno == FI_EINVAL); } else { - md->mr_fid.mem_desc = (void *)(uintptr_t)md->mr->lkey; + md->mr_fid.mem_desc = md; 
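[Editor's note, illustrative only] With the change just above, mem_desc now carries the whole vrb_mem_desc rather than the raw lkey; applications keep treating the value from fi_mr_desc() as opaque, so nothing changes on their side. A sketch assuming domain, buf, and len exist elsewhere:

    struct fid_mr *mr;
    void *desc;
    uint64_t key;
    int ret;

    ret = fi_mr_reg(domain, buf, len,
                    FI_SEND | FI_RECV | FI_READ | FI_WRITE,
                    0, 0, 0, &mr, NULL);
    if (!ret) {
        desc = fi_mr_desc(mr);  /* opaque: now a vrb_mem_desc pointer */
        key  = fi_mr_key(mr);   /* rkey to advertise to RMA peers     */
    }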
md->mr_fid.key = md->mr->rkey; + md->lkey = md->mr->lkey; } if (md->domain->eq_flags & FI_REG_MR) { @@ -109,8 +92,8 @@ int fi_ibv_mr_reg_common(struct fi_ibv_mem_desc *md, int fi_ibv_access, .context = context, }; if (md->domain->eq) - fi_ibv_eq_write_event(md->domain->eq, FI_MR_COMPLETE, - &entry, sizeof(entry)); + vrb_eq_write_event(md->domain->eq, FI_MR_COMPLETE, + &entry, sizeof(entry)); else if (md->domain->util_domain.eq) /* This branch is taken for the verbs/DGRAM */ fi_eq_write(&md->domain->util_domain.eq->eq_fid, @@ -120,7 +103,7 @@ int fi_ibv_mr_reg_common(struct fi_ibv_mem_desc *md, int fi_ibv_access, } static inline int -fi_ibv_mr_ofi2ibv_access(uint64_t ofi_access, struct fi_ibv_domain *domain) +vrb_mr_ofi2ibv_access(uint64_t ofi_access, struct vrb_domain *domain) { int ibv_access = 0; @@ -152,26 +135,26 @@ fi_ibv_mr_ofi2ibv_access(uint64_t ofi_access, struct fi_ibv_domain *domain) } static int -fi_ibv_mr_reg(struct fid *fid, const void *buf, size_t len, - uint64_t access, uint64_t offset, uint64_t requested_key, - uint64_t flags, struct fid_mr **mr, void *context) +vrb_mr_nocache_reg(struct vrb_domain *domain, const void *buf, size_t len, + uint64_t access, uint64_t offset, uint64_t requested_key, + uint64_t flags, struct fid_mr **mr, void *context, + enum fi_hmem_iface iface, uint64_t device) { - struct fi_ibv_mem_desc *md; + struct vrb_mem_desc *md; int ret; - if (OFI_UNLIKELY(flags)) + if (OFI_UNLIKELY(flags & ~OFI_MR_NOCACHE)) return -FI_EBADFLAGS; md = calloc(1, sizeof(*md)); if (OFI_UNLIKELY(!md)) return -FI_ENOMEM; - md->domain = container_of(fid, struct fi_ibv_domain, - util_domain.domain_fid.fid); - md->mr_fid.fid.ops = &fi_ibv_mr_fi_ops; + md->domain = domain; + md->mr_fid.fid.ops = &vrb_mr_fi_ops; - ret = fi_ibv_mr_reg_common(md, fi_ibv_mr_ofi2ibv_access(access, md->domain), - buf, len, context); + ret = vrb_mr_reg_common(md, vrb_mr_ofi2ibv_access(access, md->domain), + buf, len, context, iface, device); if (OFI_UNLIKELY(ret)) goto err; @@ -182,71 +165,61 @@ fi_ibv_mr_reg(struct fid *fid, const void *buf, size_t len, return ret; } -static int fi_ibv_mr_cache_close(fid_t fid) +static int vrb_mr_cache_close(fid_t fid) { - struct fi_ibv_mem_desc *md = - container_of(fid, struct fi_ibv_mem_desc, mr_fid.fid); - + struct vrb_mem_desc *md = + container_of(fid, struct vrb_mem_desc, mr_fid.fid); ofi_mr_cache_delete(&md->domain->cache, md->entry); return FI_SUCCESS; } -struct fi_ops_mr fi_ibv_mr_ops = { - .size = sizeof(struct fi_ops_mr), - .reg = fi_ibv_mr_reg, - .regv = fi_ibv_mr_regv, - .regattr = fi_ibv_mr_regattr, -}; - -static struct fi_ops fi_ibv_mr_cache_fi_ops = { +static struct fi_ops vrb_mr_cache_fi_ops = { .size = sizeof(struct fi_ops), - .close = fi_ibv_mr_cache_close, + .close = vrb_mr_cache_close, .bind = fi_no_bind, .control = fi_no_control, .ops_open = fi_no_ops_open, }; -int fi_ibv_mr_cache_add_region(struct ofi_mr_cache *cache, +int vrb_mr_cache_add_region(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) { - struct fi_ibv_mem_desc *md = (struct fi_ibv_mem_desc *) entry->data; + struct vrb_mem_desc *md = (struct vrb_mem_desc *) entry->data; - md->domain = container_of(cache->domain, struct fi_ibv_domain, util_domain); - md->mr_fid.fid.ops = &fi_ibv_mr_cache_fi_ops; + md->domain = container_of(cache->domain, struct vrb_domain, util_domain); + md->mr_fid.fid.ops = &vrb_mr_cache_fi_ops; md->entry = entry; - return fi_ibv_mr_reg_common(md, IBV_ACCESS_LOCAL_WRITE | + return vrb_mr_reg_common(md, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | 
IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_REMOTE_READ, entry->info.iov.iov_base, - entry->info.iov.iov_len, NULL); + entry->info.iov.iov_len, NULL, entry->info.iface, + entry->info.device); } -void fi_ibv_mr_cache_delete_region(struct ofi_mr_cache *cache, +void vrb_mr_cache_delete_region(struct ofi_mr_cache *cache, struct ofi_mr_entry *entry) { - struct fi_ibv_mem_desc *md = (struct fi_ibv_mem_desc *)entry->data; + struct vrb_mem_desc *md = (struct vrb_mem_desc *)entry->data; if (md->mr) (void)ibv_dereg_mr(md->mr); } static int -fi_ibv_mr_cache_reg(struct fid *fid, const void *buf, size_t len, - uint64_t access, uint64_t offset, uint64_t requested_key, - uint64_t flags, struct fid_mr **mr, void *context) +vrb_mr_cache_reg(struct vrb_domain *domain, const void *buf, size_t len, + uint64_t access, uint64_t offset, uint64_t requested_key, + uint64_t flags, struct fid_mr **mr, void *context, + enum fi_hmem_iface iface, uint64_t device) { - struct fi_ibv_domain *domain; - struct fi_ibv_mem_desc *md; + struct vrb_mem_desc *md; struct ofi_mr_entry *entry; struct fi_mr_attr attr; struct iovec iov; int ret; - if (OFI_UNLIKELY(flags)) + if (flags & ~OFI_MR_NOCACHE) return -FI_EBADFLAGS; - domain = container_of(fid, struct fi_ibv_domain, - util_domain.domain_fid.fid); - attr.access = access; attr.context = context; attr.iov_count = 1; @@ -256,19 +229,99 @@ fi_ibv_mr_cache_reg(struct fid *fid, const void *buf, size_t len, attr.offset = offset; attr.requested_key = requested_key; attr.auth_key_size = 0; + attr.iface = iface; + attr.device.reserved = device; - ret = ofi_mr_cache_search(&domain->cache, &attr, &entry); + ret = (flags & OFI_MR_NOCACHE) ? + ofi_mr_cache_reg(&domain->cache, &attr, &entry) : + ofi_mr_cache_search(&domain->cache, &attr, &entry); if (OFI_UNLIKELY(ret)) return ret; - md = (struct fi_ibv_mem_desc *) entry->data; + md = (struct vrb_mem_desc *) entry->data; *mr = &md->mr_fid; return FI_SUCCESS; } -struct fi_ops_mr fi_ibv_mr_cache_ops = { +static int +vrb_mr_reg_iface(struct fid *fid, const void *buf, size_t len, uint64_t access, + uint64_t offset, uint64_t requested_key, uint64_t flags, + struct fid_mr **mr, void *context, enum fi_hmem_iface iface, + uint64_t device) +{ + struct vrb_domain *domain; + + domain = container_of(fid, struct vrb_domain, + util_domain.domain_fid.fid); + + if (domain->cache.monitors[iface]) + return vrb_mr_cache_reg(domain, buf, len, access, offset, + requested_key, flags, mr, context, + iface, device); + else + return vrb_mr_nocache_reg(domain, buf, len, access, offset, + requested_key, flags, mr, context, + iface, device); +} + +static int +vrb_mr_regv_iface(struct fid *fid, const struct iovec *iov, size_t count, + uint64_t access, uint64_t offset, uint64_t requested_key, + uint64_t flags, struct fid_mr **mr, void *context, + enum fi_hmem_iface iface, uint64_t device) +{ + const void *addr = count ? iov->iov_base: NULL; + size_t len = count ? 
iov->iov_len : 0; + + if (OFI_UNLIKELY(count > 1)) + return -FI_EINVAL; + + return vrb_mr_reg_iface(fid, addr, len, access, offset, requested_key, + flags, mr, context, iface, device); +} + +static int +vrb_mr_reg(struct fid *fid, const void *buf, size_t len, uint64_t access, + uint64_t offset, uint64_t requested_key, uint64_t flags, + struct fid_mr **mr, void *context) +{ + return vrb_mr_reg_iface(fid, buf, len, access, offset, requested_key, + flags, mr, context, FI_HMEM_SYSTEM, 0); +} + +static int +vrb_mr_regv(struct fid *fid, const struct iovec *iov, size_t count, + uint64_t access, uint64_t offset, uint64_t requested_key, + uint64_t flags, struct fid_mr **mr, void *context) +{ + return vrb_mr_regv_iface(fid, iov, count, access, offset, requested_key, + flags, mr, context, FI_HMEM_SYSTEM, 0); +} + +static int vrb_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, + uint64_t flags, struct fid_mr **mr) +{ + struct vrb_domain *domain; + struct fi_mr_attr cur_abi_attr; + + domain = container_of(fid, struct vrb_domain, + util_domain.domain_fid.fid); + + ofi_mr_update_attr(domain->util_domain.fabric->fabric_fid.api_version, + domain->util_domain.info_domain_caps, attr, + &cur_abi_attr); + + return vrb_mr_regv_iface(fid, cur_abi_attr.mr_iov, + cur_abi_attr.iov_count, cur_abi_attr.access, + cur_abi_attr.offset, + cur_abi_attr.requested_key, flags, mr, + cur_abi_attr.context, cur_abi_attr.iface, + cur_abi_attr.device.reserved); +} + +struct fi_ops_mr vrb_mr_ops = { .size = sizeof(struct fi_ops_mr), - .reg = fi_ibv_mr_cache_reg, - .regv = fi_ibv_mr_regv, - .regattr = fi_ibv_mr_regattr, + .reg = vrb_mr_reg, + .regv = vrb_mr_regv, + .regattr = vrb_mr_regattr, }; diff --git a/prov/verbs/src/verbs_msg.c b/prov/verbs/src/verbs_msg.c index e8b79b50a09..17fc9a534a7 100644 --- a/prov/verbs/src/verbs_msg.c +++ b/prov/verbs/src/verbs_msg.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2013-2018 Intel Corporation, Inc. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. 
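[Editor's illustration, not part of the patch] vrb_mr_regattr() above now runs the caller's attributes through ofi_mr_update_attr(), so the iface/device fields reach registration. A sketch of registering device memory this way; cuda_buf, len, and domain are assumptions:

    struct fi_mr_attr attr = {0};
    struct iovec iov = { .iov_base = cuda_buf, .iov_len = len };
    struct fid_mr *mr;
    int ret;

    attr.mr_iov      = &iov;
    attr.iov_count   = 1;
    attr.access      = FI_SEND | FI_RECV | FI_WRITE | FI_REMOTE_WRITE;
    attr.iface       = FI_HMEM_CUDA;  /* memory lives on a CUDA device */
    attr.device.cuda = 0;             /* device ordinal                */

    ret = fi_mr_regattr(domain, &attr, 0, &mr);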
You may choose to be licensed under the terms of the GNU @@ -36,46 +37,39 @@ static inline ssize_t -fi_ibv_msg_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) +vrb_msg_ep_recvmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_recv_wr wr = { .wr_id = (uintptr_t)msg->context, .num_sge = msg->iov_count, .next = NULL, }; - struct ibv_recv_wr *bad_wr; - assert(ep->util_ep.rx_cq); - - fi_ibv_set_sge_iov(wr.sg_list, msg->msg_iov, msg->iov_count, msg->desc); - - return fi_ibv_handle_post(ibv_post_recv(ep->ibv_qp, &wr, &bad_wr)); + vrb_iov_dupa(wr.sg_list, msg->msg_iov, msg->desc, msg->iov_count); + return vrb_post_recv(ep, &wr); } static ssize_t -fi_ibv_msg_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, +vrb_msg_ep_recv(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, void *context) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); - struct ibv_sge sge = fi_ibv_init_sge(buf, len, desc); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); + struct ibv_sge sge = vrb_init_sge(buf, len, desc); struct ibv_recv_wr wr = { .wr_id = (uintptr_t)context, .num_sge = 1, .sg_list = &sge, .next = NULL, }; - struct ibv_recv_wr *bad_wr; - - assert(ep->util_ep.rx_cq); - return fi_ibv_handle_post(ibv_post_recv(ep->ibv_qp, &wr, &bad_wr)); + return vrb_post_recv(ep, &wr); } static ssize_t -fi_ibv_msg_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, +vrb_msg_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, void *context) { struct fi_msg msg = { @@ -86,14 +80,14 @@ fi_ibv_msg_ep_recvv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, .context = context, }; - return fi_ibv_msg_ep_recvmsg(ep_fid, &msg, 0); + return vrb_msg_ep_recvmsg(ep_fid, &msg, 0); } static ssize_t -fi_ibv_msg_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) +vrb_msg_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = (uintptr_t)msg->context, }; @@ -105,73 +99,75 @@ fi_ibv_msg_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t wr.opcode = IBV_WR_SEND; } - return fi_ibv_send_msg(ep, &wr, msg, flags); + return vrb_send_iov(ep, &wr, msg->msg_iov, msg->desc, + msg->iov_count, flags); } static ssize_t -fi_ibv_msg_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_msg_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, void *context) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(ep, (uintptr_t)context), .opcode = IBV_WR_SEND, - .send_flags = VERBS_INJECT(ep, len), + .send_flags = VERBS_INJECT(ep, len, desc), }; - return fi_ibv_send_buf(ep, &wr, buf, len, desc); + return vrb_send_buf(ep, &wr, buf, len, desc); } static ssize_t -fi_ibv_msg_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_msg_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, 
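[Editor's note, illustrative only] The receive paths above now funnel through vrb_post_recv(), but the application-facing pattern is unchanged: receives are pre-posted with the descriptor of the covering MR. A minimal sketch; ep, rbuf, rlen, mr, and rctx are assumed:

    /* src_addr is ignored for connected FI_EP_MSG endpoints. */
    ret = fi_recv(ep, rbuf, rlen, fi_mr_desc(mr), 0, &rctx);
    if (ret == -FI_EAGAIN) {
        /* RX queue full: reap completions, then repost. */
    }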
uint64_t data, fi_addr_t dest_addr, void *context) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(ep, (uintptr_t)context), .opcode = IBV_WR_SEND_WITH_IMM, .imm_data = htonl((uint32_t)data), - .send_flags = VERBS_INJECT(ep, len), + .send_flags = VERBS_INJECT(ep, len, desc), }; - return fi_ibv_send_buf(ep, &wr, buf, len, desc); + return vrb_send_buf(ep, &wr, buf, len, desc); } static ssize_t -fi_ibv_msg_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, +vrb_msg_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, void *context) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = (uintptr_t)context, .opcode = IBV_WR_SEND, }; - return fi_ibv_send_iov(ep, &wr, iov, desc, count); + return vrb_send_iov(ep, &wr, iov, desc, count, + ep->util_ep.tx_op_flags); } -static ssize_t fi_ibv_msg_ep_inject(struct fid_ep *ep_fid, const void *buf, size_t len, +static ssize_t vrb_msg_ep_inject(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_NO_COMP_FLAG, .opcode = IBV_WR_SEND, .send_flags = IBV_SEND_INLINE, }; - return fi_ibv_send_buf_inline(ep, &wr, buf, len); + return vrb_send_buf(ep, &wr, buf, len, NULL); } -static ssize_t fi_ibv_msg_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len, +static ssize_t vrb_msg_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_NO_COMP_FLAG, .opcode = IBV_WR_SEND_WITH_IMM, @@ -179,28 +175,28 @@ static ssize_t fi_ibv_msg_ep_injectdata(struct fid_ep *ep_fid, const void *buf, .send_flags = IBV_SEND_INLINE, }; - return fi_ibv_send_buf_inline(ep, &wr, buf, len); + return vrb_send_buf(ep, &wr, buf, len, NULL); } static ssize_t -fi_ibv_msg_inject_fast(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_msg_inject_fast(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); ep->wrs->sge.addr = (uintptr_t) buf; ep->wrs->sge.length = (uint32_t) len; - return fi_ibv_send_poll_cq_if_needed(ep, &ep->wrs->msg_wr); + return vrb_post_send(ep, &ep->wrs->msg_wr, 0); } -static ssize_t fi_ibv_msg_ep_injectdata_fast(struct fid_ep *ep_fid, const void *buf, size_t len, +static ssize_t vrb_msg_ep_injectdata_fast(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr) { ssize_t ret; - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); ep->wrs->msg_wr.imm_data = htonl((uint32_t)data); ep->wrs->msg_wr.opcode = IBV_WR_SEND_WITH_IMM; @@ -208,47 +204,47 @@ static ssize_t fi_ibv_msg_ep_injectdata_fast(struct fid_ep *ep_fid, const void 
* ep->wrs->sge.addr = (uintptr_t) buf; ep->wrs->sge.length = (uint32_t) len; - ret = fi_ibv_send_poll_cq_if_needed(ep, &ep->wrs->msg_wr); + ret = vrb_post_send(ep, &ep->wrs->msg_wr, 0); ep->wrs->msg_wr.opcode = IBV_WR_SEND; return ret; } -const struct fi_ops_msg fi_ibv_msg_ep_msg_ops_ts = { +const struct fi_ops_msg vrb_msg_ep_msg_ops_ts = { .size = sizeof(struct fi_ops_msg), - .recv = fi_ibv_msg_ep_recv, - .recvv = fi_ibv_msg_ep_recvv, - .recvmsg = fi_ibv_msg_ep_recvmsg, - .send = fi_ibv_msg_ep_send, - .sendv = fi_ibv_msg_ep_sendv, - .sendmsg = fi_ibv_msg_ep_sendmsg, - .inject = fi_ibv_msg_ep_inject, - .senddata = fi_ibv_msg_ep_senddata, - .injectdata = fi_ibv_msg_ep_injectdata, + .recv = vrb_msg_ep_recv, + .recvv = vrb_msg_ep_recvv, + .recvmsg = vrb_msg_ep_recvmsg, + .send = vrb_msg_ep_send, + .sendv = vrb_msg_ep_sendv, + .sendmsg = vrb_msg_ep_sendmsg, + .inject = vrb_msg_ep_inject, + .senddata = vrb_msg_ep_senddata, + .injectdata = vrb_msg_ep_injectdata, }; -const struct fi_ops_msg fi_ibv_msg_ep_msg_ops = { +const struct fi_ops_msg vrb_msg_ep_msg_ops = { .size = sizeof(struct fi_ops_msg), - .recv = fi_ibv_msg_ep_recv, - .recvv = fi_ibv_msg_ep_recvv, - .recvmsg = fi_ibv_msg_ep_recvmsg, - .send = fi_ibv_msg_ep_send, - .sendv = fi_ibv_msg_ep_sendv, - .sendmsg = fi_ibv_msg_ep_sendmsg, - .inject = fi_ibv_msg_inject_fast, - .senddata = fi_ibv_msg_ep_senddata, - .injectdata = fi_ibv_msg_ep_injectdata_fast, + .recv = vrb_msg_ep_recv, + .recvv = vrb_msg_ep_recvv, + .recvmsg = vrb_msg_ep_recvmsg, + .send = vrb_msg_ep_send, + .sendv = vrb_msg_ep_sendv, + .sendmsg = vrb_msg_ep_sendmsg, + .inject = vrb_msg_inject_fast, + .senddata = vrb_msg_ep_senddata, + .injectdata = vrb_msg_ep_injectdata_fast, }; static ssize_t -fi_ibv_msg_xrc_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) +vrb_msg_xrc_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint64_t flags) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = (uintptr_t)msg->context, }; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); if (flags & FI_REMOTE_CQ_DATA) { wr.opcode = IBV_WR_SEND_WITH_IMM; @@ -257,64 +253,66 @@ fi_ibv_msg_xrc_ep_sendmsg(struct fid_ep *ep_fid, const struct fi_msg *msg, uint6 wr.opcode = IBV_WR_SEND; } - return fi_ibv_send_msg(&ep->base_ep, &wr, msg, flags); + return vrb_send_iov(&ep->base_ep, &wr, msg->msg_iov, msg->desc, + msg->iov_count, flags); } static ssize_t -fi_ibv_msg_xrc_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_msg_xrc_ep_send(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, void *context) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(&ep->base_ep, (uintptr_t)context), .opcode = IBV_WR_SEND, - .send_flags = VERBS_INJECT(&ep->base_ep, len), + .send_flags = VERBS_INJECT(&ep->base_ep, len, desc), }; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); - return fi_ibv_send_buf(&ep->base_ep, &wr, buf, len, desc); + return vrb_send_buf(&ep->base_ep, &wr, buf, len, desc); } static ssize_t -fi_ibv_msg_xrc_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_msg_xrc_ep_senddata(struct fid_ep *ep_fid, const void *buf, size_t len, void 
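[Editor's illustration, not part of the patch] The inject paths above (vrb_msg_inject_fast and friends) back fi_inject(): the payload is copied inline at post time and no local completion is written. A hedged comparison; ep, big_buf, big_len, mr, and ctx are assumptions:

    char small[64] = "ping";

    /* Below inject_size: buffer is reusable on return, no completion. */
    ret = fi_inject(ep, small, sizeof(small), 0);

    /* Larger payloads need an MR descriptor and generate a completion. */
    ret = fi_send(ep, big_buf, big_len, fi_mr_desc(mr), 0, &ctx);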
*desc, uint64_t data, fi_addr_t dest_addr, void *context) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(&ep->base_ep, (uintptr_t)context), .opcode = IBV_WR_SEND_WITH_IMM, .imm_data = htonl((uint32_t)data), - .send_flags = VERBS_INJECT(&ep->base_ep, len), + .send_flags = VERBS_INJECT(&ep->base_ep, len, desc), }; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); - return fi_ibv_send_buf(&ep->base_ep, &wr, buf, len, desc); + return vrb_send_buf(&ep->base_ep, &wr, buf, len, desc); } static ssize_t -fi_ibv_msg_xrc_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, +vrb_msg_xrc_ep_sendv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, void *context) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = (uintptr_t)context, .opcode = IBV_WR_SEND, }; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); - return fi_ibv_send_iov(&ep->base_ep, &wr, iov, desc, count); + return vrb_send_iov(&ep->base_ep, &wr, iov, desc, count, + ep->base_ep.util_ep.tx_op_flags); } -static ssize_t fi_ibv_msg_xrc_ep_inject(struct fid_ep *ep_fid, const void *buf, size_t len, +static ssize_t vrb_msg_xrc_ep_inject(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_NO_COMP_FLAG, @@ -322,15 +320,15 @@ static ssize_t fi_ibv_msg_xrc_ep_inject(struct fid_ep *ep_fid, const void *buf, .send_flags = IBV_SEND_INLINE, }; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); - return fi_ibv_send_buf_inline(&ep->base_ep, &wr, buf, len); + return vrb_send_buf(&ep->base_ep, &wr, buf, len, NULL); } -static ssize_t fi_ibv_msg_xrc_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len, +static ssize_t vrb_msg_xrc_ep_injectdata(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_NO_COMP_FLAG, @@ -339,13 +337,13 @@ static ssize_t fi_ibv_msg_xrc_ep_injectdata(struct fid_ep *ep_fid, const void *b .send_flags = IBV_SEND_INLINE, }; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); - return fi_ibv_send_buf_inline(&ep->base_ep, &wr, buf, len); + return vrb_send_buf(&ep->base_ep, &wr, buf, len, NULL); } /* NOTE: Initially the XRC endpoint must be used with a SRQ. 
*/ -const struct fi_ops_msg fi_ibv_msg_xrc_ep_msg_ops_ts = { +const struct fi_ops_msg vrb_msg_xrc_ep_msg_ops_ts = { .size = sizeof(struct fi_ops_msg), .recv = fi_no_msg_recv, .recvv = fi_no_msg_recvv, @@ -358,7 +356,7 @@ const struct fi_ops_msg fi_ibv_msg_xrc_ep_msg_ops_ts = { .injectdata = fi_no_msg_injectdata, }; -const struct fi_ops_msg fi_ibv_msg_xrc_ep_msg_ops = { +const struct fi_ops_msg vrb_msg_xrc_ep_msg_ops = { .size = sizeof(struct fi_ops_msg), .recv = fi_no_msg_recv, .recvv = fi_no_msg_recvv, @@ -371,15 +369,15 @@ const struct fi_ops_msg fi_ibv_msg_xrc_ep_msg_ops = { .injectdata = fi_no_msg_injectdata, }; -const struct fi_ops_msg fi_ibv_msg_srq_xrc_ep_msg_ops = { +const struct fi_ops_msg vrb_msg_srq_xrc_ep_msg_ops = { .size = sizeof(struct fi_ops_msg), .recv = fi_no_msg_recv, .recvv = fi_no_msg_recvv, .recvmsg = fi_no_msg_recvmsg, - .send = fi_ibv_msg_xrc_ep_send, - .sendv = fi_ibv_msg_xrc_ep_sendv, - .sendmsg = fi_ibv_msg_xrc_ep_sendmsg, - .inject = fi_ibv_msg_xrc_ep_inject, - .senddata = fi_ibv_msg_xrc_ep_senddata, - .injectdata = fi_ibv_msg_xrc_ep_injectdata, + .send = vrb_msg_xrc_ep_send, + .sendv = vrb_msg_xrc_ep_sendv, + .sendmsg = vrb_msg_xrc_ep_sendmsg, + .inject = vrb_msg_xrc_ep_inject, + .senddata = vrb_msg_xrc_ep_senddata, + .injectdata = vrb_msg_xrc_ep_injectdata, }; diff --git a/prov/verbs/src/verbs_rma.c b/prov/verbs/src/verbs_rma.c index 702c8959d47..074baffcc54 100644 --- a/prov/verbs/src/verbs_rma.c +++ b/prov/verbs/src/verbs_rma.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2013-2018 Intel Corporation, Inc. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -44,30 +45,30 @@ VERBS_COMP_READ_FLAGS(ep, 0, context) static ssize_t -fi_ibv_msg_ep_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_msg_ep_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(ep, (uintptr_t)context), .opcode = IBV_WR_RDMA_WRITE, .wr.rdma.remote_addr = addr, .wr.rdma.rkey = (uint32_t)key, - .send_flags = VERBS_INJECT(ep, len), + .send_flags = VERBS_INJECT(ep, len, desc), }; - return fi_ibv_send_buf(ep, &wr, buf, len, desc); + return vrb_send_buf(ep, &wr, buf, len, desc); } static ssize_t -fi_ibv_msg_ep_rma_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, +vrb_msg_ep_rma_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = (uintptr_t)context, .opcode = IBV_WR_RDMA_WRITE, @@ -75,15 +76,16 @@ fi_ibv_msg_ep_rma_writev(struct fid_ep *ep_fid, const struct iovec *iov, void ** .wr.rdma.rkey = (uint32_t)key, }; - return fi_ibv_send_iov(ep, &wr, iov, desc, count); + return vrb_send_iov(ep, &wr, iov, desc, count, + ep->util_ep.tx_op_flags); } static ssize_t -fi_ibv_msg_ep_rma_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, +vrb_msg_ep_rma_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) 
{ - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = (uintptr_t)msg->context, .wr.rdma.remote_addr = msg->rma_iov->addr, @@ -97,16 +99,17 @@ fi_ibv_msg_ep_rma_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, wr.opcode = IBV_WR_RDMA_WRITE; } - return fi_ibv_send_msg(ep, &wr, msg, flags); + return vrb_send_iov(ep, &wr, msg->msg_iov, msg->desc, + msg->iov_count, flags); } static ssize_t -fi_ibv_msg_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len, +vrb_msg_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP_READ(ep, (uintptr_t)context), .opcode = IBV_WR_RDMA_READ, @@ -114,16 +117,16 @@ fi_ibv_msg_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len, .wr.rdma.rkey = (uint32_t)key, }; - return fi_ibv_send_buf(ep, &wr, buf, len, desc); + return vrb_send_buf(ep, &wr, buf, len, desc); } static ssize_t -fi_ibv_msg_ep_rma_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, +vrb_msg_ep_rma_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP_READ(ep, (uintptr_t)context), .opcode = IBV_WR_RDMA_READ, @@ -132,17 +135,16 @@ fi_ibv_msg_ep_rma_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **d .num_sge = count, }; - fi_ibv_set_sge_iov(wr.sg_list, iov, count, desc); - - return fi_ibv_send_poll_cq_if_needed(ep, &wr); + vrb_iov_dupa(wr.sg_list, iov, desc, count); + return vrb_post_send(ep, &wr, 0); } static ssize_t -fi_ibv_msg_ep_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, +vrb_msg_ep_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP_READ_FLAGS(ep, flags, (uintptr_t)msg->context), .opcode = IBV_WR_RDMA_READ, @@ -151,36 +153,35 @@ fi_ibv_msg_ep_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, .num_sge = msg->iov_count, }; - fi_ibv_set_sge_iov(wr.sg_list, msg->msg_iov, msg->iov_count, msg->desc); - - return fi_ibv_send_poll_cq_if_needed(ep, &wr); + vrb_iov_dupa(wr.sg_list, msg->msg_iov, msg->desc, msg->iov_count); + return vrb_post_send(ep, &wr, 0); } static ssize_t -fi_ibv_msg_ep_rma_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_msg_ep_rma_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(ep, (uintptr_t)context), .opcode = IBV_WR_RDMA_WRITE_WITH_IMM, .imm_data = htonl((uint32_t)data), .wr.rdma.remote_addr = addr, .wr.rdma.rkey = 
(uint32_t)key, - .send_flags = VERBS_INJECT(ep, len), + .send_flags = VERBS_INJECT(ep, len, desc), }; - return fi_ibv_send_buf(ep, &wr, buf, len, desc); + return vrb_send_buf(ep, &wr, buf, len, desc); } static ssize_t -fi_ibv_msg_ep_rma_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_msg_ep_rma_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_NO_COMP_FLAG, .opcode = IBV_WR_RDMA_WRITE, @@ -189,16 +190,16 @@ fi_ibv_msg_ep_rma_inject_write(struct fid_ep *ep_fid, const void *buf, size_t le .send_flags = IBV_SEND_INLINE, }; - return fi_ibv_send_buf_inline(ep, &wr, buf, len); + return vrb_send_buf(ep, &wr, buf, len, NULL); } static ssize_t -fi_ibv_rma_write_fast(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_rma_write_fast(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { - struct fi_ibv_ep *ep; + struct vrb_ep *ep; - ep = container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + ep = container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); ep->wrs->rma_wr.wr.rdma.remote_addr = addr; ep->wrs->rma_wr.wr.rdma.rkey = (uint32_t) key; @@ -206,16 +207,16 @@ fi_ibv_rma_write_fast(struct fid_ep *ep_fid, const void *buf, size_t len, ep->wrs->sge.addr = (uintptr_t) buf; ep->wrs->sge.length = (uint32_t) len; - return fi_ibv_send_poll_cq_if_needed(ep, &ep->wrs->rma_wr); + return vrb_post_send(ep, &ep->wrs->rma_wr, 0); } static ssize_t -fi_ibv_msg_ep_rma_inject_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_msg_ep_rma_inject_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_NO_COMP_FLAG, .opcode = IBV_WR_RDMA_WRITE_WITH_IMM, @@ -225,17 +226,17 @@ fi_ibv_msg_ep_rma_inject_writedata(struct fid_ep *ep_fid, const void *buf, size_ .send_flags = IBV_SEND_INLINE, }; - return fi_ibv_send_buf_inline(ep, &wr, buf, len); + return vrb_send_buf(ep, &wr, buf, len, NULL); } static ssize_t -fi_ibv_msg_ep_rma_inject_writedata_fast(struct fid_ep *ep_fid, const void *buf, size_t len, +vrb_msg_ep_rma_inject_writedata_fast(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { ssize_t ret; - struct fi_ibv_ep *ep = - container_of(ep_fid, struct fi_ibv_ep, util_ep.ep_fid); + struct vrb_ep *ep = + container_of(ep_fid, struct vrb_ep, util_ep.ep_fid); ep->wrs->rma_wr.wr.rdma.remote_addr = addr; ep->wrs->rma_wr.wr.rdma.rkey = (uint32_t) key; @@ -245,63 +246,63 @@ fi_ibv_msg_ep_rma_inject_writedata_fast(struct fid_ep *ep_fid, const void *buf, ep->wrs->sge.addr = (uintptr_t) buf; ep->wrs->sge.length = (uint32_t) len; - ret = fi_ibv_send_poll_cq_if_needed(ep, &ep->wrs->rma_wr); + ret = vrb_post_send(ep, &ep->wrs->rma_wr, 0); ep->wrs->rma_wr.opcode = IBV_WR_RDMA_WRITE; return ret; } -struct fi_ops_rma fi_ibv_msg_ep_rma_ops_ts = { +struct fi_ops_rma vrb_msg_ep_rma_ops_ts = { .size = sizeof(struct fi_ops_rma), - .read = fi_ibv_msg_ep_rma_read, - .readv = fi_ibv_msg_ep_rma_readv, - .readmsg = fi_ibv_msg_ep_rma_readmsg, - .write = 
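[Editor's illustration, not part of the patch] The RMA write variants above map onto the usual fi_write family. A sketch under stated assumptions (remote_addr and remote_key come from the peer, typically via fi_mr_key(); ep, src, len, mr, and ctx are placeholders):

    /* Ordinary write: local descriptor plus the peer's address and key. */
    ret = fi_write(ep, src, len, fi_mr_desc(mr), 0,
                   remote_addr, remote_key, &ctx);

    /* Small writes can go inline, mirroring vrb_rma_write_fast() above. */
    ret = fi_inject_write(ep, src, 32, 0, remote_addr, remote_key);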
fi_ibv_msg_ep_rma_write, - .writev = fi_ibv_msg_ep_rma_writev, - .writemsg = fi_ibv_msg_ep_rma_writemsg, - .inject = fi_ibv_msg_ep_rma_inject_write, - .writedata = fi_ibv_msg_ep_rma_writedata, - .injectdata = fi_ibv_msg_ep_rma_inject_writedata, + .read = vrb_msg_ep_rma_read, + .readv = vrb_msg_ep_rma_readv, + .readmsg = vrb_msg_ep_rma_readmsg, + .write = vrb_msg_ep_rma_write, + .writev = vrb_msg_ep_rma_writev, + .writemsg = vrb_msg_ep_rma_writemsg, + .inject = vrb_msg_ep_rma_inject_write, + .writedata = vrb_msg_ep_rma_writedata, + .injectdata = vrb_msg_ep_rma_inject_writedata, }; -struct fi_ops_rma fi_ibv_msg_ep_rma_ops = { +struct fi_ops_rma vrb_msg_ep_rma_ops = { .size = sizeof(struct fi_ops_rma), - .read = fi_ibv_msg_ep_rma_read, - .readv = fi_ibv_msg_ep_rma_readv, - .readmsg = fi_ibv_msg_ep_rma_readmsg, - .write = fi_ibv_msg_ep_rma_write, - .writev = fi_ibv_msg_ep_rma_writev, - .writemsg = fi_ibv_msg_ep_rma_writemsg, - .inject = fi_ibv_rma_write_fast, - .writedata = fi_ibv_msg_ep_rma_writedata, - .injectdata = fi_ibv_msg_ep_rma_inject_writedata_fast, + .read = vrb_msg_ep_rma_read, + .readv = vrb_msg_ep_rma_readv, + .readmsg = vrb_msg_ep_rma_readmsg, + .write = vrb_msg_ep_rma_write, + .writev = vrb_msg_ep_rma_writev, + .writemsg = vrb_msg_ep_rma_writemsg, + .inject = vrb_rma_write_fast, + .writedata = vrb_msg_ep_rma_writedata, + .injectdata = vrb_msg_ep_rma_inject_writedata_fast, }; static ssize_t -fi_ibv_msg_xrc_ep_rma_write(struct fid_ep *ep_fid, const void *buf, +vrb_msg_xrc_ep_rma_write(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(&ep->base_ep, (uintptr_t)context), .opcode = IBV_WR_RDMA_WRITE, .wr.rdma.remote_addr = addr, .wr.rdma.rkey = (uint32_t)key, - .send_flags = VERBS_INJECT(&ep->base_ep, len), + .send_flags = VERBS_INJECT(&ep->base_ep, len, desc), }; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); - return fi_ibv_send_buf(&ep->base_ep, &wr, buf, len, desc); + return vrb_send_buf(&ep->base_ep, &wr, buf, len, desc); } static ssize_t -fi_ibv_msg_xrc_ep_rma_writev(struct fid_ep *ep_fid, const struct iovec *iov, +vrb_msg_xrc_ep_rma_writev(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = (uintptr_t)context, @@ -310,16 +311,17 @@ fi_ibv_msg_xrc_ep_rma_writev(struct fid_ep *ep_fid, const struct iovec *iov, .wr.rdma.rkey = (uint32_t)key, }; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); - return fi_ibv_send_iov(&ep->base_ep, &wr, iov, desc, count); + return vrb_send_iov(&ep->base_ep, &wr, iov, desc, count, + ep->base_ep.util_ep.tx_op_flags); } static ssize_t -fi_ibv_msg_xrc_ep_rma_writemsg(struct fid_ep *ep_fid, +vrb_msg_xrc_ep_rma_writemsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = (uintptr_t)msg->context, @@ 
-327,7 +329,7 @@ fi_ibv_msg_xrc_ep_rma_writemsg(struct fid_ep *ep_fid, .wr.rdma.rkey = (uint32_t)msg->rma_iov->key, }; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); if (flags & FI_REMOTE_CQ_DATA) { wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; @@ -336,15 +338,16 @@ fi_ibv_msg_xrc_ep_rma_writemsg(struct fid_ep *ep_fid, wr.opcode = IBV_WR_RDMA_WRITE; } - return fi_ibv_send_msg(&ep->base_ep, &wr, msg, flags); + return vrb_send_iov(&ep->base_ep, &wr, msg->msg_iov, msg->desc, + msg->iov_count, flags); } static ssize_t -fi_ibv_msg_xrc_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len, +vrb_msg_xrc_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len, void *desc, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP_READ(&ep->base_ep, (uintptr_t)context), @@ -353,17 +356,17 @@ fi_ibv_msg_xrc_ep_rma_read(struct fid_ep *ep_fid, void *buf, size_t len, .wr.rdma.rkey = (uint32_t)key, }; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); - return fi_ibv_send_buf(&ep->base_ep, &wr, buf, len, desc); + return vrb_send_buf(&ep->base_ep, &wr, buf, len, desc); } static ssize_t -fi_ibv_msg_xrc_ep_rma_readv(struct fid_ep *ep_fid, const struct iovec *iov, +vrb_msg_xrc_ep_rma_readv(struct fid_ep *ep_fid, const struct iovec *iov, void **desc, size_t count, fi_addr_t src_addr, uint64_t addr, uint64_t key, void *context) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP_READ(&ep->base_ep, (uintptr_t)context), @@ -373,18 +376,17 @@ fi_ibv_msg_xrc_ep_rma_readv(struct fid_ep *ep_fid, const struct iovec *iov, .num_sge = count, }; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); - fi_ibv_set_sge_iov(wr.sg_list, iov, count, desc); - - return fi_ibv_send_poll_cq_if_needed(&ep->base_ep, &wr); + vrb_iov_dupa(wr.sg_list, iov, desc, count); + return vrb_post_send(&ep->base_ep, &wr, 0); } static ssize_t -fi_ibv_msg_xrc_ep_rma_readmsg(struct fid_ep *ep_fid, +vrb_msg_xrc_ep_rma_readmsg(struct fid_ep *ep_fid, const struct fi_msg_rma *msg, uint64_t flags) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP_READ_FLAGS(&ep->base_ep, flags, @@ -395,19 +397,18 @@ fi_ibv_msg_xrc_ep_rma_readmsg(struct fid_ep *ep_fid, .num_sge = msg->iov_count, }; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); - - fi_ibv_set_sge_iov(wr.sg_list, msg->msg_iov, msg->iov_count, msg->desc); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); - return fi_ibv_send_poll_cq_if_needed(&ep->base_ep, &wr); + vrb_iov_dupa(wr.sg_list, msg->msg_iov, msg->desc, msg->iov_count); + return vrb_post_send(&ep->base_ep, &wr, flags); } static ssize_t -fi_ibv_msg_xrc_ep_rma_writedata(struct fid_ep *ep_fid, const void *buf, +vrb_msg_xrc_ep_rma_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, void *desc, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key, void *context) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, 
base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_COMP(&ep->base_ep, (uintptr_t)context), @@ -415,20 +416,20 @@ fi_ibv_msg_xrc_ep_rma_writedata(struct fid_ep *ep_fid, const void *buf, .imm_data = htonl((uint32_t)data), .wr.rdma.remote_addr = addr, .wr.rdma.rkey = (uint32_t)key, - .send_flags = VERBS_INJECT(&ep->base_ep, len), + .send_flags = VERBS_INJECT(&ep->base_ep, len, desc), }; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); - return fi_ibv_send_buf(&ep->base_ep, &wr, buf, len, desc); + return vrb_send_buf(&ep->base_ep, &wr, buf, len, desc); } static ssize_t -fi_ibv_msg_xrc_ep_rma_inject_write(struct fid_ep *ep_fid, const void *buf, +vrb_msg_xrc_ep_rma_inject_write(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { .wr_id = VERBS_NO_COMP_FLAG, @@ -438,34 +439,33 @@ fi_ibv_msg_xrc_ep_rma_inject_write(struct fid_ep *ep_fid, const void *buf, .send_flags = IBV_SEND_INLINE, }; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); - return fi_ibv_send_buf_inline(&ep->base_ep, &wr, buf, len); + return vrb_send_buf(&ep->base_ep, &wr, buf, len, NULL); } static ssize_t -fi_ibv_xrc_rma_write_fast(struct fid_ep *ep_fid, const void *buf, +vrb_xrc_rma_write_fast(struct fid_ep *ep_fid, const void *buf, size_t len, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); ep->base_ep.wrs->rma_wr.wr.rdma.remote_addr = addr; ep->base_ep.wrs->rma_wr.wr.rdma.rkey = (uint32_t) key; - FI_IBV_SET_REMOTE_SRQN(ep->base_ep.wrs->rma_wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(ep->base_ep.wrs->rma_wr, ep->peer_srqn); ep->base_ep.wrs->sge.addr = (uintptr_t) buf; ep->base_ep.wrs->sge.length = (uint32_t) len; - return fi_ibv_send_poll_cq_if_needed(&ep->base_ep, - &ep->base_ep.wrs->rma_wr); + return vrb_post_send(&ep->base_ep, &ep->base_ep.wrs->rma_wr, 0); } static ssize_t -fi_ibv_msg_xrc_ep_rma_inject_writedata(struct fid_ep *ep_fid, +vrb_msg_xrc_ep_rma_inject_writedata(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); struct ibv_send_wr wr = { @@ -477,22 +477,22 @@ fi_ibv_msg_xrc_ep_rma_inject_writedata(struct fid_ep *ep_fid, .send_flags = IBV_SEND_INLINE, }; - FI_IBV_SET_REMOTE_SRQN(wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(wr, ep->peer_srqn); - return fi_ibv_send_buf_inline(&ep->base_ep, &wr, buf, len); + return vrb_send_buf(&ep->base_ep, &wr, buf, len, NULL); } static ssize_t -fi_ibv_msg_xrc_ep_rma_inject_writedata_fast(struct fid_ep *ep_fid, +vrb_msg_xrc_ep_rma_inject_writedata_fast(struct fid_ep *ep_fid, const void *buf, size_t len, uint64_t data, fi_addr_t dest_addr, uint64_t addr, uint64_t key) { ssize_t ret; - struct fi_ibv_xrc_ep *ep = container_of(ep_fid, struct fi_ibv_xrc_ep, + struct vrb_xrc_ep *ep = container_of(ep_fid, struct vrb_xrc_ep, base_ep.util_ep.ep_fid); ep->base_ep.wrs->rma_wr.wr.rdma.remote_addr = addr; ep->base_ep.wrs->rma_wr.wr.rdma.rkey = (uint32_t) key; - 
FI_IBV_SET_REMOTE_SRQN(ep->base_ep.wrs->rma_wr, ep->peer_srqn); + VRB_SET_REMOTE_SRQN(ep->base_ep.wrs->rma_wr, ep->peer_srqn); ep->base_ep.wrs->rma_wr.imm_data = htonl((uint32_t) data); ep->base_ep.wrs->rma_wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; @@ -500,34 +500,33 @@ fi_ibv_msg_xrc_ep_rma_inject_writedata_fast(struct fid_ep *ep_fid, ep->base_ep.wrs->sge.addr = (uintptr_t) buf; ep->base_ep.wrs->sge.length = (uint32_t) len; - ret = fi_ibv_send_poll_cq_if_needed(&ep->base_ep, - &ep->base_ep.wrs->rma_wr); + ret = vrb_post_send(&ep->base_ep, &ep->base_ep.wrs->rma_wr, 0); ep->base_ep.wrs->rma_wr.opcode = IBV_WR_RDMA_WRITE; return ret; } -struct fi_ops_rma fi_ibv_msg_xrc_ep_rma_ops_ts = { +struct fi_ops_rma vrb_msg_xrc_ep_rma_ops_ts = { .size = sizeof(struct fi_ops_rma), - .read = fi_ibv_msg_xrc_ep_rma_read, - .readv = fi_ibv_msg_xrc_ep_rma_readv, - .readmsg = fi_ibv_msg_xrc_ep_rma_readmsg, - .write = fi_ibv_msg_xrc_ep_rma_write, - .writev = fi_ibv_msg_xrc_ep_rma_writev, - .writemsg = fi_ibv_msg_xrc_ep_rma_writemsg, - .inject = fi_ibv_msg_xrc_ep_rma_inject_write, - .writedata = fi_ibv_msg_xrc_ep_rma_writedata, - .injectdata = fi_ibv_msg_xrc_ep_rma_inject_writedata, + .read = vrb_msg_xrc_ep_rma_read, + .readv = vrb_msg_xrc_ep_rma_readv, + .readmsg = vrb_msg_xrc_ep_rma_readmsg, + .write = vrb_msg_xrc_ep_rma_write, + .writev = vrb_msg_xrc_ep_rma_writev, + .writemsg = vrb_msg_xrc_ep_rma_writemsg, + .inject = vrb_msg_xrc_ep_rma_inject_write, + .writedata = vrb_msg_xrc_ep_rma_writedata, + .injectdata = vrb_msg_xrc_ep_rma_inject_writedata, }; -struct fi_ops_rma fi_ibv_msg_xrc_ep_rma_ops = { +struct fi_ops_rma vrb_msg_xrc_ep_rma_ops = { .size = sizeof(struct fi_ops_rma), - .read = fi_ibv_msg_xrc_ep_rma_read, - .readv = fi_ibv_msg_xrc_ep_rma_readv, - .readmsg = fi_ibv_msg_xrc_ep_rma_readmsg, - .write = fi_ibv_msg_xrc_ep_rma_write, - .writev = fi_ibv_msg_xrc_ep_rma_writev, - .writemsg = fi_ibv_msg_xrc_ep_rma_writemsg, - .inject = fi_ibv_xrc_rma_write_fast, - .writedata = fi_ibv_msg_xrc_ep_rma_writedata, - .injectdata = fi_ibv_msg_xrc_ep_rma_inject_writedata_fast, + .read = vrb_msg_xrc_ep_rma_read, + .readv = vrb_msg_xrc_ep_rma_readv, + .readmsg = vrb_msg_xrc_ep_rma_readmsg, + .write = vrb_msg_xrc_ep_rma_write, + .writev = vrb_msg_xrc_ep_rma_writev, + .writemsg = vrb_msg_xrc_ep_rma_writemsg, + .inject = vrb_xrc_rma_write_fast, + .writedata = vrb_msg_xrc_ep_rma_writedata, + .injectdata = vrb_msg_xrc_ep_rma_inject_writedata_fast, }; diff --git a/src/abi_1_0.c b/src/abi_1_0.c index c0fdd832f2a..34d8e605b6b 100644 --- a/src/abi_1_0.c +++ b/src/abi_1_0.c @@ -88,6 +88,21 @@ struct fi_ep_attr_1_0 { size_t rx_ctx_cnt; }; +struct fi_tx_attr_1_0 { + uint64_t caps; + uint64_t mode; + uint64_t op_flags; + uint64_t msg_order; + uint64_t comp_order; + size_t inject_size; + size_t size; + size_t iov_limit; + size_t rma_iov_limit; +}; + +/* External structure is still ABI 1.0 compliant */ +#define fi_rx_attr_1_0 fi_rx_attr + struct fi_info_1_0 { struct fi_info *next; uint64_t caps; @@ -98,13 +113,47 @@ struct fi_info_1_0 { void *src_addr; void *dest_addr; fid_t handle; - struct fi_tx_attr *tx_attr; - struct fi_rx_attr *rx_attr; + struct fi_tx_attr_1_0 *tx_attr; + struct fi_rx_attr_1_0 *rx_attr; struct fi_ep_attr_1_0 *ep_attr; struct fi_domain_attr_1_0 *domain_attr; struct fi_fabric_attr_1_0 *fabric_attr; }; +struct fi_domain_attr_1_1 { + struct fid_domain *domain; + char *name; + enum fi_threading threading; + enum fi_progress control_progress; + enum fi_progress data_progress; + enum fi_resource_mgmt resource_mgmt; + 
enum fi_av_type av_type; + int mr_mode; + size_t mr_key_size; + size_t cq_data_size; + size_t cq_cnt; + size_t ep_cnt; + size_t tx_ctx_cnt; + size_t rx_ctx_cnt; + size_t max_ep_tx_ctx; + size_t max_ep_rx_ctx; + size_t max_ep_stx_ctx; + size_t max_ep_srx_ctx; + size_t cntr_cnt; + size_t mr_iov_limit; + uint64_t caps; + uint64_t mode; + uint8_t *auth_key; + size_t auth_key_size; + size_t max_err_data; + size_t mr_cnt; +}; + +#define fi_tx_attr_1_1 fi_tx_attr_1_0 +#define fi_rx_attr_1_1 fi_rx_attr_1_0 +#define fi_ep_attr_1_1 fi_ep_attr +#define fi_fabric_attr_1_1 fi_fabric_attr + struct fi_info_1_1 { struct fi_info *next; uint64_t caps; @@ -115,13 +164,47 @@ struct fi_info_1_1 { void *src_addr; void *dest_addr; fid_t handle; - struct fi_tx_attr *tx_attr; - struct fi_rx_attr *rx_attr; - struct fi_ep_attr_1_0 *ep_attr; - struct fi_domain_attr_1_0 *domain_attr; - struct fi_fabric_attr_1_0 *fabric_attr; + struct fi_tx_attr_1_1 *tx_attr; + struct fi_rx_attr_1_1 *rx_attr; + struct fi_ep_attr_1_1 *ep_attr; + struct fi_domain_attr_1_1 *domain_attr; + struct fi_fabric_attr_1_1 *fabric_attr; +}; + +#define fi_tx_attr_1_2 fi_tx_attr_1_1 +#define fi_rx_attr_1_2 fi_rx_attr_1_1 +#define fi_ep_attr_1_2 fi_ep_attr_1_1 +#define fi_domain_attr_1_2 fi_domain_attr_1_1 +#define fi_fabric_attr_1_2 fi_fabric_attr_1_1 +#define fid_nic_1_2 fid_nic + +struct fi_info_1_2 { + struct fi_info *next; + uint64_t caps; + uint64_t mode; + uint32_t addr_format; + size_t src_addrlen; + size_t dest_addrlen; + void *src_addr; + void *dest_addr; + fid_t handle; + struct fi_tx_attr_1_2 *tx_attr; + struct fi_rx_attr_1_2 *rx_attr; + struct fi_ep_attr_1_2 *ep_attr; + struct fi_domain_attr_1_2 *domain_attr; + struct fi_fabric_attr_1_2 *fabric_attr; + struct fid_nic_1_2 *nic; }; +/* +#define fi_tx_attr_1_3 fi_tx_attr +#define fi_rx_attr_1_3 fi_rx_attr_1_2 +#define fi_ep_attr_1_3 fi_ep_attr_1_2 +#define fi_domain_attr_1_3 fi_domain_attr +#define fi_fabric_attr_1_3 fi_fabric_attr_1_2 +fi_info_1_3 -> fi_info +*/ + #define ofi_dup_attr(dst, src) \ do { \ dst = calloc(1, sizeof(*dst)); \ @@ -316,3 +399,55 @@ int fi_getinfo_1_1(uint32_t version, const char *node, const char *service, return ret; } COMPAT_SYMVER(fi_getinfo_1_1, fi_getinfo, FABRIC_1.1); + +/* + * ABI 1.2 + */ +__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) +void fi_freeinfo_1_2(struct fi_info_1_2 *info) +{ + fi_freeinfo((struct fi_info *) info); +} +COMPAT_SYMVER(fi_freeinfo_1_2, fi_freeinfo, FABRIC_1.2); + +__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) +struct fi_info_1_2 *fi_dupinfo_1_2(const struct fi_info_1_2 *info) +{ + struct fi_info *dup, *base; + + if (!info) + return (struct fi_info_1_2 *) ofi_allocinfo_internal(); + + ofi_dup_attr(base, info); + if (base == NULL) + return NULL; + + dup = fi_dupinfo(base); + + free(base); + return (struct fi_info_1_2 *) dup; +} +COMPAT_SYMVER(fi_dupinfo_1_2, fi_dupinfo, FABRIC_1.2); + +__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) +int fi_getinfo_1_2(uint32_t version, const char *node, const char *service, + uint64_t flags, const struct fi_info_1_2 *hints_1_2, + struct fi_info_1_2 **info) +{ + struct fi_info *hints; + int ret; + + if (hints_1_2) { + hints = (struct fi_info *) fi_dupinfo_1_2(hints_1_2); + if (!hints) + return -FI_ENOMEM; + } else { + hints = NULL; + } + ret = fi_getinfo(version, node, service, flags, hints, + (struct fi_info **) info); + fi_freeinfo(hints); + + return ret; +} +COMPAT_SYMVER(fi_getinfo_1_2, fi_getinfo, FABRIC_1.2); diff --git a/src/common.c b/src/common.c index 
5e833419eb9..30c3f452ba0 100644 --- a/src/common.c +++ b/src/common.c @@ -3,6 +3,7 @@ * Copyright (c) 2006-2017 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013-2018 Intel Corp., Inc. All rights reserved. * Copyright (c) 2015 Los Alamos Nat. Security, LLC. All rights reserved. + * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -70,8 +71,8 @@ struct fi_provider core_prov = { .name = "core", - .version = 1, - .fi_version = FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION) + .version = OFI_VERSION_DEF_PROV, + .fi_version = OFI_VERSION_LATEST }; struct ofi_common_locks common_locks = { @@ -79,6 +80,8 @@ struct ofi_common_locks common_locks = { .util_fabric_lock = PTHREAD_MUTEX_INITIALIZER, }; +size_t ofi_universe_size = 1024; + int fi_poll_fd(int fd, int timeout) { struct pollfd fds; @@ -116,64 +119,66 @@ uint8_t ofi_lsb(uint64_t num) return ofi_msb(num & (~(num - 1))); } -int ofi_send_allowed(uint64_t caps) +bool ofi_send_allowed(uint64_t caps) { - if (caps & FI_MSG || - caps & FI_TAGGED) { + if ((caps & FI_MSG) || (caps & FI_TAGGED)) { if (caps & FI_SEND) - return 1; + return true; if (caps & FI_RECV) - return 0; - return 1; + return false; + return true; } - return 0; + return false; } -int ofi_recv_allowed(uint64_t caps) +bool ofi_recv_allowed(uint64_t caps) { - if (caps & FI_MSG || - caps & FI_TAGGED) { + if ((caps & FI_MSG) || (caps & FI_TAGGED)) { if (caps & FI_RECV) - return 1; + return true; if (caps & FI_SEND) - return 0; - return 1; + return false; + return true; } - return 0; + return false; } -int ofi_rma_initiate_allowed(uint64_t caps) +bool ofi_rma_initiate_allowed(uint64_t caps) { - if (caps & FI_RMA || - caps & FI_ATOMICS) { - if (caps & FI_WRITE || - caps & FI_READ) - return 1; - if (caps & FI_REMOTE_WRITE || - caps & FI_REMOTE_READ) - return 0; - return 1; + if ((caps & FI_RMA) || (caps & FI_ATOMICS)) { + if ((caps & FI_WRITE) || (caps & FI_READ)) + return true; + if ((caps & FI_REMOTE_WRITE) || (caps & FI_REMOTE_READ)) + return false; + return true; } - return 0; + return false; } -int ofi_rma_target_allowed(uint64_t caps) +bool ofi_rma_target_allowed(uint64_t caps) { - if (caps & FI_RMA || - caps & FI_ATOMICS) { - if (caps & FI_REMOTE_WRITE || - caps & FI_REMOTE_READ) - return 1; - if (caps & FI_WRITE || - caps & FI_READ) - return 0; - return 1; + if ((caps & FI_RMA) || (caps & FI_ATOMICS)) { + if ((caps & FI_REMOTE_WRITE) || (caps & FI_REMOTE_READ)) + return true; + if ((caps & FI_WRITE) || (caps & FI_READ)) + return false; + return true; } - return 0; + return false; +} + +bool ofi_needs_tx(uint64_t caps) +{ + return ofi_send_allowed(caps) || ofi_rma_initiate_allowed(caps); +} + +bool ofi_needs_rx(uint64_t caps) +{ + return ofi_recv_allowed(caps); } int ofi_ep_bind_valid(const struct fi_provider *prov, struct fid *bfid, uint64_t flags) @@ -218,20 +223,36 @@ int ofi_check_rx_mode(const struct fi_info *info, uint64_t flags) return (info->mode & flags) ? 
1 : 0; } -uint64_t fi_gettime_ms(void) +uint32_t ofi_generate_seed(void) +{ + /* Time returns long; keep the lower and most significant 32 bits */ + uint32_t rand_seed; + struct timeval tv; + gettimeofday(&tv, NULL); + rand_seed = ((getpid() & 0xffffffff) << 16); + + /* Mix the PID into the upper bits */ + rand_seed |= (uint32_t) tv.tv_usec; + + return rand_seed; +} + +uint64_t ofi_gettime_ns(void) { - struct timeval now; + struct timespec now; - gettimeofday(&now, NULL); - return now.tv_sec * 1000 + now.tv_usec / 1000; + clock_gettime(CLOCK_MONOTONIC, &now); + return now.tv_sec * 1000000000 + now.tv_nsec; } -uint64_t fi_gettime_us(void) +uint64_t ofi_gettime_us(void) { - struct timeval now; + return ofi_gettime_ns() / 1000; +} - gettimeofday(&now, NULL); - return now.tv_sec * 1000000 + now.tv_usec; +uint64_t ofi_gettime_ms(void) +{ + return ofi_gettime_ns() / 1000000; } uint16_t ofi_get_sa_family(const struct fi_info *info) @@ -265,6 +286,7 @@ const char *ofi_straddr(char *buf, size_t *len, const struct sockaddr *sock_addr; const struct sockaddr_in6 *sin6; const struct sockaddr_in *sin; + const struct ofi_sockaddr_ib *sib; char str[INET6_ADDRSTRLEN + 8]; size_t size; @@ -309,11 +331,23 @@ const char *ofi_straddr(char *buf, size_t *len, memset(str, 0, sizeof(str)); if (!inet_ntop(AF_INET6, addr, str, INET6_ADDRSTRLEN)) return NULL; - size = snprintf(buf, *len, "fi_addr_efa://[%s]:%" PRIu16, - str, *((uint16_t *)addr + 8)); + size = snprintf(buf, *len, "fi_addr_efa://[%s]:%" PRIu16 ":%" PRIu32, + str, *((uint16_t *)addr + 8), *((uint32_t *)addr + 5)); break; case FI_SOCKADDR_IB: - size = snprintf(buf, *len, "fi_sockaddr_ib://%p", addr); + sib = addr; + memset(str, 0, sizeof(str)); + if (!inet_ntop(AF_INET6, sib->sib_addr, str, INET6_ADDRSTRLEN)) + return NULL; + + size = snprintf(buf, *len, "fi_sockaddr_ib://[%s]" /* GID */ + ":0x%" PRIx16 /* P_Key */ + ":0x%" PRIx16 /* port space */ + ":0x%" PRIx8 /* Scope ID */, + str, /* GID */ + ntohs(sib->sib_pkey), /* P_Key */ + (uint16_t)(ntohll(sib->sib_sid) >> 16) & 0xfff, /* port space */ + (uint8_t)ntohll(sib->sib_scope_id) & 0xff); break; case FI_ADDR_PSMX: size = snprintf(buf, *len, "fi_addr_psmx://%" PRIx64, @@ -324,6 +358,11 @@ const char *ofi_straddr(char *buf, size_t *len, snprintf(buf, *len, "fi_addr_psmx2://%" PRIx64 ":%" PRIx64, *(uint64_t *)addr, *((uint64_t *)addr + 1)); break; + case FI_ADDR_PSMX3: + size = + snprintf(buf, *len, "fi_addr_psmx3://%" PRIx64 ":%" PRIx64, + *(uint64_t *)addr, *((uint64_t *)addr + 1)); + break; case FI_ADDR_GNI: size = snprintf(buf, *len, "fi_addr_gni://%" PRIx64, *(uint64_t *)addr); @@ -361,16 +400,16 @@ const char *ofi_straddr(char *buf, size_t *len, return buf; } -static uint32_t ofi_addr_format(const char *str) +uint32_t ofi_addr_format(const char *str) { - char fmt[16]; + char fmt[17]; int ret; + memset(fmt, 0, sizeof(fmt)); ret = sscanf(str, "%16[^:]://", fmt); if (ret != 1) return FI_FORMAT_UNSPEC; - fmt[sizeof(fmt) - 1] = '\0'; if (!strcasecmp(fmt, "fi_sockaddr_in")) return FI_SOCKADDR_IN; else if (!strcasecmp(fmt, "fi_sockaddr_in6")) @@ -381,6 +420,8 @@ static uint32_t ofi_addr_format(const char *str) return FI_ADDR_PSMX; else if (!strcasecmp(fmt, "fi_addr_psmx2")) return FI_ADDR_PSMX2; + else if (!strcasecmp(fmt, "fi_addr_psmx3")) + return FI_ADDR_PSMX3; else if (!strcasecmp(fmt, "fi_addr_gni")) return FI_ADDR_GNI; else if (!strcasecmp(fmt, "fi_addr_bgq")) @@ -430,6 +471,24 @@ static int ofi_str_to_psmx2(const char *str, void **addr, size_t *len) return -FI_EINVAL; } +static int 
ofi_str_to_psmx3(const char *str, void **addr, size_t *len) +{ + int ret; + + *len = 2 * sizeof(uint64_t); + *addr = calloc(1, *len); + if (!(*addr)) + return -FI_ENOMEM; + + ret = sscanf(str, "%*[^:]://%" SCNx64 ":%" SCNx64, + (uint64_t *) *addr, (uint64_t *) *addr + 1); + if (ret == 2) + return 0; + + free(*addr); + return -FI_EINVAL; +} + static int ofi_str_to_ib_ud(const char *str, void **addr, size_t *len) { int ret; @@ -457,21 +516,117 @@ static int ofi_str_to_ib_ud(const char *str, void **addr, size_t *len) return -FI_EINVAL; } +static int ofi_str_to_sib(const char *str, void **addr, size_t *len) +{ + int ret; + char *tok, *endptr, *saveptr; + struct ofi_sockaddr_ib *sib; + uint16_t pkey; + uint16_t ps; + uint64_t scope_id; + uint16_t port; + char gid[64 + 1]; + char extra_str[64 + 1]; + + memset(gid, 0, sizeof(gid)); + + ret = sscanf(str, "%*[^:]://[%64[^]]]" /* GID */ + ":%64s", /* P_Key : port_space : Scope ID : port */ + gid, extra_str); + if (ret != 2) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Invalid GID in address: %s\n", str); + return -FI_EINVAL; + } + + tok = strtok_r(extra_str, ":", &saveptr); + if (!tok) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Invalid pkey in address: %s\n", str); + return -FI_EINVAL; + } + + pkey = strtol(tok, &endptr, 0); + if (*endptr) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Invalid pkey in address: %s\n", str); + return -FI_EINVAL; + } + + tok = strtok_r(NULL, ":", &saveptr); + if (!tok) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Invalid port space in address: %s\n", str); + return -FI_EINVAL; + } + + ps = strtol(tok, &endptr, 0); + if (*endptr) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Invalid port space in address: %s\n", str); + return -FI_EINVAL; + } + + tok = strtok_r(NULL, ":", &saveptr); + if (!tok) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Invalid scope id in address: %s\n", str); + return -FI_EINVAL; + } + + scope_id = strtol(tok, &endptr, 0); + if (*endptr) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Invalid scope id in address: %s\n", str); + return -FI_EINVAL; + } + + /* Port is optional */ + tok = strtok_r(NULL, ":", &saveptr); + if (tok) + port = strtol(tok, &endptr, 0); + else + port = 0; + + *len = sizeof(struct ofi_sockaddr_ib); + *addr = calloc(1, *len); + if (!*addr) + return -FI_ENOMEM; + + sib = (struct ofi_sockaddr_ib *)(*addr); + + if (inet_pton(AF_INET6, gid, sib->sib_addr) > 0) { + sib->sib_family = AF_IB; + sib->sib_pkey = htons(pkey); + if (ps && port) { + sib->sib_sid = htonll(((uint64_t) ps << 16) + port); + sib->sib_sid_mask = htonll(OFI_IB_IP_PS_MASK | + OFI_IB_IP_PORT_MASK); + } + sib->sib_scope_id = htonll(scope_id); + return FI_SUCCESS; + } + + free(*addr); + return -FI_EINVAL; +} + static int ofi_str_to_efa(const char *str, void **addr, size_t *len) { char gid[INET6_ADDRSTRLEN]; uint16_t *qpn; + uint32_t *qkey; int ret; memset(gid, 0, sizeof(gid)); - *len = 18; + *len = 24; *addr = calloc(1, *len); if (!*addr) return -FI_ENOMEM; qpn = (uint16_t *)*addr + 8; - - ret = sscanf(str, "%*[^:]://[%64[^]]]:%" SCNu16, gid, qpn); + qkey = (uint32_t *)*addr + 5; + ret = sscanf(str, "%*[^:]://[%64[^]]]:%" SCNu16 ":%" SCNu32, gid, qpn, qkey); if (ret < 1) goto err; @@ -666,11 +821,14 @@ int ofi_str_toaddr(const char *str, uint32_t *addr_format, return ofi_str_to_psmx(str, addr, len); case FI_ADDR_PSMX2: return ofi_str_to_psmx2(str, addr, len); + case FI_ADDR_PSMX3: + return ofi_str_to_psmx3(str, addr, len); case FI_ADDR_IB_UD: return ofi_str_to_ib_ud(str, addr, len); case FI_ADDR_EFA: return ofi_str_to_efa(str, addr, len); case 
FI_SOCKADDR_IB: + return ofi_str_to_sib(str, addr, len); case FI_ADDR_GNI: case FI_ADDR_BGQ: case FI_ADDR_MLX: @@ -729,10 +887,10 @@ static int ofi_is_any_addr_port(struct sockaddr *addr) { switch (ofi_sa_family(addr)) { case AF_INET: - return (ofi_ipv4_is_any_addr(addr) && + return (ofi_sin_is_any_addr(addr) && ofi_sin_port(addr)); case AF_INET6: - return (ofi_ipv6_is_any_addr(addr) && + return (ofi_sin6_is_any_addr(addr) && ofi_sin6_port(addr)); default: FI_WARN(&core_prov, FI_LOG_CORE, @@ -741,8 +899,8 @@ static int ofi_is_any_addr_port(struct sockaddr *addr) } } -int ofi_is_wildcard_listen_addr(const char *node, const char *service, - uint64_t flags, const struct fi_info *hints) +bool ofi_is_wildcard_listen_addr(const char *node, const char *service, + uint64_t flags, const struct fi_info *hints) { struct addrinfo *res = NULL; int ret; @@ -751,28 +909,30 @@ int ofi_is_wildcard_listen_addr(const char *node, const char *service, hints->addr_format != FI_SOCKADDR && hints->addr_format != FI_SOCKADDR_IN && hints->addr_format != FI_SOCKADDR_IN6) - return 0; + return false; /* else it's okay to call getaddrinfo, proceed with processing */ if (node) { + if (!(flags & FI_SOURCE)) + return false; ret = getaddrinfo(node, service, NULL, &res); if (ret) { FI_WARN(&core_prov, FI_LOG_CORE, "getaddrinfo failed!\n"); - return 0; + return false; } if (ofi_is_any_addr_port(res->ai_addr)) { freeaddrinfo(res); goto out; } freeaddrinfo(res); - return 0; + return false; } if (hints) { if (hints->dest_addr) - return 0; + return false; if (!hints->src_addr) goto out; @@ -780,7 +940,7 @@ int ofi_is_wildcard_listen_addr(const char *node, const char *service, return ofi_is_any_addr_port(hints->src_addr); } out: - return ((flags & FI_SOURCE) && service) ? 1 : 0; + return ((flags & FI_SOURCE) && service); } size_t ofi_mask_addr(struct sockaddr *maskaddr, const struct sockaddr *srcaddr, @@ -823,9 +983,13 @@ void ofi_straddr_log_internal(const char *func, int line, size_t len = sizeof(buf); if (fi_log_enabled(prov, level, subsys)) { - addr_format = ofi_translate_addr_format(ofi_sa_family(addr)); - fi_log(prov, level, subsys, func, line, "%s: %s\n", log_str, - ofi_straddr(buf, &len, addr_format, addr)); + if (addr) { + addr_format = ofi_translate_addr_format(ofi_sa_family(addr)); + fi_log(prov, level, subsys, func, line, "%s: %s\n", log_str, + ofi_straddr(buf, &len, addr_format, addr)); + } else { + fi_log(prov, level, subsys, func, line, "%s: (null)\n", log_str); + } } } @@ -840,47 +1004,44 @@ int ofi_discard_socket(SOCKET sock, size_t len) } -#ifndef HAVE_EPOLL - -int fi_epoll_create(struct fi_epoll **ep) +int ofi_pollfds_create(struct ofi_pollfds **pfds) { int ret; - *ep = calloc(1, sizeof(struct fi_epoll)); - if (!*ep) + *pfds = calloc(1, sizeof(struct ofi_pollfds)); + if (!*pfds) return -FI_ENOMEM; - (*ep)->size = 64; - (*ep)->fds = calloc((*ep)->size, sizeof(*(*ep)->fds) + - sizeof(*(*ep)->context)); - if (!(*ep)->fds) { + (*pfds)->size = 64; + (*pfds)->fds = calloc((*pfds)->size, sizeof(*(*pfds)->fds) + + sizeof(*(*pfds)->context)); + if (!(*pfds)->fds) { ret = -FI_ENOMEM; goto err1; } - (*ep)->context = (void *)((*ep)->fds + (*ep)->size); + (*pfds)->context = (void *)((*pfds)->fds + (*pfds)->size); - ret = fd_signal_init(&(*ep)->signal); + ret = fd_signal_init(&(*pfds)->signal); if (ret) goto err2; - (*ep)->fds[(*ep)->nfds].fd = (*ep)->signal.fd[FI_READ_FD]; - (*ep)->fds[(*ep)->nfds].events = FI_EPOLL_IN; - (*ep)->context[(*ep)->nfds++] = NULL; - slist_init(&(*ep)->work_item_list); - 
fastlock_init(&(*ep)->lock); + (*pfds)->fds[(*pfds)->nfds].fd = (*pfds)->signal.fd[FI_READ_FD]; + (*pfds)->fds[(*pfds)->nfds].events = POLLIN; + (*pfds)->context[(*pfds)->nfds++] = NULL; + slist_init(&(*pfds)->work_item_list); + fastlock_init(&(*pfds)->lock); return FI_SUCCESS; err2: - free((*ep)->fds); + free((*pfds)->fds); err1: - free(*ep); + free(*pfds); return ret; } - -static int fi_epoll_ctl(struct fi_epoll *ep, enum fi_epoll_ctl op, - int fd, uint32_t events, void *context) +static int ofi_pollfds_ctl(struct ofi_pollfds *pfds, enum ofi_pollfds_ctl op, + int fd, uint32_t events, void *context) { - struct fi_epoll_work_item *item; + struct ofi_pollfds_work_item *item; item = calloc(1,sizeof(*item)); if (!item) @@ -890,102 +1051,104 @@ static int fi_epoll_ctl(struct fi_epoll *ep, enum fi_epoll_ctl op, item->events = events; item->context = context; item->type = op; - fastlock_acquire(&ep->lock); - slist_insert_tail(&item->entry, &ep->work_item_list); - fd_signal_set(&ep->signal); - fastlock_release(&ep->lock); + fastlock_acquire(&pfds->lock); + slist_insert_tail(&item->entry, &pfds->work_item_list); + fd_signal_set(&pfds->signal); + fastlock_release(&pfds->lock); return 0; } -int fi_epoll_add(struct fi_epoll *ep, int fd, uint32_t events, void *context) +int ofi_pollfds_add(struct ofi_pollfds *pfds, int fd, uint32_t events, + void *context) { - return fi_epoll_ctl(ep, EPOLL_CTL_ADD, fd, events, context); + return ofi_pollfds_ctl(pfds, POLLFDS_CTL_ADD, fd, events, context); } -int fi_epoll_mod(struct fi_epoll *ep, int fd, uint32_t events, void *context) +int ofi_pollfds_mod(struct ofi_pollfds *pfds, int fd, uint32_t events, + void *context) { - return fi_epoll_ctl(ep, EPOLL_CTL_MOD, fd, events, context); + return ofi_pollfds_ctl(pfds, POLLFDS_CTL_MOD, fd, events, context); } -int fi_epoll_del(struct fi_epoll *ep, int fd) +int ofi_pollfds_del(struct ofi_pollfds *pfds, int fd) { - return fi_epoll_ctl(ep, EPOLL_CTL_DEL, fd, 0, NULL); + return ofi_pollfds_ctl(pfds, POLLFDS_CTL_DEL, fd, 0, NULL); } -static int fi_epoll_fd_array_grow(struct fi_epoll *ep) +static int ofi_pollfds_array(struct ofi_pollfds *pfds) { struct pollfd *fds; void *contexts; - fds = calloc(ep->size + 64, - sizeof(*ep->fds) + sizeof(*ep->context)); + fds = calloc(pfds->size + 64, + sizeof(*pfds->fds) + sizeof(*pfds->context)); if (!fds) return -FI_ENOMEM; - ep->size += 64; - contexts = fds + ep->size; + pfds->size += 64; + contexts = fds + pfds->size; - memcpy(fds, ep->fds, ep->nfds * sizeof(*ep->fds)); - memcpy(contexts, ep->context, ep->nfds * sizeof(*ep->context)); - free(ep->fds); - ep->fds = fds; - ep->context = contexts; + memcpy(fds, pfds->fds, pfds->nfds * sizeof(*pfds->fds)); + memcpy(contexts, pfds->context, pfds->nfds * sizeof(*pfds->context)); + free(pfds->fds); + pfds->fds = fds; + pfds->context = contexts; return FI_SUCCESS; } -static void fi_epoll_cleanup_array(struct fi_epoll *ep) +static void ofi_pollfds_cleanup(struct ofi_pollfds *pfds) { int i; - for (i = 0; i < ep->nfds; i++) { - while (ep->fds[i].fd == INVALID_SOCKET) { - ep->fds[i].fd = ep->fds[ep->nfds-1].fd; - ep->fds[i].events = ep->fds[ep->nfds-1].events; - ep->fds[i].revents = ep->fds[ep->nfds-1].revents; - ep->context[i] = ep->context[ep->nfds-1]; - ep->nfds--; - if (i == ep->nfds) + for (i = 0; i < pfds->nfds; i++) { + while (pfds->fds[i].fd == INVALID_SOCKET) { + pfds->fds[i].fd = pfds->fds[pfds->nfds-1].fd; + pfds->fds[i].events = pfds->fds[pfds->nfds-1].events; + pfds->fds[i].revents = pfds->fds[pfds->nfds-1].revents; + pfds->context[i] = 
pfds->context[pfds->nfds-1]; + pfds->nfds--; + if (i == pfds->nfds) break; } } } -static void fi_epoll_process_work_item_list(struct fi_epoll *ep) +static void ofi_pollfds_process_work(struct ofi_pollfds *pfds) { struct slist_entry *entry; - struct fi_epoll_work_item *item; + struct ofi_pollfds_work_item *item; int i; - while (!slist_empty(&ep->work_item_list)) { - if ((ep->nfds == ep->size) && - fi_epoll_fd_array_grow(ep)) + while (!slist_empty(&pfds->work_item_list)) { + if ((pfds->nfds == pfds->size) && + ofi_pollfds_array(pfds)) continue; - entry = slist_remove_head(&ep->work_item_list); - item = container_of(entry, struct fi_epoll_work_item, entry); + entry = slist_remove_head(&pfds->work_item_list); + item = container_of(entry, struct ofi_pollfds_work_item, entry); switch (item->type) { - case EPOLL_CTL_ADD: - ep->fds[ep->nfds].fd = item->fd; - ep->fds[ep->nfds].events = item->events; - ep->context[ep->nfds] = item->context; - ep->nfds++; + case POLLFDS_CTL_ADD: + pfds->fds[pfds->nfds].fd = item->fd; + pfds->fds[pfds->nfds].events = item->events; + pfds->fds[pfds->nfds].revents = 0; + pfds->context[pfds->nfds] = item->context; + pfds->nfds++; break; - case EPOLL_CTL_DEL: - for (i = 0; i < ep->nfds; i++) { - if (ep->fds[i].fd == item->fd) { - ep->fds[i].fd = INVALID_SOCKET; + case POLLFDS_CTL_DEL: + for (i = 0; i < pfds->nfds; i++) { + if (pfds->fds[i].fd == item->fd) { + pfds->fds[i].fd = INVALID_SOCKET; break; } } break; - case EPOLL_CTL_MOD: - for (i = 0; i < ep->nfds; i++) { - if (ep->fds[i].fd == item->fd) { - - ep->fds[i].events = item->events; - ep->fds[i].revents &= item->events; - ep->context = item->context; + case POLLFDS_CTL_MOD: + for (i = 0; i < pfds->nfds; i++) { + if (pfds->fds[i].fd == item->fd) { + pfds->fds[i].events = item->events; + pfds->fds[i].revents &= item->events; + pfds->context[i] = item->context; break; } } @@ -997,74 +1160,74 @@ static void fi_epoll_process_work_item_list(struct fi_epoll *ep) free(item); } out: - fi_epoll_cleanup_array(ep); + ofi_pollfds_cleanup(pfds); } -int fi_epoll_wait(struct fi_epoll *ep, void **contexts, int max_contexts, - int timeout) +int ofi_pollfds_wait(struct ofi_pollfds *pfds, void **contexts, + int max_contexts, int timeout) { int i, ret; int found = 0; - uint64_t start = (timeout >= 0) ? fi_gettime_ms() : 0; + uint64_t start = (timeout >= 0) ? 
ofi_gettime_ms() : 0; do { - ret = poll(ep->fds, ep->nfds, timeout); + ret = poll(pfds->fds, pfds->nfds, timeout); if (ret == SOCKET_ERROR) return -ofi_sockerr(); else if (ret == 0) return 0; - if (ep->fds[0].revents) - fd_signal_reset(&ep->signal); + if (pfds->fds[0].revents) + fd_signal_reset(&pfds->signal); - fastlock_acquire(&ep->lock); - if (!slist_empty(&ep->work_item_list)) - fi_epoll_process_work_item_list(ep); + fastlock_acquire(&pfds->lock); + if (!slist_empty(&pfds->work_item_list)) + ofi_pollfds_process_work(pfds); - fastlock_release(&ep->lock); + fastlock_release(&pfds->lock); - for (i = ep->index; i < ep->nfds && found < max_contexts; i++) { - if (ep->fds[i].revents && i) { - contexts[found++] = ep->context[i]; - ep->index = i; + /* Index 0 is the internal signaling fd, skip it */ + for (i = pfds->index; i < pfds->nfds && found < max_contexts; i++) { + if (pfds->fds[i].revents && i) { + contexts[found++] = pfds->context[i]; + pfds->index = i; } } - for (i = 0; i < ep->index && found < max_contexts; i++) { - if (ep->fds[i].revents && i) { - contexts[found++] = ep->context[i]; - ep->index = i; + for (i = 0; i < pfds->index && found < max_contexts; i++) { + if (pfds->fds[i].revents && i) { + contexts[found++] = pfds->context[i]; + pfds->index = i; } } if (timeout > 0) - timeout -= (int) (fi_gettime_ms() - start); + timeout -= (int) (ofi_gettime_ms() - start); } while (timeout > 0 && !found); return found; } -void fi_epoll_close(struct fi_epoll *ep) +void ofi_pollfds_close(struct ofi_pollfds *pfds) { - struct fi_epoll_work_item *item; + struct ofi_pollfds_work_item *item; struct slist_entry *entry; - if (ep) { - while (!slist_empty(&ep->work_item_list)) { - entry = slist_remove_head(&ep->work_item_list); + + if (pfds) { + while (!slist_empty(&pfds->work_item_list)) { + entry = slist_remove_head(&pfds->work_item_list); item = container_of(entry, - struct fi_epoll_work_item, + struct ofi_pollfds_work_item, entry); free(item); } - fastlock_destroy(&ep->lock); - fd_signal_free(&ep->signal); - free(ep->fds); - free(ep); + fastlock_destroy(&pfds->lock); + fd_signal_free(&pfds->signal); + free(pfds->fds); + free(pfds); } } -#endif - void ofi_free_list_of_addr(struct slist *addr_list) { @@ -1078,32 +1241,38 @@ void ofi_free_list_of_addr(struct slist *addr_list) } static inline -void ofi_insert_loopback_addr(struct fi_provider *prov, struct slist *addr_list) +void ofi_insert_loopback_addr(const struct fi_provider *prov, struct slist *addr_list) { struct ofi_addr_list_entry *addr_entry; - addr_entry = calloc(1, sizeof(struct ofi_addr_list_entry)); + addr_entry = calloc(1, sizeof(*addr_entry)); if (!addr_entry) return; + addr_entry->comm_caps = FI_LOCAL_COMM; addr_entry->ipaddr.sin.sin_family = AF_INET; - addr_entry->ipaddr.sin.sin_addr.s_addr = INADDR_LOOPBACK; + addr_entry->ipaddr.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); ofi_straddr_log(prov, FI_LOG_INFO, FI_LOG_CORE, "available addr: ", &addr_entry->ipaddr); strncpy(addr_entry->ipstr, "127.0.0.1", sizeof(addr_entry->ipstr)); + strncpy(addr_entry->net_name, "127.0.0.1/32", sizeof(addr_entry->net_name)); + strncpy(addr_entry->ifa_name, "lo", sizeof(addr_entry->ifa_name)); slist_insert_tail(&addr_entry->entry, addr_list); - addr_entry = calloc(1, sizeof(struct ofi_addr_list_entry)); + addr_entry = calloc(1, sizeof(*addr_entry)); if (!addr_entry) return; + addr_entry->comm_caps = FI_LOCAL_COMM; addr_entry->ipaddr.sin6.sin6_family = AF_INET6; addr_entry->ipaddr.sin6.sin6_addr = in6addr_loopback; ofi_straddr_log(prov, FI_LOG_INFO, 
FI_LOG_CORE, "available addr: ", &addr_entry->ipaddr); strncpy(addr_entry->ipstr, "::1", sizeof(addr_entry->ipstr)); + strncpy(addr_entry->net_name, "::1/128", sizeof(addr_entry->net_name)); + strncpy(addr_entry->ifa_name, "lo", sizeof(addr_entry->ifa_name)); slist_insert_tail(&addr_entry->entry, addr_list); } @@ -1149,7 +1318,33 @@ ofi_addr_list_entry_comp_speed(struct slist_entry *cur, const void *insert) return (cur_addr->speed < insert_addr->speed); } -void ofi_get_list_of_addr(struct fi_provider *prov, const char *env_name, +void ofi_set_netmask_str(char *netstr, size_t len, struct ifaddrs *ifa) +{ + union ofi_sock_ip addr; + size_t prefix_len; + + netstr[0] = '\0'; + prefix_len = ofi_mask_addr(&addr.sa, ifa->ifa_addr, ifa->ifa_netmask); + + switch (addr.sa.sa_family) { + case AF_INET: + inet_ntop(AF_INET, &addr.sin.sin_addr, netstr, len); + break; + case AF_INET6: + inet_ntop(AF_INET6, &addr.sin6.sin6_addr, netstr, len); + break; + default: + snprintf(netstr, len, "%s", ""); + netstr[len - 1] = '\0'; + break; + } + + snprintf(netstr + strlen(netstr), len - strlen(netstr), + "%s%d", "/", (int) prefix_len); + netstr[len - 1] = '\0'; +} + +void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, struct slist *addr_list) { int ret; @@ -1157,72 +1352,80 @@ void ofi_get_list_of_addr(struct fi_provider *prov, const char *env_name, struct ofi_addr_list_entry *addr_entry; struct ifaddrs *ifaddrs, *ifa; - fi_param_get_str(prov, env_name, &iface); + fi_param_get_str((struct fi_provider *) prov, env_name, &iface); ret = ofi_getifaddrs(&ifaddrs); - if (!ret) { - if (iface) { - for (ifa = ifaddrs; ifa != NULL; ifa = ifa->ifa_next) { - if (strncmp(iface, ifa->ifa_name, - strlen(iface)) == 0) { - break; - } - } - if (ifa == NULL) { - FI_INFO(prov, FI_LOG_CORE, - "Can't set filter to unknown interface: (%s)\n", - iface); - iface = NULL; - } - } - for (ifa = ifaddrs; ifa != NULL; ifa = ifa->ifa_next) { - if (ifa->ifa_addr == NULL || - !(ifa->ifa_flags & IFF_UP) || - (ifa->ifa_flags & IFF_LOOPBACK) || - ((ifa->ifa_addr->sa_family != AF_INET) && - (ifa->ifa_addr->sa_family != AF_INET6))) - continue; - if (iface && strncmp(iface, ifa->ifa_name, strlen(iface)) != 0) { - FI_DBG(prov, FI_LOG_CORE, - "Skip (%s) interface\n", ifa->ifa_name); - continue; - } + if (ret) + goto insert_lo; - addr_entry = calloc(1, sizeof(struct ofi_addr_list_entry)); - if (!addr_entry) - continue; - - memcpy(&addr_entry->ipaddr, ifa->ifa_addr, - ofi_sizeofaddr(ifa->ifa_addr)); - ofi_straddr_log(prov, FI_LOG_INFO, FI_LOG_CORE, - "available addr: ", ifa->ifa_addr); - - if (!inet_ntop(ifa->ifa_addr->sa_family, - ofi_get_ipaddr(ifa->ifa_addr), - addr_entry->ipstr, - sizeof(addr_entry->ipstr))) { - FI_DBG(prov, FI_LOG_CORE, - "inet_ntop failed: %d\n", errno); - free(addr_entry); - continue; + if (iface) { + for (ifa = ifaddrs; ifa != NULL; ifa = ifa->ifa_next) { + if (strncmp(iface, ifa->ifa_name, + strlen(iface)) == 0) { + break; } + } + if (ifa == NULL) { + FI_INFO(prov, FI_LOG_CORE, + "Can't set filter to unknown interface: (%s)\n", + iface); + iface = NULL; + } + } + for (ifa = ifaddrs; ifa != NULL; ifa = ifa->ifa_next) { + if (ifa->ifa_addr == NULL || + !(ifa->ifa_flags & IFF_UP) || + (ifa->ifa_flags & IFF_LOOPBACK) || + ((ifa->ifa_addr->sa_family != AF_INET) && + (ifa->ifa_addr->sa_family != AF_INET6))) + continue; + if (iface && strncmp(iface, ifa->ifa_name, strlen(iface)) != 0) { + FI_DBG(prov, FI_LOG_CORE, + "Skip (%s) interface\n", ifa->ifa_name); + continue; + } - addr_entry->speed = 
ofi_ifaddr_get_speed(ifa); + addr_entry = calloc(1, sizeof(*addr_entry)); + if (!addr_entry) + continue; - slist_insert_before_first_match(addr_list, ofi_addr_list_entry_comp_speed, - &addr_entry->entry); + addr_entry->comm_caps = FI_LOCAL_COMM | FI_REMOTE_COMM; + memcpy(&addr_entry->ipaddr, ifa->ifa_addr, + ofi_sizeofaddr(ifa->ifa_addr)); + strncpy(addr_entry->ifa_name, ifa->ifa_name, + sizeof(addr_entry->ifa_name) - 1); + ofi_set_netmask_str(addr_entry->net_name, + sizeof(addr_entry->net_name), ifa); + + if (!inet_ntop(ifa->ifa_addr->sa_family, + ofi_get_ipaddr(ifa->ifa_addr), + addr_entry->ipstr, + sizeof(addr_entry->ipstr))) { + FI_DBG(prov, FI_LOG_CORE, + "inet_ntop failed: %d\n", errno); + free(addr_entry); + continue; } - freeifaddrs(ifaddrs); + addr_entry->speed = ofi_ifaddr_get_speed(ifa); + FI_INFO(prov, FI_LOG_CORE, "Available addr: %s, " + "iface name: %s, speed: %zu\n", + addr_entry->ipstr, ifa->ifa_name, addr_entry->speed); + + slist_insert_before_first_match(addr_list, ofi_addr_list_entry_comp_speed, + &addr_entry->entry); } + freeifaddrs(ifaddrs); + +insert_lo: /* Always add loopback address at the end */ ofi_insert_loopback_addr(prov, addr_list); } #elif defined HAVE_MIB_IPADDRTABLE -void ofi_get_list_of_addr(struct fi_provider *prov, const char *env_name, +void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, struct slist *addr_list) { struct ofi_addr_list_entry *addr_entry; @@ -1246,11 +1449,12 @@ void ofi_get_list_of_addr(struct fi_provider *prov, const char *env_name, for (i = 0; i < iptbl->dwNumEntries; i++) { if (iptbl->table[i].dwAddr && - (iptbl->table[i].dwAddr != ntohl(INADDR_LOOPBACK))) { + (iptbl->table[i].dwAddr != htonl(INADDR_LOOPBACK))) { addr_entry = calloc(1, sizeof(*addr_entry)); if (!addr_entry) break; + addr_entry->comm_caps = FI_LOCAL_COMM | FI_REMOTE_COMM; addr_entry->ipaddr.sin.sin_family = AF_INET; addr_entry->ipaddr.sin.sin_addr.s_addr = iptbl->table[i].dwAddr; @@ -1271,7 +1475,7 @@ void ofi_get_list_of_addr(struct fi_provider *prov, const char *env_name, #else /* !HAVE_MIB_IPADDRTABLE && !HAVE_MIB_IPADDRTABLE */ -void ofi_get_list_of_addr(struct fi_provider *prov, const char *env_name, +void ofi_get_list_of_addr(const struct fi_provider *prov, const char *env_name, struct slist *addr_list) { ofi_insert_loopback_addr(prov, addr_list); @@ -1561,3 +1765,54 @@ struct fid_nic *ofi_nic_dup(const struct fid_nic *nic) ofi_nic_close(&dup_nic->fid); return NULL; } + +/* + * Calculate bits per second based on verbs port active_speed and active_width. 
+ */ +size_t ofi_vrb_speed(uint8_t speed, uint8_t width) +{ + const size_t gbit_2_bit_coef = 1000 * 1000 * 1000; + size_t width_val, speed_val; + + switch (speed) { + case 1: + speed_val = (size_t) (2.5 * (float) gbit_2_bit_coef); + break; + case 2: + speed_val = 5 * gbit_2_bit_coef; + break; + case 4: + case 8: + speed_val = 8 * gbit_2_bit_coef; + break; + case 16: + speed_val = 14 * gbit_2_bit_coef; + break; + case 32: + speed_val = 25 * gbit_2_bit_coef; + break; + default: + speed_val = 0; + break; + } + + switch (width) { + case 1: + width_val = 1; + break; + case 2: + width_val = 4; + break; + case 4: + width_val = 8; + break; + case 8: + width_val = 12; + break; + default: + width_val = 0; + break; + } + + return width_val * speed_val; +} diff --git a/src/enosys.c b/src/enosys.c index 62348aa843d..32f1bcca84f 100644 --- a/src/enosys.c +++ b/src/enosys.c @@ -270,6 +270,11 @@ int fi_no_query_atomic(struct fid_domain *domain, enum fi_datatype datatype, { return -FI_ENOSYS; } +int fi_no_query_collective(struct fid_domain *domain, enum fi_collective_op coll, + struct fi_collective_attr *attr, uint64_t flags) +{ + return -FI_ENOSYS; +} /* * struct fi_ops_mr @@ -588,3 +593,67 @@ int fi_no_av_remove(struct fid_av *av, fi_addr_t *fi_addr, size_t count, { return -FI_ENOSYS; } + +ssize_t fi_coll_no_barrier(struct fid_ep *ep, fi_addr_t coll_addr, void *context) +{ + return -FI_ENOSYS; +} +ssize_t fi_coll_no_broadcast(struct fid_ep *ep, void *buf, size_t count, void *desc, + fi_addr_t coll_addr, fi_addr_t root_addr, + enum fi_datatype datatype, uint64_t flags, void *context) +{ + return -FI_ENOSYS; +} +ssize_t fi_coll_no_alltoall(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, uint64_t flags, void *context) +{ + return -FI_ENOSYS; +} +ssize_t fi_coll_no_allreduce(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, enum fi_op op, uint64_t flags, + void *context) +{ + return -FI_ENOSYS; +} +ssize_t fi_coll_no_allgather(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + enum fi_datatype datatype, uint64_t flags, void *context) +{ + return -FI_ENOSYS; +} +ssize_t fi_coll_no_reduce_scatter(struct fid_ep *ep, const void *buf, size_t count, + void *desc, void *result, void *result_desc, + fi_addr_t coll_addr, enum fi_datatype datatype, + enum fi_op op, uint64_t flags, void *context) +{ + return -FI_ENOSYS; +} +ssize_t fi_coll_no_reduce(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, enum fi_op op, + uint64_t flags, void *context) +{ + return -FI_ENOSYS; +} +ssize_t fi_coll_no_scatter(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, uint64_t flags, + void *context) +{ + return -FI_ENOSYS; +} +ssize_t fi_coll_no_gather(struct fid_ep *ep, const void *buf, size_t count, void *desc, + void *result, void *result_desc, fi_addr_t coll_addr, + fi_addr_t root_addr, enum fi_datatype datatype, uint64_t flags, + void *context) +{ + return -FI_ENOSYS; +} +ssize_t fi_coll_no_msg(struct fid_ep *ep, const struct fi_msg_collective *msg, + struct fi_ioc *resultv, void **result_desc, size_t result_count, + uint64_t flags) +{ + 
return -FI_ENOSYS; +} diff --git a/src/fabric.c b/src/fabric.c index bd1ade955aa..86734d7340c 100644 --- a/src/fabric.c +++ b/src/fabric.c @@ -2,6 +2,7 @@ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013-2017 Intel Corp., Inc. All rights reserved. + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -39,6 +40,7 @@ #include #include #include +#include #include #include "ofi_util.h" @@ -46,6 +48,7 @@ #include "shared/ofi_str.h" #include "ofi_prov.h" #include "ofi_perf.h" +#include "ofi_hmem.h" #ifdef HAVE_LIBDL #include @@ -56,6 +59,7 @@ struct ofi_prov { char *prov_name; struct fi_provider *provider; void *dlhandle; + bool hidden; }; static struct ofi_prov *prov_head, *prov_tail; @@ -101,6 +105,32 @@ static int ofi_find_core_name(char **names, const char *name) return -1; } +static void ofi_closest_prov_names(char *prov_name, char* miss_prov_name, int n) +{ + if (strncasecmp( prov_name, miss_prov_name, n ) == 0 ) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Instead misspelled provider: %s, you may want: %s?\n", + miss_prov_name, prov_name); + } +} + +static void ofi_suggest_prov_names(char *name_to_match) +{ + struct ofi_prov *prov; + for (prov = prov_head; prov; prov = prov->next) { + if (strlen(prov->prov_name) != strlen(name_to_match) + && !strncasecmp(prov->prov_name, name_to_match, + strlen(name_to_match))) { + if (strlen(name_to_match) > 5) + ofi_closest_prov_names(prov->prov_name, + name_to_match, 5); + else + ofi_closest_prov_names(prov->prov_name, + name_to_match, 2); + } + } +} + static enum ofi_prov_type ofi_prov_type(const struct fi_provider *provider) { const struct fi_prov_context *ctx; @@ -108,6 +138,13 @@ static enum ofi_prov_type ofi_prov_type(const struct fi_provider *provider) return ctx->type; } +static int ofi_disable_util_layering(const struct fi_provider *provider) { + const struct fi_prov_context *ctx; + + ctx = (const struct fi_prov_context *) &provider->context; + return ctx->disable_layering; +} + static int ofi_is_util_prov(const struct fi_provider *provider) { return ofi_prov_type(provider) == OFI_PROV_UTIL; @@ -234,7 +271,7 @@ static struct ofi_prov *ofi_getprov(const char *prov_name, size_t len) for (prov = prov_head; prov; prov = prov->next) { if ((strlen(prov->prov_name) == len) && - !strncmp(prov->prov_name, prov_name, len)) + !strncasecmp(prov->prov_name, prov_name, len)) return prov; } @@ -313,6 +350,8 @@ static struct ofi_prov *ofi_create_prov_entry(const char *prov_name) prov_head = prov; prov_tail = prov; + prov->hidden = false; + return prov; } @@ -323,15 +362,15 @@ static struct ofi_prov *ofi_create_prov_entry(const char *prov_name) static void ofi_ordered_provs_init(void) { char *ordered_prov_names[] = { - "psm2", "psm", "efa", "usnic", "gni", "bgq", "verbs", - "netdir", "ofi_rxm", "ofi_rxd", "shm", "mlx", + "psm3", "psm2", "psm", "efa", "usnic", "gni", "bgq", "verbs", + "netdir", "ofi_rxm", "ofi_rxd", "shm", /* Initialize the socket based providers last of the * standard providers. This will result in them being * the least preferred providers. */ /* Before you add ANYTHING here, read the comment above!!! */ - "UDP", "tcp", "sockets", /* NOTHING GOES HERE! */ + "udp", "tcp", "sockets", /* NOTHING GOES HERE! */ /* Seriously, read it! */ /* These are hooking providers only. 
Their order @@ -356,16 +395,15 @@ static void ofi_set_prov_type(struct fi_prov_context *ctx, ctx->type = OFI_PROV_CORE; } -static int ofi_register_provider(struct fi_provider *provider, void *dlhandle) +static void ofi_register_provider(struct fi_provider *provider, void *dlhandle) { struct fi_prov_context *ctx; struct ofi_prov *prov = NULL; - int ret; + bool hidden = false; if (!provider || !provider->name) { - FI_WARN(&core_prov, FI_LOG_CORE, - "no provider structure or name\n"); - ret = -FI_EINVAL; + FI_DBG(&core_prov, FI_LOG_CORE, + "no provider structure or name\n"); goto cleanup; } @@ -376,7 +414,6 @@ static int ofi_register_provider(struct fi_provider *provider, void *dlhandle) if (!provider->fabric) { FI_WARN(&core_prov, FI_LOG_CORE, "provider missing mandatory entry points\n"); - ret = -FI_EINVAL; goto cleanup; } @@ -391,8 +428,6 @@ static int ofi_register_provider(struct fi_provider *provider, void *dlhandle) FI_MAJOR(provider->fi_version), FI_MINOR(provider->fi_version), FI_MAJOR_VERSION, FI_MINOR_VERSION); - - ret = -FI_ENOSYS; goto cleanup; } @@ -403,13 +438,22 @@ static int ofi_register_provider(struct fi_provider *provider, void *dlhandle) FI_INFO(&core_prov, FI_LOG_CORE, "\"%s\" filtered by provider include/exclude " "list, skipping\n", provider->name); - ret = -FI_ENODEV; - goto cleanup; + hidden = true; } if (ofi_apply_filter(&prov_log_filter, provider->name)) ctx->disable_logging = 1; + /* + * Prevent utility providers from layering on these core providers + * unless explicitly requested. + */ + if (!strcasecmp(provider->name, "sockets") || + !strcasecmp(provider->name, "shm") || + !strcasecmp(provider->name, "efa") || + !strcasecmp(provider->name, "psm3") || ofi_is_util_prov(provider)) + ctx->disable_layering = 1; + prov = ofi_getprov(provider->name, strlen(provider->name)); if (prov) { /* If this provider has not been init yet, then we add the @@ -425,7 +469,6 @@ static int ofi_register_provider(struct fi_provider *provider, void *dlhandle) FI_INFO(&core_prov, FI_LOG_CORE, "a newer %s provider was already loaded; " "ignoring this one\n", provider->name); - ret = -FI_EALREADY; goto cleanup; } @@ -440,20 +483,20 @@ static int ofi_register_provider(struct fi_provider *provider, void *dlhandle) cleanup_provider(prov->provider, prov->dlhandle); } else { prov = ofi_create_prov_entry(provider->name); - if (!prov) { - ret = -FI_EOTHER; + if (!prov) goto cleanup; - } } + if (hidden) + prov->hidden = true; + update_prov_registry: prov->dlhandle = dlhandle; prov->provider = provider; - return 0; + return; cleanup: cleanup_provider(provider, dlhandle); - return ret; } #ifdef HAVE_LIBDL @@ -469,6 +512,32 @@ static int lib_filter(const struct dirent *entry) } #endif +static int verify_filter_names(char **names) +{ + int i, j; + char** split_names; + for (i = 0; names[i]; i++) { + split_names = ofi_split_and_alloc(names[i], ";", NULL); + if (!split_names) { + FI_WARN(&core_prov, FI_LOG_CORE, + "unable to parse given filter string\n"); + return -FI_ENODATA; + } + + for(j = 0; split_names[j]; j++) { + if(!ofi_getprov(split_names[j], strlen(split_names[j]))) { + FI_WARN(&core_prov, FI_LOG_CORE, + "provider %s is unknown, misspelled" + " or DL provider?\n", split_names[j]); + ofi_suggest_prov_names(split_names[j]); + } + } + ofi_free_string_array(split_names); + } + + return FI_SUCCESS; +} + void ofi_free_filter(struct fi_filter *filter) { ofi_free_string_array(filter->names); @@ -485,20 +554,47 @@ void ofi_create_filter(struct fi_filter *filter, const char *raw_filter) ++raw_filter; } - 
filter->names= ofi_split_and_alloc(raw_filter, ",", NULL); - if (!filter->names) + filter->names = ofi_split_and_alloc(raw_filter, ",", NULL); + if (!filter->names) { FI_WARN(&core_prov, FI_LOG_CORE, "unable to parse filter from: %s\n", raw_filter); + return; + } + + if (verify_filter_names(filter->names)) + FI_WARN(&core_prov, FI_LOG_CORE, + "unable to verify filter name\n"); } #ifdef HAVE_LIBDL +static void ofi_reg_dl_prov(const char *lib) +{ + void *dlhandle; + struct fi_provider* (*inif)(void); + + FI_DBG(&core_prov, FI_LOG_CORE, "opening provider lib %s\n", lib); + + dlhandle = dlopen(lib, RTLD_NOW); + if (dlhandle == NULL) { + FI_DBG(&core_prov, FI_LOG_CORE, + "dlopen(%s): %s\n", lib, dlerror()); + return; + } + + inif = dlsym(dlhandle, "fi_prov_ini"); + if (inif == NULL) { + FI_WARN(&core_prov, FI_LOG_CORE, "dlsym: %s\n", dlerror()); + dlclose(dlhandle); + } else { + ofi_register_provider((inif)(), dlhandle); + } +} + static void ofi_ini_dir(const char *dir) { int n = 0; char *lib; - void *dlhandle; struct dirent **liblist = NULL; - struct fi_provider* (*inif)(void); n = scandir(dir, &liblist, lib_filter, NULL); if (n < 0) @@ -510,25 +606,10 @@ static void ofi_ini_dir(const char *dir) "asprintf failed to allocate memory\n"); goto libdl_done; } - FI_DBG(&core_prov, FI_LOG_CORE, "opening provider lib %s\n", lib); + ofi_reg_dl_prov(lib); - dlhandle = dlopen(lib, RTLD_NOW); free(liblist[n]); - if (dlhandle == NULL) { - FI_WARN(&core_prov, FI_LOG_CORE, - "dlopen(%s): %s\n", lib, dlerror()); - free(lib); - continue; - } free(lib); - - inif = dlsym(dlhandle, "fi_prov_ini"); - if (inif == NULL) { - FI_WARN(&core_prov, FI_LOG_CORE, "dlsym: %s\n", dlerror()); - dlclose(dlhandle); - } else { - ofi_register_provider((inif)(), dlhandle); - } } libdl_done: @@ -536,6 +617,39 @@ static void ofi_ini_dir(const char *dir) free(liblist[n]); free(liblist); } + +/* Search standard system library paths (i.e. LD_LIBRARY_PATH) for known DL provider + * libraries. + */ +static void ofi_find_prov_libs(void) +{ + const char* lib_prefix = "lib"; + struct ofi_prov *prov; + char* lib; + char* short_prov_name; + + for (prov = prov_head; prov; prov = prov->next) { + + if (!prov->prov_name) + continue; + + if (ofi_has_util_prefix(prov->prov_name)) { + short_prov_name = prov->prov_name + strlen(OFI_UTIL_PREFIX); + } else { + short_prov_name = prov->prov_name; + } + + if (asprintf(&lib, "%s%s%s%s", lib_prefix, + short_prov_name, "-", FI_LIB_SUFFIX) < 0) { + FI_WARN(&core_prov, FI_LOG_CORE, + "asprintf failed to allocate memory\n"); + continue; + } + + ofi_reg_dl_prov(lib); + free(lib); + } +} #endif void fi_ini(void) @@ -555,7 +669,8 @@ void fi_ini(void) ofi_pmem_init(); ofi_perf_init(); ofi_hook_init(); - ofi_monitor_init(); + ofi_hmem_init(); + ofi_monitors_init(); fi_param_define(NULL, "provider", FI_PARAM_STRING, "Only use specified provider (default: all available)"); @@ -568,7 +683,8 @@ void fi_ini(void) "Defines the maximum number of processes that will be" " used by distribute OFI application. 
The provider uses" " this to optimize resource allocations" - " (default: OFI service specific)"); + " (default: provider specific)"); + fi_param_get_size_t(NULL, "universe_size", &ofi_universe_size); fi_param_get_str(NULL, "provider", ¶m_val); ofi_create_filter(&prov_filter, param_val); @@ -590,9 +706,10 @@ void fi_ini(void) "Search for providers in specific path (default: " PROVDLDIR ")"); fi_param_get_str(NULL, "provider_path", &provdir); - if (!provdir) + if (!provdir) { provdir = PROVDLDIR; - + ofi_find_prov_libs(); + } dirs = ofi_split_and_alloc(provdir, ":", NULL); if (dirs) { for (n = 0; dirs[n]; ++n) { @@ -603,10 +720,10 @@ void fi_ini(void) libdl_done: #endif + ofi_register_provider(PSM3_INIT, NULL); ofi_register_provider(PSM2_INIT, NULL); ofi_register_provider(PSM_INIT, NULL); ofi_register_provider(USNIC_INIT, NULL); - ofi_register_provider(MLX_INIT, NULL); ofi_register_provider(GNI_INIT, NULL); ofi_register_provider(BGQ_INIT, NULL); ofi_register_provider(NETDIR_INIT, NULL); @@ -647,7 +764,8 @@ FI_DESTRUCTOR(fi_fini(void)) } ofi_free_filter(&prov_filter); - ofi_monitor_cleanup(); + ofi_monitors_cleanup(); + ofi_hmem_cleanup(); ofi_mem_fini(); fi_log_fini(); fi_param_fini(); @@ -687,7 +805,7 @@ void DEFAULT_SYMVER_PRE(fi_freeinfo)(struct fi_info *info) free(info); } } -CURRENT_SYMVER(fi_freeinfo_, fi_freeinfo); +DEFAULT_SYMVER(fi_freeinfo_, fi_freeinfo, FABRIC_1.3); /* * Make a dummy info object for each provider, and copy in the @@ -760,8 +878,9 @@ static void ofi_set_prov_attr(struct fi_fabric_attr *attr, * 1b. If a utility provider is specified, return it over any* core provider. * 1c. If a core provider is specified, return any utility provider that can * layer over it, plus the core provider itself, if possible. - * 1d. A utility provider will not layer over the sockets provider unless the - * user explicitly requests that combination. + * 1d. A utility provider will not layer over a provider that has disabled + * utility provider layering unless the user explicitly requests that + * combination. * 1e. OFI_CORE_PROV_ONLY flag prevents utility providers layering over other * utility providers. * 2. 
If both the providers are utility providers or if more than two providers @@ -775,6 +894,7 @@ static int ofi_layering_ok(const struct fi_provider *provider, uint64_t flags) { char *prov_name; + struct ofi_prov *core_ofi_prov; int i; /* Excluded providers must be at the end */ @@ -796,9 +916,9 @@ static int ofi_layering_ok(const struct fi_provider *provider, return 0; } - if ((count == 0) && !strcasecmp(provider->name, "sockets")) { + if ((count == 0) && ofi_disable_util_layering(provider)) { FI_INFO(&core_prov, FI_LOG_CORE, - "Skipping util;sockets layering\n"); + "Skipping util;%s layering\n", provider->name); return 0; } } @@ -813,13 +933,15 @@ static int ofi_layering_ok(const struct fi_provider *provider, if ((count == 1) && ofi_is_util_prov(provider) && !ofi_has_util_prefix(prov_vec[0])) { - if (!strcasecmp(prov_vec[0], "sockets")) { + core_ofi_prov = ofi_getprov(prov_vec[0], strlen(prov_vec[0])); + if (core_ofi_prov && core_ofi_prov->provider && + ofi_disable_util_layering(core_ofi_prov->provider)) { FI_INFO(&core_prov, FI_LOG_CORE, - "Sockets requested, skipping util layering\n"); + "Skipping %s;%s layering\n", prov_vec[0], + provider->name); return 0; - } else { - return 1; } + return 1; } if ((count == 2) && ofi_has_util_prefix(prov_vec[0]) && @@ -840,6 +962,7 @@ int DEFAULT_SYMVER_PRE(fi_getinfo)(uint32_t version, const char *node, struct fi_info *tail, *cur; char **prov_vec = NULL; size_t count = 0; + enum fi_log_level level; int ret; if (!ofi_init) @@ -869,6 +992,9 @@ int DEFAULT_SYMVER_PRE(fi_getinfo)(uint32_t version, const char *node, if (!prov->provider || !prov->provider->getinfo) continue; + if (prov->hidden && !(flags & OFI_GETINFO_HIDDEN)) + continue; + if (!ofi_layering_ok(prov->provider, prov_vec, count, flags)) continue; @@ -882,10 +1008,15 @@ int DEFAULT_SYMVER_PRE(fi_getinfo)(uint32_t version, const char *node, continue; } + cur = NULL; ret = prov->provider->getinfo(version, node, service, flags, hints, &cur); if (ret) { - FI_WARN(&core_prov, FI_LOG_CORE, + level = ((hints && hints->fabric_attr && + hints->fabric_attr->prov_name) ? + FI_LOG_WARN : FI_LOG_INFO); + + FI_LOG(&core_prov, level, FI_LOG_CORE, "fi_getinfo: provider %s returned -%d (%s)\n", prov->provider->name, -ret, fi_strerror(-ret)); continue; @@ -898,6 +1029,9 @@ int DEFAULT_SYMVER_PRE(fi_getinfo)(uint32_t version, const char *node, continue; } + FI_DBG(&core_prov, FI_LOG_CORE, "fi_getinfo: provider %s " + "returned success\n", prov->provider->name); + if (!*info) *info = cur; else @@ -912,12 +1046,13 @@ int DEFAULT_SYMVER_PRE(fi_getinfo)(uint32_t version, const char *node, } ofi_free_string_array(prov_vec); - if (!(flags & (OFI_CORE_PROV_ONLY | OFI_GETINFO_INTERNAL))) + if (!(flags & (OFI_CORE_PROV_ONLY | OFI_GETINFO_INTERNAL | + OFI_GETINFO_HIDDEN))) ofi_filter_info(info); return *info ? 
0 : -FI_ENODATA; } -CURRENT_SYMVER(fi_getinfo_, fi_getinfo); +DEFAULT_SYMVER(fi_getinfo_, fi_getinfo, FABRIC_1.3); struct fi_info *ofi_allocinfo_internal(void) { @@ -1048,7 +1183,7 @@ struct fi_info *DEFAULT_SYMVER_PRE(fi_dupinfo)(const struct fi_info *info) fi_freeinfo(dup); return NULL; } -CURRENT_SYMVER(fi_dupinfo_, fi_dupinfo); +DEFAULT_SYMVER(fi_dupinfo_, fi_dupinfo, FABRIC_1.3); __attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) int DEFAULT_SYMVER_PRE(fi_fabric)(struct fi_fabric_attr *attr, diff --git a/src/fi_tostr.c b/src/fi_tostr.c index 8c99cb12c8a..9a9185675b1 100644 --- a/src/fi_tostr.c +++ b/src/fi_tostr.c @@ -47,6 +47,8 @@ #include #include #include +#include + /* Print fi_info and related structs, enums, OR_able flags, addresses. * @@ -68,706 +70,792 @@ * fi_info->caps : ofi_tostr_caps(..., typeof(caps), ...) */ -#define OFI_BUFSIZ 8192 -static void ofi_tostr_fid(const char *label, char *buf, const struct fid *fid) +static void +ofi_tostr_fid(const char *label, char *buf, size_t len, const struct fid *fid) { if (!fid || !FI_CHECK_OP(fid->ops, struct fi_ops, tostr)) - ofi_strcatf(buf, "%s%p\n", label, fid); + ofi_strncatf(buf, len, "%s%p\n", label, fid); else - fid->ops->tostr(fid, buf, OFI_BUFSIZ - strnlen(buf, OFI_BUFSIZ)); + fid->ops->tostr(fid, buf, len - strnlen(buf, len)); } -static void ofi_tostr_opflags(char *buf, uint64_t flags) +static void ofi_tostr_opflags(char *buf, size_t len, uint64_t flags) { - IFFLAGSTR(flags, FI_MULTICAST); - - IFFLAGSTR(flags, FI_MULTI_RECV); - IFFLAGSTR(flags, FI_REMOTE_CQ_DATA); - IFFLAGSTR(flags, FI_MORE); - IFFLAGSTR(flags, FI_PEEK); - IFFLAGSTR(flags, FI_TRIGGER); - IFFLAGSTR(flags, FI_FENCE); - - IFFLAGSTR(flags, FI_COMPLETION); - IFFLAGSTR(flags, FI_INJECT); - IFFLAGSTR(flags, FI_INJECT_COMPLETE); - IFFLAGSTR(flags, FI_TRANSMIT_COMPLETE); - IFFLAGSTR(flags, FI_DELIVERY_COMPLETE); - IFFLAGSTR(flags, FI_AFFINITY); - - IFFLAGSTR(flags, FI_CLAIM); - IFFLAGSTR(flags, FI_DISCARD); + IFFLAGSTRN(flags, FI_MULTICAST, len); + + IFFLAGSTRN(flags, FI_MULTI_RECV, len); + IFFLAGSTRN(flags, FI_REMOTE_CQ_DATA, len); + IFFLAGSTRN(flags, FI_MORE, len); + IFFLAGSTRN(flags, FI_PEEK, len); + IFFLAGSTRN(flags, FI_TRIGGER, len); + IFFLAGSTRN(flags, FI_FENCE, len); + + IFFLAGSTRN(flags, FI_COMPLETION, len); + IFFLAGSTRN(flags, FI_INJECT, len); + IFFLAGSTRN(flags, FI_INJECT_COMPLETE, len); + IFFLAGSTRN(flags, FI_TRANSMIT_COMPLETE, len); + IFFLAGSTRN(flags, FI_DELIVERY_COMPLETE, len); + IFFLAGSTRN(flags, FI_MATCH_COMPLETE, len); + IFFLAGSTRN(flags, FI_AFFINITY, len); + + IFFLAGSTRN(flags, FI_CLAIM, len); + IFFLAGSTRN(flags, FI_DISCARD, len); ofi_remove_comma(buf); } -static void oofi_tostr_addr_format(char *buf, uint32_t addr_format) +static void ofi_tostr_addr_format(char *buf, size_t len, uint32_t addr_format) { switch (addr_format) { - CASEENUMSTR(FI_FORMAT_UNSPEC); - CASEENUMSTR(FI_SOCKADDR); - CASEENUMSTR(FI_SOCKADDR_IN); - CASEENUMSTR(FI_SOCKADDR_IN6); - CASEENUMSTR(FI_SOCKADDR_IB); - CASEENUMSTR(FI_ADDR_PSMX); - CASEENUMSTR(FI_ADDR_PSMX2); - CASEENUMSTR(FI_ADDR_GNI); - CASEENUMSTR(FI_ADDR_BGQ); - CASEENUMSTR(FI_ADDR_MLX); - CASEENUMSTR(FI_ADDR_STR); - CASEENUMSTR(FI_ADDR_IB_UD); - CASEENUMSTR(FI_ADDR_EFA); + CASEENUMSTRN(FI_FORMAT_UNSPEC, len); + CASEENUMSTRN(FI_SOCKADDR, len); + CASEENUMSTRN(FI_SOCKADDR_IN, len); + CASEENUMSTRN(FI_SOCKADDR_IN6, len); + CASEENUMSTRN(FI_SOCKADDR_IB, len); + CASEENUMSTRN(FI_ADDR_PSMX, len); + CASEENUMSTRN(FI_ADDR_PSMX2, len); + CASEENUMSTRN(FI_ADDR_GNI, len); + CASEENUMSTRN(FI_ADDR_BGQ, len); + 
CASEENUMSTRN(FI_ADDR_MLX, len); + CASEENUMSTRN(FI_ADDR_STR, len); + CASEENUMSTRN(FI_ADDR_IB_UD, len); + CASEENUMSTRN(FI_ADDR_EFA, len); + CASEENUMSTRN(FI_ADDR_PSMX3, len); default: if (addr_format & FI_PROV_SPECIFIC) - ofi_strcatf(buf, "Provider specific"); + ofi_strncatf(buf, len, "Provider specific"); else - ofi_strcatf(buf, "Unknown"); + ofi_strncatf(buf, len, "Unknown"); break; } } -static void ofi_tostr_progress(char *buf, enum fi_progress progress) +static void ofi_tostr_progress(char *buf, size_t len, enum fi_progress progress) { switch (progress) { - CASEENUMSTR(FI_PROGRESS_UNSPEC); - CASEENUMSTR(FI_PROGRESS_AUTO); - CASEENUMSTR(FI_PROGRESS_MANUAL); + CASEENUMSTRN(FI_PROGRESS_UNSPEC, len); + CASEENUMSTRN(FI_PROGRESS_AUTO, len); + CASEENUMSTRN(FI_PROGRESS_MANUAL, len); default: - ofi_strcatf(buf, "Unknown"); + ofi_strncatf(buf, len, "Unknown"); break; } } -static void ofi_tostr_threading(char *buf, enum fi_threading threading) +static void +ofi_tostr_threading(char *buf, size_t len, enum fi_threading threading) { switch (threading) { - CASEENUMSTR(FI_THREAD_UNSPEC); - CASEENUMSTR(FI_THREAD_SAFE); - CASEENUMSTR(FI_THREAD_FID); - CASEENUMSTR(FI_THREAD_DOMAIN); - CASEENUMSTR(FI_THREAD_COMPLETION); - CASEENUMSTR(FI_THREAD_ENDPOINT); + CASEENUMSTRN(FI_THREAD_UNSPEC, len); + CASEENUMSTRN(FI_THREAD_SAFE, len); + CASEENUMSTRN(FI_THREAD_FID, len); + CASEENUMSTRN(FI_THREAD_DOMAIN, len); + CASEENUMSTRN(FI_THREAD_COMPLETION, len); + CASEENUMSTRN(FI_THREAD_ENDPOINT, len); default: - ofi_strcatf(buf, "Unknown"); + ofi_strncatf(buf, len, "Unknown"); break; } } -static void ofi_tostr_msgorder(char *buf, uint64_t flags) +static void ofi_tostr_msgorder(char *buf, size_t len, uint64_t flags) { - IFFLAGSTR(flags, FI_ORDER_RAR); - IFFLAGSTR(flags, FI_ORDER_RAW); - IFFLAGSTR(flags, FI_ORDER_RAS); - IFFLAGSTR(flags, FI_ORDER_WAR); - IFFLAGSTR(flags, FI_ORDER_WAW); - IFFLAGSTR(flags, FI_ORDER_WAS); - IFFLAGSTR(flags, FI_ORDER_SAR); - IFFLAGSTR(flags, FI_ORDER_SAW); - IFFLAGSTR(flags, FI_ORDER_SAS); - IFFLAGSTR(flags, FI_ORDER_RMA_RAR); - IFFLAGSTR(flags, FI_ORDER_RMA_RAW); - IFFLAGSTR(flags, FI_ORDER_RMA_WAR); - IFFLAGSTR(flags, FI_ORDER_RMA_WAW); - IFFLAGSTR(flags, FI_ORDER_ATOMIC_RAR); - IFFLAGSTR(flags, FI_ORDER_ATOMIC_RAW); - IFFLAGSTR(flags, FI_ORDER_ATOMIC_WAR); - IFFLAGSTR(flags, FI_ORDER_ATOMIC_WAW); + IFFLAGSTRN(flags, FI_ORDER_RAR, len); + IFFLAGSTRN(flags, FI_ORDER_RAW, len); + IFFLAGSTRN(flags, FI_ORDER_RAS, len); + IFFLAGSTRN(flags, FI_ORDER_WAR, len); + IFFLAGSTRN(flags, FI_ORDER_WAW, len); + IFFLAGSTRN(flags, FI_ORDER_WAS, len); + IFFLAGSTRN(flags, FI_ORDER_SAR, len); + IFFLAGSTRN(flags, FI_ORDER_SAW, len); + IFFLAGSTRN(flags, FI_ORDER_SAS, len); + IFFLAGSTRN(flags, FI_ORDER_RMA_RAR, len); + IFFLAGSTRN(flags, FI_ORDER_RMA_RAW, len); + IFFLAGSTRN(flags, FI_ORDER_RMA_WAR, len); + IFFLAGSTRN(flags, FI_ORDER_RMA_WAW, len); + IFFLAGSTRN(flags, FI_ORDER_ATOMIC_RAR, len); + IFFLAGSTRN(flags, FI_ORDER_ATOMIC_RAW, len); + IFFLAGSTRN(flags, FI_ORDER_ATOMIC_WAR, len); + IFFLAGSTRN(flags, FI_ORDER_ATOMIC_WAW, len); ofi_remove_comma(buf); } -static void ofi_tostr_comporder(char *buf, uint64_t flags) +static void ofi_tostr_comporder(char *buf, size_t len, uint64_t flags) { if ((flags & FI_ORDER_STRICT) == FI_ORDER_NONE) { - ofi_strcatf(buf, "FI_ORDER_NONE, "); + ofi_strncatf(buf, len, "FI_ORDER_NONE, "); } else if ((flags & FI_ORDER_STRICT) == FI_ORDER_STRICT) { - ofi_strcatf(buf, "FI_ORDER_STRICT, "); + ofi_strncatf(buf, len, "FI_ORDER_STRICT, "); } - IFFLAGSTR(flags, FI_ORDER_DATA); + IFFLAGSTRN(flags, 
FI_ORDER_DATA, len); ofi_remove_comma(buf); } -static void ofi_tostr_caps(char *buf, uint64_t caps) +static void ofi_tostr_caps(char *buf, size_t len, uint64_t caps) { - IFFLAGSTR(caps, FI_MSG); - IFFLAGSTR(caps, FI_RMA); - IFFLAGSTR(caps, FI_TAGGED); - IFFLAGSTR(caps, FI_ATOMIC); - IFFLAGSTR(caps, FI_MULTICAST); - IFFLAGSTR(caps, FI_COLLECTIVE); - - IFFLAGSTR(caps, FI_READ); - IFFLAGSTR(caps, FI_WRITE); - IFFLAGSTR(caps, FI_RECV); - IFFLAGSTR(caps, FI_SEND); - IFFLAGSTR(caps, FI_REMOTE_READ); - IFFLAGSTR(caps, FI_REMOTE_WRITE); - - IFFLAGSTR(caps, FI_MULTI_RECV); - IFFLAGSTR(caps, FI_REMOTE_CQ_DATA); - IFFLAGSTR(caps, FI_TRIGGER); - IFFLAGSTR(caps, FI_FENCE); - - IFFLAGSTR(caps, FI_VARIABLE_MSG); - IFFLAGSTR(caps, FI_RMA_PMEM); - IFFLAGSTR(caps, FI_SOURCE_ERR); - IFFLAGSTR(caps, FI_LOCAL_COMM); - IFFLAGSTR(caps, FI_REMOTE_COMM); - IFFLAGSTR(caps, FI_SHARED_AV); - IFFLAGSTR(caps, FI_RMA_EVENT); - IFFLAGSTR(caps, FI_SOURCE); - IFFLAGSTR(caps, FI_NAMED_RX_CTX); - IFFLAGSTR(caps, FI_DIRECTED_RECV); + IFFLAGSTRN(caps, FI_MSG, len); + IFFLAGSTRN(caps, FI_RMA, len); + IFFLAGSTRN(caps, FI_TAGGED, len); + IFFLAGSTRN(caps, FI_ATOMIC, len); + IFFLAGSTRN(caps, FI_MULTICAST, len); + IFFLAGSTRN(caps, FI_COLLECTIVE, len); + + IFFLAGSTRN(caps, FI_READ, len); + IFFLAGSTRN(caps, FI_WRITE, len); + IFFLAGSTRN(caps, FI_RECV, len); + IFFLAGSTRN(caps, FI_SEND, len); + IFFLAGSTRN(caps, FI_REMOTE_READ, len); + IFFLAGSTRN(caps, FI_REMOTE_WRITE, len); + + IFFLAGSTRN(caps, FI_MULTI_RECV, len); + IFFLAGSTRN(caps, FI_REMOTE_CQ_DATA, len); + IFFLAGSTRN(caps, FI_TRIGGER, len); + IFFLAGSTRN(caps, FI_FENCE, len); + + IFFLAGSTRN(caps, FI_VARIABLE_MSG, len); + IFFLAGSTRN(caps, FI_RMA_PMEM, len); + IFFLAGSTRN(caps, FI_SOURCE_ERR, len); + IFFLAGSTRN(caps, FI_LOCAL_COMM, len); + IFFLAGSTRN(caps, FI_REMOTE_COMM, len); + IFFLAGSTRN(caps, FI_SHARED_AV, len); + IFFLAGSTRN(caps, FI_RMA_EVENT, len); + IFFLAGSTRN(caps, FI_SOURCE, len); + IFFLAGSTRN(caps, FI_NAMED_RX_CTX, len); + IFFLAGSTRN(caps, FI_DIRECTED_RECV, len); + IFFLAGSTRN(caps, FI_HMEM, len); ofi_remove_comma(buf); } -static void ofi_tostr_ep_type(char *buf, enum fi_ep_type ep_type) +static void ofi_tostr_ep_type(char *buf, size_t len, enum fi_ep_type ep_type) { switch (ep_type) { - CASEENUMSTR(FI_EP_UNSPEC); - CASEENUMSTR(FI_EP_MSG); - CASEENUMSTR(FI_EP_DGRAM); - CASEENUMSTR(FI_EP_RDM); - CASEENUMSTR(FI_EP_SOCK_STREAM); - CASEENUMSTR(FI_EP_SOCK_DGRAM); + CASEENUMSTRN(FI_EP_UNSPEC, len); + CASEENUMSTRN(FI_EP_MSG, len); + CASEENUMSTRN(FI_EP_DGRAM, len); + CASEENUMSTRN(FI_EP_RDM, len); + CASEENUMSTRN(FI_EP_SOCK_STREAM, len); + CASEENUMSTRN(FI_EP_SOCK_DGRAM, len); default: - ofi_strcatf(buf, "Unknown"); + ofi_strncatf(buf, len, "Unknown"); break; } } -static void ofi_tostr_protocol(char *buf, uint32_t protocol) +static void ofi_tostr_protocol(char *buf, size_t len, uint32_t protocol) { switch (protocol) { - CASEENUMSTR(FI_PROTO_UNSPEC); - CASEENUMSTR(FI_PROTO_RDMA_CM_IB_RC); - CASEENUMSTR(FI_PROTO_IWARP); - CASEENUMSTR(FI_PROTO_IB_UD); - CASEENUMSTR(FI_PROTO_PSMX); - CASEENUMSTR(FI_PROTO_PSMX2); - CASEENUMSTR(FI_PROTO_UDP); - CASEENUMSTR(FI_PROTO_SOCK_TCP); - CASEENUMSTR(FI_PROTO_IB_RDM); - CASEENUMSTR(FI_PROTO_IWARP_RDM); - CASEENUMSTR(FI_PROTO_GNI); - CASEENUMSTR(FI_PROTO_RXM); - CASEENUMSTR(FI_PROTO_RXD); - CASEENUMSTR(FI_PROTO_MLX); - CASEENUMSTR(FI_PROTO_NETWORKDIRECT); - CASEENUMSTR(FI_PROTO_SHM); - CASEENUMSTR(FI_PROTO_RSTREAM); - CASEENUMSTR(FI_PROTO_RDMA_CM_IB_XRC); - CASEENUMSTR(FI_PROTO_EFA); + CASEENUMSTRN(FI_PROTO_UNSPEC, len); + 
CASEENUMSTRN(FI_PROTO_RDMA_CM_IB_RC, len); + CASEENUMSTRN(FI_PROTO_IWARP, len); + CASEENUMSTRN(FI_PROTO_IB_UD, len); + CASEENUMSTRN(FI_PROTO_PSMX, len); + CASEENUMSTRN(FI_PROTO_PSMX2, len); + CASEENUMSTRN(FI_PROTO_UDP, len); + CASEENUMSTRN(FI_PROTO_SOCK_TCP, len); + CASEENUMSTRN(FI_PROTO_IB_RDM, len); + CASEENUMSTRN(FI_PROTO_IWARP_RDM, len); + CASEENUMSTRN(FI_PROTO_GNI, len); + CASEENUMSTRN(FI_PROTO_RXM, len); + CASEENUMSTRN(FI_PROTO_RXD, len); + CASEENUMSTRN(FI_PROTO_MLX, len); + CASEENUMSTRN(FI_PROTO_NETWORKDIRECT, len); + CASEENUMSTRN(FI_PROTO_SHM, len); + CASEENUMSTRN(FI_PROTO_RSTREAM, len); + CASEENUMSTRN(FI_PROTO_RDMA_CM_IB_XRC, len); + CASEENUMSTRN(FI_PROTO_EFA, len); + CASEENUMSTRN(FI_PROTO_PSMX3, len); default: if (protocol & FI_PROV_SPECIFIC) - ofi_strcatf(buf, "Provider specific"); + ofi_strncatf(buf, len, "Provider specific"); else - ofi_strcatf(buf, "Unknown"); + ofi_strncatf(buf, len, "Unknown"); break; } } -static void ofi_tostr_mode(char *buf, uint64_t mode) +static void ofi_tostr_mode(char *buf, size_t len, uint64_t mode) { - IFFLAGSTR(mode, FI_CONTEXT); - IFFLAGSTR(mode, FI_MSG_PREFIX); - IFFLAGSTR(mode, FI_ASYNC_IOV); - IFFLAGSTR(mode, FI_RX_CQ_DATA); - IFFLAGSTR(mode, FI_LOCAL_MR); - IFFLAGSTR(mode, FI_NOTIFY_FLAGS_ONLY); - IFFLAGSTR(mode, FI_RESTRICTED_COMP); - IFFLAGSTR(mode, FI_CONTEXT2); - IFFLAGSTR(mode, FI_BUFFERED_RECV); + IFFLAGSTRN(mode, FI_CONTEXT, len); + IFFLAGSTRN(mode, FI_MSG_PREFIX, len); + IFFLAGSTRN(mode, FI_ASYNC_IOV, len); + IFFLAGSTRN(mode, FI_RX_CQ_DATA, len); + IFFLAGSTRN(mode, FI_LOCAL_MR, len); + IFFLAGSTRN(mode, FI_NOTIFY_FLAGS_ONLY, len); + IFFLAGSTRN(mode, FI_RESTRICTED_COMP, len); + IFFLAGSTRN(mode, FI_CONTEXT2, len); + IFFLAGSTRN(mode, FI_BUFFERED_RECV, len); ofi_remove_comma(buf); } -static void ofi_tostr_addr(char *buf, uint32_t addr_format, void *addr) +static void +ofi_tostr_addr(char *buf, size_t len, uint32_t addr_format, void *addr) { char *p; - size_t len; + size_t addrlen; p = buf + strlen(buf); + addrlen = len - strlen(buf); if (addr == NULL) { - ofi_strcatf(p, "(null)"); + ofi_strncatf(p, addrlen, "(null)"); return; } - len = 64; - ofi_straddr(p, &len, addr_format, addr); + ofi_straddr(p, &addrlen, addr_format, addr); } -static void ofi_tostr_tx_attr(char *buf, const struct fi_tx_attr *attr, - const char *prefix) +static void +ofi_tostr_tx_attr(char *buf, size_t len, const struct fi_tx_attr *attr, + const char *prefix) { if (!attr) { - ofi_strcatf(buf, "%sfi_tx_attr: (null)\n", prefix); + ofi_strncatf(buf, len, "%sfi_tx_attr: (null)\n", prefix); return; } - ofi_strcatf(buf, "%sfi_tx_attr:\n", prefix); - ofi_strcatf(buf, "%s%scaps: [ ", prefix, TAB); - ofi_tostr_caps(buf, attr->caps); - ofi_strcatf(buf, " ]\n"); - - ofi_strcatf(buf, "%s%smode: [ ", prefix, TAB); - ofi_tostr_mode(buf, attr->mode); - ofi_strcatf(buf, " ]\n"); - - ofi_strcatf(buf, "%s%sop_flags: [ ", prefix, TAB); - ofi_tostr_opflags(buf, attr->op_flags); - ofi_strcatf(buf, " ]\n"); - - ofi_strcatf(buf, "%s%smsg_order: [ ", prefix, TAB); - ofi_tostr_msgorder(buf, attr->msg_order); - ofi_strcatf(buf, " ]\n"); - - ofi_strcatf(buf, "%s%scomp_order: [ ", prefix, TAB); - ofi_tostr_comporder(buf, attr->comp_order); - ofi_strcatf(buf, " ]\n"); - - ofi_strcatf(buf, "%s%sinject_size: %zu\n", prefix, TAB, attr->inject_size); - ofi_strcatf(buf, "%s%ssize: %zu\n", prefix, TAB, attr->size); - ofi_strcatf(buf, "%s%siov_limit: %zu\n", prefix, TAB, attr->iov_limit); - ofi_strcatf(buf, "%s%srma_iov_limit: %zu\n", prefix, TAB, attr->rma_iov_limit); + ofi_strncatf(buf, len, 
"%sfi_tx_attr:\n", prefix); + ofi_strncatf(buf, len, "%s%scaps: [ ", prefix, TAB); + ofi_tostr_caps(buf, len, attr->caps); + ofi_strncatf(buf, len, " ]\n"); + + ofi_strncatf(buf, len, "%s%smode: [ ", prefix, TAB); + ofi_tostr_mode(buf, len, attr->mode); + ofi_strncatf(buf, len, " ]\n"); + + ofi_strncatf(buf, len, "%s%sop_flags: [ ", prefix, TAB); + ofi_tostr_opflags(buf, len, attr->op_flags); + ofi_strncatf(buf, len, " ]\n"); + + ofi_strncatf(buf, len, "%s%smsg_order: [ ", prefix, TAB); + ofi_tostr_msgorder(buf, len, attr->msg_order); + ofi_strncatf(buf, len, " ]\n"); + + ofi_strncatf(buf, len, "%s%scomp_order: [ ", prefix, TAB); + ofi_tostr_comporder(buf, len, attr->comp_order); + ofi_strncatf(buf, len, " ]\n"); + + ofi_strncatf(buf, len, "%s%sinject_size: %zu\n", prefix, TAB, + attr->inject_size); + ofi_strncatf(buf, len, "%s%ssize: %zu\n", prefix, TAB, attr->size); + ofi_strncatf(buf, len, "%s%siov_limit: %zu\n", prefix, TAB, + attr->iov_limit); + ofi_strncatf(buf, len, "%s%srma_iov_limit: %zu\n", prefix, TAB, + attr->rma_iov_limit); } -static void ofi_tostr_rx_attr(char *buf, const struct fi_rx_attr *attr, - const char *prefix) +static void +ofi_tostr_rx_attr(char *buf, size_t len, const struct fi_rx_attr *attr, + const char *prefix) { if (!attr) { - ofi_strcatf(buf, "%sfi_rx_attr: (null)\n", prefix); + ofi_strncatf(buf, len, "%sfi_rx_attr: (null)\n", prefix); return; } - ofi_strcatf(buf, "%sfi_rx_attr:\n", prefix); - ofi_strcatf(buf, "%s%scaps: [ ", prefix, TAB); - ofi_tostr_caps(buf, attr->caps); - ofi_strcatf(buf, " ]\n"); + ofi_strncatf(buf, len, "%sfi_rx_attr:\n", prefix); + ofi_strncatf(buf, len, "%s%scaps: [ ", prefix, TAB); + ofi_tostr_caps(buf, len, attr->caps); + ofi_strncatf(buf, len, " ]\n"); - ofi_strcatf(buf, "%s%smode: [ ", prefix, TAB); - ofi_tostr_mode(buf, attr->mode); - ofi_strcatf(buf, " ]\n"); + ofi_strncatf(buf, len, "%s%smode: [ ", prefix, TAB); + ofi_tostr_mode(buf, len, attr->mode); + ofi_strncatf(buf, len, " ]\n"); - ofi_strcatf(buf, "%s%sop_flags: [ ", prefix, TAB); - ofi_tostr_opflags(buf, attr->op_flags); - ofi_strcatf(buf, " ]\n"); + ofi_strncatf(buf, len, "%s%sop_flags: [ ", prefix, TAB); + ofi_tostr_opflags(buf, len, attr->op_flags); + ofi_strncatf(buf, len, " ]\n"); - ofi_strcatf(buf, "%s%smsg_order: [ ", prefix, TAB); - ofi_tostr_msgorder(buf, attr->msg_order); - ofi_strcatf(buf, " ]\n"); + ofi_strncatf(buf, len, "%s%smsg_order: [ ", prefix, TAB); + ofi_tostr_msgorder(buf, len, attr->msg_order); + ofi_strncatf(buf, len, " ]\n"); - ofi_strcatf(buf, "%s%scomp_order: [ ", prefix, TAB); - ofi_tostr_comporder(buf, attr->comp_order); - ofi_strcatf(buf, " ]\n"); + ofi_strncatf(buf, len, "%s%scomp_order: [ ", prefix, TAB); + ofi_tostr_comporder(buf, len, attr->comp_order); + ofi_strncatf(buf, len, " ]\n"); - ofi_strcatf(buf, "%s%stotal_buffered_recv: %zu\n", prefix, TAB, attr->total_buffered_recv); - ofi_strcatf(buf, "%s%ssize: %zu\n", prefix, TAB, attr->size); - ofi_strcatf(buf, "%s%siov_limit: %zu\n", prefix, TAB, attr->iov_limit); + ofi_strncatf(buf, len, "%s%stotal_buffered_recv: %zu\n", prefix, TAB, + attr->total_buffered_recv); + ofi_strncatf(buf, len, "%s%ssize: %zu\n", prefix, TAB, attr->size); + ofi_strncatf(buf, len, "%s%siov_limit: %zu\n", prefix, TAB, + attr->iov_limit); } -static void ofi_tostr_ep_attr(char *buf, const struct fi_ep_attr *attr, const char *prefix) +static void +ofi_tostr_ep_attr(char *buf, size_t len, const struct fi_ep_attr *attr, + const char *prefix) { if (!attr) { - ofi_strcatf(buf, "%sfi_ep_attr: (null)\n", prefix); + 
ofi_strncatf(buf, len, "%sfi_ep_attr: (null)\n", prefix); return; } - ofi_strcatf(buf, "%sfi_ep_attr:\n", prefix); - ofi_strcatf(buf, "%s%stype: ", prefix, TAB); - ofi_tostr_ep_type(buf, attr->type); - ofi_strcatf(buf, "\n"); - ofi_strcatf(buf, "%s%sprotocol: ", prefix, TAB); - ofi_tostr_protocol(buf, attr->protocol); - ofi_strcatf(buf, "\n"); - ofi_strcatf(buf, "%s%sprotocol_version: %d\n", prefix, TAB, attr->protocol_version); - ofi_strcatf(buf, "%s%smax_msg_size: %zu\n", prefix, TAB, attr->max_msg_size); - ofi_strcatf(buf, "%s%smsg_prefix_size: %zu\n", prefix, TAB, attr->msg_prefix_size); - ofi_strcatf(buf, "%s%smax_order_raw_size: %zu\n", prefix, TAB, attr->max_order_raw_size); - ofi_strcatf(buf, "%s%smax_order_war_size: %zu\n", prefix, TAB, attr->max_order_war_size); - ofi_strcatf(buf, "%s%smax_order_waw_size: %zu\n", prefix, TAB, attr->max_order_waw_size); - ofi_strcatf(buf, "%s%smem_tag_format: 0x%016llx\n", prefix, TAB, attr->mem_tag_format); - - ofi_strcatf(buf, "%s%stx_ctx_cnt: %zu\n", prefix, TAB, attr->tx_ctx_cnt); - ofi_strcatf(buf, "%s%srx_ctx_cnt: %zu\n", prefix, TAB, attr->rx_ctx_cnt); - - ofi_strcatf(buf, "%s%sauth_key_size: %zu\n", prefix, TAB, attr->auth_key_size); + ofi_strncatf(buf, len, "%sfi_ep_attr:\n", prefix); + ofi_strncatf(buf, len, "%s%stype: ", prefix, TAB); + ofi_tostr_ep_type(buf, len, attr->type); + ofi_strncatf(buf, len, "\n"); + ofi_strncatf(buf, len, "%s%sprotocol: ", prefix, TAB); + ofi_tostr_protocol(buf, len, attr->protocol); + ofi_strncatf(buf, len, "\n"); + ofi_strncatf(buf, len, "%s%sprotocol_version: %d\n", prefix, TAB, + attr->protocol_version); + ofi_strncatf(buf, len, "%s%smax_msg_size: %zu\n", prefix, TAB, + attr->max_msg_size); + ofi_strncatf(buf, len, "%s%smsg_prefix_size: %zu\n", prefix, TAB, + attr->msg_prefix_size); + ofi_strncatf(buf, len, "%s%smax_order_raw_size: %zu\n", prefix, TAB, + attr->max_order_raw_size); + ofi_strncatf(buf, len, "%s%smax_order_war_size: %zu\n", prefix, TAB, + attr->max_order_war_size); + ofi_strncatf(buf, len, "%s%smax_order_waw_size: %zu\n", prefix, TAB, + attr->max_order_waw_size); + ofi_strncatf(buf, len, "%s%smem_tag_format: 0x%016llx\n", prefix, TAB, + attr->mem_tag_format); + + ofi_strncatf(buf, len, "%s%stx_ctx_cnt: %zu\n", prefix, TAB, + attr->tx_ctx_cnt); + ofi_strncatf(buf, len, "%s%srx_ctx_cnt: %zu\n", prefix, TAB, + attr->rx_ctx_cnt); + + ofi_strncatf(buf, len, "%s%sauth_key_size: %zu\n", prefix, TAB, + attr->auth_key_size); } -static void ofi_tostr_resource_mgmt(char *buf, enum fi_resource_mgmt rm) +static void +ofi_tostr_resource_mgmt(char *buf, size_t len, enum fi_resource_mgmt rm) { switch (rm) { - CASEENUMSTR(FI_RM_UNSPEC); - CASEENUMSTR(FI_RM_DISABLED); - CASEENUMSTR(FI_RM_ENABLED); + CASEENUMSTRN(FI_RM_UNSPEC, len); + CASEENUMSTRN(FI_RM_DISABLED, len); + CASEENUMSTRN(FI_RM_ENABLED, len); default: - ofi_strcatf(buf, "Unknown"); + ofi_strncatf(buf, len, "Unknown"); break; } } -static void ofi_tostr_av_type(char *buf, enum fi_av_type type) +static void ofi_tostr_av_type(char *buf, size_t len, enum fi_av_type type) { switch (type) { - CASEENUMSTR(FI_AV_UNSPEC); - CASEENUMSTR(FI_AV_MAP); - CASEENUMSTR(FI_AV_TABLE); + CASEENUMSTRN(FI_AV_UNSPEC, len); + CASEENUMSTRN(FI_AV_MAP, len); + CASEENUMSTRN(FI_AV_TABLE, len); default: - ofi_strcatf(buf, "Unknown"); + ofi_strncatf(buf, len, "Unknown"); break; } } -static void ofi_tostr_mr_mode(char *buf, int mr_mode) +static void ofi_tostr_mr_mode(char *buf, size_t len, int mr_mode) { - IFFLAGSTR(mr_mode, FI_MR_BASIC); - IFFLAGSTR(mr_mode, FI_MR_SCALABLE); - 
IFFLAGSTR(mr_mode, FI_MR_LOCAL); - IFFLAGSTR(mr_mode, FI_MR_RAW); - IFFLAGSTR(mr_mode, FI_MR_VIRT_ADDR); - IFFLAGSTR(mr_mode, FI_MR_ALLOCATED); - IFFLAGSTR(mr_mode, FI_MR_PROV_KEY); - IFFLAGSTR(mr_mode, FI_MR_MMU_NOTIFY); - IFFLAGSTR(mr_mode, FI_MR_RMA_EVENT); - IFFLAGSTR(mr_mode, FI_MR_ENDPOINT); + IFFLAGSTRN(mr_mode, FI_MR_BASIC, len); + IFFLAGSTRN(mr_mode, FI_MR_SCALABLE, len); + IFFLAGSTRN(mr_mode, FI_MR_LOCAL, len); + IFFLAGSTRN(mr_mode, FI_MR_RAW, len); + IFFLAGSTRN(mr_mode, FI_MR_VIRT_ADDR, len); + IFFLAGSTRN(mr_mode, FI_MR_ALLOCATED, len); + IFFLAGSTRN(mr_mode, FI_MR_PROV_KEY, len); + IFFLAGSTRN(mr_mode, FI_MR_MMU_NOTIFY, len); + IFFLAGSTRN(mr_mode, FI_MR_RMA_EVENT, len); + IFFLAGSTRN(mr_mode, FI_MR_ENDPOINT, len); + IFFLAGSTRN(mr_mode, FI_MR_HMEM, len); ofi_remove_comma(buf); } -static void ofi_tostr_op_type(char *buf, int op_type) +static void ofi_tostr_op_type(char *buf, size_t len, int op_type) { switch (op_type) { - CASEENUMSTR(FI_OP_RECV); - CASEENUMSTR(FI_OP_SEND); - CASEENUMSTR(FI_OP_TRECV); - CASEENUMSTR(FI_OP_TSEND); - CASEENUMSTR(FI_OP_READ); - CASEENUMSTR(FI_OP_WRITE); - CASEENUMSTR(FI_OP_ATOMIC); - CASEENUMSTR(FI_OP_FETCH_ATOMIC); - CASEENUMSTR(FI_OP_COMPARE_ATOMIC); - CASEENUMSTR(FI_OP_CNTR_SET); - CASEENUMSTR(FI_OP_CNTR_ADD); + CASEENUMSTRN(FI_OP_RECV, len); + CASEENUMSTRN(FI_OP_SEND, len); + CASEENUMSTRN(FI_OP_TRECV, len); + CASEENUMSTRN(FI_OP_TSEND, len); + CASEENUMSTRN(FI_OP_READ, len); + CASEENUMSTRN(FI_OP_WRITE, len); + CASEENUMSTRN(FI_OP_ATOMIC, len); + CASEENUMSTRN(FI_OP_FETCH_ATOMIC, len); + CASEENUMSTRN(FI_OP_COMPARE_ATOMIC, len); + CASEENUMSTRN(FI_OP_CNTR_SET, len); + CASEENUMSTRN(FI_OP_CNTR_ADD, len); default: - ofi_strcatf(buf, "Unknown"); + ofi_strncatf(buf, len, "Unknown"); break; } } -static void ofi_tostr_domain_attr(char *buf, const struct fi_domain_attr *attr, - const char *prefix) +static void +ofi_tostr_domain_attr(char *buf, size_t len, const struct fi_domain_attr *attr, + const char *prefix) { if (!attr) { - ofi_strcatf(buf, "%sfi_domain_attr: (null)\n", prefix); + ofi_strncatf(buf, len, "%sfi_domain_attr: (null)\n", prefix); return; } - ofi_strcatf(buf, "%sfi_domain_attr:\n", prefix); - - ofi_strcatf(buf, "%s%sdomain: 0x%x\n", prefix, TAB, attr->domain); - - ofi_strcatf(buf, "%s%sname: %s\n", prefix, TAB, attr->name); - ofi_strcatf(buf, "%s%sthreading: ", prefix, TAB); - ofi_tostr_threading(buf, attr->threading); - ofi_strcatf(buf, "\n"); - - ofi_strcatf(buf, "%s%scontrol_progress: ", prefix,TAB); - ofi_tostr_progress(buf, attr->control_progress); - ofi_strcatf(buf, "\n"); - ofi_strcatf(buf, "%s%sdata_progress: ", prefix, TAB); - ofi_tostr_progress(buf, attr->data_progress); - ofi_strcatf(buf, "\n"); - ofi_strcatf(buf, "%s%sresource_mgmt: ", prefix, TAB); - ofi_tostr_resource_mgmt(buf, attr->resource_mgmt); - ofi_strcatf(buf, "\n"); - ofi_strcatf(buf, "%s%sav_type: ", prefix, TAB); - ofi_tostr_av_type(buf, attr->av_type); - ofi_strcatf(buf, "\n"); - ofi_strcatf(buf, "%s%smr_mode: [ ", prefix, TAB); - ofi_tostr_mr_mode(buf, attr->mr_mode); - ofi_strcatf(buf, " ]\n"); - - ofi_strcatf(buf, "%s%smr_key_size: %zu\n", prefix, TAB, attr->mr_key_size); - ofi_strcatf(buf, "%s%scq_data_size: %zu\n", prefix, TAB, attr->cq_data_size); - ofi_strcatf(buf, "%s%scq_cnt: %zu\n", prefix, TAB, attr->cq_cnt); - ofi_strcatf(buf, "%s%sep_cnt: %zu\n", prefix, TAB, attr->ep_cnt); - ofi_strcatf(buf, "%s%stx_ctx_cnt: %zu\n", prefix, TAB, attr->tx_ctx_cnt); - ofi_strcatf(buf, "%s%srx_ctx_cnt: %zu\n", prefix, TAB, attr->rx_ctx_cnt); - ofi_strcatf(buf, "%s%smax_ep_tx_ctx: 
%zu\n", prefix, TAB, attr->max_ep_tx_ctx); - ofi_strcatf(buf, "%s%smax_ep_rx_ctx: %zu\n", prefix, TAB, attr->max_ep_rx_ctx); - ofi_strcatf(buf, "%s%smax_ep_stx_ctx: %zu\n", prefix, TAB, attr->max_ep_stx_ctx); - ofi_strcatf(buf, "%s%smax_ep_srx_ctx: %zu\n", prefix, TAB, attr->max_ep_srx_ctx); - ofi_strcatf(buf, "%s%scntr_cnt: %zu\n", prefix, TAB, attr->cntr_cnt); - ofi_strcatf(buf, "%s%smr_iov_limit: %zu\n", prefix, TAB, attr->mr_iov_limit); - - ofi_strcatf(buf, "%scaps: [ ", TAB); - ofi_tostr_caps(buf, attr->caps); - ofi_strcatf(buf, " ]\n"); - - ofi_strcatf(buf, "%smode: [ ", TAB); - ofi_tostr_mode(buf, attr->mode); - ofi_strcatf(buf, " ]\n"); - - ofi_strcatf(buf, "%s%sauth_key_size: %zu\n", prefix, TAB, attr->auth_key_size); - ofi_strcatf(buf, "%s%smax_err_data: %zu\n", prefix, TAB, attr->max_err_data); - ofi_strcatf(buf, "%s%smr_cnt: %zu\n", prefix, TAB, attr->mr_cnt); + ofi_strncatf(buf, len, "%sfi_domain_attr:\n", prefix); + + ofi_strncatf(buf, len, "%s%sdomain: 0x%x\n", prefix, TAB, attr->domain); + + ofi_strncatf(buf, len, "%s%sname: %s\n", prefix, TAB, attr->name); + ofi_strncatf(buf, len, "%s%sthreading: ", prefix, TAB); + ofi_tostr_threading(buf, len, attr->threading); + ofi_strncatf(buf, len, "\n"); + + ofi_strncatf(buf, len, "%s%scontrol_progress: ", prefix,TAB); + ofi_tostr_progress(buf, len, attr->control_progress); + ofi_strncatf(buf, len, "\n"); + ofi_strncatf(buf, len, "%s%sdata_progress: ", prefix, TAB); + ofi_tostr_progress(buf, len, attr->data_progress); + ofi_strncatf(buf, len, "\n"); + ofi_strncatf(buf, len, "%s%sresource_mgmt: ", prefix, TAB); + ofi_tostr_resource_mgmt(buf, len, attr->resource_mgmt); + ofi_strncatf(buf, len, "\n"); + ofi_strncatf(buf, len, "%s%sav_type: ", prefix, TAB); + ofi_tostr_av_type(buf, len, attr->av_type); + ofi_strncatf(buf, len, "\n"); + ofi_strncatf(buf, len, "%s%smr_mode: [ ", prefix, TAB); + ofi_tostr_mr_mode(buf, len, attr->mr_mode); + ofi_strncatf(buf, len, " ]\n"); + + ofi_strncatf(buf, len, "%s%smr_key_size: %zu\n", prefix, TAB, + attr->mr_key_size); + ofi_strncatf(buf, len, "%s%scq_data_size: %zu\n", prefix, TAB, + attr->cq_data_size); + ofi_strncatf(buf, len, "%s%scq_cnt: %zu\n", prefix, TAB, + attr->cq_cnt); + ofi_strncatf(buf, len, "%s%sep_cnt: %zu\n", prefix, TAB, attr->ep_cnt); + ofi_strncatf(buf, len, "%s%stx_ctx_cnt: %zu\n", prefix, TAB, + attr->tx_ctx_cnt); + ofi_strncatf(buf, len, "%s%srx_ctx_cnt: %zu\n", prefix, TAB, + attr->rx_ctx_cnt); + ofi_strncatf(buf, len, "%s%smax_ep_tx_ctx: %zu\n", prefix, TAB, + attr->max_ep_tx_ctx); + ofi_strncatf(buf, len, "%s%smax_ep_rx_ctx: %zu\n", prefix, TAB, + attr->max_ep_rx_ctx); + ofi_strncatf(buf, len, "%s%smax_ep_stx_ctx: %zu\n", prefix, TAB, + attr->max_ep_stx_ctx); + ofi_strncatf(buf, len, "%s%smax_ep_srx_ctx: %zu\n", prefix, TAB, + attr->max_ep_srx_ctx); + ofi_strncatf(buf, len, "%s%scntr_cnt: %zu\n", prefix, TAB, + attr->cntr_cnt); + ofi_strncatf(buf, len, "%s%smr_iov_limit: %zu\n", prefix, TAB, + attr->mr_iov_limit); + + ofi_strncatf(buf, len, "%scaps: [ ", TAB); + ofi_tostr_caps(buf, len, attr->caps); + ofi_strncatf(buf, len, " ]\n"); + + ofi_strncatf(buf, len, "%smode: [ ", TAB); + ofi_tostr_mode(buf, len, attr->mode); + ofi_strncatf(buf, len, " ]\n"); + + ofi_strncatf(buf, len, "%s%sauth_key_size: %zu\n", prefix, TAB, + attr->auth_key_size); + ofi_strncatf(buf, len, "%s%smax_err_data: %zu\n", prefix, TAB, + attr->max_err_data); + ofi_strncatf(buf, len, "%s%smr_cnt: %zu\n", prefix, TAB, attr->mr_cnt); } -static void ofi_tostr_fabric_attr(char *buf, const struct fi_fabric_attr 
*attr, - const char *prefix) +static void +ofi_tostr_fabric_attr(char *buf, size_t len, const struct fi_fabric_attr *attr, + const char *prefix) { if (!attr) { - ofi_strcatf(buf, "%sfi_fabric_attr: (null)\n", prefix); + ofi_strncatf(buf, len, "%sfi_fabric_attr: (null)\n", prefix); return; } - ofi_strcatf(buf, "%sfi_fabric_attr:\n", prefix); - ofi_strcatf(buf, "%s%sname: %s\n", prefix, TAB, attr->name); - ofi_strcatf(buf, "%s%sprov_name: %s\n", prefix, TAB, attr->prov_name); - ofi_strcatf(buf, "%s%sprov_version: %d.%d\n", prefix, TAB, + ofi_strncatf(buf, len, "%sfi_fabric_attr:\n", prefix); + ofi_strncatf(buf, len, "%s%sname: %s\n", prefix, TAB, attr->name); + ofi_strncatf(buf, len, "%s%sprov_name: %s\n", prefix, TAB, + attr->prov_name); + ofi_strncatf(buf, len, "%s%sprov_version: %d.%d\n", prefix, TAB, FI_MAJOR(attr->prov_version), FI_MINOR(attr->prov_version)); - ofi_strcatf(buf, "%s%sapi_version: %d.%d\n", prefix, TAB, + ofi_strncatf(buf, len, "%s%sapi_version: %d.%d\n", prefix, TAB, FI_MAJOR(attr->api_version), FI_MINOR(attr->api_version)); } -static void ofi_tostr_info(char *buf, const struct fi_info *info) +static void ofi_tostr_info(char *buf, size_t len, const struct fi_info *info) { - ofi_strcatf(buf, "fi_info:\n"); - ofi_strcatf(buf, "%scaps: [ ", TAB); - ofi_tostr_caps(buf, info->caps); - ofi_strcatf(buf, " ]\n"); - - ofi_strcatf(buf, "%smode: [ ", TAB); - ofi_tostr_mode(buf, info->mode); - ofi_strcatf(buf, " ]\n"); - - ofi_strcatf(buf, "%saddr_format: ", TAB); - oofi_tostr_addr_format(buf, info->addr_format); - ofi_strcatf(buf, "\n"); - - ofi_strcatf(buf, "%ssrc_addrlen: %zu\n", TAB, info->src_addrlen); - ofi_strcatf(buf, "%sdest_addrlen: %zu\n", TAB, info->dest_addrlen); - ofi_strcatf(buf, "%ssrc_addr: ", TAB); - ofi_tostr_addr(buf, info->addr_format, info->src_addr); - ofi_strcatf(buf, "\n"); - ofi_strcatf(buf, "%sdest_addr: ", TAB); - ofi_tostr_addr(buf, info->addr_format, info->dest_addr); - ofi_strcatf(buf, "\n"); - ofi_tostr_fid(TAB "handle: ", buf, info->handle); - - ofi_tostr_tx_attr(buf, info->tx_attr, TAB); - ofi_tostr_rx_attr(buf, info->rx_attr, TAB); - ofi_tostr_ep_attr(buf, info->ep_attr, TAB); - ofi_tostr_domain_attr(buf, info->domain_attr, TAB); - ofi_tostr_fabric_attr(buf, info->fabric_attr, TAB); - ofi_tostr_fid(TAB "nic_fid: ", buf, &info->nic->fid); + ofi_strncatf(buf, len, "fi_info:\n"); + ofi_strncatf(buf, len, "%scaps: [ ", TAB); + ofi_tostr_caps(buf, len, info->caps); + ofi_strncatf(buf, len, " ]\n"); + + ofi_strncatf(buf, len, "%smode: [ ", TAB); + ofi_tostr_mode(buf, len, info->mode); + ofi_strncatf(buf, len, " ]\n"); + + ofi_strncatf(buf, len, "%saddr_format: ", TAB); + ofi_tostr_addr_format(buf, len, info->addr_format); + ofi_strncatf(buf, len, "\n"); + + ofi_strncatf(buf, len, "%ssrc_addrlen: %zu\n", TAB, info->src_addrlen); + ofi_strncatf(buf, len, "%sdest_addrlen: %zu\n", TAB, + info->dest_addrlen); + ofi_strncatf(buf, len, "%ssrc_addr: ", TAB); + ofi_tostr_addr(buf, len, info->addr_format, info->src_addr); + ofi_strncatf(buf, len, "\n"); + ofi_strncatf(buf, len, "%sdest_addr: ", TAB); + ofi_tostr_addr(buf, len, info->addr_format, info->dest_addr); + ofi_strncatf(buf, len, "\n"); + ofi_tostr_fid(TAB "handle: ", buf, len, info->handle); + + ofi_tostr_tx_attr(buf, len, info->tx_attr, TAB); + ofi_tostr_rx_attr(buf, len, info->rx_attr, TAB); + ofi_tostr_ep_attr(buf, len, info->ep_attr, TAB); + ofi_tostr_domain_attr(buf, len, info->domain_attr, TAB); + ofi_tostr_fabric_attr(buf, len, info->fabric_attr, TAB); + ofi_tostr_fid(TAB "nic_fid: ", buf, len, 
&info->nic->fid); } -static void ofi_tostr_atomic_type(char *buf, enum fi_datatype type) +static void ofi_tostr_atomic_type(char *buf, size_t len, enum fi_datatype type) { switch (type) { - CASEENUMSTR(FI_INT8); - CASEENUMSTR(FI_UINT8); - CASEENUMSTR(FI_INT16); - CASEENUMSTR(FI_UINT16); - CASEENUMSTR(FI_INT32); - CASEENUMSTR(FI_UINT32); - CASEENUMSTR(FI_INT64); - CASEENUMSTR(FI_UINT64); - CASEENUMSTR(FI_FLOAT); - CASEENUMSTR(FI_DOUBLE); - CASEENUMSTR(FI_FLOAT_COMPLEX); - CASEENUMSTR(FI_DOUBLE_COMPLEX); - CASEENUMSTR(FI_LONG_DOUBLE); - CASEENUMSTR(FI_LONG_DOUBLE_COMPLEX); - CASEENUMSTR(FI_VOID); + CASEENUMSTRN(FI_INT8, len); + CASEENUMSTRN(FI_UINT8, len); + CASEENUMSTRN(FI_INT16, len); + CASEENUMSTRN(FI_UINT16, len); + CASEENUMSTRN(FI_INT32, len); + CASEENUMSTRN(FI_UINT32, len); + CASEENUMSTRN(FI_INT64, len); + CASEENUMSTRN(FI_UINT64, len); + CASEENUMSTRN(FI_FLOAT, len); + CASEENUMSTRN(FI_DOUBLE, len); + CASEENUMSTRN(FI_FLOAT_COMPLEX, len); + CASEENUMSTRN(FI_DOUBLE_COMPLEX, len); + CASEENUMSTRN(FI_LONG_DOUBLE, len); + CASEENUMSTRN(FI_LONG_DOUBLE_COMPLEX, len); + default: + ofi_strncatf(buf, len, "Unknown"); + break; + } +} + +static void ofi_tostr_atomic_op(char *buf, size_t len, enum fi_op op) +{ + switch (op) { + CASEENUMSTRN(FI_MIN, len); + CASEENUMSTRN(FI_MAX, len); + CASEENUMSTRN(FI_SUM, len); + CASEENUMSTRN(FI_PROD, len); + CASEENUMSTRN(FI_LOR, len); + CASEENUMSTRN(FI_LAND, len); + CASEENUMSTRN(FI_BOR, len); + CASEENUMSTRN(FI_BAND, len); + CASEENUMSTRN(FI_LXOR, len); + CASEENUMSTRN(FI_BXOR, len); + CASEENUMSTRN(FI_ATOMIC_READ, len); + CASEENUMSTRN(FI_ATOMIC_WRITE, len); + CASEENUMSTRN(FI_CSWAP, len); + CASEENUMSTRN(FI_CSWAP_NE, len); + CASEENUMSTRN(FI_CSWAP_LE, len); + CASEENUMSTRN(FI_CSWAP_LT, len); + CASEENUMSTRN(FI_CSWAP_GE, len); + CASEENUMSTRN(FI_CSWAP_GT, len); + CASEENUMSTRN(FI_MSWAP, len); default: - ofi_strcatf(buf, "Unknown"); + ofi_strncatf(buf, len, "Unknown"); break; } } -static void ofi_tostr_atomic_op(char *buf, enum fi_op op) +static void +ofi_tostr_collective_op(char *buf, size_t len, enum fi_collective_op op) { switch (op) { - CASEENUMSTR(FI_MIN); - CASEENUMSTR(FI_MAX); - CASEENUMSTR(FI_SUM); - CASEENUMSTR(FI_PROD); - CASEENUMSTR(FI_LOR); - CASEENUMSTR(FI_LAND); - CASEENUMSTR(FI_BOR); - CASEENUMSTR(FI_BAND); - CASEENUMSTR(FI_LXOR); - CASEENUMSTR(FI_BXOR); - CASEENUMSTR(FI_ATOMIC_READ); - CASEENUMSTR(FI_ATOMIC_WRITE); - CASEENUMSTR(FI_CSWAP); - CASEENUMSTR(FI_CSWAP_NE); - CASEENUMSTR(FI_CSWAP_LE); - CASEENUMSTR(FI_CSWAP_LT); - CASEENUMSTR(FI_CSWAP_GE); - CASEENUMSTR(FI_CSWAP_GT); - CASEENUMSTR(FI_MSWAP); - CASEENUMSTR(FI_BARRIER); - CASEENUMSTR(FI_BROADCAST); - CASEENUMSTR(FI_ALLTOALL); - CASEENUMSTR(FI_ALLGATHER); + CASEENUMSTRN(FI_BARRIER, len); + CASEENUMSTRN(FI_BROADCAST, len); + CASEENUMSTRN(FI_ALLTOALL, len); + CASEENUMSTRN(FI_ALLREDUCE, len); + CASEENUMSTRN(FI_ALLGATHER, len); + CASEENUMSTRN(FI_REDUCE_SCATTER, len); + CASEENUMSTRN(FI_REDUCE, len); + CASEENUMSTRN(FI_SCATTER, len); + CASEENUMSTRN(FI_GATHER, len); default: - ofi_strcatf(buf, "Unknown"); + ofi_strncatf(buf, len, "Unknown"); break; } } -static void ofi_tostr_version(char *buf) +static void ofi_tostr_version(char *buf, size_t len) { - ofi_strcatf(buf, VERSION); - ofi_strcatf(buf, BUILD_ID); + ofi_strncatf(buf, len, VERSION); + ofi_strncatf(buf, len, BUILD_ID); } -static void ofi_tostr_eq_event(char *buf, int type) +static void ofi_tostr_eq_event(char *buf, size_t len, int type) { switch (type) { - CASEENUMSTR(FI_NOTIFY); - CASEENUMSTR(FI_CONNREQ); - CASEENUMSTR(FI_CONNECTED); - 
CASEENUMSTR(FI_SHUTDOWN); - CASEENUMSTR(FI_MR_COMPLETE); - CASEENUMSTR(FI_AV_COMPLETE); - CASEENUMSTR(FI_JOIN_COMPLETE); + CASEENUMSTRN(FI_NOTIFY, len); + CASEENUMSTRN(FI_CONNREQ, len); + CASEENUMSTRN(FI_CONNECTED, len); + CASEENUMSTRN(FI_SHUTDOWN, len); + CASEENUMSTRN(FI_MR_COMPLETE, len); + CASEENUMSTRN(FI_AV_COMPLETE, len); + CASEENUMSTRN(FI_JOIN_COMPLETE, len); default: - ofi_strcatf(buf, "Unknown"); + ofi_strncatf(buf, len, "Unknown"); break; } } -static void ofi_tostr_cq_event_flags(char *buf, uint64_t flags) +static void ofi_tostr_cq_event_flags(char *buf, size_t len, uint64_t flags) { - IFFLAGSTR(flags, FI_SEND); - IFFLAGSTR(flags, FI_RECV); - IFFLAGSTR(flags, FI_RMA); - IFFLAGSTR(flags, FI_ATOMIC); - IFFLAGSTR(flags, FI_MSG); - IFFLAGSTR(flags, FI_TAGGED); - IFFLAGSTR(flags, FI_READ); - IFFLAGSTR(flags, FI_WRITE); - IFFLAGSTR(flags, FI_REMOTE_READ); - IFFLAGSTR(flags, FI_REMOTE_WRITE); - IFFLAGSTR(flags, FI_REMOTE_CQ_DATA); - IFFLAGSTR(flags, FI_MULTI_RECV); - IFFLAGSTR(flags, FI_MORE); - IFFLAGSTR(flags, FI_CLAIM); + IFFLAGSTRN(flags, FI_SEND, len); + IFFLAGSTRN(flags, FI_RECV, len); + IFFLAGSTRN(flags, FI_RMA, len); + IFFLAGSTRN(flags, FI_ATOMIC, len); + IFFLAGSTRN(flags, FI_MSG, len); + IFFLAGSTRN(flags, FI_TAGGED, len); + IFFLAGSTRN(flags, FI_READ, len); + IFFLAGSTRN(flags, FI_WRITE, len); + IFFLAGSTRN(flags, FI_REMOTE_READ, len); + IFFLAGSTRN(flags, FI_REMOTE_WRITE, len); + IFFLAGSTRN(flags, FI_REMOTE_CQ_DATA, len); + IFFLAGSTRN(flags, FI_MULTI_RECV, len); + IFFLAGSTRN(flags, FI_MORE, len); + IFFLAGSTRN(flags, FI_CLAIM, len); ofi_remove_comma(buf); } +static void +ofi_tostr_hmem_iface(char *buf, size_t len, enum fi_hmem_iface iface) +{ + switch (iface) { + CASEENUMSTRN(FI_HMEM_SYSTEM, len); + CASEENUMSTRN(FI_HMEM_CUDA, len); + CASEENUMSTRN(FI_HMEM_ROCR, len); + CASEENUMSTRN(FI_HMEM_ZE, len); + default: + ofi_strncatf(buf, len, "Unknown"); + break; + } +} + __attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) -char *DEFAULT_SYMVER_PRE(fi_tostr)(const void *data, enum fi_type datatype) +char *DEFAULT_SYMVER_PRE(fi_tostr_r)(char *buf, size_t len, + const void *data, enum fi_type datatype) { - static char *buf = NULL; const uint64_t *val64; const uint32_t *val32; const int *enumval; - if (!data) + if (!data || !buf || !len) return NULL; val64 = (const uint64_t *) data; val32 = (const uint32_t *) data; enumval = (const int *) data; - if (!buf) { - buf = calloc(OFI_BUFSIZ, 1); - if (!buf) - return NULL; - } buf[0] = '\0'; switch (datatype) { case FI_TYPE_INFO: - ofi_tostr_info(buf, data); + ofi_tostr_info(buf, len, data); break; case FI_TYPE_EP_TYPE: - ofi_tostr_ep_type(buf, *enumval); + ofi_tostr_ep_type(buf, len, *enumval); break; case FI_TYPE_CAPS: - ofi_tostr_caps(buf, *val64); + ofi_tostr_caps(buf, len, *val64); break; case FI_TYPE_OP_FLAGS: - ofi_tostr_opflags(buf, *val64); + ofi_tostr_opflags(buf, len, *val64); break; case FI_TYPE_ADDR_FORMAT: - oofi_tostr_addr_format(buf, *val32); + ofi_tostr_addr_format(buf, len, *val32); break; case FI_TYPE_TX_ATTR: - ofi_tostr_tx_attr(buf, data, ""); + ofi_tostr_tx_attr(buf, len, data, ""); break; case FI_TYPE_RX_ATTR: - ofi_tostr_rx_attr(buf, data, ""); + ofi_tostr_rx_attr(buf, len, data, ""); break; case FI_TYPE_EP_ATTR: - ofi_tostr_ep_attr(buf, data, ""); + ofi_tostr_ep_attr(buf, len, data, ""); break; case FI_TYPE_DOMAIN_ATTR: - ofi_tostr_domain_attr(buf, data, ""); + ofi_tostr_domain_attr(buf, len, data, ""); break; case FI_TYPE_FABRIC_ATTR: - ofi_tostr_fabric_attr(buf, data, ""); + ofi_tostr_fabric_attr(buf, len, data, ""); 
break; case FI_TYPE_THREADING: - ofi_tostr_threading(buf, *enumval); + ofi_tostr_threading(buf, len, *enumval); break; case FI_TYPE_PROGRESS: - ofi_tostr_progress(buf, *enumval); + ofi_tostr_progress(buf, len, *enumval); break; case FI_TYPE_PROTOCOL: - ofi_tostr_protocol(buf, *val32); + ofi_tostr_protocol(buf, len, *val32); break; case FI_TYPE_MSG_ORDER: - ofi_tostr_msgorder(buf, *val64); + ofi_tostr_msgorder(buf, len, *val64); break; case FI_TYPE_MODE: - ofi_tostr_mode(buf, *val64); + ofi_tostr_mode(buf, len, *val64); break; case FI_TYPE_AV_TYPE: - ofi_tostr_av_type(buf, *enumval); + ofi_tostr_av_type(buf, len, *enumval); break; case FI_TYPE_ATOMIC_TYPE: - ofi_tostr_atomic_type(buf, *enumval); + ofi_tostr_atomic_type(buf, len, *enumval); break; case FI_TYPE_ATOMIC_OP: - ofi_tostr_atomic_op(buf, *enumval); + ofi_tostr_atomic_op(buf, len, *enumval); break; case FI_TYPE_VERSION: - ofi_tostr_version(buf); + ofi_tostr_version(buf, len); break; case FI_TYPE_EQ_EVENT: - ofi_tostr_eq_event(buf, *enumval); + ofi_tostr_eq_event(buf, len, *enumval); break; case FI_TYPE_CQ_EVENT_FLAGS: - ofi_tostr_cq_event_flags(buf, *val64); + ofi_tostr_cq_event_flags(buf, len, *val64); break; case FI_TYPE_MR_MODE: /* mr_mode was an enum converted to int flags */ - ofi_tostr_mr_mode(buf, *enumval); + ofi_tostr_mr_mode(buf, len, *enumval); break; case FI_TYPE_OP_TYPE: - ofi_tostr_op_type(buf, *enumval); + ofi_tostr_op_type(buf, len, *enumval); break; case FI_TYPE_FID: - ofi_tostr_fid("fid: ", buf, data); + ofi_tostr_fid("fid: ", buf, len, data); + break; + case FI_TYPE_COLLECTIVE_OP: + ofi_tostr_collective_op(buf, len, *enumval); + break; + case FI_TYPE_HMEM_IFACE: + ofi_tostr_hmem_iface(buf, len, *enumval); break; default: - ofi_strcatf(buf, "Unknown type"); + ofi_strncatf(buf, len, "Unknown type"); break; } return buf; } -DEFAULT_SYMVER(fi_tostr_, fi_tostr, FABRIC_1.0); +CURRENT_SYMVER(fi_tostr_r_, fi_tostr_r); -#undef CASEENUMSTR -#undef IFFLAGSTR +__attribute__((visibility ("default"),EXTERNALLY_VISIBLE)) +char *DEFAULT_SYMVER_PRE(fi_tostr)(const void *data, enum fi_type datatype) +{ + static char *buf = NULL; + size_t len = 8192; + + if (!buf) { + buf = calloc(len, 1); + if (!buf) + return NULL; + } + + return fi_tostr_r(buf, len, data, datatype); +} +DEFAULT_SYMVER(fi_tostr_, fi_tostr, FABRIC_1.0); diff --git a/src/hmem.c b/src/hmem.c new file mode 100644 index 00000000000..33288e8c424 --- /dev/null +++ b/src/hmem.c @@ -0,0 +1,319 @@ +/* + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * (C) Copyright 2020-2021 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +#include +#endif + +#include "ofi_hmem.h" +#include "ofi.h" +#include "ofi_iov.h" + +struct ofi_hmem_ops { + bool initialized; + int (*init)(void); + int (*cleanup)(void); + int (*copy_to_hmem)(uint64_t device, void *dest, const void *src, + size_t size); + int (*copy_from_hmem)(uint64_t device, void *dest, const void *src, + size_t size); + bool (*is_addr_valid)(const void *addr); + int (*get_handle)(void *dev_buf, void **handle); + int (*open_handle)(void **handle, uint64_t device, void **ipc_ptr); + int (*close_handle)(void *ipc_ptr); + int (*host_register)(void *ptr, size_t size); + int (*host_unregister)(void *ptr); + int (*get_base_addr)(const void *ptr, void **base); +}; + +static struct ofi_hmem_ops hmem_ops[] = { + [FI_HMEM_SYSTEM] = { + .initialized = false, + .init = ofi_hmem_init_noop, + .cleanup = ofi_hmem_cleanup_noop, + .copy_to_hmem = ofi_memcpy, + .copy_from_hmem = ofi_memcpy, + .get_handle = ofi_hmem_no_get_handle, + .open_handle = ofi_hmem_no_open_handle, + .close_handle = ofi_hmem_no_close_handle, + .host_register = ofi_hmem_register_noop, + .host_unregister = ofi_hmem_host_unregister_noop, + .get_base_addr = ofi_hmem_no_base_addr, + }, + [FI_HMEM_CUDA] = { + .initialized = false, + .init = cuda_hmem_init, + .cleanup = cuda_hmem_cleanup, + .copy_to_hmem = cuda_copy_to_dev, + .copy_from_hmem = cuda_copy_from_dev, + .is_addr_valid = cuda_is_addr_valid, + .get_handle = ofi_hmem_no_get_handle, + .open_handle = ofi_hmem_no_open_handle, + .close_handle = ofi_hmem_no_close_handle, + .host_register = cuda_host_register, + .host_unregister = cuda_host_unregister, + .get_base_addr = ofi_hmem_no_base_addr, + }, + [FI_HMEM_ROCR] = { + .initialized = false, + .init = rocr_hmem_init, + .cleanup = rocr_hmem_cleanup, + .copy_to_hmem = rocr_copy_to_dev, + .copy_from_hmem = rocr_copy_from_dev, + .is_addr_valid = rocr_is_addr_valid, + .get_handle = ofi_hmem_no_get_handle, + .open_handle = ofi_hmem_no_open_handle, + .close_handle = ofi_hmem_no_close_handle, + .host_register = rocr_host_register, + .host_unregister = rocr_host_unregister, + .get_base_addr = ofi_hmem_no_base_addr, + }, + [FI_HMEM_ZE] = { + .initialized = false, + .init = ze_hmem_init, + .cleanup = ze_hmem_cleanup, + .copy_to_hmem = ze_hmem_copy, + .copy_from_hmem = ze_hmem_copy, + .is_addr_valid = ze_is_addr_valid, + .get_handle = ze_hmem_get_handle, + .open_handle = ze_hmem_open_handle, + .close_handle = ze_hmem_close_handle, + .host_register = ofi_hmem_register_noop, + .host_unregister = ofi_hmem_host_unregister_noop, + .get_base_addr = ze_hmem_get_base_addr, + }, +}; + +static inline int ofi_copy_to_hmem(enum fi_hmem_iface iface, uint64_t device, + void *dest, const void *src, size_t size) +{ + return hmem_ops[iface].copy_to_hmem(device, dest, src, size); +} + +static inline int ofi_copy_from_hmem(enum fi_hmem_iface iface, uint64_t device, + void *dest, const void *src, size_t size) +{ + return hmem_ops[iface].copy_from_hmem(device, dest, src, size); +} + +static ssize_t ofi_copy_hmem_iov_buf(enum 
fi_hmem_iface hmem_iface, uint64_t device, + const struct iovec *hmem_iov, + size_t hmem_iov_count, + uint64_t hmem_iov_offset, void *buf, + size_t size, int dir) +{ + uint64_t done = 0, len; + char *hmem_buf; + size_t i; + int ret; + + for (i = 0; i < hmem_iov_count && size; i++) { + len = hmem_iov[i].iov_len; + + if (hmem_iov_offset > len) { + hmem_iov_offset -= len; + continue; + } + + hmem_buf = (char *)hmem_iov[i].iov_base + hmem_iov_offset; + len -= hmem_iov_offset; + hmem_iov_offset = 0; + + len = MIN(len, size); + if (!len) + continue; + + if (dir == OFI_COPY_BUF_TO_IOV) + ret = ofi_copy_to_hmem(hmem_iface, device, hmem_buf, + (char *)buf + done, len); + else + ret = ofi_copy_from_hmem(hmem_iface, device, + (char *)buf + done, hmem_buf, + len); + + if (ret) + return ret; + + size -= len; + done += len; + } + return done; +} + +ssize_t ofi_copy_from_hmem_iov(void *dest, size_t size, + enum fi_hmem_iface hmem_iface, uint64_t device, + const struct iovec *hmem_iov, + size_t hmem_iov_count, + uint64_t hmem_iov_offset) +{ + return ofi_copy_hmem_iov_buf(hmem_iface, device, hmem_iov, + hmem_iov_count, hmem_iov_offset, + dest, size, OFI_COPY_IOV_TO_BUF); +} + +ssize_t ofi_copy_to_hmem_iov(enum fi_hmem_iface hmem_iface, uint64_t device, + const struct iovec *hmem_iov, + size_t hmem_iov_count, uint64_t hmem_iov_offset, + const void *src, size_t size) +{ + return ofi_copy_hmem_iov_buf(hmem_iface, device, hmem_iov, + hmem_iov_count, hmem_iov_offset, + (void *) src, size, OFI_COPY_BUF_TO_IOV); +} + +int ofi_hmem_get_handle(enum fi_hmem_iface iface, void *dev_buf, void **handle) +{ + return hmem_ops[iface].get_handle(dev_buf, handle); +} + +int ofi_hmem_open_handle(enum fi_hmem_iface iface, void **handle, + uint64_t device, void **ipc_ptr) +{ + return hmem_ops[iface].open_handle(handle, device, ipc_ptr); +} + +int ofi_hmem_close_handle(enum fi_hmem_iface iface, void *ipc_ptr) +{ + return hmem_ops[iface].close_handle(ipc_ptr); +} + +int ofi_hmem_get_base_addr(enum fi_hmem_iface iface, const void *ptr, + void **base) +{ + return hmem_ops[iface].get_base_addr(ptr, base); +} + +void ofi_hmem_init(void) +{ + int iface, ret; + + for (iface = 0; iface < ARRAY_SIZE(hmem_ops); iface++) { + ret = hmem_ops[iface].init(); + if (ret != FI_SUCCESS) { + if (ret == -FI_ENOSYS) + FI_INFO(&core_prov, FI_LOG_CORE, + "Hmem iface %s not supported\n", + fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); + else + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to initialize hmem iface %s: %s\n", + fi_tostr(&iface, FI_TYPE_HMEM_IFACE), + fi_strerror(-ret)); + } else { + hmem_ops[iface].initialized = true; + } + } +} + +void ofi_hmem_cleanup(void) +{ + enum fi_hmem_iface iface; + + for (iface = 0; iface < ARRAY_SIZE(hmem_ops); iface++) { + if (hmem_ops[iface].initialized) + hmem_ops[iface].cleanup(); + } +} + +enum fi_hmem_iface ofi_get_hmem_iface(const void *addr) +{ + int iface; + + /* Since a is_addr_valid function is not implemented for FI_HMEM_SYSTEM, + * HMEM iface is skipped. In addition, if no other HMEM ifaces claim the + * address as valid, it is assumed the address is FI_HMEM_SYSTEM. 
+ */ + for (iface = ARRAY_SIZE(hmem_ops) - 1; iface > FI_HMEM_SYSTEM; + iface--) { + if (hmem_ops[iface].initialized && + hmem_ops[iface].is_addr_valid(addr)) + return iface; + } + + return FI_HMEM_SYSTEM; +} + +int ofi_hmem_host_register(void *ptr, size_t size) +{ + int iface, ret; + + for (iface = 0; iface < ARRAY_SIZE(hmem_ops); iface++) { + if (!hmem_ops[iface].initialized) + continue; + + ret = hmem_ops[iface].host_register(ptr, size); + if (ret != FI_SUCCESS) + goto err; + } + + return FI_SUCCESS; + +err: + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to register host memory with hmem iface %s: %s\n", + fi_tostr(&iface, FI_TYPE_HMEM_IFACE), + fi_strerror(-ret)); + + for (iface--; iface >= 0; iface--) { + if (!hmem_ops[iface].initialized) + continue; + + hmem_ops[iface].host_unregister(ptr); + } + + return ret; +} + +int ofi_hmem_host_unregister(void *ptr) +{ + int iface, ret; + + for (iface = 0; iface < ARRAY_SIZE(hmem_ops); iface++) { + if (!hmem_ops[iface].initialized) + continue; + + ret = hmem_ops[iface].host_unregister(ptr); + if (ret != FI_SUCCESS) + goto err; + } + + return FI_SUCCESS; + +err: + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to unregister host memory with hmem iface %s: %s\n", + fi_tostr(&iface, FI_TYPE_HMEM_IFACE), + fi_strerror(-ret)); + + return ret; +} diff --git a/src/hmem_cuda.c b/src/hmem_cuda.c new file mode 100644 index 00000000000..9581c72d323 --- /dev/null +++ b/src/hmem_cuda.c @@ -0,0 +1,466 @@ +/* + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if HAVE_CONFIG_H +#include +#endif + +#include "ofi_hmem.h" +#include "ofi.h" + +#if HAVE_LIBCUDA + +#include +#include + +struct cuda_ops { + cudaError_t (*cudaMemcpy)(void *dst, const void *src, size_t count, + enum cudaMemcpyKind kind); + const char *(*cudaGetErrorName)(cudaError_t error); + const char *(*cudaGetErrorString)(cudaError_t error); + CUresult (*cuPointerGetAttribute)(void *data, + CUpointer_attribute attribute, + CUdeviceptr ptr); + cudaError_t (*cudaHostRegister)(void *ptr, size_t size, + unsigned int flags); + cudaError_t (*cudaHostUnregister)(void *ptr); + cudaError_t (*cudaGetDeviceCount)(int *count); +}; + +static int hmem_cuda_use_gdrcopy; + +#ifdef ENABLE_CUDA_DLOPEN + +#include + +static void *cudart_handle; +static void *cuda_handle; +static struct cuda_ops cuda_ops; + +#else + +static struct cuda_ops cuda_ops = { + .cudaMemcpy = cudaMemcpy, + .cudaGetErrorName = cudaGetErrorName, + .cudaGetErrorString = cudaGetErrorString, + .cuPointerGetAttribute = cuPointerGetAttribute, + .cudaHostRegister = cudaHostRegister, + .cudaHostUnregister = cudaHostUnregister, + .cudaGetDeviceCount = cudaGetDeviceCount, +}; + +#endif /* ENABLE_CUDA_DLOPEN */ + +cudaError_t ofi_cudaMemcpy(void *dst, const void *src, size_t count, + enum cudaMemcpyKind kind) +{ + return cuda_ops.cudaMemcpy(dst, src, count, kind); +} + +const char *ofi_cudaGetErrorName(cudaError_t error) +{ + return cuda_ops.cudaGetErrorName(error); +} + +const char *ofi_cudaGetErrorString(cudaError_t error) +{ + return cuda_ops.cudaGetErrorString(error); +} + +CUresult ofi_cuPointerGetAttribute(void *data, CUpointer_attribute attribute, + CUdeviceptr ptr) +{ + return cuda_ops.cuPointerGetAttribute(data, attribute, ptr); +} + +cudaError_t ofi_cudaHostRegister(void *ptr, size_t size, unsigned int flags) +{ + return cuda_ops.cudaHostRegister(ptr, size, flags); +} + +cudaError_t ofi_cudaHostUnregister(void *ptr) +{ + return cuda_ops.cudaHostUnregister(ptr); +} + +static cudaError_t ofi_cudaGetDeviceCount(int *count) +{ + return cuda_ops.cudaGetDeviceCount(count); +} + +int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size) +{ + if (hmem_cuda_use_gdrcopy) { + cuda_gdrcopy_to_dev(device, dev, host, size); + return FI_SUCCESS; + } + + cudaError_t cuda_ret; + + cuda_ret = ofi_cudaMemcpy(dev, host, size, cudaMemcpyHostToDevice); + if (cuda_ret == cudaSuccess) + return 0; + + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to perform cudaMemcpy: %s:%s\n", + ofi_cudaGetErrorName(cuda_ret), + ofi_cudaGetErrorString(cuda_ret)); + + return -FI_EIO; +} + +int cuda_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size) +{ + if (hmem_cuda_use_gdrcopy) { + cuda_gdrcopy_from_dev(device, host, dev, size); + return FI_SUCCESS; + } + + cudaError_t cuda_ret; + + cuda_ret = ofi_cudaMemcpy(host, dev, size, cudaMemcpyDeviceToHost); + if (cuda_ret == cudaSuccess) + return 0; + + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to perform cudaMemcpy: %s:%s\n", + ofi_cudaGetErrorName(cuda_ret), + ofi_cudaGetErrorString(cuda_ret)); + + return -FI_EIO; +} + +int cuda_dev_register(struct fi_mr_attr *mr_attr, uint64_t *handle) +{ + if (hmem_cuda_use_gdrcopy) + return cuda_gdrcopy_dev_register(mr_attr, handle); + + *handle = mr_attr->device.cuda; + return FI_SUCCESS; +} + +int cuda_dev_unregister(uint64_t handle) +{ + if (hmem_cuda_use_gdrcopy) + return cuda_gdrcopy_dev_unregister(handle); + + return FI_SUCCESS; +} + +static int cuda_hmem_dl_init(void) +{ +#ifdef ENABLE_CUDA_DLOPEN + /* Assume failure to dlopen 
CUDA runtime is caused by the library not + * being found. Thus, CUDA is not supported. + */ + cudart_handle = dlopen("libcudart.so", RTLD_NOW); + if (!cudart_handle) { + FI_INFO(&core_prov, FI_LOG_CORE, + "Failed to dlopen libcudart.so\n"); + return -FI_ENOSYS; + } + + cuda_handle = dlopen("libcuda.so", RTLD_NOW); + if (!cuda_handle) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to dlopen libcuda.so\n"); + goto err_dlclose_cudart; + } + + cuda_ops.cudaMemcpy = dlsym(cudart_handle, "cudaMemcpy"); + if (!cuda_ops.cudaMemcpy) { + FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find cudaMemcpy\n"); + goto err_dlclose_cuda; + } + + cuda_ops.cudaGetErrorName = dlsym(cudart_handle, "cudaGetErrorName"); + if (!cuda_ops.cudaGetErrorName) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find cudaGetErrorName\n"); + goto err_dlclose_cuda; + } + + cuda_ops.cudaGetErrorString = dlsym(cudart_handle, + "cudaGetErrorString"); + if (!cuda_ops.cudaGetErrorString) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find cudaGetErrorString\n"); + goto err_dlclose_cuda; + } + + cuda_ops.cuPointerGetAttribute = dlsym(cuda_handle, + "cuPointerGetAttribute"); + if (!cuda_ops.cuPointerGetAttribute) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find cuPointerGetAttribute\n"); + goto err_dlclose_cuda; + } + + cuda_ops.cudaHostRegister = dlsym(cudart_handle, "cudaHostRegister"); + if (!cuda_ops.cudaHostRegister) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find cudaHostRegister\n"); + goto err_dlclose_cuda; + } + + cuda_ops.cudaHostUnregister = dlsym(cudart_handle, + "cudaHostUnregister"); + if (!cuda_ops.cudaHostUnregister) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find cudaHostUnregister\n"); + goto err_dlclose_cuda; + } + + cuda_ops.cudaGetDeviceCount = dlsym(cudart_handle, + "cudaGetDeviceCount"); + if (!cuda_ops.cudaGetDeviceCount) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find cudaGetDeviceCount\n"); + goto err_dlclose_cuda; + } + + return FI_SUCCESS; + +err_dlclose_cuda: + dlclose(cuda_handle); +err_dlclose_cudart: + dlclose(cudart_handle); + + return -FI_ENODATA; +#else + return FI_SUCCESS; +#endif /* ENABLE_CUDA_DLOPEN */ +} + +static void cuda_hmem_dl_cleanup(void) +{ +#ifdef ENABLE_CUDA_DLOPEN + dlclose(cuda_handle); + dlclose(cudart_handle); +#endif +} + +static int cuda_hmem_verify_devices(void) +{ + int device_count; + cudaError_t cuda_ret; + + /* Verify CUDA compute-capable devices are present on the host. */ + cuda_ret = ofi_cudaGetDeviceCount(&device_count); + switch (cuda_ret) { + case cudaSuccess: + break; + + case cudaErrorNoDevice: + return -FI_ENOSYS; + + default: + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to perform cudaGetDeviceCount: %s:%s\n", + ofi_cudaGetErrorName(cuda_ret), + ofi_cudaGetErrorString(cuda_ret)); + return -FI_EIO; + } + + if (device_count == 0) + return -FI_ENOSYS; + + return FI_SUCCESS; +} + +int cuda_hmem_init(void) +{ + int ret; + + ret = cuda_hmem_dl_init(); + if (ret != FI_SUCCESS) + return ret; + + ret = cuda_hmem_verify_devices(); + if (ret != FI_SUCCESS) + goto dl_cleanup; + + ret = cuda_gdrcopy_hmem_init(); + if (ret == FI_SUCCESS) { + hmem_cuda_use_gdrcopy = 1; + fi_param_define(NULL, "hmem_cuda_use_gdrcopy", FI_PARAM_BOOL, + "Use gdrcopy to copy data to/from GPU memory"); + fi_param_get_bool(NULL, "hmem_cuda_use_gdrcopy", + &hmem_cuda_use_gdrcopy); + } else { + hmem_cuda_use_gdrcopy = 0; + if (ret != -FI_ENOSYS) + FI_WARN(&core_prov, FI_LOG_CORE, + "gdrcopy initialization failed! 
gdrcopy will not be used.\n");
+ }
+
+ return ret;
+
+dl_cleanup:
+ cuda_hmem_dl_cleanup();
+
+ return ret;
+}
+
+int cuda_hmem_cleanup(void)
+{
+ cuda_hmem_dl_cleanup();
+ cuda_gdrcopy_hmem_cleanup();
+ return FI_SUCCESS;
+}
+
+bool cuda_is_addr_valid(const void *addr)
+{
+ CUresult cuda_ret;
+ unsigned int data;
+
+ cuda_ret = ofi_cuPointerGetAttribute(&data,
+ CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+ (CUdeviceptr)addr);
+ switch (cuda_ret) {
+ case CUDA_SUCCESS:
+ if (data == CU_MEMORYTYPE_DEVICE)
+ return true;
+ break;
+
+ /* Returned if the buffer is not associated with the CUcontext support
+ * unified virtual addressing. Since host buffers may fall into this
+ * category, this is not treated as an error.
+ */
+ case CUDA_ERROR_INVALID_VALUE:
+ break;
+
+ /* Returned if cuInit() has not been called. This can happen if support
+ * for CUDA is enabled but the user has not made a CUDA call. This is
+ * not treated as an error.
+ */
+ case CUDA_ERROR_NOT_INITIALIZED:
+ break;
+
+ /* Returned if the CUcontext does not support unified virtual
+ * addressing.
+ */
+ case CUDA_ERROR_INVALID_CONTEXT:
+ FI_WARN(&core_prov, FI_LOG_CORE,
+ "CUcontext does not support unified virtual addressing\n");
+ break;
+
+ default:
+ FI_WARN(&core_prov, FI_LOG_CORE,
+ "Unhandled cuPointerGetAttribute return code: ret=%d\n",
+ cuda_ret);
+ break;
+ }
+
+ return false;
+}
+
+int cuda_host_register(void *ptr, size_t size)
+{
+ cudaError_t cuda_ret;
+
+ cuda_ret = ofi_cudaHostRegister(ptr, size, cudaHostRegisterDefault);
+ if (cuda_ret == cudaSuccess)
+ return FI_SUCCESS;
+
+ FI_WARN(&core_prov, FI_LOG_CORE,
+ "Failed to perform cudaHostRegister: %s:%s\n",
+ ofi_cudaGetErrorName(cuda_ret),
+ ofi_cudaGetErrorString(cuda_ret));
+
+ return -FI_EIO;
+}
+
+int cuda_host_unregister(void *ptr)
+{
+ cudaError_t cuda_ret;
+
+ cuda_ret = ofi_cudaHostUnregister(ptr);
+ if (cuda_ret == cudaSuccess)
+ return FI_SUCCESS;
+
+ FI_WARN(&core_prov, FI_LOG_CORE,
+ "Failed to perform cudaHostUnregister: %s:%s\n",
+ ofi_cudaGetErrorName(cuda_ret),
+ ofi_cudaGetErrorString(cuda_ret));
+
+ return -FI_EIO;
+}
+
+#else
+
+int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size)
+{
+ return -FI_ENOSYS;
+}
+
+int cuda_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size)
+{
+ return -FI_ENOSYS;
+}
+
+int cuda_hmem_init(void)
+{
+ return -FI_ENOSYS;
+}
+
+int cuda_hmem_cleanup(void)
+{
+ return -FI_ENOSYS;
+}
+
+bool cuda_is_addr_valid(const void *addr)
+{
+ return false;
+}
+
+int cuda_host_register(void *ptr, size_t size)
+{
+ return -FI_ENOSYS;
+}
+
+int cuda_host_unregister(void *ptr)
+{
+ return -FI_ENOSYS;
+}
+
+int cuda_dev_register(struct fi_mr_attr *mr_attr, uint64_t *handle)
+{
+ return FI_SUCCESS;
+}
+
+int cuda_dev_unregister(uint64_t handle)
+{
+ return FI_SUCCESS;
+}
+
+#endif /* HAVE_LIBCUDA */
diff --git a/src/hmem_cuda_gdrcopy.c b/src/hmem_cuda_gdrcopy.c
new file mode 100644
index 00000000000..41ea5f35a2c
--- /dev/null
+++ b/src/hmem_cuda_gdrcopy.c
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
+ * All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if HAVE_CONFIG_H +#include +#endif + +#include "ofi_hmem.h" +#include "ofi.h" + +#ifdef HAVE_GDRCOPY + +#include +#include + +struct gdrcopy_handle { + gdr_mh_t mh; /* memory handler */ + void *cuda_ptr; /* page aligned gpu pointer */ + void *user_ptr; /* user space ptr mapped to GPU memory */ + size_t length; /* page aligned length */ +}; + +struct gdrcopy_ops { + gdr_t (*gdr_open)(); + int (*gdr_close)(gdr_t g); + int (*gdr_pin_buffer)(gdr_t g, unsigned long addr, size_t size, + uint64_t p2p_token, uint32_t va_space, + gdr_mh_t *handle); + int (*gdr_unpin_buffer)(gdr_t g, gdr_mh_t handle); + int (*gdr_map)(gdr_t g, gdr_mh_t handle, void **va, size_t size); + int (*gdr_unmap)(gdr_t g, gdr_mh_t handle, void *va, size_t size); + int (*gdr_copy_to_mapping)(gdr_mh_t handle, void *map_d_ptr, + const void *h_ptr, size_t size); + int (*gdr_copy_from_mapping)(gdr_mh_t handle, void *map_d_ptr, + const void *h_ptr, size_t size); +}; + +enum gdrcopy_dir { + GDRCOPY_TO_DEVICE, + GDRCOPY_FROM_DEVICE, +}; + +static gdr_t global_gdr; +static pthread_spinlock_t global_gdr_lock; + +#ifdef ENABLE_GDRCOPY_DLOPEN + +#include + +static void *gdrapi_handle; +static struct gdrcopy_ops global_gdrcopy_ops; + +static int cuda_gdrcopy_dl_hmem_init(void) +{ + gdrapi_handle = dlopen("libgdrapi.so", RTLD_NOW); + if (!gdrapi_handle) { + FI_INFO(&core_prov, FI_LOG_CORE, + "Failed to dlopen libgdrapi.so\n"); + return -FI_ENOSYS; + } + + global_gdrcopy_ops.gdr_open = dlsym(gdrapi_handle, "gdr_open"); + if (!global_gdrcopy_ops.gdr_open) { + FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find gdr_open\n"); + goto err_dlclose_gdrapi; + } + + global_gdrcopy_ops.gdr_close = dlsym(gdrapi_handle, "gdr_close"); + if (!global_gdrcopy_ops.gdr_close) { + FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find gdr_close\n"); + goto err_dlclose_gdrapi; + } + + global_gdrcopy_ops.gdr_pin_buffer = dlsym(gdrapi_handle, "gdr_pin_buffer"); + if (!global_gdrcopy_ops.gdr_pin_buffer) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find gdr_pin_buffer\n"); + goto err_dlclose_gdrapi; + } + + global_gdrcopy_ops.gdr_unpin_buffer = dlsym(gdrapi_handle, "gdr_unpin_buffer"); + if (!global_gdrcopy_ops.gdr_unpin_buffer) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find gdr_unpin_buffer\n"); + goto err_dlclose_gdrapi; + } + + 
global_gdrcopy_ops.gdr_map = dlsym(gdrapi_handle, "gdr_map"); + if (!global_gdrcopy_ops.gdr_map) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find gdr_map\n"); + goto err_dlclose_gdrapi; + } + + global_gdrcopy_ops.gdr_unmap = dlsym(gdrapi_handle, "gdr_unmap"); + if (!global_gdrcopy_ops.gdr_unmap) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find gdr_unmap\n"); + goto err_dlclose_gdrapi; + } + + global_gdrcopy_ops.gdr_copy_to_mapping = dlsym(gdrapi_handle, "gdr_copy_to_mapping"); + if (!global_gdrcopy_ops.gdr_copy_to_mapping) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find gdr_copy_to_mapping\n"); + goto err_dlclose_gdrapi; + } + + global_gdrcopy_ops.gdr_copy_from_mapping = dlsym(gdrapi_handle, "gdr_copy_from_mapping"); + if (!global_gdrcopy_ops.gdr_copy_from_mapping) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find gdr_copy_from_mapping\n"); + goto err_dlclose_gdrapi; + } + + return FI_SUCCESS; + +err_dlclose_gdrapi: + memset(&global_gdrcopy_ops, 0, sizeof(global_gdrcopy_ops)); + dlclose(gdrapi_handle); + return -FI_ENODATA; +} + +static int cuda_gdrcopy_dl_hmem_cleanup(void) +{ + dlclose(gdrapi_handle); + return FI_SUCCESS; +} + +#else + +static struct gdrcopy_ops global_gdrcopy_ops = { + .gdr_open = gdr_open, + .gdr_close = gdr_close, + .gdr_pin_buffer = gdr_pin_buffer, + .gdr_unpin_buffer = gdr_unpin_buffer, + .gdr_map = gdr_map, + .gdr_unmap = gdr_unmap, + .gdr_copy_to_mapping = gdr_copy_to_mapping, + .gdr_copy_from_mapping = gdr_copy_from_mapping +}; + +static int cuda_gdrcopy_dl_hmem_init(void) +{ + return FI_SUCCESS; +} + +static int cuda_gdrcopy_dl_hmem_cleanup(void) +{ + return FI_SUCCESS; +} + +#endif /* ENABLE_CUDA_DLOPEN */ + +int cuda_gdrcopy_hmem_init(void) +{ + int err, ret = 0; + + err = cuda_gdrcopy_dl_hmem_init(); + if (err) { + FI_WARN(&core_prov, FI_LOG_CORE, + "gdrcopy_dl_hmem_init failed!\n"); + return -FI_ENOSYS; + } + + assert(global_gdrcopy_ops.gdr_open); + + global_gdr = global_gdrcopy_ops.gdr_open(); + if (!global_gdr) { + FI_WARN(&core_prov, FI_LOG_CORE, + "gdr_open failed!\n"); + ret = -FI_ENOMEM; + goto exit; + } + + err = pthread_spin_init(&global_gdr_lock, 0); + if (err) { + assert(global_gdrcopy_ops.gdr_close); + global_gdrcopy_ops.gdr_close(global_gdr); + ret = -err; + } + +exit: + cuda_gdrcopy_dl_hmem_cleanup(); + return ret; +} + +int cuda_gdrcopy_hmem_cleanup(void) +{ + int err, ret = 0; + + err = pthread_spin_destroy(&global_gdr_lock); + if (err) { + FI_WARN(&core_prov, FI_LOG_CORE, + "destroy global_gdr_lock failed! err: %s\n", + strerror(err)); + ret = err; + } + + assert(global_gdrcopy_ops.gdr_close); + err = global_gdrcopy_ops.gdr_close(global_gdr); + if (err) { + FI_WARN(&core_prov, FI_LOG_CORE, + "close global_gdr failed! err: %s\n", + strerror(err)); + ret = err; + } + + err = cuda_gdrcopy_dl_hmem_cleanup(); + if (err) { + FI_WARN(&core_prov, FI_LOG_CORE, + "cuda_gdrcopy_dl_hmem_cleaup() failed! 
err: %s\n", + strerror(err)); + ret = err; + } + + return ret; +} + +void cuda_gdrcopy_impl(uint64_t handle, void *devptr, + void *hostptr, size_t len, + enum gdrcopy_dir dir) +{ + ssize_t off; + struct gdrcopy_handle *gdrcopy; + void *gdrcopy_user_ptr; + + assert(global_gdrcopy_ops.gdr_copy_to_mapping); + assert(handle); + + gdrcopy = (struct gdrcopy_handle *)handle; + off = (char *)devptr - (char *)gdrcopy->cuda_ptr; + assert(off >= 0 && off + len <= gdrcopy->length); + gdrcopy_user_ptr = (char *)gdrcopy->user_ptr + off; + if (dir == GDRCOPY_TO_DEVICE) { + global_gdrcopy_ops.gdr_copy_to_mapping(gdrcopy->mh, + gdrcopy_user_ptr, + hostptr, len); + } else { + assert(dir == GDRCOPY_FROM_DEVICE); + global_gdrcopy_ops.gdr_copy_from_mapping(gdrcopy->mh, + gdrcopy_user_ptr, + hostptr, len); + } +} + +void cuda_gdrcopy_to_dev(uint64_t handle, void *devptr, + const void *hostptr, size_t len) +{ + cuda_gdrcopy_impl(handle, devptr, (void *)hostptr, len, + GDRCOPY_TO_DEVICE); +} + +void cuda_gdrcopy_from_dev(uint64_t handle, void *hostptr, + const void *devptr, size_t len) +{ + cuda_gdrcopy_impl(handle, (void *)devptr, hostptr, len, + GDRCOPY_FROM_DEVICE); +} + +int cuda_gdrcopy_dev_register(struct fi_mr_attr *mr_attr, uint64_t *handle) +{ + int err; + uintptr_t regbgn, regend; + size_t reglen; + struct gdrcopy_handle *gdrcopy; + + assert(global_gdr); + assert(global_gdrcopy_ops.gdr_pin_buffer); + assert(global_gdrcopy_ops.gdr_map); + + regbgn = (uintptr_t)mr_attr->mr_iov->iov_base; + regend = (uintptr_t)mr_attr->mr_iov->iov_base + mr_attr->mr_iov->iov_len; + regbgn = regbgn & GPU_PAGE_MASK; + regend = (regend & GPU_PAGE_MASK) + GPU_PAGE_SIZE; + reglen = regend - regbgn; + + gdrcopy = malloc(sizeof(struct gdrcopy_handle)); + if (!gdrcopy) + return -FI_ENOMEM; + + assert(global_gdr); + pthread_spin_lock(&global_gdr_lock); + err = global_gdrcopy_ops.gdr_pin_buffer(global_gdr, regbgn, + reglen, 0, 0, &gdrcopy->mh); + if (err) { + FI_WARN(&core_prov, FI_LOG_CORE, + "gdr_pin_buffer failed! error: %s", + strerror(err)); + free(gdrcopy); + goto exit; + } + + gdrcopy->cuda_ptr = (void *)regbgn; + gdrcopy->length = reglen; + + err = global_gdrcopy_ops.gdr_map(global_gdr, gdrcopy->mh, + &gdrcopy->user_ptr, gdrcopy->length); + if (err) { + FI_WARN(&core_prov, FI_LOG_CORE, "gdr_map failed! error: %s\n", + strerror(err)); + global_gdrcopy_ops.gdr_unpin_buffer(global_gdr, gdrcopy->mh); + free(gdrcopy); + goto exit; + } + + *handle = (uint64_t)gdrcopy; +exit: + pthread_spin_unlock(&global_gdr_lock); + return err; +} + +int cuda_gdrcopy_dev_unregister(uint64_t handle) +{ + int err; + struct gdrcopy_handle *gdrcopy; + + assert(global_gdr); + assert(global_gdrcopy_ops.gdr_unmap); + assert(global_gdrcopy_ops.gdr_unpin_buffer); + + gdrcopy = (struct gdrcopy_handle *)handle; + assert(gdrcopy); + + pthread_spin_lock(&global_gdr_lock); + err = global_gdrcopy_ops.gdr_unmap(global_gdr, gdrcopy->mh, + gdrcopy->user_ptr, gdrcopy->length); + if (err) { + FI_WARN(&core_prov, FI_LOG_CORE, + "gdr_unmap failed! error: %s\n", + strerror(err)); + goto exit; + } + + err = global_gdrcopy_ops.gdr_unpin_buffer(global_gdr, gdrcopy->mh); + if (err) { + FI_WARN(&core_prov, FI_LOG_MR, + "gdr_unmap failed! 
error: %s\n", + strerror(err)); + goto exit; + } + +exit: + pthread_spin_unlock(&global_gdr_lock); + free(gdrcopy); + return err; +} + +#else + +int cuda_gdrcopy_hmem_init(void) +{ + return -FI_ENOSYS; +} + +int cuda_gdrcopy_hmem_cleanup(void) +{ + return FI_SUCCESS; +} + +void cuda_gdrcopy_to_dev(uint64_t devhandle, void *devptr, + const void *hostptr, size_t len) +{ +} + +void cuda_gdrcopy_from_dev(uint64_t devhandle, void *hostptr, + const void *devptr, size_t len) +{ +} + +int cuda_gdrcopy_dev_register(struct fi_mr_attr *mr_attr, uint64_t *handle) +{ + return FI_SUCCESS; +} + +int cuda_gdrcopy_dev_unregister(uint64_t handle) +{ + return FI_SUCCESS; +} + +#endif /* HAVE_GDRCOPY */ diff --git a/src/hmem_rocr.c b/src/hmem_rocr.c new file mode 100644 index 00000000000..640a777a6ce --- /dev/null +++ b/src/hmem_rocr.c @@ -0,0 +1,501 @@ +/* + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
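[Editor's note: a short usage sketch of the gdrcopy hooks defined in the file above, showing how registration, a small host-to-device copy, and unregistration fit together. The example function is hypothetical; it assumes the MR attribute describes CUDA device memory and that len fits inside the registered iov.]

	#include <rdma/fi_domain.h>	/* struct fi_mr_attr */

	static int example_reg_and_copy(struct fi_mr_attr *attr,
					const void *src, size_t len)
	{
		uint64_t gdr_handle;
		int ret;

		ret = cuda_gdrcopy_dev_register(attr, &gdr_handle);
		if (ret)
			return ret;

		/* Small host->device copy through the pinned BAR mapping. */
		cuda_gdrcopy_to_dev(gdr_handle, attr->mr_iov->iov_base,
				    src, len);

		return cuda_gdrcopy_dev_unregister(gdr_handle);
	}
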
+ */ + +#if HAVE_CONFIG_H +#include +#endif + +#include "ofi_hmem.h" +#include "ofi.h" + +#ifdef HAVE_ROCR + +#include + +struct rocr_ops { + hsa_status_t (*hsa_memory_copy)(void *dst, const void *src, + size_t size); + hsa_status_t (*hsa_amd_pointer_info)(void *ptr, + hsa_amd_pointer_info_t *info, + void *(*alloc)(size_t), + uint32_t *num_agents_accessible, + hsa_agent_t **accessible); + hsa_status_t (*hsa_init)(void); + hsa_status_t (*hsa_shut_down)(void); + hsa_status_t (*hsa_status_string)(hsa_status_t status, + const char **status_string); + hsa_status_t (*hsa_amd_dereg_dealloc_cb)(void *ptr, + hsa_amd_deallocation_callback_t cb); + hsa_status_t (*hsa_amd_reg_dealloc_cb)(void *ptr, + hsa_amd_deallocation_callback_t cb, + void *user_data); + hsa_status_t (*hsa_amd_memory_lock)(void *host_ptr, size_t size, + hsa_agent_t *agents, int num_agents, + void **agent_ptr); + hsa_status_t (*hsa_amd_memory_unlock)(void *host_ptr); + hsa_status_t (*hsa_agent_get_info)(hsa_agent_t agent, + hsa_agent_info_t attribute, + void *value); +}; + +#ifdef ENABLE_ROCR_DLOPEN + +#include + +static void *rocr_handle; +static struct rocr_ops rocr_ops; + +#else + +static struct rocr_ops rocr_ops = { + .hsa_memory_copy = hsa_memory_copy, + .hsa_amd_pointer_info = hsa_amd_pointer_info, + .hsa_init = hsa_init, + .hsa_shut_down = hsa_shut_down, + .hsa_status_string = hsa_status_string, + .hsa_amd_dereg_dealloc_cb = + hsa_amd_deregister_deallocation_callback, + .hsa_amd_reg_dealloc_cb = + hsa_amd_register_deallocation_callback, + .hsa_amd_memory_lock = hsa_amd_memory_lock, + .hsa_amd_memory_unlock = hsa_amd_memory_unlock, + .hsa_agent_get_info = hsa_agent_get_info, +}; + +#endif /* ENABLE_ROCR_DLOPEN */ + +hsa_status_t ofi_hsa_amd_memory_lock(void *host_ptr, size_t size, + hsa_agent_t *agents, int num_agents, + void **agent_ptr) +{ + return rocr_ops.hsa_amd_memory_lock(host_ptr, size, agents, num_agents, + agent_ptr); +} + +hsa_status_t ofi_hsa_amd_memory_unlock(void *host_ptr) +{ + return rocr_ops.hsa_amd_memory_unlock(host_ptr); +} + +hsa_status_t ofi_hsa_memory_copy(void *dst, const void *src, size_t size) +{ + return rocr_ops.hsa_memory_copy(dst, src, size); +} + +hsa_status_t ofi_hsa_amd_pointer_info(void *ptr, hsa_amd_pointer_info_t *info, + void *(*alloc)(size_t), + uint32_t *num_agents_accessible, + hsa_agent_t **accessible) +{ + return rocr_ops.hsa_amd_pointer_info(ptr, info, alloc, + num_agents_accessible, accessible); +} + +hsa_status_t ofi_hsa_init(void) +{ + return rocr_ops.hsa_init(); +} + +hsa_status_t ofi_hsa_shut_down(void) +{ + return rocr_ops.hsa_shut_down(); +} + +hsa_status_t ofi_hsa_status_string(hsa_status_t status, + const char **status_string) +{ + return rocr_ops.hsa_status_string(status, status_string); +} + +const char *ofi_hsa_status_to_string(hsa_status_t status) +{ + const char *str; + hsa_status_t hsa_ret; + + hsa_ret = ofi_hsa_status_string(status, &str); + if (hsa_ret != HSA_STATUS_SUCCESS) + return "unknown error"; + + return str; +} + +hsa_status_t ofi_hsa_amd_dereg_dealloc_cb(void *ptr, + hsa_amd_deallocation_callback_t cb) +{ + return rocr_ops.hsa_amd_dereg_dealloc_cb(ptr, cb); +} + +hsa_status_t ofi_hsa_amd_reg_dealloc_cb(void *ptr, + hsa_amd_deallocation_callback_t cb, + void *user_data) +{ + return rocr_ops.hsa_amd_reg_dealloc_cb(ptr, cb, user_data); +} + +static hsa_status_t ofi_hsa_agent_get_info(hsa_agent_t agent, + hsa_agent_info_t attribute, + void *value) +{ + return rocr_ops.hsa_agent_get_info(agent, attribute, value); +} + +static int rocr_memcpy(void *dest, const void 
*src, size_t size) +{ + hsa_status_t hsa_ret; + + hsa_ret = ofi_hsa_memory_copy(dest, src, size); + if (hsa_ret == HSA_STATUS_SUCCESS) + return 0; + + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to perform hsa_memory_copy: %s\n", + ofi_hsa_status_to_string(hsa_ret)); + + return -FI_EIO; +} + +static int rocr_host_memory_ptr(void *host_ptr, void **ptr) +{ + hsa_amd_pointer_info_t info = { + .size = sizeof(info), + }; + hsa_status_t hsa_ret; + + hsa_ret = ofi_hsa_amd_pointer_info((void *)host_ptr, &info, NULL, NULL, + NULL); + if (hsa_ret != HSA_STATUS_SUCCESS) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to perform hsa_amd_pointer_info: %s\n", + ofi_hsa_status_to_string(hsa_ret)); + + return -FI_EIO; + } + + if (info.type != HSA_EXT_POINTER_TYPE_LOCKED) + *ptr = host_ptr; + else + *ptr = (void *) ((uintptr_t) info.agentBaseAddress + + (uintptr_t) host_ptr - + (uintptr_t) info.hostBaseAddress); + + return FI_SUCCESS; +} + +int rocr_copy_from_dev(uint64_t device, void *dest, const void *src, + size_t size) +{ + int ret; + void *dest_memcpy_ptr; + + ret = rocr_host_memory_ptr(dest, &dest_memcpy_ptr); + if (ret != FI_SUCCESS) + return ret; + + ret = rocr_memcpy(dest_memcpy_ptr, src, size); + + return ret; +} + +int rocr_copy_to_dev(uint64_t device, void *dest, const void *src, + size_t size) +{ + int ret; + void *src_memcpy_ptr; + + ret = rocr_host_memory_ptr((void *) src, &src_memcpy_ptr); + if (ret != FI_SUCCESS) + return ret; + + ret = rocr_memcpy(dest, src_memcpy_ptr, size); + + return ret; +} + +bool rocr_is_addr_valid(const void *addr) +{ + hsa_amd_pointer_info_t hsa_info = { + .size = sizeof(hsa_info), + }; + hsa_device_type_t hsa_dev_type; + hsa_status_t hsa_ret; + + hsa_ret = ofi_hsa_amd_pointer_info((void *)addr, &hsa_info, NULL, NULL, + NULL); + if (hsa_ret == HSA_STATUS_SUCCESS) { + hsa_ret = ofi_hsa_agent_get_info(hsa_info.agentOwner, + HSA_AGENT_INFO_DEVICE, + (void *) &hsa_dev_type); + if (hsa_ret == HSA_STATUS_SUCCESS) { + if (hsa_dev_type == HSA_DEVICE_TYPE_GPU) + return true; + } else { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to perform hsa_agent_get_info: %s\n", + ofi_hsa_status_to_string(hsa_ret)); + } + } else { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to perform hsa_amd_pointer_info: %s\n", + ofi_hsa_status_to_string(hsa_ret)); + } + + return false; +} + +static int rocr_hmem_dl_init(void) +{ +#ifdef ENABLE_ROCR_DLOPEN + /* Assume if dlopen fails, the ROCR library could not be found. Do not + * treat this as an error. 
+ */ + rocr_handle = dlopen("libhsa-runtime64.so", RTLD_NOW); + if (!rocr_handle) { + FI_INFO(&core_prov, FI_LOG_CORE, + "Unable to dlopen libhsa-runtime64.so\n"); + return -FI_ENOSYS; + } + + rocr_ops.hsa_memory_copy = dlsym(rocr_handle, "hsa_memory_copy"); + if (!rocr_ops.hsa_memory_copy) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find hsa_memory_copy\n"); + goto err; + } + + rocr_ops.hsa_amd_pointer_info = dlsym(rocr_handle, + "hsa_amd_pointer_info"); + if (!rocr_ops.hsa_amd_pointer_info) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find hsa_amd_pointer_info\n"); + goto err; + } + + rocr_ops.hsa_init = dlsym(rocr_handle, "hsa_init"); + if (!rocr_ops.hsa_init) { + FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find hsa_init\n"); + goto err; + } + + rocr_ops.hsa_shut_down = dlsym(rocr_handle, "hsa_shut_down"); + if (!rocr_ops.hsa_shut_down) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find hsa_shut_down\n"); + goto err; + } + + rocr_ops.hsa_status_string = dlsym(rocr_handle, "hsa_status_string"); + if (!rocr_ops.hsa_status_string) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find hsa_status_string\n"); + goto err; + } + + rocr_ops.hsa_amd_dereg_dealloc_cb = + dlsym(rocr_handle, "hsa_amd_deregister_deallocation_callback"); + if (!rocr_ops.hsa_amd_dereg_dealloc_cb) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find hsa_amd_deregister_deallocation_callback\n"); + goto err; + } + + rocr_ops.hsa_amd_reg_dealloc_cb = + dlsym(rocr_handle, "hsa_amd_register_deallocation_callback"); + if (!rocr_ops.hsa_amd_reg_dealloc_cb) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find hsa_amd_register_deallocation_callback\n"); + goto err; + } + + rocr_ops.hsa_amd_memory_lock = dlsym(rocr_handle, + "hsa_amd_memory_lock"); + if (!rocr_ops.hsa_amd_memory_lock) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find hsa_amd_memory_lock\n"); + goto err; + } + + rocr_ops.hsa_amd_memory_unlock = dlsym(rocr_handle, + "hsa_amd_memory_unlock"); + if (!rocr_ops.hsa_amd_memory_lock) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find hsa_amd_memory_unlock\n"); + goto err; + } + + rocr_ops.hsa_agent_get_info = dlsym(rocr_handle, "hsa_agent_get_info"); + if (!rocr_ops.hsa_agent_get_info) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to find hsa_agent_get_info\n"); + goto err; + } + + return FI_SUCCESS; + +err: + dlclose(rocr_handle); + + return -FI_ENODATA; +#else + return FI_SUCCESS; +#endif /* ENABLE_ROCR_DLOPEN */ +} + +static void rocr_hmem_dl_cleanup(void) +{ +#ifdef ENABLE_ROCR_DLOPEN + dlclose(rocr_handle); +#endif +} + +int rocr_hmem_init(void) +{ + hsa_status_t hsa_ret; + int ret; + int log_level; + + ret = rocr_hmem_dl_init(); + if (ret != FI_SUCCESS) + return ret; + + hsa_ret = ofi_hsa_init(); + if (hsa_ret == HSA_STATUS_SUCCESS) + return FI_SUCCESS; + + /* Treat HSA_STATUS_ERROR_OUT_OF_RESOURCES as ROCR not being supported + * instead of an error. This ROCR error is typically returned if no + * devices are supported. 
+ */ + if (hsa_ret == HSA_STATUS_ERROR_OUT_OF_RESOURCES) { + log_level = FI_LOG_INFO; + ret = -FI_ENOSYS; + } else { + log_level = FI_LOG_WARN; + ret = -FI_EIO; + } + + FI_LOG(&core_prov, log_level, FI_LOG_CORE, + "Failed to perform hsa_init: %s\n", + ofi_hsa_status_to_string(hsa_ret)); + + rocr_hmem_dl_cleanup(); + + return ret; +} + +int rocr_hmem_cleanup(void) +{ + hsa_status_t hsa_ret; + + hsa_ret = ofi_hsa_shut_down(); + if (hsa_ret != HSA_STATUS_SUCCESS) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to perform hsa_shut_down: %s\n", + ofi_hsa_status_to_string(hsa_ret)); + return -FI_ENODATA; + } + + rocr_hmem_dl_cleanup(); + + return FI_SUCCESS; +} + +int rocr_host_register(void *ptr, size_t size) +{ + hsa_status_t hsa_ret; + void *tmp; + + hsa_ret = ofi_hsa_amd_memory_lock(ptr, size, NULL, 0, &tmp); + if (hsa_ret == HSA_STATUS_SUCCESS) + return FI_SUCCESS; + + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to perform hsa_amd_memory_lock: %s\n", + ofi_hsa_status_to_string(hsa_ret)); + + return -FI_EIO; +} + +int rocr_host_unregister(void *ptr) +{ + hsa_status_t hsa_ret; + + hsa_ret = ofi_hsa_amd_memory_unlock(ptr); + if (hsa_ret == HSA_STATUS_SUCCESS) + return FI_SUCCESS; + + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to perform hsa_amd_memory_unlock: %s\n", + ofi_hsa_status_to_string(hsa_ret)); + + return -FI_EIO; +} + +#else + +int rocr_copy_from_dev(uint64_t device, void *dest, const void *src, + size_t size) +{ + return -FI_ENOSYS; +} + +int rocr_copy_to_dev(uint64_t device, void *dest, const void *src, + size_t size) +{ + return -FI_ENOSYS; +} + +int rocr_hmem_init(void) +{ + return -FI_ENOSYS; +} + +int rocr_hmem_cleanup(void) +{ + return -FI_ENOSYS; +} + +bool rocr_is_addr_valid(const void *addr) +{ + return false; +} + +int rocr_host_register(void *ptr, size_t size) +{ + return -FI_ENOSYS; +} + +int rocr_host_unregister(void *ptr) +{ + return -FI_ENOSYS; +} + +#endif /* HAVE_ROCR */ diff --git a/src/hmem_ze.c b/src/hmem_ze.c new file mode 100644 index 00000000000..c99a29e490d --- /dev/null +++ b/src/hmem_ze.c @@ -0,0 +1,418 @@ +/* + * Copyright (c) 2020-2021 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
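[Editor's note: the ROCR hooks above pair host locking with hsa_memory_copy; rocr_host_memory_ptr() then resolves the locked buffer to its agent address before the copy. A sketch only, with a hypothetical function name, assuming the hmem_rocr.c context.]

	/* Stage a bounce buffer through the ROCR path added above. */
	static int example_rocr_stage(void *host_buf, size_t len, void *dev_buf)
	{
		int ret;

		ret = rocr_host_register(host_buf, len);  /* hsa_amd_memory_lock */
		if (ret)
			return ret;

		/* device id 0: the ROCR copy path does not use the device argument. */
		ret = rocr_copy_to_dev(0, dev_buf, host_buf, len);

		rocr_host_unregister(host_buf);           /* hsa_amd_memory_unlock */
		return ret;
	}
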
+ */ + +#if HAVE_CONFIG_H +#include +#endif + +#include "ofi_hmem.h" +#include "ofi.h" + +#ifdef HAVE_LIBZE + +#include +#include +#include +#include + +static ze_context_handle_t context; +static ze_device_handle_t devices[ZE_MAX_DEVICES]; +static ze_command_queue_handle_t cmd_queue[ZE_MAX_DEVICES]; +static int num_devices = 0; +static int dev_fds[ZE_MAX_DEVICES]; +static bool p2p_enabled = false; + +static const ze_command_queue_desc_t cq_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + .pNext = NULL, + .ordinal = 0, + .index = 0, + .flags = 0, + .mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, + .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, +}; + +static const ze_command_list_desc_t cl_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, + .pNext = NULL, + .commandQueueGroupOrdinal = 0, + .flags = 0, +}; + +static int ze_hmem_init_fds(void) +{ + const char *dev_dir = "/dev/dri/by-path"; + const char *suffix = "-render"; + DIR *dir; + struct dirent *ent = NULL; + char dev_name[128]; + int i = 0; + + dir = opendir(dev_dir); + if (dir == NULL) + return -FI_EIO; + + while ((ent = readdir(dir)) != NULL) { + if (ent->d_name[0] == '.' || + strstr(ent->d_name, suffix) == NULL) + continue; + + memset(dev_name, 0, sizeof(dev_name)); + strncpy(dev_name, dev_dir, sizeof(dev_name)); + strncat(dev_name, "/", + sizeof(dev_name) - strlen(dev_name)); + strncat(dev_name, ent->d_name, + sizeof(dev_name) - strlen(dev_name)); + dev_fds[i] = open(dev_name, O_RDWR); + if (dev_fds[i] == -1) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed open device %d\n", i); + return -FI_EIO; + } + i++; + } + return FI_SUCCESS; +} + +int ze_hmem_init(void) +{ + ze_driver_handle_t driver; + ze_context_desc_t context_desc = {0}; + ze_result_t ze_ret; + ze_bool_t access; + uint32_t count, i; + bool p2p = true; + int ret; + + ze_ret = zeInit(ZE_INIT_FLAG_GPU_ONLY); + if (ze_ret) + return -FI_EIO; + + count = 1; + ze_ret = zeDriverGet(&count, &driver); + if (ze_ret) + return -FI_EIO; + + ze_ret = zeContextCreate(driver, &context_desc, &context); + if (ze_ret) + return -FI_EIO; + + for (i = 0; i < ZE_MAX_DEVICES; dev_fds[i++] = -1) + ; + + count = 0; + ze_ret = zeDeviceGet(driver, &count, NULL); + if (ze_ret || count > ZE_MAX_DEVICES) + goto err; + + ze_ret = zeDeviceGet(driver, &count, devices); + if (ze_ret) + goto err; + + ret = ze_hmem_init_fds(); + if (ret) + goto err; + + for (num_devices = 0; num_devices < count; num_devices++) { + ze_ret = zeCommandQueueCreate(context, devices[num_devices], &cq_desc, + &cmd_queue[num_devices]); + if (ze_ret) + goto err; + + for (i = 0; i < count; i++) { + if (zeDeviceCanAccessPeer(devices[num_devices], + devices[i], &access) || !access) + p2p = false; + } + } + + p2p_enabled = p2p; + return FI_SUCCESS; + +err: + (void) ze_hmem_cleanup(); + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to initialize ZE driver resources\n"); + + return -FI_EIO; +} + +int ze_hmem_cleanup(void) +{ + int i, ret = FI_SUCCESS; + + for (i = 0; i < num_devices; i++) { + if (cmd_queue[i] && zeCommandQueueDestroy(cmd_queue[i])) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to destroy ZE cmd_queue\n"); + ret = -FI_EINVAL; + } + if (dev_fds[i] != -1) { + close(dev_fds[i]); + dev_fds[i] = -1; + } + } + + if (zeContextDestroy(context)) + return -FI_EINVAL; + + return ret; +} + +int ze_hmem_copy(uint64_t device, void *dst, const void *src, size_t size) +{ + ze_command_list_handle_t cmd_list; + ze_result_t ze_ret; + int dev_id = (int) device; + + ze_ret = zeCommandListCreate(context, devices[dev_id], &cl_desc, 
&cmd_list); + if (ze_ret) + goto err; + + ze_ret = zeCommandListAppendMemoryCopy(cmd_list, dst, src, size, NULL, 0, NULL); + if (ze_ret) + goto free; + + ze_ret = zeCommandListClose(cmd_list); + if (ze_ret) + goto free; + + ze_ret = zeCommandQueueExecuteCommandLists(cmd_queue[dev_id], 1, + &cmd_list, NULL); + +free: + if (!zeCommandListDestroy(cmd_list) && !ze_ret) + return FI_SUCCESS; +err: + FI_WARN(&core_prov, FI_LOG_CORE, + "Failed to perform ze copy (%d)\n", ze_ret); + + return -FI_EIO; +} + +bool ze_is_addr_valid(const void *addr) +{ + ze_result_t ze_ret; + ze_memory_allocation_properties_t mem_prop; + int i; + + for (i = 0; i < num_devices; i++) { + ze_ret = zeMemGetAllocProperties(context, addr, &mem_prop, + &devices[i]); + if (!ze_ret && mem_prop.type == ZE_MEMORY_TYPE_DEVICE) + return true; + } + return false; +} + +int ze_hmem_get_handle(void *dev_buf, void **handle) +{ + ze_result_t ze_ret; + + ze_ret = zeMemGetIpcHandle(context, dev_buf, + (ze_ipc_mem_handle_t *) handle); + if (ze_ret) { + FI_WARN(&core_prov, FI_LOG_CORE, "Unable to get handle\n"); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +int ze_hmem_open_handle(void **handle, uint64_t device, void **ipc_ptr) +{ + ze_result_t ze_ret; + + ze_ret = zeMemOpenIpcHandle(context, devices[device], + *((ze_ipc_mem_handle_t *) handle), + 0, ipc_ptr); + if (ze_ret) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Unable to open memory handle\n"); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +int ze_hmem_get_shared_handle(int dev_fd, void *dev_buf, int *ze_fd, + void **handle) +{ + struct drm_prime_handle open_fd = {0, 0, 0}; + ze_ipc_mem_handle_t ze_handle; + int ret; + + ret = ze_hmem_get_handle(dev_buf, (void **) &ze_handle); + if (ret) + return ret; + + memcpy(ze_fd, &ze_handle, sizeof(*ze_fd)); + memcpy(&open_fd.fd, &ze_handle, sizeof(open_fd.fd)); + ret = ioctl(dev_fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &open_fd); + if (ret) { + FI_WARN(&core_prov, FI_LOG_CORE, + "ioctl call failed on get, err %d\n", errno); + return -FI_EINVAL; + } + + *(int *) handle = open_fd.handle; + return FI_SUCCESS; +} + +int ze_hmem_open_shared_handle(int dev_fd, void **handle, int *ze_fd, + uint64_t device, void **ipc_ptr) +{ + struct drm_prime_handle open_fd = {0, 0, 0}; + ze_ipc_mem_handle_t ze_handle; + int ret; + + open_fd.flags = DRM_CLOEXEC | DRM_RDWR; + open_fd.handle = *(int *) handle; + + ret = ioctl(dev_fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &open_fd); + if (ret) { + FI_WARN(&core_prov, FI_LOG_CORE, + "ioctl call failed on open, err %d\n", errno); + return -FI_EINVAL; + } + + *ze_fd = open_fd.fd; + memset(&ze_handle, 0, sizeof(ze_handle)); + memcpy(&ze_handle, &open_fd.fd, sizeof(open_fd.fd)); + return ze_hmem_open_handle((void **) &ze_handle, device, ipc_ptr); +} + +int ze_hmem_close_handle(void *ipc_ptr) +{ + ze_result_t ze_ret; + + ze_ret = zeMemCloseIpcHandle(context, ipc_ptr); + if (ze_ret) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Unable to close memory handle\n"); + return -FI_EINVAL; + } + + return FI_SUCCESS; +} + +bool ze_hmem_p2p_enabled(void) +{ + return p2p_enabled; +} + +int ze_hmem_get_base_addr(const void *ptr, void **base) +{ + ze_result_t ze_ret; + size_t size; + + ze_ret = zeMemGetAddressRange(context, ptr, base, &size); + if (ze_ret) { + FI_WARN(&core_prov, FI_LOG_CORE, + "Could not get base addr\n"); + return -FI_EINVAL; + } + return FI_SUCCESS; +} + +int *ze_hmem_get_dev_fds(int *nfds) +{ + *nfds = num_devices; + return dev_fds; +} + +#else + +int ze_hmem_init(void) +{ + return -FI_ENOSYS; +} + +int ze_hmem_cleanup(void) +{ 
+ return -FI_ENOSYS; +} + +int ze_hmem_copy(uint64_t device, void *dst, const void *src, size_t size) +{ + return -FI_ENOSYS; +} + +bool ze_is_addr_valid(const void *addr) +{ + return false; +} + +int ze_hmem_get_handle(void *dev_buf, void **handle) +{ + return -FI_ENOSYS; +} + +int ze_hmem_open_handle(void **handle, uint64_t device, void **ipc_ptr) +{ + return -FI_ENOSYS; +} + +int ze_hmem_get_shared_handle(int dev_fd, void *dev_buf, int *ze_fd, + void **handle) +{ + return -FI_ENOSYS; +} + +int ze_hmem_open_shared_handle(int dev_fd, void **handle, int *ze_fd, + uint64_t device, void **ipc_ptr) +{ + return -FI_ENOSYS; +} + +int ze_hmem_close_handle(void *ipc_ptr) +{ + return -FI_ENOSYS; +} + +bool ze_hmem_p2p_enabled(void) +{ + return false; +} + +int ze_hmem_get_base_addr(const void *ptr, void **base) +{ + return -FI_ENOSYS; +} + +int *ze_hmem_get_dev_fds(int *nfds) +{ + *nfds = 0; + return NULL; +} + +#endif /* HAVE_LIBZE */ diff --git a/src/indexer.c b/src/indexer.c index d094a0e1a09..0abd7fb5eea 100644 --- a/src/indexer.c +++ b/src/indexer.c @@ -36,7 +36,7 @@ #include #include #include - +#include #include /* @@ -113,6 +113,31 @@ void *ofi_idx_remove(struct indexer *idx, int index) return item; } +void *ofi_idx_remove_ordered(struct indexer *idx, int index) +{ + struct ofi_idx_entry *entry; + void *item; + int temp_index; + int entry_index = ofi_idx_entry_index(index); + + entry = idx->array[ofi_idx_array_index(index)]; + item = entry[entry_index].item; + entry[entry_index].item = NULL; + if (ofi_idx_free_list_empty(idx) || index < idx->free_list) { + entry[entry_index].next = idx->free_list; + idx->free_list = index; + return item; + } + temp_index = idx->free_list; + while (entry[ofi_idx_entry_index(temp_index)].next < index) { + temp_index = entry[ofi_idx_entry_index(temp_index)].next; + } + entry[entry_index].next = entry[ofi_idx_entry_index(temp_index)].next; + entry[ofi_idx_entry_index(temp_index)].next = index; + + return item; +} + void ofi_idx_replace(struct indexer *idx, int index, void *item) { struct ofi_idx_entry *entry; @@ -179,16 +204,29 @@ void *ofi_idm_clear(struct index_map *idm, int index) return item; } -void ofi_idm_reset(struct index_map *idm) +void ofi_idm_reset(struct index_map *idm, void (*callback)(void *item)) { - int i; + void **entry; + void *item; + int a, i; + + for (a = 0; a < OFI_IDX_ARRAY_SIZE; a++) { + if (!idm->array[a]) { + assert(idm->count[a] == 0); + continue; + } - for (i=0; iarray[i]) { - free(idm->array[i]); - idm->array[i] = NULL; - idm->count[i] = 0; + for (i = 0; idm->count[a] && i < OFI_IDX_ARRAY_SIZE; i++) { + entry = idm->array[a]; + item = entry[i]; + if (item) { + if (callback) + callback(item); + idm->count[a]--; + } } + free(idm->array[a]); + idm->array[a] = NULL; } } diff --git a/src/iov.c b/src/iov.c index b40c2196e23..cc6b674ffdf 100644 --- a/src/iov.c +++ b/src/iov.c @@ -68,7 +68,8 @@ uint64_t ofi_copy_iov_buf(const struct iovec *iov, size_t iov_count, uint64_t io return done; } -void ofi_consume_iov(struct iovec *iov, size_t *iov_count, size_t consumed) +void ofi_consume_iov_desc(struct iovec *iov, void **desc, + size_t *iov_count, size_t to_consume) { size_t i; @@ -76,28 +77,57 @@ void ofi_consume_iov(struct iovec *iov, size_t *iov_count, size_t consumed) goto out; for (i = 0; i < *iov_count; i++) { - if (consumed < iov[i].iov_len) + if (to_consume < iov[i].iov_len) break; - consumed -= iov[i].iov_len; + to_consume -= iov[i].iov_len; } memmove(iov, &iov[i], sizeof(*iov) * (*iov_count - i)); + if (desc) + memmove(desc, 
&desc[i], + sizeof(*desc) * (*iov_count - i)); *iov_count -= i; out: - iov[0].iov_base = (uint8_t *)iov[0].iov_base + consumed; - iov[0].iov_len -= consumed; + iov[0].iov_base = (uint8_t *)iov[0].iov_base + to_consume; + iov[0].iov_len -= to_consume; } -int ofi_truncate_iov(struct iovec *iov, size_t *iov_count, size_t trim_size) +void ofi_consume_iov(struct iovec *iov, size_t *iov_count, size_t to_consume) +{ + ofi_consume_iov_desc(iov, NULL, iov_count, to_consume); +} + +void ofi_consume_rma_iov(struct fi_rma_iov *rma_iov, size_t *rma_iov_count, + size_t to_consume) +{ + size_t i; + + if (*rma_iov_count == 1) + goto out; + + for (i = 0; i < *rma_iov_count; i++) { + if (to_consume < rma_iov[i].len) + break; + to_consume -= rma_iov[i].len; + } + memmove(rma_iov, &rma_iov[i], + sizeof(*rma_iov) * (*rma_iov_count - i)); + *rma_iov_count -= i; +out: + rma_iov[0].addr += to_consume; + rma_iov[0].len -= to_consume; +} + +int ofi_truncate_iov(struct iovec *iov, size_t *iov_count, size_t new_size) { size_t i; for (i = 0; i < *iov_count; i++) { - if (trim_size <= iov[i].iov_len) { - iov[i].iov_len = trim_size; + if (new_size <= iov[i].iov_len) { + iov[i].iov_len = new_size; *iov_count = i + 1; return FI_SUCCESS; } - trim_size -= iov[i].iov_len; + new_size -= iov[i].iov_len; } return -FI_ETRUNC; } diff --git a/src/mem.c b/src/mem.c index 91836a79c68..6cd5f48332b 100644 --- a/src/mem.c +++ b/src/mem.c @@ -1,5 +1,5 @@ /* - * Copyright 2014-2018, Intel Corporation + * Copyright 2014-2019, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -84,7 +84,7 @@ void ofi_mem_init(void) num_page_sizes = 1; } - while (n--) { + while (n-- > 0) { if (sscanf(pglist[n]->d_name, "hugepages-%zukB", &hpsize) == 1) { hpsize *= 1024; if (hpsize != page_sizes[OFI_DEF_HUGEPAGE_SIZE]) @@ -104,6 +104,24 @@ void ofi_mem_fini(void) free(page_sizes); } +size_t ofi_get_mem_size(void) +{ + long page_cnt, page_size; + size_t mem_size; + + page_cnt = ofi_sysconf(_SC_PHYS_PAGES); + page_size = ofi_get_page_size(); + + if (page_cnt <= 0 || page_size <= 0) + return 0; + + mem_size = (size_t) page_cnt * (size_t) page_size; + if (mem_size < page_cnt || mem_size < page_size) + return 0; + + return mem_size; +} + uint64_t OFI_RMA_PMEM; void (*ofi_pmem_commit)(const void *addr, size_t len); diff --git a/src/osx/osd.c b/src/osx/osd.c new file mode 100644 index 00000000000..bcf671e9715 --- /dev/null +++ b/src/osx/osd.c @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2020 by Argonne National Laboratory. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
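[Editor's note: a worked example of the new ofi_consume_rma_iov() helper, since the partial-completion arithmetic is easy to misread. The values are made up, and the sketch assumes the matching declaration lands in the internal ofi_iov.h alongside the existing iov helpers.]

	#include <assert.h>
	#include <rdma/fi_rma.h>	/* struct fi_rma_iov */
	#include "ofi_iov.h"		/* ofi_consume_rma_iov() */

	static void example_consume_rma(void)
	{
		struct fi_rma_iov rma[2] = {
			{ .addr = 0x1000, .len = 64, .key = 1 },
			{ .addr = 0x2000, .len = 64, .key = 2 },
		};
		size_t cnt = 2;

		/* 80 bytes completed: entry 0 is dropped entirely and the
		 * remaining 16 bytes are trimmed from the front of entry 1. */
		ofi_consume_rma_iov(rma, &cnt, 80);
		assert(cnt == 1 && rma[0].addr == 0x2010 && rma[0].len == 48);
	}
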
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ofi.h" +#include "ofi_osd.h" + +static ssize_t +ofi_sendv_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt, int flags) +{ + ssize_t size = 0; + int ret, i; + + if (iov_cnt == 1) { + return ofi_send_socket(fd, iovec[0].iov_base, + iovec[0].iov_len, flags); + } + + for (i = 0; i < iov_cnt; i++) { + ret = ofi_send_socket(fd, iovec[i].iov_base, + iovec[i].iov_len, flags); + if (ret >= 0) { + size += ret; + if (ret != iovec[i].iov_len) + return size; + } else { + return size ? size : ret; + } + } + return size; +} + +static ssize_t +ofi_recvv_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt, int flags) +{ + ssize_t size = 0; + int ret, i; + + if (iov_cnt == 1) { + return ofi_recv_socket(fd, iovec[0].iov_base, + iovec[0].iov_len, flags); + } + + for (i = 0; i < iov_cnt; i++) { + ret = ofi_recv_socket(fd, iovec[i].iov_base, + iovec[i].iov_len, flags); + if (ret >= 0) { + size += ret; + if (ret != iovec[i].iov_len) + return size; + } else { + return size ? size : ret; + } + } + return size; +} + +ssize_t ofi_writev_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt) +{ + return ofi_sendv_socket(fd, iovec, iov_cnt, 0); +} + +ssize_t ofi_readv_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt) +{ + return ofi_recvv_socket(fd, iovec, iov_cnt, 0); +} + +ssize_t ofi_sendmsg_tcp(SOCKET fd, const struct msghdr *msg, int flags) +{ + return ofi_sendv_socket(fd, msg->msg_iov, msg->msg_iovlen, flags); +} + +ssize_t ofi_recvmsg_tcp(SOCKET fd, struct msghdr *msg, int flags) +{ + return ofi_recvv_socket(fd, msg->msg_iov, msg->msg_iovlen, flags); +} diff --git a/src/shared/ofi_str.c b/src/shared/ofi_str.c index 80218f9de95..f40559d5e63 100644 --- a/src/shared/ofi_str.c +++ b/src/shared/ofi_str.c @@ -62,6 +62,35 @@ static inline char* strsep(char **stringp, const char *delim) return ptr; } + +char *strcasestr(const char *haystack, const char *needle) +{ + char *uneedle, *uhaystack, *pos = NULL; + int i; + + uneedle = malloc(strlen(needle) + 1); + uhaystack = malloc(strlen(haystack) + 1); + if (!uneedle || !uhaystack) + goto out; + + for (i = 0; i < strlen(needle); i++) + uneedle[i] = toupper(needle[i]); + uneedle[i] = '\0'; + + for (i = 0; i < strlen(haystack); i++) + uhaystack[i] = toupper(haystack[i]); + uhaystack[i] = '\0'; + + pos = strstr(uhaystack, uneedle); + if (pos) + pos = (char *) ((uintptr_t) haystack + (uintptr_t) pos - + (uintptr_t) uhaystack); +out: + free(uneedle); + free(uhaystack); + return pos; +} + #endif /* String utility functions */ diff --git a/src/tree.c b/src/tree.c index 77e29378ba9..7419bd744d9 100644 --- a/src/tree.c +++ b/src/tree.c @@ -47,9 +47,28 @@ #include #include +#include #include +static struct ofi_rbnode *ofi_rbnode_alloc(struct ofi_rbmap *map) +{ + struct ofi_rbnode *node; + + if (!map->free_list) + return malloc(sizeof(*node)); + + node = map->free_list; + map->free_list = node->right; + return node; +} + +static void ofi_rbnode_free(struct ofi_rbmap *map, struct ofi_rbnode *node) +{ + node->right = map->free_list ? 
map->free_list : NULL; + map->free_list = node; +} + void ofi_rbmap_init(struct ofi_rbmap *map, int (*compare)(struct ofi_rbmap *map, void *key, void *data)) { @@ -86,7 +105,14 @@ static void ofi_delete_tree(struct ofi_rbmap *map, struct ofi_rbnode *node) void ofi_rbmap_cleanup(struct ofi_rbmap *map) { + struct ofi_rbnode *node; + ofi_delete_tree(map, map->root); + while (map->free_list) { + node = map->free_list; + map->free_list = node->right; + free(node); + } } void ofi_rbmap_destroy(struct ofi_rbmap *map) @@ -203,14 +229,17 @@ int ofi_rbmap_insert(struct ofi_rbmap *map, void *key, void *data, while (current != &map->sentinel) { ret = map->compare(map, key, current->data); - if (ret == 0) + if (ret == 0) { + if (ret_node) + *ret_node = current; return -FI_EALREADY; + } parent = current; current = (ret < 0) ? current->left : current->right; } - node = malloc(sizeof(*node)); + node = ofi_rbnode_alloc(map); if (!node) return -FI_ENOMEM; @@ -293,22 +322,43 @@ static void ofi_delete_rebalance(struct ofi_rbmap *map, struct ofi_rbnode *node) node->color = BLACK; } +static void ofi_rbmap_replace_node_ptr(struct ofi_rbmap *map, + struct ofi_rbnode *old_node, struct ofi_rbnode *new_node) +{ + if (new_node == old_node) + return; + + *new_node = *old_node; + + if (!old_node->parent) + map->root = new_node; + else if (old_node == old_node->parent->left) + old_node->parent->left = new_node; + else + old_node->parent->right = new_node; + + if (old_node->left != &map->sentinel) + old_node->left->parent = new_node; + if (old_node->right != &map->sentinel) + old_node->right->parent = new_node; +} + void ofi_rbmap_delete(struct ofi_rbmap *map, struct ofi_rbnode *node) { struct ofi_rbnode *x, *y; - if (node->left == &map->sentinel || node->right == &map->sentinel) { + if (node->left == &map->sentinel) { + y = node; + x = y->right; + } else if (node->right == &map->sentinel) { y = node; + x = y->left; } else { y = node->right; while (y->left != &map->sentinel) y = y->left; - } - - if (y->left != &map->sentinel) - x = y->left; - else x = y->right; + } x->parent = y->parent; if (y->parent) { @@ -326,7 +376,16 @@ void ofi_rbmap_delete(struct ofi_rbmap *map, struct ofi_rbnode *node) if (y->color == BLACK) ofi_delete_rebalance(map, x); - free (y); + /* swap y in for node, so we can free node */ + ofi_rbmap_replace_node_ptr(map, node, y); + ofi_rbnode_free(map, node); +} + +struct ofi_rbnode *ofi_rbmap_get_root(struct ofi_rbmap *map) +{ + if (ofi_rbmap_empty(map)) + return NULL; + return map->root; } struct ofi_rbnode *ofi_rbmap_find(struct ofi_rbmap *map, void *key) @@ -345,6 +404,18 @@ struct ofi_rbnode *ofi_rbmap_find(struct ofi_rbmap *map, void *key) return NULL; } +int ofi_rbmap_find_delete(struct ofi_rbmap *map, void *key) +{ + struct ofi_rbnode *node; + + node = ofi_rbmap_find(map, key); + if (!node) + return -FI_ENODATA; + + ofi_rbmap_delete(map, node); + return 0; +} + struct ofi_rbnode *ofi_rbmap_search(struct ofi_rbmap *map, void *key, int (*compare)(struct ofi_rbmap *map, void *key, void *data)) { diff --git a/src/unix/osd.c b/src/unix/osd.c index 34446e96644..ac11573a623 100644 --- a/src/unix/osd.c +++ b/src/unix/osd.c @@ -62,6 +62,20 @@ typedef cpuset_t ofi_cpu_set_t; typedef cpu_set_t ofi_cpu_set_t; #endif +#if !HAVE_CLOCK_GETTIME +int clock_gettime(clockid_t clk_id, struct timespec *tp) { + int retval; + struct timeval tv; + + retval = gettimeofday(&tv, NULL); + + tp->tv_sec = tv.tv_sec; + tp->tv_nsec = tv.tv_usec * 1000; + + return retval; +} +#endif /* !HAVE_CLOCK_GETTIME */ + int 
fi_fd_nonblock(int fd) { long flags = 0; @@ -77,6 +91,21 @@ int fi_fd_nonblock(int fd) return 0; } +int fi_fd_block(int fd) +{ + long flags = 0; + + flags = fcntl(fd, F_GETFL); + if (flags < 0) { + return -errno; + } + + if(fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) + return -errno; + + return 0; +} + int fi_wait_cond(pthread_cond_t *cond, pthread_mutex_t *mut, int timeout_ms) { uint64_t t; @@ -85,7 +114,7 @@ int fi_wait_cond(pthread_cond_t *cond, pthread_mutex_t *mut, int timeout_ms) if (timeout_ms < 0) return pthread_cond_wait(cond, mut); - t = fi_gettime_ms() + timeout_ms; + t = ofi_gettime_ms() + timeout_ms; ts.tv_sec = t / 1000; ts.tv_nsec = (t % 1000) * 1000000; return pthread_cond_timedwait(cond, mut, &ts); @@ -98,16 +127,17 @@ int ofi_shm_map(struct util_shm *shm, const char *name, size_t size, int i, ret = FI_SUCCESS; int flags = O_RDWR | (readonly ? 0 : O_CREAT); struct stat mapstat; + int fname_size = 0; *mapped = MAP_FAILED; memset(shm, 0, sizeof(*shm)); - fname = calloc(1, strlen(name) + 2); /* '/' + %s + trailing 0 */ + fname_size = strlen(name) + 2; /* '/' + %s + trailing 0 */ + fname = calloc(1, fname_size); if (!fname) return -FI_ENOMEM; - strcpy(fname, "/"); - strcat(fname, name); + snprintf(fname, fname_size, "/%s", name); shm->name = fname; for (i = 0; i < strlen(fname); i++) { diff --git a/src/var.c b/src/var.c index ee43ff988e7..6103db1afb9 100644 --- a/src/var.c +++ b/src/var.c @@ -228,7 +228,7 @@ int DEFAULT_SYMVER_PRE(fi_param_define)(const struct fi_provider *provider, dlist_insert_tail(&v->entry, ¶m_list); - FI_INFO(provider, FI_LOG_CORE, "registered var %s\n", param_name); + FI_DBG(provider, FI_LOG_CORE, "registered var %s\n", param_name); return FI_SUCCESS; } DEFAULT_SYMVER(fi_param_define_, fi_param_define, FABRIC_1.0); diff --git a/src/windows/osd.c b/src/windows/osd.c index a8d4efbef21..7c0005d4ecc 100644 --- a/src/windows/osd.c +++ b/src/windows/osd.c @@ -465,7 +465,7 @@ int getifaddrs(struct ifaddrs **ifap) &fa->in_netmasks; netmask4->sin_family = pSockAddr->sa_family; addr4->sin_family = pSockAddr->sa_family; - netmask4->sin_addr.S_un.S_addr = mask; + netmask4->sin_addr.S_un.S_addr = *mask; pInAddr = (struct sockaddr_in *) pSockAddr; addr4->sin_addr = pInAddr->sin_addr; } else { @@ -509,11 +509,14 @@ ofi_sendv_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt, int flags ssize_t size = 0; int ret, i; - if (iov_cnt == 1) - return send(fd, iovec[0].iov_base, iovec[0].iov_len, flags); + if (iov_cnt == 1) { + return ofi_send_socket(fd, iovec[0].iov_base, + iovec[0].iov_len, flags); + } for (i = 0; i < iov_cnt; i++) { - ret = send(fd, iovec[i].iov_base, iovec[i].iov_len, flags); + ret = ofi_send_socket(fd, iovec[i].iov_base, + iovec[i].iov_len, flags); if (ret >= 0) { size += ret; if (ret != iovec[i].iov_len) @@ -531,11 +534,14 @@ ofi_recvv_socket(SOCKET fd, const struct iovec *iovec, size_t iov_cnt, int flags ssize_t size = 0; int ret, i; - if (iov_cnt == 1) - return recv(fd, iovec[0].iov_base, iovec[0].iov_len, flags); + if (iov_cnt == 1) { + return ofi_recv_socket(fd, iovec[0].iov_base, + iovec[0].iov_len, flags); + } for (i = 0; i < iov_cnt; i++) { - ret = recv(fd, iovec[i].iov_base, iovec[i].iov_len, flags); + ret = ofi_recv_socket(fd, iovec[i].iov_base, + iovec[i].iov_len, flags); if (ret >= 0) { size += ret; if (ret != iovec[i].iov_len) diff --git a/strerror.vcxproj b/strerror.vcxproj index ed0f356ca2d..506535c64b6 100644 --- a/strerror.vcxproj +++ b/strerror.vcxproj @@ -13,6 +13,10 @@ Debug-v140 x64 + + Debug-v142 + x64 + Release-ICC x64 @@ 
-25,6 +29,10 @@ Release-v140 x64 + + Release-v142 + x64 + {C835FB00-8E80-4D4A-9791-4B7D6D37168A} @@ -45,6 +53,12 @@ v141 Unicode + + Application + true + v142 + Unicode + Application true @@ -65,6 +79,13 @@ true Unicode + + Application + false + v142 + true + Unicode + Application false @@ -83,6 +104,9 @@ + + + @@ -92,6 +116,9 @@ + + + @@ -106,6 +133,11 @@ $(Platform)\$(Configuration)\strerror\ fi_$(ProjectName) + + true + $(Platform)\$(Configuration)\strerror\ + fi_$(ProjectName) + true $(Platform)\$(Configuration)\strerror\ @@ -121,6 +153,11 @@ $(Platform)\$(Configuration)\strerror\ fi_$(ProjectName) + + false + $(Platform)\$(Configuration)\strerror\ + fi_$(ProjectName) + false $(Platform)\$(Configuration)\strerror\ @@ -158,6 +195,22 @@ Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + + + + Level3 + Disabled + WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;%(PreprocessorDefinitions) + $(SoludionDir)util\windows\getopt;$(SolutionDir)include;$(SolutionDir)include\windows;%(AdditionalIncludeDirectories) + MultiThreadedDebug + + + Console + true + Synchronization.lib;Ws2_32.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + + @@ -212,6 +265,25 @@ true + + + Level3 + + + MaxSpeed + true + true + WIN32;_WINSOCKAPI_=;_CRT_SECURE_NO_WARNINGS;_WINDOWS;_USRDLL;LIBFABRIC_EXPORTS;HAVE_CONFIG_H;%(PreprocessorDefinitions) + $(SoludionDir)util\windows\getopt;$(SolutionDir)include;$(SolutionDir)include\windows;%(AdditionalIncludeDirectories) + MultiThreaded + + + Console + true + true + true + + Level3 @@ -235,9 +307,11 @@ true true + true true true true + true true diff --git a/util/info.c b/util/info.c index 8703392fa06..511e115436f 100644 --- a/util/info.c +++ b/util/info.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 Intel Corporation. All rights reserved. + * Copyright (c) 2013-2020 Intel Corporation. All rights reserved. * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. 
* * This software is available to you under the BSD license below: @@ -45,11 +45,13 @@ static char *node, *port; static int ver = 0; static int list_providers = 0; static int verbose = 0, env = 0; +static char *envstr; /* options and matching help strings need to be kept in sync */ static const struct option longopts[] = { {"help", no_argument, NULL, 'h'}, + {"src_addr", required_argument, NULL, 's'}, {"node", required_argument, NULL, 'n'}, {"port", required_argument, NULL, 'P'}, {"caps", required_argument, NULL, 'c'}, @@ -60,6 +62,7 @@ static const struct option longopts[] = { {"addr_format", required_argument, NULL, 'a'}, {"provider", required_argument, NULL, 'p'}, {"env", no_argument, NULL, 'e'}, + {"getenv", required_argument, NULL, 'g'}, {"list", no_argument, NULL, 'l'}, {"verbose", no_argument, NULL, 'v'}, {"version", no_argument, &ver, 1}, @@ -68,7 +71,8 @@ static const struct option longopts[] = { static const char *help_strings[][2] = { {"", "\t\tdisplay this help and exit"}, - {"NAME", "\t\tnode name or address"}, + {"ADDR", "\t\tsource name or address"}, + {"NAME", "\t\tdest node name or address"}, {"PNUM", "\t\tport number"}, {"CAP1|CAP2..", "\tone or more capabilities: FI_MSG|FI_RMA..."}, {"MOD1|MOD2..", "\tone or more modes, default all modes"}, @@ -78,6 +82,7 @@ static const char *help_strings[][2] = { {"FMT", "\t\tspecify accepted address format: FI_FORMAT_UNSPEC, FI_SOCKADDR..."}, {"PROV", "\t\tspecify provider explicitly"}, {"", "\t\tprint libfabric environment variables"}, + {"SUBSTR", "\t\tprint libfabric environment variables with substr"}, {"", "\t\tlist available libfabric providers"}, {"", "\t\tverbose output"}, {"", "\t\tprint version info and exit"}, @@ -116,6 +121,7 @@ static int str2cap(char *inputstr, uint64_t *value) ORCASE(FI_TAGGED); ORCASE(FI_ATOMIC); ORCASE(FI_MULTICAST); + ORCASE(FI_COLLECTIVE); ORCASE(FI_READ); ORCASE(FI_WRITE); @@ -139,6 +145,7 @@ static int str2cap(char *inputstr, uint64_t *value) ORCASE(FI_SOURCE); ORCASE(FI_NAMED_RX_CTX); ORCASE(FI_DIRECTED_RECV); + ORCASE(FI_HMEM); fprintf(stderr, "error: Unrecognized capability: %s\n", inputstr); @@ -230,38 +237,18 @@ static const char *param_type(enum fi_param_type type) } } -static char * get_var_prefix(const char *prov_name) -{ - int i; - char *prefix; - - if (!prov_name) { - return NULL; - } else { - if (asprintf(&prefix, "FI_%s", prov_name) < 0) - return NULL; - for (i = 0; i < strlen(prefix); ++i) - prefix[i] = toupper((unsigned char) prefix[i]); - } - - return prefix; -} - static int print_vars(void) { int ret, count, i; struct fi_param *params; char delim; - char *var_prefix; ret = fi_getparams(¶ms, &count); if (ret) return ret; - var_prefix = get_var_prefix(hints->fabric_attr->prov_name); - for (i = 0; i < count; ++i) { - if (var_prefix && strncmp(params[i].name, var_prefix, strlen(var_prefix))) + if (envstr && !strcasestr(params[i].name, envstr)) continue; printf("# %s: %s\n", params[i].name, param_type(params[i].type)); @@ -276,7 +263,6 @@ static int print_vars(void) printf("\n"); } - free(var_prefix); fi_freeparams(params); return ret; } @@ -320,15 +306,13 @@ static int print_long_info(struct fi_info *info) return EXIT_SUCCESS; } -static int run(struct fi_info *hints, char *node, char *port) +static int run(struct fi_info *hints, char *node, char *port, uint64_t flags) { struct fi_info *info; int ret; - uint64_t flags; - flags = list_providers ? 
FI_PROV_ATTR_ONLY : 0; ret = fi_getinfo(FI_VERSION(FI_MAJOR_VERSION, FI_MINOR_VERSION), - node, port, flags, hints, &info); + node, port, flags, hints, &info); if (ret) { fprintf(stderr, "fi_getinfo: %d\n", ret); return ret; @@ -349,6 +333,7 @@ static int run(struct fi_info *hints, char *node, char *port) int main(int argc, char **argv) { + uint64_t flags = 0; int op, ret, option_index; int use_hints = 0; @@ -360,7 +345,7 @@ int main(int argc, char **argv) hints->domain_attr->mode = ~0; hints->domain_attr->mr_mode = ~(FI_MR_BASIC | FI_MR_SCALABLE); - while ((op = getopt_long(argc, argv, "n:P:c:m:t:a:p:d:f:elhv", longopts, + while ((op = getopt_long(argc, argv, "s:n:P:c:m:t:a:p:d:f:eg:lhv", longopts, &option_index)) != -1) { switch (op) { case 0: @@ -373,6 +358,10 @@ int main(int argc, char **argv) return EXIT_SUCCESS; } goto print_help; + case 's': + node = optarg; + flags |= FI_SOURCE; + break; case 'n': node = optarg; break; @@ -421,11 +410,15 @@ int main(int argc, char **argv) hints->fabric_attr->name = strdup(optarg); use_hints = 1; break; + case 'g': + envstr = optarg; + /* fall through */ case 'e': env = 1; break; case 'l': list_providers = 1; + flags |= FI_PROV_ATTR_ONLY; break; case 'v': verbose = 1; @@ -439,7 +432,7 @@ int main(int argc, char **argv) } } - ret = run(use_hints ? hints : NULL, node, port); + ret = run(use_hints ? hints : NULL, node, port, flags); out: fi_freeinfo(hints); diff --git a/util/pingpong.c b/util/pingpong.c index b62dd5058ee..f554189e636 100644 --- a/util/pingpong.c +++ b/util/pingpong.c @@ -98,6 +98,7 @@ struct pp_opts { #define PP_MAX_CTRL_MSG 64 #define PP_CTRL_BUF_LEN 64 #define PP_MR_KEY 0xC0DE +#define PP_MAX_ADDRLEN 1024 #define INTEG_SEED 7 #define PP_ENABLE_ALL (~0) @@ -525,6 +526,15 @@ static int pp_ctrl_recv(struct ct_pingpong *ct, char *buf, size_t size) return ret; } +static int pp_ctrl_recv_str(struct ct_pingpong *ct, char *buf, size_t size) +{ + int ret; + + ret = pp_ctrl_recv(ct, buf, size); + buf[size - 1] = '\0'; + return ret; +} + static int pp_send_name(struct ct_pingpong *ct, struct fid *endpoint) { size_t addrlen = 0; @@ -590,6 +600,8 @@ static int pp_recv_name(struct ct_pingpong *ct) return ret; len = ntohl(len); + if (len > PP_MAX_ADDRLEN) + return -EINVAL; ct->rem_name = calloc(1, len); if (!ct->rem_name) { @@ -654,12 +666,11 @@ static int pp_ctrl_sync(struct ct_pingpong *ct) } PP_DEBUG("CLIENT: syncing now\n"); - ret = pp_ctrl_recv(ct, ct->ctrl_buf, sizeof(PP_MSG_SYNC_A)); + ret = pp_ctrl_recv_str(ct, ct->ctrl_buf, sizeof(PP_MSG_SYNC_A)); PP_DEBUG("CLIENT: after recv / ret=%d\n", ret); if (ret < 0) return ret; if (strcmp(ct->ctrl_buf, PP_MSG_SYNC_A)) { - ct->ctrl_buf[PP_CTRL_BUF_LEN] = '\0'; PP_DEBUG("CLIENT: sync error while acking A: <%s> " "(len=%zu)\n", ct->ctrl_buf, strlen(ct->ctrl_buf)); @@ -668,12 +679,11 @@ static int pp_ctrl_sync(struct ct_pingpong *ct) PP_DEBUG("CLIENT: synced\n"); } else { PP_DEBUG("SERVER: syncing\n"); - ret = pp_ctrl_recv(ct, ct->ctrl_buf, sizeof(PP_MSG_SYNC_Q)); + ret = pp_ctrl_recv_str(ct, ct->ctrl_buf, sizeof(PP_MSG_SYNC_Q)); PP_DEBUG("SERVER: after recv / ret=%d\n", ret); if (ret < 0) return ret; if (strcmp(ct->ctrl_buf, PP_MSG_SYNC_Q)) { - ct->ctrl_buf[PP_CTRL_BUF_LEN] = '\0'; PP_DEBUG("SERVER: sync error while acking Q: <%s> " "(len=%zu)\n", ct->ctrl_buf, strlen(ct->ctrl_buf)); @@ -724,8 +734,8 @@ static int pp_ctrl_txrx_msg_count(struct ct_pingpong *ct) } PP_DEBUG("CLIENT: sent count\n"); - ret = - pp_ctrl_recv(ct, ct->ctrl_buf, sizeof(PP_MSG_CHECK_CNT_OK)); + ret = pp_ctrl_recv_str(ct, 
ct->ctrl_buf, + sizeof(PP_MSG_CHECK_CNT_OK)); if (ret < 0) return ret; if (ret < sizeof(PP_MSG_CHECK_CNT_OK)) { @@ -1786,13 +1796,16 @@ static int pp_init_fabric(struct ct_pingpong *ct) NULL); if (ret) return ret; - ret = pp_av_insert(ct->av, ct->local_name, 1, &(ct->local_fi_addr), 0, - NULL); + if (ct->fi->domain_attr->caps & FI_LOCAL_COMM) + ret = pp_av_insert(ct->av, ct->local_name, 1, + &(ct->local_fi_addr), 0, NULL); } else { - ret = pp_av_insert(ct->av, ct->local_name, 1, &(ct->local_fi_addr), 0, - NULL); - if (ret) - return ret; + if (ct->fi->domain_attr->caps & FI_LOCAL_COMM) { + ret = pp_av_insert(ct->av, ct->local_name, 1, + &(ct->local_fi_addr), 0, NULL); + if (ret) + return ret; + } ret = pp_av_insert(ct->av, ct->rem_name, 1, &(ct->remote_fi_addr), 0, NULL); } @@ -1826,7 +1839,7 @@ static void pp_free_res(struct ct_pingpong *ct) free(ct->rem_name); free(ct->local_name); - + if (ct->buf) { ofi_freealign(ct->buf); ct->buf = ct->rx_buf = ct->tx_buf = NULL; @@ -1859,8 +1872,7 @@ static int pp_finalize(struct ct_pingpong *ct) PP_DEBUG("Terminating test\n"); - strcpy(ct->tx_buf, fin_buf); - ((char *)ct->tx_buf)[fin_buf_size - 1] = '\0'; + snprintf(ct->tx_buf, fin_buf_size, "%s", fin_buf); iov.iov_base = ct->tx_buf; iov.iov_len = fin_buf_size + ct->tx_prefix_size;
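[Editor's note: the two pingpong fixes above, bounding the announced address length in pp_recv_name() and forcing NUL termination in pp_ctrl_recv_str(), follow the same defensive pattern. A condensed sketch of that pattern, with a hypothetical function name, assuming the pingpong.c context (pp_ctrl_recv, PP_MAX_ADDRLEN, and the usual headers).]

	/* Receive a length-prefixed string over the control socket,
	 * rejecting oversized announcements and guaranteeing that the
	 * result is NUL-terminated before it is compared or printed. */
	static int example_recv_bounded_str(struct ct_pingpong *ct,
					    char *buf, size_t cap)
	{
		uint32_t len;
		int ret;

		ret = pp_ctrl_recv(ct, (char *) &len, sizeof(len));
		if (ret < 0)
			return ret;

		len = ntohl(len);
		if (len == 0 || len >= cap || len > PP_MAX_ADDRLEN)
			return -EINVAL;

		ret = pp_ctrl_recv(ct, buf, len);
		if (ret < 0)
			return ret;

		buf[len] = '\0';	/* never trust the peer to terminate */
		return 0;
	}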