@@ -199,6 +199,7 @@ declare -a block_devs
199199declare -a bluestore_db_devs
200200declare -a bluestore_wal_devs
201201declare -a secondary_block_devs
202+ declare -a cpu_table
202203secondary_block_devs_type=" SSD"
203204
204205VSTART_SEC=" client.vstart.sh"
@@ -275,6 +276,7 @@ options:
275276 --crimson-smp: number of cores to use for crimson
276277 --crimson-alien-num-threads: number of alien-tp threads
277278 --crimson-alien-num-cores: number of cores to use for alien-tp
279+ --crimson-balance-cpu: distribute the Seastar reactors uniformly across OSDs (osd) or NUMA (socket)
278280 --osds-per-host: populate crush_location as each host holds the specified number of osds if set
279281 --require-osd-and-client-version: if supplied, do set-require-min-compat-client and require-osd-release to specified value
280282 --use-crush-tunables: if supplied, set tunables to specified value
@@ -346,10 +348,43 @@ parse_secondary_devs() {
346348 done
347349}
348350
# Auxiliary function to prepare the CPU cores used to pin Seastar reactors.
#
# Fills the global array cpu_table (one element per line) from the output of
# balance_cpu.py. The output is cached in /tmp, keyed by balance strategy, and
# is only regenerated when the cache file is missing or empty.
#
# Globals:
#   cpu_table     (written) lines produced by balance_cpu.py
#   MY_CPUS       (written) CPU affinity list of this shell, from taskset
#   CEPH_DIR      (read)    used to locate src/tools/contrib/balance_cpu.py
#   CEPH_NUM_OSD  (read)    passed to balance_cpu.py as -o
# Arguments:
#   $1 - number of Seastar reactors per OSD (passed as -r)
#   $2 - balance strategy (passed as -b); the usage text lists "osd"/"socket"
# Returns:
#   0 when cpu_table holds at least one entry, 1 otherwise, so that callers
#   do not go on to index an empty table.
prep_balance_cpu() {
    local crimson_smp=$1
    local balance_strategy=$2
    local in_file_name="/tmp/numa_args_${balance_strategy}.out"
    local out_file_name="/tmp/numa_nodes.json"
    local log_file_name="/tmp/numa_bal_${balance_strategy}.log"
    local -a cmd

    # The table is computed once; later calls are no-ops.
    if [ "${#cpu_table[@]}" -ne 0 ]; then
        return 0
    fi

    # Regenerate the CPU mappings when the cache is missing *or empty*: a
    # failed earlier run leaves an empty file behind that must not be reused.
    if [ ! -s "${in_file_name}" ]; then
        debug echo "lscpu --json > ${out_file_name}"
        lscpu --json > "${out_file_name}"
        # taskset prints "pid N's current affinity list: <cpus>"; keep only
        # the list of the first line, without the surrounding blanks.
        MY_CPUS=$(taskset -acp $$ \
            | awk -F: 'NR == 1 { gsub(/[[:space:]]/, "", $2); print $2 }')
        # Build the command as an array so no eval/word-splitting is needed.
        cmd=(python3 "${CEPH_DIR}/../src/tools/contrib/balance_cpu.py"
             -o "${CEPH_NUM_OSD}" -r "${crimson_smp}" -b "${balance_strategy}"
             -u "${out_file_name}" -t "${MY_CPUS}")
        debug echo "${cmd[*]} > ${in_file_name}"
        # stdout is the table itself; stderr goes to the log file so that the
        # "Check <log>" hint below points at something useful.
        if ! "${cmd[@]}" > "${in_file_name}" 2>> "${log_file_name}"; then
            debug echo "balance_cpu.py failed, see ${log_file_name}"
        fi
    fi

    if [ -r "${in_file_name}" ]; then
        readarray -t cpu_table < "${in_file_name}"
    fi

    # Check the table is not empty, bail out otherwise
    if [ "${#cpu_table[@]}" -eq 0 ]; then
        debug echo "CPU table empty, bailing out. Check ${log_file_name}"
        return 1
    fi
    debug echo "CPU table not empty with ${#cpu_table[@]} entries"
    return 0
}
382+
349383# Default values for the crimson options
350384crimson_smp=1
351385crimson_alien_num_threads=0
352386crimson_alien_num_cores=0
387+ crimson_balance_cpu=" " # "osd", "socket"
353388
354389while [ $# -ge 1 ]; do
355390case $1 in
@@ -589,6 +624,10 @@ case $1 in
589624 crimson_alien_num_cores=$2
590625 shift
591626 ;;
627+ --crimson-balance-cpu)
628+ crimson_balance_cpu=$2
629+ shift
630+ ;;
592631 --bluestore-spdk)
593632 [ -z " $2 " ] && usage_exit
594633 IFS=' ,' read -r -a bluestore_spdk_dev <<< " $2"
@@ -1156,6 +1195,14 @@ start_cephexporter() {
11561195 --addrs " $IP "
11571196}
11581197
# Pin the Seastar reactors of one OSD to its entry of the balanced CPU table.
#
# Globals:
#   cpu_table (read) CPU sets indexed by OSD id
#   CEPH_BIN  (read) directory holding the ceph binary
#   conf_fn   (read) ceph configuration file passed with -c
# Arguments:
#   $1 - OSD id, used as the index into cpu_table
# Outputs:
#   echoes the ceph command line before running it
# Returns:
#   the status of "ceph config set", or 1 when cpu_table has no entry for
#   the OSD (nothing is configured in that case).
do_balance_cpu() {
    local osd=$1
    # Keep the looked-up value local so it does not leak into the caller.
    local interval="${cpu_table[${osd}]:-}"

    # An absent entry would otherwise configure an empty core list.
    if [ -z "$interval" ]; then
        echo "do_balance_cpu: no CPU set for osd.$osd in cpu_table" >&2
        return 1
    fi

    echo "$CEPH_BIN/ceph -c $conf_fn config set osd.$osd crimson_seastar_cpu_cores $interval"
    "$CEPH_BIN/ceph" -c "$conf_fn" config set "osd.$osd" crimson_seastar_cpu_cores "$interval"
}
1205+
11591206start_osd () {
11601207 if [ $inc_osd_num -gt 0 ]; then
11611208 old_maxosd=$( $CEPH_BIN /ceph osd getmaxosd | sed -e ' s/max_osd = //' -e ' s/ in epoch.*//' )
@@ -1167,15 +1214,24 @@ start_osd() {
11671214 end=$(( $CEPH_NUM_OSD - 1 ))
11681215 fi
11691216 local osds_wait
1217+ # If the type of OSD is Crimson and the option to balance the Seastar reactors is true
1218+ if [ " $ceph_osd " == " crimson-osd" ] && [ ! -z " $crimson_balance_cpu " ]; then
1219+ debug echo " Preparing balance CPU for Crimson"
1220+ prep_balance_cpu $crimson_smp $crimson_balance_cpu
1221+ fi
11701222 for osd in ` seq $start $end `
11711223 do
11721224 if [ " $ceph_osd " == " crimson-osd" ]; then
1173- bottom_cpu=$(( osd * crimson_smp ))
1174- top_cpu=$(( bottom_cpu + crimson_smp - 1 ))
1175- # set exclusive CPU nodes for each osd
1176- echo " $CEPH_BIN /ceph -c $conf_fn config set osd.$osd crimson_seastar_cpu_cores $bottom_cpu -$top_cpu "
1177- $CEPH_BIN /ceph -c $conf_fn config set " osd.$osd " crimson_seastar_cpu_cores " $bottom_cpu -$top_cpu "
1178- fi
1225+ if [ ! -z " $crimson_balance_cpu " ]; then
1226+ do_balance_cpu $osd
1227+ else
1228+ bottom_cpu=$(( osd * crimson_smp ))
1229+ top_cpu=$(( bottom_cpu + crimson_smp - 1 ))
1230+ # set exclusive CPU nodes for each osd
1231+ echo " $CEPH_BIN /ceph -c $conf_fn config set osd.$osd crimson_seastar_cpu_cores $bottom_cpu -$top_cpu "
1232+ $CEPH_BIN /ceph -c $conf_fn config set " osd.$osd " crimson_seastar_cpu_cores " $bottom_cpu -$top_cpu "
1233+ fi
1234+ fi
11791235 if [ " $new " -eq 1 -o $inc_osd_num -gt 0 ]; then
11801236 wconf << EOF
11811237[osd.$osd ]
@@ -1703,21 +1759,31 @@ if [ "$ceph_osd" == "crimson-osd" ]; then
17031759 extra_seastar_args=" --trace"
17041760 fi
17051761 if [ " $objectstore " == " bluestore" ]; then
1762+ # This condition verifies the number of logical CPU cores
17061763 if [ " $( expr $( nproc) - 1) " -gt " $(( $CEPH_NUM_OSD * crimson_smp)) " ]; then
1707- if [ $crimson_alien_num_cores -gt 0 ]; then
1708- alien_bottom_cpu=$(( $CEPH_NUM_OSD * crimson_smp))
1709- alien_top_cpu=$(( alien_bottom_cpu + crimson_alien_num_cores - 1 ))
1710- # Ensure top value within range:
1711- if [ " $(( $alien_top_cpu )) " -gt " $( expr $( nproc) - 1) " ]; then
1712- alien_top_cpu=$( expr $( nproc) - 1)
1713- fi
1714- echo " crimson_alien_thread_cpu_cores: $alien_bottom_cpu -$alien_top_cpu "
1715- # This is a (logical) processor id range, it could be refined to encompass only physical processor ids
1716- # (equivalently, ignore hyperthreading sibling processor ids)
1717- $CEPH_BIN /ceph -c $conf_fn config set osd crimson_alien_thread_cpu_cores " $alien_bottom_cpu -$alien_top_cpu "
1764+ if [ ! -z " $crimson_balance_cpu " ]; then
1765+ debug echo " Preparing balance CPU for Crimson"
1766+ prep_balance_cpu $crimson_smp $crimson_balance_cpu
1767+ available_cpus=" ${cpu_table[-1]} "
1768+ echo " crimson_alien_thread_cpu_cores: '$available_cpus '"
1769+ $CEPH_BIN /ceph -c $conf_fn config set osd crimson_alien_thread_cpu_cores " $available_cpus "
17181770 else
1719- echo " crimson_alien_thread_cpu_cores:" $(( $CEPH_NUM_OSD * crimson_smp)) -" $( expr $( nproc) - 1) "
1720- $CEPH_BIN /ceph -c $conf_fn config set osd crimson_alien_thread_cpu_cores $(( $CEPH_NUM_OSD * crimson_smp)) -" $( expr $( nproc) - 1) "
1771+ if [ $crimson_alien_num_cores -gt 0 ]; then
1772+ alien_bottom_cpu=$(( $CEPH_NUM_OSD * crimson_smp))
1773+ alien_top_cpu=$(( alien_bottom_cpu + crimson_alien_num_cores - 1 ))
1774+ # Ensure top value within range:
1775+ if [ " $(( $alien_top_cpu )) " -gt " $( expr $( nproc) - 1) " ]; then
1776+ alien_top_cpu=$( expr $( nproc) - 1)
1777+ fi
1778+ echo " crimson_alien_thread_cpu_cores: $alien_bottom_cpu -$alien_top_cpu "
1779+ # This is a (logical) processor id range, it could be refined to encompass only physical processor ids
1780+ # (equivalently, ignore hyperthreading sibling processor ids)
1781+ $CEPH_BIN /ceph -c $conf_fn config set osd crimson_alien_thread_cpu_cores " $alien_bottom_cpu -$alien_top_cpu "
1782+ else
1783+ # This is the legacy default case
1784+ echo " crimson_alien_thread_cpu_cores:" $(( $CEPH_NUM_OSD * crimson_smp)) -" $( expr $( nproc) - 1) "
1785+ $CEPH_BIN /ceph -c $conf_fn config set osd crimson_alien_thread_cpu_cores $(( $CEPH_NUM_OSD * crimson_smp)) -" $( expr $( nproc) - 1) "
1786+ fi
17211787 fi
17221788 if [ $crimson_alien_num_threads -gt 0 ]; then
17231789 echo " $CEPH_BIN /ceph -c $conf_fn config set osd crimson_alien_op_num_threads $crimson_alien_num_threads "
0 commit comments