@@ -381,6 +381,62 @@ fn generate_chrony_config(
381381 args : ChronySetupArgs ,
382382 log : & Logger ,
383383) -> anyhow:: Result < ( ) > {
384+ // Rack Time Synchronisation
385+ // -------------------------
386+ //
387+ // Within an Oxide rack, every sled in the cluster runs an NTP server zone.
388+ // Two of these zones are nominated to be "Boundary NTP Servers" which
389+ // means that they have external connectivity via boundary networking
390+ // services and are configured with the NTP server address(es)/name(s)
391+ // provided during RSS. The other zones are "Internal NTP Servers", do not
392+ // have external connectivity, and synchronise with the boundary servers
393+ // across the rack's underlay network.
394+ //
395+ // Every sled initially starts up with the notion that it is late December
396+ // 1986, and there are a number of challenges in order to reach consensus
397+ // around time in the rack, particularly in situations where one or more
398+ // boundary servers lacks external connectivity, either at startup or
399+ // later. A number of strategies are employed in the configurations
400+ // below.
401+ //
402+ // - Each boundary server can authoritatively advertise time at stratum
403+ // 10 based on its local clock and will do this when there are no
404+ // "selectable" upstream servers. However, to avoid the situation
405+ // where December 1986 is advertised with authority, they will not use
406+ // this local source until the clock has been successfully
407+ // synchronised to an upstream source at least once. In the event that
408+ // a rack starts up with no external NTP connectivity everything
409+ // stops, waiting for time synchronisation to occur (that is, for the
410+ // networking issue to be resolved).
411+ //
412+ // - Each boundary server has its upstream sources configured with:
413+ // - maximum poll interval 2^5 (32 seconds). When a time source is
414+ // considered trustworthy and relatively stable over time, the rate
415+ // at which it is queried is reduced. We set a ceiling on the
416+ // polling interval so that we can still react relatively quickly to
417+ // events such as loss of external connectivity. Note that if
418+ // an update fails, the poll interval will rapidly decrease back
419+ // down towards one second.
420+ // - maximum number of retained samples is 8. This sets an upper limit
421+ // on the number of samples so that trust degrades more quickly in
422+ // the event the source is not contactable.
423+ // - The "failfast" flag causes the source to be immediately marked as
424+ // "unselectable" if it has not been contactable for several
425+ // consecutive attempts. Without this flag, the source would remain
426+ // selected and its root dispersion (and therefore its distance)
427+ // would increase fairly slowly. The source would become
428+ // unselectable after around an hour given the rest of the
429+ // configuration, which is far too slow.
430+ //
431+ // - The boundary servers include each other in their list of sources.
432+ // While they will see themselves in their source list, they will
433+ // automatically discount that to prevent a loop. Due to the "orphan"
434+ // tag on the local source mentioned earlier, when both boundary
435+ // servers fall back to their local clock source, the one with the
436+ // lowest reference ID will be preferred, protecting against a split
437+ // brain scenario when neither server has upstream connectivity and
438+ // both are advertising their local clock with authority.
439+
384440 let internal_ntp_tpl = "#
385441# Configuration file for an internal NTP server - one which communicates with
386442# boundary NTP servers within the rack.
@@ -432,18 +488,25 @@ allow @ALLOW@
432488# appears synchronised even if there are currently no active upstreams. When
433489# in this mode, we report as stratum 10 to clients. The `distance' parameter
434490# controls when we will decide to abandon the upstreams and switch to the local
435- # reference. By setting `activate`, we prevent the server from ever activating
436- # its local reference until it has synchronised with upstream at least once and
437- # the root distance has dropped below the provided threshold. This prevents
438- # a boundary server in a cold booted rack from authoritatively advertising a
439- # time from the 1980s prior to gaining external connectivity.
491+ # reference, although this is largely redundant due to the upstream sources
492+ # being flagged as 'failfast'. By setting `activate`, we prevent the server
493+ # from ever activating its local reference until it has synchronised with
494+ # upstream at least once and the root distance has dropped below the
495+ # provided threshold. This prevents a boundary server in a cold booted rack
496+ # from authoritatively advertising a time from the 1980s prior to gaining
497+ # external connectivity.
440498#
441499# distance: Distance from root above which we use the local reference, opting
442500# to ignore the upstream.
443501# activate: Distance from root below which we must fall once to ever consider
444502# the local reference.
503+ # orphan: This option enables orphan mode, where sources with the same
504+ # stratum as our local are ignored unless no other source is
505+ # selectable and their reference IDs are smaller than ours. This
506+ # protects against a split brain situation when neither boundary
507+ # server has connectivity.
445508#
446- local stratum 10 distance 0.4 activate 0.5
509+ local stratum 10 orphan distance 0.4 activate 0.5
447510
448511# makestep <threshold> <limit>
449512# We allow chrony to step the system clock during the first three time updates
@@ -454,6 +517,13 @@ makestep 0.1 3
454517leapsecmode slew
455518maxslewrate 2708.333
456519
520+ # Refresh boundary NTP servers every two minutes instead of every two weeks
521+ refresh 120
522+
523+ # When a source is unreachable, increase its dispersion by 60 microseconds/s
524+ # instead of the default of 1.
525+ maxclockerror 60
526+
457527" ;
458528
459529 let ChronySetupArgs {
@@ -481,17 +551,17 @@ maxslewrate 2708.333
481551 for s in servers {
482552 writeln ! (
483553 & mut new_config,
484- "pool {s} iburst maxdelay 0.1 minpoll 0 maxpoll 3 maxsources 16"
554+ "pool {s} iburst maxdelay 0.1 maxsources 16 \
555+ minpoll 0 maxpoll 5 maxsamples 8 failfast"
485556 )
486557 . expect ( "write to String is infallible" ) ;
487558 }
488- } else {
489- writeln ! (
490- & mut new_config,
491- "pool {boundary_pool} iburst maxdelay 0.1 maxsources 16" ,
492- )
493- . expect ( "write to String is infallible" ) ;
494559 }
560+ writeln ! (
561+ & mut new_config,
562+ "pool {boundary_pool} iburst maxdelay 0.1 maxsources 16" ,
563+ )
564+ . expect ( "write to String is infallible" ) ;
495565
496566 // We read the contents from the old configuration file if it existed
497567 // so that we can verify if it changed.
0 commit comments