-
Notifications
You must be signed in to change notification settings - Fork 0
Ctl switchover #6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 20 commits
a7d7466
2be30da
12e0df5
f772966
412d0a6
0804b93
ec9b6c3
78d8607
d3d1e47
b47d7b9
07b81cd
ed70e0e
2001406
6ed7554
bf5746e
d49c0ba
ab43b49
2d10c10
9f20479
482e68c
3bf317a
db359d6
ce73bff
b1f204e
94fbf0b
49f8c25
84c351d
a5a9c39
50c9785
aed5381
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -370,7 +370,7 @@ def is_citus_cluster() -> bool: | |
| __dcs_cache: Dict[Tuple[str, Optional[int]], AbstractDCS] = {} | ||
|
|
||
|
|
||
| def get_dcs(scope: str, group: Optional[int]) -> AbstractDCS: | ||
| def get_dcs(scope: str, group: Optional[int], multisite: Optional[bool] = False) -> AbstractDCS: | ||
| """Get the DCS object. | ||
|
|
||
| :param scope: cluster name. | ||
|
|
@@ -385,13 +385,20 @@ def get_dcs(scope: str, group: Optional[int]) -> AbstractDCS: | |
| """ | ||
| if (scope, group) in __dcs_cache: | ||
| return __dcs_cache[(scope, group)] | ||
|
|
||
| config = _get_configuration() | ||
| config.update({'scope': scope, 'patronictl': True}) | ||
| if group is not None: | ||
| config['citus'] = {'group': group, 'database': 'postgres'} | ||
| config.setdefault('name', scope) | ||
|
|
||
| try: | ||
| dcs = _get_dcs(config) | ||
| # TODO: might be necessary for site switchover candidates collection | ||
| # if multisite: | ||
| # _, dcs = MultisiteController.get_dcs_config(config) | ||
| # else: | ||
| # dcs = _get_dcs(config) | ||
| if is_citus_cluster() and group is None: | ||
| dcs.is_mpp_coordinator = lambda: True | ||
| click.get_current_context().obj['__mpp'] = dcs.mpp | ||
|
|
@@ -1321,6 +1328,11 @@ def _do_failover_or_switchover(action: str, cluster_name: str, group: Optional[i | |
|
|
||
| if candidate is None and not force: | ||
| candidate = click.prompt('Candidate ' + str(candidate_names), type=str, default='') | ||
| # TODO: set candidate names reliably where there's only one | ||
| # if len(candidate_names) == 1: | ||
| # candidate = click.prompt('Candidate ', type=str, default=candidate_names[0]) | ||
| # else: | ||
| # candidate = click.prompt('Candidate ' + str(candidate_names), type=str, default='') | ||
|
|
||
| if action == 'failover' and not candidate: | ||
| raise PatroniCtlException('Failover could be performed only to a specific candidate') | ||
|
|
@@ -1404,6 +1416,151 @@ def _do_failover_or_switchover(action: str, cluster_name: str, group: Optional[i | |
| output_members(cluster, cluster_name, group=group) | ||
|
|
||
|
|
||
| def _do_site_switchover(cluster_name: str, group: Optional[int], | ||
| switchover_leader: Optional[str], candidate: Optional[str], | ||
| force: bool, scheduled: Optional[str] = None) -> None: | ||
| """Perform a site switchover operation in the cluster. | ||
|
|
||
| Informational messages are printed in the console during the operation, as well as the list of members before and | ||
| after the operation, so the user can follow the operation status. | ||
|
|
||
| .. note:: | ||
| If not able to perform the operation through the REST API, write directly to the DCS as a fall back. | ||
|
|
||
| :param cluster_name: name of the Patroni cluster. | ||
| :param group: filter Citus group within we should perform a failover or switchover. If ``None``, user will be | ||
| prompted for filling it -- unless *force* is ``True``, in which case an exception is raised. | ||
| :param switchover_leader: name of the leader site passed as switchover option. | ||
| :param candidate: name of a standby site to be promoted. | ||
| :param force: perform the switchover without asking for confirmations. | ||
| :param scheduled: timestamp when the switchover should be scheduled to occur. If ``now`` perform immediately. | ||
|
|
||
| :raises: | ||
| :class:`PatroniCtlException`: if: | ||
| * Patroni is running on a Citus cluster, but no *group* was specified; or | ||
| * a switchover was requested but the cluster has no leader site (or multisite is not active); or | ||
| * *switchover_leader* does not match the current leader site of the cluster; or | ||
| * cluster has no candidate sites available for the operation; or | ||
| * current leader and *candidate* are the same; or | ||
| * *candidate* is not a site of the cluster; or | ||
| * trying to schedule a switchover in a cluster that is in maintenance mode; or | ||
| * user aborts the operation. | ||
| """ | ||
| dcs = get_dcs(cluster_name, group) | ||
| cluster = dcs.get_cluster() | ||
| click.echo('Current cluster topology') | ||
| output_members(cluster, cluster_name, group=group) | ||
|
|
||
| if is_citus_cluster() and group is None: | ||
| if force: | ||
| raise PatroniCtlException('For Citus clusters the --group must me specified') | ||
| else: | ||
| group = click.prompt('Citus group', type=int) | ||
| dcs = get_dcs(cluster_name, group) | ||
| cluster = dcs.get_cluster() | ||
|
|
||
| config = global_config.from_cluster(cluster) | ||
|
|
||
| cluster_leader = cluster.leader and cluster.leader.name | ||
|
|
||
| if not cluster_leader: | ||
| raise PatroniCtlException('This cluster has no leader') | ||
|
|
||
| if cluster.leader and cluster.leader.multisite: | ||
| leader_site = (cluster.leader.multisite.get('name') if not cluster.leader.multisite.get('standby_config') else | ||
| cluster.leader.multisite.get('standby_config', {}).get('leader_site')) | ||
| else: | ||
| raise PatroniCtlException('Multisite is not active or there is no leader site, cannot switch sites') | ||
|
|
||
| if switchover_leader is None: | ||
| if force: | ||
| switchover_leader = leader_site | ||
| else: | ||
| prompt = 'Leader site' | ||
| switchover_leader = click.prompt(prompt, type=str, default=leader_site) | ||
|
|
||
| if leader_site != switchover_leader: | ||
| raise PatroniCtlException(f'Site {switchover_leader} is not the leader of cluster {cluster_name}') | ||
|
|
||
| # multisite_dcs = get_dcs(cluster_name, group, True) | ||
| # multisite_cluster = multisite_dcs.get_cluster() | ||
|
|
||
| candidate_names = [str(m.multisite['name']) for m in cluster.members | ||
| if m.multisite and m.multisite['name'] != leader_site] | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this should not work, the other sites members are not listed in this sites DCS. Meaning candidate_names will only contain the site name that this is executed on. This is only correct when executed on standby site in a 2 node cluster. |
||
| # We sort the names for consistent output to the client | ||
| candidate_names.sort() | ||
|
|
||
| # TODO: once there is a reliable way for getting the candidate sites when on the leader site, turn this back on | ||
| # if not candidate_names: | ||
| # raise PatroniCtlException('No candidates found to switch over to') | ||
|
|
||
| if candidate is None and not force: | ||
| candidate = click.prompt('Candidate ' + str(candidate_names), type=str, default='') | ||
|
|
||
| if candidate and candidate not in candidate_names: | ||
| if candidate == cluster_leader: | ||
| raise PatroniCtlException( | ||
| f'Site {candidate} is already the leader of cluster {cluster_name}') | ||
| raise PatroniCtlException( | ||
| f'Site {candidate} does not exist in cluster {cluster_name}') | ||
|
|
||
| scheduled_at_str = None | ||
| scheduled_at = None | ||
|
|
||
| if scheduled is None and not force: | ||
| next_hour = (datetime.datetime.now() + datetime.timedelta(hours=1)).strftime('%Y-%m-%dT%H:%M') | ||
| scheduled = click.prompt('When should the switchover take place (e.g. ' + next_hour + ' ) ', | ||
| type=str, default='now') | ||
|
|
||
| scheduled_at = parse_scheduled(scheduled) | ||
| if scheduled_at: | ||
| if config.is_paused: | ||
| raise PatroniCtlException("Can't schedule switchover in the paused state") | ||
| scheduled_at_str = scheduled_at.isoformat() | ||
|
|
||
| switchover_value = {'candidate': candidate} | ||
|
|
||
| if scheduled_at_str: | ||
| switchover_value['scheduled_at'] = scheduled_at_str | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This value is ignored by the multisite code. Scheduled switchover is not yet implemented. |
||
|
|
||
| logging.debug(switchover_value) | ||
|
|
||
| # By now we have established that the leader site exists and the candidate also exists | ||
| if not force: | ||
| demote_msg = f' to another site, demoting current site {cluster_leader}' if cluster_leader else '' | ||
| if scheduled_at_str: | ||
| if not click.confirm(f'Are you sure you want to schedule switchover of cluster ' | ||
| f'{cluster_name} at {scheduled_at_str}{demote_msg}?'): | ||
| raise PatroniCtlException('Aborting scheduled switchover') | ||
| else: | ||
| if not click.confirm(f'Are you sure you want to switch over cluster {cluster_name}{demote_msg}?'): | ||
| raise PatroniCtlException('Aborting switchover') | ||
|
|
||
| r = None | ||
| try: | ||
| # We would already have thrown an exception if there was no leader | ||
| member = cluster.leader.member if cluster.leader else candidate and cluster.get_member(candidate, False) | ||
| if TYPE_CHECKING: # pragma: no cover | ||
| assert isinstance(member, Member) | ||
| r = request_patroni(member, 'post', 'site_switchover', switchover_value) | ||
|
|
||
| if r.status in (200, 202): | ||
| logging.debug(r) | ||
| cluster = dcs.get_cluster() | ||
| logging.debug(cluster) | ||
| click.echo('{0} {1}'.format(timestamp(), r.data.decode('utf-8'))) | ||
| else: | ||
| click.echo('Multisite switchover failed, details: {0}, {1}'.format(r.status, r.data.decode('utf-8'))) | ||
| return | ||
| except Exception: | ||
| logging.exception(r) | ||
| logging.warning('Failing over to DCS') | ||
| click.echo('{0} Could not perform site switchover using Patroni API, falling back to DCS'.format(timestamp())) | ||
| dcs.manual_failover(leader=switchover_leader, candidate='', target_site=candidate, scheduled_at=scheduled_at) | ||
|
|
||
| output_members(cluster, cluster_name, group=group) | ||
|
|
||
|
|
||
| @ctl.command('failover', help='Failover to a replica') | ||
| @arg_cluster_name | ||
| @option_citus_group | ||
|
|
@@ -1455,6 +1612,36 @@ def switchover(cluster_name: str, group: Optional[int], leader: Optional[str], | |
| _do_failover_or_switchover('switchover', cluster_name, group, candidate, force, leader, scheduled) | ||
|
|
||
|
|
||
| @ctl.command('site-switchover', help='Switchover to another data centre') | ||
| @arg_cluster_name | ||
| @option_citus_group | ||
| @click.option('--leader-site', '--primary-site', 'leader_site', help='The name of the current leader site', | ||
| default=None) | ||
| @click.option('--candidate-site', 'candidate_site', help='The name of the candidate', default=None) | ||
| @click.option('--scheduled', help='Timestamp of a scheduled switchover in unambiguous format (e.g. ISO 8601)', | ||
| default=None) | ||
| @option_force | ||
| def site_switchover(cluster_name: str, group: Optional[int], leader_site: Optional[str], | ||
| candidate_site: Optional[str], force: bool, scheduled: Optional[str]) -> None: | ||
| """Process ``multisite-switchover`` command of ``patronictl`` utility. | ||
|
|
||
| Perform a site switchover operation in the multisite cluster. | ||
|
|
||
| .. seealso:: | ||
| Refer to :func:`_do_site_switchover` for details. | ||
|
|
||
| :param cluster_name: name of the Patroni cluster. | ||
| :param group: filter Citus group within we should perform a switchover. If ``None``, user will be prompted for | ||
| filling it -- unless *force* is ``True``, in which case an exception is raised by | ||
| :func:`_do_failover_or_switchover`. | ||
| :param leader-site: name of the current leader site. | ||
| :param candidate-site: name of a standby site to be promoted. | ||
| :param force: perform the switchover without asking for confirmations. | ||
| :param scheduled: timestamp when the switchover should be scheduled to occur. If ``now`` perform immediately. | ||
| """ | ||
| _do_site_switchover(cluster_name, group, leader_site, candidate_site, force, scheduled) | ||
|
|
||
|
|
||
| def generate_topology(level: int, member: Dict[str, Any], | ||
| topology: Dict[Optional[str], List[Dict[str, Any]]]) -> Iterator[Dict[str, Any]]: | ||
| """Recursively yield members with their names adjusted according to their *level* in the cluster topology. | ||
|
|
@@ -1543,13 +1730,12 @@ def get_cluster_service_info(cluster: Dict[str, Any]) -> List[str]: | |
| """ | ||
| service_info: List[str] = [] | ||
|
|
||
|
|
||
|
|
||
| if 'multisite' in cluster: | ||
| info = f"Multisite {cluster['multisite']['name'] or ''} is {cluster['multisite']['status'].lower()}" | ||
| info = f"Multisite {cluster['multisite'].get('name') or ''} is {cluster['multisite']['status'].lower()}" | ||
| standby_config = cluster['multisite'].get('standby_config', {}) | ||
| if standby_config and standby_config.get('host'): | ||
| info += f", replicating from {standby_config['host']}:{standby_config.get('port', 5432)}" | ||
| info += f", replicating from {standby_config['leader_site']}" | ||
| info += f" ({standby_config['host']}:{standby_config.get('port', 5432)})" | ||
| service_info.append(info) | ||
|
|
||
| if cluster.get('pause'): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Citus will be quite unhappy if some nodes don't have a local leader. Correct fix is probably to have a single point that determines which site is active for the whole cluster. But that is a larger job. I think for now we should just have an error saying that citus and multisite together is unsupported.