diff --git a/config/clusters/nasa-ghg/staging.values.yaml b/config/clusters/nasa-ghg/staging.values.yaml
index f51624949a..9c648ab0ed 100644
--- a/config/clusters/nasa-ghg/staging.values.yaml
+++ b/config/clusters/nasa-ghg/staging.values.yaml
@@ -23,7 +23,7 @@ basehub:
     custom:
       homepage:
         gitRepoBranch: staging
-        gitRepoUrl: "https://github.com/US-GHG-Center/ghgc-hub-homepage"
+        gitRepoUrl: https://github.com/US-GHG-Center/ghgc-hub-homepage
     hub:
       config:
         GitHubOAuthenticator:
diff --git a/deployer/commands/generate/resource_allocation/generate_choices.py b/deployer/commands/generate/resource_allocation/generate_choices.py
index ddc4eee541..9fb2b480bb 100644
--- a/deployer/commands/generate/resource_allocation/generate_choices.py
+++ b/deployer/commands/generate/resource_allocation/generate_choices.py
@@ -1,8 +1,8 @@
 import json
-import math
 import sys
 from enum import Enum
 from pathlib import Path
+from typing import List
 
 import typer
 from ruamel.yaml import YAML
@@ -48,15 +48,17 @@ def proportional_memory_strategy(
     # We operate on *available* memory, which already accounts for system components (like kubelet & systemd)
     # as well as daemonsets we run on every node. This represents the resources that are available
     # for user pods.
-
-    # FIXME: Add some more more wiggle room here
-    available_node_mem = nodeinfo["available"]["memory"]
-    available_node_cpu = nodeinfo["available"]["cpu"]
-
-    # Only show one digit after . for CPU, but round *down* not up so we never
-    # say they are getting more CPU than our limit is set to. We multiply & divide
-    # with a floor, as otherwise 3.75 gets rounded to 3.8, not 3.7
-    cpu_display = math.floor(available_node_cpu * 10) / 10
+    # In addition, we provide some wiggle room to account for additional daemonset requests or other
+    # issues that may pop up due to changes outside our control (like k8s upgrades). This is either
+    # 2% of the available capacity, or 2GB / 1 CPU (whichever is smaller)
+    WIGGLE_PERCENTAGE = 0.02
+    mem_overhead_wiggle = min(
+        nodeinfo["available"]["memory"] * WIGGLE_PERCENTAGE, 2 * 1024 * 1024 * 1024
+    )
+    cpu_overhead_wiggle = min(nodeinfo["available"]["cpu"] * WIGGLE_PERCENTAGE, 1)
+
+    available_node_mem = nodeinfo["available"]["memory"] - mem_overhead_wiggle
+    available_node_cpu = nodeinfo["available"]["cpu"] - cpu_overhead_wiggle
 
     # We always start from the top, and provide a choice that takes up the whole node.
     mem_limit = available_node_mem
@@ -67,12 +69,26 @@
         # This makes sure we utilize all the memory on a node all the time.
         cpu_guarantee = (mem_limit / available_node_mem) * available_node_cpu
 
-        # Memory is in bytes, let's convert it to GB (with only 1 digit after .) to display
-        mem_display = f"{mem_limit / 1024 / 1024 / 1024:.1f}"
-        display_name = f"{mem_display} GB RAM, upto {cpu_display} CPUs"
+        # Memory is in bytes, let's convert it to GB or MB (with no digits after 0) to display
+        if mem_limit < 1024 * 1024 * 1024:
+            mem_display = f"{mem_limit / 1024 / 1024:.0f} MB"
+        else:
+            mem_display = f"{mem_limit / 1024 / 1024 / 1024:.0f} GB"
+
+        if cpu_guarantee < 2:
+            cpu_guarantee_display = f"~{cpu_guarantee:0.1f}"
+        else:
+            cpu_guarantee_display = f"~{cpu_guarantee:0.0f}"
+
+        display_name = f"~{mem_display} RAM, {cpu_guarantee_display} CPUs"
+        if cpu_guarantee != available_node_cpu:
+            description = f"Up to ~{available_node_cpu:.0f} CPUs when available"
+        else:
+            description = f"~{available_node_cpu:.0f} CPUs always available"
 
         choice = {
             "display_name": display_name,
+            "description": description,
             "kubespawner_override": {
                 # Guarantee and Limit are the same - this strategy has no oversubscription
                 "mem_guarantee": int(mem_limit),
@@ -91,7 +107,8 @@
         # Use the amount of RAM made available as a slug, to allow combining choices from
         # multiple instance types in the same profile. This does mean you can not have
         # the same RAM allocation from multiple node selectors. But that's a feature, not a bug.
-        choices[f"mem_{mem_display.replace('.', '_')}"] = choice
+        choice_key = f"mem_{mem_display.replace('.', '_').replace(' ', '_')}".lower()
+        choices[choice_key] = choice
 
         # Halve the mem_limit for the next choice
         mem_limit = mem_limit / 2
@@ -99,18 +116,15 @@
     # Reverse the choices so the smallest one is first
     choices = dict(reversed(choices.items()))
 
-    # Make the smallest choice the default explicitly
-    choices[list(choices.keys())[0]]["default"] = True
-
     return choices
 
 
 @resource_allocation_app.command()
 def choices(
-    instance_type: str = typer.Argument(
-        ..., help="Instance type to generate Resource Allocation options for"
+    instance_specification: List[str] = typer.Argument(
+        ...,
+        help="Instance type and number of choices to generate Resource Allocation options for. Specify as instance_type:count.",
     ),
-    num_allocations: int = typer.Option(5, help="Number of choices to generate"),
     strategy: ResourceAllocationStrategies = typer.Option(
         ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY,
        help="Strategy to use for generating resource allocation choices choices",
@@ -122,19 +136,25 @@ def choices(
     """
     with open(HERE / "node-capacity-info.json") as f:
         nodeinfo = json.load(f)
-
-    if instance_type not in nodeinfo:
-        print(
-            f"Capacity information about {instance_type} not available", file=sys.stderr
-        )
-        print("TODO: Provide information on how to update it", file=sys.stderr)
-        sys.exit(1)
-
-    # Call appropriate function based on what strategy we want to use
-    if strategy == ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY:
-        choices = proportional_memory_strategy(
-            instance_type, nodeinfo[instance_type], num_allocations
-        )
-    else:
-        raise ValueError(f"Strategy {strategy} is not currently supported")
+    choices = {}
+    for instance_spec in instance_specification:
+        instance_type, num_allocations = instance_spec.split(":", 1)
+
+        if instance_type not in nodeinfo:
+            print(
+                f"Capacity information about {instance_type} not available",
+                file=sys.stderr,
+            )
+            print("TODO: Provide information on how to update it", file=sys.stderr)
+            sys.exit(1)
+
+        # Call appropriate function based on what strategy we want to use
+        if strategy == ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY:
+            choices.update(
+                proportional_memory_strategy(
+                    instance_type, nodeinfo[instance_type], int(num_allocations)
+                )
+            )
+        else:
+            raise ValueError(f"Strategy {strategy} is not currently supported")
     yaml.dump(choices, sys.stdout)