2 changes: 1 addition & 1 deletion config/clusters/nasa-ghg/staging.values.yaml
@@ -23,7 +23,7 @@ basehub:
 custom:
   homepage:
     gitRepoBranch: staging
-    gitRepoUrl: "https://github.com/US-GHG-Center/ghgc-hub-homepage"
+    gitRepoUrl: https://github.com/US-GHG-Center/ghgc-hub-homepage
 hub:
   config:
     GitHubOAuthenticator:
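The only change to this file drops the quotes around gitRepoUrl. Both spellings load to the same string; a quick illustrative check with ruamel.yaml (already imported by the deployer script below):

```python
# Illustration only: quoted and unquoted YAML scalars load identically here
import io
from ruamel.yaml import YAML

yaml = YAML()
quoted = yaml.load(io.StringIO('gitRepoUrl: "https://github.com/US-GHG-Center/ghgc-hub-homepage"\n'))
plain = yaml.load(io.StringIO("gitRepoUrl: https://github.com/US-GHG-Center/ghgc-hub-homepage\n"))
assert quoted["gitRepoUrl"] == plain["gitRepoUrl"]
```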
90 changes: 55 additions & 35 deletions deployer/commands/generate/resource_allocation/generate_choices.py
@@ -1,8 +1,8 @@
 import json
-import math
 import sys
 from enum import Enum
 from pathlib import Path
+from typing import List

 import typer
 from ruamel.yaml import YAML
@@ -48,15 +48,17 @@ def proportional_memory_strategy(
     # We operate on *available* memory, which already accounts for system components (like kubelet & systemd)
     # as well as daemonsets we run on every node. This represents the resources that are available
     # for user pods.

-    # FIXME: Add some more more wiggle room here
-    available_node_mem = nodeinfo["available"]["memory"]
-    available_node_cpu = nodeinfo["available"]["cpu"]
-
-    # Only show one digit after . for CPU, but round *down* not up so we never
-    # say they are getting more CPU than our limit is set to. We multiply & divide
-    # with a floor, as otherwise 3.75 gets rounded to 3.8, not 3.7
-    cpu_display = math.floor(available_node_cpu * 10) / 10
+    # In addition, we provide some wiggle room to account for additional daemonset requests or other
+    # issues that may pop up due to changes outside our control (like k8s upgrades). This is either
+    # 2% of the available capacity, or 2GB / 1 CPU (whichever is smaller)
+    WIGGLE_PERCENTAGE = 0.02
+    mem_overhead_wiggle = min(
+        nodeinfo["available"]["memory"] * WIGGLE_PERCENTAGE, 2 * 1024 * 1024 * 1024
+    )
+    cpu_overhead_wiggle = min(nodeinfo["available"]["cpu"] * WIGGLE_PERCENTAGE, 1)
+
+    available_node_mem = nodeinfo["available"]["memory"] - mem_overhead_wiggle
+    available_node_cpu = nodeinfo["available"]["cpu"] - cpu_overhead_wiggle

     # We always start from the top, and provide a choice that takes up the whole node.
     mem_limit = available_node_mem
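A minimal sketch of the new wiggle-room arithmetic, using made-up node capacities (the real values come from node-capacity-info.json). For nodes well under 100 GB of RAM and 50 CPUs, the 2% terms stay below the 2 GB and 1 CPU caps:

```python
# Illustration only: hypothetical capacities, not values from node-capacity-info.json
WIGGLE_PERCENTAGE = 0.02
available = {"memory": 16 * 1024**3, "cpu": 3.9}  # ~16 GiB and 3.9 CPUs available

# 2% of 16 GiB (~0.32 GiB) is below the 2 GiB cap, so the percentage term wins
mem_overhead_wiggle = min(available["memory"] * WIGGLE_PERCENTAGE, 2 * 1024**3)
# 2% of 3.9 CPUs (0.078) is below the 1 CPU cap
cpu_overhead_wiggle = min(available["cpu"] * WIGGLE_PERCENTAGE, 1)

print(mem_overhead_wiggle / 1024**3, cpu_overhead_wiggle)  # ~0.32 GiB and ~0.078 CPUs held back
```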
@@ -67,12 +69,26 @@
         # This makes sure we utilize all the memory on a node all the time.
         cpu_guarantee = (mem_limit / available_node_mem) * available_node_cpu

-        # Memory is in bytes, let's convert it to GB (with only 1 digit after .) to display
-        mem_display = f"{mem_limit / 1024 / 1024 / 1024:.1f}"
-        display_name = f"{mem_display} GB RAM, upto {cpu_display} CPUs"
+        # Memory is in bytes, let's convert it to GB or MB (with no digits after 0) to display
+        if mem_limit < 1024 * 1024 * 1024:
+            mem_display = f"{mem_limit / 1024 / 1024:.0f} MB"
+        else:
+            mem_display = f"{mem_limit / 1024 / 1024 / 1024:.0f} GB"
+
+        if cpu_guarantee < 2:
+            cpu_guarantee_display = f"~{cpu_guarantee:0.1f}"
+        else:
+            cpu_guarantee_display = f"~{cpu_guarantee:0.0f}"
+
+        display_name = f"~{mem_display} RAM, {cpu_guarantee_display} CPUs"
+        if cpu_guarantee != available_node_cpu:
+            description = f"Up to ~{available_node_cpu:.0f} CPUs when available"
+        else:
+            description = f"~{available_node_cpu:.0f} CPUs always available"

         choice = {
             "display_name": display_name,
+            "description": description,
             "kubespawner_override": {
                 # Guarantee and Limit are the same - this strategy has no oversubscription
                 "mem_guarantee": int(mem_limit),
@@ -91,26 +107,24 @@
         # Use the amount of RAM made available as a slug, to allow combining choices from
         # multiple instance types in the same profile. This does mean you can not have
         # the same RAM allocation from multiple node selectors. But that's a feature, not a bug.
-        choices[f"mem_{mem_display.replace('.', '_')}"] = choice
+        choice_key = f"mem_{mem_display.replace('.', '_').replace(' ', '_')}".lower()
+        choices[choice_key] = choice

         # Halve the mem_limit for the next choice
         mem_limit = mem_limit / 2

     # Reverse the choices so the smallest one is first
     choices = dict(reversed(choices.items()))

     # Make the smallest choice the default explicitly
     choices[list(choices.keys())[0]]["default"] = True

     return choices


 @resource_allocation_app.command()
 def choices(
-    instance_type: str = typer.Argument(
-        ..., help="Instance type to generate Resource Allocation options for"
+    instance_specification: List[str] = typer.Argument(
+        ...,
+        help="Instance type and number of choices to generate Resource Allocation options for. Specify as instance_type:count.",
     ),
-    num_allocations: int = typer.Option(5, help="Number of choices to generate"),
     strategy: ResourceAllocationStrategies = typer.Option(
         ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY,
         help="Strategy to use for generating resource allocation choices",
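To make the reworked display and keying logic in the two hunks above concrete, here is a small self-contained sketch (node numbers and display strings are hypothetical, not taken from node-capacity-info.json). It shows the MB/GB switch at 1 GiB, the CPU guarantee losing its decimal place at 2 CPUs, and the slug-style choice key derived from the memory display string:

```python
def render(mem_limit, available_node_mem, available_node_cpu):
    # Proportional CPU guarantee, as in the strategy above
    cpu_guarantee = (mem_limit / available_node_mem) * available_node_cpu

    # Memory shown in MB below 1 GiB, otherwise in GB, with no decimals
    if mem_limit < 1024**3:
        mem_display = f"{mem_limit / 1024**2:.0f} MB"
    else:
        mem_display = f"{mem_limit / 1024**3:.0f} GB"

    # One decimal place for small guarantees, none from 2 CPUs upwards
    if cpu_guarantee < 2:
        cpu_guarantee_display = f"~{cpu_guarantee:0.1f}"
    else:
        cpu_guarantee_display = f"~{cpu_guarantee:0.0f}"

    display_name = f"~{mem_display} RAM, {cpu_guarantee_display} CPUs"
    # Slug used as the choice key, so choices from several instance types can share a profile
    choice_key = f"mem_{mem_display.replace('.', '_').replace(' ', '_')}".lower()
    return display_name, choice_key

# Hypothetical node: ~60 GiB available memory, ~15.5 available CPUs
print(render(60 * 1024**3, 60 * 1024**3, 15.5))   # ('~60 GB RAM, ~16 CPUs', 'mem_60_gb')
print(render(512 * 1024**2, 60 * 1024**3, 15.5))  # ('~512 MB RAM, ~0.1 CPUs', 'mem_512_mb')
```

Two instance types that render to the same memory display string would collide on the same key, which the comment in the diff treats as a feature rather than a bug.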
@@ -122,19 +136,25 @@
     """
     with open(HERE / "node-capacity-info.json") as f:
         nodeinfo = json.load(f)

-    if instance_type not in nodeinfo:
-        print(
-            f"Capacity information about {instance_type} not available", file=sys.stderr
-        )
-        print("TODO: Provide information on how to update it", file=sys.stderr)
-        sys.exit(1)
-
-    # Call appropriate function based on what strategy we want to use
-    if strategy == ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY:
-        choices = proportional_memory_strategy(
-            instance_type, nodeinfo[instance_type], num_allocations
-        )
-    else:
-        raise ValueError(f"Strategy {strategy} is not currently supported")
+    choices = {}
+    for instance_spec in instance_specification:
+        instance_type, num_allocations = instance_spec.split(":", 2)
+
+        if instance_type not in nodeinfo:
+            print(
+                f"Capacity information about {instance_type} not available",
+                file=sys.stderr,
+            )
+            print("TODO: Provide information on how to update it", file=sys.stderr)
+            sys.exit(1)
+
+        # Call appropriate function based on what strategy we want to use
+        if strategy == ResourceAllocationStrategies.PROPORTIONAL_MEMORY_STRATEGY:
+            choices.update(
+                proportional_memory_strategy(
+                    instance_type, nodeinfo[instance_type], int(num_allocations)
+                )
+            )
+        else:
+            raise ValueError(f"Strategy {strategy} is not currently supported")
     yaml.dump(choices, sys.stdout)
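Taken together, the command now accepts one or more instance_type:count arguments and merges the generated choices into a single mapping before dumping it as YAML. A hedged sketch of the resulting behaviour (the invocation path, instance type name, count, and capacity below are invented for illustration):

```python
# Hypothetical invocation (exact CLI path and instance names are illustrative only):
#   deployer generate resource-allocation choices r5.xlarge:4
#
# For each spec, proportional_memory_strategy halves mem_limit once per choice,
# then reverses the dict so the smallest choice comes first and is marked default.
available_gib = 28  # made-up "available" memory for the node
num_allocations = 4
limits = [available_gib / 2**i for i in range(num_allocations)]
print(list(reversed(limits)))  # [3.5, 7.0, 14.0, 28.0] -- the 3.5 GB choice would be the default
```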