Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions src/swell/cylc_swell.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import os
import sys

from swell.deployment.platforms.platforms import SwellPlatform
from swell.deployment.platforms.platforms import SwellPlatforms
from swell.utilities.logger import Logger

# --------------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -55,10 +55,10 @@ def execute_cylc(argv=sys.argv) -> None:

logger = Logger('SwellCylcEntryPoint')

platform = SwellPlatform.detect_platform()
platform = SwellPlatforms.detect_platform()

# Location for Discover cylc installation
if platform in [SwellPlatform.NCCS_DISCOVER_CASCADE, SwellPlatform.NCCS_DISCOVER_SLES15]:
if platform in [SwellPlatforms.NCCS_DISCOVER_CASCADE, SwellPlatforms.NCCS_DISCOVER_SLES15]:
opt = '/discover/nobackup/projects/gmao/advda/swell/dev/core/cylc/sles15_8.4.0/'
python_ver = 'python3.11'

Expand All @@ -70,6 +70,11 @@ def execute_cylc(argv=sys.argv) -> None:

subprocess.run(cylc_command, env=env)

elif platform == SwellPlatforms.AWS:
cylc_command = ['/usr/local/bin/cylc'] + sys.argv[1:]

subprocess.run(cylc_command)

# Try just calling cylc from the path
else:
logger.warning('Platform not recognized, attempting to call Cylc executable from the path.')
Expand Down
86 changes: 49 additions & 37 deletions src/swell/deployment/platforms/platforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import yaml
from enum import Enum
import subprocess
import platform as pltfrm
from typing import Self

from importlib import resources

Expand All @@ -30,22 +32,6 @@ def platform_path() -> str:
# --------------------------------------------------------------------------------------------------


def get_platforms() -> list:

# Get list of supported platforms
platforms = [dir for dir in os.listdir(platform_path())
if os.path.isdir(os.path.join(platform_path(), dir))]

# If anything in platforms contains '__' remove it from platforms list
platforms = [platform for platform in platforms if '__' not in platform]

# List all directories in directory
return platforms


# --------------------------------------------------------------------------------------------------


def login_or_compute(platform) -> str:

'''
Expand Down Expand Up @@ -87,36 +73,62 @@ def login_or_compute(platform) -> str:
# --------------------------------------------------------------------------------------------------


class SwellPlatform(Enum):
''' Store filepaths for platform defaults. '''
NCCS_DISCOVER_SLES15 = os.path.join(platform_path(), 'nccs_discover_sles15')
NCCS_DISCOVER_CASCADE = os.path.join(platform_path(), 'nccs_discover')
GENERIC = os.path.join(platform_path(), 'generic')
class SwellPlatforms(Enum):
''' Track platforms supported by Swell. '''
NCCS_DISCOVER_SLES15 = 'nccs_discover_sles15'
NCCS_DISCOVER_CASCADE = 'nccs_discover_cascade'
AWS = 'aws'

@classmethod
def detect_platform(cls):
''' Detect the current platform, or return generic (NCCS only). '''
''' Detect the current platform, or return generic. '''

# Try to get the hostname
hostname = os.environ.get('HOSTNAME')
if hostname is None or not any(key in hostname for key in ['discover', 'borg', 'warp']):
return cls.GENERIC
os_name = pltfrm.platform()

if hostname is not None:

# Check for Discover hostnames
if any(key in hostname for key in ['discover', 'borg', 'warp']):

# Try the lscpu shell command, which should be available across NCCS
try:
cpu_info = str(subprocess.run('lscpu', capture_output=True).stdout)
try:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm pretty sure Swell won't actually work with the GENERIC platform, right? If that's the case, I suggest we don't actually include it as a "supported" platform here and just let any errors in this section get thrown (which would be a noteworthy error --- we're already limiting this section to Discover hostnames).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or would this break code_tests or other unit tests running on a generic Linux machine or something? I.e., does code_tests depend on the platform? If so, it shouldn't!

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code tests don't depend on the platform, but one quirk of the click interface is that it executes all calls in the driver group regardless of whether the particular command that needs them is actually invoked. So if we raise an error when the platform is not detected, that error will be raised any time swell is called, for any purpose.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, makes sense. Can we work around that with something like this?

# Suggested shape for the `create` command: the platform option defaults to
# `None` and detection is performed inside the command body, so it only runs
# when `create` itself is invoked without `-p/--platform`.
@swell_driver.command()
@click.argument('suite', type=click.Choice(AllSuites.config_names()))
@click.option('-m', '--input_method', 'input_method', default='defaults',
              type=click.Choice(['defaults', 'cli']), help=input_method_help)
@click.option('-p', '--platform', 'platform', default=None,     # <--- NOTE: Default to `None`
              type=click.Choice(platforms.get_all()), help=platform_help)
@click.option('-o', '--override', 'override', default=None, help=override_help)
@click.option('-a', '--advanced', 'advanced', default=False, help=advanced_help)
@click.option('-s', '--slurm', 'slurm', default=None, help=slurm_help)
def create(
    suite: str,
    input_method: str,
    platform: str | None,
    override: Union[dict, str, None],
    advanced: bool,
    slurm: str | None
) -> None:
    """
    Create a new experiment

    This command creates an experiment directory based on the provided suite name and options.

    Arguments: \n
        suite (str): Name of the suite you wish to run. \n

    """

    # No explicit -p/--platform was given: fall back to auto-detection.
    # NOTE(review): detect_platform() appears to return an enum member, whereas
    # the -p choices come from get_all() (the members' string values) --
    # confirm whether `.value` is needed before passing this on.
    if platform is None:
        platform = platforms.detect_platform()

    # Create the experiment directory
    create_experiment_directory(suite, input_method, platform, override, advanced, slurm)

# Try the lscpu shell command, which should be available across NCCS
cpu_info = str(subprocess.run('lscpu', capture_output=True).stdout)

model_name = cpu_info.split('Model name:')[1].strip().split('\n')[0].strip()
model_name = cpu_info.split('Model name:')[1].strip().split('\n')[0].strip()

# Match the cpu to the expected platform
if all(key in model_name for key in ['Intel', 'Xeon']):
return cls.NCCS_DISCOVER_CASCADE
elif all(key in model_name for key in ['AMD', 'EPYC']):
return cls.NCCS_DISCOVER_SLES15
else:
return cls.GENERIC
except (FileNotFoundError, IndexError):
raise ValueError('NCCS Discover hostname detected, but failed to '
'automatically detect cpu type with "lscpu".')

# Match the cpu to the expected platform
if all(key in model_name for key in ['Intel', 'Xeon']):
return cls.NCCS_DISCOVER_CASCADE
elif all(key in model_name for key in ['AMD', 'EPYC']):
return cls.NCCS_DISCOVER_SLES15
else:
raise ValueError(f'NCCS Discover hostname detected, but CPU model '
f'{model_name} does not match any known node types')

# Check for AWS
if all(key in os_name for key in ['Linux', 'aws']):
return cls.AWS

raise ValueError(f'Unknown or unsupported platform: {os_name}.')

# --------------------------------------------------------------------------------------------------

@classmethod
def get_all(cls) -> list:
return [item.value for item in cls]

# --------------------------------------------------------------------------------------------------

@classmethod
def match_name(cls, name: str) -> Self:
# Return the enum instance based on the name
return getattr(cls, name.upper())

except (FileNotFoundError, IndexError):
return cls.GENERIC

# --------------------------------------------------------------------------------------------------
30 changes: 20 additions & 10 deletions src/swell/swell.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import click
from typing import Union, Optional, Literal

from swell.deployment.platforms.platforms import get_platforms
from swell.deployment.platforms.platforms import SwellPlatforms as platforms
from swell.deployment.create_experiment import clone_config, create_experiment_directory
from swell.deployment.launch_experiment import launch_experiment
from swell.tasks.base.task_base import task_wrapper, get_tasks
Expand Down Expand Up @@ -55,7 +55,7 @@ def swell_driver() -> None:

platform_help = 'If using defaults for input_method, this option is used to determine which ' + \
'platform to use for platform specific defaults. Options are ' + \
str(get_platforms())
str(platforms.get_all())

override_help = 'After generating the config file, parameters inside can be overridden ' + \
'using values from the override config file.'
Expand Down Expand Up @@ -90,8 +90,8 @@ def swell_driver() -> None:
@click.argument('suite', type=click.Choice(AllSuites.config_names()))
@click.option('-m', '--input_method', 'input_method', default='defaults',
type=click.Choice(['defaults', 'cli']), help=input_method_help)
@click.option('-p', '--platform', 'platform', default='nccs_discover_sles15',
type=click.Choice(get_platforms()), help=platform_help)
@click.option('-p', '--platform', 'platform', default=None,
type=click.Choice(platforms.get_all()), help=platform_help)
@click.option('-o', '--override', 'override', default=None, help=override_help)
@click.option('-a', '--advanced', 'advanced', default=False, help=advanced_help)
@click.option('-s', '--slurm', 'slurm', default=None, help=slurm_help)
Expand All @@ -112,6 +112,10 @@ def create(
suite (str): Name of the suite you wish to run. \n

"""

if platform is None:
platform = platforms.detect_platform().value

# Create the experiment directory
create_experiment_directory(suite, input_method, platform, override, advanced, slurm)

Expand Down Expand Up @@ -244,41 +248,47 @@ def test(test: str) -> None:


@swell_driver.command()
@click.option('-p', '--platform', 'platform', type=click.Choice(get_platforms()),
default="nccs_discover_sles15", help=platform_help)
@click.option('-p', '--platform', 'platform', type=click.Choice(platforms.get_all()),
default=None, help=platform_help)
@click.argument('suite', type=click.Choice(("hofx", "3dvar", "ufo_testing")))
def t1test(
suite: Literal["hofx", "3dvar", "ufo_testing"],
platform: Optional[str] = "nccs_discover_sles15"
platform: Optional[str]
) -> None:
"""
Run a particular swell suite from the tier 1 tests.

Arguments:
suite (str): Name of the suite to run (e.g., hofx, 3dvar, ufo_testing)
"""
if platform is None:
platform = platforms.detect_platform().value

run_suite(suite, platform, TestSuite.TIER1)


# --------------------------------------------------------------------------------------------------


@swell_driver.command()
@click.option('-p', '--platform', 'platform', type=click.Choice(get_platforms()),
default="nccs_discover_sles15", help=platform_help)
@click.option('-p', '--platform', 'platform', type=click.Choice(platforms.get_all()),
default=None, help=platform_help)
@click.argument('suite', type=click.Choice(("hofx", "3dvar", "ufo_testing",
"convert_ncdiags", "3dfgat_atmos", "build_jedi")))
def t2test(
suite: Literal["hofx", "3dvar", "ufo_testing",
"convert_ncdiags", "3dfgat_atmos", "build_jedi"],
platform: Optional[str] = "nccs_discover_sles15"
platform: Optional[str]
) -> None:
"""
Run a particular swell suite from the tier 2 tests.

Arguments:
suite (str): Name of the suite to run (e.g., hofx, 3dvar, ufo_testing)
"""
if platform is None:
platform = platforms.detect_platform().value

run_suite(suite, platform, TestSuite.TIER2)


Expand Down