Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
74 commits
Select commit Hold shift + click to select a range
a430e8c
add basic example for regridding using scipy
PetarStam Apr 10, 2025
1821bb1
rename and delete comments
PetarStam Apr 10, 2025
0585383
start a branch for transfering corrdiff from nvidia-modulus
PetarStam Apr 10, 2025
99d55df
add model and loss scripts
PetarStam Apr 11, 2025
1f404b3
add utils and change imports
PetarStam Apr 11, 2025
3b172cc
restructure file system and add pyproject.toml
PetarStam Apr 14, 2025
05e3a08
Delete src/hirad_gen.egg-info directory
PetarStam Apr 14, 2025
dbf101e
add gitignore
PetarStam Apr 14, 2025
bb6288c
Merge branch 'corr-diff' of https://github.com/MeteoSwiss/HiRAD-Gen i…
PetarStam Apr 14, 2025
5de83bc
add train script with missing parts
PetarStam Apr 16, 2025
134ed83
add abstract dataset class
PetarStam Apr 16, 2025
de4e7c5
adapt checkpoint saving and loading
PetarStam Apr 23, 2025
cff9da4
adapt checkpoint loading and saving
PetarStam Apr 25, 2025
1029d4c
fix model imports and dependency on module metadata
PetarStam Apr 25, 2025
a1daebe
add generate utils
PetarStam Apr 25, 2025
8164766
add sceleton for era5-cosmo dataset
PetarStam Apr 25, 2025
a9f7034
add in_channels to arg saving list
PetarStam Apr 25, 2025
e0059c8
add dataset era5_cosmo
PetarStam Apr 28, 2025
da4cb6c
fix imports
PetarStam Apr 28, 2025
c284d0a
add getitem to dataset
PetarStam Apr 29, 2025
e7d5b1b
add grid flip to start at top left corner
PetarStam Apr 29, 2025
d093e66
small fix
PetarStam Apr 29, 2025
7e695f0
update everything for training
PetarStam May 7, 2025
7510cf1
small fix
PetarStam May 7, 2025
75db04f
remove tracked .pyc files
PetarStam May 7, 2025
4b5ecb4
add small loggign changes
May 9, 2025
3eb4d0a
adapt sbatch script to slurm config
May 9, 2025
9aaea9d
adapt era5cosmo loader to trim_edge 19
May 9, 2025
dca7ff4
add inference
PetarStam May 12, 2025
8ba0c5a
Plot absolute error onto a projection for a given date
May 12, 2025
c6e632a
Adjust how reshaping is done before error calcs
May 12, 2025
d21ec35
plot for all channels, and against baseline
May 12, 2025
5b32566
Add MAE output
May 13, 2025
60c8aff
Fix indexing error
May 13, 2025
032492d
Merge branch 'corr-diff' of https://github.com/MeteoSwiss/HiRAD-Gen i…
May 13, 2025
5cc42e9
Try adding spectral graph
May 14, 2025
f164504
Start some CE stuff
May 14, 2025
5b77dbe
fix inference for diffusion
May 15, 2025
53e2033
Merge remote-tracking branch 'origin/corr-diff' into corr-diff
May 15, 2025
83716f4
clean up
May 15, 2025
69f10dd
Add power spectrum plots
May 15, 2025
5a4eb3f
Merge branch 'corr-diff' of https://github.com/MeteoSwiss/HiRAD-Gen i…
May 15, 2025
b4c97c5
clean up a bit
May 15, 2025
90c7e28
clean up a bit
May 15, 2025
a905602
add readme for training
May 15, 2025
d2a1200
Merge remote-tracking branch 'origin/corr-diff' into corr-diff
May 15, 2025
57e2b35
Merge branch 'corr-diff' of https://github.com/MeteoSwiss/HiRAD-Gen i…
May 16, 2025
573dc23
update readme for inference
May 16, 2025
4abed05
Merge remote-tracking branch 'origin/corr-diff' into corr-diff
May 16, 2025
f4d856b
small fix for inference on multiple time steps
May 16, 2025
dcc2a06
enable validation during training
May 21, 2025
cadccd5
change generate eval to new functions
May 22, 2025
010bf26
Add pytorch toml files
May 22, 2025
92b08b7
fix average training loss tracking
May 23, 2025
97970fc
fix validation bug
May 23, 2025
937e7c9
update to latest corrdiff version
May 26, 2025
5db5e47
small config fix
May 26, 2025
e8bd5cd
fix generation on distributed
May 27, 2025
692dfe2
delete unnecessary logging
May 27, 2025
e3bab90
delete unnecessary logging
May 27, 2025
a7c0ca0
Merge remote-tracking branch 'origin/corr-diff' into ci_cd
May 27, 2025
996f136
new image
May 27, 2025
5d4fdff
use absolute path
May 27, 2025
9a46267
cd to /src
May 27, 2025
a834730
add USE_NCCL variable to pipeline
May 27, 2025
cdb35f4
Attempting adding more variables
May 27, 2025
0601b70
add env logging
May 27, 2025
aa9b9aa
Try distributed torch
May 27, 2025
18ab0dc
distributed test
May 27, 2025
d341167
run training from ci/cd.
May 28, 2025
2f2e5f5
Split Dockerfile so we have differnet environments, to keep the CI on…
May 28, 2025
417291c
update dockerfile
Jun 5, 2025
700d91d
Adding back Dockerfile to see if that causes webhook to work
Jun 5, 2025
72f40a2
Re-delete Dockerfile, don't need it.
Jun 5, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions .edf/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
run:
```
export EDF_PATH=`pwd`/.edf
```
This sets the EDF search path to the repository's `.edf` directory (run it from the repository root).

run:
```
srun -A a-a122 --environment=ubuntu2 cat /etc/os-release
```

# local development
srun --environment $PWD/.edf/hirad-ci.toml -A a-a122 -p debug --pty bash



# list current images
podman images

# build according to the dockerfile into an image with tag tmpv1, from current directory.
podman build -f ci/docker/Dockerfile -t tmpv1 .

# start an interactive container from the tmpv1 image
podman run -it localhost/tmpv1

mkdir /capstor/scratch/cscs/mmcgloho/images

# export the image into a sqsh file so it is available outside the interactive shell
enroot import -x mount -o /capstor/scratch/cscs/mmcgloho/images/hirad-pytorch-25.01-py3.sqsh podman://localhost/tmpv1

ls /capstor/scratch/cscs/mmcgloho/images
14 changes: 14 additions & 0 deletions .edf/gemma-pytorch.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
image = "/iopsstor/scratch/cscs/${USER}/pytorch-24.01-py3-venv/pytorch-24.01-py3-venv.sqsh"

mounts = ["/capstor", "/users","/iopsstor/scratch/cscs/mmcgloho"]

writable = true

[annotations]
com.hooks.aws_ofi_nccl.enabled = "true"
com.hooks.aws_ofi_nccl.variant = "cuda12"

[env]
FI_CXI_DISABLE_HOST_REGISTER = "1"
FI_MR_CACHE_MONITOR = "userfaultfd"
NCCL_DEBUG = "INFO"
14 changes: 14 additions & 0 deletions .edf/hirad-ci.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
image = "/capstor/scratch/cscs/${USER}/images/hirad-pytorch-25.01-py3.sqsh"

mounts = ["/capstor","/iopsstor"]

writable = true

[annotations]
com.hooks.aws_ofi_nccl.enabled = "true"
com.hooks.aws_ofi_nccl.variant = "cuda12"

[env]
FI_CXI_DISABLE_HOST_REGISTER = "1"
FI_MR_CACHE_MONITOR = "userfaultfd"
NCCL_DEBUG = "INFO"
6 changes: 6 additions & 0 deletions .edf/ngc-pytorch.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch
image = "nvcr.io#nvidia/pytorch:22.01-py3"
mounts = ["/capstor/scratch/cscs/${USER}:/capstor/scratch/cscs/${USER}"]
workdir = "/capstor/scratch/cscs/${USER}"

# TODO: the mount and workdir above may need to use /iopsstor instead of /capstor; confirm.
3 changes: 3 additions & 0 deletions .edf/ubuntu.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
image = "library/ubuntu:24.04"
mounts = ["/capstor/scratch/cscs/mmcgloho:/capstor/scratch/cscs/mmcgloho"]
workdir = "/capstor/scratch/cscs/mmcgloho"
5 changes: 5 additions & 0 deletions .edf/ubuntu2.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@


image = "library/ubuntu:24.04"
mounts = ["/capstor/scratch/cscs/${USER}:/capstor/scratch/cscs/${USER}"]
workdir = "/capstor/scratch/cscs/${USER}"
188 changes: 188 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml

# ruff
.ruff_cache/

# LSP config files
pyrightconfig.json

# output files
*.out
*.torch
plots/*
*.npz

# conda
.conda/*

# temp
temp.*

# local script
interpolate.sh
core_clariden-ln002_241188

Loading