|
3 | 3 | from __future__ import annotations
|
4 | 4 |
|
5 | 5 | import gc
|
| 6 | +import json |
6 | 7 | import logging
|
7 | 8 | import os
|
8 | 9 | import traceback
|
@@ -808,11 +809,44 @@ def on_train_begin(
|
808 | 809 | artifact.add_file(temp_file.name)
|
809 | 810 | wandb.log_artifact(artifact)
|
810 | 811 | wandb.save(temp_file.name)
|
811 |
| - LOG.info( |
812 |
| - "The Axolotl config has been saved to the WandB run under files." |
813 |
| - ) |
| 812 | + LOG.info( |
| 813 | + "The Axolotl config has been saved to the WandB run under files." |
| 814 | + ) |
814 | 815 | except (FileNotFoundError, ConnectionError) as err:
|
815 | 816 | LOG.warning(f"Error while saving Axolotl config to WandB: {err}")
|
| 817 | + |
| 818 | + if args.deepspeed: |
| 819 | + try: |
| 820 | + # Sync the DeepSpeed config to the top level of the run. The temp file cannot be deleted right away because wandb only schedules it to be synced (even with policy="now"), so leave it for the OS to delete later. |
| 821 | + with NamedTemporaryFile( |
| 822 | + mode="w", |
| 823 | + delete=False, |
| 824 | + suffix=".json", |
| 825 | + prefix="deepspeed_config_", |
| 826 | + ) as temp_file: |
| 827 | + skip_upload = False |
| 828 | + if isinstance(args.deepspeed, dict): |
| 829 | + json.dump(args.deepspeed, temp_file, indent=4) |
| 830 | + elif isinstance(args.deepspeed, str) and os.path.exists( |
| 831 | + args.deepspeed |
| 832 | + ): |
| 833 | + copyfile(args.deepspeed, temp_file.name) |
| 834 | + else: |
| 835 | + skip_upload = True |
| 836 | + if not skip_upload: |
| 837 | + artifact = wandb.Artifact( |
| 838 | + f"deepspeed-config-{wandb.run.id}", |
| 839 | + type="deepspeed-config", |
| 840 | + ) |
| 841 | + artifact.add_file(temp_file.name) |
| 842 | + wandb.log_artifact(artifact) |
| 843 | + wandb.save(temp_file.name) |
| 844 | + LOG.info( |
| 845 | + "The DeepSpeed config has been saved to the WandB run under files." |
| 846 | + ) |
| 847 | + except (FileNotFoundError, ConnectionError) as err: |
| 848 | + LOG.warning(f"Error while saving DeepSpeed config to WandB: {err}") |
| 849 | + |
816 | 850 | return control
|
817 | 851 |
|
818 | 852 |
|
|
0 commit comments