
Commit baae6c0

Add CloudFormation templates for FSx and EC2. Move Dockerfiles up one level. Better tqdm logging, disabled on SageMaker. Move argparse lists to separate file. (#15)
* Optimizer checkpointing.
* TqdmHandler for logger.
* Move docker/ out of albert/. Move argparse to its own function at the top. Disable tqdm if on SageMaker. Add --log_frequency argument.
* Abstract argparse into an arguments.py file.
* Move SM parameters into arguments.py.
* Add CloudFormation templates.
* Add CloudFormation templates, for real.
* Mild name fixes.
1 parent f1d1bae commit baae6c0
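The commit message mentions a `TqdmHandler` for the logger, but the handler itself is not part of the diffs shown below. As a rough sketch of that pattern (the class name comes from the commit message; the body is an assumption, not this commit's code), such a handler routes log records through `tqdm.write` so they print above an active progress bar instead of breaking it:

```python
import logging

from tqdm import tqdm


class TqdmHandler(logging.Handler):
    """A logging handler that plays nicely with tqdm progress bars."""

    def emit(self, record: logging.LogRecord) -> None:
        try:
            # tqdm.write() prints above any active bar rather than through it.
            tqdm.write(self.format(record))
        except Exception:
            self.handleError(record)
```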

File tree

15 files changed: +497 −381 lines changed


models/nlp/albert/README.md

Lines changed: 2 additions & 2 deletions
````diff
@@ -15,9 +15,9 @@ Language models help AWS customers to improve search results, text classificatio
 
 1. Create an FSx volume.
 
-2. Download the datasets onto FSx. You will need English Wikipedia and BookCorpus, and helper scripts for downloading will be forthcoming.
+2. Download the datasets onto FSx. The simplest way to start is with English Wikipedia.
 
-3. Create an Elastic Container Registry repository. Then build a Docker image from `docker/ngc_sagemaker.Dockerfile` and push it to ECR.
+3. Create an Amazon Elastic Container Registry (ECR) repository. Then build a Docker image from `docker/ngc_sagemaker.Dockerfile` and push it to ECR.
 
 ```bash
 export IMAGE=${ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/${REPO}:ngc_tf21_sagemaker
````
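Step 3 of the README assumes the ECR repository already exists. If you would rather create it programmatically, a minimal boto3 sketch (the repository name and region are illustrative assumptions, not taken from this repo) could be:

```python
import boto3

# Hypothetical repository name and region; substitute your own values.
ecr = boto3.client("ecr", region_name="us-east-1")
ecr.create_repository(repositoryName="deep-learning-models")
```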

models/nlp/albert/arguments.py

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
1+
""" Since arguments are duplicated in run_pretraining.py and sagemaker_pretraining.py, they have
2+
been abstracted into this file. It also makes the training scripts much shorter.
3+
"""
4+
5+
import argparse
6+
import os
7+
8+
9+
def populate_pretraining_parser(parser: argparse.ArgumentParser) -> None:
10+
parser.add_argument("--model_dir", help="Unused, but passed by SageMaker")
11+
parser.add_argument("--model_type", default="albert", choices=["albert", "bert"])
12+
parser.add_argument("--model_size", default="base", choices=["base", "large"])
13+
parser.add_argument("--batch_size", type=int, default=32, help="per GPU")
14+
parser.add_argument("--gradient_accumulation_steps", type=int, default=2)
15+
parser.add_argument("--max_seq_length", type=int, default=512, choices=[128, 512])
16+
parser.add_argument("--warmup_steps", type=int, default=3125)
17+
parser.add_argument("--total_steps", type=int, default=125000)
18+
parser.add_argument("--learning_rate", type=float, default=0.00176)
19+
parser.add_argument("--end_learning_rate", type=float, default=3e-5)
20+
parser.add_argument("--learning_rate_decay_power", type=float, default=1.0)
21+
parser.add_argument("--hidden_dropout_prob", type=float, default=0.0)
22+
parser.add_argument("--max_grad_norm", type=float, default=1.0)
23+
parser.add_argument("--optimizer", default="lamb", choices=["lamb", "adam"])
24+
parser.add_argument("--name", default="", help="Additional info to append to metadata")
25+
parser.add_argument("--log_frequency", type=int, default=1000)
26+
parser.add_argument(
27+
"--load_from", default="scratch", choices=["scratch", "checkpoint", "huggingface"],
28+
)
29+
parser.add_argument("--checkpoint_path", default=None)
30+
parser.add_argument(
31+
"--fsx_prefix",
32+
default="/fsx",
33+
choices=["/fsx", "/opt/ml/input/data/training"],
34+
help="Change to /opt/ml/input/data/training on SageMaker",
35+
)
36+
# SageMaker does not work with 'store_const' args, since it parses into a dictionary
37+
# We will treat any value not equal to None as True, and use --skip_xla=true
38+
parser.add_argument(
39+
"--skip_xla",
40+
choices=["true"],
41+
help="For debugging. Faster startup time, slower runtime, more GPU vRAM.",
42+
)
43+
parser.add_argument(
44+
"--eager",
45+
choices=["true"],
46+
help="For debugging. Faster launch, slower runtime, more GPU vRAM.",
47+
)
48+
parser.add_argument(
49+
"--skip_sop", choices=["true"], help="Only use MLM loss, and exclude the SOP loss.",
50+
)
51+
parser.add_argument(
52+
"--skip_mlm", choices=["true"], help="Only use SOP loss, and exclude the MLM loss.",
53+
)
54+
parser.add_argument(
55+
"--pre_layer_norm",
56+
choices=["true"],
57+
help="Place layer normalization before the attention & FFN, rather than after adding the residual connection. https://openreview.net/pdf?id=B1x8anVFPr",
58+
)
59+
parser.add_argument("--extra_squad_steps", type=str)
60+
parser.add_argument("--fast_squad", choices=["true"])
61+
parser.add_argument("--dummy_eval", choices=["true"])
62+
parser.add_argument("--seed", type=int, default=42)
63+
64+
65+
def populate_squad_parser(parser: argparse.ArgumentParser) -> None:
66+
# Model loading
67+
parser.add_argument("--model_type", default="albert", choices=["albert", "bert"])
68+
parser.add_argument("--model_size", default="base", choices=["base", "large"])
69+
parser.add_argument("--load_from", required=True)
70+
parser.add_argument("--load_step", type=int)
71+
parser.add_argument("--skip_xla", choices=["true"])
72+
parser.add_argument("--eager", choices=["true"])
73+
parser.add_argument(
74+
"--pre_layer_norm",
75+
choices=["true"],
76+
help="See https://github.com/huggingface/transformers/pull/3929",
77+
)
78+
parser.add_argument(
79+
"--fsx_prefix",
80+
default="/fsx",
81+
choices=["/fsx", "/opt/ml/input/data/training"],
82+
help="Change to /opt/ml/input/data/training on SageMaker",
83+
)
84+
# Hyperparameters from https://arxiv.org/pdf/1909.11942.pdf#page=17
85+
parser.add_argument("--batch_size", default=6, type=int)
86+
parser.add_argument("--total_steps", default=8144, type=int)
87+
parser.add_argument("--warmup_steps", default=814, type=int)
88+
parser.add_argument("--learning_rate", default=3e-5, type=float)
89+
parser.add_argument("--dataset", default="squadv2")
90+
parser.add_argument("--seed", type=int, default=42)
91+
# Logging information
92+
parser.add_argument("--name", default="default")
93+
parser.add_argument("--validate_frequency", default=1000, type=int)
94+
parser.add_argument("--checkpoint_frequency", default=500, type=int)
95+
parser.add_argument("--model_dir", help="Unused, but passed by SageMaker")
96+
97+
98+
def populate_sagemaker_parser(parser: argparse.ArgumentParser) -> None:
99+
# SageMaker parameters
100+
parser.add_argument(
101+
"--source_dir",
102+
help="For example, /Users/myusername/Desktop/deep-learning-models/models/nlp/albert",
103+
)
104+
parser.add_argument("--entry_point", default="run_pretraining.py")
105+
parser.add_argument("--role", default=os.environ["SAGEMAKER_ROLE"])
106+
parser.add_argument("--image_name", default=os.environ["SAGEMAKER_IMAGE_NAME"])
107+
parser.add_argument("--fsx_id", default=os.environ["SAGEMAKER_FSX_ID"])
108+
parser.add_argument(
109+
"--subnet_ids", help="Comma-separated string", default=os.environ["SAGEMAKER_SUBNET_IDS"]
110+
)
111+
parser.add_argument(
112+
"--security_group_ids",
113+
help="Comma-separated string",
114+
default=os.environ["SAGEMAKER_SECURITY_GROUP_IDS"],
115+
)
116+
# Instance specs
117+
parser.add_argument(
118+
"--instance_type",
119+
type=str,
120+
default="ml.p3dn.24xlarge",
121+
choices=["ml.p3dn.24xlarge", "ml.p3.16xlarge", "ml.g4dn.12xlarge"],
122+
)
123+
parser.add_argument("--instance_count", type=int, default=1)

models/nlp/albert/models.py

Lines changed: 0 additions & 4 deletions
```diff
@@ -8,10 +8,6 @@
 )
 
 
-def get_initializer(stddev):
-    return tf.keras.initializers.TruncatedNormal(stddev=stddev)
-
-
 def load_qa_from_pretrained(
     model: Optional[tf.keras.Model] = None,
     name: Optional[str] = None,
```
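The removed `get_initializer` was a one-line wrapper, so call sites presumably construct the initializer inline now (an assumption; the updated call sites are not shown in this diff):

```python
import tensorflow as tf

# Direct construction replacing the removed helper. The stddev value is
# illustrative, not taken from this commit.
initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02)
```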
