Description
I have set this up on a 4-GPU machine using the Docker image. Interactive generation starts and I get the context prompt, but when I feed it an example it crashes with the traceback below. Can you please help me understand why it is failing?
Context prompt >>> def return1():\n """Returns 1."""\n

Traceback (most recent call last):
  File "generate.py", line 74, in <module>
    main()
  File "generate.py", line 59, in main
    generate_samples_interactive(
  File "/gpt-neox/megatron/text_generation_utils.py", line 779, in generate_samples_interactive
    generated_text = neox_args.tokenizer.detokenize(generated_tokens)
  File "/gpt-neox/megatron/tokenizer/tokenizer.py", line 162, in detokenize
    return self.tokenizer.decode(token_ids)
  File "/gpt-neox/megatron/tokenizer/gpt2_tokenization.py", line 279, in decode
    text = ''.join([self.decoder[token] for token in tokens])
  File "/gpt-neox/megatron/tokenizer/gpt2_tokenization.py", line 279, in <listcomp>
    text = ''.join([self.decoder[token] for token in tokens])
KeyError: 50269

[the same traceback with KeyError: 50269 is printed by the other ranks]

Killing subprocess 118
Killing subprocess 119
Killing subprocess 120
Killing subprocess 121

Traceback (most recent call last):
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.8/dist-packages/deepspeed/launcher/launch.py", line 179, in <module>
    main()
  File "/usr/local/lib/python3.8/dist-packages/deepspeed/launcher/launch.py", line 169, in main
    sigkill_handler(signal.SIGTERM, None)  # not coming back
  File "/usr/local/lib/python3.8/dist-packages/deepspeed/launcher/launch.py", line 147, in sigkill_handler
    raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
subprocess.CalledProcessError: Command '['/usr/bin/python', '-u', 'generate.py', '--local_rank=3', '--deepspeed_config', '{"train_batch_size": 128, "train_micro_batch_size_per_gpu": 8, "gradient_accumulation_steps": 4, "optimizer": {"type": "adam", "params": {"lr": 0.00016, "betas": [0.9, 0.999], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "initial_scale_power": 16, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1}, "gradient_clipping": 1.0, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000,
"overlap_comm": true, "reduce_scatter":true, "reduce_bucket_size": 500000000, "contiguous_gradients": true, "cpu_offload": false}, "wall_clock_breakdown": true, "zero_allow_untested_optimizer": true}', '--megatron_config', '{"train_batch_size": 128, "train_micro_batch_size_per_gpu": 8, "gradient_accumulation_steps": 4, "optimizer": {"type": "adam", "params": {"lr": 0.00016, "betas": [0.9, 0.999], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "initial_scale_power": 16, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1}, "gradient_clipping": 1.0, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, "contiguous_gradients": true, "cpu_offload": false}, "wall_clock_breakdown": true, "zero_allow_untested_optimizer": true, "precision": "fp16", "num_layers": 32, "hidden_size": 2560, "num_attention_heads": 32, "seq_length": 2048, "max_position_embeddings": 2048, "pos_emb": "rotary", "no_weight_tying": true, "attention_config": ["global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global"], "sparsity_config": {}, "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": true, "lr_decay_style": "cosine", "lr_decay_iters": 160000, "zero_stage": 1, "zero_reduce_scatter": true, "zero_contiguous_gradients": true, "zero_reduce_bucket_size": 500000000, "zero_allgather_bucket_size": 500000000, "lr": 0.00016, "data_path": "data/code/code_text_document", "data_impl": "mmap", "save": "checkpoints", "config_files": {"text_generation.yml": "# Parameters used for text generation\\n# Make sure loadis specified somewhere else\\n{\\n # Text gen type:input-file, unconditionalorinteractive\\n \\"text-gen-type\\": \\"interactive\\",\\n \\n # Params for all\\n \\"maximum_tokens\\": 256,\\n \\"temperature\\": 0.5,\\n \\"top_p\\": 0.0,\\n \\"top_k\\": 0,\\n \\"recompute\\": false,\\n \\n # unconditional: samples\\n \\"num-samples\\": 10,\\n\\n # input/output file\\n \\"sample-input-file\\": \\"sample_input.txt\\",\\n \\"sample-output-file\\": \\"sample_output.txt\\",\\n}", "local_setup.yml": "# Suggested data paths when using GPT-NeoX locally\\n{\\n \\"data-path\\": \\"data/code/code_text_document\\",\\n \\n # or for weighted datasets: \\n # \\"train-data-paths\\": [\\"data/enron/enron_text_document\\", \\"data/enron/enron_text_document\\"],\\n # \\"test-data-paths\\": [\\"data/enron/enron_text_document\\", \\"data/enron/enron_text_document\\"],\\n # \\"valid-data-paths\\": [\\"data/enron/enron_text_document\\", \\"data/enron/enron_text_document\\"],\\n # \\"train-data-weights\\": [1., 2.],\\n # \\"test-data-weights\\": [2., 1.],\\n # \\"valid-data-weights\\": [0.5, 0.4],\\n\\n # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 
\\n # WARNING: setting this to True will override any user provided weights\\n # \\"weight_by_num_documents\\": false,\\n # \\"weighted_sampler_alpha\\": 0.3,\\n\\n \\"vocab-file\\": \\"data/code-vocab.json\\",\\n \\"merge-file\\": \\"data/code-merges.txt\\",\\n\\n \\"save\\": \\"checkpoints\\",\\n \\"load\\": \\"checkpoints\\",\\n \\"checkpoint_validation_with_forward_pass\\": False,\\n \\n \\"tensorboard-dir\\": \\"tensorboard\\",\\n \\"log-dir\\": \\"logs\\",\\n \\"use_wandb\\": True,\\n \\"wandb_host\\": \\"https://api.wandb.ai\\",\\n \\"wandb_project\\": \\"neox\\"\\n}", "2-7B.yml": "# GPT-2 pretraining setup\\n{\\n # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages\\n # across the node boundaries )\\n \\"pipe-parallel-size\\": 1,\\n \\"model-parallel-size\\": 1,\\n\\n # model settings\\n \\"num-layers\\": 32,\\n \\"hidden-size\\": 2560,\\n \\"num-attention-heads\\": 32,\\n \\"seq-length\\": 2048,\\n \\"max-position-embeddings\\": 2048,\\n \\"norm\\": \\"layernorm\\",\\n \\"pos-emb\\": \\"rotary\\",\\n \\"no-weight-tying\\": true,\\n\\n # these should provide some speedup but takes awhile to build, set to true if desired\\n \\"scaled-upper-triang-masked-softmax-fusion\\": true,\\n \\"bias-gelu-fusion\\": true,\\n\\n # optimizer settings\\n \\"zero_allow_untested_optimizer\\": true,\\n \\"optimizer\\": {\\n \\"type\\": \\"adam\\",\\n \\"params\\": {\\n \\"lr\\": 0.00016,\\n \\"betas\\": [0.9, 0.999],\\n \\"eps\\": 1.0e-8,\\n }\\n },\\n \\"zero_optimization\\": {\\n \\"stage\\": 1,\\n \\"allgather_partitions\\": True,\\n \\"allgather_bucket_size\\": 500000000,\\n \\"overlap_comm\\": True,\\n \\"reduce_scatter\\": True,\\n \\"reduce_bucket_size\\": 500000000,\\n \\"contiguous_gradients\\": True,\\n \\"cpu_offload\\": False\\n },\\n\\n # batch / data settings\\n \\"train_micro_batch_size_per_gpu\\": 8,\\n \\"gradient_accumulation_steps\\": 4,\\n \\"data-impl\\": \\"mmap\\",\\n \\"split\\": \\"989,10,1\\",\\n\\n # activation checkpointing\\n \\"checkpoint-activations\\": true,\\n \\"checkpoint-num-layers\\": 1,\\n \\"partition-activations\\": true,\\n \\"synchronize-each-layer\\": true,\\n\\n # regularization\\n \\"gradient_clipping\\": 1.0,\\n \\"weight-decay\\": 0,\\n \\"hidden-dropout\\": 0,\\n \\"attention-dropout\\": 0,\\n\\n # precision settings\\n \\"fp16\\": { \\n \\"fp16\\": true,\\n \\"enabled\\": true,\\n \\"loss_scale\\": 0,\\n \\"initial_scale_power\\": 16,\\n \\"loss_scale_window\\": 1000,\\n \\"hysteresis\\": 2,\\n \\"min_loss_scale\\": 1\\n },\\n\\n # misc. 
training settings\\n \\"train-iters\\": 160000,\\n \\"lr-decay-iters\\": 160000,\\n \\"distributed-backend\\": \\"nccl\\",\\n \\"lr-decay-style\\": \\"cosine\\",\\n \\"warmup\\": 0.01,\\n \\"save-interval\\": 1000,\\n \\"eval-interval\\": 1000,\\n \\"eval-iters\\": 10,\\n\\n # logging\\n \\"log-interval\\": 100,\\n \\"steps_per_print\\": 10,\\n \\"keep-last-n-checkpoints\\": 1,\\n \\"wall_clock_breakdown\\": true,\\n}\\n"}, "load": "checkpoints", "save_interval": 1000, "batch_size": 8, "train_iters": 160000, "eval_iters": 10, "keep_last_n_checkpoints": 1, "split": "989,10,1", "vocab_file": "data/code-vocab.json", "merge_file": "data/code-merges.txt", "attention_dropout": 0, "hidden_dropout": 0, "weight_decay": 0, "checkpoint_activations": true, "synchronize_each_layer": true, "partition_activations": true, "gas": 4, "clip_grad": 1.0, "dynamic_loss_scale": true, "pipe_parallel_size": 1, "is_pipe_parallel": true, "use_wandb": true, "wandb_group": "9j43ZWqmpkAaRAbvTjSFUt_2hatssyt", "log_dir": "logs", "tensorboard_dir": "tensorboard", "log_interval": 100, "text_gen_type": "interactive", "temperature": 0.5, "maximum_tokens": 256, "sample_input_file": "sample_input.txt", "sample_output_file": "sample_output.txt", "num_samples": 10, "user_script": "generate.py", "global_num_gpus": 4}']' returned non-zero exit status 1. mchorse@f4a108abb6e6:/gpt-neox$
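
For what it's worth, the failing line in gpt2_tokenization.py builds the output string by looking up every generated id in self.decoder, so the KeyError means the sampled id 50269 has no entry in the id-to-token table built from the vocab file. The standard GPT-2 BPE vocab only covers ids 0 to 50256, so 50269 would only decode if data/code-vocab.json (or added special tokens, or Megatron-style padded embedding rows, if that is what is happening here) actually covers it. Below is a small diagnostic sketch of my own (not part of gpt-neox) that checks whether the vocab file this run loads contains that id:

# Diagnostic sketch (mine, not part of gpt-neox): check whether the id that
# raised the KeyError is covered by the vocab file this run loads.
# Assumes data/code-vocab.json is a GPT-2 style {token_string: id} mapping,
# i.e. the file referenced by "vocab-file" in local_setup.yml above.
import json

FAILING_ID = 50269  # the id from the KeyError in the traceback

with open("data/code-vocab.json", encoding="utf-8") as f:
    vocab = json.load(f)  # token string -> id

# id -> token, the same direction of lookup that decode() performs
decoder = {idx: tok for tok, idx in vocab.items()}

print("vocab entries:   ", len(vocab))
print("max id in vocab: ", max(decoder))
print(f"id {FAILING_ID} present:", FAILING_ID in decoder)

If the id turns out to be missing, that would suggest a mismatch between the vocab/merges files used at generation time and the ones the checkpoint was trained with (or sampling from padded embedding rows beyond the tokenizer vocab), rather than a problem with the 4-GPU/Docker setup itself.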