Description
I have set this up on a 4-GPU machine using the Docker image. Interactive generation starts and I get the context prompt, but when I feed it an example it crashes with the traceback below. Can you please help me understand why it is failing?
Context prompt >>> def return1():\n """Returns 1."""\n

Traceback (most recent call last):
  File "generate.py", line 74, in <module>
    main()
  File "generate.py", line 59, in main
    generate_samples_interactive(
  File "/gpt-neox/megatron/text_generation_utils.py", line 779, in generate_samples_interactive
    generated_text = neox_args.tokenizer.detokenize(generated_tokens)
  File "/gpt-neox/megatron/tokenizer/tokenizer.py", line 162, in detokenize
    return self.tokenizer.decode(token_ids)
  File "/gpt-neox/megatron/tokenizer/gpt2_tokenization.py", line 279, in decode
    text = ''.join([self.decoder[token] for token in tokens])
  File "/gpt-neox/megatron/tokenizer/gpt2_tokenization.py", line 279, in <listcomp>
    text = ''.join([self.decoder[token] for token in tokens])
KeyError: 50269

[the same traceback with KeyError: 50269 is printed by the other ranks]

Killing subprocess 118
Killing subprocess 119
Killing subprocess 120
Killing subprocess 121

Traceback (most recent call last):
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.8/dist-packages/deepspeed/launcher/launch.py", line 179, in <module>
    main()
  File "/usr/local/lib/python3.8/dist-packages/deepspeed/launcher/launch.py", line 169, in main
    sigkill_handler(signal.SIGTERM, None)  # not coming back
  File "/usr/local/lib/python3.8/dist-packages/deepspeed/launcher/launch.py", line 147, in sigkill_handler
    raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
subprocess.CalledProcessError: Command '['/usr/bin/python', '-u', 'generate.py', '--local_rank=3', '--deepspeed_config', '{"train_batch_size": 128, "train_micro_batch_size_per_gpu": 8, "gradient_accumulation_steps": 4, "optimizer": {"type": "adam", "params": {"lr": 0.00016, "betas": [0.9, 0.999], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "initial_scale_power": 16, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1}, "gradient_clipping": 1.0, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000,
"overlap_comm": true, "reduce_scatter":true, "reduce_bucket_size": 500000000, "contiguous_gradients": true, "cpu_offload": false}, "wall_clock_breakdown": true, "zero_allow_untested_optimizer": true}', '--megatron_config', '{"train_batch_size": 128, "train_micro_batch_size_per_gpu": 8, "gradient_accumulation_steps": 4, "optimizer": {"type": "adam", "params": {"lr": 0.00016, "betas": [0.9, 0.999], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "initial_scale_power": 16, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1}, "gradient_clipping": 1.0, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, "contiguous_gradients": true, "cpu_offload": false}, "wall_clock_breakdown": true, "zero_allow_untested_optimizer": true, "precision": "fp16", "num_layers": 32, "hidden_size": 2560, "num_attention_heads": 32, "seq_length": 2048, "max_position_embeddings": 2048, "pos_emb": "rotary", "no_weight_tying": true, "attention_config": ["global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global"], "sparsity_config": {}, "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": true, "lr_decay_style": "cosine", "lr_decay_iters": 160000, "zero_stage": 1, "zero_reduce_scatter": true, "zero_contiguous_gradients": true, "zero_reduce_bucket_size": 500000000, "zero_allgather_bucket_size": 500000000, "lr": 0.00016, "data_path": "data/code/code_text_document", "data_impl": "mmap", "save": "checkpoints", "config_files": {"text_generation.yml": "# Parameters used for text generation\\n# Make sure loadis specified somewhere else\\n{\\n # Text gen type:input-file, unconditionalorinteractive\\n \\"text-gen-type\\": \\"interactive\\",\\n \\n # Params for all\\n \\"maximum_tokens\\": 256,\\n \\"temperature\\": 0.5,\\n \\"top_p\\": 0.0,\\n \\"top_k\\": 0,\\n \\"recompute\\": false,\\n \\n # unconditional: samples\\n \\"num-samples\\": 10,\\n\\n # input/output file\\n \\"sample-input-file\\": \\"sample_input.txt\\",\\n \\"sample-output-file\\": \\"sample_output.txt\\",\\n}", "local_setup.yml": "# Suggested data paths when using GPT-NeoX locally\\n{\\n \\"data-path\\": \\"data/code/code_text_document\\",\\n \\n # or for weighted datasets: \\n # \\"train-data-paths\\": [\\"data/enron/enron_text_document\\", \\"data/enron/enron_text_document\\"],\\n # \\"test-data-paths\\": [\\"data/enron/enron_text_document\\", \\"data/enron/enron_text_document\\"],\\n # \\"valid-data-paths\\": [\\"data/enron/enron_text_document\\", \\"data/enron/enron_text_document\\"],\\n # \\"train-data-weights\\": [1., 2.],\\n # \\"test-data-weights\\": [2., 1.],\\n # \\"valid-data-weights\\": [0.5, 0.4],\\n\\n # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 
\\n # WARNING: setting this to True will override any user provided weights\\n # \\"weight_by_num_documents\\": false,\\n # \\"weighted_sampler_alpha\\": 0.3,\\n\\n \\"vocab-file\\": \\"data/code-vocab.json\\",\\n \\"merge-file\\": \\"data/code-merges.txt\\",\\n\\n \\"save\\": \\"checkpoints\\",\\n \\"load\\": \\"checkpoints\\",\\n \\"checkpoint_validation_with_forward_pass\\": False,\\n \\n \\"tensorboard-dir\\": \\"tensorboard\\",\\n \\"log-dir\\": \\"logs\\",\\n \\"use_wandb\\": True,\\n \\"wandb_host\\": \\"https://api.wandb.ai\\",\\n \\"wandb_project\\": \\"neox\\"\\n}", "2-7B.yml": "# GPT-2 pretraining setup\\n{\\n # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages\\n # across the node boundaries )\\n \\"pipe-parallel-size\\": 1,\\n \\"model-parallel-size\\": 1,\\n\\n # model settings\\n \\"num-layers\\": 32,\\n \\"hidden-size\\": 2560,\\n \\"num-attention-heads\\": 32,\\n \\"seq-length\\": 2048,\\n \\"max-position-embeddings\\": 2048,\\n \\"norm\\": \\"layernorm\\",\\n \\"pos-emb\\": \\"rotary\\",\\n \\"no-weight-tying\\": true,\\n\\n # these should provide some speedup but takes awhile to build, set to true if desired\\n \\"scaled-upper-triang-masked-softmax-fusion\\": true,\\n \\"bias-gelu-fusion\\": true,\\n\\n # optimizer settings\\n \\"zero_allow_untested_optimizer\\": true,\\n \\"optimizer\\": {\\n \\"type\\": \\"adam\\",\\n \\"params\\": {\\n \\"lr\\": 0.00016,\\n \\"betas\\": [0.9, 0.999],\\n \\"eps\\": 1.0e-8,\\n }\\n },\\n \\"zero_optimization\\": {\\n \\"stage\\": 1,\\n \\"allgather_partitions\\": True,\\n \\"allgather_bucket_size\\": 500000000,\\n \\"overlap_comm\\": True,\\n \\"reduce_scatter\\": True,\\n \\"reduce_bucket_size\\": 500000000,\\n \\"contiguous_gradients\\": True,\\n \\"cpu_offload\\": False\\n },\\n\\n # batch / data settings\\n \\"train_micro_batch_size_per_gpu\\": 8,\\n \\"gradient_accumulation_steps\\": 4,\\n \\"data-impl\\": \\"mmap\\",\\n \\"split\\": \\"989,10,1\\",\\n\\n # activation checkpointing\\n \\"checkpoint-activations\\": true,\\n \\"checkpoint-num-layers\\": 1,\\n \\"partition-activations\\": true,\\n \\"synchronize-each-layer\\": true,\\n\\n # regularization\\n \\"gradient_clipping\\": 1.0,\\n \\"weight-decay\\": 0,\\n \\"hidden-dropout\\": 0,\\n \\"attention-dropout\\": 0,\\n\\n # precision settings\\n \\"fp16\\": { \\n \\"fp16\\": true,\\n \\"enabled\\": true,\\n \\"loss_scale\\": 0,\\n \\"initial_scale_power\\": 16,\\n \\"loss_scale_window\\": 1000,\\n \\"hysteresis\\": 2,\\n \\"min_loss_scale\\": 1\\n },\\n\\n # misc. 
training settings\\n \\"train-iters\\": 160000,\\n \\"lr-decay-iters\\": 160000,\\n \\"distributed-backend\\": \\"nccl\\",\\n \\"lr-decay-style\\": \\"cosine\\",\\n \\"warmup\\": 0.01,\\n \\"save-interval\\": 1000,\\n \\"eval-interval\\": 1000,\\n \\"eval-iters\\": 10,\\n\\n # logging\\n \\"log-interval\\": 100,\\n \\"steps_per_print\\": 10,\\n \\"keep-last-n-checkpoints\\": 1,\\n \\"wall_clock_breakdown\\": true,\\n}\\n"}, "load": "checkpoints", "save_interval": 1000, "batch_size": 8, "train_iters": 160000, "eval_iters": 10, "keep_last_n_checkpoints": 1, "split": "989,10,1", "vocab_file": "data/code-vocab.json", "merge_file": "data/code-merges.txt", "attention_dropout": 0, "hidden_dropout": 0, "weight_decay": 0, "checkpoint_activations": true, "synchronize_each_layer": true, "partition_activations": true, "gas": 4, "clip_grad": 1.0, "dynamic_loss_scale": true, "pipe_parallel_size": 1, "is_pipe_parallel": true, "use_wandb": true, "wandb_group": "9j43ZWqmpkAaRAbvTjSFUt_2hatssyt", "log_dir": "logs", "tensorboard_dir": "tensorboard", "log_interval": 100, "text_gen_type": "interactive", "temperature": 0.5, "maximum_tokens": 256, "sample_input_file": "sample_input.txt", "sample_output_file": "sample_output.txt", "num_samples": 10, "user_script": "generate.py", "global_num_gpus": 4}']' returned non-zero exit status 1. mchorse@f4a108abb6e6:/gpt-neox$
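
For what it's worth, the failing line in gpt2_tokenization.py builds the output string by looking up every generated id in self.decoder, so the KeyError means the sampled id 50269 has no entry in the id-to-token table built from the vocab file. The standard GPT-2 BPE vocab only covers ids 0 to 50256, so 50269 would only decode if data/code-vocab.json (or added special tokens, or Megatron-style padded embedding rows, if that is what is happening here) actually covers it. Below is a small diagnostic sketch of my own (not part of gpt-neox) that checks whether the vocab file this run loads contains that id:

# Diagnostic sketch (mine, not part of gpt-neox): check whether the id that
# raised the KeyError is covered by the vocab file this run loads.
# Assumes data/code-vocab.json is a GPT-2 style {token_string: id} mapping,
# i.e. the file referenced by "vocab-file" in local_setup.yml above.
import json

FAILING_ID = 50269  # the id from the KeyError in the traceback

with open("data/code-vocab.json", encoding="utf-8") as f:
    vocab = json.load(f)  # token string -> id

# id -> token, the same direction of lookup that decode() performs
decoder = {idx: tok for tok, idx in vocab.items()}

print("vocab entries:   ", len(vocab))
print("max id in vocab: ", max(decoder))
print(f"id {FAILING_ID} present:", FAILING_ID in decoder)

If the id turns out to be missing, that would suggest a mismatch between the vocab/merges files used at generation time and the ones the checkpoint was trained with (or sampling from padded embedding rows beyond the tokenizer vocab), rather than a problem with the 4-GPU/Docker setup itself.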