2 changes: 1 addition & 1 deletion README.md
@@ -33,7 +33,7 @@ Lastly, if you want to study the effect of multitask prompted training (a.k.a. i
- T-Zero ++: https://huggingface.co/bigscience/T0pp
- T-Zero Single Prompt: https://huggingface.co/bigscience/T0_single_prompt
- T-Zero Original Task Only: https://huggingface.co/bigscience/T0_original_task_only
- T-Zero 3B: https://huggingface.co/bigscience/T0_3B
- T-Zero 3B: https://huggingface.co/bigscience/T0_3B

## Citation

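For reference, loading one of the checkpoints listed above follows the standard 🤗 Transformers seq2seq pattern; a minimal sketch (the prompt is illustrative, and T0 is assumed to load as a regular `AutoModelForSeq2SeqLM`):

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# The smaller 3B checkpoint from the list above; the other model IDs work the same way.
tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B")
model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B")

inputs = tokenizer(
    "Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy",
    return_tensors="pt",
)
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```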
260 changes: 147 additions & 113 deletions evaluation/run_eval.py
@@ -51,7 +51,6 @@ def parse_args():
parser.add_argument(
"--dataset_name",
type=str,
default=None,
help="The name of the dataset to use (via the datasets library).",
required=True,
)
@@ -61,12 +60,17 @@ def parse_args():
default=None,
help="The configuration name of the dataset to use (via the datasets library).",
)
parser.add_argument(
"--template_config_name",
type=str,
default=None,
help="The name of the dataset_config_name of the template we want to use, example: use XNLI En prompts for XNLI Fr",
)
parser.add_argument(
"--template_name",
type=str,
default=None,
help="The template/prompt name",
required=True,
help="The template/prompt name. If None, we run all templates.",
)
parser.add_argument(
"--max_length",
@@ -128,115 +132,40 @@ def parse_args():
action="store_true",
help="Activate debug mode and run training only with a subset of data.",
)
parser.add_argument(
"--parallelize",
action="store_true",
help=(
"If passed, will call `model.parallelize` which splits the model on all GPUs available when applicable (model parallelism). "
"Note that this feature is still experimental in HF Transformers."
),
)
args = parser.parse_args()

return args

args = parser.parse_args()

def main():
args = parse_args()
# TODO @thomasw21 hack!
if args.dataset_config_name == "None":
args.dataset_config_name = None
if args.template_config_name == "None":
args.template_config_name = None

# Initialize the accelerator. We will let the accelerator handle device placement for us.
accelerator = Accelerator()
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
logger.info(accelerator.state)

# Set up logging: we only want one process per machine to log things on the screen.
# accelerator.is_local_main_process is only True for one process per machine.
logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
if accelerator.is_local_main_process:
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
else:
datasets.utils.logging.set_verbosity_error()
transformers.utils.logging.set_verbosity_error()
return args

def run_template(template_name, prompts, model, tokenizer, raw_datasets, accelerator: Accelerator, args):

# Handle the output directory creation
if accelerator.is_main_process:
os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()

# In distributed evaluation, the load_dataset function guarantees that only one local process can concurrently
# download the dataset.
if args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
if args.dataset_name == "anli":
raw_datasets = load_dataset(args.dataset_name, split=args.dataset_config_name)
else:
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name, split="validation")
#TODO(Victor): enable loading pre-processed dataset from https://huggingface.co/datasets/bigscience/P3

# Trim a number of evaluation examples
if args.debug:
raw_datasets = raw_datasets.select(range(min(len(raw_datasets),100)))

column_names = raw_datasets.column_names


# Load pretrained model and tokenizer
#
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
if args.config_name:
config = AutoConfig.from_pretrained(args.config_name)
elif args.model_name_or_path:
config = AutoConfig.from_pretrained(args.model_name_or_path)
else:
raise ValueError(
"Either `args.config_name` or `args.model_name_or_path` should be provided."
)

if args.tokenizer_name:
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
elif args.model_name_or_path:
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
else:
raise ValueError(
"You are instantiating a new tokenizer from scratch. This is not supported by this script."
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
result_dir = None
if args.output_dir is not None and accelerator.is_main_process:
paths = [
args.dataset_name,
args.dataset_config_name,
template_name,
]
result_dir = os.path.join(
args.output_dir,
*[path.replace(" ", "_").replace("/", "_") for path in paths if path is not None]
)
os.makedirs(result_dir, exist_ok=True)

if tokenizer.pad_token is None:
for token in [tokenizer.eos_token, tokenizer.bos_token, tokenizer.sep_token]:
if token is not None:
tokenizer.pad_token = token
if tokenizer.pad_token is None:
raise ValueError("Please define a pad token id.")

template = prompts[template_name]

model = ModelBase.from_config(
config=config,
model_name_or_path=args.model_name_or_path,
parallelize=args.parallelize
)

# Preprocessing the datasets.
# First we tokenize all the texts.
padding = "max_length" if args.pad_to_max_length else False

# Get the prompt to apply and the possible targets.
# TODO(Victor): If pulling from pre-processed data, remove this logic.
prompts = DatasetTemplates(
f"{args.dataset_name}"
if args.dataset_config_name is None
else f"{args.dataset_name}/{args.dataset_config_name}"
)
template = prompts[args.template_name]

column_names = raw_datasets.column_names
def preprocess_function(examples):
bs = len(examples[column_names[0]])

@@ -265,8 +194,9 @@ def preprocess_function(examples):
tokenized_targets = [
tokenizer(
ans_choi,
padding=True,
max_length=args.target_max_length,
# padding is on the right here.
padding=False,
max_length=args.max_length,
truncation=True,
)
for ans_choi in answer_choices_texts
@@ -319,17 +249,16 @@ def preprocess_function(examples):

eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)


# Use the device given by the `accelerator` object.
if not args.parallelize:
model.to(accelerator.device)

# Prepare everything with our `accelerator`.
eval_dataloader = accelerator.prepare(eval_dataloader)


# Metrics
metric = load_metric("accuracy")
metric = load_metric(
"accuracy",
process_id=accelerator.process_index,
num_process=accelerator.num_processes,
experiment_id=f"{args.dataset_name}_{args.dataset_config_name}_{args.template_name}"
)
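(For context, the distributed-metric call above works roughly as sketched below with the legacy `datasets.load_metric` API; a minimal single-process sketch with illustrative values, whereas in the script `process_id` and `num_process` come from the accelerator:)

```python
from datasets import load_metric

metric = load_metric("accuracy", process_id=0, num_process=1, experiment_id="demo")
# Each process adds its own shard of predictions; compute() aggregates them.
metric.add_batch(predictions=[0, 1, 1], references=[0, 1, 0])
print(metric.compute())  # -> {'accuracy': 0.666...}
```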

# Eval!
total_batch_size = args.per_device_eval_batch_size * accelerator.num_processes
@@ -359,14 +288,119 @@ def preprocess_function(examples):
results = {
"dataset_name": args.dataset_name,
"dataset_config_name": args.dataset_config_name,
"template_name": args.template_name,
"evaluation": eval_metric
"template_name": template_name,
"evaluation": eval_metric,
"arguments": str(args)
}
if accelerator.is_main_process:
if args.output_dir is not None:
with open(os.path.join(args.output_dir, "results.json"), "w") as f:
json.dump(results, f, indent=4)
if result_dir is not None:
with open(os.path.join(result_dir, "results.json"), "w") as f:
json.dump(results, f, indent=2)

def main():
args = parse_args()

# Initialize the accelerator. We will let the accelerator handle device placement for us.
accelerator = Accelerator()
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
logger.info(accelerator.state)

# Set up logging: we only want one process per machine to log things on the screen.
# accelerator.is_local_main_process is only True for one process per machine.
logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
if accelerator.is_local_main_process:
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
else:
datasets.utils.logging.set_verbosity_error()
transformers.utils.logging.set_verbosity_error()

accelerator.wait_for_everyone()

# In distributed evaluation, the load_dataset function guarantees that only one local process can concurrently
# download the dataset.
# Downloading and loading a dataset from the hub.
if args.dataset_name == "anli":
raw_datasets = load_dataset(args.dataset_name, split=args.dataset_config_name)
else:
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name, split="validation")
#TODO(Victor): enable loading pre-processed dataset from https://huggingface.co/datasets/bigscience/P3

# Trim a number of evaluation examples
if args.debug:
raw_datasets = raw_datasets.select(range(min(len(raw_datasets),100)))

# Load pretrained model and tokenizer
#
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
if args.config_name:
config = AutoConfig.from_pretrained(args.config_name)
elif args.model_name_or_path:
config = AutoConfig.from_pretrained(args.model_name_or_path)
else:
raise ValueError(
"Either `args.config_name` or `args.model_name_or_path` should be provided."
)

if args.tokenizer_name:
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer, padding_side="left")
elif args.model_name_or_path:
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer, padding_side="left")
else:
raise ValueError(
"You are instantiating a new tokenizer from scratch. This is not supported by this script."
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
)

if tokenizer.pad_token is None:
for token in [tokenizer.eos_token, tokenizer.bos_token, tokenizer.sep_token]:
if token is not None:
tokenizer.pad_token = token
if tokenizer.pad_token is None:
raise ValueError("Please define a pad token id.")


model = ModelBase.from_config(
config=config,
model_name_or_path=args.model_name_or_path
)
model = accelerator.prepare_model(model)

# Get the prompt to apply and the possible targets.
# TODO(Victor): If pulling from pre-processed data, remove this logic.

if (args.dataset_config_name is None and args.template_config_name is None) or args.dataset_name == "anli":
prompt_dataset_name = f"{args.dataset_name}"
elif args.template_config_name is not None:
prompt_dataset_name = f"{args.dataset_name}/{args.template_config_name}"
else:
prompt_dataset_name = f"{args.dataset_name}/{args.dataset_config_name}"

prompts = DatasetTemplates(
prompt_dataset_name
)

if args.template_name is not None:
template_names = [args.template_name]
else:
template_names = prompts.all_template_names

for template_name in template_names:
run_template(
template_name=template_name,
prompts=prompts,
model=model,
tokenizer=tokenizer,
raw_datasets=raw_datasets,
accelerator=accelerator,
args=args
)

if __name__ == "__main__":
main()
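The template iteration added in `main` relies on promptsource's `DatasetTemplates`; a short sketch of how that API is used under the assumptions this script makes (the RTE example fields and values are illustrative):

```python
from promptsource.templates import DatasetTemplates

# "dataset/config" naming mirrors how the script builds prompt_dataset_name;
# with --template_config_name one can, e.g., score XNLI Fr data with the XNLI En prompts.
prompts = DatasetTemplates("super_glue/rte")

example = {"premise": "A cat sat on the mat.", "hypothesis": "An animal sat on the mat.", "label": 0, "idx": 0}

for template_name in prompts.all_template_names:
    template = prompts[template_name]
    input_text, target_text = template.apply(example)           # rendered prompt and gold target
    answer_choices = template.get_answer_choices_list(example)  # candidates for rank classification
```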
24 changes: 13 additions & 11 deletions t0/model.py
@@ -27,7 +27,7 @@ def from_config(config, **kwargs) -> "ModelBase":
raise NotImplementedError

class EncoderDecoderModel(ModelBase):
def __init__(self, config, model_name_or_path: Optional[str], parallelize: bool, **kwargs):
def __init__(self, config, model_name_or_path: Optional[str], **kwargs):
"""

Args:
@@ -46,11 +46,9 @@ def __init__(self, config, model_name_or_path: Optional[str], parallelize: bool,
)
else:
logger.info("Training new model from scratch")
self._model = AutoModelForSeq2SeqLM.from_config(config)

if parallelize:
assert torch.cuda.is_available(), "You need at least 1 GPU to call `parallelize` (though if there is only 1 GPU, there won't be any model parallelism)."
self._model.parallelize()
self._model = AutoModelForSeq2SeqLM.from_config(
config,
)


def forward(self, batch) -> torch.Tensor:
@@ -78,19 +76,23 @@ def __init__(self, config, model_name_or_path: Optional[str], **kwargs):
)
else:
logger.info("Training new model from scratch")
self._model = AutoModelForCausalLM.from_config(config)
self._model = AutoModelForCausalLM.from_config(
config,
)

def forward(self, batch):
device = batch["input_ids"].device
_, prefix_length = batch["input_ids"].shape

model_inputs = {
"input_ids": torch.cat([batch["input_ids"], batch["labels"]], dim=-1),
"attention_mask": torch.cat([batch["attention_mask"], batch["labels_attention_mask"]], dim=-1),
}
# Set position ids correctly to take care of padding tokens between input_ids and labels.
# An all-zero attention_mask is a forbidden value; the first element should be 1, since the input
# cannot be empty.
assert torch.all(model_inputs["attention_mask"][:,0] == 1), "First element in the attention mask should be 1."
position_ids = torch.cumsum(model_inputs["attention_mask"].to(torch.long), dim=-1) - 1
position_ids = torch.maximum(
torch.cumsum(model_inputs["attention_mask"].to(torch.long), dim=-1) - 1,
torch.zeros(1, dtype=torch.long, device=device)[None, None]
)
model_inputs["position_ids"] = position_ids

logits = self._model(**model_inputs).logits[:, prefix_length-1:-1]
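A quick illustration of the position-id computation in `DecoderModel.forward` above: the cumulative sum of the attention mask, shifted by one and clamped at zero, lets the label tokens continue the input's position sequence even when padding sits between the two (values below are illustrative):

```python
import torch

# Attention mask of one concatenated [input_ids, labels] row; the zeros are
# padding tokens sitting between the input and the (left-padded) labels.
attention_mask = torch.tensor([[1, 1, 1, 0, 0, 1, 1]])

# Same trick as in the forward pass: padded positions reuse the previous position id.
position_ids = (attention_mask.long().cumsum(dim=-1) - 1).clamp(min=0)
print(position_ids)  # tensor([[0, 1, 2, 2, 2, 3, 4]])
```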