tools/preprocess_data.py (4 changes: 4 additions and 0 deletions)

@@ -82,6 +82,8 @@ def encode(self, json_line):
         ids = {}
         for key in self.args.json_keys:
             text = data[key]
+            if self.args.prepend_space:
+                text = " " + text
             doc_ids = []
             for sentence in Encoder.splitter.tokenize(text):
                 sentence_ids = Encoder.tokenizer.tokenize(sentence)

@@ -117,6 +119,8 @@ def get_args():
                        help='Path to the BPE merge file (if necessary).')
     group.add_argument('--append-eod', action='store_true',
                        help='Append an <eod> token to the end of a document.')
+    group.add_argument('--prepend-space', action='store_true',
+                       help='Prepend a space to the beginning of a document.')
Review comment (Member):
Add a mention of the context in which this is useful; typically it is when you compute targets.

group.add_argument("--tokenizer-name-or-path", type=str, default=None,
help="Name or path of the huggingface tokenizer.")
group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
Expand Down
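
For context on the reviewer's point: BPE tokenizers such as GPT-2's encode a word differently depending on whether it is preceded by a space, so a target document tokenized in isolation will not match the same text as it would appear mid-sequence unless a leading space is prepended first. A minimal sketch demonstrating the effect (it assumes the `transformers` package and the GPT-2 tokenizer; neither is part of this diff):

# Sketch only, not part of the PR. Shows that a leading space changes the
# first token id produced by a byte-level BPE tokenizer such as GPT-2's.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")

without_space = tok.encode("Hello world")   # word-initial "Hello" token
with_space = tok.encode(" Hello world")     # space-prefixed " Hello" token

print(without_space)  # e.g. [15496, 995]
print(with_space)     # e.g. [18435, 995] -- a different first token id
assert without_space != with_space

This is why --prepend-space matters when computing targets: the target text is tokenized as a standalone document, but at training time it follows other tokens, where it would normally be preceded by whitespace.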