@@ -35,8 +35,6 @@
 import sys
 import threading
 import time
-
-import datasets
 import torch
 from multiprocessing.connection import Connection
 
@@ -73,7 +71,7 @@ def tokenize(self, *text):
 
 class Encoder(object):
     def __init__(self, args):
-        self.content_keys = args.content_keys
+        self.json_keys = args.json_keys
         self.append_eod = args.append_eod
         # Use Encoder class as a container for global data
         self.tokenizer = build_tokenizer(args)
@@ -93,14 +91,11 @@ def __init__(self, args):
         else:
             self.splitter = IdentitySplitter()
 
-    def encode(self, data):
+    def encode(self, json_line):
+        data = json.loads(json_line)
         ids = {}
-        # TODO: a character is not a byte for non-ascii scripts. this was like this before, maybe fix at some point
-        # (counting the actual bytes will slow down processing though)
-        bytes = 0
-        for key in self.content_keys:
+        for key in self.json_keys:
             text = data[key]
-            bytes += len(text)
             doc_ids = []
             for sentence in self.splitter.tokenize(text):
                 sentence_ids = self.tokenizer.tokenize(sentence)
@@ -109,7 +104,7 @@ def encode(self, data):
             if len(doc_ids) > 0 and self.append_eod:
                 doc_ids[-1].append(self.tokenizer.eod)
             ids[key] = doc_ids
-        return ids, bytes
+        return ids, len(json_line)
 
 
 def process_samples(simple_queue, process_id, args, level, writer: Connection):
@@ -118,7 +113,7 @@ def process_samples(simple_queue, process_id, args, level, writer: Connection):
     output_bin_files = {}
     output_idx_files = {}
     builders = {}
-    for key in args.content_keys:
+    for key in args.json_keys:
         output_filename = get_output_filename(args.output_prefix, key, level, process_id)
         output_bin_files[key] = data_file_path(output_filename)
         output_idx_files[key] = index_file_path(output_filename)
@@ -127,38 +122,33 @@ def process_samples(simple_queue, process_id, args, level, writer: Connection):
                                                      impl=args.dataset_impl,
                                                      dtype=best_dtype)
 
-    doc_lines = simple_queue.get()
-    while doc_lines is not None:
-        process_lines(doc_lines, encoder, builders, writer)
+    json_lines = simple_queue.get()
+    while json_lines is not None:
+        process_json_lines(json_lines, encoder, builders, writer)
 
-        doc_lines = simple_queue.get()
+        json_lines = simple_queue.get()
 
     # In case finished, we still need to add None to signal to everyone else
     simple_queue.put(None)
     # Send None as end of sequence signal
     writer.send((None, process_id))
     writer.close()
 
-    for key in args.content_keys:
+    for key in args.json_keys:
         builders[key].finalize(output_idx_files[key])
 
     print(f"Worker {process_id} finished", flush=True)
 
 
-def process_lines(lines, encoder, builders, writer):
+def process_json_lines(json_lines, encoder, builders, writer):
     total_bytes_processed = 0
-    for line in lines:
+    for json_line in json_lines:
+        if json_line.strip() == "":
+            continue
 
-        if isinstance(line, str):
-            if line.strip() == "":
-                continue
-            data = json.loads(line)
-            doc, bytes_processed = encoder.encode(data)
-            total_bytes_processed += bytes_processed
+        doc, bytes_processed = encoder.encode(json_line)
 
-        elif isinstance(line, dict):
-            doc, bytes_processed = encoder.encode(line)
-            total_bytes_processed += bytes_processed
+        total_bytes_processed += bytes_processed
 
         for key, sentences in doc.items():
             if len(sentences) == 0:
@@ -167,16 +157,16 @@ def process_lines(lines, encoder, builders, writer):
                 builders[key].add_item(torch.IntTensor(sentence))
             builders[key].end_document()
 
-    writer.send((len(lines), total_bytes_processed))
+    writer.send((len(json_lines), total_bytes_processed))
 
 
 def get_args():
     parser = argparse.ArgumentParser()
     group = parser.add_argument_group(title='input data')
     group.add_argument('--input', type=str, required=True,
-                       help='Path to input JSON or arrow file')
-    group.add_argument('--content-keys', nargs='+', default=['text'],
-                       help='space separate listed of keys to extract from data')
+                       help='Path to input JSON')
+    group.add_argument('--json-keys', nargs='+', default=['text'],
+                       help='space separate listed of keys to extract from json')
     group.add_argument('--split-sentences', action='store_true',
                        help='Split documents into sentences.')
     group.add_argument('--keep-newlines', action='store_true',
@@ -229,7 +219,7 @@ def get_args():
 
     return args
 
-def fill_simple_queue_from_file(filename, simple_queue, chunk_size:int):
+def fill_simple_queue(filename, simple_queue, chunk_size:int):
     # TODO: Assess if instead we could feed pointers which process can then load.
     with open(filename, "r") as f:
         print("Start filling queue", flush=True)
@@ -241,18 +231,6 @@ def fill_simple_queue_from_file(filename, simple_queue, chunk_size:int):
                 return
             simple_queue.put(acc)
 
-def fill_simple_queue_from_arrow(dirname, simple_queue, chunk_size:int):
-    # TODO: Assess if instead we could feed pointers which process can then load.
-    dataset = datasets.load_from_disk(dirname)
-    print("Start filling queue", flush=True)
-    while True:
-        acc = tuple(itertools.islice(dataset, chunk_size))
-        if len(acc) == 0:
-            simple_queue.put(None)
-            print(f"Finished reading input file", flush=True)
-            return
-        simple_queue.put(acc)
-
 def log(readers, log_interval):
     print("Start Logging", flush=True)
     proc_start = time.time()
@@ -322,12 +300,7 @@ def main():
     process_ids = list(range(len(writers)))
     processes = [multiprocessing.Process(target=process_samples, args=(simple_queue, process_id, args, level, writer)) for process_id, writer in zip(process_ids, writers)]
     log_thread = threading.Thread(target=log, args=(list(readers), args.log_interval))
-    if os.path.isfile(args.input):
-        print("assuming `jsonl` input.")
-        fill_thread = multiprocessing.Process(target=fill_simple_queue_from_file, args=(args.input, simple_queue, chunk_size))
-    elif os.path.isdir(args.input):
-        print("assuming arrow folder input for HF-datasets")
-        fill_thread = multiprocessing.Process(target=fill_simple_queue_from_arrow, args=(args.input, simple_queue, chunk_size))
+    fill_thread = multiprocessing.Process(target=fill_simple_queue, args=(args.input, simple_queue, chunk_size))
 
     fill_thread.start()
     log_thread.start()
@@ -359,7 +332,7 @@ def main():
     output_bin_files = {}
     output_idx_files = {}
     builders = {}
-    for key in args.content_keys:
+    for key in args.json_keys:
         output_filename = f"{args.output_prefix}_{key}_{level}"
         output_bin_files[key] = data_file_path(output_filename)
         output_idx_files[key] = index_file_path(output_filename)
@@ -368,15 +341,15 @@ def main():
                                                      impl=args.dataset_impl,
                                                      dtype=best_dtype)
 
-    for key in args.content_keys:
+    for key in args.json_keys:
         for process_id in process_ids:
             output_filename = get_output_filename(args.output_prefix, key, level, process_id)
             builders[key].merge_file_(output_filename)
         builders[key].finalize(output_idx_files[key])
 
     # Remove temporary files
     print("Removing shard files")
-    for key in args.content_keys:
+    for key in args.json_keys:
         for process_id in process_ids:
             output_filename = get_output_filename(args.output_prefix, key, level, process_id)
             os.remove(data_file_path(output_filename))
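
Editor's note (not part of the patch): the sketch below only illustrates the interface this commit reverts to, in which encode() receives the raw JSON line, parses it itself, and reports len(json_line) as the byte count. The tokenizer and splitter are toy stand-ins for build_tokenizer and the sentence splitter, and ToyEncoder is a hypothetical name used purely for illustration.

import json

class ToyEncoder:
    """Illustrative stand-in for Encoder: whitespace 'tokenizer', identity 'splitter'."""
    def __init__(self, json_keys=("text",), append_eod=True, eod_id=0):
        self.json_keys = json_keys
        self.append_eod = append_eod
        self.eod_id = eod_id

    def encode(self, json_line):
        data = json.loads(json_line)  # parsing now happens inside encode()
        ids = {}
        for key in self.json_keys:
            doc_ids = []
            for sentence in [data[key]]:  # identity splitter: the whole doc is one "sentence"
                sentence_ids = [abs(hash(tok)) % 50000 for tok in sentence.split()]
                if len(sentence_ids) > 0:
                    doc_ids.append(sentence_ids)
            if len(doc_ids) > 0 and self.append_eod:
                doc_ids[-1].append(self.eod_id)
            ids[key] = doc_ids
        # bytes are approximated by the character length of the raw line,
        # matching the behaviour restored by this commit
        return ids, len(json_line)

ids, nbytes = ToyEncoder().encode('{"text": "hello world"}')
print(ids, nbytes)  # one doc with two token ids plus the EOD id, and the raw line length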