Skip to content

Commit 0e93371

Browse files
Add option to compile_modules pipeline to ingest raw BBs
This patch adds an option to the compile_modules pipeline to ingest basic blocks from a hex file (as in the BHive dataset) rather than compiling IR stored in Parquet. This lets us ingest a lightly processed CSV from the BHive dataset and then annotate and benchmark it with our pipeline, to verify that the numbers match up reasonably well.

Reviewers: virajbshah, ondrasej, orodley
Reviewed By: ondrasej
Pull Request: #339
1 parent e1a90cf commit 0e93371

File tree

3 files changed

+96
-35
lines changed

3 files changed

+96
-35
lines changed

gematria/datasets/pipelines/compile_modules.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
'parquet_folder',
3232
None,
3333
'The path to the folder containing parquet files',
34-
required=True,
3534
)
3635

3736
_OUTPUT_FILE = flags.DEFINE_string(
@@ -71,20 +70,27 @@
7170
' cannot be found.',
7271
)
7372

73+
_INPUT_HEX_BBS_FILE_PATTERN = flags.DEFINE_string(
74+
'input_hex_bbs_file_pattern',
75+
None,
76+
'The path to text files containing new line separated basic blocks.',
77+
)
78+
7479

7580
def main(argv) -> None:
7681
del argv # Unused.
7782

7883
beam_options = pipeline_options.PipelineOptions()
7984

8085
pipeline_constructor = compile_modules_lib.get_bbs(
81-
os.path.join(_PARQUET_FOLDER.value, '*.parquet'),
82-
_OUTPUT_FILE.value,
83-
_REMOVE_MEMORY_ACCESSING_INSTRUCTIONS.value,
84-
ANNOTATOR_MAPPING[_ANNOTATOR_TYPE.value],
85-
_MAX_ANNOTATION_ATTEMPTS.value,
86-
_OUTPUT_VOCAB_FILE.value,
87-
_SKIP_NO_LOOP_REGISTER.value,
86+
input_file_pattern=os.path.join(_PARQUET_FOLDER.value, '*.parquet'),
87+
output_file=_OUTPUT_FILE.value,
88+
remove_memory_accessing_instructions=_REMOVE_MEMORY_ACCESSING_INSTRUCTIONS.value,
89+
annotator_type=ANNOTATOR_MAPPING[_ANNOTATOR_TYPE.value],
90+
max_annotation_attempts=_MAX_ANNOTATION_ATTEMPTS.value,
91+
vocab_output_file=_OUTPUT_VOCAB_FILE.value,
92+
skip_no_loop_register=_SKIP_NO_LOOP_REGISTER.value,
93+
input_hex_bbs_file_pattern=_INPUT_HEX_BBS_FILE_PATTERN.value,
8894
)
8995

9096
with beam.Pipeline(options=beam_options) as pipeline:

gematria/datasets/pipelines/compile_modules_lib.py

Lines changed: 36 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -231,13 +231,14 @@ def process(self, bb_hex: str) -> Iterable[str]:
231231

232232

233233
def get_bbs(
234-
input_file_pattern: str,
234+
input_file_pattern: str | None,
235235
output_file: str,
236236
remove_memory_accessing_instructions: bool,
237237
annotator_type: bhive_to_exegesis.AnnotatorType,
238238
max_annotation_attempts: int,
239239
vocab_output_file: str,
240240
skip_no_loop_register: bool,
241+
input_hex_bbs_file_pattern: str | None,
241242
) -> Callable[[beam.Pipeline], None]:
242243
"""Creates a pipeline to process BBs from IR modules.
243244
@@ -247,7 +248,8 @@ def get_bbs(
247248
248249
Args:
249250
input_file_pattern: A grep-like pattern to use to search for the Parquet
250-
files to process.
251+
files to process. This cannot be used at the same time as
252+
input_hex_bbs_file_pattern.
251253
output_file: The output file pattern to use when writing the basic blocks
252254
to disk.
253255
remove_memory_accessing_instructions: Whether or not to remove memory
@@ -259,31 +261,45 @@ def get_bbs(
259261
vocab_output_file: The output pattern for the vocabulary file.
260262
skip_no_loop_register: Whether or not to omit basic blocks for which a free
261263
register to use as a loop counter cannot be found.
264+
input_hex_bbs_file_pattern: A grep-like file pattern to use to search for
265+
text files that contain basic blocks in hex format. This cannot be used
266+
at the same time as input_file_pattern.
262267
263268
Returns:
264269
A function that accepts a beam pipeline and adds on all the steps needed
265270
to process the input IR modules.
266271
"""
267272

268-
def pipeline(root: beam.Pipeline) -> None:
269-
parquet_data = root | 'Read' >> beam.io.ReadFromParquet(
270-
input_file_pattern, columns=['content']
271-
)
272-
module_data = parquet_data | 'Load' >> beam.Map(
273-
lambda parquet_row: parquet_row['content']
274-
)
275-
module_data_shuffled = module_data | 'Shuffle' >> beam.Reshuffle()
276-
optimized_modules = module_data_shuffled | 'Optimize' >> beam.ParDo(
277-
OptimizeModules(
278-
['default<O0>', 'default<O1>', 'default<O2>', 'default<O3>']
279-
)
280-
)
281-
lowered_modules = optimized_modules | 'Lower' >> beam.ParDo(
282-
LowerModulesAsm(['-O0', '-O1', '-O2', '-O3'])
283-
)
284-
bb_hex_values = lowered_modules | 'Get BBs' >> beam.ParDo(
285-
GetBBsFromModule()
273+
if (input_file_pattern is None) == (input_hex_bbs_file_pattern is None):
274+
raise ValueError(
275+
'Exactly one of input_file_pattern and input_hex_bbs_file_pattern must'
276+
' be set.'
286277
)
278+
279+
def pipeline(root: beam.Pipeline) -> None:
280+
if input_hex_bbs_file_pattern is not None:
281+
bb_hex_values = root | 'Read' >> beam.io.ReadFromText(
282+
input_hex_bbs_file_pattern
283+
)
284+
else:
285+
parquet_data = root | 'Read' >> beam.io.ReadFromParquet(
286+
input_file_pattern, columns=['content']
287+
)
288+
module_data = parquet_data | 'Load' >> beam.Map(
289+
lambda parquet_row: parquet_row['content']
290+
)
291+
module_data_shuffled = module_data | 'Shuffle' >> beam.Reshuffle()
292+
optimized_modules = module_data_shuffled | 'Optimize' >> beam.ParDo(
293+
OptimizeModules(
294+
['default<O0>', 'default<O1>', 'default<O2>', 'default<O3>']
295+
)
296+
)
297+
lowered_modules = optimized_modules | 'Lower' >> beam.ParDo(
298+
LowerModulesAsm(['-O0', '-O1', '-O2', '-O3'])
299+
)
300+
bb_hex_values = lowered_modules | 'Get BBs' >> beam.ParDo(
301+
GetBBsFromModule()
302+
)
287303
bb_hex_values_deduplicated = (
288304
bb_hex_values | 'Deduplicate' >> DeduplicateValues()
289305
)

gematria/datasets/pipelines/compile_modules_lib_test.py

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -191,13 +191,52 @@ def test_get_bbs(self, annotator_type):
191191
)
192192

193193
pipeline_constructor = compile_modules_lib.get_bbs(
194-
test_parquet_file.full_path,
195-
output_file_pattern,
196-
False,
197-
annotator_type,
198-
50,
199-
vocab_output_file_pattern,
200-
False,
194+
input_file_pattern=test_parquet_file.full_path,
195+
output_file=output_file_pattern,
196+
remove_memory_accessing_instructions=False,
197+
annotator_type=annotator_type,
198+
max_annotation_attempts=50,
199+
vocab_output_file=vocab_output_file_pattern,
200+
skip_no_loop_register=False,
201+
input_hex_bbs_file_pattern=None,
202+
)
203+
204+
with test_pipeline.TestPipeline() as pipeline_under_test:
205+
pipeline_constructor(pipeline_under_test)
206+
207+
block_hex_values = []
208+
for annotated_block in tfrecord.read_protos(
209+
[output_file_pattern + '-00000-of-00001'],
210+
execution_annotation_pb2.BlockWithExecutionAnnotations,
211+
):
212+
block_hex_values.append(annotated_block.block_hex)
213+
214+
self.assertLen(block_hex_values, 2)
215+
self.assertContainsSubset(['B801000000', 'B802000000'], block_hex_values)
216+
217+
with open(
218+
vocab_output_file_pattern + '-00000-of-00001'
219+
) as vocab_file_handle:
220+
vocab_tokens = [token.strip() for token in vocab_file_handle.readlines()]
221+
222+
self.assertCountEqual(['_D_', '_IMMEDIATE_', 'MOV', 'EAX'], vocab_tokens)
223+
224+
def test_get_bbs_hex_file(self):
225+
test_bb_file = self.create_tempfile()
226+
output_file_dir = self.create_tempdir()
227+
output_file_pattern = os.path.join(output_file_dir, 'bbs')
228+
vocab_output_file_pattern = os.path.join(output_file_dir, 'bbvocab')
229+
test_bb_file.write_text('B801000000\nB802000000\n')
230+
231+
pipeline_constructor = compile_modules_lib.get_bbs(
232+
input_file_pattern=None,
233+
output_file=output_file_pattern,
234+
remove_memory_accessing_instructions=False,
235+
annotator_type=bhive_to_exegesis.AnnotatorType.exegesis,
236+
max_annotation_attempts=50,
237+
vocab_output_file=vocab_output_file_pattern,
238+
skip_no_loop_register=False,
239+
input_hex_bbs_file_pattern=test_bb_file.full_path,
201240
)
202241

203242
with test_pipeline.TestPipeline() as pipeline_under_test:

0 commit comments

Comments
 (0)