Skip to content

Commit 220c0f5

Browse files
authored
Add Corpus abstraction & sorting w/ size (#77)
* Add Corpus abstraction & sorting w/ size — rather than pass around a list of module_specs, pass around a Corpus object instead; built-in filter and sample methods; sample will sort by size descending, with the goal of optimizing compile order
* Change .size -> __len__
* Default to unbiased sampling; add sampler option
* Add separate constructor for testing
* Make sampler not select repeats
* Replace 'corp' with 'cps'
* Switch to optimized algorithm
* Resolve comments
1 parent 7e4b19f commit 220c0f5

File tree

6 files changed

+227
-33
lines changed

6 files changed

+227
-33
lines changed

compiler_opt/rl/corpus.py

Lines changed: 103 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515
"""ModuleSpec definition and utility command line parsing functions."""
16+
import math
17+
import random
18+
import re
1619

1720
from absl import logging
1821
from dataclasses import dataclass
@@ -30,9 +33,100 @@ class ModuleSpec:
3033
"""
3134
name: str
3235
exec_cmd: Tuple[str, ...] = ()
33-
34-
35-
def build_modulespecs_from_datapath(
36+
size: int = 0
37+
38+
39+
class SamplerBucketRoundRobin:
  """Stateful sampler that picks one module_spec from each of `k` buckets.

  The input list is split into `k` sequential sections ("buckets") of roughly
  equal length, and one element is drawn uniformly at random from each. The
  buckets are visited in an order that approximates round-robin over `n`
  groups. Bucket ranges are cached per (length, k, n) triple, so reusing one
  instance across calls amortizes the setup cost.
  """

  def __init__(self):
    # Cache: (len(module_specs), k, n) -> tuple of (start, end) index pairs,
    # one per bucket, listed in visiting order.
    self._ranges = {}

  def __call__(self,
               module_specs: List[ModuleSpec],
               k: int,
               n: int = 20) -> List[ModuleSpec]:
    """
    Args:
      module_specs: list of module_specs to sample from
      k: number of modules to sample
      n: number of buckets to use
    """
    # Credits to yundi@ for the highly optimized algo: split module_specs into
    # k buckets, then choose the order of visiting those k buckets so it
    # approximates the behaviour of having n buckets.
    key = (len(module_specs), k, n)
    if key not in self._ranges:
      per_round = k // n
      # visit_rank maps bucket index -> visiting rank; lower ranks are visited
      # first, and (via stable sort) earlier buckets before later ones.
      if per_round:
        visit_rank = [idx % per_round for idx in range(k)]
      else:
        visit_rank = [0] * k
      # Bucket indices reordered into the order they should be visited.
      ordered_buckets = sorted(range(k), key=lambda b: visit_rank[b])

      # Precompute the (start, end) range of each bucket, already in visiting
      # order, and cache the result.
      width = key[0] / k
      self._ranges[key] = tuple(
          (math.floor(width * b), math.floor(width * (b + 1)))
          for b in ordered_buckets)

    return [
        module_specs[random.randrange(lo, hi)] for lo, hi in self._ranges[key]
    ]
80+
81+
82+
class Corpus:
  """Represents a corpus. Comes along with some utility functions."""

  def __init__(self,
               data_path: str,
               additional_flags: Tuple[str, ...] = (),
               delete_flags: Tuple[str, ...] = ()):
    """Loads a corpus from `data_path`.

    Args:
      data_path: root directory of the corpus.
      additional_flags: flags to append to each module's compile command.
      delete_flags: flags to remove from each module's compile command.
    """
    self._module_specs = _build_modulespecs_from_datapath(
        data_path=data_path,
        additional_flags=additional_flags,
        delete_flags=delete_flags)
    self._root_dir = data_path
    # Sort by size, descending, so samplers can prioritize the largest
    # (presumably slowest-to-compile) modules first.
    self._module_specs.sort(key=lambda m: m.size, reverse=True)

  @classmethod
  def from_module_specs(cls, module_specs: List[ModuleSpec]):
    """Construct a Corpus from module specs. Mostly for testing purposes."""
    cps = cls.__new__(cls)  # Avoid calling __init__
    super(cls, cps).__init__()
    cps._module_specs = list(module_specs)  # Don't mutate the original list.
    cps._module_specs.sort(key=lambda m: m.size, reverse=True)
    # Fix: use the same attribute name `__init__` assigns (`_root_dir`, not
    # `root_dir`), so instances built here have the same attribute set as
    # normally constructed ones.
    cps._root_dir = None
    return cps

  def sample(self,
             k: int,
             sort: bool = False,
             sampler=SamplerBucketRoundRobin()) -> List[ModuleSpec]:
    """Samples `k` module_specs, optionally sorting by size descending.

    Args:
      k: number of module specs to sample; clamped to the corpus size.
      sort: whether to sort the returned sample by size, descending.
      sampler: callable taking (module_specs, k=...) and returning a list of
        sampled module specs.

    Returns:
      A list of at most `k` sampled ModuleSpecs.

    Raises:
      ValueError: if fewer than one module spec would be sampled (k < 1, or
        the corpus is empty).
    """
    # Note: sampler is intentionally defaulted to a mutable object, as the
    # only mutable attribute of SamplerBucketRoundRobin is its range cache.
    k = min(len(self._module_specs), k)
    if k < 1:
      raise ValueError('Attempting to sample <1 module specs from corpus.')
    sampled_specs = sampler(self._module_specs, k=k)
    if sort:
      sampled_specs.sort(key=lambda m: m.size, reverse=True)
    return sampled_specs

  def filter(self, p: re.Pattern):
    """Filters module specs, keeping those which match the provided pattern."""
    self._module_specs = [ms for ms in self._module_specs if p.match(ms.name)]

  def __len__(self):
    # Number of module specs currently in the corpus (after any filtering).
    return len(self._module_specs)
127+
128+
129+
def _build_modulespecs_from_datapath(
36130
data_path: str,
37131
additional_flags: Tuple[str, ...] = (),
38132
delete_flags: Tuple[str, ...] = ()
@@ -65,14 +159,17 @@ def build_modulespecs_from_datapath(
65159
module_specs: List[ModuleSpec] = []
66160

67161
# This takes ~7s for 30k modules
68-
for module_path in module_paths:
162+
for rel_module_path in module_paths:
163+
full_module_path = os.path.join(data_path, rel_module_path)
69164
exec_cmd = _load_and_parse_command(
70-
module_path=os.path.join(data_path, module_path),
165+
module_path=full_module_path,
71166
has_thinlto=has_thinlto,
72167
additional_flags=additional_flags,
73168
delete_flags=delete_flags,
74169
cmd_override=cmd_override)
75-
module_specs.append(ModuleSpec(name=module_path, exec_cmd=tuple(exec_cmd)))
170+
size = os.path.getsize(full_module_path + '.bc')
171+
module_specs.append(
172+
ModuleSpec(name=rel_module_path, exec_cmd=tuple(exec_cmd), size=size))
76173

77174
return module_specs
78175

compiler_opt/rl/corpus_test.py

Lines changed: 109 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
# pylint: disable=protected-access
1717
import json
1818
import os
19+
import re
1920

2021
import tensorflow as tf
2122

@@ -137,7 +138,7 @@ def test_get_without_thinlto(self):
137138
tempdir.create_file('2.bc')
138139
tempdir.create_file('2.cmd', content='\0'.join(['-cc1', '-O3']))
139140

140-
ms_list = corpus.build_modulespecs_from_datapath(
141+
ms_list = corpus._build_modulespecs_from_datapath(
141142
tempdir.full_path, additional_flags=('-add',))
142143
self.assertEqual(len(ms_list), 2)
143144
ms1 = ms_list[0]
@@ -165,7 +166,7 @@ def test_get_with_thinlto(self):
165166
tempdir.create_file(
166167
'2.cmd', content='\0'.join(['-cc1', '-fthinlto-index=abc']))
167168

168-
ms_list = corpus.build_modulespecs_from_datapath(
169+
ms_list = corpus._build_modulespecs_from_datapath(
169170
tempdir.full_path,
170171
additional_flags=('-add',),
171172
delete_flags=('-fthinlto-index',))
@@ -201,7 +202,7 @@ def test_get_with_override(self):
201202
tempdir.create_file('2.thinlto.bc')
202203
tempdir.create_file('2.cmd', content='\0'.join(['-fthinlto-index=abc']))
203204

204-
ms_list = corpus.build_modulespecs_from_datapath(
205+
ms_list = corpus._build_modulespecs_from_datapath(
205206
tempdir.full_path,
206207
additional_flags=('-add',),
207208
delete_flags=('-fthinlto-index',))
@@ -220,6 +221,111 @@ def test_get_with_override(self):
220221
'-fthinlto-index=' + tempdir.full_path + '/2.thinlto.bc',
221222
'-mllvm', '-thinlto-assume-merged', '-add'))
222223

224+
def test_size(self):
  # Minimal on-disk corpus: one module ('1') with its .bc and .cmd files.
  corpus_description = {'modules': ['1'], 'has_thinlto': False}
  tempdir = self.create_tempdir()
  tempdir.create_file(
      'corpus_description.json', content=json.dumps(corpus_description))
  bc_file = tempdir.create_file('1.bc')
  tempdir.create_file('1.cmd', content='\0'.join(['-cc1']))
  # The loader should record the on-disk size of the module's .bc file in
  # the resulting ModuleSpec.size field.
  self.assertEqual(
      os.path.getsize(bc_file.full_path),
      corpus._build_modulespecs_from_datapath(
          tempdir.full_path, additional_flags=('-add',))[0].size)
235+
236+
237+
class CorpusTest(tf.test.TestCase):
  """Tests for corpus.Corpus construction, sampling, and filtering."""

  def test_constructor(self):
    # Minimal on-disk corpus: one module ('1') with its .bc and .cmd files.
    corpus_description = {'modules': ['1'], 'has_thinlto': False}
    tempdir = self.create_tempdir()
    tempdir.create_file(
        'corpus_description.json', content=json.dumps(corpus_description))
    tempdir.create_file('1.bc')
    tempdir.create_file('1.cmd', content='\0'.join(['-cc1']))

    cps = corpus.Corpus(tempdir.full_path, additional_flags=('-add',))
    # The Corpus should contain exactly the specs the loader produces.
    self.assertEqual(
        corpus._build_modulespecs_from_datapath(
            tempdir.full_path, additional_flags=('-add',)), cps._module_specs)
    self.assertEqual(len(cps), 1)

  def test_sample(self):
    # Specs deliberately given out of size order to verify sort=True returns
    # the sample ordered by size, descending.
    cps = corpus.Corpus.from_module_specs(module_specs=[
        corpus.ModuleSpec(name='smol', size=1),
        corpus.ModuleSpec(name='middle', size=200),
        corpus.ModuleSpec(name='largest', size=500),
        corpus.ModuleSpec(name='small', size=100)
    ])
    sample = cps.sample(4, sort=True)
    self.assertLen(sample, 4)
    self.assertEqual(sample[0].name, 'largest')
    self.assertEqual(sample[1].name, 'middle')
    self.assertEqual(sample[2].name, 'small')
    self.assertEqual(sample[3].name, 'smol')

  def test_filter(self):
    cps = corpus.Corpus.from_module_specs(module_specs=[
        corpus.ModuleSpec(name='smol', size=1),
        corpus.ModuleSpec(name='largest', size=500),
        corpus.ModuleSpec(name='middle', size=200),
        corpus.ModuleSpec(name='small', size=100)
    ])

    # Pattern requires at least one character before an 'l': keeps 'smol',
    # 'middle', 'small'; drops 'largest' (its only 'l' is the first char).
    cps.filter(re.compile(r'.+l'))
    # k=999 exceeds the corpus size, so this samples everything remaining.
    sample = cps.sample(999, sort=True)
    self.assertLen(sample, 3)
    self.assertEqual(sample[0].name, 'middle')
    self.assertEqual(sample[1].name, 'small')
    self.assertEqual(sample[2].name, 'smol')

  def test_sample_zero(self):
    cps = corpus.Corpus.from_module_specs(
        module_specs=[corpus.ModuleSpec(name='smol')])

    # Requesting fewer than one module spec must raise, not return [].
    self.assertRaises(ValueError, cps.sample, 0)
    self.assertRaises(ValueError, cps.sample, -213213213)

  def test_bucket_sample(self):
    # 100 modules with distinct sizes 0..99; the corpus sorts them descending.
    cps = corpus.Corpus.from_module_specs(
        module_specs=[corpus.ModuleSpec(name='', size=i) for i in range(100)])
    # Odds of passing once by pure luck with random.sample: 1.779e-07
    # Try 32 times, for good measure.
    for i in range(32):
      sample = cps.sample(
          k=20, sampler=corpus.SamplerBucketRoundRobin(), sort=True)
      self.assertLen(sample, 20)
      for idx, s in enumerate(sample):
        # Each bucket should be size 5, since n=20 in the sampler, so the
        # idx-th largest sampled spec must come from the idx-th bucket.
        self.assertEqual(s.size // 5, 19 - idx)

  def test_bucket_sample_all(self):
    # Make sure we can sample everything, even if it's not divisible by the
    # `n` in SamplerBucketRoundRobin.
    # Create corpus with a prime number of modules.
    cps = corpus.Corpus.from_module_specs(
        module_specs=[corpus.ModuleSpec(name='', size=i) for i in range(101)])

    # Try 32 times, for good measure.
    for i in range(32):
      sample = cps.sample(
          k=101, sampler=corpus.SamplerBucketRoundRobin(), sort=True)
      self.assertLen(sample, 101)
      for idx, s in enumerate(sample):
        # Since everything is sampled, it should be in perfect order.
        self.assertEqual(s.size, 100 - idx)

  def test_bucket_sample_small(self):
    # Make sure we can sample even when k < n.
    cps = corpus.Corpus.from_module_specs(
        module_specs=[corpus.ModuleSpec(name='', size=i) for i in range(100)])

    # Try all 19 possible values 0 < i < n
    for i in range(1, 20):
      sample = cps.sample(
          k=i, sampler=corpus.SamplerBucketRoundRobin(), sort=True)
      self.assertLen(sample, i)
328+
223329

224330
if __name__ == '__main__':
225331
tf.test.main()

compiler_opt/rl/local_data_collector.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616

1717
import concurrent.futures
1818
import itertools
19-
import random
2019
import time
2120
from typing import Callable, Dict, Iterator, List, Tuple, Optional
2221

@@ -34,7 +33,7 @@ class LocalDataCollector(data_collector.DataCollector):
3433

3534
def __init__(
3635
self,
37-
module_specs: List[corpus.ModuleSpec],
36+
cps: corpus.Corpus,
3837
num_modules: int,
3938
worker_pool: List[compilation_runner.CompilationRunnerStub],
4039
parser: Callable[[List[str]], Iterator[trajectory.Trajectory]],
@@ -44,7 +43,7 @@ def __init__(
4443
# TODO(mtrofin): type exit_checker_ctor when we get typing.Protocol support
4544
super().__init__()
4645

47-
self._module_specs = module_specs
46+
self._corpus = cps
4847
self._num_modules = num_modules
4948
self._parser = parser
5049
self._worker_pool = worker_pool
@@ -86,7 +85,7 @@ def _schedule_jobs(
8685
jobs = [(module_spec, policy_path, self._reward_stat_map[module_spec.name])
8786
for module_spec in sampled_modules]
8887

89-
# Naive load balancing.
88+
# TODO: Issue #91. Naive load balancing.
9089
ret = []
9190
for i in range(len(jobs)):
9291
ret.append(self._worker_pool[i % len(self._worker_pool)].collect_data(
@@ -108,7 +107,7 @@ def collect_data(
108107
They will be reported using `tf.scalar.summary` by the trainer so these
109108
information is viewable in TensorBoard.
110109
"""
111-
sampled_modules = random.sample(self._module_specs, k=self._num_modules)
110+
sampled_modules = self._corpus.sample(k=self._num_modules, sort=False)
112111
results = self._schedule_jobs(policy_path, sampled_modules)
113112

114113
def wait_for_termination():

compiler_opt/rl/local_data_collector_test.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,8 @@ def _test_iterator_fn(data_list):
116116

117117
with LocalWorkerPool(worker_class=MyRunner, count=4) as lwp:
118118
collector = local_data_collector.LocalDataCollector(
119-
module_specs=[corpus.ModuleSpec(name='dummy')] * 100,
119+
cps=corpus.Corpus.from_module_specs(
120+
module_specs=[corpus.ModuleSpec(name='dummy')] * 100),
120121
num_modules=9,
121122
worker_pool=lwp,
122123
parser=create_test_iterator_fn(),
@@ -177,7 +178,8 @@ def wait(self, _):
177178

178179
with LocalWorkerPool(worker_class=Sleeper, count=4) as lwp:
179180
collector = local_data_collector.LocalDataCollector(
180-
module_specs=[corpus.ModuleSpec(name='dummy')] * 200,
181+
cps=corpus.Corpus.from_module_specs(
182+
module_specs=[corpus.ModuleSpec(name='dummy')] * 200),
181183
num_modules=4,
182184
worker_pool=lwp,
183185
parser=parser,

compiler_opt/rl/train_locally.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,8 @@ def train_eval(agent_name=constant.AgentName.PPO,
9999
saver = policy_saver.PolicySaver(policy_dict=policy_dict)
100100

101101
logging.info('Loading module specs from corpus at %s.', FLAGS.data_path)
102-
module_specs = corpus.build_modulespecs_from_datapath(
103-
FLAGS.data_path, problem_config.flags_to_add(),
104-
problem_config.flags_to_delete())
102+
cps = corpus.Corpus(FLAGS.data_path, problem_config.flags_to_add(),
103+
problem_config.flags_to_delete())
105104
logging.info('Done loading module specs from corpus.')
106105

107106
dataset_fn = data_reader.create_sequence_example_dataset_fn(
@@ -136,7 +135,7 @@ def sequence_example_iterator_fn(seq_ex: List[str]):
136135
count=FLAGS.num_workers,
137136
moving_average_decay_rate=moving_average_decay_rate) as worker_pool:
138137
data_collector = local_data_collector.LocalDataCollector(
139-
module_specs=module_specs,
138+
cps=cps,
140139
num_modules=num_modules,
141140
worker_pool=worker_pool,
142141
parser=sequence_example_iterator_fn,

0 commit comments

Comments
 (0)