Skip to content

Commit 0d15a32

Browse files
added config options for vocab generation (#30)
1 parent a525b3e commit 0d15a32

File tree

8 files changed

+68
-20
lines changed

8 files changed

+68
-20
lines changed

compiler_opt/rl/inlining/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,6 @@ def get_signature_spec(self):
3333

3434
def get_preprocessing_layer_creator(self):
3535
return config.get_observation_processing_layer_creator()
36+
37+
def get_nonnormalized_features(self):
38+
return config.get_nonnormalized_features()

compiler_opt/rl/inlining/config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,7 @@ def observation_processing_layer(obs_spec):
9797
with_z_score_normalization, eps))
9898

9999
return observation_processing_layer
100+
101+
def get_nonnormalized_features():
102+
return ['reward', 'inlining_default', 'inlining_decision']
103+

compiler_opt/rl/problem_configuration.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,10 +93,12 @@ def get_preprocessing_layer_creator(
9393
self) -> Callable[[types.TensorSpec], tf.keras.layers.Layer]:
9494
raise NotImplementedError
9595

96+
def get_nonnormalized_features(self) -> Iterable[str]:
97+
return []
98+
9699
@abc.abstractmethod
97100
def get_runner_type(self) -> 'type[compilation_runner.CompilationRunner]':
98101
raise NotImplementedError
99102

100-
101103
def is_thinlto(module_paths: Iterable[str]) -> bool:
102104
return tf.io.gfile.exists(next(iter(module_paths)) + '.thinlto.bc')

compiler_opt/rl/regalloc/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,6 @@ def get_signature_spec(self):
3333

3434
def get_preprocessing_layer_creator(self):
3535
return config.get_observation_processing_layer_creator()
36+
37+
def get_nonnormalized_features(self):
38+
return config.get_nonnormalized_features()

compiler_opt/rl/regalloc/config.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -88,22 +88,26 @@ def observation_processing_layer(obs_spec):
8888
if obs_spec.name in ('max_stage', 'min_stage'):
8989
return tf.keras.layers.Embedding(7, 4)
9090

91-
quantile = quantile_map[obs_spec.name]
92-
93-
first_non_zero = 0
94-
for x in quantile:
95-
if x > 0:
96-
first_non_zero = x
97-
break
98-
99-
normalize_fn = feature_ops.get_normalize_fn(quantile, with_sqrt,
100-
with_z_score_normalization, eps)
101-
log_normalize_fn = feature_ops.get_normalize_fn(
102-
quantile,
103-
with_sqrt,
104-
with_z_score_normalization,
105-
eps,
106-
preprocessing_fn=lambda x: tf.math.log(x + first_non_zero))
91+
normalize_fn = log_normalize_fn = None
92+
if obs_spec.name not in get_nonnormalized_features():
93+
quantile = quantile_map[obs_spec.name]
94+
95+
first_non_zero = 0
96+
for x in quantile:
97+
if x > 0:
98+
first_non_zero = x
99+
break
100+
101+
normalize_fn = feature_ops.get_normalize_fn(quantile,
102+
with_sqrt,
103+
with_z_score_normalization,
104+
eps)
105+
log_normalize_fn = feature_ops.get_normalize_fn(
106+
quantile,
107+
with_sqrt,
108+
with_z_score_normalization,
109+
eps,
110+
preprocessing_fn=lambda x: tf.math.log(x + first_non_zero))
107111

108112
if obs_spec.name in ['nr_rematerializable', 'nr_broken_hints']:
109113
return tf.keras.layers.Lambda(normalize_fn)
@@ -137,3 +141,9 @@ def progress_processing_fn(obs):
137141
raise KeyError('Missing preprocessing function for some feature.')
138142

139143
return observation_processing_layer
144+
145+
def get_nonnormalized_features():
146+
return ['mask', 'nr_urgent',
147+
'is_hint', 'is_local',
148+
'is_free', 'max_stage',
149+
'min_stage', 'reward']

compiler_opt/tools/sparse_bucket_generator.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,18 @@
2121
import math
2222
import multiprocessing as mp
2323
import os
24-
from typing import Callable, Dict, List
24+
from typing import Callable, Dict, List, Iterable
2525

2626
from absl import app
2727
from absl import flags
2828
from absl import logging
29+
import gin
2930

3031
import numpy as np
3132
import tensorflow as tf
3233

34+
from compiler_opt.rl import registry
35+
3336
flags.DEFINE_string('input', None,
3437
'Path to input file containing tf record datasets.')
3538
flags.DEFINE_string('output_dir', None,
@@ -41,16 +44,23 @@
4144
'Each process does vocab generation for each feature.', 1)
4245
flags.DEFINE_integer('num_buckets', 1000,
4346
'Number of quantiles to bucketize feature values into.')
47+
flags.DEFINE_multi_string('gin_files', [],
48+
'List of paths to gin configuration files.')
49+
flags.DEFINE_multi_string(
50+
'gin_bindings', [],
51+
'Gin bindings to override the values set in the config files.')
4452

4553
FLAGS = flags.FLAGS
4654

4755

4856
def _get_feature_info(
49-
serialized_proto: tf.Tensor) -> Dict[str, tf.io.RaggedFeature]:
57+
serialized_proto: tf.Tensor,
58+
features_to_not_process: Iterable[str]) -> Dict[str, tf.io.RaggedFeature]:
5059
"""Provides feature information by analyzing a single serialized example.
5160
5261
Args:
5362
serialized_proto: serialized SequenceExample.
63+
features_to_not_process: A list of feature names that should not be processed
5464
5565
Returns:
5666
Dictionary of Tensor formats indexed by feature name.
@@ -59,6 +69,8 @@ def _get_feature_info(
5969
example.ParseFromString(serialized_proto.numpy())
6070
sequence_features = {}
6171
for key, feature_list in example.feature_lists.feature_list.items():
72+
if key in features_to_not_process:
73+
continue
6274
feature = feature_list.feature[0]
6375
kind = feature.WhichOneof('kind')
6476
if kind == 'float_list':
@@ -123,17 +135,23 @@ def _generate_vocab(feature_values_arrays, feature_name):
123135

124136

125137
def main(_) -> None:
138+
gin.parse_config_files_and_bindings(
139+
FLAGS.gin_files, bindings=FLAGS.gin_bindings, skip_unknown=False)
140+
logging.info(gin.config_str())
141+
problem_config = registry.get_configuration()
142+
126143
"""Generate num_buckets quantiles for each feature."""
127144
tf.io.gfile.makedirs(FLAGS.output_dir)
128145
dataset = tf.data.Dataset.list_files(FLAGS.input)
129146
dataset = tf.data.TFRecordDataset(dataset)
147+
features_to_not_process = problem_config.get_nonnormalized_features()
130148

131149
sequence_features = {}
132150
# TODO(b/222775595): need to fix this after update to logic for handling
133151
# empty examples during trace generation.
134152
for raw_example in dataset:
135153
try:
136-
sequence_features = _get_feature_info(raw_example)
154+
sequence_features = _get_feature_info(raw_example, features_to_not_process)
137155
logging.info('Found valid sequence_features dict: %s', sequence_features)
138156
break
139157
except IndexError:

docs/adding_features.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ First and foremost, **you must regenerate the vocabulary** - technically you
2929
just need a vocab file for the new feature, but it's simpler to regenerate it
3030
all. See the [demo section](demo/demo.md#collect-trace-and-generate-vocab)
3131

32+
**Note:** You only need to regenerate the vocabulary if the feature is going
33+
to be normalized by a preprocessing layer for your model. If your feature does
34+
not need to be put through a lambda normalization preprocessing layer, make sure
35+
your feature is added to the list
36+
returned by `get_nonnormalized_features()` in `config.py`. In either case,
37+
it is still quite simple and fast to just call the vocab generation again.
38+
3239
After that, retrain from [scratch](demo/demo.md#train-a-new-model).
3340

3441
## Notes

docs/demo/demo.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,7 @@ in the trace changes.
266266
rm -rf $DEFAULT_VOCAB &&
267267
PYTHONPATH=$PYTHONPATH:. python3 \
268268
compiler_opt/tools/sparse_bucket_generator.py \
269+
--gin_files=compiler_opt/rl/inlining/gin_configs/common.gin \
269270
--input=$DEFAULT_TRACE \
270271
--output_dir=$DEFAULT_VOCAB
271272
```

0 commit comments

Comments
 (0)