Merge pull request #172 from saltytine/main

hyperc54 · web-flow · commit 96e15a7e8e1a · 2025-08-13T09:07:42.000+02:00
Improve maintainability by removing magic numbers and implementing a few fixes
diff --git a/basic_pitch/callbacks.py b/basic_pitch/callbacks.py
@@ -23,6 +23,8 @@
 
 from basic_pitch import visualize
 
+TENSORBOARD_LOGS_SUBDIR = "tensorboard_logs"
+
 
 class VisualizeCallback(tf.keras.callbacks.Callback):
     # TODO RACHEL make this WAY faster
@@ -48,7 +50,7 @@ def __init__(
         super().__init__()
         self.train_iter = iter(train_ds)
         self.validation_iter = iter(validation_ds)
-        self.tensorboard_dir = os.path.join(tensorboard_dir, "tensorboard_logs")
+        self.tensorboard_dir = os.path.join(tensorboard_dir, TENSORBOARD_LOGS_SUBDIR)
         self.file_writer = tf.summary.create_file_writer(tensorboard_dir)
         self.sonify = sonify
         self.contours = contours
diff --git a/basic_pitch/commandline_printing.py b/basic_pitch/commandline_printing.py
@@ -23,6 +23,8 @@
 
 TF_LOG_LEVEL_KEY = "TF_CPP_MIN_LOG_LEVEL"
 TF_LOG_LEVEL_NO_WARNINGS_VALUE = "3"
+DEFAULT_PRINT_INDENT = "  "
+
 s_print_lock = threading.Lock()
 OUTPUT_EMOJIS = {
     "MIDI": "💅",
@@ -39,8 +41,7 @@ def generating_file_message(output_type: str) -> None:
         output_type: string indicating which kind of file is being generated
 
     """
-    print(f"\n\n  Creating {output_type.replace('_', ' ').lower()}...")
-
+    print(f"\n\n{DEFAULT_PRINT_INDENT}Creating {output_type.replace('_', ' ').lower()}...")
 
 def file_saved_confirmation(output_type: str, save_path: Union[pathlib.Path, str]) -> None:
     """Print a confirmation that the file was saved succesfully
@@ -50,8 +51,8 @@ def file_saved_confirmation(output_type: str, save_path: Union[pathlib.Path, str
         save_path: The path to output file.
 
     """
-    print(f"  {OUTPUT_EMOJIS[output_type]} Saved to {save_path}")
-
+    emoji = OUTPUT_EMOJIS.get(output_type, "")
+    print(f"{DEFAULT_PRINT_INDENT}{emoji} Saved to {save_path}")
 
 def failed_to_save(output_type: str, save_path: Union[pathlib.Path, str]) -> None:
     """Print a failure to save message
@@ -63,13 +64,14 @@ def failed_to_save(output_type: str, save_path: Union[pathlib.Path, str]) -> Non
     """
     print(f"\n🚨 Failed to save {output_type.replace('_', ' ').lower()} to {save_path} \n")
 
-
 @contextmanager
 def no_tf_warnings() -> Iterator[None]:
     """
     Supress tensorflow warnings in this context
     """
     tf_logging_level = os.environ.get(TF_LOG_LEVEL_KEY, TF_LOG_LEVEL_NO_WARNINGS_VALUE)
     os.environ[TF_LOG_LEVEL_KEY] = TF_LOG_LEVEL_NO_WARNINGS_VALUE
-    yield
-    os.environ[TF_LOG_LEVEL_KEY] = tf_logging_level
+    try:
+        yield
+    finally:
+        os.environ[TF_LOG_LEVEL_KEY] = tf_logging_level
diff --git a/basic_pitch/constants.py b/basic_pitch/constants.py
@@ -19,6 +19,9 @@
 
 from enum import Enum
 
+
+SEMITONES_PER_OCTAVE = 12  # semitones in one octave; used when computing frequency bin spacing
+
 FFT_HOP = 256
 N_FFT = 8 * FFT_HOP
 
@@ -54,7 +57,7 @@
 
 
 def _freq_bins(bins_per_semitone: int, base_frequency: float, n_semitones: int) -> np.array:
-    d = 2.0 ** (1.0 / (12 * bins_per_semitone))
+    d = 2.0 ** (1.0 / (SEMITONES_PER_OCTAVE * bins_per_semitone))
     bin_freqs = base_frequency * d ** np.arange(bins_per_semitone * n_semitones)
     return bin_freqs
 
diff --git a/basic_pitch/inference.py b/basic_pitch/inference.py
@@ -181,6 +181,14 @@ def predict(self, x: npt.NDArray[np.float32]) -> Dict[str, npt.NDArray[np.float3
             }
 
 
+DEFAULT_ONSET_THRESHOLD = 0.5
+DEFAULT_FRAME_THRESHOLD = 0.3
+DEFAULT_MINIMUM_NOTE_LENGTH_MS = 127.7
+DEFAULT_MINIMUM_MIDI_TEMPO = 120
+DEFAULT_SONIFICATION_SAMPLERATE = 44100
+DEFAULT_OVERLAPPING_FRAMES = 30
+DEFAULT_MIDI_VELOCITY_SCALE = 127
+
 def window_audio_file(
     audio_original: npt.NDArray[np.float32], hop_size: int
 ) -> Iterable[Tuple[npt.NDArray[np.float32], Dict[str, float]]]:
@@ -284,7 +292,7 @@ def run_inference(
         model = Model(model_or_model_path)
 
     # overlap 30 frames
-    n_overlapping_frames = 30
+    n_overlapping_frames = DEFAULT_OVERLAPPING_FRAMES
     overlap_len = n_overlapping_frames * FFT_HOP
     hop_size = AUDIO_N_SAMPLES - overlap_len
 
@@ -405,7 +413,7 @@ def save_note_events(
         writer = csv.writer(fhandle, delimiter=",")
         writer.writerow(["start_time_s", "end_time_s", "pitch_midi", "velocity", "pitch_bend"])
         for start_time, end_time, note_number, amplitude, pitch_bend in note_events:
-            row = [start_time, end_time, note_number, int(np.round(127 * amplitude))]
+            row = [start_time, end_time, note_number, int(np.round(DEFAULT_MIDI_VELOCITY_SCALE * amplitude))]
             if pitch_bend:
                 row.extend(pitch_bend)
             writer.writerow(row)
@@ -414,15 +422,15 @@ def save_note_events(
 def predict(
     audio_path: Union[pathlib.Path, str],
     model_or_model_path: Union[Model, pathlib.Path, str] = ICASSP_2022_MODEL_PATH,
-    onset_threshold: float = 0.5,
-    frame_threshold: float = 0.3,
-    minimum_note_length: float = 127.70,
+    onset_threshold: float = DEFAULT_ONSET_THRESHOLD,
+    frame_threshold: float = DEFAULT_FRAME_THRESHOLD,
+    minimum_note_length: float = DEFAULT_MINIMUM_NOTE_LENGTH_MS,
     minimum_frequency: Optional[float] = None,
     maximum_frequency: Optional[float] = None,
     multiple_pitch_bends: bool = False,
     melodia_trick: bool = True,
     debug_file: Optional[pathlib.Path] = None,
-    midi_tempo: float = 120,
+    midi_tempo: float = DEFAULT_MINIMUM_MIDI_TEMPO,
 ) -> Tuple[
     Dict[str, np.array],
     pretty_midi.PrettyMIDI,
@@ -497,16 +505,16 @@ def predict_and_save(
     save_model_outputs: bool,
     save_notes: bool,
     model_or_model_path: Union[Model, str, pathlib.Path],
-    onset_threshold: float = 0.5,
-    frame_threshold: float = 0.3,
-    minimum_note_length: float = 127.70,
+    onset_threshold: float = DEFAULT_ONSET_THRESHOLD,
+    frame_threshold: float = DEFAULT_FRAME_THRESHOLD,
+    minimum_note_length: float = DEFAULT_MINIMUM_NOTE_LENGTH_MS,
     minimum_frequency: Optional[float] = None,
     maximum_frequency: Optional[float] = None,
     multiple_pitch_bends: bool = False,
     melodia_trick: bool = True,
     debug_file: Optional[pathlib.Path] = None,
-    sonification_samplerate: int = 44100,
-    midi_tempo: float = 120,
+    sonification_samplerate: int = DEFAULT_SONIFICATION_SAMPLERATE,
+    midi_tempo: float = DEFAULT_MINIMUM_MIDI_TEMPO,
 ) -> None:
     """Make a prediction and save the results to file.
 
diff --git a/basic_pitch/layers/nnaudio.py b/basic_pitch/layers/nnaudio.py
@@ -26,12 +26,27 @@
 
 import scipy.signal
 
+DEFAULT_BAND_CENTER = 0.5
+DEFAULT_KERNEL_LENGTH = 256
+DEFAULT_TRANSITION_BANDWIDTH = 0.03
+DEFAULT_DTYPE = tf.float32
+DEFAULT_WINDOW_BANDWIDTH = 1.5
+DEFAULT_CQT_HOP_LENGTH = 512
+DEFAULT_CQT_FMIN = 32.70
+DEFAULT_CQT_N_BINS = 84
+DEFAULT_CQT_BINS_PER_OCTAVE = 12
+DEFAULT_CQT_BASIS_NORM = 1
+DEFAULT_CQT_WINDOW = "hann"
+DEFAULT_CQT_PAD_MODE = "reflect"
+DEFAULT_CQT_OUTPUT_FORMAT = "Magnitude"
+DEFAULT_LOW_PASS_TRANSITION_BANDWIDTH = 0.001
+
 
 def create_lowpass_filter(
-    band_center: float = 0.5,
-    kernel_length: int = 256,
-    transition_bandwidth: float = 0.03,
-    dtype: tf.dtypes.DType = tf.float32,
+    band_center: float = DEFAULT_BAND_CENTER,
+    kernel_length: int = DEFAULT_KERNEL_LENGTH,
+    transition_bandwidth: float = DEFAULT_TRANSITION_BANDWIDTH,
+    dtype: tf.dtypes.DType = DEFAULT_DTYPE,
 ) -> np.ndarray:
     """
     Calculate the highest frequency we need to preserve and the lowest frequency we allow
@@ -106,15 +121,15 @@ def get_early_downsample_params(
 ) -> Tuple[Union[float, int], int, float, np.array, bool]:
     """Compute downsampling parameters used for early downsampling"""
 
-    window_bandwidth = 1.5  # for hann window
+    window_bandwidth = DEFAULT_WINDOW_BANDWIDTH  # for hann window
     filter_cutoff = fmax_t * (1 + 0.5 * window_bandwidth / Q)
     sr, hop_length, downsample_factor = early_downsample(sr, hop_length, n_octaves, sr // 2, filter_cutoff)
     if downsample_factor != 1:
         earlydownsample = True
         early_downsample_filter = create_lowpass_filter(
             band_center=1 / downsample_factor,
-            kernel_length=256,
-            transition_bandwidth=0.03,
+            kernel_length=DEFAULT_KERNEL_LENGTH,
+            transition_bandwidth=DEFAULT_TRANSITION_BANDWIDTH,
             dtype=dtype,
         )
     else:
@@ -455,19 +470,19 @@ class CQT2010v2(tf.keras.layers.Layer):
     def __init__(
         self,
         sr: int = 22050,
-        hop_length: int = 512,
-        fmin: float = 32.70,
+        hop_length: int = DEFAULT_CQT_HOP_LENGTH,
+        fmin: float = DEFAULT_CQT_FMIN,
         fmax: Optional[float] = None,
-        n_bins: int = 84,
+        n_bins: int = DEFAULT_CQT_N_BINS,
         filter_scale: int = 1,
-        bins_per_octave: int = 12,
+        bins_per_octave: int = DEFAULT_CQT_BINS_PER_OCTAVE,
         norm: bool = True,
-        basis_norm: int = 1,
-        window: str = "hann",
-        pad_mode: str = "reflect",
+        basis_norm: int = DEFAULT_CQT_BASIS_NORM,
+        window: str = DEFAULT_CQT_WINDOW,
+        pad_mode: str = DEFAULT_CQT_PAD_MODE,
         earlydownsample: bool = True,
         trainable: bool = False,
-        output_format: str = "Magnitude",
+        output_format: str = DEFAULT_CQT_OUTPUT_FORMAT,
         match_torch_exactly: bool = True,
     ):
         super().__init__()
@@ -516,7 +531,11 @@ def build(self, input_shape: tf.TensorShape) -> None:
         # This will be used to calculate filter_cutoff and creating CQT kernels
         Q = float(self.filter_scale) / (2 ** (1 / self.bins_per_octave) - 1)
 
-        self.lowpass_filter = create_lowpass_filter(band_center=0.5, kernel_length=256, transition_bandwidth=0.001)
+        self.lowpass_filter = create_lowpass_filter(
+            band_center=DEFAULT_BAND_CENTER,
+            kernel_length=DEFAULT_KERNEL_LENGTH,
+            transition_bandwidth=DEFAULT_LOW_PASS_TRANSITION_BANDWIDTH,
+        )
 
         # Calculate num of filter requires for the kernel
         # n_octaves determines how many resampling requires for the CQT
diff --git a/basic_pitch/models.py b/basic_pitch/models.py
@@ -35,6 +35,22 @@
 
 MAX_N_SEMITONES = int(np.floor(12.0 * np.log2(0.5 * AUDIO_SAMPLE_RATE / ANNOTATIONS_BASE_FREQUENCY)))
 
+DEFAULT_LABEL_SMOOTHING = 0.2
+DEFAULT_POSITIVE_WEIGHT = 0.5
+
+CONTOUR_KERNEL_SIZE_1 = (5, 5)
+CONTOUR_KERNEL_SIZE_2 = (3, 39)  # 39 == 3 * 13, the expression this constant replaces
+CONTOUR_KERNEL_SIZE_3 = (5, 5)
+CONTOUR_FILTERS_2 = 8
+
+NOTES_KERNEL_SIZE_1 = (7, 7)
+NOTES_STRIDES_1 = (1, 3)
+NOTES_KERNEL_SIZE_2 = (7, 3)
+
+ONSET_KERNEL_SIZE_1 = (5, 5)
+ONSET_STRIDES_1 = (1, 3)
+ONSET_KERNEL_SIZE_2 = (3, 3)
+
 
 def transcription_loss(y_true: tf.Tensor, y_pred: tf.Tensor, label_smoothing: float) -> tf.Tensor:
     """Really a binary cross entropy loss. Used to calculate the loss between the predicted
@@ -103,7 +119,7 @@ def onset_loss(
     return lambda x, y: transcription_loss(x, y, label_smoothing=label_smoothing)
 
 
-def loss(label_smoothing: float = 0.2, weighted: bool = False, positive_weight: float = 0.5) -> Dict[str, Any]:
+def loss(label_smoothing: float = DEFAULT_LABEL_SMOOTHING, weighted: bool = False, positive_weight: float = DEFAULT_POSITIVE_WEIGHT) -> Dict[str, Any]:
     """Creates a keras-compatible dictionary of loss functions to calculate
     the loss for the contour, note and onset posteriorgrams.
 
@@ -206,7 +222,7 @@ def model(
     # contour layers - fully convolutional
     x_contours = tfkl.Conv2D(
         n_filters_contour,
-        (5, 5),
+        CONTOUR_KERNEL_SIZE_1,
         padding="same",
         kernel_initializer=_initializer(),
         kernel_constraint=_kernel_constraint(),
@@ -216,8 +232,8 @@ def model(
     x_contours = tfkl.ReLU()(x_contours)
 
     x_contours = tfkl.Conv2D(
-        8,
-        (3, 3 * 13),
+        CONTOUR_FILTERS_2,
+        CONTOUR_KERNEL_SIZE_2,
         padding="same",
         kernel_initializer=_initializer(),
         kernel_constraint=_kernel_constraint(),
@@ -230,7 +246,7 @@ def model(
         contour_name = "contour"
         x_contours = tfkl.Conv2D(
             1,
-            (5, 5),
+            CONTOUR_KERNEL_SIZE_3,
             padding="same",
             activation="sigmoid",
             kernel_initializer=_initializer(),
@@ -246,9 +262,9 @@ def model(
 
     x_contours_reduced = tfkl.Conv2D(
         n_filters_notes,
-        (7, 7),
+        NOTES_KERNEL_SIZE_1,
         padding="same",
-        strides=(1, 3),
+        strides=NOTES_STRIDES_1,
         kernel_initializer=_initializer(),
         kernel_constraint=_kernel_constraint(),
     )(x_contours_reduced)
@@ -258,7 +274,7 @@ def model(
     note_name = "note"
     x_notes_pre = tfkl.Conv2D(
         1,
-        (7, 3),
+        NOTES_KERNEL_SIZE_2,
         padding="same",
         kernel_initializer=_initializer(),
         kernel_constraint=_kernel_constraint(),
@@ -271,9 +287,9 @@ def model(
     # onsets - fully convolutional
     x_onset = tfkl.Conv2D(
         n_filters_onsets,
-        (5, 5),
+        ONSET_KERNEL_SIZE_1,
         padding="same",
-        strides=(1, 3),
+        strides=ONSET_STRIDES_1,
         kernel_initializer=_initializer(),
         kernel_constraint=_kernel_constraint(),
     )(x)
@@ -282,7 +298,7 @@ def model(
     x_onset = tfkl.Concatenate(axis=3, name="concat")([x_notes_pre, x_onset])
     x_onset = tfkl.Conv2D(
         1,
-        (3, 3),
+        ONSET_KERNEL_SIZE_2,
         padding="same",
         activation="sigmoid",
         kernel_initializer=_initializer(),
diff --git a/basic_pitch/nn.py b/basic_pitch/nn.py
@@ -22,6 +22,8 @@
 
 from basic_pitch.layers.math import log_base_b
 
+SEMITONES_PER_OCTAVE = 12
+
 
 class HarmonicStacking(tf.keras.layers.Layer):
     """Harmonic stacking layer
@@ -47,7 +49,7 @@ def __init__(
         self.bins_per_semitone = bins_per_semitone
         self.harmonics = harmonics
         self.shifts = [
-            int(tf.math.round(12.0 * self.bins_per_semitone * log_base_b(float(h), 2))) for h in self.harmonics
+            int(tf.math.round(SEMITONES_PER_OCTAVE * self.bins_per_semitone * log_base_b(float(h), 2))) for h in self.harmonics
         ]
         self.n_output_freqs = n_output_freqs
 
@@ -96,7 +98,7 @@ def call(self, x: tf.Tensor) -> tf.Tensor:
         """x: (batch, time, ch)"""
         shapes = K.int_shape(x)
         tf.assert_equal(shapes[2], 1)
-        return tf.keras.layers.Reshape([shapes[1]])(x)  # ignore batch size
+        return tf.squeeze(x, axis=2)
 
 
 class FlattenFreqCh(tf.keras.layers.Layer):
@@ -109,4 +111,8 @@ class FlattenFreqCh(tf.keras.layers.Layer):
 
     def call(self, x: tf.Tensor) -> tf.Tensor:
         shapes = K.int_shape(x)
-        return tf.keras.layers.Reshape([shapes[1], shapes[2] * shapes[3]])(x)  # ignore batch size
+        batch_size = tf.shape(x)[0]
+        time_dim = shapes[1]
+        freq_dim = shapes[2]
+        ch_dim = shapes[3]
+        return tf.reshape(x, [batch_size, time_dim, freq_dim * ch_dim])
diff --git a/basic_pitch/note_creation.py b/basic_pitch/note_creation.py