@@ -14,9 +14,8 @@ def _get_detector():
1414 global _swift_f0_detector
1515 if _swift_f0_detector is None :
1616 # Initialize for general music/speech (wide frequency range) fmin=46.875, fmax=2093.75
17- # fixme: is this correct?
1817 # For speech only: fmin=65, fmax=400
19- _swift_f0_detector = SwiftF0 (fmin = 65 , fmax = 400 , confidence_threshold = 0.9 )
18+ _swift_f0_detector = SwiftF0 (fmin = 46.875 , fmax = 2093.75 , confidence_threshold = 0.9 )
2019 return _swift_f0_detector
2120
2221
@@ -34,9 +33,21 @@ def get_pitch_with_file(
3433 if len (audio .shape ) > 1 :
3534 audio = np .mean (audio , axis = 1 )
3635
37- # Normalize audio to float if needed
38- if audio .dtype != np .float32 and audio .dtype != np .float64 :
39- audio = audio .astype (np .float32 ) / (2 ** 15 )
36+ # Normalize audio to float32 based on dtype
37+ if audio .dtype == np .uint8 :
38+ # uint8: range [0, 255] -> subtract 128 and divide by 128
39+ audio = (audio .astype (np .float32 ) - 128.0 ) / 128.0
40+ elif audio .dtype in [np .int16 , np .int32 , np .int64 ]:
41+ # Signed integers: use iinfo to get max value and normalize
42+ dtype_info = np .iinfo (audio .dtype )
43+ max_val = max (abs (dtype_info .min ), abs (dtype_info .max ))
44+ audio = audio .astype (np .float32 ) / float (max_val )
45+ elif audio .dtype == np .float64 :
46+ # float64: cast to float32
47+ audio = audio .astype (np .float32 )
48+ elif audio .dtype != np .float32 :
49+ # Fallback for other types: assume int16 range
50+ audio = audio .astype (np .float32 ) / 32768.0
4051
4152 return get_pitch_with_swift_f0 (audio , sample_rate )
4253
0 commit comments