11import { Processor } from '../../base/processing_utils.js' ;
2- import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js' ;
3- import { max , softmax } from '../../utils/maths.js' ;
2+ import { PyAnnoteFeatureExtractor } from './feature_extraction_pyannote.js' ;
43
54export class PyAnnoteProcessor extends Processor {
6- static feature_extractor_class = AutoFeatureExtractor
5+ static feature_extractor_class = PyAnnoteFeatureExtractor
76
87 /**
98 * Calls the feature_extractor function with the given audio input.
@@ -14,58 +13,12 @@ export class PyAnnoteProcessor extends Processor {
1413 return await this . feature_extractor ( audio )
1514 }
1615
17- /**
18- * NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
19- * @param {number } samples The number of frames in the audio.
20- * @returns {number } The number of frames in the audio.
21- */
22- samples_to_frames ( samples ) {
23- return ( ( samples - this . config . offset ) / this . config . step ) ;
16+ /** @type {PyAnnoteFeatureExtractor['post_process_speaker_diarization'] } */
17+ post_process_speaker_diarization ( ...args ) {
18+ return /** @type {PyAnnoteFeatureExtractor } */ ( this . feature_extractor ) . post_process_speaker_diarization ( ...args ) ;
2419 }
2520
26- /**
27- * Post-processes the speaker diarization logits output by the model.
28- * @param {import('../../utils/tensor.js').Tensor } logits The speaker diarization logits output by the model.
29- * @param {number } num_samples Number of samples in the input audio.
30- * @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>> } The post-processed speaker diarization results.
31- */
32- post_process_speaker_diarization ( logits , num_samples ) {
33- const ratio = (
34- num_samples / this . samples_to_frames ( num_samples )
35- ) / this . config . sampling_rate ;
36-
37- const results = [ ] ;
38- for ( const scores of logits . tolist ( ) ) {
39- const accumulated_segments = [ ] ;
40-
41- let current_speaker = - 1 ;
42- for ( let i = 0 ; i < scores . length ; ++ i ) {
43- const probabilities = softmax ( scores [ i ] ) ;
44- const [ score , id ] = max ( probabilities ) ;
45- const [ start , end ] = [ i , i + 1 ] ;
46-
47- if ( id !== current_speaker ) {
48- // Speaker has changed
49- current_speaker = id ;
50- accumulated_segments . push ( { id, start, end, score } ) ;
51- } else {
52- // Continue the current segment
53- accumulated_segments . at ( - 1 ) . end = end ;
54- accumulated_segments . at ( - 1 ) . score += score ;
55- }
56- }
57-
58- results . push ( accumulated_segments . map (
59- // Convert frame-space to time-space
60- // and compute the confidence
61- ( { id, start, end, score } ) => ( {
62- id,
63- start : start * ratio ,
64- end : end * ratio ,
65- confidence : score / ( end - start ) ,
66- } )
67- ) ) ;
68- }
69- return results ;
21+ get sampling_rate ( ) {
22+ return this . feature_extractor . config . sampling_rate ;
7023 }
7124}
0 commit comments