
    SܶiQ                        d dl Z d dlmZ d dlmZmZ d dlmZmZ d dl	Z
d dlZd dlmc mZ ddlmZ dZdZd	Zd
Zeez  Z eee          Zedz  Z eee          Z eee          ZefdedefdZefdddedefdZ ed          dedej        fd            Z 	 	 	 ddeee
j!        ej        f         dededeeeej"        f                  fdZ#dS )    N)	lru_cache)CalledProcessErrorrun)OptionalUnion   )	exact_divi>  i           filesrc                    ddddd| dddd	d
ddt          |          dg}	 t          |dd          j        }n<# t          $ r/}t	          d|j                                                   |d}~ww xY wt          j        |t          j	                  
                                                    t          j                  dz  S )a?  
    Open an audio file and read as mono waveform, resampling as necessary

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate to resample the audio if necessary

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    ffmpegz-nostdinz-threads0z-iz-fs16lez-ac1z-acodec	pcm_s16lez-ar-T)capture_outputcheckzFailed to load audio: Ng      @)strr   stdoutr   RuntimeErrorstderrdecodenp
frombufferint16flattenastypefloat32)r   r   cmdoutes        P/root/projects/openclaw-proxy/venv/lib/python3.11/site-packages/whisper/audio.py
load_audior'      s    * 	Cdgs;s2ww
CP#d$777> P P PGAHOO4E4EGGHHaOP =bh''//1188DDwNNs   8 
A1*A,,A1)axislengthr)   c                Z   t          j        |           r| j        |         |k    r0|                     |t          j        || j                            } | j        |         |k     rHdg| j        z  }d|| j        |         z
  f||<   t          j        | d |ddd         D                       } n{| j        |         |k    r$| 	                    t          |          |          } | j        |         |k     r5dg| j        z  }d|| j        |         z
  f||<   t          j        | |          } | S )	zO
    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
    )device)dimindex)r   r   r   c                     g | ]	}|D ]}|
S  r0   ).0sizespads      r&   
<listcomp>zpad_or_trim.<locals>.<listcomp>N   s%    !U!U!U%u!U!U#!U!U!U!U    Nr(   )indicesr)   )torch	is_tensorshapeindex_selectaranger,   ndimFr3   takeranger   )arrayr*   r)   
pad_widthss       r&   pad_or_trimrB   A   s?    u .;tv%%&&VEL I I I '  E ;tv%% EJ.J !6EK,=#=>JtE%!U!U:ddd3C!U!U!UVVE;tv%%JJuV}}4J@@E;tv%% EJ.J !6EK,=#=>JtF5*--ELr5   )maxsizen_melsreturnc                 j   |dv sJ d|             t           j                            t           j                            t                    dd          }t          j        |d          5 }t          j        |d|                    	                    |           cddd           S # 1 swxY w Y   dS )	ad  
    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
            mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
        )
    >   P      zUnsupported n_mels: assetszmel_filters.npzF)allow_picklemel_N)
ospathjoindirname__file__r   loadr7   
from_numpyto)r,   rD   filters_pathfs       r&   mel_filtersrV   [   s     Y ?v ? ?7<< 9 98EVWWL	E	2	2	2 ?a/// 23366v>>? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?s   +0B((B,/B,rG   audiopaddingr,   c                    t          j        |           s8t          | t                    rt	          |           } t          j        |           } ||                     |          } |dk    rt          j        | d|f          } t          j	        t                                        | j                  }t          j        | t          t          |d          }|dddf                                         dz  }t          | j        |          }||z  }t          j        |d	                                          }	t          j        |	|	                                d
z
            }	|	dz   dz  }	|	S )a}  
    Compute the log-Mel spectrogram of

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
        The number of Mel-frequency filters, only 80 and 128 are supported

    padding: int
        Number of zero samples to pad to the right

    device: Optional[Union[str, torch.device]]
        If given, the audio tensor is moved to this device before STFT

    Returns
    -------
    torch.Tensor, shape = (n_mels, n_frames)
        A Tensor that contains the Mel spectrogram
    Nr   T)windowreturn_complex.r(   r   g|=)ming       @g      @)r7   r8   
isinstancer   r'   rR   rS   r=   r3   hann_windowN_FFTr,   stft
HOP_LENGTHabsrV   clamplog10maximummax)
rW   rD   rX   r,   rZ   r`   
magnitudesfiltersmel_speclog_specs
             r&   log_mel_spectrogramrk   n   sB   8 ?5!! (eS!! 	&u%%E ''  {{ea\**u%%((66F:eUJvdSSSDc3B3h##%%*J%,//G#H{8///5577H}Xx||~~';<<H3#%HOr5   )rG   r   N)$rL   	functoolsr   
subprocessr   r   typingr   r   numpyr   r7   torch.nn.functionalnn
functionalr=   utilsr	   SAMPLE_RATEr_   ra   CHUNK_LENGTH	N_SAMPLESN_FRAMESN_SAMPLES_PER_TOKENFRAMES_PER_SECONDTOKENS_PER_SECONDr   intr'   rB   TensorrV   ndarrayr,   rk   r0   r5   r&   <module>r~      s   				       . . . . . . . . " " " " " " " "                     
;&	9Y
++ 1n Ik:66 Ik+>??  %0 %O %OS %Oc %O %O %O %OP &/ r   s     4 4? ? ? ? ? ?( 15	/ /bj%,./// / U3,-.	/ / / / / /r5   