
    bi                        d dl Z d dlZd dlmZ d dlmZmZmZ ddlm	Z	m
Z
  e
j        e          Z e	            rd dlZerddlmZ d Z	 	 	 	 dded	         d
ed         dedz  dedz  dedef         f
dZ	 	 	 	 dded	         d
ed         dedz  dedz  dedef         f
dZ	 	 	 ddd	d
ed         dedz  dedz  dedef         f
dZ	 	 	 ddd	d
ed         dedz  dedz  dedef         f
dZ	 	 	 ddd	d
ed         dedz  dedz  dedef         f
dZeeeeedZ G d ded          Z G d d          Zd dededz  fdZdS )!    Nwraps)TYPE_CHECKINGOptional	TypedDict   )is_torch_availablelogging)PreTrainedConfigc                 V     ddddt                     d fd	            }|S )ad  
    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).

    Args:
        rope_forward (Callable):
            The forward pass of the RoPE implementation.

    Returns:
        The decorated forward pass.
    Nc                 b   t          j        |          dz   }|#| j        }| j        }d}| j        j        d         }n=| j        |         }t          | | d          }| d}| j        j        |         d         }||k    rkt          | | d          s't          |         }	 |	| j        ||dz   |          \  }
}| 	                    | d	|
d
           t          | | d|
           dS |                    |          }| 	                    | d	|d
           t          | | d|           dS )zbLongrope uses long factor if sequence is larger than original pretraining length, short otherwise.r   N  original_max_position_embeddings_original_inv_freq__long_inv_freqseq_len
layer_typeinv_freqF
persistentlong_inv_freqoriginal_inv_freq)torchmax	rope_typer   configrope_parametersgetattrhasattrROPE_INIT_FUNCTIONSregister_buffersetattrto)selfposition_idsdevicer   r   r   r   prefixr   rope_init_fnr   r   s               [/root/projects/butler/venv/lib/python3.11/site-packages/transformers/modeling_rope_utils.pylongrope_frequency_updatez6dynamic_rope_update.<locals>.longrope_frequency_update.   s   )L))A-I $ 6F/3{/JKm/n,,z2I '.O.O.O P P"%%%F/3{/J:/V20, 5554J!>!>!>?? 29=#/<K<q@)	$ $ $ q   F!4!4!4mPU VVVDV222MBBBBB !2 4 4V < <  F!4!4!46GTY ZZZDV6668IJJJJJ    c                    t          j        |          dz   }|| j        }| j        }| j        }d}n>| j        |         }t          | | d| j                  }t          | | d          }| d}||k    rXt          |         }	 |	| j        |||          \  }
| _        | 	                    | d|
d	
           t          | | d|           || j        k     rj|| j        k    ra|                    |          }| 	                    | d|d	
           t          | | d|           t          | | d| j                   dS dS dS )a  
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        r   Nr   _max_seq_len_cachedr   r   r   r   Fr   r   )r   r   r   max_seq_len_cachedr   r    r"   r   attention_scalingr#   r$   original_max_seq_lenr%   )r&   r'   r(   r   r   r   r0   r   r)   r*   r   s              r+   dynamic_frequency_updatez5dynamic_rope_update.<locals>.dynamic_frequency_updateQ   s    )L))A-I!%!8 $ 6FFz2I!(*/Q/Q/QSWSj!k!k '.O.O.O P P"%%%F'''.y9L/;|%	0 0 0,Hd,   F!4!4!4h5 QQQDZ<<<gFFFT...3EHa3a3a !2 4 4V < <  F!4!4!46GTY ZZZDV6668IJJJDZ<<<d>WXXXXX /.3a3ar-   c                     || j         n| j         |         }|d|ini }d|v r | |fd|j        i| n|dk    r | |fd|j        i|  | ||fi |S )Nr   dynamicr(   longrope)r   r(   )	r&   xr'   r   r   kwargsr3   r,   rope_forwards	         r+   wrapperz$dynamic_rope_update.<locals>.wrapperw   s    &0&8DNNdnZ>X	/9/E,
++2	!!$$T<SSSFSSSS*$$%%dLTTTVTTT|D!\<<V<<<r-   Nr   )r9   r:   r3   r,   s   ` @@r+   dynamic_rope_updater<   !   s{    !K !K !K !KF$Y $Y $Y $YL <= = = = = = = = Nr-   r   r   r(   ztorch.devicer   r   returnztorch.Tensorc                    |                                   || j        |         n| j        }|d         }|d         }|                    dd          }t          | dd          p| j        | j        z  }t          ||z            }	d}
d|t          j        d|	dt          j	        	          
                    |t          j        
          |	z  z  z  }||z  }||
fS )a  
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    Nfactor
rope_thetapartial_rotary_factor      ?head_dimr      dtyper(   rF   )standardize_rope_paramsr   getr    hidden_sizenum_attention_headsintr   arangeint64r%   float)r   r(   r   r   rope_parameters_dictr?   baserA   rC   dimattention_factorr   s               r+   '_compute_linear_scaling_rope_parametersrT      s    B ""$$$AKAW61*==]c]s!(+F  -D0445LcRRvz400dF4F&Jd4dH
h..
/
/C du|AsAU[IIILLTZbgbmLnnqttuvH
 H%%%r-   c                    |                                   || j        |         n| j        }|d         }|                    dd          }t          | d| j        | j        z            }t          ||z            }|d         }	d}
|| j        }nit          |t          j
                  r:t          j        |t          j        | j        |j        |j                            }nt          || j                  }||	|z  | j        z  |	dz
  z
  ||d	z
  z  z  z  }d|t          j        d
|d	t          j                                      |t          j                  |z  z  z  }||
fS )a	  
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The default sequence length used to update the dynamic RoPE at
                inference time
            *   rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which `factor`
                will be accessed. The value of `factor` is used to determine the new base frequency, along with the
                current sequence length (seq_len), the maximum positional embeddings (max_position_embeddings), and the
                computed dimensionality (dim) of the rotary embeddings. If seq_len <= max_position_embeddings, this
                factor has no effect. If seq_len <= max_position_embeddings, this factor effectively stretches the
                context window using an exponent derived from `dim`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time. If `None` or shorter than
            max_position_embeddings, this value will be overridden by max_position_embeddings.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    Nr@   rA   rB   rC   r?   rF   r(   r   rD   r   rE   rG   )rH   r   rI   r    rJ   rK   rL   max_position_embeddings
isinstancer   TensormaximumtensorrF   r(   r   rM   rN   r%   rO   )r   r(   r   r   rP   rQ   rA   rC   rR   r?   rS   r   s               r+   _compute_dynamic_ntk_parametersr\      s   V ""$$$AKAW61*==]c]s-D0445LcRRvz6+=A[+[\\H
h..
/
/C!(+F 0	GU\	*	* ?-L7w}U\Ucddd
 

 gv=>> FW$v'EE&ST*U[^behibi[jkkDdu|AsAU[IIILLTZbgbmLnnqttuvH%%%r-   c                    |                                   || j        |         n| j        }|d         }|                    dd          }t          | d| j        | j        z            }t          ||z            }|d         }	|                    d          }
|                    d          }|                    d	          }|d
         }|	
| j        |z  }	dd}|
6|r)|r't           ||	|           ||	|          z            }
n ||	          }
|                    d          pd}|                    d          pd}d fd}d }|t          j
        d|d                              |t          j                  |z  z  }d|z  }d|	|z  z  }| j                            dd          } |||||||          \  }}d ||||dz                                |t          j                  z
  }|d|z
  z  ||z  z   }||
fS )a	  
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://huggingface.co/papers/2309.00071)

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied to the computed cos/sin.
                    If None, the value is inferred from `factor`, `mscale`, and `mscale_all_dim` as available.
                *   `beta_fast` (`float`, *optional*, defaults to 32): Parameter to set the boundary for extrapolation
                    (only) in the linear ramp function.
                *   `beta_slow` (`float`, *optional*, defaults to 1): Parameter to set the boundary for interpolation
                    (only) in the linear ramp function.
                *   `factor` (`float`, *optional*): The scaling factor applied when interpolating the position IDs to
                    extend the possible context length. Additionally, if `attention_factor` is None, the log of this
                    value is used to compute a value for `attention_factor`, possibly in conjunciton with `mscale` and
                    `mscale_all_dim`, if provided.
                *   `mscale` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale` acts scalar augmenting `log(factor)` when computing the
                    numerator for the inferred value of `attention_factor`. If not provided, `attention_factor` will be
                    calculated based on `factor` only.
                *   `mscale_all_dim` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale_all_dim` acts scalar augmenting `log(factor)` when computing
                    the denominator for the inferred value of `attention_factor`. If not provided, `attention_factor`
                    will be calculated based on `factor` only.
                *   `original_max_position_embeddings` (`int`): The original max position embeddings used during pretraining.
                *   `truncate` (`bool`, *optional*): Whether to truncate the correction range.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    Nr@   rA   rB   rC   r?   rS   mscalemscale_all_dimr   r   c                 L    | dk    rdS d|z  t          j        |           z  dz   S )Nr   rB   g?)mathlog)scaler^   s     r+   
get_mscalez,_compute_yarn_parameters.<locals>.get_mscaleN  s,    A::3V|dhuoo-33r-   	beta_fast    	beta_slowc                     |t          j        || dz  t           j        z  z            z  dt          j        |          z  z  S )zPInverse dimension formula to find the dimension based on the number of rotationsrD   )ra   rb   pi)num_rotationsrR   rQ   rW   s       r+   find_correction_dimz5_compute_yarn_parameters.<locals>.find_correction_dim`  sA    dh6-!:Kdg:UVWWW\]`d`him`n`n\noor-   c                      | |||          } ||||          }|r(t          j        |          }t          j        |          }t          |d          t	          ||dz
            fS )z.Find dimension range bounds based on rotationsr   r   )ra   floorceilr   min)	low_rothigh_rotrR   rQ   rW   truncatelowhighrk   s	           r+   find_correction_rangez7_compute_yarn_parameters.<locals>.find_correction_ranged  st    !!'36MNN""8S$8OPP 	#*S//C9T??D3{{CcAg....r-   c                     | |k    r|dz  }t          j        |t           j                  | z
  || z
  z  }t          j        |dd          }|S )NgMbP?rE   r   r   )r   rM   float32clamp)ro   r   rR   linear_func	ramp_funcs        r+   linear_ramp_factorz4_compute_yarn_parameters.<locals>.linear_ramp_factorm  sQ    #::5LC|Cu}===Cc	RKQ22	r-   r   rD   rG   rr   T)r   )rH   r   rI   r    rJ   rK   rL   rW   rO   r   rM   r%   )r   r(   r   r   rP   rQ   rA   rC   rR   r?   rS   r^   r_   r   rd   re   rg   ru   r{   	pos_freqsinv_freq_extrapolationinv_freq_interpolationrr   rs   rt   inv_freq_extrapolation_factorr   rk   s                              @r+   _compute_yarn_parametersr      s   t ""$$$AKAW61*==]c]s-D0445LcRRvz6+=A[+[\\H
h..
/
/C!(+F+//0BCC!%%h//F)--.>??N';<^'_$
 ~/2RR4 4 4 4  	2n 	2$ZZ%?%?**VUcBdBd%dee)z&11 %((55;I$((55:Ip p p/ / / / /   aa003363UUX[[\I 9_ FY$67%))*d;;H%%iCGgiqrrIC %&(:(:3cQh(O(O(R(RZ`hmhs(R(t(t$t!!&C"CD
 #@
@	A  %%%r-   c                 D   |                                   || j        |         n| j        }|d         }|                    dd          }t          | d| j        | j        z            }t          ||z            }|d         }	|d         }
|                    d          }|                    d	          }|d
         }|
| j        |z  }|G|dk    rd}n>t          j	        dt          j
        |          t          j
        |          z  z             }|r(||k    r"t          j        |	t          j        |          }n!t          j        |
t          j        |          }t          j        d|dt          j        |                                          |z  }d|||z  z  z  }||fS )a  
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   original_max_position_embeddings (`int`, *optional*): The original max position embeddings used during
                pretraining. If not provided, defaults to `max_position_embeddings`.
            *   rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which the following keys
                will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, inferred from
                    the value of `factor`.
                *   `factor` (`float`, *optional*): The scaling factor to apply to the RoPE embeddings. If both
                    `max_position_embeddings` and `original_max_position_embeddings` are provided, this value will be
                    overridden s the ratio between those values.
                *   `long_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is provided and greater than `original_max_position_embeddings`.
                *   `short_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    Nr@   rA   rB   rC   long_factorshort_factorr?   rS   r   r   rV   r   rD   )rH   r   rI   r    rJ   rK   rL   rW   ra   sqrtrb   r   r[   rw   rM   rN   rO   )r   r(   r   r   rP   rQ   rA   rC   rR   r   r   r?   rS   r   ext_factorsinv_freq_shaper   s                    r+   _compute_longrope_parametersr     s   d ""$$$AKAW61*==]c]s-D0445LcRRvz6+=A[+[\\H
h..
/
/C&}5K'7L!%%h//F+//0BCC';<^'_$
 ~/2RR S=="#yTXf-=-=Ii@j@j-j)jkk  U7===l;emFSSSl<u}VTTT\!S!5;vNNNTTVVY\\NkD.$889H%%%r-   c                    |                                   || j        |         n| j        }|d         }|                    dd          }t          | dd          p| j        | j        z  }t          ||z            }d}	d|t          j        d|dt          j	                  
                    |t          j        	          |z  z  z  }
|d
         }|d         }|d         }|d         }||z  }||z  }dt          j        z  |
z  }t          j        ||k    |
|z  |
          }||z  |z
  ||z
  z  }d|z
  |z  |z  ||z  z   }||k      ||k     z  }t          j        |||          }||	fS )au
  
    Computes the inverse frequencies for llama 3.1.

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `factor` (`float`, *optional*): The scaling factor applied to the inverse frequencies when 1) the
                    wavelength is greater than `low_freq_wavelen` prior to smoothing, and 2) to all inverse frequencies
                    during smoothing.
                *   `high_freq_factor` (`float`): The scale factor used to compute `high_freq_wavelen` and
                    the value for the denominator of the smoothing factor prior to the `low_freq_factor` shift.
                *   `low_freq_factor` (`float`): The scale factor used to compute `low_freq_wavelen` and
                    the shift applied to the numerator and denominator of the smoothing factor.
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.
                *   `original_max_position_embeddings` (`int`): The original max position embeddings used
                    during pretraining. If not provided, the function falls back to `max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    Nr@   rA   rB   rC   r   rD   rE   rG   r?   low_freq_factorhigh_freq_factorr   r   )rH   r   rI   r    rJ   rK   rL   r   rM   rN   r%   rO   ra   ri   where)r   r(   r   r   rP   rQ   rA   rC   rR   rS   r   r?   r   r   old_context_lenlow_freq_wavelenhigh_freq_wavelenwaveleninv_freq_llamasmooth_factorsmoothed_inv_freqis_medium_freqs                         r+   _compute_llama3_parametersr     s   Z ""$$$AKAW61*==]c]s  -D0445LcRRvz400dF4F&Jd4dH
h..
/
/C du|AsAU[IIILLTZbgbmLnnqttuvH!(+F*+<=O+,>?*+MNO&8'*::$'kH$G [+;!;X=NPXYYN$w.@EUXgEghM]*n<vEXfHff!223BR8R6SSN[1BNSSN+++r-   )linearr5   yarnr6   llama3c                       e Zd ZU dZeed<   edz  ed<   edz  ed<   edz  ed<   edz  ed<   edz  ed<   edz  ed	<   edz  ed
<   ee         dz  ed<   ee         dz  ed<   edz  ed<   edz  ed<   dS )RopeParametersaY
  
    Args:
        rope_theta (`float`):
            The base period of the RoPE embeddings.
        rope_type (`str`, *optional*, defaults to "default"):
            The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
            'llama3'], with 'default' being the original RoPE implementation.
        partial_rotary_factor (`float`, *optional*):
            The percentage of the query and key head embedding on which RoPE will be applied.
        factor (`float`, *optional*):
            Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
            most scaling types, a `factor` of x will enable the model to handle sequences of length x *
            original maximum pre-trained length.
        original_max_position_embeddings (`int`, *optional*):
            Used with 'yarn', 'longrope' and 'llama3'. The original max position embeddings used during
            pretraining.
        attention_factor (`float`, *optional*):
            Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
            computation. If unspecified, it defaults to value recommended by the implementation, using the
            `factor` field to infer the suggested value.
        beta_fast (`float`, *optional*):
            Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
            ramp function. If unspecified, it defaults to 32.
        beta_slow (`float`, *optional*):
            Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
            ramp function. If unspecified, it defaults to 1.
        short_factor (`list[float]`, *optional*):
            Only used with 'longrope'. The scaling factor to be applied to short contexts (<
            `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
            size divided by the number of attention heads divided by 2
        long_factor (`list[float]`, *optional*):
            Only used with 'longrope'. The scaling factor to be applied to long contexts (<
            `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
            size divided by the number of attention heads divided by 2
        low_freq_factor (`float`, *optional*):
            Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
        high_freq_factor (`float`, *optional*):
            Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    r@   Nr   rA   r?   r   rS   re   rg   r   r   r   r   )	__name__
__module____qualname____doc__rO   __annotations__strrL   list r-   r+   r   r   :  s         & &P Tz 4<'''DL&)Dj000dl"""t|t|u+$$$$et####T\!!!dl"""""r-   r   F)totalc                      e Zd ZdZdZddedz  fdZd Zdddd	edz  fd
Zdde	d	edz  fdZ
dde	d	edz  fdZdde	d	edz  fdZdde	d	edz  fdZdde	d	edz  fdZdde	d	edz  fdZe	 	 ddededededz  d	edz  f
d            ZdS )RotaryEmbeddingConfigMixinz[
    A Mixin containing the functionality to standardize and validate RoPE parameters.
    g     @Nignore_keys_at_rope_validationc                 
   |                     dd           }|p| j        | _        | j        | j        ni | _        |                     dt          | d| j                            }| j                            d|           |                    dt          | dd                     }|3| j                            d|           |t                      n|}|dhz  }|                                  |                     |           |S )Nrope_scalingr@   rA   ignore_keys)	popr   r    default_theta
setdefaultrI   setrH   validate_rope)r&   r   r8   r   r@   rA   s         r+   convert_rope_params_to_dictz6RotaryEmbeddingConfigMixin.convert_rope_params_to_dictx  s   zz.$77+Ct/C7;7K7Wt33]_ ZZgdL$J\.].]^^
''jAAA &

+BGDRikoDpDp q q , ++,CEZ[[[7?Ec + .LOfNg-g*$$&&&'EFFFr-   c                    t          | dd          }t          | dd          }t          | dd          pi }t          | dd          }|s|st                              d           dS |:|i k    s4t          |                                                              |          s|                    d|                    dd	                     |                    d|           |||d<   |d         d
v r@t          | d          r| j	        | j
        d<   n| j
                            d| j                   nt          |          D ]}||                             d||                             dd	                     ||                             d|           ||||         d<   ||         d         d
v r&| j
        |                             d| j                   || _
        dS )z
        Helper to standardize the config's rope params field by ensuring the params are defined for each
        later type. For old model the fn will duplicate a single rope param in each layer type (backward compatibility)
        r@   NrA   r   layer_typeszG`standardize_rope_params` was called but no RoPE parameters were found.r   typedefault)r   r   r6   r   )r    loggerwarningr   keysissubsetr   rI   r!   r   r   rW   )r&   r@   rA   r   r   r   s         r+   rH   z2RotaryEmbeddingConfigMixin.standardize_rope_params  s)    T<66
 '.Et L L!$(94@@FBdM488    	:  	NNdeeeF Or$9$9_EYEYE[E[A\A\AeAefqArAr$9&&{O4G4GPY4Z4Z[[[&&|Z@@@$0;P 78 {+/MMM4!CDD v PTOtD()KLL(334VX\Xtuuu "+.. 	 	

+66{OT^D_DcDcdjluDvDvwww
+66|ZPPP(4K`OJ/0GH":.{;?]]](4??:D<X    /r-   r&   r   r   c                    | j         }|dS t          | dd          :t          |                                                              | j                  rnd|i}|                                D ]t}|                    d|                    dd                    }t          | d| dd          }||d<   | |||	           Vt          	                    d
| d           udS )zY
        Validate the RoPE config arguments, given a `"PreTrainedConfig"` object
        Nr   full_attentionr   r   r   
_validate__rope_parametersr   zMMissing validation function in 'RotaryEmbeddingConfigMixin' for 'rope_type'='')
r   r    r   r   r   r   valuesrI   r   r   )r&   r   rP   r   r   validation_fns         r+   r   z(RotaryEmbeddingConfigMixin.validate_rope  s&     $3'F4--9cBVB[B[B]B]>^>^>g>g?
 ?
9 $46J#K 3::<< 
	 
	O'++K9L9LVU^9_9_``I#D*Ry*R*R*RTXYYM+4OK((o;GGGGGpdmppp   
	 
	r-   r   c                     ddh}t          |                                          }|d         }|                     ||||           d S )Nr   r@   r   )r   r   _check_received_keys)r&   r   r   required_keysreceived_keysr   s         r+   !_validate_default_rope_parametersz<RotaryEmbeddingConfigMixin._validate_default_rope_parameters  sS    $l3O002233#K0	!!)]MWb!cccccr-   c                    h d}t          |                                          }|d         }|                     ||||           |d         }|t          |t                    r|dk     rt
                              d|            d S d S )N>   r?   r   r@   r   r   r?   rB   ;`rope_parameters`'s factor field must be a float >= 1, got r   r   r   rX   rO   r   r   r&   r   r   r   r   r   r?   s          r+    _validate_linear_rope_parametersz;RotaryEmbeddingConfigMixin._validate_linear_rope_parameters  s    ===O002233#K0	!!)]MWb!ccc *>FE!:!:>fsllNNaY_aabbbbb ?Klr-   c                    ddh}t          |                                          }|d         }|                     ||||           |d         }|t          |t                    r|dk     rt
                              d|            d S d S )Nr   r?   r   rB   r   r   r   s          r+   !_validate_dynamic_rope_parametersz<RotaryEmbeddingConfigMixin._validate_dynamic_rope_parameters  s    $h/O002233#K0	!!)]MWb!ccc *>FE!:!:>fsllNNaY_aabbbbb ?Klr-   c           	         h d}h d}t          |                                          }|d         }|                     |||||           |d         }|t          |t                    r|dk     rt
                              d|            |                    d          }|8t          |t                    r|d	k     rt
                              d
|            |                    d          }	|	2t          |	t                    st
                              d|	            |                    d          }
|
2t          |
t                    st
                              d|
            |	pd|
pdk     r!t
                              d|	 d|
 d           | j        d         }| j	        |z  }||k    r,|dk    r(t
          
                    d| d| d| d           d S d S d S )N>   r?   r   r@   r   >   r^   rr   re   rg   r_   rS   r   r   r?   rB   r   rS   r   O`rope_parameters`'s attention_factor field must be a float greater than 0, got re   z9`rope_parameters`'s beta_fast field must be a float, got rg   z9`rope_parameters`'s beta_slow field must be a float, got rf   r   zR`rope_parameters`'s beta_fast field must be greater than beta_slow, got beta_fast=z( (defaults to 32 if None) and beta_slow=z (defaults to 1 if None)r   zKThe explicitly set RoPE scaling factor (config.rope_parameters['factor'] = z) does not match the ratio implicitly set by other parameters (implicit factor = post-yarn context length / pre-yarn context length = config.max_position_embeddings / config.rope_parameters['original_max_position_embeddings'] = z). Using the explicit factor (z) in YaRN. This may cause unexpected behaviour in model usage, please correct the 'original_max_position_embeddings' fields in the model config.)r   r   r   rX   rO   r   r   rI   r   rW   warning_once)r&   r   r   r   optional_keysr   r   r?   rS   re   rg   r   implicit_factors                r+   _validate_yarn_rope_parametersz9RotaryEmbeddingConfigMixin._validate_yarn_rope_parameters  s}   aaa
 
 
 O002233#K0	!!)]M=fq!rrr *>FE!:!:>fsllNNaY_aabbb*../ABB'<Le1T1T'XhklXlXlNNtbrtt   $''44	 Iu)E)E NNbW`bbccc#''44	 Iu)E)E NNbW`bbcccO	Q//NN^en ^ ^:C^ ^ ^   ,0+?@b+c(69YYf$$A)=)=~^d ~ ~ #	~ ~ CI	~ ~ ~     %$)=)=r-   c                    h d}ddh}t          |                                          }|d         }|                     |||||           |                    dd          }t	          | d| j        | j        z            }t          ||z            }	|                    d	          }
t          |
t                    s6t          d
 |
D                       rt                              d|
            t          |
          |	dz  k    r0t                              d|	dz   dt          |
                      |                    d          }t          |t                    s6t          d |D                       rt                              d|            t          |          |	dz  k    r0t                              d|	dz   dt          |                      |                    d          }|d         }||t                              d           nW||t                              d           n8t          |t                    r|dk     rt                              d|            |                    d          }|:t          |t                    r|dk     r!t                              d|            d S d S d S )N>   r   r@   r   r   r   rS   r?   r   r   rA   rB   rC   r   c              3   N   K   | ] }t          |t          t          f          V  !d S r;   rX   rL   rO   .0r7   s     r+   	<genexpr>zPRotaryEmbeddingConfigMixin._validate_longrope_rope_parameters.<locals>.<genexpr>5  s1      5h5hVWjS%L6Q6Q5h5h5h5h5h5hr-   zF`rope_parameters`'s short_factor field must be a list of numbers, got rD   z8`rope_parameters`'s short_factor field must have length z, got r   c              3   N   K   | ] }t          |t          t          f          V  !d S r;   r   r   s     r+   r   zPRotaryEmbeddingConfigMixin._validate_longrope_rope_parameters.<locals>.<genexpr>=  s1      4f4fUVZC<5P5P4f4f4f4f4f4fr-   zE`rope_parameters`'s long_factor field must be a list of numbers, got z7`rope_parameters`'s long_factor field must have length r   av  This model config has set a `rope_parameters['original_max_position_embeddings']` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_parameters`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.z4Missing required keys in `rope_parameters`: 'factor'r   g        r   )r   r   r   rI   r    rJ   rK   rL   rX   r   allr   r   lenr   rO   )r&   r   r   r   r   r   r   rA   rC   rR   r   r   r?   r   rS   s                  r+   "_validate_longrope_rope_parametersz=RotaryEmbeddingConfigMixin._validate_longrope_rope_parameters)  s    vvv+X6O002233#K0	!!)]M=fq!rrr / 3 34KS Q Q4T-=AY-YZZ(2233&**>::,-- 	t#5h5h[g5h5h5h2h2h 	tNNrdprrsss|q((NNn3RS8nn[^_k[l[lnn   &))-88+t,, 	r4f4fZe4f4f4f1f1f 	rNNpcnppqqq{sax''NNl#QR(llZ]^iZjZjll   !$$X..+:;]+^( >>JE    ^ @ HNNQRRRRFE** 	cfsllNNaY_aabbb*../ABB'<Le1T1T'XhknXnXnNNtbrtt     ('XnXnr-   c                 .   h d}|d         }t          |                                          }|                     ||||           |d         }|t          |t                    r|dk     rt
                              d|            |d         }|d         }|t          |t                    st
                              d	|            |t          |t                    st
                              d
|            ||k    r t
                              d| d|            |d         }	|	t          |	t                    st
                              d|	            |	| j        k    r't
                              d|	 d| j                    d S d S )N>   r?   r   r@   r   r   r   r   r   r?   rB   r   r   r   z?`rope_parameters`'s low_freq_factor field must be a float, got z@`rope_parameters`'s high_freq_factor field must be a float, got zf`rope_parameters`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=z and low_freq_factor=r   zS`rope_parameters`'s original_max_position_embeddings field must be an integer, got zj`rope_parameters`'s original_max_position_embeddings field must be less than max_position_embeddings, got z and max_position_embeddings=)	r   r   r   rX   rO   r   r   rL   rW   )
r&   r   r   r   r   r   r?   r   r   r   s
             r+    _validate_llama3_rope_parametersz;RotaryEmbeddingConfigMixin._validate_llama3_rope_parameters[  s   
 
 
 $K0	O002233!!)]MWb!ccc *>FE!:!:>fsllNNaY_aabbb)*;<*+=>"*_e*L*L"NNn]lnnooo#:6F+N+N#NNp^nppqqq..NNL#L L:IL L  
 ,;;]+^(+3:Ffhk;l;l3NN636 6   ,t/KKKNNq3q qRVRnq q     LKr-   r   r   r   r   c                 4   d|v r|dhz  }|                     d           |pt                      }d|vr|                     d           |||z  }||z
  }|rt          d|  d|           ||z
  |z
  }|r"t                              d|  d|            dS dS )z\Compare the received keys in `config.rope_parameters` against the expected and optional keysr   r   rA   Nz<Missing required keys in `rope_parameters` for 'rope_type'='z': z8Unrecognized keys in `rope_parameters` for 'rope_type'=')addr   KeyErrorr   r   )r   r   r   r   r   missing_keysunused_keyss          r+   r   z/RotaryEmbeddingConfigMixin._check_received_keys  s     ]""fX%Mk***%."-775666 "[(M$}4 	xvZcvvhtvvwww#m3mC 	sNNqV_qqdoqqrrrrr	s 	sr-   r;   )NN)r   r   r   r   r   r   r   rH   r   dictr   r   r   r   r   r   staticmethodr   r   r   r-   r+   r   r   q  s!         M #PT*    2./ ./ ./` . S4Z    6d d dTWZ^T^ d d d dc c cSVY]S] c c c cc c cTWZ^T^ c c c c1 1d 1QTW[Q[ 1 1 1 1f0 0$ 0UX[_U_ 0 0 0 0d' ' 'SVY]S] ' ' ' 'R 
 %)"&s sss s Tz	s
 4Zs s s \s s sr-   r   r   c                     t          j        dt                     |                                  |                     |           dS )zq
    This is a deprecated function.
    It has been kept for backward compatibility with custom code models.
    aX  `rope_config_validation` is deprecated and has been removed. Its functionality has been moved to RotaryEmbeddingConfigMixin.validate_rope method. PreTrainedConfig inherits this class, so please call self.validate_rope() instead. Also, make sure to use the new rope_parameters syntax. You can call self.standardize_rope_params() in the meantime.r   N)warningswarnFutureWarningrH   r   )r   r   s     r+   rope_config_validationr     sT    
 M	G
 	   ""$$$
[11111r-   )NNNN)NNNr;   ) ra   r   	functoolsr   typingr   r   r   utilsr	   r
   
get_loggerr   r   r   configuration_utilsr   r<   rL   r   tuplerO   rT   r\   r   r   r   r"   r   r   r   r   r   r-   r+   <module>r      s           5 5 5 5 5 5 5 5 5 5 . . . . . . . . 
	H	%	%  LLL 6555555` ` `H ,0'+!	3& 3&'(3&^$3& 4Z3& d
	3&
 >5 !3& 3& 3& 3&n ,0'+!	C& C&'(C&^$C& 4ZC& d
	C&
 >5 !C& C& C& C&P (,!	D& D&D&^$D& 4ZD& d
	D&
 >5 !D& D& D& D&R (,!	U& U&U&^$U& 4ZU& d
	U&
 >5 !U& U& U& U&t (,!	L, L,L,^$L, 4ZL, d
	L,
 >5 !L, L, L, L,f 6.$,(  4# 4# 4# 4# 4#Ye 4# 4# 4# 4#nos os os os os os os osd	2 2#= 2CRVJ 2 2 2 2 2 2r-   