
    bi*                        d dl Z d dlZd dlmZ d dlmZ d dlZej        j        j	        ej        j        j
        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        ej        j        j        dZ	 d2dej        ded	ed
ej        dz  dej        f
dZ		 d2dej        deded
ej        dz  dej        f
dZ
dej        dedej        fdZdej        dej        fdZdej        dej        fdZdej        dej        fdZd3dej        dedej        fdZd4dej        ded
ej        dz  dej        fdZd4dej        ded
ej        dz  dej        fdZ	 	 	 	 d5dej        dededed
ej        dz  dej        fdZ	 	 	 	 d5dej        dededed
ej        dz  dej        fd Z	 	 	 	 	 d6dej        dededed	ed
ej        dz  dej        fd#Z	 	 d7dej        ded
ej        dz  dej        fd$Z	 d8dej        d&eded
ej        dz  dej        f
d'Zdej        d(ej        dej        fd)Zd9d+Zd, Zd- Z d.Z!ed/             Z"ed0             Z#ed1             Z$dS ):    N)defaultdict)contextmanager)uniform_normal_	constant_ones_zeros_eye_dirac_xavier_uniform_xavier_normal_kaiming_uniform_kaiming_normal_trunc_normal_orthogonal_sparse_              ?tensorab	generatorreturnc                 Z    t          | dd          st          d         | |||          S | S )N_is_hf_initializedFr   )r   r   r   getattrTORCH_INIT_FUNCTIONS)r   r   r   r   s       V/root/projects/butler/venv/lib/python3.11/site-packages/transformers/initialization.pyr   r   *   s;     6/77 W#J/!qIVVVVM    meanstdc                 Z    t          | dd          st          d         | |||          S | S )Nr   Fr   )r!   r"   r   r   )r   r!   r"   r   s       r   r   r   2   s<     6/77 `#I.vDcU^____Mr    valc                 V    t          | dd          st          d         | |          S | S )Nr   Fr   )r$   r   )r   r$   s     r   r   r   :   s5    6/77 B#K0SAAAAMr    c                 R    t          | dd          st          d         |           S | S )Nr   Fr   r   r   s    r   r   r   @   s/    6/77 5#G,V444Mr    c                 R    t          | dd          st          d         |           S | S )Nr   Fr	   r   r'   s    r   r	   r	   F   s/    6/77 6#H-f555Mr    c                 R    t          | dd          st          d         |           S | S )Nr   Fr
   r   r'   s    r   r
   r
   L   s/    6/77 4#F+F333Mr       groupsc                 V    t          | dd          st          d         | |          S | S )Nr   Fr   )r+   r   )r   r+   s     r   r   r   R   s5    6/77 E#H-fVDDDDMr    gainc                 X    t          | dd          st          d         | ||          S | S )Nr   Fr   r-   r   r   r   r-   r   s      r   r   r   X   s9    6/77 _#$56vDT]^^^^Mr    c                 X    t          | dd          st          d         | ||          S | S )Nr   Fr   r/   r   r0   s      r   r   r   ^   s9    6/77 ^#$45f4S\]]]]Mr    fan_in
leaky_relumodenonlinearityc                 \    t          | dd          st          d         | ||||          S | S )Nr   Fr   r   r4   r5   r   r   r   r   r4   r5   r   s        r   r   r   d   sH     6/77 
#$67ad
 
 
 	
 Mr    c                 \    t          | dd          st          d         | ||||          S | S )Nr   Fr   r7   r   r8   s        r   r   r   r   sH     6/77 
#$56ad
 
 
 	
 Mr                  @c                 ^    t          | dd          st          d         | |||||          S | S )Nr   Fr   )r!   r"   r   r   r   r   )r   r!   r"   r   r   r   s         r   r   r      sB     6/77 p#O4V$CSTXYenooooMr    c                 X    t          | dd          st          d         | ||          S | S )Nr   Fr   r/   r   r0   s      r   r   r      s:    
 6/77 [#M26PYZZZZMr    {Gz?sparsityc                 Z    t          | dd          st          d         | |||          S | S )Nr   Fr   )r?   r"   r   r   )r   r?   r"   r   s       r   r   r      s<     6/77 h#I.vc]fggggMr    otherc                     t          | dd          s@t          j                    5  |                     |          cd d d            S # 1 swxY w Y   | S )Nr   F)r   torchno_gradcopy_)r   rA   s     r   rE   rE      s    6/77 ']__ 	' 	'<<&&	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	'Ms   AAAnormalc                    t           j        j                            |           \  }}|dk    r|}n|dk    r|}n|dk    r||z   dz  }d|z  }|dk    r(t	          | t          j        |          dz             d S |d	k    r%t          | t          j        |                     d S |d
k    r+t          j        d|z            }t          | | |           d S t          d|           )Nr2   fan_outfan_avg   r   truncated_normalg۶%?)r"   rF   uniform   zinvalid distribution )
rC   nninit_calculate_fan_in_and_fan_outr   mathsqrtr   r   
ValueError)r   r4   distributionr2   rH   denomvariancebounds           r   _variance_scalingrX      s   hmAA&IIOFGx						'!Q&U{H)))f$)H"5"58K"KLLLLLL		!	!DIh//000000		"	"	!h,''%'''''???@@@r    c                 L    t          | dd          st          | dd           | S )Nr   Fr2   rK   r4   rT   r   rX   r'   s    r   lecun_normal_r\      s3    6/77 R&x>PQQQQMr    c                 L    t          | dd          st          | dd           | S )Nr   Fr2   rF   rZ   r[   r'   s    r   default_flax_embed_init_r^      s2    6/77 H&xhGGGGMr    )
ztorch.nn.initztorch.nn.modules.activationztorch.nn.modules.transformerztorch.nn.modules.linearztorch.nn.modules.lossztorch.nn.modules.batchnormztorch.nn.modules.convztorch.nn.modules.normalizationztorch.nn.modules.rnnztorch.nn.modules.sparsec            	   #   ~  K   t          t                    } 	 t          D ]}|t          j        v rzt          j        |         }t
                                          D ]N}t          ||          r<t          ||          | |         |<   t          ||t                      |                    OdV  |                                 D ]0\  }}|                                D ]\  }}t          |||           1dS # |                                 D ]0\  }}|                                D ]\  }}t          |||           1w xY w)a  
    Guard the `torch.nn.init` primitive functions to behave exactly like the functions in this file, i.e. be
    protected against the `_is_hf_initialized` flag to avoid re-init if the param was already loaded.

    Usually, all models are using the init from `transformers` which are already guarded, but just to make extra sure
    and for remote code, we also use this context manager.
    N)r   dictTORCH_MODULES_TO_PATCHsysmodulesr   keyshasattrr   setattrglobalsitems)	originalsmodule_namemodule	func_name	functionsfuncs         r   guard_torch_init_functionsro      sq      D!!I11 	I 	IKck))[1!5!:!:!<!< I IIvy11 I7>vy7Q7Q	&))4	799Y3GHHH "+!2!2 	1 	1FI#,??#4#4 1 1	4	400001	1 	1!2!2 	1 	1FI#,??#4#4 1 1	4	400001	1s   BC5 5AD<c            	   #     K   ddl m}  d }t          t                    }	 t          D ]x}|t
          j        v rht
          j        |         }t                                          D ]<}t          ||          r*t          ||          ||         |<   t          |||           =y| j        }|| _        dV  |                                D ]0\  }}|                                D ]\  }}t          |||           1|| _        dS # |                                D ]0\  }}|                                D ]\  }}t          |||           1|| _        w xY w)ac  
    Disable weight initialization both at the torch-level, and at the transformers-level (`init_weights`).
    This is used to speed-up initializing an empty model with deepspeed, as we do not initialize the model on meta device
    with deepspeed, but we still don't need to run expensive weight initializations as we are loading params afterwards.
    r*   PreTrainedModelc                      d S N argskwargss     r   
empty_funcz#no_init_weights.<locals>.empty_func       r    N)modeling_utilsrr   r   r`   ra   rb   rc   r   rd   re   r   rf   init_weightsrh   )	rr   ry   ri   rj   rk   rl   original_init_weightsrm   rn   s	            r   no_init_weightsr~      s      0/////   D!!I=1 	? 	?Kck))[1!5!:!:!<!< ? ?Ivy11 ?7>vy7Q7Q	&))4	:>>> !0 <'1$ "+!2!2 	1 	1FI#,??#4#4 1 1	4	400001 (=$$$	 "+!2!2 	1 	1FI#,??#4#4 1 1	4	400001 (=$<<<<s   BD AEc               #   h   K   ddl m}  d }	 | j        }|| _        dV  || _        dS # || _        w xY w)a  
    Disable weight tying during loading with `from_pretrained`. This is needed as we want to have access to ALL
    weights in the state_dict during `from_pretrained`, and otherwise tying them would remove them from it, as it's
    called in `post_init` when instantiating.
    r*   rq   c                      d S rt   ru   rv   s     r   ry   z"no_tie_weights.<locals>.empty_func$  rz   r    N)r{   rr   tie_weights)rr   ry   original_tie_weightss      r   no_tie_weightsr     sm       0/////  ;.:&0# ';###&:#::::s   ( 	1)r   r   N)r*   )r   N)r   r2   r3   N)r   r   r:   r;   N)r*   N)r>   N)r2   rF   )%rQ   rb   collectionsr   
contextlibr   rC   rN   rO   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   r   Tensorfloat	GeneratorintstrrE   rX   r\   r^   ra   ro   r~   r   ru   r    r   <module>r      s    



 # # # # # # % % % % % %  &x}$(X] hm"HMhm"x}4hm26x}4X]08=,x}$  & _c L"-2EJ_W[E[
\    dh L %27JO/\`J`
\   el  5<    %, 5<    5< EL     %,     5<  U\     EL  Z^H^ jojv     5< u uY]G] iniu     $(, L  	
 % \     $(, L  	
 % \     (,
 
L


 

 	

 
 %
 \
 
 
 
 (, L
 % \	    cg L$)05IN[_I_
\   %, u|     A A A A,      1 1 14 != != !=H ; ; ; ; ;r    