o
    Ih                     @   s  d dl Z d dlmZ d dlZd dlm  mZ d dlm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZmZ ddlmZmZmZ ddlmZm Z m!Z! ddl"m#Z#m$Z$ 				ddede%e de%e fddZ&de&_'dd Z(dS )    N)Optional)mm_args   )ir)CppGemmTemplate)CppGroupedGemmTemplatecreate_epilogue_with_attr)	TensorBox)addadd_needs_realized_inputsatenpermuteregister_loweringto_dtypeview)autotune_select_algorithmChoiceCallerExternKernelChoice)use_aten_gemm_kernelsuse_cpp_gemm_templateuse_max_autotune)opsVxwbc                    s    }t|dkrtd|d gt|}t sJ dd |D }g }	tt|d ddg d^ }
 }
tdd |D d	d fd
dt|D d}g|}|dd |D  t	j
|	 |fi | t|	dkspJ td|	| }|jj fddt|D tj|d  d__fddt|D }t|dkrt|D ]}t|| g |d d ||   d R ||< q|S )N   c                 S   s$   g | ]}|d u r
|nt j|qS N)r   ExternKernelrealize_input.0bias r%   T/var/www/vscode/kcb/lib/python3.10/site-packages/torch/_inductor/mkldnn_lowerings.py
<listcomp>0   s   $ z)grouped_gemm_lowering.<locals>.<listcomp>r   r   layoutc                 S   s   g | ]}|d uqS r   r%   r"   r%   r%   r&   r'   6   s    Tc                    s   i | ]}| qS r%   r%   )r#   numr   r%   r&   
<dictcomp>9   s    z)grouped_gemm_lowering.<locals>.<dictcomp>)has_biastrans_wepilogue_creatoract_mappingc                 S   s   g | ]}|d ur|qS r   r%   r"   r%   r%   r&   r'   =   s    grouped_gemmc                    s    g | ]}t  t|fgqS r%   )r   MultiOutputlistr#   gemm_idx)r)   template_bufr%   r&   r'   N   s    )devicec                    s   g | ]
}t j | qS r%   )r   r
   creater4   )return_bufsr%   r&   r'   T   s    )get_sizelenr   r   r   r   dictrangeextendr   add_choicesr   datar   MultiOutputLayout
get_devicer)   outputs)r   r   r   attrscalars	algorithmr)   x_sizenum_gemmchoices_kwargsinput_nodesresultreturn_tensorsr5   r%   )r)   r9   r6   r   r&   grouped_gemm_lowering    s\   	
&

 
rO   Tc               !      s  t jjrddlm tt jjjddj	j
dtt jjjjddjj
dtt jjjddjj
dtt jjjjddjj
dt jjjt jjjt jjjt jjjtjjt jjjg} tt jjjdtdtd	tffd
d}tt jjjjdtdtdtd	tffdd}tt jjjjdtdtdtd	tffdd}tt jjj	 d?dtdtdtffdd}tt jjjj	 d?dtdtdtdtffdd}tt jjjdtdtd	tffdd}ttjjdtdtdtdtdtdtdtd td!tt d"td#td$td%td&td'td(tf fd)d*}tt jjjd d+dtd,td-td.td	tf
fd/d0}tt jjjjd d+tt jjjjd d+dtd,td-td.td1td	tffd2d3}	tt jjjd d+	 d?dtd,td-td.td	tf
fd4d5}
tt jjjjd d+tt jjjjd d+	 d?dtd,td-td.td6td	tffd7d8}t jjrtt jjj d9dj!j
d | "t jjj  tt jjj d d:dtd;td<tdt#t f fd=d>}t$|  d S 	 d S )@Nr   	mkldnn_irzmkldnn::_linear_pointwiseF)has_out_variantkernel_creatorzonednn::qlinear_pointwiser   weightr$   c
           
         s$   t  j| |||||||||	
S r   )r
   r8   ConvolutionUnary)
r   rT   r$   paddingstridedilationgroupsrD   rE   rF   rP   r%   r&   convolution_unary   s   z5register_onednn_fusion_ops.<locals>.convolution_unaryotherc                    *   t  j| |||||||||	|
||S r   )r
   r8   ConvolutionBinaryr   r[   rT   r$   rV   rW   rX   rY   binary_attrbinary_alpha
unary_attrunary_scalarsunary_algorithmrP   r%   r&   convolution_binary   "   z6register_onednn_fusion_ops.<locals>.convolution_binaryc                    r\   r   )r
   r8   ConvolutionBinaryInplacer^   rP   r%   r&   convolution_binary_inplace   re   z>register_onednn_fusion_ops.<locals>.convolution_binary_inplacer   r   c                    s  |   }t|dkrt| d|d g} |d urtj|}g }t rrt|ddg}	t| |	|d^ }
}} }	t	|| |	rr fdd}t
|d udd	krNd n|d
}|d ur\g d|d< tj|||d u rh| |gn| ||gfi | t|dks{t rt
 d}|d u rd |d< |j|d u r| |gn| ||g|fi | | tjjv sJ ddd i}td||d u r| |gn| ||g||d}t|dkrt|g |d d |  d R }|S )Nr   r   r   r   r(   c                    s   t |  dS )NrE   rF   r   bufrF   rD   rE   r%   r&   r/      s   zJregister_onednn_fusion_ops.<locals>.linear_unary.<locals>.epilogue_creatorTnoner-   r.   r/   )r   r   r   input_indices)rD   rE   rF   Bc                 S      t jj|   S r   r   graph	constantsget_namer+   r%   r%   r&   <lambda>      zBregister_onednn_fusion_ops.<locals>.linear_unary.<locals>.<lambda>linear_unaryinput_gen_fnsr:   r;   r   r   r    r!   r   r   r   r   r<   r   r?   r   appendbindrt   r   rr   rs   r   )r   r   r   rD   rE   rF   r)   rG   rI   transposed_wrJ   r/   rK   ry   rM   )aten_mkldnn_linear_unaryrk   r&   rw      sd   
$z0register_onednn_fusion_ops.<locals>.linear_unaryyc                    s  |   }t|dkrt| d|d g}   }t|dkr&td|d g|d ur0tj|}g }t rt|ddg}	t| |	|d^ }
}} }	t	|| |	r fdd}t
|d ud|d	}|d u rhg d
ng d|d< tj|||d u r{| |gn| ||gfi | t|dkst rt
 d}|d u rd |d< |j|d u r| |gn| ||g|fi | | tjjv sJ ddd i}td||d u r| |gn| ||g||d}t|dkrt|g |d d |  d R }|S )Nr   r   r   r   r(   c                    s   t |  dS )N)r[   r   ri   rD   r   r%   r&   r/   ?  s   zKregister_onednn_fusion_ops.<locals>.linear_binary.<locals>.epilogue_creatorTrm   )r   r   r   )   r   r   r   rn   )rD   ro   c                 S   rp   r   rq   r+   r%   r%   r&   ru   [  rv   zCregister_onednn_fusion_ops.<locals>.linear_binary.<locals>.<lambda>linear_binaryrx   rz   )r   r   r   r   rD   r)   rG   y_sizerI   r}   rJ   r/   rK   ry   rM   )aten_mkldnn_linear_binaryr   r&   r   *  sl   
$z1register_onednn_fusion_ops.<locals>.linear_binaryc                    s&   t  j| |||||||||	|
S r   )r
   r8   ConvolutionTransposeUnary)r   rT   r$   rV   output_paddingrW   rX   rY   rD   rE   rF   rP   r%   r&   convolution_transpose_unaryh  s   z?register_onednn_fusion_ops.<locals>.convolution_transpose_unaryw0w1w2w3hxcxreversebatch_sizesmodehidden_size
num_layers
has_biasesbidirectionalbatch_firsttrainc                    s4   t tj j| |||||||||	|
|||||S r   )pytreetree_mapr
   r8   MkldnnRnnLayer)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rP   r%   r&   mkldnn_rnn_layer  s*   z4register_onednn_fusion_ops.<locals>.mkldnn_rnn_layer)type_promotion_kindpacked_weightw_scalew_zpc                    s   t |tksJ tjjtj|tjddd}t |tksJ tjjtj|tj	ddd}t
 j| |||||||||	|
||||||S )Ndtypex_scalenamex_zp)typefloatr   rr   add_tensor_constanttorchtensorfloat32intint32r
   r8   QConvPointWisePT2E)r   r   r   r   r   r   r$   rW   rV   rX   rY   o_inv_scaleo_zero_pointoutput_dtyperD   rE   rF   rP   r%   r&   qconvolution_unary  s:   z6register_onednn_fusion_ops.<locals>.qconvolution_unaryaccumc                    s   t |tksJ tjjtj|tjddd}t |tksJ tjjtj|tj	ddd}|dkrM|tjtj
fv rM| tjtj
fv rM| |krMt||}t j| |||||||||	|
|||||||||||S )Nr   r   r   r   sum)r   r   r   rr   r   r   r   r   r   r   bfloat16	get_dtyper   r
   r8   QConvPointWiseBinaryPT2E)r   r   r   r   r   r   r   r$   rW   rV   rX   rY   r   r   r   accum_scaleaccum_zpr_   alphara   rb   unary_algorithmmrP   r%   r&   qconvolution_binary  sN   
z7register_onednn_fusion_ops.<locals>.qconvolution_binaryc                    s  |  tju sJ d|  }t|dkrt| d|d g} t
tjs;t	
t
ks,J tjjtj
tjddd
n 
  tdd 
 D rOt
g 
t
 d	v s[J d
d u rmtjjtjdtjdddttjst	tks{J tjjtjtjdddn   dksJ d|d u rtjjtjdtjddd}  |  |  tjkrttj|tjrtjj|  tj}tjjtj|tjd| d}d u rd n  g }t rt| ||d^ }}} }ttj|tjrtttjj|  tjj|  rt|| |rtjj|    }tj!|tj
dd}tjj|| d d	 	
fdd}|   tj"tjfv scJ t#j$||d u rt| 
||gn| 
||gd u|d u rg dng dd t|dkst% rt& d}d u rd |d< |'j(d u r| 
||fn| 
||f|fi | | tjjv sJ dd dd dd dd d}ttj
tjrd d |d< ttjtjr
d!d |d< t)d"|d u r| 
||gn| 
||g||d#}t|dkr@t|g |d d | d R }|S )$Nz2Only int8 weights are supported by oneDNN qlinear.r   r   r   r   r   c                 s       | ]}|d kV  qdS r   Nr%   r#   dimr%   r%   r&   	<genexpr>P      zDregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<genexpr>r   r   x_scale must be 0D or 1Dr   r   r   z(x_zp is incompatible with oneDNN qlinearr   r)   	out_dtyper   _BMatrixCompensc                    sJ  t jt jt jt jfv sJ |      d d ur, fdd}tj|  t j|| 	 d}
dkrSt
|
	d}t jkrr| fdd}tj| ||	 d}|S t jt jfv rdd	lm  |  fd
d}tj| tj|ttd|	 d}|S )Nc           	         s   | }t |tj}| d f}d}d}|}|}t t |||}t |t t t ||||} d ur`|}tjtjfv sNJ tjkrZt |tj}t ||}|S )Nr   r%   r   r   r   r   mulsubr   r   )	indexinputweight_compens_index_x_scale_x_zp_w_scale_weight_compotemp_bias)r$   
bias_dtypebias_loaderinput_loaderw_scale_loaderweight_compens_loaderx_scale_loaderx_zp_loaderr%   r&   inner_fn  sD   

z]register_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator.<locals>.inner_fnr7   r   r   rangesrl   rh   c                        | }t |S r   r   r   r   r   output_cast_loaderr   r%   r&   inner_fn_cast_output_to_bf16     zqregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator.<locals>.inner_fn_cast_output_to_bf16r   _create_constantsc           	         s   | } d| |t jd\}}t|| | }t jkr) ddt jd\}}n
 ddt jd\}}tt|||}t|S Ng      ?r   r      i   r   r   r   rounduint8minimummaximumr   	r   scale
zero_pointr   	inv_scalevalqminqmaxclampedr   r   requant_input_loaderr%   r&   inner_fn_requant   s   


zeregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator.<locals>.inner_fn_requantr   r   r   r   r   r   int8make_loaderr   	PointwiserB   r:   r	   get_device_or_errorloweringr   	functoolspartialr   r   input_bufferr   
output_bufr   r   rF   rD   r$   r   o_scaler   r   rE   r   weight_compensr   r   )	r   r   r   r   r   r   r   r   r   r&   r/     sd   
5
'zKregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator)r   r   r   r         )   r   r   r   r   r	  r
  r-   r/   rn   )output_scaleoutput_zero_pointr   post_op_namepost_op_argspost_op_algorithmr$   c                 S   rp   r   rq   r+   r%   r%   r&   ru   A  rv   zCregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<lambda>c                 S   rp   r   rq   r+   r%   r%   r&   ru   B  rv   c                 S   rp   r   rq   r+   r%   r%   r&   ru   C  rv   c                 S   rp   r   rq   r+   r%   r%   r&   ru   D  rv   )r   r	  r
  r  c                 S   rp   r   rq   r+   r%   r%   r&   ru   K  rv   c                 S   rp   r   rq   r+   r%   r%   r&   ru   P  rv   qlinear_unaryrx   )*r   r   r   r:   r;   r   
isinstancer   r
   r   r   r   rr   r   r   r   realizeallr   r   	get_numelInputsKernelunwrap_storage_for_inputConstantBufferrs   rt   tor   r   equal
zeros_liker   to_denser   r   r   r?   r   r<   r{   r|   r   )r   r   r   r   r   r   r$   r  r   r   rD   rE   rF   r)   rG   w_zp_tensorrI   rJ   W_tensorweight_compens_tensorr/   rK   ry   rM   )aten_mkldnn_qlinear_unaryr  r&   r  1  s   





" 



	


	$z1register_onednn_fusion_ops.<locals>.qlinear_unaryx2c                    s  |   }
  }t|t|ksJ t|dkr.|dkr.t| d|d g} t
d|d g
ttjsKttks<J tj	j
tjtjdddn   tdd   D r_tg t  d	v skJ d
d u r}tj	j
tjdtjddd|d u rtj	j
tjdtjddd}ttjsttksJ tj	j
tjtjdddn    |  | tjkrttj|tjrtj	j|  tj}tj	j
tj|tjd| d}|dkrtjtjfv r
 tjtjfv r
 kr
t

n
 ksJ d
  d ur#  nd g }t r|dkrt| |
|d^ }}} }
ttjtjrt jdkrttj|tjrtt tj	j|  tj	j|  rt!|| |rtj	j|  }|" }tj#|tjdd}tj	j
|| d d	 	
fdd}t$j%|| d u r| ||
gn	| ||
 g d u| d u rg dng dd t|dkst& r%t'||||d
} d u rd |d< |(j) d u r| ||
fn	| ||
 f|fi | | tj	jv s0J dd dd dd d} d urGdd |d < t*d!| d u rX| ||
gn	| ||
 g||d"}t|dkr|dkrt|g |d d |  d R }|S )#Nr   r   r   r   r   r   c                 s   r   r   r%   r   r%   r%   r&   r     r   zEregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.<genexpr>r   r   r   r   r   r   zCdtype of accum for qlinear post op sum should be the same as outputr   r   r   c              
      sX  t jt jt jt jfv sJ |       	d 
d ur0
 
	f
dd}tj|  t j|| 	 d}dkrYt
|d}t jkrx| fdd}tj| ||	 d}|S t jt jfv rdd	lm  |  fd
d}tj| t jtj|ttd|	 d}|S )Nc           
         s  | }| }d}	d}t |tj}| d f}|}|}t t |||}t |t t t ||||} d urd|}	tjtjfv sRJ tjkr^t |	tj}	t ||	}tjtjfv snJ tjkrzt |tj}t ||}|S )Nr%   r   r   )
r   r   _x2r   r   r   r   _weight_compensr   r   )
r$   r   r   r   r   r   x2_dtype	x2_loaderr   r   r%   r&   r     sR   


z^register_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creator.<locals>.inner_fnr   rl   rh   c                    r   r   r   r   r   r%   r&   r   =  r   zrregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creator.<locals>.inner_fn_cast_output_to_bf16r   r   c           	         s   | } d| |t jd\}}t|| | }t jkr) ddt jd\}}n
 ddt jd\}}tt|||}t|t jS r   r   r   r   r%   r&   r   L  s   


zfregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creator.<locals>.inner_fn_requantr   r   r  r$   r   r  r   r   r   ra   rb   r   r  r"  r%  r   r   )
r   r   r   r   r   r   r   r&  r   r   r&   r/     sl   
7
'zLregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creator)r   r   r   r   r	  r
  r  )   r   r   r   r   r	  r
  r  r  )
r  r  r   other_scaleother_zpbinary_post_opr`   unary_post_opunary_post_op_argsunary_post_op_algorithmr$   c                 S   rp   r   rq   r+   r%   r%   r&   ru     rv   zDregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.<lambda>c                 S   rp   r   rq   r+   r%   r%   r&   ru     rv   c                 S   rp   r   rq   r+   r%   r%   r&   ru     rv   )r   r	  r
  c                 S   rp   r   rq   r+   r%   r%   r&   ru     rv   r(  qlinear_binaryrx   )+r:   r;   r   r  r   r
   r   r   r   rr   r   r   r   r   r  r  r   r   r   r  r  r  rs   rt   r  r   r   r   r   
get_layoutsizer  r  r   r  r   r   r?   r   r<   r{   r|   r   )r   r   r   r   r   r   r"  r$   r  r   r   x2_scalex2_zpr_   r   ra   rb   r   r)   rG   x2_sizer  rI   rJ   r  r   r/   rK   ry   rM   )aten_mkldnn_qlinear_binaryr'  r&   r/  _  s  








	
& 



	

	$z2register_onednn_fusion_ops.<locals>.qlinear_binaryzmkl::_mkl_linearr(   packed_worig_wc                   s   g }t  r-t|ddg}t| ||d^ }}} }t|| |r-tj||| ||gdddgd t|dks6t rE| j	| ||f|d |d |
 tjjv sOJ |
 tjjv sYJ dd	 d
d	 d}	td|| ||g||	d}
|d urwt|
|}
|
S )Nr   r   r(   Tr   )r.   rn   )ro   
batch_sizec                 S   rp   r   rq   r+   r%   r%   r&   ru     rv   zGregister_onednn_fusion_ops.<locals>.mkl_packed_linear.<locals>.<lambda>c                 S   rp   r   rq   r+   r%   r%   r&   ru     rv   )r   r   packed_linearrx   )r   r   r   r   r   r?   r;   r   r{   r|   rt   r   rr   rs   r   r   )r   r6  r7  r   r8  r)   rI   r}   rJ   ry   rM   )aten_mkl_linearr%   r&   mkl_packed_linear  sF   

z5register_onednn_fusion_ops.<locals>.mkl_packed_linearr   )%r   _C_has_mkldnn rQ   r   r   mkldnn_linear_pointwiseLinearUnaryr8   binaryLinearBinaryonednnqlinear_pointwiseQLinearPointwisePT2EQLinearPointwiseBinaryPT2E_convolution_pointwise_convolution_pointwise_ _convolution_transpose_pointwiser   r   defaultqconv2d_pointwiser   r
   boolr3   r   binary_tensorhas_mklmkl_mkl_linearMKLPackedLinearr{   r   r   )cpu_needs_realized_inputsrZ   rd   rg   rw   r   r   r   r   r   r  r/  r;  r%   )r:  r   r~   r5  r!  rQ   r&   register_onednn_fusion_opsc   s  


	!!A=
	
(3H  /  
B2rT  )NNNN))r  typingr   r   torch.utils._pytreeutils_pytreer    torch._inductor.kernel.mm_commonr   r>  r   codegen.cpp_gemm_templater   !codegen.cpp_grouped_gemm_templater   codegen.cpp_utilsr	   r
   r   r   r   r   r   r   r   r   select_algorithmr   r   r   r   r   r   virtualizedr   r   r3   rO   _inductor_lowering_functionrT  r%   r%   r%   r&   <module>   s6   $	
@