o
    Ih$!                     @   s0  d dl Z d dlZd dlmZ d dlmZ ddlmZmZ	 ddl
mZmZmZmZ ddlmZmZmZmZmZ ddlmZ d	d
lmZmZmZmZmZmZ e eZ ej!j"Z"edd Z#dd Z$dd Z%ede#ddZ&eej'dZ(eej)de"j)j*dZ+e	,e"j'ddddZ-e	,e"j)d	d	ddddZ.dS )    N)counters)CKGemmTemplate   )irlowering)autotune_select_algorithmExternKernelChoiceSymbolicGridFnTritonTemplate)use_aten_gemm_kernelsuse_ck_gemm_templateuse_cpp_bmm_templateuse_cutlass_templateuse_triton_template)V   )_is_static_problemaddmm_epiloguemm_args
mm_configs
mm_optionsshould_fallback_to_atenc                C   s"   |||d |||d  | dfS )NBLOCK_MBLOCK_Nr    )bmnmetacdivr   r   N/var/www/vscode/kcb/lib/python3.10/site-packages/torch/_inductor/kernel/bmm.pybmm_grid%   s   "r!   c                 C   s(   | dks|dks|dkrdS | | dkS )N   Ti   r   )r   r   kr   r   r    _is_large_block_for_cpu*   s   r$   c                C   s&   |dkrt | ||dtdS t | ||S )Ncpug      ?)scaleexclude)r   r$   )r   r   r#   device_typer   r   r    bmm_configs1   s   r)   bmma  
{{def_kernel("A", "B")}}
    M = {{size("A", -2)}}
    N = {{size("B", -1)}}
    K = {{size("A", -1)}}

    stride_aq = {{stride("A", 0)}}
    stride_am = {{stride("A", 1)}}
    stride_ak = {{stride("A", 2)}}

    stride_bq = {{stride("B", 0)}}
    stride_bk = {{stride("B", 1)}}
    stride_bn = {{stride("B", 2)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1):
        ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        ram = rm % M
    if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1):
        rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        rbn = rn % N

    rk = tl.arange(0, BLOCK_K)

    idx_q = tl.program_id(1)  # batch dimension for BMM
    A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak + idx_q*stride_aq)
    B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn + idx_q*stride_bq)

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    for k in range(K, 0, -BLOCK_K):
        if EVEN_K:
            a = tl.load(A)
            b = tl.load(B)
        else:
            a = tl.load(A, mask=rk[None, :] < k, other=0.)
            b = tl.load(B, mask=rk[:, None] < k, other=0.)
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)
        A += BLOCK_K * stride_ak
        B += BLOCK_K * stride_bk

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_q = tl.program_id(1)  # batch dimension for BMM
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_q", "idx_m", "idx_n"), "acc", "mask")}}
)namegridsourcezat::bmm_outzat::baddbmm_out)op_overloadlayoutc             
      sD  t dd | |fD r`|  d dks| d dkr2t| d} t|d}tjt| |ddS dd }d	d
   fdd}|| rPtjjj	d }|| |} ||r`tjjj	d }|||}t
| ||d\}}}	}} }td d| d| d|	   d7  < td|||	|  | | t rt| |f|gng }
t|rt|||	t| dD ]}tj|
f| |f|dt||||	| qt|\}}|r|rt||||	rddlm} ||
|| |g t|| |rddlm } |!|
|| |g t"||||	r	t#$|
|| |g t%|
r|
&t| |f| t'd|
| |g|S )Nc                 s   s    | ]
}|  jd kV  qdS )r%   N)
get_devicetype).0xr   r   r    	<genexpr>   s    ztuned_bmm.<locals>.<genexpr>r   r   )axisc                 S   s,   t | sdS t j| dd\}}t|t jS )NTF)freeze)r   is_storage_and_layoutas_storage_and_layout
isinstanceFlexibleLayout)t_r0   r   r   r    is_valid_to_require_contiguous   s   
z1tuned_bmm.<locals>.is_valid_to_require_contiguousc                 S   sP   |d dko| d dkp|d | d kp'|d dko'| d dkp'|d | d kS )Nr6   r   r   )sizesstridesr   r   r     is_preferred_layout_as_bmm_input   s   &&z3tuned_bmm.<locals>.is_preferred_layout_as_bmm_inputc                    s6   |j d  }|j d  } ||stj| } | S )Nval)r   sizestrider   ExternKernelrequire_contiguous)r=   meta_trA   rB   rC   r   r    may_require_contiguous   s
   
z)tuned_bmm.<locals>.may_require_contiguousr   r/   aten_mm_infoz	aten.bmm_r>   zPTuned aten.bmm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr(   input_nodesr0   )CUTLASS3xGemmTemplate)CppBmmTemplater*   )(allget_sizeL	unsqueezesum_mulr   graphcurrent_nodeargsr   r   loginfo	get_dtyper   aten_bmmbindr   r)   r   get_device_typebmm_templatemaybe_append_choicer   r   r   codegen.cuda.gemm_templaterP   add_cutlass_gemm_choicesr   codegen.cpp_bmm_templaterQ   add_choicesr   r   add_ck_gemm_choicesr   appendr   )mat1mat2r0   r?   rK   	meta_mat1	meta_mat2r   r   r#   choicesconfigstatic_shape
is_nonzerorP   rQ   r   rJ   r    	tuned_bmm   sh    


&
rq   )alphabetar0   c             
   C   s  t ||| |d\}}}}}}} td d| d| d|   d7  < td|||| | |  | t rDtj| ||f|||dgng }	t|rwt	|||t
|dD ]!}
tj|	f| ||f|d	t|
||||dt|j||d
 qUtd|	| ||g|S )Nr/   rL   zaten.baddbmm_r>   r   z\Tuned aten.baddbmm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, inp=%s, output_layout=%s)rr   rs   rM   rN   )prefix_argsepilogue_fnbaddbmm)r   r   r[   r\   r]   r   aten_baddbmmr_   r   r)   r   r`   ra   rb   r   r   dtyper   )inpri   rj   rr   rs   r0   r   r   r#   rm   rn   r   r   r    tuned_baddbmm   s<   &	rz   )/loggingtorchtorch._dynamo.utilsr   7torch._inductor.codegen.rocm.ck_universal_gemm_templater    r   r   rT   select_algorithmr   r   r	   r
   utilsr   r   r   r   r   virtualizedr   	mm_commonr   r   r   r   r   r   	getLogger__name__r[   opsatenr!   r$   r)   ra   r*   r^   rv   outrw   register_loweringrq   rz   r   r   r   r    <module>   s8    


G

V