o
    Ih                     @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZ d dlZd dlZd dlZddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl#m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3m4Z4 ddl5m6Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZB ddlCmDZD ddlEmFZF ddlGmHZHmIZImJZJ erd dlKmLZL ddl*mMZMmNZN eOePZQdd ZRG dd deSZTG d d! d!e@ZUeU jVZWe@ jVZXejYd"ejZd#ej[d$ej\d%ej]d&ej^d'ej_d(ej`d)ejad*ejbd+ejcd,ejdd-ejed.iZfd/d0 Zgd1d2 ZhG d3d4 d4e?Zieijd5 G d6d7 d7e;ZkejlG d8d9 d9Zmd:d; Znd<d= ZoG d>d? d?eIZpG d@dA dAeJZqdS )B    )annotationsN)defaultdict)inf)AnyCallablecastOptionalTYPE_CHECKINGUnion   )is_integer_dtype)
OrderedSet)FloorDivModularIndexing)symbol_is_typeSymT)ValueRanges   )configir)HalideCodeCache)get_reduction_combine_fn)is_metric_table_enabledlog_kernel_metadata)AddParenHandler)HalideInputSpec
HalideMeta)get_bounds_index_exprget_kernel_metadataparallel_num_threadssympy_index_symbol
sympy_subs)_opsV   )	BackendFeatureCSEVariableDeferredLineIndentedBufferKernelArgTypeOpOverridesPythonPrinterSizeArg	TensorArg)DTYPE_TO_CPP)cexpr)constant_repr
SIMDKernelSIMDScheduling)Sequence)ReductionType	StoreModec                 C  sv   t | tr*d|   krdks*n ttj}| |jkrdS | |jkr$dS d| dS t | tr7dt|  dS t	| S )Ni   izhl.Int(64).min()zhl.Int(64).max()zhl.i64()zhl.f64()

isinstanceinttorchiinfoint64minmaxfloatr0   repr)valinfo rB   R/var/www/vscode/kcb/lib/python3.10/site-packages/torch/_inductor/codegen/halide.pyhalide_constant=   s    


rD   c                      s   e Zd Zd fddZ  ZS )UnsupportedreturnNonec                   s   t  d|  d S )Nz!halide backend does not support: )super__init__)selfthing	__class__rB   rC   rI   K      zUnsupported.__init__rF   rG   )__name__
__module____qualname__rI   __classcell__rB   rB   rL   rC   rE   J   s    rE   c                      s   e Zd Zedd Zedd Zdd Zdd Zd	d
 Zdd Z	e	Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Z fd+d,Zd-d. ZeZd/d0 Zd1d2 Z  ZS )3HalidePrinterc                 C  s   dt jj d|  dS )Nhl.cast(, r6   )r#   kernelindex_dtypeexprrB   rB   rC   
cast_indexP   s   zHalidePrinter.cast_indexc                 C     d|  dS )Nhl.cast(hl.Float(32), r6   rB   rY   rB   rB   rC   
cast_floatT      zHalidePrinter.cast_floatc                 C  s   d| dS )Nhl.f32(r6   rB   rJ   rZ   rB   rB   rC   _print_FloatX   s   zHalidePrinter._print_Floatc                 C  *   t |jdks	J d| |jd  dS )Nr$   r`   r   r6   lenargs_printra   rB   rB   rC   _print_ToFloat[      zHalidePrinter._print_ToFloatc                 C  0   t |jdks	J | d| |jd  dS )Nr$   	hl.floor(r   r6   re   rf   r[   rg   ra   rB   rB   rC   _print_floor_      zHalidePrinter._print_floorc                 C  rj   )Nr$   	hl.trunc(r   r6   rl   ra   rB   rB   rC   _print_Truncc   rn   zHalidePrinter._print_Truncc                 C  rj   )Nr$   hl.ceil(r   r6   rl   ra   rB   rB   rC   _print_ceilingi   rn   zHalidePrinter._print_ceilingc                 C  s   d|  | | dS Nzhl.sqrt(r6   )r^   rg   ra   rB   rB   rC   _helper_sqrtm   s   zHalidePrinter._helper_sqrtc                 C  sH   |  |jd }|  |jd }|  |jd }d| d| d| dS )Nr   r$   r   
hl.select(rV   r6   )doprintrf   )rJ   rZ   cpqrB   rB   rC   _print_Wherep   s   zHalidePrinter._print_Wherec                 C  r   t |jdkr| |jd S t |jd }| tj|jd |  }| tj|j|d   }d| d| dS )Nr$   r   r   hl.min(rV   r6   )re   rf   rg   sympyMinrJ   rZ   midabrB   rB   rC   
_print_Minv   s   zHalidePrinter._print_Minc                 C  r{   )Nr$   r   r   hl.max(rV   r6   )re   rf   rg   r}   Maxr   rB   rB   rC   
_print_Max   s   zHalidePrinter._print_Maxc                 C  rj   )Nr$   hl.abs(r   r6   rl   ra   rB   rB   rC   
_print_Abs   rn   zHalidePrinter._print_Absc                 C  rc   )Nr$   zhl.cos((r   r6   rd   ra   rB   rB   rC   _print_OpaqueUnaryFn_cos   ri   z&HalidePrinter._print_OpaqueUnaryFn_cosc                 C  rc   )Nr$   z	hl.cosh((r   r6   rd   ra   rB   rB   rC   _print_OpaqueUnaryFn_cosh   ri   z'HalidePrinter._print_OpaqueUnaryFn_coshc                 C  rc   )Nr$   z	hl.acos((r   r6   rd   ra   rB   rB   rC   _print_OpaqueUnaryFn_acos   ri   z'HalidePrinter._print_OpaqueUnaryFn_acosc                 C  rc   )Nr$   zhl.sin((r   r6   rd   ra   rB   rB   rC   _print_OpaqueUnaryFn_sin   ri   z&HalidePrinter._print_OpaqueUnaryFn_sinc                 C  rc   )Nr$   z	hl.sinh((r   r6   rd   ra   rB   rB   rC   _print_OpaqueUnaryFn_sinh   ri   z'HalidePrinter._print_OpaqueUnaryFn_sinhc                 C  rc   )Nr$   z	hl.asin((r   r6   rd   ra   rB   rB   rC   _print_OpaqueUnaryFn_asin   ri   z'HalidePrinter._print_OpaqueUnaryFn_asinc                 C  rc   )Nr$   zhl.tan((r   r6   rd   ra   rB   rB   rC   _print_OpaqueUnaryFn_tan   ri   z&HalidePrinter._print_OpaqueUnaryFn_tanc                 C  rc   )Nr$   z	hl.tanh((r   r6   rd   ra   rB   rB   rC   _print_OpaqueUnaryFn_tanh   ri   z'HalidePrinter._print_OpaqueUnaryFn_tanhc                 C  rc   )Nr$   z	hl.atan((r   r6   rd   ra   rB   rB   rC   _print_OpaqueUnaryFn_atan   ri   z'HalidePrinter._print_OpaqueUnaryFn_atanc                   sT   |j r	t |S |j\}}| | |}| | |}| d| d| dS )Nrk   z / r6   )
is_integerrH   _print_FloorDivrf   r^   rv   r[   )rJ   rZ   xdivrL   rB   rC   r      s   
zHalidePrinter._print_FloorDivc                 C  rj   )Nr$   	hl.round(r   r6   rl   ra   rB   rB   rC   _print_Round   rn   zHalidePrinter._print_Roundc                 C  s   |j \}}d| d| dS )N() / (z+hl.f32(0)))rf   )rJ   rZ   r   r   rB   rB   rC   _print_IntTrueDiv   s   
zHalidePrinter._print_IntTrueDivc                 C  s>   |j \}}| |}t|}dd|  d| dd| dS )Nr`   g      $@z)*hl.round((z	)*hl.f32()))rf   rg   r8   )rJ   rZ   r@   nrB   rB   rC   _print_RoundDecimal   s   

"z!HalidePrinter._print_RoundDecimal) rP   rQ   rR   staticmethodr[   r^   rb   rh   rm   rp   _print_TruncToIntrr   rt   rz   r   r   r   r   r   r   r   r   r   r   r   r   r   r   _print_RoundToIntr   r   rS   rB   rB   rL   rC   rT   O   s<    

	
	rT   z	hl.Bool()zhl.BFloat(16)zhl.Float(16)zhl.Float(32)zhl.Float(64)z	hl.Int(8)z
hl.Int(16)z
hl.Int(32)z
hl.Int(64)z
hl.UInt(8)zhl.UInt(16)zhl.UInt(32)zhl.UInt(64)c                 C  s   t |  S N)_halide_typedtyperB   rB   rC   halide_type      r   c                 C  s<   t | r| jr| tjkrtj} | tjtjfv rtj} t| S r   )	r   	is_signedr9   r;   int32float16bfloat16float32r   r   rB   rB   rC   halide_acc_type   s
   r   c                   @  s  e Zd Ze		ddddZedd	d
Zedd Zedd Zedd Z	edd Z
edd Zedd Zedd Zedd Zedd Zedd Zedd  Zed!d" Zed#d$ Zed%d& Zed'd( Zed)d* Zed+d, Zed-d. Zed/d0 Zed1d2 Zed3d4 Zed5d6 Zed7d8 Zed9d: Zed;d< Zed=d> Z ed?d@ Z!edAdB Z"edCdD Z#edEdF Z$edGdH Z%edIdJ Z&edKdL Z'edMdN Z(edOdP Z)edQdR Z*edSdT Z+edUdV Z,edWdX Z-edYdZ Z.ed[d\ Z/ed]d^ Z0ed_d` Z1edadb Z2edcdd Z3ededf Z4edgdh Z5edidj Z6edkdl Z7edmdn Z8edodp Z9edqdr Z:edsdt Z;edudv Z<edwdx Z=edydz Z>ed{d| Z?ed}d~ Z@edddZAedd ZBedd ZCedd ZDdS )HalideOverridesNTr   torch.dtype	src_dtypeOptional[torch.dtype]c                 C  s,   |t jkrd|  dS dt| d|  dS )Nr   z != 0)rU   rV   r6   )r9   boolr   )r   r   r   use_compute_typesrB   rB   rC   to_dtype   s   
zHalideOverrides.to_dtypec                 C  s\   |t jt jfv rdt| d|  d} dt| d|  d}|t jt jfv r,d| d}|S )NrU   rV   r6   zhl.reinterpret(r]   )r9   r   r   r   )r   r   r   linerB   rB   rC   to_dtype_bitcast   s   z HalideOverrides.to_dtype_bitcastc                 C  s   |  t||S r   )r   rD   )clsvaluer   rB   rB   rC   constant  s   zHalideOverrides.constantc                 C  r\   )Nr   r6   rB   r   rB   rB   rC   abs  r_   zHalideOverrides.absc                 C  s0   t | dsd|  dS d|  d| j d|  dS )Nnamehl.exp(r6   z"hl.fast_exp(hl.cast(hl.Float(32), z)) if z!.type().bits() <= 32 else hl.exp(hasattrr   r   rB   rB   rC   exp
  s   
zHalideOverrides.expc                 C  r\   )Nr   r6   rB   r   rB   rB   rC   libdevice_exp  r_   zHalideOverrides.libdevice_expc                 C  r\   rs   rB   r   rB   rB   rC   sqrt  r_   zHalideOverrides.sqrtc                 C  h   t | dsd|  d| dS d| j d| d}d|  d| d	|  d
|  d| d| j d|  d| dS )Nr   r|   rV   r6   rU   	.type(), hl.select((<)|hl.is_nan(), ) if z.type().is_float() else hl.min(r   r   r   rB   rB   rC   minimum     
8zHalideOverrides.minimumc                 C  r   )Nr   r   rV   r6   rU   r   r   >r   r   r   z.type().is_float() else hl.max(r   r   rB   rB   rC   maximum   r   zHalideOverrides.maximumc                 C  s6   t |drd|j d| d}d|  d| d| dS )Nr   rU   r   r6   ru   rV   r   )r   r   rw   rB   rB   rC   where(  s   
zHalideOverrides.wherec                 C  r\   )Nzhl.cos(r6   rB   r   rB   rB   rC   cos.  r_   zHalideOverrides.cosc                 C  r\   )Nzhl.sin(r6   rB   r   rB   rB   rC   sin2  r_   zHalideOverrides.sinc                 C     t d)NlgammarE   r   rB   rB   rC   r   6     zHalideOverrides.lgammac                 C  r\   )Nzhl.erf(r6   rB   r   rB   rB   rC   erf:  r_   zHalideOverrides.erfc                 C  r\   )Nzhl.cosh(r6   rB   r   rB   rB   rC   cosh>  r_   zHalideOverrides.coshc                 C  r\   )Nzhl.sinh(r6   rB   r   rB   rB   rC   sinhB  r_   zHalideOverrides.sinhc                 C  r\   )Nzhl.acos(r6   rB   r   rB   rB   rC   acosF  r_   zHalideOverrides.acosc                 C  r\   )Nz	hl.acosh(r6   rB   r   rB   rB   rC   acoshJ  r_   zHalideOverrides.acoshc                 C  r\   )Nzhl.asin(r6   rB   r   rB   rB   rC   asinN  r_   zHalideOverrides.asinc                 C  r\   )Nz	hl.asinh(r6   rB   r   rB   rB   rC   asinhR  r_   zHalideOverrides.asinhc                 C     d|  d| dS )Nz	hl.atan2(rV   r6   rB   r   yrB   rB   rC   atan2V     zHalideOverrides.atan2c                 C  r\   )Nzhl.atan(r6   rB   r   rB   rB   rC   atanZ  r_   zHalideOverrides.atanc                 C  r\   )Nz	hl.atanh(r6   rB   r   rB   rB   rC   atanh^  r_   zHalideOverrides.atanhc                 C  r   )Ncopysignr   r   rB   rB   rC   r   b  r   zHalideOverrides.copysignc                 C  r   )Nerfinvr   r   rB   rB   rC   r   f  r   zHalideOverrides.erfinvc                 C  r   )Nz	hl.hypot(rV   r6   rB   r   rB   rB   rC   hypotj  r   zHalideOverrides.hypotc                 C  r   )N	nextafterr   r   rB   rB   rC   r   n  r   zHalideOverrides.nextafterc                 C     |  d| S Nz & rB   r   rB   rB   rC   logical_andr     zHalideOverrides.logical_andc                 C  s
   |  dS )Nz == 0rB   r   rB   rB   rC   logical_notv     
zHalideOverrides.logical_notc                 C  r   Nz | rB   r   rB   rB   rC   
logical_orz  r   zHalideOverrides.logical_orc                 C  r   )Nr    ^ r6   rB   r   rB   rB   rC   logical_xor~  r   zHalideOverrides.logical_xorc                 C  r   r   rB   r   rB   rB   rC   bitwise_and  r   zHalideOverrides.bitwise_andc                 C  s
   d|  S )N~rB   r   rB   rB   rC   bitwise_not  r   zHalideOverrides.bitwise_notc                 C  r   r   rB   r   rB   rB   rC   
bitwise_or  r   zHalideOverrides.bitwise_orc                 C  r   )Nr   rB   r   rB   rB   rC   bitwise_xor  r   zHalideOverrides.bitwise_xorc                 C  r   )Nz << rB   r   rB   rB   rC   bitwise_left_shift  r   z"HalideOverrides.bitwise_left_shiftc                 C  r   )Nz >> rB   r   rB   rB   rC   bitwise_right_shift  r   z#HalideOverrides.bitwise_right_shiftc                 C  r   )Nzhalide_helpers.rand(rV   r6   rB   seedoffsetrB   rB   rC   rand  r   zHalideOverrides.randc                 C  r   )Nzhalide_helpers.randn(rV   r6   rB   r   rB   rB   rC   randn  r   zHalideOverrides.randnc              	   C  s   d|  d| d| d| d	S )Nzhalide_helpers.randint64(rV   r6   rB   )r   r   lowhighrB   rB   rC   	randint64  s   zHalideOverrides.randint64c                 C  s"   t | d dtjjd| S )Nr    + load_seed_offset)opsloadr#   rW   rf   seed_offset)r   r   rB   rB   rC   	load_seed  s   "zHalideOverrides.load_seedc                 C  r\   )Nz1./hl.sqrt(r6   rB   r   rB   rB   rC   rsqrt     zHalideOverrides.rsqrtc                 C  r\   )Nzhl.tan(r6   rB   r   rB   rB   rC   tan  r_   zHalideOverrides.tanc                 C  r\   )Nzhl.tanh(r6   rB   r   rB   rB   rC   tanh  r_   zHalideOverrides.tanhc                 C  r\   )Nz3(hl.reinterpret(hl.UInt(32), hl.cast(hl.Float(32), z)) >> 31) != 0rB   r   rB   rB   rC   signbit  r_   zHalideOverrides.signbitc                 C  s   |  d|  d| d| S )Nz - hl.trunc(/z)*rB   r   rB   rB   rC   fmod  s   zHalideOverrides.fmodc                 C  r   )Nzhl.pow(rV   r6   rB   r   rB   rB   rC   pow  r   zHalideOverrides.powc                 C  r\   )Nzhl.log(r6   rB   r   rB   rB   rC   log  r_   zHalideOverrides.logc                 C  r\   )Nz hl.is_inf(hl.cast(hl.Float(32), r   rB   r   rB   rB   rC   isinf  r  zHalideOverrides.isinfc                 C  r\   )Nz hl.is_nan(hl.cast(hl.Float(32), r   rB   r   rB   rB   rC   isnan  r  zHalideOverrides.isnanc                 C  r\   )Nr   r6   rB   r   rB   rB   rC   round  r_   zHalideOverrides.roundc                 C  r\   )Nrk   r6   rB   r   rB   rB   rC   floor  r_   zHalideOverrides.floorc                 C  r   )Nr   r   z + hl.f32(0))rB   r   rB   rB   rC   int_truediv  r   zHalideOverrides.int_truedivc                 C     d| j  d|  d| dS )Nz"hl.floor(hl.cast(hl.Float(max(32, .type().bits())), ) / r6   r   r   rB   rB   rC   floordiv  s   zHalideOverrides.floordivc                 C  sL   t t d|tj}t t |dtj}t ||}d|j d| dS )N0rU   r   r6   )r  r   ltr9   int8subr   )r   r   leftrightr  rB   rB   rC   sign  s   zHalideOverrides.signc                 C  r\   )Nro   r6   rB   r   rB   rB   rC   trunc  r_   zHalideOverrides.truncc                 C  r  )Nz"hl.trunc(hl.cast(hl.Float(max(32, r  r  r6   r  r   rB   rB   rC   truncdiv  s   zHalideOverrides.truncdivc                 C  r\   )Nrq   r6   rB   r   rB   rB   rC   ceil  r_   zHalideOverrides.ceilc                 C  r\   )Nr   z, 0)rB   r   rB   rB   rC   relu  r_   zHalideOverrides.reluc                 C  sR   t j|}t jjt j|t j|t|d}|tjtj	fvr't
||S |S Nbounds)r#   rW   prepare_indexinggenfuncindex_to_strused_dims_from_indexr   r9   r   r;   r  r   )r   rZ   r   indexvarrB   rB   rC   
index_expr   s   

zHalideOverrides.index_exprc                 C  s.   t |tj}t |||}||_tt|S r   )r  r   r9   r   halide_clampindirect_indexing_sizer    str)r   	index_varsizecheckwrap_negrB   rB   rC   indirect_indexing  s   z!HalideOverrides.indirect_indexingc                 C  sN   t jt j|d }t|ttjfsd|j d| d}d| d| dS )Nr$   rU   r   r6   z	hl.clamp(z, 0, )	r#   rW   kexprrename_indexingr7   r8   r}   Integerr   )r   r   r3  r4  endrB   rB   rC   r/    s   zHalideOverrides.halide_clampc                 C  s~   t j| |}| }W d    n1 sw   Y  |jjr"t|}t jjd|j dt| dg t	
|d}t|||S )NrU   r   r6   r&  )r#   rW   
mask_loadsr'  is_boolr   r)  r   rD   r   wrapr  r   )maskbodyothernew_maskresultrB   rB   rC   masked  s   zHalideOverrides.maskedc                 C  r   )Nfrexp)NotImplementedErrorr   rB   rB   rC   rD  .  r   zHalideOverrides.frexp)NT)r   r   r   r   )r   r   r   r   )TT)ErP   rQ   rR   r   r   r   classmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r	  r
  r  r  r  r  r  r  r  r  r  r  r   r!  r"  r#  r$  r.  r6  r/  rC  rD  rB   rB   rB   rC   r      s   




























































r   halidec                      sN   e Zd ZedZ	dd fd	d
Zdd Zdd ZdddZ	dd Z
  ZS )HalideCSEVariablez\b(tmp\d+)\[\?\]Nr'  ValueRanges[Any]r   r   rF   rG   c                   s   t  ||| d | _d S r   )rH   rI   	used_dimsrJ   r   r'  r   rL   rB   rC   rI   9  s   
zHalideCSEVariable.__init__c                 C  sd   t | jpd}t|| D ]}t|tr(|jd us"J |||f||j qtj	
|| _d S )NrB   )r   rJ  	itertoolschainvaluesr7   rH  updater#   rW   sort_used_dims)rJ   r   rf   kwargsusedargrB   rB   rC   update_on_argsB  s   
z HalideCSEVariable.update_on_argsc                 C  s6   t |dkr| j dS | j ddtt| dS )Nr   z[()][rV   ])re   r   joinmapr1  )rJ   dimsrB   rB   rC   	index_strJ  s   zHalideCSEVariable.index_strr1  c                 C  s"   | j d u r| j dS | | j S )Nz[?])rJ  r   rZ  rJ   rB   rB   rC   __str__P  s   
zHalideCSEVariable.__str__c                   s<   | j d urtdd | j D sJ |  fdd| j D S )Nc                 s      | ]	}t |tjV  qd S r   r7   r}   Expr.0r   rB   rB   rC   	<genexpr>W      
z-HalideCSEVariable.subs_str.<locals>.<genexpr>c                   s   g | ]}  ||qS rB   )getra  r   replacementsrB   rC   
<listcomp>Z      z.HalideCSEVariable.subs_str.<locals>.<listcomp>)rJ  allrZ  )rJ   rg  rB   rf  rC   subs_strV  s   zHalideCSEVariable.subs_strr   )r'  rI  r   r   rF   rG   )rF   r1  )rP   rQ   rR   recompileundefined_rerI   rT  rZ  r\  rk  rS   rB   rB   rL   rC   rH  6  s    
	
rH  c                      sB   e Zd ZU ded< ded< ded< d fdd	ZdddZ  ZS )DimensionInfozOptional[sympy.Expr]rZ   
sympy.Exprr3  striderF   rG   c                   s<   t    tjj|dr| }| }|| _|| _|| _d S Nr   )	rH   rI   r#   graphsizevarsstatically_known_ltrZ   r3  rq  )rJ   rZ   r3  rq  rL   rB   rC   rI   c  s   

zDimensionInfo.__init__NFc                 C  s   | j d usJ | j }|r|dkrdS |rHi |}|jD ]'}t|tjrBt|tjs+J tj	
|j}t|ts9J t||||< qt||}tj	|S )Nr   hl.Var())rZ   free_symbolsr   r   TMPr7   r}   Symbolr#   rW   lookup_cse_varr   rH  r    rk  r!   r*  )rJ   rg  	zero_varsrZ   symr-  rB   rB   rC   rZ  l  s   

zDimensionInfo.index_strrO   NF)rP   rQ   rR   __annotations__rI   rZ  rS   rB   rB   rL   rC   ro  ]  s   
 	ro  c                 C  sj   t jj| |r
dS zt jj| }t jj|}W n
 ty$   Y dS w ||kr1t jj| | ||kS NTF)r#   rs  rt  statically_known_equals	size_hint	TypeErrorguard_equals)r  r  r   r   rB   rB   rC   eq}  s   r  c                 C  s   t jj| |r
dS zt jj| }t jj|}W n ty4   t| |}|| kr1| |k Y S Y dS w ||k rAt jj| | ||k S r  )	r#   rs  rt  ru  r  r  r}   gcdguard_lt)r  r  r   r   r  rB   rB   rC   r    s   r  c                      sZ  e Zd ZU eZeZded< de fddZdfddZ	dgddZ
dh fddZdd Zdd Zdi fddZdd Zdjd"d#Zd$d% Zd&d' Zdid(d)Zd*d+ Zdkd-d.Zdld0d1Zdmd2d3Z	dndod8d9Zdpd>d?Zd@dA ZdqdHdIZe dJdrdLdMZdrdNdOZdmdPdQZ dRdS Z!dsdUdVZ"dndWdXZ#e$dYdZ Z%dndmd[d\Z&d]d^ Z'dtdcddZ(  Z)S )uHalideKernelzCallable[[sympy.Expr], str]r7  tilingdict[str, sympy.Expr]rF   rG   c                   s|   t  j|fi | | j| _| j| _| j| _t | _| j| _	| j| _
i | _i | _i | _i | _i | _i | _tt| _d| _d S r}  )rH   rI   r?  computeloadsstoresr(   indexing_code_dominside_reductionneeds_dom_indexinghas_reductionbuffer_dimensionsbuffer_offsetshalide_varsindex_replacementsreduction_renamesdom_renamesr   listbuffer_aliaseshas_indirect_indexing)rJ   r  rQ  rL   rB   rC   rI     s   

zHalideKernel.__init__r   r   r1  c                 C  s   t |S r   )r   )rJ   r   rB   rB   rC   dtype_to_str  r   zHalideKernel.dtype_to_strNc                 C  s$   | j | d|d t|||S )Nz = hl.Func(r6   )r?  	writelinerH  rK  rB   rB   rC   create_cse_var  s   zHalideKernel.create_cse_varindicesSequence[sympy.Expr]c                   s  j s	js	jrJ tjtjjjt	dt
tt j|}tt   dd tjdd jD D dd  fdd	} fd
d}|D ]8}|trc|ttdtdtd| |trw|ttdtd|  t |j qItdd  D _d}tjD ]} fdd|j ! D }|j"fddd |s|#|$d|j% d}tj&j'g }	|t(|k rt)|j%sfdd|D }
|t(|
7 }|
sJ |t*tjjj+|
 |
,fdd|D  |
rt*tj-|
t)dr$|j% t)drJ g }
t(|}d}t.dt(j }|j/r?t.dt(j j|< j|< |	#|f 9 fdd|D }|t(|7 }t(|
}fdd|
D }
t(|
|k sy|dksyJ |
,| |
s |t(|k rt)|j%r|D ]}zRd}dt)|j0s|	| \}}|d7 }|9 t)|j0rd}tj&j1}t)|j2|s|	| \}}|d7 }||| 7 }||9 }t)|j2|r|j |3 < W q t4y"   |sJ tj&j1}tj&j'}|	D ]\}}||| 7 }||9 }qtjj5t||j0|j2jj |3 < Y qw qjD ]}j67| d|j8d  q'jrN9d!fd"dj: D  d#S d#S )$a  
        Hook called right before codegen with every index that will be
        used in the fused kernel.

        This populates self.halide_vars/index_replacements/reduction_renames which is an alternate indexing
        scheme that avoids using divide and modulus.  Instead of xindex/yindex/rindex
        we base indexing on a larger number of vars whose product combines to those.

        This function populates self.halide_vars, self.index_replacements, and self.reduction_renames
        fallbackc                 S  s   i | ]}|  |qS rB   symbolre  rB   rB   rC   
<dictcomp>  s    z2HalideKernel.finalize_indexing.<locals>.<dictcomp>c                 S  s   g | ]}|j  qS rB   )nodesrN  )ra  treerB   rB   rC   rh        z2HalideKernel.finalize_indexing.<locals>.<listcomp>c                 S  s   t tjj| S r   )r}   simplifyr#   rs  rt  remove_precomputed_replacementsrY   rB   rB   rC   r    s   z0HalideKernel.finalize_indexing.<locals>.simplifyc                   sJ   | v r#|  }  |j|j| tjj|t|j	|
  d S d S r   )addrootlookupdivisorr#   rs  rt  evaluate_minr   lengthr  )baser  modulusnodeall_used_symbolssym_to_noderB   rC   visit_modular_indexing  s   z>HalideKernel.finalize_indexing.<locals>.visit_modular_indexingc                   s>   | v r|  }  |j|j| t|j|  d S d S r   )r  r  r  r  r   r  r  )r  r  r  r  rB   rC   visit_floor_div  s   
z7HalideKernel.finalize_indexing.<locals>.visit_floor_divr  r  r  c                 s  r]  r   )r   r   INDIRECTra  r|  rB   rB   rC   rb    rc  z1HalideKernel.finalize_indexing.<locals>.<genexpr>Fc                   s   g | ]
}|   v r|qS rB   r  re  )r  rB   rC   rh        c                   s
    | j S r   r  )r   )r  rB   rC   <lambda>  s   
 z0HalideKernel.finalize_indexing.<locals>.<lambda>keyr$   r   c                   s"   g | ]}t |j r|jqS rB   r  r  r  re  )r  r  rB   rC   rh    s
    c                   s2   g | ]}t  |jrt |jr|j  qS rB   )r  r  re  )r  r:  r  rB   rC   rh     s    

Thhrc                   s   g | ]}t |j r|jqS rB   r  re  r  rB   rC   rh  8  s    c                   s$   g | ]}t | st|  qS rB   )r  r}   r  ra  s)	next_sizerB   rC   rh  ;  s    z
 = hl.Var(r6   rdomc                      i | ]
\}}| j | qS rB   r  ra  vrvr[  rB   rC   r  h  r  N);r  r  r  	functoolspartialr#   rs  rt  r  r   dictfromkeysrX  rH   r(  r   r   rL  rM  from_iterablerange_treeshasr   replacer}   Wildr   rO  rw  anyr  reversedr  rN  sortappendr  numelSOnere   r  reduceevaluate_maxextendr  r    is_reductionr  Zeror  r  
IndexErrorsimplify_with_rangesindexing_coder  r   codegen_rdomitems)rJ   r  r  r  r,  had_fallbackr  r  handled_countadded_sym_sizesizes_to_addr|  	new_sizes	prior_lenr  idxr3  r  rZ   
full_indexrq  rL   )r  r  r:  r  rJ   r  r  r  rC   finalize_indexing  s   








.

zHalideKernel.finalize_indexingc                   s    j rdnd}| jv r j| S i } j D ]$} j s#| jv r#qtd|j}|s.J td| |	d ||< q 
| d fdd| D  | j|< |S )	zCRDom based indexing uses explicit iteration ranges for Func updatesioz^h(\d+)$r  r$   domc                   r  rB   r  r  r[  rB   rC   r  z  r  z3HalideKernel.setup_dom_indexing.<locals>.<dictcomp>)r  r  r  keysr  rl  matchr   r    groupr  r  )rJ   prefixrenamesr-  mrB   r[  rC   setup_dom_indexingk  s   


zHalideKernel.setup_dom_indexingc              	     sl    fdd|  D } j| dd| d t| D ]\}} j| d| d| d q d S )	Nc                   s$   g | ]}d    | dqS )hl.Range(0, r6   )r7  r8  )ra  r3  r[  rB   rC   rh    s    z-HalideKernel.codegen_rdom.<locals>.<listcomp>z = hl.RDom([rV   ]) = rU  rV  )rN  r  r  rW  	enumerater  )rJ   r   varsrsizesr  rsymrB   r[  rC   r    s   
 zHalideKernel.codegen_rdomr,  rp  c                   s*   t  |}t|| j}tjj|| jS r   )	rH   r(  r!   r  r#   rs  rt  r  r  )rJ   r,  rL   rB   rC   r(    s   zHalideKernel.prepare_indexingc                 C  s$   t |tjr| |jjS | j| S )zThe size of an index symbol)r   r   rx  rz  r   r0  r  )rJ   r|  rB   rB   rC   sym_size  s   
zHalideKernel.sym_sizer-  is_storer   c                   s4  g t |jdd dD ] }t|tjtjfr| qt|tjtjtj	fs+J |qt
jj}dd D g }t
|}t|t
jrJ|jn|gD ]i}fdd|jD tdkrd||7 }qMtd	krud   |7  < qMg }tt|D ]0}	||	 d
usJ ||	 \}
}t|
t@ rfdd|
D  ||7 }q}||
|f q}g ||f}qM fdd}g }|D ]\}}|D ]	}||7 }q|||| qÈ D ]\}}||||g q|jdd d |s	jr|tt
jjd	d	 n tjj|d jd	s)| dtt
jj r!d	n|d jd	 |rc sc|j!v rRtjj"|j!| rR#||j!|   j!| }ntjj$|drc#|| d}|}t%& D ].}	'||| rz||f  S  rJ | d|	 }|j(| vrj(| | qid
S )zEConvert address-based indexing into dimensions using self.halide_varsc                 S  s   | j S r   r  r   rB   rB   rC   r    s    z5HalideKernel.indexing_to_dimensions.<locals>.<lambda>r  c                 S  s   i | ]}|t jjqS rB   )r}   r  r  r  rB   rB   rC   r    r  z7HalideKernel.indexing_to_dimensions.<locals>.<dictcomp>c                      g | ]}| v r|qS rB   rB   ra  r  )
split_exprrB   rC   rh    ri  z7HalideKernel.indexing_to_dimensions.<locals>.<listcomp>r   r$   Nc                      g | ]}| vr|qS rB   rB   r  )	part_varsrB   rC   rh    ri  c                   s   t | } t|dkr,t jdd}| ||d  }|r,t|d |d || S  r2J | t t| fdd|D d }t j	j
}t| t jrn| jD ]}t|t jrm||9 }t | | } t t || }qPt| ||S )Nr$   wild)excluder   c                   s   i | ]
}|  |d  qS )r$   )r  r  r[  rB   rC   r    r  zRHalideKernel.indexing_to_dimensions.<locals>.expr_to_dimension.<locals>.<dictcomp>)r}   factorre   r  r  ro  r  r  r!   r  r  r7   Mulrf   r9  ceiling)rZ   symsstride_wildr  r  rq  term)r  rJ   symbolsrB   rC   expr_to_dimension  s*   

z>HalideKernel.indexing_to_dimensions.<locals>.expr_to_dimensionc                 S  s   t jjj| jtdS )Nr  )r#   rs  rt  r  rq  r   )drB   rB   rC   r    s    _view))sortedrw  r   r   HALIDErx  r  UNBACKED_INTSIZEPRECOMPUTED_SIZEr}   r  r  expandr8  r7   Addrf   re   ranger   r  popr  r  r  ro  r#   rs  rt  r  rq  insertr  statically_known_geqapply_offset_to_dimensionstatically_known_gtrL  countinstall_dimsr  )rJ   r-  r,  r  r|  r   split_failedpartnew_split_failedr  
other_vars
other_partr  rY  r
  rZ   r  orig_varrB   )r  r  rJ   r  r  rC   indexing_to_dimensions  s   


 

z#HalideKernel.indexing_to_dimensionsc                 C  s   || j vr|| j |< || j|< dS | j| |ks#t| j | t|kr%dS |r.| j | |kS t| j | |D ]'\}}|j|jkrC dS |j|jksO|j|jkr]tjj	
|j|j|_d|_q6dS )z>Try to set self.buffer_dimensions[var], return True on successTFN)r  r  re   ziprq  r3  rZ   r#   rs  rt  r  )rJ   r-  rY  r   r  oldnewrB   rB   rC   r    s(   


zHalideKernel.install_dimsc                 C  s   |dkrd S t tt|D ].}|| jdks"tjj||| jr<t||| j}|||| j 8 }||  j	|7  _	q|dksCJ d S )Nr   r$   )
r  r  re   rq  r#   rs  rt  r  r   rZ   )rJ   rY  r   r  r!  rB   rB   rC   r    s   
z&HalideKernel.apply_offset_to_dimensionc                 C  s   t tj  }|jD ]I}t|tjsJ t|tjr2| |j	}t|t
r)|jdus+J ||j q	t|tjr>|| q	t|tjtjtjtjfrLq	td| | |S )zIDetect which range trees are used to populate HalideCSEVariable.used_dimsNzunhandled symbol )r   r}   ry  rw  r7   r   r   rx  rz  r   rH  rJ  rO  r  r  r  r  r  INDEXrE  rP  )rJ   r,  rJ  r|  cse_varrB   rB   rC   r+    s"   

z!HalideKernel.used_dims_from_indexc                   sP   t dd  D sJ  fddt| j| j D }t|t ks&J |S )Nc                 s  r]  r   r^  r`  rB   rB   rC   rb  7      z.HalideKernel.sort_used_dims.<locals>.<genexpr>c                   r   rB   rB   r  rJ  rB   rC   rh  8  s
    z/HalideKernel.sort_used_dims.<locals>.<listcomp>)rj  rL  rM  r  r  rN  re   )rJ   rJ  orderedrB   r-  rC   rP  6  s   
zHalideKernel.sort_used_dimsFc                   sH   d  fdd|D }t|dkrd}|S t|dkr"| d}|S )NrV   c                 3  s    | ]	}|  V  qd S r   )rZ  )ra  r  rg  r{  rB   rC   rb  C  r,  z.HalideKernel.make_index_str.<locals>.<genexpr>r   ()r$   ,)rW  re   )rJ   rY  rg  r{  rZ  rB   r/  rC   make_index_strB  s   
zHalideKernel.make_index_strr   c           
   
   C  s  | j |}| |}| ||d\}}| d| | d}tj|}|tj	tj
fv r6tj}d| d}| jrt| jtrE| jjdusGJ tg | || jjR }| | |}|jr| j|j d | j|j d| j d | | jp~d	}	| j| d
t| d|	 d | j| d| dt| d|j d |S | j| d| j d| dt| d |S | || |S )z"Codegen a load from an InputBufferFrU  rV  r]   r6   Nz!_mask = hl.RDom([hl.Range(0, 1)])z_mask.where(r   z = hl.cast(rV   r  z + hl.cast(z_mask)z = hl.select(z
, hl.cast(z, 0)))rf   inputr(  r&  r2  r#   rs  	get_dtyper9   r   r   r   
_load_maskr7   rH  rJ  r   r+  newfuncrP  r?  r  r   r7  _load_otherr   r)  )
rJ   r   r,  r-  rY  r   r   rJ  rB  r@  rB   rB   rC   r  K  s@   

  zHalideKernel.loadc                 C  s   | j jtdd| S )Nz\[.* )csevarname_maprl  r  rJ   r   rB   rB   rC   rz  r  rN   zHalideKernel.lookup_cse_varr   r&   moder5   c              
   C  s>  t |tsJ | j|}| |}| ||d\}}| |s$|durR|  }| ||}|	|}	d
dgt| p>d}
| jt|| d|
 d| d n| j|dd	}t|}	tj|}|du rx| d| d
t| d|	 d}n|dkr| d| dt| d|	 d}ntd| | jt|| dS )z"Codegen a store to an OutputBufferTNrV   rv  r0  rU  z] = hl.undef(z.type()))r{  z] = hl.cast(r6   
atomic_addz] += hl.cast(zstore mode=)r7   rH  rf   outputr(  r&  is_indirect_indexingr  r2  rk  rW  re   r?  r  r'   r1  r#   rs  r4  r   rE  )rJ   r   r,  r   r<  r-  rY  rg  rZ  	value_str
undef_dimsr   r   rB   rB   rC   storeu  s*   

""zHalideKernel.storer   reduction_typer4   +Union[CSEVariable, tuple[CSEVariable, ...]]c                   sT  | j sJ | jr
J |||f}|| jjv r| jj| S t|tr3|dks&J | j|  | jj|< }|S t|tr=|jdus?J t	| j
 |  fdd|jD } t	|j rj| | | t	g |j R }|| j
}tj||}	t|}
|dv r|j d| }| j| d| d| d	 g }d
}t| j
D ]%\}}|| d| d |d
kr|d  d| 7  < || j| 9 }q| j| dd|  nN|dkr| ||}nCt||
}ttt  |||}W d   n1 sw   Y  d|
 dt |	 d	}| j| d|  | j| d|  || jj|< |S )zCodegen a reduction operationwelford_combineNc                   r  rB   rB   r  reduction_varsrB   rC   rh    ri  z*HalideKernel.reduction.<locals>.<listcomp>)argmaxargmin_z = hl.z(rdom, r6   r$   rU  rV  *r  r  welford_reducerU   rV   )!r  r5  r9  reduction_cacher7   tuplewelford_combine_implrH  rJ  r   r  r6  r)  rP  rk  r   	Reductiondefault_accumulatorr   r   r?  r  r  r  r  rW  welford_reduce_fallbackr   r#   set_ops_handlerr   r   rD   )rJ   r   r   rC  r   	cache_keyresult_tuple
result_varr@  defaultacc_typer,  partsrq  r  r|  
combine_fncombine_strdefault_strrB   rF  rC   	reduction  sZ   





zHalideKernel.reductionc                 C  sv  t |tr
|jd usJ t |tr|jd usJ t |tr"|jd us$J tg |j|j|jR p3| j}|t| j8 }| | |}dd |||fD }|j}| j	
| dd| d | j	
| d| d | j	
| d| d	 | j	
| d
| d | j	
| d|| j  | j	
| d|| j  | j	
| d|| j  | j	
| d| d| d | j	
| d| d| d | j	
| d| d| d| d | d| d| d| d| d| d| d| d| d| dg}| j	
| dd| d g }	td D ]}
|	| |j | j	
|	d!  d"| d#|
 d$ qt|	S )%Nc                 S  s   g | ]	}d |j  dqS )rU   z.type(), 0)r  r`  rB   rB   rC   rh    s    z5HalideKernel.welford_combine_impl.<locals>.<listcomp>z = hl.Tuple([rV   r  z
_mean_1 = z[0]z_m2_1 = z[1]z_weight_1 = z[2]z
_mean_2 = z_m2_2 = z_weight_2 = z	_delta = z
_mean_2 - _mean_1z_new_weight = z_weight_1 + 	_weight_2z_w2_over_w = hl.select(z_new_weight == 0.0, 0.0, z_weight_2 / z_new_weight)z
_mean_1 + z	_delta * 
_w2_over_wz_m2_1 + z_m2_2 + z_weight_1 * _new_weightr   rK  r  rU  rV  )r7   rH  rJ  r   r  r  r6  rP  r   r?  r  rW  rk  r  r  rO  )rJ   meanm2weightrJ  rW  rX  pfxrO  unpackedr  rB   rB   rC   rP    sD   &&z!HalideKernel.welford_combine_impldtypestuple[torch.dtype, ...]r[  UCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]values_origtuple[CSEVariable, ...]c              
     s  j sJ t|t|ksJ g }ttj   |D ]:}t|tr%|jd us'J t|jtj@ r7|	| n|	
| g |jg jd d   |j q jritjtj@ skJ dd t||D }jd j}j d}| d}	j| d| d tjdksJ d	g j\}
|
t|	i|
t|	d it|dkrd
d }g}g}n dd }fddtt|D }fddtt|D }j d||  ttt  |||}W d    n	1 sw   Y  j d||  t|dkr3fS  fdd|D }t|D ]\}}j| d d| d qAt|S )Nr$   c                 S  s&   g | ]\}}d t | d| dqS )rU   rV   r6   )r   )ra  r   r   rB   rB   rC   rh    s    z%HalideKernel.scan.<locals>.<listcomp>rK  _rdomz.xz = hl.RDom([hl.Range(1, z)])z&multi-dimensional scan not implementedc                 S  s   | d S rr  rB   r   rB   rB   rC   maybe_tuple$  r   z&HalideKernel.scan.<locals>.maybe_tuplec                 S  s   dd |  dS )Nz
hl.Tuple([rV   r  )rW  r   rB   rB   rC   rn  +  s   c                   "   g | ]}  d | d qS rU  rV  rk  ra  r  )rW  scan_renames_prirB   rC   rh  .      c                   ro  rp  rq  rr  )rW  scan_renames_currB   rC   rh  2  rt  r  c                   s   g | ]
}  qS rB   )r6  rP  )ra  rJ  )all_used_dimsrJ   rB   rC   rh  C  r  rU  rV  )r  re   r   r}   ry  r7   rH  rJ  r  r  r)  rO  r6  rP  r'  r7  r8  r  r  r   r?  r  r    rk  r  r#   rT  r   r   r  rO  )rJ   rh  r[  rk  rN  r   initialr  scan_domscanscan_varrn  	read_left
read_rightr\  unpack_varsr  r  rB   )rv  rW  ru  rs  rJ   rC   ry    sn   




"zHalideKernel.scanr&  rH  c                C  s,   | j j| j||d}t|tsJ ||_|S r%  )r9  generater?  r7   rH  rJ  )rJ   r   rJ  r'  r-  rB   rB   rC   r)  H  s   zHalideKernel.genfuncc                 C  s"   | j  }t|tsJ ||_|S r   )r9  newvarr7   rH  rJ  )rJ   rJ  r-  rB   rB   rC   r6  P  s   
zHalideKernel.newfuncc                 C  s   t j|  S )a  
        We map all tensors to 1D buffers in Halide since Halide has trouble representing some strides that PyTorch
        supports.  If there are gaps in the underlying layout the numel we pass to Halide includes the gaps while
        PyTorch's numel excludes them.
        )r#   rs  
get_buffer
get_layoutstorage_sizer;  rB   rB   rC   halide_buffer_numelV  s   z HalideKernel.halide_buffer_numelc                   s   dd }g }| j  \}}}}tt|||dD ].\} || f t trF jdkr2 jdu s4J |	 fdd| j
 jdD  q|S )	zX
        Halide requires scalar inputs before outputs, so need to reorder args.
        c                 S  s6   | \}}t |trdS d|jv rdS d|jv sJ dS )Nr$   out_ptrr   in_ptrr   )r7   r,   r   )	arg_tuple	_call_strrS  rB   rB   rC   	arg_orderc  s   

z.HalideKernel.halide_argdefs.<locals>.arg_orderr  r   Nc              	   3  s.    | ]}d t | j j j jdfV  qd S )Nalias_of)r-   bufferr   r   r   )ra  aliasrS  rB   rC   rb  s  s    
z.HalideKernel.halide_argdefs.<locals>.<genexpr>rB   )rf   python_argdefsr  r'  r  r7   r-   r   r  r  r  rd  r   )rJ   r  rB  rJ  r   r   call_strrB   r  rC   halide_argdefs^  s   

zHalideKernel.halide_argdefsr   c                   s  g }   D ]U\}}t|trd}d}d}d}n4 fdd j|j D } fdd j|j D }t|t|ks<J t j|j }t|j	  d}|
t||j||||jd qtj }|jdkrwtjjg}	tjj}
d	t i}d}nT|jd
ksJ d|jdksJ dtjjg}	tjj}
tj|}d|	d vrdD ]\}}|j|kr|j|kr|	
d| |   nq|	
d d	|ji}td|j}|	
d |	
d tjj s|	
d tjj!r|	
d d j"v r|	
d t#|d$|	|
||dS )z)Compute metadata required by codecache.pyNlongc                      g | ]
}t  |jqS rB   )r/   r8  r3  r`  r[  rB   rC   rh        z3HalideKernel.halide_kernel_meta.<locals>.<listcomp>c                   r  rB   )r/   r8  rq  r`  r[  rB   rC   rh    r  rL  )shaperq  r   r  cpuparallelismcudazonly cpu/cuda supportedr   zonly default device supportedcuda_capability))      )r  r   )      )r  r   )r  r$   cuda_capability_user_contextstrict_float
no_runtime
no_assertsdebug64large_buffers-)target	schedulerscheduler_flagscuda_device)%r  r7   r,   r  r   re   r/   r  r.   r   r  r   r  r#   rs  get_current_device_or_throwtyper   rG  
cpu_targetscheduler_cpur   r,  
gpu_targetscheduler_cudar9   r  get_device_propertiesmajorminormulti_processor_countr=   assertsr  rX   r   rW  )rJ   argtypesrJ  rS  r  rq  r   r   current_devicer  schdulerr  r  
capabilityr  r  rB   r[  rC   halide_kernel_meta  s~   















zHalideKernel.halide_kernel_metac                   s`   j jrtd  }t }|jddd |    D ]F\}}t|t	r5|
|j d j d q|js<J |d|jv rCdnd	}t|j}t j|j }|
|j d
| d| d| d q|d |    D ]\}}|
|j d|j  qr j  D ]\}	}
|
|	 d
|
  q| j  fdd} jjD ]}t|trtj||}|
| q|
d |
d   D ]\}}t|t	rtjjj|jdd}|
|j d| d qɈ j|j }g }t|D ]k\}}  tjjj|j!dd|}|"d| d d|jvr`|
|j d| d z|
|j d| dt#|j$ d W n
 t%y@   Y nw z|
|j d| dt#|j! d W q t%y_   Y qw q|
|j dd&| d q|'d |d(  |j)r|jdt*+|j)d |j,d!|j)d|j-d"	dd |. S |jd#|j,d$dd |. S )%z3Called at the end to generate a final kernel stringinplace_buffersz
            import halide as hl
            from torch._inductor.runtime import halide_helpers
            from math import inf, nan

            @hl.generator(name="kernel")
            class Kernel:
        Tstripz = hl.InputScalar(r6   outzhl.OutputBufferzhl.InputBufferr  r   rV   z&
            def generate(g):
        z = g.c                   s2   t t jj| d }|jd usJ |t|S )Nr$   )r   rH  r9  r:  r  rJ  r1  )r  r-  r[  rB   rC   update_index  s   z1HalideKernel.codegen_kernel.<locals>.update_indexr8  zassert g.using_autoscheduler()r$   r  z.set_estimate(r  z.dim(z).set_min(0)z).set_stride(z).set_extent(z.set_estimates([r  r   zN
            if __name__ == "__main__":
                hl.main()
            z:
                else:
                    hl.load_plugin(z))
                    target = hl.Target(z=)
                    autoscheduler = hl.AutoschedulerParams(a  )
                    with hl.GeneratorContext(target, autoscheduler):
                        gen = Kernel()
                        pipeline = gen._build_pipeline()
                        # gen.compile_to_callable() does not run the autoscheduler
                        pipeline.apply_autoscheduler(target, autoscheduler)
                        kernel = pipeline.compile_to_callable([
                                gen._get_input_parameter(a.name)._to_argument()
                                for a in gen._get_arginfos()
                                if a.dir == hl.ArgInfoDirection.Input
                            ], target)
                zR
                  else:
                      with hl.GeneratorContext(hl.Target(zX)):
                          kernel = Kernel().compile_to_callable()
                  )/rf   r  rE   r  r(   splice	do_indentr  r7   r,   r  r   rX   r  r   r   re   r  aliasesr  r?  _linesr1  rH  rn  r  r#   rs  rt  r  rZ   r  _autoscheduler_workaroundsr3  r  r8   rq  r  rW  do_unindentrstripr  r   find_libautoscheduler  r  getvalue)rJ   r   metacoderJ  rS  argclsargtypendimr(  r)  r  r   hintrY  range_hintsr  dimrB   r[  rC   codegen_kernel  s   

&



 

zHalideKernel.codegen_kernelc                 C  s6   t |dkrtjjdkrtj jdkrtd| } | S )Nr$   Anderson2021r  r   )	re   r   rG  r  r#   rs  r  r  r=   )r   rY  rB   rB   rC   r  N  s
   
z'HalideKernel._autoscheduler_workaroundsc                 C  s^   t jj}dd |  D }t j }|jdkr$||jt j}|| |j	|||dd dS )zCodegen a call to this kernelc                 S  s    g | ]\}}|j d u r| qS r   r  )ra  r   rS  rB   rB   rC   rh  \  s     z,HalideKernel.call_kernel.<locals>.<listcomp>r  F)devicetritonN)
r#   rs  wrapper_coder  r  r  write_get_raw_streamr,  r  generate_kernel_call)rJ   r   r  wrapper	call_argsr  stream_namerB   rB   rC   call_kernelY  s   



zHalideKernel.call_kernelc                 C  s   dS r}  rB   )rJ   r4  rB   rB   rC   generate_asserth  s   zHalideKernel.generate_assertrZ   r3  lowerupperc                 C  s   d S r   rB   )rJ   rZ   r3  r  r  rB   rB   rC   check_boundsk  s   zHalideKernel.check_bounds)r  r  rF   rG   )r   r   rF   r1  )NN)r  r  )r,  rp  )r-  r1  r,  rp  r  r   r}  )r   r1  r,  rp  )r   r1  r   )
r   r1  r,  rp  r   r&   r<  r5   rF   rG   )
r   r   r   r   rC  r4   r   rD  rF   rD  )rh  ri  r[  rj  rk  rl  rF   rl  )rF   rH  )rF   r   )rZ   rp  r3  rp  r  r   r  r   )*rP   rQ   rR   r   	overridestexprr7  r~  rI   r  r  r  r  r  r(  r  r&  r  r  r+  rP  r2  r  rz  rB  r^  rP  ry  r   unknownr)  r6  r  r  r  r  r   r  r  r  r  rS   rB   rB   rL   rC   r    sJ   
 

 ,	
h


	
'
=
&T


$
Sy

r  c                   @  s&   e Zd ZeZed
ddZdd Zd	S )HalideSchedulingr  torch.devicerF   OrderedSet[BackendFeature]c                 C  s,   t tjtjtjg}tjjr|tj	 |S r   )
r   r%   TUPLE_REDUCTIONPREFER_STORE_LOOP_ORDERREDUCE_TO_SINGLE_ELEMENTr   rG  scan_kernelsr  SCAN)r   r  rB  rB   rB   rC   get_backend_featurest  s   z%HalideScheduling.get_backend_featuresc           
      C  s   t jj}||jv r|j| }|S d|  }||j|< |d t }|d| d |j	|dd |d t
||\}}| d| }	||| |	 td	r\t|d
| |S )z6Codegen kernel definition to go in output wrapper codehalide_kernel_zEfrom torch._inductor.runtime.hints import HalideMeta, HalideInputSpeczasync_compile.halide(z, '''Tr  z''')
kernel_metadatar8  )r#   rs  r  src_to_kernelnext_kernel_suffixadd_import_oncer(   r  r  r  r   define_kernelr  r   r   )
rJ   src_codenode_schedulerW   r  kernel_namecompile_wrapperoriginsdetailed_originsmetadata_commentrB   rB   rC   r    s.   




zHalideScheduling.define_kernelN)r  r  rF   r  )rP   rQ   rR   r  kernel_typerF  r  r  rB   rB   rB   rC   r  q  s
    r  )r
__future__r   dataclassesr  rL  loggingrl  collectionsr   mathr   typingr   r   r   r   r	   r
   r}   r9   torch._logging_prims_commonr   utils._ordered_setr   utils._sympy.functionsr   r   utils._sympy.symbolr   r   utils._sympy.value_rangesr   r8  r   r   	codecacher   r   metricsr   r   ops_handlerr   runtime.hintsr   r   utilsr   r   r   r    r!   virtualizedr"   r  r#   commonr%   r&   r'   r(   r)   r*   r+   r,   r-   cppr.   	cpp_utilsr/   simdr0   r1   r2   collections.abcr3   r4   r5   	getLoggerrP   r  rD   RuntimeErrorrE   rT   rv   r  pexprr   r   r   r   float64r  int16r   r;   uint8uint16uint32uint64r   r   r   r   _initialize_pointwise_overridesrH  	dataclassro  r  r  r  r  rB   rB   rB   rC   <module>   s    ,
}  
H'       ^