
    !&hK                     N   d dl mZmZ d dlmZmZmZmZ d dlm	Z	 d dl
mZ d dlZd dlmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlZd	efd
Z ej<                         defd       Z  ej<                         de!fd       Z" ej<                         de!fd       Z# ej<                         de!fd       Z$de!fdZ% ej<                         de!fd       Z& ej<                  d      d        Z'de!fdZ( ed       G d d             Z) G d de      Z*y)    )BaseBackend	GPUTarget)irpassesllvmnvidia)
PTXASError)	dataclassN)AnyDictTupleOptional)
ModuleType)Pathtargetc                 >    dt         t        t        t        f   fd}|S )Nreturnc                     | j                   j                  }|j                   j                  }||k(  sJ d       |dk(  ryy)Nz%lhs and rhs bitwidth must be the same   )   r       )r   r   r   )scalarprimitive_bitwidth)lhs_typerhs_typelhs_bitwidthrhs_bitwidths       u/var/www/pru.catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/triton/backends/nvidia/compiler.pycheck_dot_compatibilityz-min_dot_size.<locals>.check_dot_compatibility   sB    9999|+T-TT+1    )r   int)r   r   s     r   min_dot_sizer"      s!     uS#s]7K   #"r    binaryc                    | t        j                  d      z  } t        j                  j	                  d| j                          dd      t        j                  j                  t        j                  j                  t              d|       g}|D ]  }t        j                  j                  |      s#t        j                  j                  |      sCt        j                  |dgt        j                        }|mt        j                   d|j#                  d	      t        j$                  
      }|||j'                  d      fc S  t)        d|        )NEXETRITON__PATH bin	--version)stderrz.*release (\d+\.\d+).*utf-8flags   zCannot find )	sysconfigget_config_varosenvirongetupperpathjoindirname__file__existsisfile
subprocesscheck_outputSTDOUTresearchdecode	MULTILINEgroupRuntimeError)r#   pathsr6   resultversions        r   _path_to_binaryrH   !   s   
i&&u--F


 06;
RWW__X.v>E
  277>>$BGGNN4$8,,dK-@IZIZ[F!))$=v}}W?U]_]i]ij&q!1112 fX.
//r    archc                 *    | dk\  rdnd}t        |      S )Nd   zptxas-blackwellptxas)rH   )rI   names     r   	get_ptxasrN   3   s     $D4  r    c                     t         j                  j                  d      }||S t        j                  t        |       d   dg      j                  d      }|S )NTRITON_MOCK_PTX_VERSIONr   r*   r,   )r2   r3   r4   r<   r=   rN   rA   )rI   mock_verrG   s      r   get_ptxas_versionrR   9   sO    zz~~78H%%yq'9;&GHOOPWXGNr    r   c                     t        | t              sJ t        t        | j	                  d            \  }}|dk(  r|dk  rd|z   S d|z   dz
  S |dk(  rd|z   S |dk(  rd	|z   S t        d
| z         )zK
    Get the highest PTX version supported by the current CUDA driver.
    .      P   r/      F   
   ?   z?Triton only support CUDA 10.0 or higher, but got CUDA version: )
isinstancestrmapr!   splitrD   )cuda_versionmajorminors      r   ptx_get_versionrc   B   s    
 lC(((sL..s34LE5{19::>!{Ez{Ez
X[gg
hhr    c                 T    | j                   }|t        |      \  }}t        |      }|S N)ptx_versionrN   rc   )optionsrI   rf   _r`   s        r   get_ptx_version_from_optionsri   U   s0    %%K#D/<%l3r    c                 @    t        | |      }t        d|      }d| }|S )NV   z+ptx)ri   min)rg   rI   rf   llvm_ptx_versionfeaturess        r   get_featuresro   ]   s0    .w=K 2{+&'(HOr    c                     t        | d      5 }t        j                  |j                               j	                         cd d d        S # 1 sw Y   y xY w)Nrb)openhashlibsha256read	hexdigest)r6   fs     r   	file_hashrx   k   s>    	dD	 4Q~~affh'1134 4 4s   1AA
capabilityc                 "    | dk\  rdnd}d|  | S )NZ   ar(   sm_ )ry   suffixs     r   sm_arch_from_capabilityr   q   s!    "$S"FVH%%r    T)frozenc                   f   e Zd ZU dZeed<   dZeed<   dZeed<   dZeed<   dZ	eed	<   dZ
eed
<   dZeed<   dZee   ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZee   ed<   dZee   ed<   dZeed<   dZee   ed<   dZeed<   dZeed<   dZeed<   d Zeed!<   dZeed"<   dZ eed#<   d$ Z!d% Z"y)&CUDAOptions   	num_warpsr/   num_ctas   
num_stagesr   num_buffers_warp_specnum_consumer_groupsreg_dec_producerreg_inc_consumerNmaxnreg)r/   r/   r/   cluster_dimsrf   Tenable_fp_fusionFlaunch_cooperative_grid)fp8e5fp8e4b15supported_fp8_dtypesr~   deprecated_fp8_dtypestf32default_dot_input_precision)r   tf32x3ieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsdebugcudabackend_namesanitize_overflowrI   c                    t        t              j                  dz  }| j                  i nt	        | j                        }|j                  dd       s%t        j                  dt        |dz              |d<   t        j                  | dt        |j                                      | j                  dkD  r| j                  | j                  dz
  z  dk(  sJ d       y )	Nlib	libdeviceTRITON_LIBDEVICE_PATHzlibdevice.10.bcr   r   r/   znum_warps must be a power of 2)r   r9   parentr   dictr4   r2   getenvr]   object__setattr__tupleitemsr   )selfdefault_libdirr   s      r   __post_init__zCUDAOptions.__post_init__   s    h..6 ,,4b$t?O?O:P{D1')yy1H#n_pNpJq'rK$4k6G6G6I0JK~~!t~~!9K'LQR&R 	0/	0R&Rr    c           	      ^   t        | j                        }t        d t        |d         D              |d<   dj	                  t        |j                               D cg c]  \  }}| d|  c}}      }t        j                  |j                  d            j                         S c c}}w )Nc              3   <   K   | ]  \  }}|t        |      f  y wre   )rx   ).0kvs      r   	<genexpr>z#CUDAOptions.hash.<locals>.<genexpr>   s     (htq!!Yq\):(hs   r   rh   -r,   )
r   __dict__r   sortedr7   r   rs   rt   encoderv   )r   	hash_dictrM   valkeys        r   hashzCUDAOptions.hash   s    '	#((hviXeNfGg(h#h	- hh	@Q9RSID#4&#ST~~cjj12<<>> Ts   B)
)#__name__
__module____qualname__r   r!   __annotations__r   r   r   r   r   r   r   r   r   r   rf   r   boolr   r   r   r]   r   r   r   r   r   r   r   r   r   rI   r   r   r~   r    r   r   r   w   s   IsHcJ!"3"  cc "GXc]!#L%#K!d!$)T)'<%*<(*5:*'--/I %*I*.!4.KE4L#"t"D#0?r    r   c                        e Zd Zedefd       Zd Zdeddf fdZdefdZ	d Z
d	 Zdeeef   fd
Zd Zed        Zed        Zd Zd Zd Zd Z ej0                         d        Z xZS )CUDABackendr   c                      | j                   dk(  S )Nr   )backend)r   s    r   supports_targetzCUDABackend.supports_target   s    ~~''r    c                     d}t        j                  ||      }|st        d|       t        |j	                  d            S )Nz	^sm(\d+)$z(TRITON_OVERRIDE_ARCH must have the form r/   )r?   	fullmatch
ValueErrorr!   rC   )r   rI   patternmatchs       r   _parse_archzCUDABackend._parse_arch   s@    Wd+GyQRR5;;q>""r    r   Nc                 2    t         |   |       d| _        y )Ncubin)super__init__
binary_ext)r   r   	__class__s     r   r   zCUDABackend.__init__   s     !r    c                 V   dt        j                  dd| j                  j                         i}|j	                  t
        j                  j                         D ci c]  }||v s||   |||    c}       t        | j                  |d               }d|vrFt        t
        j                        }|dk\  r|j                  d       t        t        |            |d<   d|vr
|dk\  rd	|d<   d
|vrt        j                  dd      dk(  |d
<   |dk(  rdnd|d<   t        di |S c c}w )NrI   TRITON_OVERRIDE_ARCHsmr   Y   fp8e4nvr   r{   )r   r   TRITON_DEFAULT_FP_FUSION1i   @r   r   r~   )r2   r   r   rI   updater   __dataclass_fields__keysr!   r   setr   addr   r   )r   optsargsr   ry   r   s         r   parse_optionszCUDABackend.parse_options   s3   		"8Bt{{?O?O>P:QRS)I)I)N)N)PuATUY]T]aefgahatQQZuv))$v,78
!-#&{'G'G#H R$((3+08L1M+ND'("$.R0>,-T)')yy1KS'QUX'XD#$9Cr9Iq,-"T""% vs   	D&%D&+D&c                     |j                   |j                  |j                  |j                  d   |j                  d   |j                  d   fS )Nr   r/      )r   r   sharedr   )r   metadatas     r   pack_metadatazCUDABackend.pack_metadata   sO    OO!!!$!!!$!!!$
 	
r    c                     dd l mc mc m} t	        | j                  |j                              }|dk\  r|j                  n|j                  t        | j                        d}|S )Nr   rW   )convert_custom_typesr"   )triton.language.extra.cudalanguageextrar   r!   r   rI   convert_custom_float8_sm80convert_custom_float8_sm70r"   r   )r   rg   r   ry   codegen_fnss        r   get_codegen_implementationz&CUDABackend.get_codegen_implementation   sV    11))',,78
 0:R/?D++TEdEd%

 r    c                     ddl m} d|iS )Nr   )r   ztriton.language.extra.libdevice)r   r   )r   r   s     r   get_module_mapzCUDABackend.get_module_map   s    819==r    c                 .    t        j                  |       y re   )r   load_dialects)r   ctxs     r   r   zCUDABackend.load_dialects   s    S!r    c                 v   t        j                  | j                        }|j                          t        j
                  j                  |       t        j                  j                  |       t        j
                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j
                  j                  |       t        j
                  j                  |       t        j                  j                  |       |j                  |        | S re   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointeradd_canonicalizeradd_combineadd_reorder_broadcastadd_cseadd_symbol_dceadd_loop_unrollrun)modr   optpms       r   	make_ttirzCUDABackend.make_ttir   s    __S[[)
!!"%..r2''+#))"-b!$$R(##B'
s
r    c                 f   t        j                         }|j                  <|j                  d   |_        |j                  d   |_        |j                  d   |_        t        j                  | j                        }|j                         }t        j                  j                  |d| |j                  d|j                         t        j                  j!                  |       |dz  dk\  rt        j                  j#                  |       t         j                  j$                  j'                  ||       t        j                  j)                  |       t        j                  j+                  |       t        j                  j-                  |       t        j                  j)                  |       t        j                  j/                  ||dk\         t        j0                  j3                  |       |dz  d	v rt        j                  j5                  |       t        j0                  j7                  |       t        j0                  j9                  |       t        j                  j;                  |       t        j0                  j7                  |       t        j                  j=                  |       t        j                  j?                  ||j@                         t        j                  jC                  ||j@                         t        j                  jE                  ||j@                         t        j                  jG                  ||jH                  |j@                  |jJ                  |jL                         t        j                  jO                  ||jP                  |       t        j                  jS                  ||j@                         t        j                  jU                  ||j@                         nS|dz  dk\  r+t        j                  j5                  |       t        j0                  j7                  |       t        j0                  j9                  |       t        j                  j;                  |       t        j                  j?                  ||j@                         t        j                  jC                  ||j@                         t        j                  jE                  ||j@                         t        j                  jG                  ||jH                  |j@                  |jJ                  |jL                         t        j                  jO                  ||jP                  |       t        j                  j=                  |       t         j                  j$                  jW                  |       t         j                  j$                  jY                  |       t        j                  jU                  ||j@                         t        j0                  j7                  |       nt        j0                  j9                  |       t        j                  j[                  |       t        j                  j/                  ||dk\         t        j                  j]                  |       t        j                  j)                  |       t        j                  j_                  |       t        j                  ja                  |       t        j0                  j3                  |       t        j0                  jc                  |       |dz  d
k\  rRt         j                  j$                  je                  |       t         j                  j$                  jg                  |       t        j0                  j7                  |       |dz  d
k\  r*t        j                  ji                  ||j@                         |jk                  |        |j                  |j                  |j
                  f|d<   | S )Nr   r/   r   zcuda:r   rZ   r   rW   )r   	   r  r   )6r   ClusterInfor   clusterDimXclusterDimYclusterDimZr   r   r   r   r   r   add_convert_to_ttgpuirr   r   ttgpuiradd_coalesceadd_f32_dot_tc	ttnvgpuiradd_plan_ctaadd_remove_layout_conversionsadd_optimize_thread_localityadd_accelerate_matmuladd_optimize_dot_operandsr   r   add_fuse_nested_loopsr   add_licmadd_optimize_accumulator_init add_combine_tensor_select_and_ifadd_ws_task_partitionr   add_taskid_propagateadd_ws_data_partitionadd_ws_code_partitionr   r   r   add_pipeliner   add_ping_pong_syncadd_ws_loweringadd_promote_lhs_to_tmemadd_keep_acc_in_tmemadd_prefetchadd_coalesce_async_copyadd_reduce_data_duplicationadd_reorder_instructionsr   add_fence_insertionadd_tma_loweringadd_ws_canonicalizationr  )r  r   r  ry   cluster_infor  dump_enableds          r   
make_ttgirzCUDABackend.make_ttgir   s_   ))+''*'7'7':L$'*'7'7':L$'*'7'7':L$__S[[)(**2zl/CS]]TVX[XdXde##B'q NN))"-,,R>44R833B7,,R044R800Z25EFb!v%NN004MM++B/MM""2&NN88<MM++B/NN;;B?NN00S5L5LMNN//C4K4KLNN00S5L5LMNN00S5N5NPSPgPg141E1EsG[G[]NN''CNNLINN--b#2I2IJNN**2s/F/FG2#NN004MM++B/MM""2&NN88<NN00S5L5LMNN//C4K4KLNN00S5L5LMNN00S5N5NPSPgPg141E1EsG[G[]NN''CNNLINN;;B?MM##;;B?MM##88<NN**2s/F/FGMM++B/MM""2&##B'00Z25EF..r244R82226//3b!$$R(q MM##77;MM##44R8''+q NN222s7N7NO
s$0$<$<l>V>VXdXpXp#q 
r    c                 ^	   t        || j                  j                        }|}t        j                  |j
                        }|j                          t        j                  j                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j                  j!                  |       t        j                  j                  j#                  |       t        j                  j%                  |       t        j                  j                  j'                  |||       t        j(                  j+                  |       t        j(                  j-                  |       t        j                  j                  j/                  |       t        j                  j                  j1                  |       t        j(                  j+                  |       t        j(                  j-                  |       t        j(                  j3                  |       t4        j6                  j9                  dd      dk(  rt        j:                  j=                  |       |j?                  |       tA        jB                          tA        j
                         }t4        j6                  j9                  dd      dk(  rtE        d      tA        jF                  ||      }	tI        |      }
tK        || j                  j                        }d}tA        jL                  |	||
|       t        jN                  |	       |jP                  R|	jS                         D ]?  }|jU                         r|jW                         s%|jY                  |jP                         A |jZ                  r4|jZ                  D cg c]  \  }}|	 }}}tA        j\                  |	|       tA        j^                  |	t@        j`                         |jc                  d      }|||d<   |jc                  d	      |d
<   |jc                  d      |d<   |jc                  d      |d<   |jc                  d      |d<   te        |	      }~	~|S c c}}w )NTRITON_DISABLE_LINE_INFO0TRITON_ENABLE_ASANr   zYAddress Sanitizer Error: Address sanitizer is currently only supported on the AMD backendnvptx64-nvidia-cudazttg.total-num-warpsr   z
ttg.sharedr   zttg.tensor_memory_size	tmem_sizezttg.global_scratch_memory_sizeglobal_scratch_sizez#ttg.global_scratch_memory_alignmentglobal_scratch_align)3ri   r   rI   r   r   r   r   r   r   r  add_lower_mmar  r  add_allocate_warp_groupsconvertadd_scf_to_cfadd_allocate_shared_memoryadd_allocate_tensor_memory"add_allocate_global_scratch_memoryadd_to_llvmirr   r   r   add_nvgpu_to_llvmadd_warp_specialize_to_llvmr   r2   r3   r4   llvmiradd_di_scoper  r   init_targetsrD   	to_moduler   ro   attach_datalayoutset_nvvm_reflect_ftzr   get_functionsis_declarationis_external_linkageset_nvvm_maxnregr   link_extern_libsoptimize_moduleOPTIMIZE_O3get_int_attrr]   )r   srcr   rg   ry   rf   r  r  r   llvm_modprocrn   tripler   rM   r6   rE   total_num_warpsrets                      r   	make_llirzCUDABackend.make_llir<  sg   27DKK<L<LM__S[[)
--b177;//3$$R(11"5::2>99"=++B
KH''+b!11"5;;B?''+b!$$R(::>>4c:cAMM&&r*
s,,.::>>.4;km m>>#w/&z2)9)9:&xx@##H- ??&++- 8'')a.C.C.E&&w78 .5.A.ABltTTBEB!!(E2Xt'7'78 **+@A&$3H[! --l; # 0 01I J*-*:*:;[*\&'+.+;+;<a+b'((m
# Cs   )R)c           	         t        || j                  j                        }d}t        |      }t	        || j                  j                        }t        j                  ||||dg|j                  d      }	t        j                  d|	      }
t        |
      dk(  sJ |
d   |d<   |dz   d	|dz   }t        j                  d
d| |	t        j                        }	t        j                  dd| |	t        j                        }	t        j                  dd|	      }	t        j                  j                  dd      dk(  rt!        d       t!        |	       |	S )Nr2  znvptx-short-ptrFz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r/   r   rM   rZ   rT   z\.version \d+\.\d+z	.version r-   z\.target sm_\d+z.target sm_z,\s*debug|debug,\s*r(   NVPTX_ENABLE_DUMPr0  r   z // -----// NVPTX Dump //----- //)ri   r   rI   r   ro   r   translate_to_asmr   r?   findalllensubrB   r2   r3   r4   print)r   rN  r   r  ry   rf   rQ  rP  rn   rS  namess              r   make_ptxzCUDABackend.make_ptx}  s4   238H8HI&&z2T[[%5%56##CxBSATVYVjVjlqr

FL5zQ 8$b);r>*:;ff*i},EsRTR^R^_ff';zl)CSPRP\P\]ff+R5::>>-s3s:45#J
r    c                    t        | j                  j                        \  }}t        j                  ddd      5 }t        j                  ddd      5 }|j                  |       |j                          |j                  dz   }	t        j                  j                  dd	      d
k(  rddgndg}
|j                  rg ndg}t        |      }t        j                  j                  dd	      d
k(  rdd	gng }|g|
|d|d| |j                  d|	}	 t        j                  |dd|       t        j                  j!                  |j                        rt        j"                  |j                         t        j                  j!                  |j                        rt        j"                  |j                         t'        |	d      5 }|j)                         }d d d        t        j                  j!                  |	      rt        j"                  |	       d d d        d d d        S # t        j$                  $ r}t'        |j                        5 }|j)                         }d d d        n# 1 sw Y   nxY wt        j                  j!                  |j                        rt        j"                  |j                         |j*                  dk(  rd}n2|j*                  dt,        j.                  z   k(  rd}nd|j*                   }t1        | d ddj3                  |       d      d }~ww xY w# 1 sw Y   UxY w# 1 sw Y   &xY w# 1 sw Y   S xY w)NFwz.ptx)deletemoder   rz.logz.or/  r0  r   z	-lineinfoz-suppress-debug-infoz--fmad=falseDISABLE_PTXAS_OPTz--opt-levelz-vz--gpu-name=z-oT)check	close_fdsr+      z!Internal Triton PTX codegen error   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command:  
rq   )rN   r   rI   tempfileNamedTemporaryFilewriteflushrM   r2   r3   r4   r   r   r<   r  r6   r:   removeCalledProcessErrorrr   ru   
returncodesignalSIGSEGVr	   r7   )r   rN  r   r  ry   rL   rh   fsrcflogfbin	line_infofmadrI   	opt_level	ptxas_cmdelog_filelogerrorrw   r   s                        r   
make_cubinzCUDABackend.make_cubin  s   T[[--.q((CO &	 SW''u3vN&	 RVJJsOJJL99t#DACPjPSBUX[B\&<=bman --2N3CD*:6D02

?RTW0X\_0_,egIqqTq4q)q{SWRXEYq[_[d[dqfjqlpqILydS77>>$)),IIdii(77>>$)),IIdii($ dD! !Q!ww~~d#		$M&	  &	 N + 00 L$))_ *"--/C* * *77>>$)),IIdii(<<3&?E\\S6>>%994E=all^LE E7 +558E :33688I3F2Gr"K L LL"! !G&	  &	  &	 N sz   MC L;B)H)>L;
L.<L;M)L+<L&I+	"	L&+I40B6L&&L++L;.L83L;;M	 MMc                       j                  j                         fd|d<    fd|d<    fd|d<    fd|d<    fd	|d
<   y )Nc                 *    j                  | |      S re   )r  rN  r   rg   r   s     r   <lambda>z(CUDABackend.add_stages.<locals>.<lambda>  s    t~~c8W/U r    r   c                 ,    j                  | |      S re   )r-  rN  r   ry   rg   r   s     r   r  z(CUDABackend.add_stages.<locals>.<lambda>  s    XwXb0c r    ttgirc                 ,    j                  | |      S re   )rT  r  s     r   r  z(CUDABackend.add_stages.<locals>.<lambda>  s    t~~c8WV`/a r    llirc                 T    j                  | |j                  j                        S re   )r]  r   rI   r  s     r   r  z(CUDABackend.add_stages.<locals>.<lambda>  s#    dmmC7TXT_T_TdTd.e r    ptxc                 T    j                  | |j                  j                        S re   )r~  r   rI   r  s     r   r  z(CUDABackend.add_stages.<locals>.<lambda>  s#    XwX\XcXcXhXh0i r    r   )r   rI   )r   stagesrg   ry   s   ` `@r   
add_stageszCUDABackend.add_stages  sD    %%gll3
Uvcwaveuiwr    c                 v    t        | j                  j                        }| d| j                  j                   S )Nr   )rR   r   rI   )r   rG   s     r   r   zCUDABackend.hash  s2    #DKK$4$45!DKK,,-..r    )r   r   r   staticmethodr   r   r   r   r   r   r   r   r   r]   r   r   r   r  r-  rT  r]  r~  r  	functools	lru_cacher   __classcell__)r   s   @r   r   r      s    (	 ( (#"y "T "#S #,
>S*_ 5 >"   F FP?B,)Vj Y/ /r    r   )+triton.backends.compilerr   r   triton._C.libtritonr   r   r   r   triton.runtime.errorsr	   dataclassesr
   r  typingr   r   r   r   typesr   rs   r?   rj  rq  r2   r<   pathlibr   r0   r"   r  r]   rH   r!   rN   rR   rc   ri   ro   rx   r   r   r   r~   r    r   <module>r     sk   ; 8 8 , !  - -   	   	   # # 0C 0 0" !C ! !
 C   iS i i$  
 
 
 T4 4
& & $'? '? '?Tg/+ g/r    