Files
triton/master/searchindex.js
2022-09-12 00:51:39 +00:00

1 line
38 KiB
JavaScript

Search.setIndex({docnames:["getting-started/installation","getting-started/tutorials/01-vector-add","getting-started/tutorials/02-fused-softmax","getting-started/tutorials/03-matrix-multiplication","getting-started/tutorials/04-low-memory-dropout","getting-started/tutorials/05-layer-norm","getting-started/tutorials/06-fused-attention","getting-started/tutorials/07-libdevice-function","getting-started/tutorials/index","getting-started/tutorials/sg_execution_times","index","programming-guide/chapter-1/introduction","programming-guide/chapter-2/related-work","python-api/generated/triton.Config","python-api/generated/triton.autotune","python-api/generated/triton.heuristics","python-api/generated/triton.jit","python-api/generated/triton.language.arange","python-api/generated/triton.language.atomic_add","python-api/generated/triton.language.atomic_and","python-api/generated/triton.language.atomic_cas","python-api/generated/triton.language.atomic_max","python-api/generated/triton.language.atomic_min","python-api/generated/triton.language.atomic_or","python-api/generated/triton.language.atomic_xchg","python-api/generated/triton.language.atomic_xor","python-api/generated/triton.language.broadcast_to","python-api/generated/triton.language.cos","python-api/generated/triton.language.dot","python-api/generated/triton.language.exp","python-api/generated/triton.language.load","python-api/generated/triton.language.log","python-api/generated/triton.language.max","python-api/generated/triton.language.maximum","python-api/generated/triton.language.min","python-api/generated/triton.language.minimum","python-api/generated/triton.language.multiple_of","python-api/generated/triton.language.num_programs","python-api/generated/triton.language.program_id","python-api/generated/triton.language.rand","python-api/generated/triton.language.randint","python-api/generated/triton.language.randint4x","python-api/generated/triton.language.randn","python-api/generated/triton.language.ravel","python-api/generated/triton.language.reshape","python-api/generated/triton.language.sigmoid","python-api/generated/triton.language.sin","python-api/generated/triton.language.softmax","python-api/generated/triton.language.sqrt","python-api/generated/triton.language.store","python-api/generated/triton.language.sum","python-api/generated/triton.language.where","python-api/generated/triton.language.zeros","python-api/generated/triton.testing.Benchmark","python-api/generated/triton.testing.do_bench","python-api/generated/triton.testing.perf_report","python-api/triton","python-api/triton.language","python-api/triton.testing"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":4,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":3,"sphinx.domains.rst":2,"sphinx.domains.std":2,"sphinx.ext.intersphinx":1,sphinx:56},filenames:["getting-started/installation.rst","getting-started/tutorials/01-vector-add.rst","getting-started/tutorials/02-fused-softmax.rst","getting-started/tutorials/03-matrix-multiplication.rst","getting-started/tutorials/04-low-memory-dropout.rst","getting-started/tutorials/05-layer-norm.rst","getting-started/tutorials/06-fused-attention.rst","getting-started/tutorials/07-libdevice-function.rst","getting-started/tutorials/index.rst","getting-started/tutorials/sg_execution_times.rst","index.rst","programming-guide/chapter-1/introduction.rst","programming-guide/chapter-2/related-work.rst","python-api/generated/triton.Config.rst","python-api/generated/triton.autotune.rst","python-api/generated/triton.heuristics.rst","python-api/generated/triton.jit.rst","python-api/generated/triton.language.arange.rst","python-api/generated/triton.language.atomic_add.rst","python-api/generated/triton.language.atomic_and.rst","python-api/generated/triton.language.atomic_cas.rst","python-api/generated/triton.language.atomic_max.rst","python-api/generated/triton.language.atomic_min.rst","python-api/generated/triton.language.atomic_or.rst","python-api/generated/triton.language.atomic_xchg.rst","python-api/generated/triton.language.atomic_xor.rst","python-api/generated/triton.language.broadcast_to.rst","python-api/generated/triton.language.cos.rst","python-api/generated/triton.language.dot.rst","python-api/generated/triton.language.exp.rst","python-api/generated/triton.language.load.rst","python-api/generated/triton.language.log.rst","python-api/generated/triton.language.max.rst","python-api/generated/triton.language.maximum.rst","python-api/generated/triton.language.min.rst","python-api/generated/triton.language.minimum.rst","python-api/generated/triton.language.multiple_of.rst","python-api/generated/triton.language.num_programs.rst","python-api/generated/triton.language.program_id.rst","python-api/generated/triton.language.rand.rst","python-api/generated/triton.language.randint.rst","python-api/generated/triton.language.randint4x.rst","python-api/generated/triton.language.randn.rst","python-api/generated/triton.language.ravel.rst","python-api/generated/triton.language.reshape.rst","python-api/generated/triton.language.sigmoid.rst","python-api/generated/triton.language.sin.rst","python-api/generated/triton.language.softmax.rst","python-api/generated/triton.language.sqrt.rst","python-api/generated/triton.language.store.rst","python-api/generated/triton.language.sum.rst","python-api/generated/triton.language.where.rst","python-api/generated/triton.language.zeros.rst","python-api/generated/triton.testing.Benchmark.rst","python-api/generated/triton.testing.do_bench.rst","python-api/generated/triton.testing.perf_report.rst","python-api/triton.rst","python-api/triton.language.rst","python-api/triton.testing.rst"],objects:{"triton.Config":{__init__:[13,1,1,""]},"triton.language":{arange:[17,2,1,""],atomic_add:[18,2,1,""],atomic_and:[19,2,1,""],atomic_cas:[20,2,1,""],atomic_max:[21,2,1,""],atomic_min:[22,2,1,""],atomic_or:[23,2,1,""],atomic_xchg:[24,2,1,""],atomic_xor:[25,2,1,""],broadcast_to:[26,2,1,""],cos:[27,2,1,""],dot:[28,2,1,""],exp:[29,2,1,""],load:[30,2,1,""],log:[31,2,1,""],max:[32,2,1,""],maximum:[33,2,1,""],min:[34,2,1,""],minimum:[35,2,1,""],multiple_of:[36,2,1,""],num_programs:[37,2,1,""],program_id:[38,2,1,""],rand:[39,2,1,""],randint4x:[41,2,1,""],randint:[40,2,1,""],randn:[42,2,1,""],ravel:[43,2,1,""],reshape:[44,2,1,""],sigmoid:[45,2,1,""],sin:[46,2,1,""],softmax:[47,2,1,""],sqrt:[48,2,1,""],store:[49,2,1,""],sum:[50,2,1,""],where:[51,2,1,""],zeros:[52,2,1,""]},"triton.testing":{Benchmark:[53,0,1,""],do_bench:[54,2,1,""],perf_report:[55,2,1,""]},"triton.testing.Benchmark":{__init__:[53,1,1,""]},triton:{Config:[13,0,1,""],autotune:[14,2,1,""],heuristics:[15,2,1,""],jit:[16,2,1,""]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python method"],"2":["py","function","Python function"]},objtypes:{"0":"py:class","1":"py:method","2":"py:function"},terms:{"0":[1,2,3,4,5,6,7,9,11,12,37,38,39,42,52,54],"00":9,"0000":3,"000000":2,"000001":1,"000002":2,"004273":1,"005597":5,"007961":2,"01":[1,3,9],"012395":2,"02":[2,9],"023256":5,"0249":7,"025776":3,"028308":3,"03":[3,9],"04":[4,9],"0424":7,"05":[5,9],"05682v2":6,"06":[6,9],"0625":3,"064941":3,"07":[7,9],"072":[6,9],"08199":4,"08452":4,"084721":1,"092307":5,"0938":3,"096718":2,"097543":2,"097818":3,"098578":5,"0f":12,"0s":4,"1":[1,2,3,4,5,6,10,12,15,37,38,39,42],"10":[1,3,4,5,6,7,39,40,41,42],"100":[2,6,54],"1024":[1,3,4,5,7,14],"10240":5,"1045":3,"1048576":1,"106434":4,"10752":5,"11":[0,1,3,5],"111115":5,"11264":5,"1151":5,"1152":3,"115360":3,"11776":5,"118":[1,9],"118889":3,"12":[1,3,5],"120002":3,"12160":2,"12288":[2,5],"123":4,"12416":2,"12544":2,"12672":2,"126988":5,"127":1,"128":[1,2,3,5,6,14],"1280":3,"12800":5,"13":[1,3,5],"131072":1,"1328":3,"13312":5,"133347":2,"134217728":1,"134567":5,"13686":4,"13824":5,"138541":3,"14":[1,3,5],"140799":3,"1408":3,"14135v2":6,"142849":5,"142862":2,"14336":5,"144":[5,9],"147202":3,"14848":5,"149375":2,"149397":4,"15":[1,3,5,9],"150050":3,"153":2,"1536":[3,5],"15360":5,"153868":5,"154":2,"15872":5,"16":[2,3,5,6,12,52],"160":2,"162":2,"16384":1,"1664":3,"167004":2,"16777216":1,"17":[3,5],"171410":5,"173427":5,"177767":5,"17879":4,"1792":3,"18":[3,5,9],"180982":5,"181847":3,"1823":2,"185964":5,"19":[1,3,5],"190":2,"190482":1,"192":1,"1920":3,"192434":5,"198":2,"198054":5,"1982":12,"1983":11,"1984":12,"1989":12,"199":2,"1991":[11,12],"1999":12,"1d":[1,2,3],"1e":[1,2,3,5],"1s":4,"2":[1,2,3,4,5,6,7,10,12,13,15,37,38,54],"20":[3,5,6,54],"200000":1,"200001":3,"2004":12,"2006":12,"2011":4,"2012":12,"2013":11,"2014":[4,11],"2016":[11,12],"2017":11,"2018":[11,12],"2019":12,"2021":[11,12],"2048":[2,3,5,6],"206879":2,"207081":5,"2097152":1,"21":[3,5,9],"2112":6,"212868":4,"2141":1,"214186":4,"214870":5,"216187":2,"2176":3,"219":1,"22":[3,5],"220":3,"2205":6,"225175":5,"23":[3,5],"2304":3,"237267":5,"24":[3,5],"242181":3,"2432":3,"245":3,"249":[3,9],"25":[3,5,6,54],"251457":5,"254540":5,"256":[1,2,3,5,13],"2560":[3,5],"257735":3,"26":[3,5],"260869":3,"262144":1,"2656":3,"2688":3,"269882":5,"27":[3,5],"273":[7,9],"276800":3,"277":5,"277102":3,"28":[1,3,5],"280":[4,9],"2812":3,"2816":3,"2891":3,"29":[3,5],"293429":4,"2944":3,"297068":5,"299883":3,"2d":[3,5,28],"2m":2,"2mn":2,"3":[0,1,2,3,4,5,6,12],"30":3,"304904":5,"305746":3,"305878":3,"307030":3,"3072":[3,5],"3076":1,"308771":5,"31":[1,2,3,9],"3125":3,"313806":5,"314362":3,"315309":5,"32":[3,5,6,13],"3200":3,"323":5,"32768":1,"3281":3,"33":3,"3328":3,"333321":1,"334":5,"33554432":1,"337026":5,"337844":3,"34":3,"341":1,"34172":4,"3438":3,"3456":3,"3477":3,"347810":5,"349836":3,"35":3,"350922":5,"3516":3,"355034":3,"3555":3,"357796":5,"3584":[3,5],"359066":2,"36":[3,5,9],"360017":2,"360174":3,"360920":5,"362445":1,"365":5,"367358":5,"368435":5,"369452":5,"371":5,"3712":3,"3713":1,"371721":4,"372618":3,"372800":3,"373":5,"373605":3,"374":5,"375":5,"376":5,"377":5,"378":5,"379":5,"38":1,"380":5,"380953":3,"381":5,"382":5,"383":5,"384":[1,2,3,5],"3840":3,"384000":3,"384185791015625e":7,"387":5,"387087":5,"389":5,"389355":5,"389441":3,"39":3,"3906":3,"392":5,"393":5,"393507":3,"396":5,"396572":3,"3968":3,"397":5,"397770":3,"398206":3,"3984":3,"3986":4,"3d":[37,38],"3mn":2,"4":[1,2,3,5,6,12,13,14,40],"40":3,"400":5,"400001":1,"400016":[1,2],"402":5,"4023":3,"403344":4,"403347":4,"403381":5,"405":5,"406":[2,5],"4062":3,"407":5,"408":5,"408716":4,"409":5,"4096":[1,2,3,5,6],"410":5,"4105":7,"411":5,"412":2,"413":5,"414":5,"415":2,"41576":4,"4194304":1,"420828":5,"42142":4,"426":5,"428372":4,"428568":1,"428801":3,"429770":[1,2],"430545":3,"431969":4,"433489":5,"435930":3,"445676":5,"446623":3,"447":5,"448":5,"448255":1,"4492":3,"45":3,"4531":3,"454":5,"455":5,"459443":5,"46":[1,9],"460287":3,"4608":5,"4609":3,"461":5,"467852":3,"4688":3,"47":3,"470582":5,"471":5,"472":1,"48":6,"481":5,"481028":5,"482":5,"484358":3,"484863":5,"486200":5,"492442":3,"494":5,"4940":1,"496":5,"498981":2,"4m":2,"4x":2,"5":[1,3,4,5,6,12,54],"500":5,"5000":3,"500614":3,"502740":5,"504":5,"507077":3,"51":3,"511":5,"511628":2,"512":[2,3,4,5],"5120":5,"512412":3,"512459":3,"514":5,"518":5,"52":3,"520":5,"524288":1,"526831":3,"53":3,"5312":3,"533":5,"5351":7,"54":3,"540970":3,"541":4,"542675":5,"5430":7,"544253":3,"546":2,"547":5,"548254":3,"552332":3,"559798":5,"5632":5,"563555":3,"564":5,"566038":2,"568431":4,"57":3,"578556":5,"585":[2,5],"5859":3,"586858":4,"587":5,"587162":5,"587863":5,"5898":3,"59":3,"593522":3,"599987":5,"599991":5,"5mn":2,"6":[0,1,3,5],"600000":1,"600004":2,"603776":3,"603966":2,"604578":3,"606":[2,5],"6094":3,"609605":5,"614":[1,2],"6144":5,"615390":1,"62":3,"626943":3,"627":5,"63":3,"630":5,"631610":5,"633":5,"634072":5,"64":[1,3,6],"640":[2,3],"641231":3,"648067":3,"64kb":5,"655":2,"65536":[1,5],"656574":1,"661740":2,"664":2,"665439":3,"6656":5,"666652":5,"668":5,"669909":5,"67":3,"670":5,"67086":4,"67108864":1,"6724":1,"678":5,"68":3,"680630":5,"688":5,"69":3,"690":5,"690139":3,"694":5,"694907":5,"695045":5,"6953":3,"698":5,"699062":5,"7":[0,1,3,5,12],"700":5,"702":5,"7031":3,"706":2,"7070":3,"707878":4,"71":3,"712":5,"714281":5,"715711":3,"7168":5,"719258":4,"72":3,"722":[1,2],"725":5,"728":5,"73":3,"730230":3,"734716":5,"735":[2,9],"737435":1,"743443":4,"748936":3,"75":3,"7500":3,"750943":3,"754967":2,"755369":2,"76":1,"768":[2,3],"7680":5,"768000":3,"77":3,"773130":3,"78":3,"780":1,"781":2,"784810":5,"79":3,"795401":3,"79719":4,"8":[1,2,3,5,6,12,13,14,52,54],"80":[3,54],"800002":1,"803739":5,"806694":4,"81":3,"810":2,"811":2,"811163":1,"812":[1,2],"814809":5,"814814":2,"8149":7,"817432":4,"8192":[1,5],"82":3,"822459":3,"823517":[1,2],"826188":5,"83":3,"833":1,"833728":3,"834951":2,"838026":4,"8388608":1,"84":3,"842":1,"84284":4,"843":1,"844306":5,"847":1,"848":1,"849":1,"85":3,"850":1,"858555":3,"859062":5,"86":3,"860458":3,"87":3,"870":9,"8704":5,"872604":5,"873439":5,"873477":3,"876695":5,"879498":3,"88":3,"882344":5,"8828":3,"885254":5,"8867":3,"888257":5,"888756":3,"89":3,"8906":3,"891575":5,"8945":3,"895043":5,"896":3,"8mn":2,"9":[0,1,2,3,4,5],"90":3,"908442":3,"91":3,"917732":3,"92":3,"9216":5,"9219":3,"922689":3,"925276":2,"928814":3,"929456":3,"93":[2,3],"934503":5,"9375":3,"94":[2,3],"9492":3,"95":2,"952835":4,"9531":3,"956960":3,"96":[2,3],"965524":5,"967074":5,"9688":3,"969169":5,"97":2,"971190":2,"9728":5,"9733":1,"974373":5,"976995":5,"978909":3,"98":2,"9805":3,"98432":[1,7],"9844":3,"998493":3,"999982":5,"999986":5,"999995":1,"999999":1,"abstract":[11,12],"break":12,"byte":2,"case":[1,2,11,12,15,20],"class":[2,5,6,11,12,13,53],"default":54,"do":[2,3,6,11,12,14,18,19,21,22,23,24,25,30,49],"float":[2,6,7,11,12,54],"function":[1,2,3,4,5,6,8,9,12,13,14,15,16,53,54,55],"import":[1,2,3,4,5,6,7,11,12],"int":[1,11,12,15,17,26,37,38,44,52,54],"new":[26,44,52],"return":[1,2,3,4,5,6,14,17,18,19,20,21,22,23,24,25,28,30,32,34,37,38,39,40,41,42,43,50,51,52,54,55],"static":[0,11,12],"super":3,"switch":3,"true":[1,2,3,5,6,28,51],"try":[3,5,6,7,13],"var":[5,12],"voil\u00e0":4,"while":[3,11],A:[3,4,5,11,12],And:[0,3],As:[2,3,4,11,12],At:[4,12],But:4,By:54,For:[3,7,11,12,13],If:[4,12,18,19,21,22,23,24,25,40,49,51,53],In:[1,2,3,4,7,12],It:[1,3,4,8,10,12,14,16],NOT:5,Of:11,On:12,One:3,The:[1,2,3,4,7,11,12,18,19,20,21,22,23,24,25,26,28,37,38,39,40,41,42,44,49,51,55],There:1,These:12,To:[1,4,8,11,12,14],_:5,__expf:2,__init__:[13,53],__nv_asin:7,__nv_asinf:7,__nvasinf:7,_a:5,_attent:6,_bwd_kernel:6,_bwd_preprocess:6,_da:5,_dout:5,_dropout:4,_fwd_kernel:6,_layer_norm_bwd_dwdb:5,_layer_norm_bwd_dx_fus:5,_layer_norm_fwd_fus:5,_matmul:3,_mean1:5,_mean2:5,_mean:5,_seeded_dropout:4,_var:5,a100:[3,6,12],a_arg:5,a_hat:5,a_ptr:3,ab:[1,7],abl:12,about:[1,2,3,4,10],abov:[1,2,3,4,12,14],academ:11,acc:[3,6,11,12],acc_scal:6,acceler:11,access:[1,3,11,12,16],accomod:3,accordingli:12,account:12,accumul:[3,6,12],accuraci:[3,11],achiev:[3,11,12],across:[2,4,11,12],activ:3,actual:[3,11,12],ad:5,add:[1,4,5,9,18],add_kernel:1,addit:[2,8,9,11,54],addition:12,address:[11,30],adopt:12,advanc:[2,3,11],advoc:12,affect:3,affin:12,after:3,against:[0,1,2,3,10],aggreg:7,aggress:[11,12],agnost:[11,12],ahead:12,aim:[2,10],al:[6,11,12],alex:4,algebra:12,algorithm:[3,4,6,11,12],alia:12,all:[2,3,4,7,8,11,12,14,32,34,36,50,53],allclos:[2,3],allen1984:12,allen:12,alloc:[1,2,3,5,11],allow:[1,2,11,12],allow_tf32:28,along:[1,3,32,34,37,38,50,54],alpha:6,also:[1,2,3,4,5,7,11,12],altern:4,alwai:[12,51],amd:[6,11],amen:12,amount:[5,11],ampl:12,an:[1,2,3,4,7,11,12,13,18,19,20,21,22,23,24,25,39,40,41,42],analog:1,analysi:[11,12],analyz:12,ancourt1991:12,ancourt:12,ani:[1,2,3,12,14,15,53],anoth:[2,12],anytim:14,apart:12,apex:5,apex_layer_norm:5,api:53,appear:53,appli:[3,4,5,6,7,11,12,18,19,21,22,23,24,25],applic:[4,12,15],approach:[11,12],appropri:1,approxim:2,ar:[0,1,2,3,4,11,12,13,14,16,30,36,49,51,53],arang:[1,2,3,4,5,6,7],arbitrari:3,arc:7,architectur:[3,11],area:12,arg:[1,2,3,5,6,13,15,16,53],argument:[1,2,3,13,14,15,16,51,53],arrai:[12,52],arrang:3,art:[11,12],artifici:4,arxiv:[6,11,12],asin_kernel:7,asinf:7,ask:2,aspect:12,asplo:11,assert:[1,2,3,4,5,6,7],assert_almost_equ:[5,6],assum:[2,53],asynchron:[1,11],atom:[18,19,20,21,22,23,24,25],attent:[8,9],auguin1983:11,auguin:11,auto:[2,3,12,13,14,15],autograd:[5,6],autom:11,automat:[2,3,7,11,12,13],autotun:[3,12],avail:[0,4,7,11,12],avoid:[2,14,51],awar:11,awkward:4,axi:[1,2,3,4,5,6,7,32,34,37,38,50,53],b:[3,11,12],b_ptr:3,back:[1,2,3,4,5,6],backpropag:4,backward:[5,6],bad:4,baghdadi2021:[11,12],baghdadi:[11,12],balanc:12,bandwidth:2,base:[4,7,10,11,12],baseexcept:6,basic:[1,8,12],batch:6,bc:7,becom:11,been:[1,11,12],befor:[3,13,14,18,19,20,21,22,23,24,25],begin:12,behavior:[12,14],being:[2,4],believ:12,below:[4,8,12],bench:[0,14],bench_flash_attent:6,bench_layer_norm:5,benchmark:[0,5,6,54,55],benefit:[2,11,12],best:[1,11],beta:6,between:[1,7,11],bfloat16:28,bia:5,bit:4,block:[1,2,3,4,6,11,12,18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,35,39,40,41,42,43,45,46,47,48,49,51],block_dmodel:6,block_m:6,block_n:6,block_siz:[1,2,4,5,7,12,14,15],block_size_k:3,block_size_m:[3,5],block_size_n:[3,5],block_start:[1,4,7],blue:[1,2,3,5,6],boil:12,bool:[51,53],both:[7,12,51],bound:[1,2,3,12],branch:12,broad:11,broadcast:[26,30,49,51],buffer:6,bug:6,build:[0,3],built:[1,12],bwd:6,c:[3,11,12],c_mask:3,c_ptr:3,cach:[11,12,30],cache_modifi:30,calcul:7,call:[1,3,7,12,13,16,40],callabl:[1,15,16,54],can:[0,1,2,3,4,7,11,12,14,55],cannot:[3,11,12],capabl:[10,11],causal:6,cd:[0,8],cdiv:[1,3,4,5,6,7],ceil:15,certain:15,cgo:[11,12],challeng:4,chang:[3,4,14,30],chapter:10,characterist:12,cheap:11,check:[3,10],checkpoint:4,chen2018:11,chen:11,chip:[2,6],choic:10,click:[1,2,3,4,5,6,7],clone:[0,5,6],close:12,cmake:0,cmp:20,coalesc:11,code:[1,2,3,4,5,6,7,8,11,12],col:[3,5,6,12],col_offset:2,color:53,column:[2,3],com:[0,5,7],combin:11,come:[2,3,12],command:0,common:12,commonli:12,compar:[2,3,4,5,6,10,12,20],compat:28,compil:[2,3,6,10,11,13,16,36],complet:12,complex:12,compos:[4,11],composit:12,comprehens:[11,12],comput:[4,5,6,7,10,11,12,15,27,29,31,33,35,45,46,47,48],computation:[11,12],concern:12,concis:[1,53],condit:[12,51],config:[3,5,6,14],configur:[3,13,14,55],confirm:2,connectom:11,consecut:12,consequ:11,consid:2,consist:4,constexpr:[1,2,3,4,5,6,7,39,40,41,42,47],constraint:[3,6,12],construct:11,constructor:53,consum:3,contain:[12,20,53],contextu:12,contigu:[3,6,17,43],control:[11,12],conveni:3,convert:[1,3,16],convolut:11,cooper:13,copi:[4,11,20],core:[11,12,39,40,41,42,47],correct:[1,7],correspond:[1,2,3,53],cosin:27,cost:12,could:[2,12],cours:11,cpython:0,creat:[1,2,3,5,11],crucial:4,csv:1,ctx:[5,6],cu_seqlen:6,cubla:[3,11],cuda:[1,2,3,4,5,6,7,11],cudnn:11,cumsum:6,current:38,custom:[1,2,3,10],cut:3,cvpr:11,d:[2,4,6,14,16],d_head:6,d_ptr:6,da:5,dao:6,dart:12,darte1999:12,data:[1,3,4,5,6,7,11,12,18,19,20,21,22,23,24,25,30,51,52],data_ptr:16,dataflow:12,david:4,db:5,db_ref:5,db_tri:5,dbia:5,deal:4,decad:11,decim:5,declar:1,decompos:12,decor:[1,3,14,15,16],decreas:4,dedic:3,deep:[3,4,11,12],def:[1,2,3,4,5,6,7,14,15],defin:[1,2,3,12,30],definit:12,delta:6,denom:6,denomin:2,denot:1,dens:12,depend:[0,8,12,51],deploi:11,describ:[4,12],design:12,desir:[26,44],detail:[3,12],detect:11,develop:[11,12],devic:[1,2,3,5,6,7],dg:5,di:6,dialect:12,dict:[14,15],dictionari:[13,15],diesel:12,differ:[1,2,3,4,7,11,12,14,53],difficult:12,difficulti:[3,11],dijkstra82:12,dijkstra:12,dim:[2,6,12],dimens:[3,28,32,34,50],dimension:[3,12,28],dir:0,direct:3,disjoint:12,disk:1,dissert:12,distribut:[2,4,12],divid:6,divis:3,dk:6,dk_ptr:6,dnn:[10,11,12],do_bench:[1,2,3,5,6],do_ptr:6,do_scal:6,doc:[4,7],doe:[1,2,3,12],doesn:12,domain:[11,12],don:[1,2,3],done:[3,11,32,34,50],dot:[3,6],doubl:7,doubli:3,doubt:12,dout:[5,6],down:[3,12],download:[0,1,2,3,4,5,6,7,8],dp:6,dq:6,dq_ptr:6,dram:[1,2],dropout:[8,9],dror:4,ds:6,dsl:[10,11,12],dtype:[1,2,3,5,6,18,19,20,21,22,23,24,25,30,49,52],dv:6,dv_ptr:6,dw:5,dw_ref:5,dw_tri:5,dweight:5,dx:5,dx_ref:5,dx_tri:5,dy:5,e:[0,2,3,4,8,11,12,52],each:[1,2,3,4,11,12,13,15],earli:14,early_config_prun:14,eas:12,easi:[3,4],easier:[1,2,11],easili:3,ed:[1,3],education:2,effect:12,effici:[3,4,11,41],effort:12,eg:14,either:[1,37,38,51],elango2018:12,elango:12,element:[1,2,3,4,5,27,29,31,32,33,34,35,45,46,47,48,49,50,51,53],element_s:[2,5],element_ti:[18,19,20,21,22,23,24,25,30,49],elementwis:[2,30],els:[3,5,6],emerg:11,empti:[3,5,6],empty_lik:[1,2,4,5,6,7],enabl:12,encod:[7,12],encourag:4,end:[11,12,17],enforc:12,engin:12,enqueu:[1,2,5],ensur:12,entir:12,entri:41,environ:10,ep:5,equal:12,error:3,especi:11,et:[4,6,11,12],euromicro:11,evalu:[3,4,14,51],even:[4,12],evict_first:5,evict_last:[5,6],eviction_polici:[5,6,30,49],evidenc:11,evolv:11,exampl:[1,2,3,4,5,6,7,8,11,12,13],except:[5,6],exchang:24,execut:[9,11,12,13,55],exist:[11,12],exp:[2,6],expect:[2,20],expens:[11,12,15],explor:[4,11],exponenti:[2,29],express:[11,12],extend:[3,4],extern:7,extern_lib:7,extra:1,extras_requir:5,extrem:12,f:[1,2,3,6,7,12],facilit:[11,12],fact:12,fairli:3,fals:[5,6,18,19,21,22,23,24,25,28,30,47,49,51,53,54],far:2,fast:[2,11,12],faster:[2,40],fastest:12,featur:5,feel:3,fetch:11,few:12,field:[11,14],figur:12,file:[1,2,3,9],fill:52,fill_valu:6,fine:4,first:[1,3,4,10,12,28,33,35],first_pid_m:3,firstli:4,fit:2,fix:[6,53],flag:2,flash:6,flash_attn:6,flash_attn_func:6,flash_attn_interfac:6,flatten:43,flexibl:11,float16:[3,5,6,28,52],float32:[1,2,3,4,5,6,28,39,42],flow:[11,12],fly:4,fn:[6,16,54],focu:[3,12],folder:4,follow:[0,2,3,10,11,12],footprint:4,forc:4,forget:1,formal:12,format:12,forward:[5,6],found:20,foundat:12,four:41,fp16:3,fp32:3,frac:4,framework:[11,12],free:3,from:[1,2,3,4,6,7,11,12,30,51],full:[1,2,3,4,5,6,7],fulli:12,func:12,fundament:12,further:[4,12],fuse:[3,5,8,9],fusedlayernorm:5,fusion:[2,12],fwd:6,g:[3,4,11,12,52],galleri:[1,2,3,4,5,6,7,8],gb:[1,2,5],gbp:[1,2,5],gener:[1,2,3,4,5,6,7,8,11,12,39,40,41,42,53],geoffrei:4,geq:12,get:[1,2,3,4,9],girbal2006:12,girbal:12,git:0,github:[0,5],give:11,given:[2,3,4,26,37,38,39,40,41,42,44,52],global:12,go:[1,3,12],good:[1,12],gpgpu:11,gpu:[1,2,4,10,11,12,13,16],grad:[5,6],grad_scale_gain_bias_nam:5,grad_scale_nam:5,grad_to_non:[5,54],gradient:54,grammat:12,graphic:11,greater:2,green:[1,2,3,5],grid:[1,2,3,4,5,6,7,37,38],grid_m:3,grid_n:3,grosser2012:12,grosser:12,group:3,group_id:3,group_m:3,group_size_m:3,grow:12,guard:[1,2],guid:[7,11],h:6,ha:[1,3,4,11,12,37,38],had:1,half:6,halid:[11,12],hand:12,handl:[1,2,4,12],handwritten:11,hard:3,harder:12,hardwar:[3,10,12],has_apex:5,has_flash:6,hasattr:5,hasn:1,have:[2,4,6,11,12,16,28,51,53],head:6,heavi:11,helper:[1,2],henc:3,here:[1,2,3,4,5,6,7],heurist:[2,5],hierarch:11,hierarchi:12,high:[3,11,12],higher:3,highli:11,highlight:12,hint:12,hinton:4,hit:3,how:[1,2,3,10,11,15],howev:[2,12],html:[4,7],http:[0,4,5,6,7],i:[1,2,3,4,5,6,11,12],id:[3,38],idea:11,ideal:2,ident:2,identifi:1,idx:[18,19,21,22,23,24,25,30,49],ieee_round:47,ilya:4,imag:[11,12],immedi:6,implement:[1,2,3,4,6,11,12],implicitli:[1,16,30,49],importantli:12,impos:12,improv:[3,4],incompat:[3,12],incorrect:3,increas:[1,2,3,4],incred:11,increment:[6,12],inde:12,independ:[2,12],index:[1,7],indic:[12,51],induc:12,industri:11,inequ:12,inf:[2,6],inform:12,infrastructur:12,initi:[1,3,6],inner:[3,28],inplac:3,input:[1,2,3,4,5,7,12,14,15,26,27,28,29,31,32,33,34,35,36,43,44,45,46,47,48,50],input_ptr:2,input_row_strid:2,instal:[8,10],instanc:[1,2,3,4,11,13,37,38],instanti:4,instead:[2,51],instruct:[10,11],int1:[18,19,21,22,23,24,25,30,49],int32:[4,6,40,41],integ:12,interchang:12,interest:[11,12],intermedi:12,intern:[2,12],interv:17,intrins:12,introduc:4,introduct:10,invari:[2,12],invoc:4,invok:7,ipynb:[1,2,3,4,5,6,7],ir:12,irregular:[2,12],is_contigu:[3,4,5],is_cuda:[1,7],isn:3,issu:[11,12],iter:[3,11,12],its:[1,2,3,12,14],j:[3,5,11,12],jit:[1,2,3,4,5,6,7,14,15],jmlr:4,john:4,johnson:4,journal:12,jrk2013:11,jupyt:[1,2,3,4,5,6,7,8],just:[3,12,15],k:[3,4,6,11,12],k_ptr:6,kb:11,keep:4,kei:[3,11,14],kellei:11,kernel:[4,5,10,11,13,14,15],keyword:[1,13],ki:12,kind:2,know:36,known:12,krizhevski:4,kwarg:[13,16],l:6,l_i:6,l_i_new:6,l_ij:6,l_ptr:6,label:[1,2,3,53],lam1991:11,lam:11,lambda:[1,2,3,4,5,6,7,15],languag:[1,2,3,4,5,6,7,10,11,16],larg:[11,12],last:3,later:[2,12],latest:0,lattner2004:12,lattner2019:12,lattner:12,launch:[1,2,3,37,38],law:12,layer:[8,9,11,12],layer_norm:5,layernorm:5,lead:[4,11,12],leaky_relu:3,leakyrelu:3,learn:[1,2,3,4,10,11,12],least:12,lee2017:11,lee:11,left:12,legal:12,length:[1,6],less:[4,5,11,12],let:[1,2,4,36],letter:12,level:[3,11,12],li:11,libdevic:[8,9],librari:[0,3,11,12],lifelong:12,like:[1,4,6,11,12,40],limit:[2,4],lindenstrauss:4,line:[1,2,3,4,12,53],line_arg:[1,2,3,5,6,53],line_nam:[1,2,3,5,6,53],line_v:[1,2,3,5,6,53],linear:[11,12],link:0,list:[1,3,14,15,53,54,55],litteratur:12,lk:6,ll:4,llvm11:0,llvm:[0,12],lo:6,load:[1,2,3,4,5,6,7,12,51],local:[7,11,12],locat:[3,18,19,20,21,22,23,24,25,30,49],log2:15,log:53,logarithm:[1,31],logic:[19,23,25],look:[4,10,11],loop:[3,6,12,13],low:[8,9,12],lq:6,lv:6,m:[0,2,3,5,6,11],m_i:6,m_i_new:6,m_ij:6,m_ptr:6,machin:[11,12],machineri:[11,12],made:11,mai:[2,12,15],main:[3,11,12],maintain:[2,12],major:[3,12],make:[1,2,11,12],manag:[4,11],mani:[11,12],manual:[2,12],manual_se:[1,2,3,5,6,7],map:3,mapl:12,mark:[4,6,55],markedli:11,mask:[1,2,3,4,5,7,18,19,21,22,23,24,25,30,49,51],match:[3,20],math:15,mathbb:12,mathbf:12,mathcal:[12,42],mathemat:12,matmul:[3,6,12],matmul_kernel:3,matric:[2,3],matrix:[2,4,8,9,11,12,13,28],matrix_s:12,matter:[3,11,12],max:[1,2,5,6,7,21],max_fused_s:5,max_m:[1,2,3,5],maxim:[5,10,12,41],maximum:[1,2,6,7,32],mb:[9,11],mean1:5,mean2:5,mean:[3,5,6,12,14],mechan:[2,12],median:54,memori:[1,2,3,8,9,11,12,18,19,20,21,22,23,24,25,30,49,51],mention:3,meta:[1,2,3,4,5,7,13,14,15],metaparamet:1,method:[12,13,16,53,55],methodolog:12,micro:11,min:[3,5,22],min_m:[1,2,3,5],minimum:34,minut:[1,2,3,4,5,6,7],miss:12,mitig:12,ml:11,mlir:12,mn:2,mode:[5,6],model:[1,11,12,14],modern:[3,10,11,12],modular:12,modulenotfounderror:5,moment:6,moor:12,mora:4,more:[2,3,4,10,11,12,53],most:[3,12],mostli:13,move:3,movement:4,ms:[1,2,3,5,6,54],much:[2,3],mullapudi2016:12,mullapudi:12,multi:[3,11,12],multipl:[1,4,8,9,11,12,13,14,36,40],multiple_of:6,multipli:[3,4,5,12,28],must:[2,3,17,28,51],n:[2,3,5,11,42],n_col:2,n_ctx:6,n_element:[1,4,7],n_head:6,n_round:[39,40,41,42],n_row:2,naiv:[2,4],naive_softmax:2,name:[1,2,3,14,15,53],nativ:[1,2,3],natur:[2,11,31],nb:11,necessari:2,need:[1,2,3,4,40],nelement:2,nest:[3,12],net:12,network:[4,11,12],neural:[4,11,12],neurosci:11,never:4,newdo:6,next:[2,3],next_power_of_2:[2,5],nightli:0,nip:11,nitish:4,nn:[3,5],non:11,none:[2,3,5,6,13,14,18,19,21,22,23,24,25,30,49,53,54],nonzero:51,norm:[4,5,9],normal:[2,6,8,9],normal_:6,normalized_shap:5,note:[0,1,2,3,4,6,12,14,16,51],notebook:[1,2,3,4,5,6,7,8],notic:[2,12],notori:[3,11],novel:11,now:[1,3],num_block:6,num_pid_in_group:3,num_pid_m:3,num_pid_n:3,num_stag:[3,6,13,14],num_warp:[2,3,5,6,13,14],number:[1,2,3,4,5,12,13,14,37,39,40,41,42],numcol:5,numel:[1,4,5,7],numer:[2,11],numrow:5,nvidia:[5,7,11,30],nvvm:7,o:[2,4,6],object:[1,3,11,13,14,16,18,19,20,21,22,23,24,25],obtain:1,obvious:2,occup:5,occur:12,off:5,off_h:6,off_hz:6,off_k:6,off_m:6,off_n:6,off_o:6,off_q:6,off_v:6,off_z:6,offer:11,offici:0,offs_am:3,offs_bn:3,offs_cm:3,offs_cn:3,offs_d:6,offs_k:[3,6],offs_m:6,offs_m_curr:6,offs_n:6,offs_qm:6,offset:[1,4,6,7,39,40,41,42],often:3,omega:12,onc:[2,11,12],one:[2,3,4,8,11,12,53],ones:6,onli:[2,3,4,6,11,12,16],op:[1,2],open:17,openai:0,opencl:11,oper:[1,2,3,4,7,8,11,18,19,20,21,22,23,24,25,51],opportun:11,opsila:11,optim:[11,12],option:[3,14,18,19,21,22,23,24,25,30,49,53,54],orang:5,order:[2,3,8,12],org:[4,6],origin:12,osdi:11,other:[2,3,4,5,10,12,16,28,30,33,35],otherwis:[4,51],our:[1,2,3,11],out:[1,2,3,4,5,6,7,10,12],out_ptr:6,outlin:12,output2:4,output3:4,output:[1,2,3,4,5,6,7],output_ptr:[1,2,4],output_row_start_ptr:2,output_row_strid:2,output_torch:[1,7],output_triton:[1,7],over:[2,4,6,11,12],overfit:4,overflow:2,own:3,p:[4,6,12],p_scale:6,pa:3,packag:16,pact:12,pad:2,par:3,paradigm:[11,12],paragraph:4,parallel:[1,2,3,4,5,10,11,12,13],paralleliz:11,param:15,paramet:[1,3,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55],parametr:[6,11],part:[3,4,12],partial:5,particular:[2,3],particularli:[11,12],partit:11,pass:[1,5,7,12,13],past:[11,12],path:1,pattern:11,pb:3,pdf:6,peak:12,per:[2,4,5],percentil:[6,54],perf:3,perf_model:14,perf_report:[1,2,3,5,6,53],perform:[1,2,4,11,12,14,18,19,20,21,22,23,24,25,54],persist:4,person:12,perspect:12,phase:12,philosophi:12,philox:[4,41],pid:[1,3,4,5,7],pid_m:3,pid_n:3,pip:[0,8],pipelin:[11,12,13],platform:[10,12],pldi:11,pleas:7,plot:[0,1,2,3,53],plot_nam:[1,2,3,5,6,53],pmatrix:12,point:[1,12,41],pointer:[1,2,4,6,16,18,19,20,21,22,23,24,25,30,49],pointerdtyp:[18,19,20,21,22,23,24,25,30,49],polli:12,polyhedr:11,polyhedra:12,popular:12,portabl:[11,12],pose:11,posit:[5,15],possibl:[1,2,3,12,13],power:[2,4,12,15,17],ppopp:12,practic:[1,2,3,11],pragma:11,pre:[0,6,11],pre_hook:13,prealloc:1,predic:14,predict:12,prefer:2,premis:11,present:0,preserv:12,preserve_rng_st:4,prevent:[4,12],primer:12,primit:[11,16],princip:7,principl:12,print:[1,3,4,7],print_data:[1,2,3,5,6],prng:4,probabl:[4,12],problem:1,problemat:12,procedur:12,process:[1,5,11,12],processor:11,produc:[3,4],product:[10,12,28],program:[1,2,3,4,5,10,11,37,38],program_id:[1,2,3,4,5,6,7],programm:[11,12],prohibitev:15,project:[4,11],promot:[3,12],properli:2,properti:12,propos:11,proprietari:3,provid:[1,2,3,4,5,6,10,12,14,32,34,50,54],prune:[4,14],prune_configs_bi:14,pseudo:[3,4,41],pseudorandom:4,ptr:3,ptx:30,purpos:[11,12],push:12,put:4,py:[0,1,2,3,4,5,6,7,9],pypi:[0,5],pytest:[0,6],python:[1,2,3,4,5,6,7,8,16],pytorch:[1,2,4],q:6,q_ptr:6,qk:6,qkv:6,qquad:12,quantiti:6,r:2,rabe:6,ragan:11,rand:[1,4,5,7],randint4x:40,randn:[2,3,4,5,6],randn_lik:[5,6],random:[4,39,40,41,42],randomli:4,rang:[1,2,3,5,6,11,12],rapidli:[11,12],rate:3,rather:11,raw:1,rdom:12,re:[1,3],read:[2,3,8],reader:12,real:11,reason:12,recent:11,recommend:8,recomput:[4,6,11],record_clock:54,rectifi:11,red:6,redmon2016:11,redmon:11,reduct:[2,5,32,34,50],ref_dk:6,ref_dq:6,ref_dv:6,ref_out:6,refer:[1,6,7],regard:7,regardless:[4,51],regim:4,regist:6,regrett:11,regular:[4,12],rel:[1,12],relat:10,releas:[0,11],reli:12,relu:3,remain:[11,53],remateri:6,rememb:3,reorder:12,rep:[5,6,54],repetit:54,repres:[2,3,12,13],requir:[2,4,12],requires_grad:[5,6],requires_grad_:[5,6],research:[11,12],reset:[14,54],reset_to_zero:14,reshap:5,resolut:12,resourc:11,resp:12,respect:12,restrict:12,result:[0,1,2,11,12],ret:2,retain_graph:[5,6],retriev:12,reus:3,revisit:11,right:12,rise:12,role:12,ron:4,root:48,roughli:3,row:[2,3,4,5,6],row_idx:2,row_minus_max:2,row_start_ptr:2,rstd:5,run:[0,1,2,3,4,5,6,7,10,12,14,16,55],runtim:[12,54],ruslan:4,rvar:12,s:[1,2,4,5,12,41],said:12,salakhutdinov:4,salmon2011:4,salmon:4,same:[4,7,11,53],sato2019:12,sato:12,save:[1,2,3,6],save_for_backward:[5,6],save_path:[1,5,6],saved_tensor:[5,6],sc:12,scalabl:12,scalar:[4,11,28,39,40,41,42,52],scale:[6,53],scan:12,schedul:11,scienc:12,scientif:12,scop:12,scope:12,scratchpad:6,script:[0,1,2,3,4,5,6,7],second:[1,2,3,4,5,6,7,12,28,33,35],secondli:4,section:[3,12],see:[1,2,3,4,6,12],seed:[39,40,41,42],seeded_dropout:4,seem:[1,12],select:[7,11,12,51],self:[13,53],semant:[7,12],semi:12,sens:[1,11,12],separ:12,seq:6,sequenc:11,set:[1,4,12],setup:[0,5],sever:[11,12],shall:12,shape:[1,2,3,4,5,6,12,26,30,44,49,51,52],share:11,shaw:4,shift:2,should:[1,3,5,11,12,13,32,34,50,53],show_plot:[1,2,3],shown:12,side:12,sight:12,signal:11,significantli:2,sigplan:12,simd:11,simpl:[1,2,3,4],simplest:8,simpli:[7,12],simplic:3,simplifi:4,sinc:[1,2,3],sine:[7,46],singl:[2,4,11,40],size:[1,2,4,7,12],slower:[11,12],slowest:12,sm80:13,sm:12,sm_scale:6,small:5,smaller:[3,4],smallest:[2,15],snemi3d:11,so:[1,2,3,4,5,12],softmax:[4,6,8,9],softmax_kernel:2,softmax_output:2,softwar:13,solid:12,solut:3,solv:12,some:3,sometim:12,sourc:[1,2,3,4,5,6,7,8,12],space:[11,12],spars:[4,11,12],spatial:12,speak:3,special:11,specif:[3,11],specifi:[12,15,18,19,20,21,22,23,24,25,49],speed:2,sphinx:[1,2,3,4,5,6,7,8],split:12,spmd:[1,11,12],sqrt:5,squar:48,sram:[2,3,5,6],srivastava2014:4,srivastava:4,staat:6,stabil:2,stabl:0,stage:13,stai:6,standard:12,start:[8,17],start_m:6,start_n:6,started_tutori:9,state:[4,11,12],statement:12,staticmethod:[5,6],std:6,step:12,still:[1,2,3,12],stop:17,store:[1,2,3,4,5,6,7,18,19,20,21,22,23,24,25,51],str:[14,15,30,53],straightforward:3,strategi:[4,12],stream:[5,40],strength:11,stride:[2,3,4,5,6],stride_ak:3,stride_am:3,stride_bk:3,stride_bn:3,stride_cm:3,stride_cn:3,stride_kh:6,stride_kk:6,stride_kn:6,stride_kz:6,stride_oh:6,stride_om:6,stride_on:6,stride_oz:6,stride_qh:6,stride_qk:6,stride_qm:6,stride_qz:6,stride_vh:6,stride_vk:6,stride_vn:6,stride_vz:6,stride_xi:3,stride_xj:3,structur:[11,12],style:[1,2,3,5,6,53],subscript:12,substanti:11,substract:2,subtract:2,successfulli:12,suffer:12,suit:11,sum:[1,2,5,6],sum_db:5,sum_dw:5,superhuman:11,support:[4,12],sure:2,surprisingli:11,surround:12,suspicion:2,sutskev:[4,11],sutskever2014:11,swap:20,swizzl:11,synchron:[1,11],system:[0,3,11,12],t:[1,2,3,6,12],t_:12,t_ptr:6,tabul:4,taco:12,take:[3,4,10,14,15],taken:12,target:11,techniqu:[11,12],temperatur:4,tempor:12,tend:12,tension:11,tensor:[1,2,3,4,5,7,11,12,14,16,26,28,30,32,33,34,35,43,44,49,50,51,52,54],tensorrt:11,test:[0,1,5,6,10],test_layer_norm:5,test_op:6,text:12,tflop:3,th:54,than:[2,3,5,11,12,40,53],thei:[3,11,12],them:1,themselv:3,theoret:2,therebi:12,therefor:3,theta:12,theta_:12,thi:[1,2,3,4,5,6,7,11,12,13,14,15,16,41,53],thing:[1,4],think:2,those:2,though:[11,12],thought:12,thread:[2,11,13],through:[8,12],throughout:[6,12,53],throughput:10,tile:12,time:[0,1,2,3,4,5,6,7,11,12,14,40,54],tiramisu:[11,12],tl:[1,2,3,4,5,6,7,52],tmp:[0,6],tog:12,togeth:[4,7],tolist:4,top_k:14,topic:12,torch:[1,2,3,4,5,6,7,16,54],torch_output:3,torch_relu:3,total:[1,2,3,4,5,6,7,9],tradit:[4,11,12],trans_a:[6,28],trans_b:[6,28],transform:[4,12],transpos:6,travers:12,trend:11,tri:[26,44],tri_dk:6,tri_dq:6,tri_dv:6,tri_out:6,trick:2,tricki:4,trigger:[3,14],tril:6,trition:7,triton:[0,1,2,3,4,5,6,7,8,11,12],triton_output:3,trivial:11,tune:[2,3,12,14,15],tuner:13,tupl:[1,26,44,52],tutori:[1,2,3,4,10],tutorials_jupyt:8,tutorials_python:8,tvm:[11,12],two:[1,2,3,12,14,15,17,28],type:[7,15,28,30,51,52],typecast:[30,49],typic:12,u:[0,39],un:12,uncommon:12,underli:7,underneath:12,understand:2,undesir:14,unfortun:[3,12],unifi:11,uniformli:4,unint:51,unit:[0,11],univers:12,unrol:[5,12],up:2,updat:[3,6,12,14],us:[1,2,3,4,5,11,12,13,14,15,16,40,51,53,55],user:7,usr:7,util:[1,5],v100:12,v:6,v_ptr:6,val:[18,19,20,21,22,23,24,25],valid:1,valu:[1,2,3,4,6,7,14,15,17,18,19,20,21,22,23,24,25,27,29,30,31,32,34,36,45,46,47,48,49,50,51,52,53,55],valuabl:2,vari:6,variabl:[3,13],varianc:5,variant:11,variou:8,vasilach:[11,12],vasilache2018:[11,12],vast:12,vec:12,vector:[4,8,9,11,12],vendor:3,veri:[2,4,12],verif:12,verifi:[2,12],via:12,view:43,visibl:12,vision:11,volatil:30,vs:0,w:12,w_shape:5,wa:4,wai:[2,3,4],want:[2,4,51],warmup:[6,54],warp:[2,5,13],wast:2,wdout:5,we:[1,2,3,4,7,11,12],weight:5,well:[4,11,12],whatev:14,wheel:0,when:[2,3,4,11,12,13,14,16,51],where:[1,3,4,5,6,12,15,49],whether:[11,53],which:[1,2,3,4,11,12,14,32,34,50,53],whose:[1,2,3,4,12,14,30],wide:12,wise:[1,2,6,27,29,31,33,35,45,46,47,48,49],wish:[3,12],within:[3,16,17],without:12,wolf:12,wolfe1989:12,won:2,word:12,work:[2,4,6,10,11],workaround:6,workload:[3,13],wors:[3,11,12],would:[1,2,4],wouldn:12,wrapper:3,write:[1,2,3,4,5,6,8,10,12],wrote:2,x:[1,2,3,4,5,7,12,27,29,31,33,35,43,45,46,47,48,51,53],x_arg:5,x_keep:4,x_keep_ptr:4,x_log:[1,53],x_max:2,x_name:[1,2,3,5,6,53],x_ptr:[1,4,7,14,15],x_shape:5,x_size:[14,15],x_val:[1,2,3,5,6,53],xi:12,xii:12,xlabel:53,xo:12,xor:25,y:[1,2,3,5,12,33,35,51,53],y_fwd:5,y_log:53,y_name:[1,2],y_ptr:[1,7],y_ref:5,y_torch:2,y_tri:5,y_triton:2,year:12,yet:[11,12],yi:12,yield:51,yii:12,ylabel:[1,2,3,5,6,53],yo:12,you:[0,1,2,3,4,7,8,11,14,40,51],your:[0,1,10],yourself:[2,3],z:[1,2,6,12],zero:[3,4,5,6,7,14],zeros_lik:6,zip:8},titles:["Installation","Vector Addition","Fused Softmax","Matrix Multiplication","Low-Memory Dropout","Layer Normalization","Fused Attention","Libdevice function","Tutorials","Computation times","Welcome to Triton\u2019s documentation!","Introduction","Related Work","triton.Config","triton.autotune","triton.heuristics","triton.jit","triton.language.arange","triton.language.atomic_add","triton.language.atomic_and","triton.language.atomic_cas","triton.language.atomic_max","triton.language.atomic_min","triton.language.atomic_or","triton.language.atomic_xchg","triton.language.atomic_xor","triton.language.broadcast_to","triton.language.cos","triton.language.dot","triton.language.exp","triton.language.load","triton.language.log","triton.language.max","triton.language.maximum","triton.language.min","triton.language.minimum","triton.language.multiple_of","triton.language.num_programs","triton.language.program_id","triton.language.rand","triton.language.randint","triton.language.randint4x","triton.language.randn","triton.language.ravel","triton.language.reshape","triton.language.sigmoid","triton.language.sin","triton.language.softmax","triton.language.sqrt","triton.language.store","triton.language.sum","triton.language.where","triton.language.zeros","triton.testing.Benchmark","triton.testing.do_bench","triton.testing.perf_report","triton","triton.language","triton.testing"],titleterms:{"default":7,"final":3,"function":7,addit:1,advantag:12,algebra:57,api:10,arang:17,arithmet:3,asin:7,atom:57,atomic_add:18,atomic_and:19,atomic_ca:20,atomic_max:21,atomic_min:22,atomic_or:23,atomic_xchg:24,atomic_xor:25,attent:6,autotun:14,baselin:4,benchmark:[1,2,3,53],binari:0,broadcast_to:26,cach:3,challeng:11,co:27,comparison:57,compil:[12,57],comput:[1,2,3,9],config:13,creation:57,custom:7,distribut:0,do_bench:54,document:10,dot:28,dropout:4,exercis:4,exp:29,from:0,further:10,fuse:[2,6],gener:57,get:10,go:10,heurist:15,hint:57,index:57,instal:0,introduct:11,jit:16,kernel:[1,2,3,7],l2:3,languag:[12,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,57],layer:5,libdevic:7,librari:7,limit:12,linear:57,load:30,log:31,low:4,manipul:57,math:57,matrix:3,max:32,maximum:33,memori:[4,57],min:34,minimum:35,model:57,motiv:[2,3,11],multipl:3,multiple_of:36,normal:5,num_program:37,number:57,op:57,optim:3,packag:0,path:7,perf_report:55,perform:3,pointer:3,polyhedr:12,program:[12,57],program_id:38,python:[0,10],rand:39,randint4x:41,randint:40,randn:42,random:57,ravel:43,reduct:57,refer:[4,11,12],relat:12,represent:12,reshap:44,result:3,s:10,schedul:12,seed:4,shape:57,sigmoid:45,sin:46,softmax:[2,47],sourc:0,sqrt:48,squar:3,start:10,store:49,sum:50,test:[2,3,53,54,55,58],time:9,triton:[10,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58],tutori:8,unit:[2,3],us:7,vector:1,welcom:10,where:51,work:12,zero:52}})