diff --git a/cmake/FindLLVM.cmake b/cmake/FindLLVM.cmake
index b615936e6..f9216a24e 100644
--- a/cmake/FindLLVM.cmake
+++ b/cmake/FindLLVM.cmake
@@ -25,7 +25,7 @@
 # LLVM_VERSION_STRING - Full LLVM version string (e.g. 6.0.0svn).
 # LLVM_VERSION_BASE_STRING - Base LLVM version string without git/svn suffix (e.g. 6.0.0).
 #
-# Note: The variable names were chosen in conformance with the offical CMake
+# Note: The variable names were chosen in conformance with the official CMake
 # guidelines, see ${CMAKE_ROOT}/Modules/readme.txt.
 
 # Try suffixed versions to pick up the newest LLVM install available on Debian
@@ -196,4 +196,4 @@
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(LLVM
                                   REQUIRED_VARS LLVM_ROOT_DIR
-                                  VERSION_VAR LLVM_VERSION_STRING)
\ No newline at end of file
+                                  VERSION_VAR LLVM_VERSION_STRING)
diff --git a/docs/programming-guide/chapter-2/related-work.rst b/docs/programming-guide/chapter-2/related-work.rst
index bb83d4851..e21ec4de7 100644
--- a/docs/programming-guide/chapter-2/related-work.rst
+++ b/docs/programming-guide/chapter-2/related-work.rst
@@ -14,7 +14,7 @@ Traditional compilers typically rely on intermediate representations, such as LL
 Program Representation
 +++++++++++++++++++++++
 
-Polyhedral compilation is a vast area of research. In this section we only outline the most basic aspects of this topic, but readers interested in the solid mathematical foundations underneath may refer to the ample litterature on linear and integer programming.
+Polyhedral compilation is a vast area of research. In this section we only outline the most basic aspects of this topic, but readers interested in the solid mathematical foundations underneath may refer to the ample literature on linear and integer programming.
 
 .. table::
    :widths: 50 50
diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h
index 313c7b1b3..39d40511e 100644
--- a/include/triton/codegen/analysis/layout.h
+++ b/include/triton/codegen/analysis/layout.h
@@ -246,7 +246,7 @@ struct N_buffer_info_t {
   std::map firsts_idx;
 };
 
-// abstract for dot and coresponding smem values
+// abstract for dot and corresponding smem values
 class shared_layout: public data_layout {
 private:
   static bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator);
diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc
index 1fe8fed89..526c64b47 100644
--- a/lib/codegen/selection/generator.cc
+++ b/lib/codegen/selection/generator.cc
@@ -569,7 +569,7 @@ std::tuple generator::fp8x4_to_fp16x4(Value *in0
     "prmt.b32 a1, 0, $2, 0x7060; \n\t" // If input is 0xdcba set a1 to 0xd0c0
     "lop3.b32 b0, a0, 0x7fff7fff, 0, 0xc0; \n\t" // b0 = a0 & 0x7fff7fff (strip sign)
     "lop3.b32 b1, a1, 0x7fff7fff, 0, 0xc0; \n\t" // b1 = a1 & 0x7fff7fff (strip sign)
-    "shr.b32 b0, b0, 1; \n\t" // b0 >>= 1 (shift into fp16 poistion)
+    "shr.b32 b0, b0, 1; \n\t" // b0 >>= 1 (shift into fp16 position)
     "shr.b32 b1, b1, 1; \n\t" // b1 >>= 1 (shift into fp16 position)
     "lop3.b32 $0, b0, 0x80008000, a0, 0xf8; \n\t" // out0 = b0 | (0x80008000 & a0) (restore sign)
     "lop3.b32 $1, b1, 0x80008000, a1, 0xf8; \n\t" // out1 = b1 | (0x80008000 & a1) (restore sign)
diff --git a/lib/driver/llvm.cc b/lib/driver/llvm.cc
index c4a13b806..e17c381cb 100644
--- a/lib/driver/llvm.cc
+++ b/lib/driver/llvm.cc
@@ -96,7 +96,7 @@ static bool find_and_replace(std::string& str, const std::string& begin, const s
 std::string path_to_ptxas(int& version) {
   std::vector rets;
   std::string ret;
-  // search pathes for ptxas
+  // search paths for ptxas
   std::vector ptxas_prefixes = {"", "/usr/local/cuda/bin/"};
   std::string triton_ptxas = tools::getenv("TRITON_PTXAS_PATH");
   if(!triton_ptxas.empty())
diff --git a/lib/ir/print.cc b/lib/ir/print.cc
index db73ec7d9..4b6e3266f 100644
--- a/lib/ir/print.cc
+++ b/lib/ir/print.cc
@@ -92,7 +92,7 @@ public:
 //-------------------------
 void SlotTracker::process_module() {
   // Nothing to do at the moment.
-  // Create slots for global variable & unamed functions & ...
+  // Create slots for global variable & unnamed functions & ...
   module_processed = true;
 }
 
diff --git a/python/src/functions.h b/python/src/functions.h
index d5b6c15ef..e27941a0b 100644
--- a/python/src/functions.h
+++ b/python/src/functions.h
@@ -253,7 +253,7 @@ ir::value *dot(ir::value *lhs, ir::value *rhs, ir::builder *builder) {
 std::string where_docstr = R"pbdoc(
     Returns a block of elements from either `x` or `y`, depending on `condition`.
     Note that `x` and `y` are always evaluated regardless of the value of `condition`.
-    If you want to avoid unintented memory operations, use the `mask` arguments in `triton.load` and `triton.store` instead.
+    If you want to avoid unintended memory operations, use the `mask` arguments in `triton.load` and `triton.store` instead.
 
     :param condition: When True (nonzero), yield x, otherwise yield y.
     :type condition: Block of triton.bool
diff --git a/python/test/regression/test_performance.py b/python/test/regression/test_performance.py
index f30b203bb..16811eaa9 100644
--- a/python/test/regression/test_performance.py
+++ b/python/test/regression/test_performance.py
@@ -152,7 +152,7 @@ def test_elementwise(N):
     cur_mem_clock = nvsmi(['clocks.current.memory'])[0]
     ref_mem_clock = mem_clocks[DEVICE_NAME]
     max_gpu_perf = get_dram_gbps()
-    assert abs(cur_mem_clock - ref_mem_clock) < 10, f'GPU memmory must run at {ref_mem_clock} MHz'
+    assert abs(cur_mem_clock - ref_mem_clock) < 10, f'GPU memory must run at {ref_mem_clock} MHz'
     z = torch.empty((N, ), dtype=torch.float16, device='cuda')
     x = torch.randn_like(z)
     y = torch.randn_like(z)
diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
index 13a3d6314..46ddfe760 100644
--- a/python/test/unit/language/test_core.py
+++ b/python/test/unit/language/test_core.py
@@ -55,7 +55,7 @@ def numpy_random(shape, dtype_str, rs: Optional[RandomState] = None, low=None, h
 
 def to_triton(x: np.ndarray, device='cuda', dst_type=None) -> Union[TensorWrapper, torch.Tensor]:
     '''
-    Note: We need dst_type becasue the type of x can be different from dst_type.
+    Note: We need dst_type because the type of x can be different from dst_type.
           For example: x is of type `float32`, dst_type is `bfloat16`.
           If dst_type is None, we infer dst_type from x.
     '''
@@ -424,7 +424,7 @@ def test_unary_op(dtype_x, expr, device='cuda'):
 # ----------------
 # test math ops
 # ----------------
-# @pytest.mark.paramterize("expr", [
+# @pytest.mark.parametrize("expr", [
 #     'exp', 'log', 'cos', 'sin'
 # ])
 
diff --git a/python/triton/code_gen.py b/python/triton/code_gen.py
index 3951d8b6b..e2956aea9 100644
--- a/python/triton/code_gen.py
+++ b/python/triton/code_gen.py
@@ -57,7 +57,7 @@ def mangle_ty(ty):
         elt = mangle_ty(ty.scalar)
         shape = '_'.join(map(str, ty.shape))
         return f'{elt}S{shape}S'
-    assert False, "Unsupport type"
+    assert False, "Unsupported type"
 
 
 def mangle_fn(name, arg_tys, constants):
diff --git a/python/triton/language/core.py b/python/triton/language/core.py
index 29a128321..e52a488b2 100644
--- a/python/triton/language/core.py
+++ b/python/triton/language/core.py
@@ -192,7 +192,7 @@ class dtype:
             return builder.get_float_ty()
         elif self.name == 'fp64':
             return builder.get_double_ty()
-        raise ValueError(f'fail to covert {self} to ir type')
+        raise ValueError(f'fail to convert {self} to ir type')
 
     def __str__(self):
         return self.name
@@ -925,7 +925,7 @@ def where(condition, x, y, _builder=None):
 
     Note that :code:`x` and :code:`y` are always evaluated regardless of the value of :code:`condition`.
 
-    If you want to avoid unintented memory operations, use the :code:`mask` arguments in `triton.load` and `triton.store` instead.
+    If you want to avoid unintended memory operations, use the :code:`mask` arguments in `triton.load` and `triton.store` instead.
 
     The shape of :code:`x` and :code:`y` are both broadcast to the shape of :code:`condition`.
     :code:`x` and :code:`y` must have the data type.
diff --git a/python/triton/ops/blocksparse/matmul.py b/python/triton/ops/blocksparse/matmul.py
index 0fa1a5878..4b6d98aac 100644
--- a/python/triton/ops/blocksparse/matmul.py
+++ b/python/triton/ops/blocksparse/matmul.py
@@ -328,7 +328,7 @@ def dsd_lut(layout, block, step, trans, device):
     # create increments
     incs = torch.stack((B_incs, A_incs), dim=1).view(-1).contiguous()
     # pad by a factor 2*MAX_NUM_STAGES
-    # to accomodate pre-fetching inside the kernel
+    # to accommodate pre-fetching inside the kernel
     pad = torch.zeros(20, device=incs.device, dtype=incs.dtype)
     incs = torch.cat((incs, pad))
     # create lut
diff --git a/python/triton/testing.py b/python/triton/testing.py
index bfcd6ef6b..594edcbf2 100644
--- a/python/triton/testing.py
+++ b/python/triton/testing.py
@@ -379,7 +379,7 @@ def cuda_memcheck(**target_kwargs):
                 test_id = kwargs['request'].node.callspec.id
                 cmd = f"{path}::{test_fn.__name__}[{test_id}]"
                 out = subprocess.run(["cuda-memcheck", "pytest", "-vs", cmd], capture_output=True, env=env)
-                assert out.returncode == 0, "cuda-memcheck returned an error: bounds checkng failed"
+                assert out.returncode == 0, "cuda-memcheck returned an error: bounds checking failed"
                 assert "ERROR SUMMARY: 0 errors" in str(out.stdout)
             else:
                 test_fn(*args, **kwargs)
diff --git a/python/triton/tools/disasm.py b/python/triton/tools/disasm.py
index 3672d4b05..24a0787c5 100644
--- a/python/triton/tools/disasm.py
+++ b/python/triton/tools/disasm.py
@@ -104,7 +104,7 @@ def extract(file_path, fun):
         # peek the next line
         line = sass_lines[line_idx].decode()
     # Print sass
-    # label naming convension: LBB#i
+    # label naming convention: LBB#i
     for idx, (ctrl, asm) in enumerate(asm_buffer):
         # Print label if this is BRA target
         offset = idx * 16
diff --git a/python/tutorials/02-fused-softmax.py b/python/tutorials/02-fused-softmax.py
index 7af24e18d..7447b60af 100644
--- a/python/tutorials/02-fused-softmax.py
+++ b/python/tutorials/02-fused-softmax.py
@@ -78,7 +78,7 @@ def softmax_kernel(
     input_ptrs = row_start_ptr + col_offsets
     # Load the row into SRAM, using a mask since BLOCK_SIZE may be > than n_cols
     row = tl.load(input_ptrs, mask=col_offsets < n_cols, other=-float('inf'))
-    # Substract maximum for numerical stability
+    # Subtract maximum for numerical stability
     row_minus_max = row - tl.max(row, axis=0)
     # Note that exponentials in Triton are fast but approximate (i.e., think __expf in CUDA)
     numerator = tl.exp(row_minus_max)
diff --git a/python/tutorials/03-matrix-multiplication.py b/python/tutorials/03-matrix-multiplication.py
index 7b2a35bd2..49382aecd 100644
--- a/python/tutorials/03-matrix-multiplication.py
+++ b/python/tutorials/03-matrix-multiplication.py
@@ -18,7 +18,7 @@ You will specifically learn about:
 # They are notoriously hard to optimize, hence their implementation is generally done by
 # hardware vendors themselves as part of so-called "kernel libraries" (e.g., cuBLAS).
 # Unfortunately, these libraries are often proprietary and cannot be easily customized
-# to accomodate the needs of modern deep learning workloads (e.g., fused activation functions).
+# to accommodate the needs of modern deep learning workloads (e.g., fused activation functions).
 # In this tutorial, you will learn how to implement efficient matrix multiplications by
 # yourself with Triton, in a way that is easy to customize and extend.
 #