[DOCS] Fix spelling (#664)
This PR applies minor spelling fixes in comments and string literals to `master`. It should not change any behavior.
This commit is contained in:
@@ -253,7 +253,7 @@ ir::value *dot(ir::value *lhs, ir::value *rhs, ir::builder *builder) {
|
||||
std::string where_docstr = R"pbdoc(
|
||||
Returns a block of elements from either `x` or `y`, depending on `condition`.
|
||||
Note that `x` and `y` are always evaluated regardless of the value of `condition`.
|
||||
If you want to avoid unintented memory operations, use the `mask` arguments in `triton.load` and `triton.store` instead.
|
||||
If you want to avoid unintended memory operations, use the `mask` arguments in `triton.load` and `triton.store` instead.
|
||||
|
||||
:param condition: When True (nonzero), yield x, otherwise yield y.
|
||||
:type condition: Block of triton.bool
|
||||
|
@@ -152,7 +152,7 @@ def test_elementwise(N):
|
||||
cur_mem_clock = nvsmi(['clocks.current.memory'])[0]
|
||||
ref_mem_clock = mem_clocks[DEVICE_NAME]
|
||||
max_gpu_perf = get_dram_gbps()
|
||||
assert abs(cur_mem_clock - ref_mem_clock) < 10, f'GPU memmory must run at {ref_mem_clock} MHz'
|
||||
assert abs(cur_mem_clock - ref_mem_clock) < 10, f'GPU memory must run at {ref_mem_clock} MHz'
|
||||
z = torch.empty((N, ), dtype=torch.float16, device='cuda')
|
||||
x = torch.randn_like(z)
|
||||
y = torch.randn_like(z)
|
||||
|
@@ -55,7 +55,7 @@ def numpy_random(shape, dtype_str, rs: Optional[RandomState] = None, low=None, h
|
||||
|
||||
def to_triton(x: np.ndarray, device='cuda', dst_type=None) -> Union[TensorWrapper, torch.Tensor]:
|
||||
'''
|
||||
Note: We need dst_type becasue the type of x can be different from dst_type.
|
||||
Note: We need dst_type because the type of x can be different from dst_type.
|
||||
For example: x is of type `float32`, dst_type is `bfloat16`.
|
||||
If dst_type is None, we infer dst_type from x.
|
||||
'''
|
||||
@@ -424,7 +424,7 @@ def test_unary_op(dtype_x, expr, device='cuda'):
|
||||
# ----------------
|
||||
# test math ops
|
||||
# ----------------
|
||||
# @pytest.mark.paramterize("expr", [
|
||||
# @pytest.mark.parametrize("expr", [
|
||||
# 'exp', 'log', 'cos', 'sin'
|
||||
# ])
|
||||
|
||||
|
@@ -57,7 +57,7 @@ def mangle_ty(ty):
|
||||
elt = mangle_ty(ty.scalar)
|
||||
shape = '_'.join(map(str, ty.shape))
|
||||
return f'{elt}S{shape}S'
|
||||
assert False, "Unsupport type"
|
||||
assert False, "Unsupported type"
|
||||
|
||||
|
||||
def mangle_fn(name, arg_tys, constants):
|
||||
|
@@ -192,7 +192,7 @@ class dtype:
|
||||
return builder.get_float_ty()
|
||||
elif self.name == 'fp64':
|
||||
return builder.get_double_ty()
|
||||
raise ValueError(f'fail to covert {self} to ir type')
|
||||
raise ValueError(f'fail to convert {self} to ir type')
|
||||
|
||||
def __str__(self):
|
||||
return self.name
|
||||
@@ -925,7 +925,7 @@ def where(condition, x, y, _builder=None):
|
||||
|
||||
Note that :code:`x` and :code:`y` are always evaluated regardless of the value of :code:`condition`.
|
||||
|
||||
If you want to avoid unintented memory operations, use the :code:`mask` arguments in `triton.load` and `triton.store` instead.
|
||||
If you want to avoid unintended memory operations, use the :code:`mask` arguments in `triton.load` and `triton.store` instead.
|
||||
|
||||
The shape of :code:`x` and :code:`y` are both broadcast to the shape of :code:`condition`.
|
||||
:code:`x` and :code:`y` must have the same data type.
|
||||
|
@@ -328,7 +328,7 @@ def dsd_lut(layout, block, step, trans, device):
|
||||
# create increments
|
||||
incs = torch.stack((B_incs, A_incs), dim=1).view(-1).contiguous()
|
||||
# pad by a factor 2*MAX_NUM_STAGES
|
||||
# to accomodate pre-fetching inside the kernel
|
||||
# to accommodate pre-fetching inside the kernel
|
||||
pad = torch.zeros(20, device=incs.device, dtype=incs.dtype)
|
||||
incs = torch.cat((incs, pad))
|
||||
# create lut
|
||||
|
@@ -379,7 +379,7 @@ def cuda_memcheck(**target_kwargs):
|
||||
test_id = kwargs['request'].node.callspec.id
|
||||
cmd = f"{path}::{test_fn.__name__}[{test_id}]"
|
||||
out = subprocess.run(["cuda-memcheck", "pytest", "-vs", cmd], capture_output=True, env=env)
|
||||
assert out.returncode == 0, "cuda-memcheck returned an error: bounds checkng failed"
|
||||
assert out.returncode == 0, "cuda-memcheck returned an error: bounds checking failed"
|
||||
assert "ERROR SUMMARY: 0 errors" in str(out.stdout)
|
||||
else:
|
||||
test_fn(*args, **kwargs)
|
||||
|
@@ -104,7 +104,7 @@ def extract(file_path, fun):
|
||||
# peek the next line
|
||||
line = sass_lines[line_idx].decode()
|
||||
# Print sass
|
||||
# label naming convension: LBB#i
|
||||
# label naming convention: LBB#i
|
||||
for idx, (ctrl, asm) in enumerate(asm_buffer):
|
||||
# Print label if this is BRA target
|
||||
offset = idx * 16
|
||||
|
@@ -78,7 +78,7 @@ def softmax_kernel(
|
||||
input_ptrs = row_start_ptr + col_offsets
|
||||
# Load the row into SRAM, using a mask since BLOCK_SIZE may be greater than n_cols
|
||||
row = tl.load(input_ptrs, mask=col_offsets < n_cols, other=-float('inf'))
|
||||
# Substract maximum for numerical stability
|
||||
# Subtract maximum for numerical stability
|
||||
row_minus_max = row - tl.max(row, axis=0)
|
||||
# Note that exponentials in Triton are fast but approximate (i.e., think __expf in CUDA)
|
||||
numerator = tl.exp(row_minus_max)
|
||||
|
@@ -18,7 +18,7 @@ You will specifically learn about:
|
||||
# They are notoriously hard to optimize, hence their implementation is generally done by
|
||||
# hardware vendors themselves as part of so-called "kernel libraries" (e.g., cuBLAS).
|
||||
# Unfortunately, these libraries are often proprietary and cannot be easily customized
|
||||
# to accomodate the needs of modern deep learning workloads (e.g., fused activation functions).
|
||||
# to accommodate the needs of modern deep learning workloads (e.g., fused activation functions).
|
||||
# In this tutorial, you will learn how to implement efficient matrix multiplications by
|
||||
# yourself with Triton, in a way that is easy to customize and extend.
|
||||
#
|
||||
|
Reference in New Issue
Block a user