diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 5ad6eb304..843265937 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -198,21 +198,24 @@ scanline_layout::scanline_layout(size_t num_warps, bool is_dot = std::any_of(values.begin(), values.end(), [&](ir::value* v) { return dynamic_cast(v); }); - ir::value *ptr = nullptr; + + + std::vector ptrs; for(ir::value *v: values) - for(ir::user *usr: v->get_users()) - if(auto *io = dynamic_cast(usr)){ - if(!ptr || ptr->get_type()->get_tile_rank() < io->get_pointer_operand()->get_type()->get_tile_rank()) - ptr = io->get_pointer_operand(); - } + for(ir::user *usr: v->get_users()) + if(auto *io = dynamic_cast(usr)){ + if(ptrs.empty() || ptrs[0]->get_type()->get_tile_rank() <= io->get_pointer_operand()->get_type()->get_tile_rank()) + ptrs.push_back(io->get_pointer_operand()); + } unsigned i = order_[0]; int contiguous = 1; - if(ptr){ + for(ir::value* ptr: ptrs){ int nbits = ptr->get_type()->get_pointer_element_ty()->get_scalar_ty()->get_primitive_size_in_bits(); - contiguous = std::min(align->get(ptr, i), 128 / nbits); + contiguous = std::max(contiguous, std::min(align->get(ptr, i), 128 / nbits)); } + nts_[i] = clamp(size / num_threads, 1, std::min(contiguous, shape_[i])); mts_[i] = clamp(num_threads, 1, shape_[i] / nts_[i]); size /= shape_[i]; diff --git a/python/triton/language/random.py b/python/triton/language/random.py index dbc16e35d..1a1ecbc37 100644 --- a/python/triton/language/random.py +++ b/python/triton/language/random.py @@ -100,9 +100,9 @@ def uint32_to_uniform_float(x): This is originally designed from uint32, but it works with int32 too as long as the int32 uniformly covers all the possible values it can take. """ - max = 2147483647. + max = 4.656613e-10 # = 1/MAX_INT = 1/2147483647. x = tl.where(x < 0, -x - 1, x) - return x / max + return x * max @triton.jit def pair_uniform_to_normal(u1, u2):