[TESTING] use torch.int for autotuning cache (#840)

For stupid reasons, ops on int8 are 3 times slower than on int, and for another set of stupid reasons we are not using cudaMemset for `zero_`, so using an `int8` buffer in `do_bench` makes it slow.

Co-authored-by: Philippe Tillet <phil@openai.com>
parent 77bc5187b5
commit 0d7e753227
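The claim in the message is easy to check in isolation. Below is a minimal sketch, not part of the commit, that times `zero_()` on 256 MB typed as `int8` versus the same 256 MB typed as `int` (a quarter as many elements); the commit message reports roughly a 3x gap between the two. Absolute numbers will vary by GPU; the ratio is the point.

    import torch

    # Same byte count either way: 256e6 int8 elements vs. 256e6/4 int32 elements.
    int8_buf = torch.empty(int(256e6), dtype=torch.int8, device='cuda')
    int32_buf = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')

    def time_zero(buf, n=100):
        # CUDA events measure device-side time, excluding Python launch overhead.
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        buf.zero_()  # warm-up launch
        start.record()
        for _ in range(n):
            buf.zero_()
        end.record()
        torch.cuda.synchronize()
        return start.elapsed_time(end) / n  # ms per zero_() call

    print('int8 zero_ :', time_zero(int8_buf), 'ms')
    print('int32 zero_:', time_zero(int32_buf), 'ms')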
@@ -128,7 +128,7 @@ elementwise_data = {
     1024 * 16: 0.0219,
     1024 * 64: 0.0791,
     1024 * 256: 0.243,
-    1024 * 1024: 0.534,
+    1024 * 1024: 0.530,
     1024 * 4096: 0.796,
     1024 * 16384: 0.905,
     1024 * 65536: 0.939,
@@ -115,7 +115,9 @@ def nvsmi(attrs):
     return ret
 
 
-def do_bench(fn, warmup=25, rep=100, grad_to_none=None, percentiles=[0.5, 0.2, 0.8], record_clocks=False):
+def do_bench(fn, warmup=25, rep=100, grad_to_none=None,
+             percentiles=(0.5, 0.2, 0.8),
+             record_clocks=False, fast_flush=False):
     """
     Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
     the 20-th and 80-th performance percentile.
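A usage sketch of the extended signature: `fast_flush` is opt-in, so existing callers keep the `int8` buffer. The benchmarked callable below is an arbitrary stand-in; per the docstring, the three returned values are the median, 20th-, and 80th-percentile runtimes, matching the default `percentiles=(0.5, 0.2, 0.8)`.

    import torch
    import triton

    x = torch.randn(1024, 1024, device='cuda')

    # Benchmark an arbitrary callable; fast_flush=True selects the int buffer.
    med, p20, p80 = triton.testing.do_bench(lambda: x @ x, fast_flush=True)
    print(f'median {med:.4f} ms (p20 {p20:.4f}, p80 {p80:.4f})')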
@@ -130,6 +132,8 @@ def do_bench(fn, warmup=25, rep=100, grad_to_none=None, percentiles=[0.5, 0.2, 0
     :type grad_to_none: torch.tensor, optional
     :param percentiles: Performance percentile to return in addition to the median.
     :type percentiles: list[float]
+    :param fast_flush: Use faster kernel to flush L2 between measurements
+    :type fast_flush: bool
     """
 
     # Estimate the runtime of the function
@@ -151,7 +155,10 @@ def do_bench(fn, warmup=25, rep=100, grad_to_none=None, percentiles=[0.5, 0.2, 0
     # doesn't contain any input data before the run
     start_event = [torch.cuda.Event(enable_timing=True) for i in range(n_repeat)]
     end_event = [torch.cuda.Event(enable_timing=True) for i in range(n_repeat)]
-    cache = torch.empty(int(256e6), dtype=torch.int8, device='cuda')
+    if fast_flush:
+        cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
+    else:
+        cache = torch.empty(int(256e6), dtype=torch.int8, device='cuda')
     # Warm-up
     for _ in range(n_warmup):
         fn()
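Only the allocation changes; how the buffer flushes L2 is unchanged. The measurement loop, paraphrased below from the surrounding `do_bench` body and not part of this diff, overwrites the whole buffer before every timed call so `fn()` never finds its inputs resident in L2. That `zero_()` launch is exactly the op the `int` dtype makes cheaper.

    # Paraphrase of the surrounding do_bench body (not part of this diff).
    for i in range(n_repeat):
        # clear gradients so backward passes don't accumulate across runs
        if grad_to_none is not None:
            for g in grad_to_none:
                g.grad = None
        cache.zero_()  # evict L2; this is the launch sped up by the int dtype
        start_event[i].record()
        fn()
        end_event[i].record()
    torch.cuda.synchronize()
    times = [s.elapsed_time(e) for s, e in zip(start_event, end_event)]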