# triton/autotune/python/tools.py

from __future__ import division

import time
from math import ceil, floor

import pyopencl
import pyopencl.characterize  # nv_compute_capability lives in the characterize submodule
from pyviennacl.atidlas import StatementsTuple

class PhysicalLimitsNV:
    """Per-multiprocessor hardware limits for NVIDIA GPUs, keyed by compute capability."""

    def __init__(self, dev):
        self.compute_capability = pyopencl.characterize.nv_compute_capability(dev)
        if self.compute_capability[0] == 1:
            if self.compute_capability[1] <= 1:
                self.warps_per_mp = 24
                self.threads_per_mp = 768
                self.num_32b_reg_per_mp = 8192
                self.reg_alloc_unit_size = 256
            else:
                self.warps_per_mp = 32
                self.threads_per_mp = 1024
                self.num_32b_reg_per_mp = 16384
                self.reg_alloc_unit_size = 512
            self.threads_per_warp = 32
            self.thread_blocks_per_mp = 8
            self.reg_alloc_granularity = 'block'
            self.reg_per_thread = 124
            self.shared_mem_per_mp = 16384
            self.shared_mem_alloc_unit_size = 512
            self.warp_alloc_granularity = 2
            self.max_thread_block_size = 512
        elif self.compute_capability[0] == 2:
            self.threads_per_warp = 32
            self.warps_per_mp = 48
            self.threads_per_mp = 1536
            self.thread_blocks_per_mp = 8
            self.num_32b_reg_per_mp = 32768
            self.reg_alloc_unit_size = 64
            self.reg_alloc_granularity = 'warp'
            self.reg_per_thread = 63
            self.shared_mem_per_mp = 49152
            self.shared_mem_alloc_unit_size = 128
            self.warp_alloc_granularity = 2
            self.max_thread_block_size = 1024
        elif self.compute_capability[0] == 3:
            self.threads_per_warp = 32
            self.warps_per_mp = 64
            self.threads_per_mp = 2048
            self.thread_blocks_per_mp = 16
            self.num_32b_reg_per_mp = 65536
            self.reg_alloc_unit_size = 256
            self.reg_alloc_granularity = 'warp'
            if self.compute_capability[1] == 5:
                self.reg_per_thread = 255
            else:
                self.reg_per_thread = 63
            self.shared_mem_per_mp = 49152
            self.shared_mem_alloc_unit_size = 256
            self.warp_alloc_granularity = 4
            self.max_thread_block_size = 1024
        else:
            raise ValueError('Compute capability %d.%d not supported!'
                             % self.compute_capability)
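
# Minimal sketch of querying the limits above (assumes an NVIDIA OpenCL device
# is present; picking the first platform/device is an arbitrary illustrative
# choice, not something this module requires):
#
#   import pyopencl as cl
#   dev = cl.get_platforms()[0].get_devices()[0]
#   lim = PhysicalLimitsNV(dev)
#   print(lim.warps_per_mp, lim.num_32b_reg_per_mp, lim.shared_mem_per_mp)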

class PhysicalLimitsAMD:
    """Per-compute-unit hardware limits for AMD GPUs, keyed by device name."""

    def __init__(self, dev):
        self.wavefront_size = 64
        # Maximum wavefronts, local data share (bytes) and general-purpose
        # registers per compute unit, per device name.
        WFmax_cu = {'Cypress': 27.6}
        LDS_cu = {'Cypress': 32768}
        GPR_cu = {'Cypress': 16384}
        self.WFmax_cu = WFmax_cu[dev.name]
        self.LDS_cu = LDS_cu[dev.name]
        self.GPR_cu = GPR_cu[dev.name]
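
# Note: the tables above only know the 'Cypress' device name; constructing
# PhysicalLimitsAMD for any other AMD device raises KeyError. Supporting a new
# device means adding one entry per table, keyed by the exact OpenCL device
# name (values below are placeholders, not vendor data):
#
#   WFmax_cu['SomeDevice'] = ...
#   LDS_cu['SomeDevice'] = ...
#   GPR_cu['SomeDevice'] = ...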

def _int_floor(value, multiple_of=1):
    """Round C{value} down to be a C{multiple_of} something."""
    # Mimics the Excel FLOOR function (code adapted from the CUDA occupancy calculator).
    return int(floor(value / multiple_of)) * multiple_of

def _int_ceiling(value, multiple_of=1):
    """Round C{value} up to be a C{multiple_of} something."""
    # Mimics the Excel CEILING function (code adapted from the CUDA occupancy calculator).
    return int(ceil(value / multiple_of)) * multiple_of
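
# Worked example of the rounding helpers (values are illustrative):
#   _int_ceiling(1000, 32) -> 1024  (a 1000-thread request rounded up to whole 32-thread warps)
#   _int_floor(1000, 32)   ->  992  (the same request rounded down to whole warps)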

class OccupancyRecord:
    """Estimated occupancy of a kernel launch on a given device."""

    def init_nvidia(self, dev, threads, shared_mem, registers):
        physical_limits = PhysicalLimitsNV(dev)
        limits = []

        allocated_warps = max(1, _int_ceiling(threads / physical_limits.threads_per_warp))
        max_warps_per_mp = physical_limits.warps_per_mp
        limits.append((min(physical_limits.thread_blocks_per_mp,
                           _int_floor(max_warps_per_mp / allocated_warps)), 'warps'))

        if registers > 0:
            if registers > physical_limits.reg_per_thread:
                limits.append((0, 'registers'))
            else:
                allocated_regs = {'warp': allocated_warps,
                                  'block': _int_ceiling(
                                      _int_ceiling(allocated_warps, physical_limits.warp_alloc_granularity)
                                      * registers * physical_limits.threads_per_warp,
                                      allocated_warps)}[physical_limits.reg_alloc_granularity]
                max_reg_per_mp = {'warp': _int_floor(
                                      physical_limits.num_32b_reg_per_mp
                                      / _int_ceiling(registers * physical_limits.threads_per_warp,
                                                     physical_limits.reg_alloc_unit_size),
                                      physical_limits.warp_alloc_granularity),
                                  'block': physical_limits.num_32b_reg_per_mp}[physical_limits.reg_alloc_granularity]
                limits.append((_int_floor(max_reg_per_mp / allocated_regs), 'registers'))

        if shared_mem > 0:
            allocated_shared_mem = _int_ceiling(shared_mem, physical_limits.shared_mem_alloc_unit_size)
            max_shared_mem_per_mp = physical_limits.shared_mem_per_mp
            limits.append((_int_floor(max_shared_mem_per_mp / allocated_shared_mem), 'shared memory'))

        # The binding constraint is the smallest block count; occupancy is the
        # resulting fraction of the multiprocessor's warp slots that are filled.
        limit, limited_by = min(limits)
        warps_per_mp = limit * allocated_warps
        self.occupancy = 100 * warps_per_mp / physical_limits.warps_per_mp

    def init_amd(self, dev, threads, shared_mem, NReg):
        limits = []
        physical_limits = PhysicalLimitsAMD(dev)
        WFmax_cu = physical_limits.WFmax_cu
        WFsize = physical_limits.wavefront_size

        # WFmax without constraint
        WFwg = _int_ceiling(threads / WFsize)
        WFcu = WFmax_cu if WFwg > WFmax_cu else _int_floor(WFmax_cu, WFwg)
        limits.append(WFcu)

        # WFmax with LDS constraint
        if shared_mem > 0:
            WGmax = _int_floor(physical_limits.LDS_cu / shared_mem)
            limits.append(WGmax * WFwg)

        # WFmax with GPR constraint
        if NReg > 0:
            WFgpr = _int_floor(physical_limits.GPR_cu / (NReg * WFsize))
            limits.append(_int_floor(WFgpr, WFwg))

        self.occupancy = 100 * min(limits) / physical_limits.WFmax_cu

    def __init__(self, dev, threads, shared_mem=0, registers=0):
        # Match vendor strings case-insensitively: NVIDIA's OpenCL vendor
        # string is 'NVIDIA Corporation', which 'NVidia' would never match.
        vendor = dev.vendor.lower()
        if 'advanced micro devices' in vendor:
            self.init_amd(dev, threads, shared_mem, registers)
        elif 'nvidia' in vendor:
            self.init_nvidia(dev, threads, shared_mem, registers)
        else:
            raise ValueError('Unsupported device vendor: %s' % dev.vendor)
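
# Minimal usage sketch (illustrative numbers, not taken from a real kernel):
# estimate occupancy for a 256-thread work-group using 4 KiB of local memory
# and 32 registers per thread:
#
#   rec = OccupancyRecord(dev, threads=256, shared_mem=4096, registers=32)
#   print('%.1f%% occupancy' % rec.occupancy)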

def skip(template, statement, device):
    """Return True when the template should be skipped: template.check()
    reports a problem for the statement, or estimated occupancy is below 15%."""
    statements = StatementsTuple(statement)
    registers_usage = template.registers_usage(statements) / 4  # presumably bytes -> 32-bit registers
    lmem_usage = template.lmem_usage(statements)
    local_size = template.parameters.local_size_0 * template.parameters.local_size_1
    occupancy_record = OccupancyRecord(device, local_size, lmem_usage, registers_usage)
    if template.check(statement) or occupancy_record.occupancy < 15:
        return True
    return False
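
# Typical use in an autotuning loop (sketch; `templates` is assumed to be an
# iterable of candidate templates supplied by the surrounding autotuner):
#
#   candidates = [t for t in templates if not skip(t, statement, device)]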

def benchmark(template, statement, device):
    """Execute the statement with the given template and return the average
    wall-clock time per run, in seconds."""
    statements = StatementsTuple(statement)
    registers_usage = template.registers_usage(statements) / 4  # presumably bytes -> 32-bit registers
    lmem_usage = template.lmem_usage(statements)
    local_size = template.parameters.local_size_0 * template.parameters.local_size_1
    occupancy_record = OccupancyRecord(device, local_size, lmem_usage, registers_usage)
    if occupancy_record.occupancy < 15:
        raise ValueError("Template has too low occupancy")
    # One untimed run first (serves as a warm-up), then time repeated runs
    # until at least 10 ms have accumulated and report the average.
    template.execute(statement, True)
    statement.result.context.finish_all_queues()
    N = 0
    current_time = 0
    while current_time < 1e-2:
        time_before = time.time()
        template.execute(statement, False)
        statement.result.context.finish_all_queues()
        current_time += time.time() - time_before
        N += 1
    return current_time / N
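
# Sketch of how benchmark() could drive a parameter search (names are
# hypothetical; `candidates` as in the skip() example above):
#
#   timings = {}
#   for t in candidates:
#       try:
#           timings[t] = benchmark(t, statement, device)
#       except ValueError:
#           continue  # too low occupancy
#   best = min(timings, key=timings.get)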