Some improvements
This commit is contained in:
@@ -32,6 +32,8 @@ private:
|
||||
{
|
||||
if (p_.simd_width>1)
|
||||
return TEMPLATE_INVALID_SIMD_WIDTH;
|
||||
if(p_.fetching_policy==FETCH_FROM_LOCAL)
|
||||
return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
|
||||
return TEMPLATE_VALID;
|
||||
}
|
||||
|
||||
|
@@ -47,14 +47,26 @@ class matrix_product_template : public template_base_impl<matrix_product_templat
|
||||
{
|
||||
|
||||
private:
|
||||
unsigned int n_lmem_elements() const
|
||||
unsigned int lmem_usage(statements_container const & statements) const
|
||||
{
|
||||
viennacl::scheduler::statement const & statement = statements.data().front();
|
||||
viennacl::scheduler::statement_node_numeric_type numeric_type = lhs_most(statement.array(), statement.root()).lhs.numeric_type;
|
||||
|
||||
unsigned int N = 0;
|
||||
if (p_.A_fetching_policy==FETCH_FROM_LOCAL)
|
||||
N += p_.kL * (p_.mL+1);
|
||||
if (p_.B_fetching_policy==FETCH_FROM_LOCAL)
|
||||
N += p_.nL * (p_.kL+1);
|
||||
return N;
|
||||
return N*tools::size_of(numeric_type);
|
||||
}
|
||||
|
||||
/**
 * Register-usage estimate for this parameter set.
 *
 * Counts p_.mS*p_.nS + p_.mS*p_.kS + p_.kS*p_.nS elements and scales that
 * count by tools::size_of() of the numeric type found on the left-most
 * operand of the first statement in @p statements.
 *
 * @param statements statements the kernel is generated for; only the first one is inspected
 * @return element count multiplied by the element size reported by tools::size_of()
 */
unsigned int registers_usage(statements_container const & statements) const
{
  viennacl::scheduler::statement const & first_statement = statements.data().front();
  viennacl::scheduler::statement_node_numeric_type element_type =
      lhs_most(first_statement.array(), first_statement.root()).lhs.numeric_type;

  // Kept in an unsigned int before scaling, exactly as the element count was originally accumulated.
  unsigned int element_count = p_.mS * p_.nS
                             + p_.mS * p_.kS
                             + p_.kS * p_.nS;
  return element_count*tools::size_of(element_type);
}
|
||||
|
||||
int check_invalid_impl(viennacl::ocl::device const &, statements_container const &) const
|
||||
|
@@ -27,9 +27,17 @@ class reduction_template : public template_base_impl<reduction_template, reducti
|
||||
{
|
||||
|
||||
private:
|
||||
unsigned int n_lmem_elements() const
|
||||
|
||||
unsigned int num_lmem_elements(statements_container const & statements) const
|
||||
{
|
||||
return p_.local_size_0;
|
||||
unsigned int res = 0;
|
||||
for(statements_container::data_type::const_iterator it = statements.data().begin() ; it != statements.data().end() ; ++it)
|
||||
{
|
||||
viennacl::scheduler::statement const & statement = statements.data().front();
|
||||
viennacl::scheduler::statement_node_numeric_type numeric_type = lhs_most(statement.array(), statement.root()).lhs.numeric_type;
|
||||
res += p_.local_size_0*tools::size_of(numeric_type);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
int check_invalid_impl(viennacl::ocl::device const &, statements_container const & statements) const
|
||||
|
@@ -34,7 +34,7 @@ private:
|
||||
return TEMPLATE_VALID;
|
||||
}
|
||||
|
||||
unsigned int n_lmem_elements() const
|
||||
unsigned int lmem_usage() const
|
||||
{
|
||||
return p_.local_size_0*(p_.local_size_1+1);
|
||||
}
|
||||
|
@@ -486,6 +486,10 @@ private:
|
||||
public:
|
||||
template_base(binding_policy_t binding_policy) : binding_policy_(binding_policy) {}
|
||||
|
||||
/** Local-memory requirement reported for the given statements; this base version reports 0 and templates may supply their own. */
virtual unsigned int lmem_usage(statements_container const &) const { return 0; }
|
||||
|
||||
/** Register requirement reported for the given statements; this base version reports 0 and templates may supply their own. */
virtual unsigned int registers_usage(statements_container const &) const { return 0; }
|
||||
|
||||
virtual ~template_base(){ }
|
||||
|
||||
std::vector<std::string> generate(std::string const & kernel_prefix, statements_container const & statements, viennacl::ocl::device const & device)
|
||||
@@ -521,10 +525,8 @@ class template_base_impl : public template_base
|
||||
{
|
||||
private:
|
||||
virtual int check_invalid_impl(viennacl::ocl::device const &, statements_container const &) const { return TEMPLATE_VALID; }
|
||||
virtual unsigned int n_lmem_elements() const { return 0; }
|
||||
|
||||
protected:
|
||||
|
||||
bool has_misaligned_offset(statements_container const & statements)
|
||||
{
|
||||
for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it)
|
||||
@@ -565,13 +567,10 @@ public:
|
||||
{
|
||||
using namespace viennacl::tools;
|
||||
|
||||
viennacl::scheduler::statement const & statement = statements.data().front();
|
||||
unsigned int scalartype_size = tools::size_of(lhs_most(statement.array(), statement.root()).lhs.numeric_type);
|
||||
|
||||
//Query device informations
|
||||
size_t lmem_available = static_cast<size_t>(device.local_mem_size());
|
||||
size_t lmem_usage = scalartype_size*n_lmem_elements();
|
||||
if (lmem_usage>lmem_available)
|
||||
size_t lmem_used = lmem_usage(statements);
|
||||
if (lmem_used>lmem_available)
|
||||
return TEMPLATE_LOCAL_MEMORY_OVERFLOW;
|
||||
|
||||
//Invalid work group size
|
||||
|
@@ -18,30 +18,32 @@ import optimize
|
||||
import sys
|
||||
|
||||
DATATYPES = { 'single' : vcl.float32,
|
||||
'double' : vcl.float64
|
||||
}
|
||||
'double' : vcl.float64 }
|
||||
|
||||
TYPES = { 'vector-axpy': vcl.atidlas.VectorAxpyTemplate,
|
||||
'matrix-axpy': vcl.atidlas.MatrixAxpyTemplate,
|
||||
'reduction': vcl.atidlas.ReductionTemplate,
|
||||
'row-wise-reduction': vcl.atidlas.RowWiseReductionTemplate,
|
||||
'matrix-product': vcl.atidlas.MatrixProductTemplate
|
||||
}
|
||||
|
||||
PNAMES = {
|
||||
'vector-axpy': ['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
|
||||
'matrix-axpy': ['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'num-groups-1', 'fetch'],
|
||||
'reduction': ['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
|
||||
'row-wise-reduction': ['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'fetch'],
|
||||
'matrix-product': ['simd-width', 'local-size-0', 'kL', 'local-size-1', 'mS', 'kS', 'nS', 'A-fetch-policy', 'B-fetch-policy', 'local-fetch-size-0', 'local-fetch-size-1']
|
||||
}
|
||||
|
||||
PERFINDEX = { 'vector-axpy' : (lambda x: 3*x[0]*x[1][0]/x[2]*1e-9, 'GB/s') ,
|
||||
'matrix-axpy' : (lambda x: 3*x[0]*x[1][0]*x[1][1]/x[2]*1e-9, 'GB/s'),
|
||||
'reduction' : (lambda x: 2*x[0]*x[1][0]*x[1][1]/x[2]*1e-9, 'GB/s'),
|
||||
'row-wise-reduction' : (lambda x: x[0]*x[1][0]*x[1][1]/x[2]*1e-9, 'GB/s'),
|
||||
'matrix-product': (lambda x: 2*x[1][0]*x[1][1]*x[1][2]/x[2]*1e-9, 'GFLOPs/s')
|
||||
}
|
||||
# Per-operation tuning metadata, keyed by operation name:
#   'template'        -- template class for the operation
#   'parameter-names' -- names of the tunable parameters
#   'perf-index'      -- maps x = [itemsize, sizes, time] to a performance figure
#   'perf-measure'    -- unit label for that figure
TYPES = {
    'vector-axpy': {
        'template': vcl.atidlas.VectorAxpyTemplate,
        'parameter-names': ['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
        'perf-index': lambda x: 3*x[0]*x[1][0]/x[2]*1e-9,
        'perf-measure': 'GB/s',
    },

    'matrix-axpy': {
        'template': vcl.atidlas.MatrixAxpyTemplate,
        'parameter-names': ['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'num-groups-1', 'fetch'],
        'perf-index': lambda x: 3*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
        'perf-measure': 'GB/s',
    },

    'reduction': {
        'template': vcl.atidlas.ReductionTemplate,
        'parameter-names': ['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
        'perf-index': lambda x: 2*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
        'perf-measure': 'GB/s',
    },

    'row-wise-reduction': {
        'template': vcl.atidlas.RowWiseReductionTemplate,
        'parameter-names': ['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'fetch'],
        'perf-index': lambda x: x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
        'perf-measure': 'GB/s',
    },

    'matrix-product': {
        'template': vcl.atidlas.MatrixProductTemplate,
        'parameter-names': ['simd-width', 'local-size-0', 'kL', 'local-size-1', 'mS', 'kS', 'nS',
                            'A-fetch-policy', 'B-fetch-policy', 'local-fetch-size-0', 'local-fetch-size-1'],
        'perf-index': lambda x: 2*x[1][0]*x[1][1]*x[1][2]/x[2]*1e-9,
        'perf-measure': 'GFLOP/s',
    },
}
|
||||
|
||||
def parameter_space(operation):
|
||||
simd = [1, 2, 4, 8]
|
||||
@@ -54,25 +56,22 @@ def parameter_space(operation):
|
||||
if operation == 'reduction': return [simd, pow2_1D, pow2_1D, fetch]
|
||||
if operation == 'matrix-axpy': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D, fetch]
|
||||
if operation == 'row-wise-reduction': return [simd, pow2_2D, pow2_2D, pow2_1D, fetch]
|
||||
if operation == 'matrix-product': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D_unrolled, pow2_2D_unrolled, pow2_2D_unrolled, fetch, fetch, pow2_2D, pow2_2D]
|
||||
if operation == 'matrix-product': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D_unrolled, pow2_2D_unrolled, pow2_2D_unrolled, fetch, fetch, [0] + pow2_2D, [0] + pow2_2D]
|
||||
|
||||
def do_tuning(config_fname, spec_fname, viennacl_root):
|
||||
|
||||
config = ConfigObj(config_fname, configspec=spec_fname)
|
||||
map_to_list = lambda T: list(map(T[0], T[1] if isinstance(T[1], list) else [T[1]]))
|
||||
|
||||
for operation in ['vector-axpy', 'matrix-axpy', 'row-wise-reduction', 'matrix-product']:
|
||||
|
||||
tmp_folder = config['tmp-folder'] if 'tmp-folder' in config else ""
|
||||
|
||||
|
||||
if operation in config:
|
||||
p = config[operation]
|
||||
|
||||
p = config[operation]
|
||||
confdevices = p['devices']
|
||||
devices = utils.DEVICES_PRESETS[confdevices] if confdevices in utils.DEVICES_PRESETS else [utils.all_devices[int(i)] for i in confdevices]
|
||||
|
||||
precisions = ['single', 'double'] if 'all' in p['precision'] else p['precision']
|
||||
|
||||
datatypes = [DATATYPES[k] for k in precisions]
|
||||
s = map_to_list((int, p['size']))
|
||||
|
||||
@@ -98,8 +97,8 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
|
||||
fname = os.devnull
|
||||
with open(fname, "w+") as archive:
|
||||
with vcl.Statement(node) as statement:
|
||||
result = optimize.genetic(statement, ctx, TYPES[operation], lambda p: TYPES[operation](p, *other_params),
|
||||
PNAMES[operation], parameter_space(operation), lambda t: PERFINDEX[operation][0]([datatype().itemsize, s, t]), PERFINDEX[operation][1], archive)
|
||||
result = optimize.genetic(statement, ctx, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
|
||||
TYPES[operation]['parameter-names'], parameter_space(operation), lambda t: TYPES[operation]['perf-index']([datatype().itemsize, s, t]), TYPES[operation]['perf-measure'], archive)
|
||||
if result and viennacl_root:
|
||||
vclio.generate_viennacl_headers(viennacl_root, device, datatype, operation, other_params, result[1])
|
||||
|
||||
@@ -121,7 +120,7 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
|
||||
A = vcl.Matrix(s if A_trans=='N' else s[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
|
||||
x = vcl.Vector(s[1] if A_trans=='N' else s[0], context=ctx, dtype=datatype)
|
||||
LHS = A if A_trans=='N' else A.T
|
||||
execute(LHS*x, (A_trans,))
|
||||
execute(LHS*x, ())
|
||||
|
||||
if operation=='matrix-product':
|
||||
layouts = map_to_list((str,p['layout']))
|
@@ -1,5 +1,6 @@
|
||||
import random
|
||||
import time
|
||||
import tools
|
||||
import pyviennacl as vcl
|
||||
|
||||
from collections import OrderedDict as odict
|
||||
@@ -25,10 +26,15 @@ class GeneticOperators(object):
|
||||
self.cache = {}
|
||||
|
||||
def init(self):
|
||||
result = [random.choice(L) for L in self.parameters]
|
||||
while self.build_template(self.TemplateType.Parameters(*result)).check(self.statement)!=0:
|
||||
while True:
|
||||
result = [random.choice(L) for L in self.parameters]
|
||||
return result
|
||||
template = self.build_template(self.TemplateType.Parameters(*result))
|
||||
registers_usage = template.registers_usage(vcl.atidlas.StatementsTuple(self.statement))/4
|
||||
lmem_usage = template.lmem_usage(vcl.atidlas.StatementsTuple(self.statement))
|
||||
local_size = template.parameters.local_size_0*template.parameters.local_size_1
|
||||
occupancy_record = tools.OccupancyRecord(self.device, local_size, lmem_usage, registers_usage)
|
||||
if template.check(self.statement) and occupancy_record.occupancy >= 10 :
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def min_to_hyperbol(a, tup):
|
||||
@@ -100,27 +106,35 @@ class GeneticOperators(object):
|
||||
|
||||
def mutate(self, individual, indpb):
|
||||
for i in range(len(individual)):
|
||||
if random.random() < indpb:
|
||||
individual[i] = random.choice(self.parameters[i])
|
||||
if random.random() < indpb:
|
||||
j = self.parameters[i].index(individual[i])
|
||||
j = max(0,min(random.randint(j-1, j+1),len(self.parameters[i])-1))
|
||||
individual[i] = self.parameters[i][j]
|
||||
return individual,
|
||||
|
||||
def evaluate(self, individual):
|
||||
tupindividual = tuple(individual)
|
||||
print tupindividual
|
||||
if tupindividual not in self.cache:
|
||||
template = self.build_template(self.TemplateType.Parameters(*individual))
|
||||
if template.check(self.statement)!=0:
|
||||
self.cache[tupindividual] = 100
|
||||
registers_usage = template.registers_usage(vcl.atidlas.StatementsTuple(self.statement))/4
|
||||
lmem_usage = template.lmem_usage(vcl.atidlas.StatementsTuple(self.statement))
|
||||
local_size = template.parameters.local_size_0*template.parameters.local_size_1
|
||||
occupancy_record = tools.OccupancyRecord(self.device, local_size, lmem_usage, registers_usage)
|
||||
if occupancy_record.occupancy < 10 :
|
||||
self.cache[tupindividual] = 10
|
||||
else:
|
||||
template.execute(self.statement, True)
|
||||
self.statement.result.context.finish_all_queues()
|
||||
N = 0
|
||||
current_time = 0
|
||||
while current_time < 1e-2:
|
||||
time_before = time.time()
|
||||
template.execute(self.statement,False)
|
||||
try:
|
||||
template.execute(self.statement, True)
|
||||
self.statement.result.context.finish_all_queues()
|
||||
current_time += time.time() - time_before
|
||||
N+=1
|
||||
self.cache[tupindividual] = current_time/N
|
||||
N = 0
|
||||
current_time = 0
|
||||
while current_time < 1e-2:
|
||||
time_before = time.time()
|
||||
template.execute(self.statement,False)
|
||||
self.statement.result.context.finish_all_queues()
|
||||
current_time += time.time() - time_before
|
||||
N+=1
|
||||
self.cache[tupindividual] = current_time/N
|
||||
except:
|
||||
self.cache[tupindividual] = 10
|
||||
return self.cache[tupindividual],
|
@@ -2,6 +2,7 @@ import array
|
||||
import numpy as np
|
||||
import random
|
||||
import time
|
||||
import sys
|
||||
|
||||
from deap import algorithms
|
||||
from deap import base
|
||||
@@ -10,11 +11,7 @@ from deap import tools
|
||||
|
||||
from genetic_operators import GeneticOperators
|
||||
|
||||
def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, maxtime,
|
||||
stats=None, halloffame=None, verbose=__debug__):
|
||||
logbook = tools.Logbook()
|
||||
logbook.header = ['gen', 'nevals'] + (stats.fields if stats else [])
|
||||
|
||||
def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, maxtime, maxgen, halloffame, compute_perf, perf_metric):
|
||||
# Evaluate the individuals with an invalid fitness
|
||||
invalid_ind = [ind for ind in population if not ind.fitness.valid]
|
||||
fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
|
||||
@@ -24,17 +21,12 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, maxtime,
|
||||
if halloffame is not None:
|
||||
halloffame.update(population)
|
||||
|
||||
record = stats.compile(population) if stats is not None else {}
|
||||
logbook.record(gen=0, nevals=len(invalid_ind), **record)
|
||||
if verbose:
|
||||
print logbook.stream
|
||||
|
||||
# Begin the generational process
|
||||
gen = 0
|
||||
maxtime = time.strptime(maxtime, '%Mm%Ss')
|
||||
maxtime = maxtime.tm_min*60 + maxtime.tm_sec
|
||||
start_time = time.time()
|
||||
while time.time() - start_time < maxtime:
|
||||
while time.time() - start_time < maxtime and gen < maxgen:
|
||||
# Vary the population
|
||||
offspring = algorithms.varOr(population, toolbox, lambda_, cxpb, mutpb)
|
||||
|
||||
@@ -53,12 +45,12 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, maxtime,
|
||||
|
||||
# Update the statistics with the new population
|
||||
gen = gen + 1
|
||||
record = stats.compile(population) if stats is not None else {}
|
||||
logbook.record(gen=gen, nevals=len(invalid_ind), **record)
|
||||
if verbose:
|
||||
print logbook.stream
|
||||
|
||||
return population, logbook
|
||||
|
||||
best_profile = '(%s)'%','.join(map(str,halloffame[0]));
|
||||
best_performance = compute_perf(halloffame[0].fitness.values[0])
|
||||
sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\n'%(gen, time.time() - start_time, best_performance, perf_metric, best_profile))
|
||||
sys.stdout.write('\n')
|
||||
return population
|
||||
|
||||
def genetic(statement, context, TemplateType, build_template, parameter_names, all_parameters, compute_perf, perf_metric, out):
|
||||
gen = GeneticOperators(context.devices[0], statement, all_parameters, parameter_names, TemplateType, build_template)
|
||||
@@ -70,13 +62,13 @@ def genetic(statement, context, TemplateType, build_template, parameter_names, a
|
||||
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
|
||||
toolbox.decorate("population", gen.repair)
|
||||
toolbox.register("evaluate", gen.evaluate)
|
||||
toolbox.register("mate", tools.cxUniform, indpb=0.3)
|
||||
toolbox.register("mate", tools.cxUniform, indpb=0.1)
|
||||
toolbox.decorate("mate", gen.repair)
|
||||
toolbox.register("mutate", gen.mutate, indpb=0.2)
|
||||
toolbox.register("mutate", gen.mutate, indpb=0.1)
|
||||
toolbox.decorate("mutate", gen.repair)
|
||||
toolbox.register("select", tools.selNSGA2)
|
||||
|
||||
pop = toolbox.population(n=10)
|
||||
pop = toolbox.population(n=70)
|
||||
hof = tools.HallOfFame(1)
|
||||
|
||||
best_performer = lambda x: max([compute_perf(hof[0].fitness.values[0]) for t in x])
|
||||
@@ -89,4 +81,4 @@ def genetic(statement, context, TemplateType, build_template, parameter_names, a
|
||||
stats.register("max (" + perf_metric + ")", lambda x: max([compute_perf(hof[0].fitness.values[0]) for t in x]))
|
||||
stats.register("profile ", lambda x: '(%s)'%','.join(map(str,hof[0])))
|
||||
|
||||
pop, log = eaMuPlusLambda(pop, toolbox, 10, 20, cxpb=0.2, mutpb=0.2, maxtime='5m0s', stats=stats, halloffame=hof, verbose=True)
|
||||
pop = eaMuPlusLambda(pop, toolbox, 70, 100, cxpb=0.1, mutpb=0.1, maxtime='5m0s', maxgen=1000, halloffame=hof, compute_perf=compute_perf, perf_metric=perf_metric)
|
104
autotune/python/tools.py
Normal file
104
autotune/python/tools.py
Normal file
@@ -0,0 +1,104 @@
|
||||
from __future__ import division
|
||||
import pyopencl
|
||||
|
||||
class PhysicalLimits:
    """Hardware limits of a device, selected from its NVIDIA compute capability.

    The constructor queries ``pyopencl.characterize.nv_compute_capability(dev)``
    and fills in the following attributes for compute capability 1.x, 2.x and 3.x:

      compute_capability, threads_per_warp, warps_per_mp, threads_per_mp,
      thread_blocks_per_mp, num_32b_reg_per_mp, reg_alloc_unit_size,
      reg_alloc_granularity ('block' or 'warp'), reg_per_thread,
      shared_mem_per_mp, shared_mem_alloc_unit_size, warp_alloc_granularity,
      max_thread_block_size

    Raises:
        Exception: if the compute capability is unavailable or its major
            version is not 1, 2 or 3.
    """

    def __init__(self, dev):
        self.compute_capability = pyopencl.characterize.nv_compute_capability(dev)

        # nv_compute_capability() gives None when the device does not expose a
        # compute capability; report that the same way as an unknown one
        # instead of failing on None[0].
        if self.compute_capability is None:
            raise Exception('Compute capability not supported!')

        major = self.compute_capability[0]
        minor = self.compute_capability[1]

        if major == 1:
            if minor <= 1:
                self.warps_per_mp = 24
                self.threads_per_mp = 768
                self.num_32b_reg_per_mp = 8192
                self.reg_alloc_unit_size = 256
            else:
                self.warps_per_mp = 32
                self.threads_per_mp = 1024
                self.num_32b_reg_per_mp = 16384
                self.reg_alloc_unit_size = 512
            # Shared by every 1.x device
            self.threads_per_warp = 32
            self.thread_blocks_per_mp = 8
            self.reg_alloc_granularity = 'block'
            self.reg_per_thread = 124
            self.shared_mem_per_mp = 16384
            self.shared_mem_alloc_unit_size = 512
            self.warp_alloc_granularity = 2
            self.max_thread_block_size = 512

        elif major == 2:
            self.threads_per_warp = 32
            self.warps_per_mp = 48
            self.threads_per_mp = 1536
            self.thread_blocks_per_mp = 8
            self.num_32b_reg_per_mp = 32768
            self.reg_alloc_unit_size = 64
            self.reg_alloc_granularity = 'warp'
            self.reg_per_thread = 63
            self.shared_mem_per_mp = 49152
            self.shared_mem_alloc_unit_size = 128
            self.warp_alloc_granularity = 2
            self.max_thread_block_size = 1024

        elif major == 3:
            self.threads_per_warp = 32
            self.warps_per_mp = 64
            self.threads_per_mp = 2048
            self.thread_blocks_per_mp = 16
            self.num_32b_reg_per_mp = 65536
            self.reg_alloc_unit_size = 256
            self.reg_alloc_granularity = 'warp'
            # Only minor version 5 gets the larger per-thread register budget
            if minor == 5:
                self.reg_per_thread = 255
            else:
                self.reg_per_thread = 63
            self.shared_mem_per_mp = 49152
            self.shared_mem_alloc_unit_size = 256
            self.warp_alloc_granularity = 4
            self.max_thread_block_size = 1024

        else:
            raise Exception('Compute capability not supported!')
|
||||
|
||||
def _int_floor(value, multiple_of=1):
|
||||
"""Round C{value} down to be a C{multiple_of} something."""
|
||||
# Mimicks the Excel "floor" function (for code stolen from occupancy calculator)
|
||||
|
||||
from math import floor
|
||||
return int(floor(value/multiple_of))*multiple_of
|
||||
|
||||
def _int_ceiling(value, multiple_of=1):
|
||||
"""Round C{value} up to be a C{multiple_of} something."""
|
||||
# Mimicks the Excel "floor" function (for code stolen from occupancy calculator)
|
||||
|
||||
from math import ceil
|
||||
return int(ceil(value/multiple_of))*multiple_of
|
||||
|
||||
class OccupancyRecord:
    """Occupancy estimate for one kernel configuration on a device.

    Args:
        dev: device handed to C{PhysicalLimits} to obtain the hardware limits.
        threads: number of threads per block.
        shared_mem: shared memory per block; skipped as a limit when 0.
        registers: registers per thread; skipped as a limit when 0.

    Attributes:
        limit: smallest block count allowed by any of the considered resources.
        limited_by: name of that resource ('warps', 'registers' or 'shared memory').
        warps_per_mp: C{limit} multiplied by the warps allocated per block.
        occupancy: C{100*warps_per_mp / physical_limits.warps_per_mp}.
    """

    def __init__(self, dev, threads, shared_mem=0, registers=0):
        physical_limits = PhysicalLimits(dev)

        # Candidate (block count, resource name) pairs; the minimum wins.
        limits = []

        # --- warps ---
        allocated_warps = _int_ceiling(threads/physical_limits.threads_per_warp)
        max_warps_per_mp = physical_limits.warps_per_mp
        blocks_by_warps = min(physical_limits.thread_blocks_per_mp,
                              _int_floor(max_warps_per_mp/allocated_warps))
        limits.append((blocks_by_warps, 'warps'))

        # --- registers ---
        if registers > 0:
            if registers > physical_limits.reg_per_thread:
                # More registers per thread than the device allows: nothing fits.
                limits.append((0, 'registers'))
            else:
                granularity = physical_limits.reg_alloc_granularity
                if granularity == 'warp':
                    allocated_regs = allocated_warps
                    regs_per_warp = _int_ceiling(registers*physical_limits.threads_per_warp,
                                                 physical_limits.reg_alloc_unit_size)
                    max_reg_per_mp = _int_floor(physical_limits.num_32b_reg_per_mp/regs_per_warp,
                                                physical_limits.warp_alloc_granularity)
                elif granularity == 'block':
                    # NOTE(review): the rounding multiple here is allocated_warps; the
                    # occupancy-calculator formula this appears to be ported from may
                    # round to reg_alloc_unit_size instead -- confirm before changing.
                    padded_warps = _int_ceiling(allocated_warps, physical_limits.warp_alloc_granularity)
                    allocated_regs = _int_ceiling(padded_warps*registers*physical_limits.threads_per_warp,
                                                  allocated_warps)
                    max_reg_per_mp = physical_limits.num_32b_reg_per_mp
                else:
                    # Same failure as an unknown key in a lookup table.
                    raise KeyError(granularity)
                limits.append((_int_floor(max_reg_per_mp/allocated_regs), 'registers'))

        # --- shared memory ---
        if shared_mem > 0:
            allocated_shared_mem = _int_ceiling(shared_mem, physical_limits.shared_mem_alloc_unit_size)
            max_shared_mem_per_mp = physical_limits.shared_mem_per_mp
            limits.append((_int_floor(max_shared_mem_per_mp/allocated_shared_mem), 'shared memory'))

        self.limit, self.limited_by = min(limits)
        self.warps_per_mp = self.limit*allocated_warps
        self.occupancy = 100*self.warps_per_mp/physical_limits.warps_per_mp
|
||||
|
||||
|
||||
|
@@ -72,7 +72,8 @@ int test_all_layouts(int CM, int CN, RefCType & cC, int AM, int AK, RefAType & c
|
||||
typename matrix_maker<RefBType, viennacl::column_major>::result_type Bcol = matrix_maker<RefBType, viennacl::column_major>::make(BcolTmp, cB);
|
||||
typename matrix_maker<RefBType, viennacl::column_major>::result_type BTcol = matrix_maker<RefBType, viennacl::column_major>::make(BTcolTmp, cBT);
|
||||
|
||||
atidlas::matrix_product_parameters parameters_local(1, 8, 16, 16, 4, 2, 6, atidlas::FETCH_FROM_LOCAL, atidlas::FETCH_FROM_LOCAL, 16, 8);
|
||||
|
||||
atidlas::matrix_product_parameters parameters_local(1, 8, 32, 16, 32, 32, 4, atidlas::FETCH_FROM_LOCAL, atidlas::FETCH_FROM_GLOBAL_CONTIGUOUS, 32, 4);
|
||||
atidlas::matrix_product_parameters parameters_global_contiguous(1, 8, 16, 16, 4, 2, 6, atidlas::FETCH_FROM_GLOBAL_CONTIGUOUS, atidlas::FETCH_FROM_GLOBAL_CONTIGUOUS, 0, 0);
|
||||
atidlas::matrix_product_parameters parameters_global_strided(1, 8, 16, 16, 4, 2, 6, atidlas::FETCH_FROM_GLOBAL_STRIDED, atidlas::FETCH_FROM_GLOBAL_STRIDED, 0, 0);
|
||||
|
||||
|
Reference in New Issue
Block a user