Some improvements

This commit is contained in:
Philippe Tillet
2014-09-06 00:39:38 -04:00
parent 544583e6ca
commit 2055a8cc59
12 changed files with 215 additions and 84 deletions

View File

@@ -32,6 +32,8 @@ private:
{ {
if (p_.simd_width>1) if (p_.simd_width>1)
return TEMPLATE_INVALID_SIMD_WIDTH; return TEMPLATE_INVALID_SIMD_WIDTH;
if(p_.fetching_policy==FETCH_FROM_LOCAL)
return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
return TEMPLATE_VALID; return TEMPLATE_VALID;
} }

View File

@@ -47,14 +47,26 @@ class matrix_product_template : public template_base_impl<matrix_product_templat
{ {
private: private:
unsigned int n_lmem_elements() const unsigned int lmem_usage(statements_container const & statements) const
{ {
viennacl::scheduler::statement const & statement = statements.data().front();
viennacl::scheduler::statement_node_numeric_type numeric_type = lhs_most(statement.array(), statement.root()).lhs.numeric_type;
unsigned int N = 0; unsigned int N = 0;
if (p_.A_fetching_policy==FETCH_FROM_LOCAL) if (p_.A_fetching_policy==FETCH_FROM_LOCAL)
N += p_.kL * (p_.mL+1); N += p_.kL * (p_.mL+1);
if (p_.B_fetching_policy==FETCH_FROM_LOCAL) if (p_.B_fetching_policy==FETCH_FROM_LOCAL)
N += p_.nL * (p_.kL+1); N += p_.nL * (p_.kL+1);
return N; return N*tools::size_of(numeric_type);
}
unsigned int registers_usage(statements_container const & statements) const
{
viennacl::scheduler::statement const & statement = statements.data().front();
viennacl::scheduler::statement_node_numeric_type numeric_type = lhs_most(statement.array(), statement.root()).lhs.numeric_type;
unsigned int N = p_.mS * p_.nS + p_.mS * p_.kS + p_.kS * p_.nS;
return N*tools::size_of(numeric_type);
} }
int check_invalid_impl(viennacl::ocl::device const &, statements_container const &) const int check_invalid_impl(viennacl::ocl::device const &, statements_container const &) const

View File

@@ -27,9 +27,17 @@ class reduction_template : public template_base_impl<reduction_template, reducti
{ {
private: private:
unsigned int n_lmem_elements() const
unsigned int num_lmem_elements(statements_container const & statements) const
{ {
return p_.local_size_0; unsigned int res = 0;
for(statements_container::data_type::const_iterator it = statements.data().begin() ; it != statements.data().end() ; ++it)
{
viennacl::scheduler::statement const & statement = statements.data().front();
viennacl::scheduler::statement_node_numeric_type numeric_type = lhs_most(statement.array(), statement.root()).lhs.numeric_type;
res += p_.local_size_0*tools::size_of(numeric_type);
}
return res;
} }
int check_invalid_impl(viennacl::ocl::device const &, statements_container const & statements) const int check_invalid_impl(viennacl::ocl::device const &, statements_container const & statements) const

View File

@@ -34,7 +34,7 @@ private:
return TEMPLATE_VALID; return TEMPLATE_VALID;
} }
unsigned int n_lmem_elements() const unsigned int lmem_usage() const
{ {
return p_.local_size_0*(p_.local_size_1+1); return p_.local_size_0*(p_.local_size_1+1);
} }

View File

@@ -486,6 +486,10 @@ private:
public: public:
template_base(binding_policy_t binding_policy) : binding_policy_(binding_policy) {} template_base(binding_policy_t binding_policy) : binding_policy_(binding_policy) {}
virtual unsigned int lmem_usage(statements_container const &) const { return 0; }
virtual unsigned int registers_usage(statements_container const &) const { return 0; }
virtual ~template_base(){ } virtual ~template_base(){ }
std::vector<std::string> generate(std::string const & kernel_prefix, statements_container const & statements, viennacl::ocl::device const & device) std::vector<std::string> generate(std::string const & kernel_prefix, statements_container const & statements, viennacl::ocl::device const & device)
@@ -521,10 +525,8 @@ class template_base_impl : public template_base
{ {
private: private:
virtual int check_invalid_impl(viennacl::ocl::device const &, statements_container const &) const { return TEMPLATE_VALID; } virtual int check_invalid_impl(viennacl::ocl::device const &, statements_container const &) const { return TEMPLATE_VALID; }
virtual unsigned int n_lmem_elements() const { return 0; }
protected: protected:
bool has_misaligned_offset(statements_container const & statements) bool has_misaligned_offset(statements_container const & statements)
{ {
for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it) for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it)
@@ -565,13 +567,10 @@ public:
{ {
using namespace viennacl::tools; using namespace viennacl::tools;
viennacl::scheduler::statement const & statement = statements.data().front();
unsigned int scalartype_size = tools::size_of(lhs_most(statement.array(), statement.root()).lhs.numeric_type);
//Query device informations //Query device informations
size_t lmem_available = static_cast<size_t>(device.local_mem_size()); size_t lmem_available = static_cast<size_t>(device.local_mem_size());
size_t lmem_usage = scalartype_size*n_lmem_elements(); size_t lmem_used = lmem_usage(statements);
if (lmem_usage>lmem_available) if (lmem_used>lmem_available)
return TEMPLATE_LOCAL_MEMORY_OVERFLOW; return TEMPLATE_LOCAL_MEMORY_OVERFLOW;
//Invalid work group size //Invalid work group size

View File

@@ -18,30 +18,32 @@ import optimize
import sys import sys
DATATYPES = { 'single' : vcl.float32, DATATYPES = { 'single' : vcl.float32,
'double' : vcl.float64 'double' : vcl.float64 }
}
TYPES = { 'vector-axpy': vcl.atidlas.VectorAxpyTemplate, TYPES = { 'vector-axpy': {'template':vcl.atidlas.VectorAxpyTemplate,
'matrix-axpy': vcl.atidlas.MatrixAxpyTemplate, 'parameter-names':['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
'reduction': vcl.atidlas.ReductionTemplate, 'perf-index':lambda x: 3*x[0]*x[1][0]/x[2]*1e-9,
'row-wise-reduction': vcl.atidlas.RowWiseReductionTemplate, 'perf-measure':'GB/s'},
'matrix-product': vcl.atidlas.MatrixProductTemplate
} 'matrix-axpy': {'template':vcl.atidlas.MatrixAxpyTemplate,
'parameter-names':['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'num-groups-1', 'fetch'],
PNAMES = { 'perf-index':lambda x: 3*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
'vector-axpy': ['simd-width', 'local-size-0', 'num-groups-0', 'fetch'], 'perf-measure':'GB/s'},
'matrix-axpy': ['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'num-groups-1', 'fetch'],
'reduction': ['simd-width', 'local-size-0', 'num-groups-0', 'fetch'], 'reduction': {'template':vcl.atidlas.ReductionTemplate,
'row-wise-reduction': ['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'fetch'], 'parameter-names':['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
'matrix-product': ['simd-width', 'local-size-0', 'kL', 'local-size-1', 'mS', 'kS', 'nS', 'A-fetch-policy', 'B-fetch-policy', 'local-fetch-size-0', 'local-fetch-size-1'] 'perf-index':lambda x: 2*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
} 'perf-measure':'GB/s'},
PERFINDEX = { 'vector-axpy' : (lambda x: 3*x[0]*x[1][0]/x[2]*1e-9, 'GB/s') , 'row-wise-reduction': {'template':vcl.atidlas.RowWiseReductionTemplate,
'matrix-axpy' : (lambda x: 3*x[0]*x[1][0]*x[1][1]/x[2]*1e-9, 'GB/s'), 'parameter-names':['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'fetch'],
'reduction' : (lambda x: 2*x[0]*x[1][0]*x[1][1]/x[2]*1e-9, 'GB/s'), 'perf-index':lambda x: x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
'row-wise-reduction' : (lambda x: x[0]*x[1][0]*x[1][1]/x[2]*1e-9, 'GB/s'), 'perf-measure':'GB/s'},
'matrix-product': (lambda x: 2*x[1][0]*x[1][1]*x[1][2]/x[2]*1e-9, 'GFLOPs/s')
} 'matrix-product': {'template':vcl.atidlas.MatrixProductTemplate,
'parameter-names':['simd-width', 'local-size-0', 'kL', 'local-size-1', 'mS', 'kS', 'nS', 'A-fetch-policy', 'B-fetch-policy', 'local-fetch-size-0', 'local-fetch-size-1'],
'perf-index': lambda x: 2*x[1][0]*x[1][1]*x[1][2]/x[2]*1e-9,
'perf-measure': 'GFLOP/s'} }
def parameter_space(operation): def parameter_space(operation):
simd = [1, 2, 4, 8] simd = [1, 2, 4, 8]
@@ -54,25 +56,22 @@ def parameter_space(operation):
if operation == 'reduction': return [simd, pow2_1D, pow2_1D, fetch] if operation == 'reduction': return [simd, pow2_1D, pow2_1D, fetch]
if operation == 'matrix-axpy': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D, fetch] if operation == 'matrix-axpy': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D, fetch]
if operation == 'row-wise-reduction': return [simd, pow2_2D, pow2_2D, pow2_1D, fetch] if operation == 'row-wise-reduction': return [simd, pow2_2D, pow2_2D, pow2_1D, fetch]
if operation == 'matrix-product': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D_unrolled, pow2_2D_unrolled, pow2_2D_unrolled, fetch, fetch, pow2_2D, pow2_2D] if operation == 'matrix-product': return [simd, pow2_2D, pow2_2D, pow2_2D, pow2_2D_unrolled, pow2_2D_unrolled, pow2_2D_unrolled, fetch, fetch, [0] + pow2_2D, [0] + pow2_2D]
def do_tuning(config_fname, spec_fname, viennacl_root): def do_tuning(config_fname, spec_fname, viennacl_root):
config = ConfigObj(config_fname, configspec=spec_fname) config = ConfigObj(config_fname, configspec=spec_fname)
map_to_list = lambda T: list(map(T[0], T[1] if isinstance(T[1], list) else [T[1]])) map_to_list = lambda T: list(map(T[0], T[1] if isinstance(T[1], list) else [T[1]]))
for operation in ['vector-axpy', 'matrix-axpy', 'row-wise-reduction', 'matrix-product']: for operation in ['vector-axpy', 'matrix-axpy', 'row-wise-reduction', 'matrix-product']:
tmp_folder = config['tmp-folder'] if 'tmp-folder' in config else "" tmp_folder = config['tmp-folder'] if 'tmp-folder' in config else ""
if operation in config: if operation in config:
p = config[operation] p = config[operation]
confdevices = p['devices'] confdevices = p['devices']
devices = utils.DEVICES_PRESETS[confdevices] if confdevices in utils.DEVICES_PRESETS else [utils.all_devices[int(i)] for i in confdevices] devices = utils.DEVICES_PRESETS[confdevices] if confdevices in utils.DEVICES_PRESETS else [utils.all_devices[int(i)] for i in confdevices]
precisions = ['single', 'double'] if 'all' in p['precision'] else p['precision'] precisions = ['single', 'double'] if 'all' in p['precision'] else p['precision']
datatypes = [DATATYPES[k] for k in precisions] datatypes = [DATATYPES[k] for k in precisions]
s = map_to_list((int, p['size'])) s = map_to_list((int, p['size']))
@@ -98,8 +97,8 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
fname = os.devnull fname = os.devnull
with open(fname, "w+") as archive: with open(fname, "w+") as archive:
with vcl.Statement(node) as statement: with vcl.Statement(node) as statement:
result = optimize.genetic(statement, ctx, TYPES[operation], lambda p: TYPES[operation](p, *other_params), result = optimize.genetic(statement, ctx, TYPES[operation]['template'], lambda p: TYPES[operation]['template'](p, *other_params),
PNAMES[operation], parameter_space(operation), lambda t: PERFINDEX[operation][0]([datatype().itemsize, s, t]), PERFINDEX[operation][1], archive) TYPES[operation]['parameter-names'], parameter_space(operation), lambda t: TYPES[operation]['perf-index']([datatype().itemsize, s, t]), TYPES[operation]['perf-measure'], archive)
if result and viennacl_root: if result and viennacl_root:
vclio.generate_viennacl_headers(viennacl_root, device, datatype, operation, other_params, result[1]) vclio.generate_viennacl_headers(viennacl_root, device, datatype, operation, other_params, result[1])
@@ -121,7 +120,7 @@ def do_tuning(config_fname, spec_fname, viennacl_root):
A = vcl.Matrix(s if A_trans=='N' else s[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR) A = vcl.Matrix(s if A_trans=='N' else s[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
x = vcl.Vector(s[1] if A_trans=='N' else s[0], context=ctx, dtype=datatype) x = vcl.Vector(s[1] if A_trans=='N' else s[0], context=ctx, dtype=datatype)
LHS = A if A_trans=='N' else A.T LHS = A if A_trans=='N' else A.T
execute(LHS*x, (A_trans,)) execute(LHS*x, ())
if operation=='matrix-product': if operation=='matrix-product':
layouts = map_to_list((str,p['layout'])) layouts = map_to_list((str,p['layout']))

View File

@@ -1,5 +1,6 @@
import random import random
import time import time
import tools
import pyviennacl as vcl import pyviennacl as vcl
from collections import OrderedDict as odict from collections import OrderedDict as odict
@@ -25,10 +26,15 @@ class GeneticOperators(object):
self.cache = {} self.cache = {}
def init(self): def init(self):
result = [random.choice(L) for L in self.parameters] while True:
while self.build_template(self.TemplateType.Parameters(*result)).check(self.statement)!=0:
result = [random.choice(L) for L in self.parameters] result = [random.choice(L) for L in self.parameters]
return result template = self.build_template(self.TemplateType.Parameters(*result))
registers_usage = template.registers_usage(vcl.atidlas.StatementsTuple(self.statement))/4
lmem_usage = template.lmem_usage(vcl.atidlas.StatementsTuple(self.statement))
local_size = template.parameters.local_size_0*template.parameters.local_size_1
occupancy_record = tools.OccupancyRecord(self.device, local_size, lmem_usage, registers_usage)
if template.check(self.statement) and occupancy_record.occupancy >= 10 :
return result
@staticmethod @staticmethod
def min_to_hyperbol(a, tup): def min_to_hyperbol(a, tup):
@@ -100,27 +106,35 @@ class GeneticOperators(object):
def mutate(self, individual, indpb): def mutate(self, individual, indpb):
for i in range(len(individual)): for i in range(len(individual)):
if random.random() < indpb: if random.random() < indpb:
individual[i] = random.choice(self.parameters[i]) j = self.parameters[i].index(individual[i])
j = max(0,min(random.randint(j-1, j+1),len(self.parameters[i])-1))
individual[i] = self.parameters[i][j]
return individual, return individual,
def evaluate(self, individual): def evaluate(self, individual):
tupindividual = tuple(individual) tupindividual = tuple(individual)
print tupindividual
if tupindividual not in self.cache: if tupindividual not in self.cache:
template = self.build_template(self.TemplateType.Parameters(*individual)) template = self.build_template(self.TemplateType.Parameters(*individual))
if template.check(self.statement)!=0: registers_usage = template.registers_usage(vcl.atidlas.StatementsTuple(self.statement))/4
self.cache[tupindividual] = 100 lmem_usage = template.lmem_usage(vcl.atidlas.StatementsTuple(self.statement))
local_size = template.parameters.local_size_0*template.parameters.local_size_1
occupancy_record = tools.OccupancyRecord(self.device, local_size, lmem_usage, registers_usage)
if occupancy_record.occupancy < 10 :
self.cache[tupindividual] = 10
else: else:
template.execute(self.statement, True) try:
self.statement.result.context.finish_all_queues() template.execute(self.statement, True)
N = 0
current_time = 0
while current_time < 1e-2:
time_before = time.time()
template.execute(self.statement,False)
self.statement.result.context.finish_all_queues() self.statement.result.context.finish_all_queues()
current_time += time.time() - time_before N = 0
N+=1 current_time = 0
self.cache[tupindividual] = current_time/N while current_time < 1e-2:
time_before = time.time()
template.execute(self.statement,False)
self.statement.result.context.finish_all_queues()
current_time += time.time() - time_before
N+=1
self.cache[tupindividual] = current_time/N
except:
self.cache[tupindividual] = 10
return self.cache[tupindividual], return self.cache[tupindividual],

View File

@@ -2,6 +2,7 @@ import array
import numpy as np import numpy as np
import random import random
import time import time
import sys
from deap import algorithms from deap import algorithms
from deap import base from deap import base
@@ -10,11 +11,7 @@ from deap import tools
from genetic_operators import GeneticOperators from genetic_operators import GeneticOperators
def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, maxtime, def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, maxtime, maxgen, halloffame, compute_perf, perf_metric):
stats=None, halloffame=None, verbose=__debug__):
logbook = tools.Logbook()
logbook.header = ['gen', 'nevals'] + (stats.fields if stats else [])
# Evaluate the individuals with an invalid fitness # Evaluate the individuals with an invalid fitness
invalid_ind = [ind for ind in population if not ind.fitness.valid] invalid_ind = [ind for ind in population if not ind.fitness.valid]
fitnesses = toolbox.map(toolbox.evaluate, invalid_ind) fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
@@ -24,17 +21,12 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, maxtime,
if halloffame is not None: if halloffame is not None:
halloffame.update(population) halloffame.update(population)
record = stats.compile(population) if stats is not None else {}
logbook.record(gen=0, nevals=len(invalid_ind), **record)
if verbose:
print logbook.stream
# Begin the generational process # Begin the generational process
gen = 0 gen = 0
maxtime = time.strptime(maxtime, '%Mm%Ss') maxtime = time.strptime(maxtime, '%Mm%Ss')
maxtime = maxtime.tm_min*60 + maxtime.tm_sec maxtime = maxtime.tm_min*60 + maxtime.tm_sec
start_time = time.time() start_time = time.time()
while time.time() - start_time < maxtime: while time.time() - start_time < maxtime and gen < maxgen:
# Vary the population # Vary the population
offspring = algorithms.varOr(population, toolbox, lambda_, cxpb, mutpb) offspring = algorithms.varOr(population, toolbox, lambda_, cxpb, mutpb)
@@ -53,12 +45,12 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, maxtime,
# Update the statistics with the new population # Update the statistics with the new population
gen = gen + 1 gen = gen + 1
record = stats.compile(population) if stats is not None else {}
logbook.record(gen=gen, nevals=len(invalid_ind), **record) best_profile = '(%s)'%','.join(map(str,halloffame[0]));
if verbose: best_performance = compute_perf(halloffame[0].fitness.values[0])
print logbook.stream sys.stdout.write('Generation %d | Time %d | Best %d %s [ for %s ]\n'%(gen, time.time() - start_time, best_performance, perf_metric, best_profile))
sys.stdout.write('\n')
return population, logbook return population
def genetic(statement, context, TemplateType, build_template, parameter_names, all_parameters, compute_perf, perf_metric, out): def genetic(statement, context, TemplateType, build_template, parameter_names, all_parameters, compute_perf, perf_metric, out):
gen = GeneticOperators(context.devices[0], statement, all_parameters, parameter_names, TemplateType, build_template) gen = GeneticOperators(context.devices[0], statement, all_parameters, parameter_names, TemplateType, build_template)
@@ -70,13 +62,13 @@ def genetic(statement, context, TemplateType, build_template, parameter_names, a
toolbox.register("population", tools.initRepeat, list, toolbox.individual) toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.decorate("population", gen.repair) toolbox.decorate("population", gen.repair)
toolbox.register("evaluate", gen.evaluate) toolbox.register("evaluate", gen.evaluate)
toolbox.register("mate", tools.cxUniform, indpb=0.3) toolbox.register("mate", tools.cxUniform, indpb=0.1)
toolbox.decorate("mate", gen.repair) toolbox.decorate("mate", gen.repair)
toolbox.register("mutate", gen.mutate, indpb=0.2) toolbox.register("mutate", gen.mutate, indpb=0.1)
toolbox.decorate("mutate", gen.repair) toolbox.decorate("mutate", gen.repair)
toolbox.register("select", tools.selNSGA2) toolbox.register("select", tools.selNSGA2)
pop = toolbox.population(n=10) pop = toolbox.population(n=70)
hof = tools.HallOfFame(1) hof = tools.HallOfFame(1)
best_performer = lambda x: max([compute_perf(hof[0].fitness.values[0]) for t in x]) best_performer = lambda x: max([compute_perf(hof[0].fitness.values[0]) for t in x])
@@ -89,4 +81,4 @@ def genetic(statement, context, TemplateType, build_template, parameter_names, a
stats.register("max (" + perf_metric + ")", lambda x: max([compute_perf(hof[0].fitness.values[0]) for t in x])) stats.register("max (" + perf_metric + ")", lambda x: max([compute_perf(hof[0].fitness.values[0]) for t in x]))
stats.register("profile ", lambda x: '(%s)'%','.join(map(str,hof[0]))) stats.register("profile ", lambda x: '(%s)'%','.join(map(str,hof[0])))
pop, log = eaMuPlusLambda(pop, toolbox, 10, 20, cxpb=0.2, mutpb=0.2, maxtime='5m0s', stats=stats, halloffame=hof, verbose=True) pop = eaMuPlusLambda(pop, toolbox, 70, 100, cxpb=0.1, mutpb=0.1, maxtime='5m0s', maxgen=1000, halloffame=hof, compute_perf=compute_perf, perf_metric=perf_metric)

104
autotune/python/tools.py Normal file
View File

@@ -0,0 +1,104 @@
from __future__ import division
import pyopencl
class PhysicalLimits:
    """Per-multiprocessor resource limits of an NVIDIA device.

    The limits are selected from the device's compute capability and stored
    as plain attributes (warps, threads, registers and shared memory per
    multiprocessor, plus their allocation granularities). Compute
    capabilities 1.x, 2.x and 3.x are covered.

    @param dev: device handle forwarded to
        C{pyopencl.characterize.nv_compute_capability}.
    @raise Exception: if the compute capability is unavailable or is not one
        of the supported major versions.
    """

    def __init__(self, dev):
        self.compute_capability = pyopencl.characterize.nv_compute_capability(dev)

        # nv_compute_capability() is documented to return None when the device
        # does not expose the NVIDIA-specific attributes. Subscripting None
        # would raise an unrelated TypeError, so report it the same way as any
        # other unsupported device.
        if self.compute_capability is None:
            raise Exception('Compute capability not supported!')

        major = self.compute_capability[0]
        minor = self.compute_capability[1]

        if major == 1:
            # 1.0/1.1 parts have a smaller register file and warp budget
            # than 1.2/1.3 parts; everything else is shared by the family.
            if minor <= 1:
                self.warps_per_mp = 24
                self.threads_per_mp = 768
                self.num_32b_reg_per_mp = 8192
                self.reg_alloc_unit_size = 256
            else:
                self.warps_per_mp = 32
                self.threads_per_mp = 1024
                self.num_32b_reg_per_mp = 16384
                self.reg_alloc_unit_size = 512
            self.threads_per_warp = 32
            self.thread_blocks_per_mp = 8
            self.reg_alloc_granularity = 'block'
            self.reg_per_thread = 124
            self.shared_mem_per_mp = 16384
            self.shared_mem_alloc_unit_size = 512
            self.warp_alloc_granularity = 2
            self.max_thread_block_size = 512
        elif major == 2:
            self.threads_per_warp = 32
            self.warps_per_mp = 48
            self.threads_per_mp = 1536
            self.thread_blocks_per_mp = 8
            self.num_32b_reg_per_mp = 32768
            self.reg_alloc_unit_size = 64
            self.reg_alloc_granularity = 'warp'
            self.reg_per_thread = 63
            self.shared_mem_per_mp = 49152
            self.shared_mem_alloc_unit_size = 128
            self.warp_alloc_granularity = 2
            self.max_thread_block_size = 1024
        elif major == 3:
            self.threads_per_warp = 32
            self.warps_per_mp = 64
            self.threads_per_mp = 2048
            self.thread_blocks_per_mp = 16
            self.num_32b_reg_per_mp = 65536
            self.reg_alloc_unit_size = 256
            self.reg_alloc_granularity = 'warp'
            # Only compute capability 3.5 gets the larger per-thread budget.
            if minor == 5:
                self.reg_per_thread = 255
            else:
                self.reg_per_thread = 63
            self.shared_mem_per_mp = 49152
            self.shared_mem_alloc_unit_size = 256
            self.warp_alloc_granularity = 4
            self.max_thread_block_size = 1024
        else:
            raise Exception('Compute capability not supported!')
def _int_floor(value, multiple_of=1):
    """Return the largest multiple of C{multiple_of} not exceeding C{value}."""
    # Same semantics as the spreadsheet FLOOR function used by the occupancy
    # calculator this module was derived from.
    from math import floor
    whole_units = int(floor(value / multiple_of))
    return whole_units * multiple_of
def _int_ceiling(value, multiple_of=1):
    """Return the smallest multiple of C{multiple_of} not below C{value}."""
    # Same semantics as the spreadsheet CEILING function used by the occupancy
    # calculator this module was derived from.
    from math import ceil
    whole_units = int(ceil(value / multiple_of))
    return whole_units * multiple_of
class OccupancyRecord:
    """Estimate multiprocessor occupancy for one kernel launch configuration.

    After construction the instance exposes:
      - C{limit}: number of thread blocks that fit on one multiprocessor,
      - C{limited_by}: which resource sets that limit ('warps', 'registers'
        or 'shared memory'),
      - C{warps_per_mp}: resident warps at that limit,
      - C{occupancy}: C{warps_per_mp} as a percentage of the device maximum.

    @param dev: device handle forwarded to C{PhysicalLimits}.
    @param threads: work-group (thread block) size.
    @param shared_mem: local/shared memory used per block; 0 skips the
        shared-memory limit. (Presumably bytes, matching
        C{shared_mem_per_mp} -- confirm against callers.)
    @param registers: 32-bit registers per thread; 0 skips the register limit.
    """

    def __init__(self, dev, threads, shared_mem=0, registers=0):
        physical_limits = PhysicalLimits(dev)
        limits = []

        # A launch occupies at least one warp. Without the clamp a zero
        # thread count yields zero warps and a ZeroDivisionError below.
        allocated_warps = max(1, _int_ceiling(threads/physical_limits.threads_per_warp))
        max_warps_per_mp = physical_limits.warps_per_mp
        blocks_by_warps = _int_floor(max_warps_per_mp/allocated_warps)
        limits.append((min(physical_limits.thread_blocks_per_mp, blocks_by_warps), 'warps'))

        if registers > 0:
            granularity = physical_limits.reg_alloc_granularity
            if registers > physical_limits.reg_per_thread:
                # The kernel cannot be scheduled at all with this many registers.
                limits.append((0, 'registers'))
            else:
                if granularity == 'warp':
                    allocated_regs = allocated_warps
                    regs_per_warp = _int_ceiling(registers*physical_limits.threads_per_warp,
                                                 physical_limits.reg_alloc_unit_size)
                    max_reg_per_mp = _int_floor(physical_limits.num_32b_reg_per_mp/regs_per_warp,
                                                physical_limits.warp_alloc_granularity)
                elif granularity == 'block':
                    padded_warps = _int_ceiling(allocated_warps, physical_limits.warp_alloc_granularity)
                    allocated_regs = _int_ceiling(padded_warps*registers*physical_limits.threads_per_warp,
                                                  allocated_warps)
                    max_reg_per_mp = physical_limits.num_32b_reg_per_mp
                else:
                    # Same failure mode as the former dictionary lookup.
                    raise KeyError(granularity)
                limits.append((_int_floor(max_reg_per_mp/allocated_regs), 'registers'))

        if shared_mem > 0:
            allocated_shared_mem = _int_ceiling(shared_mem, physical_limits.shared_mem_alloc_unit_size)
            max_shared_mem_per_mp = physical_limits.shared_mem_per_mp
            limits.append((_int_floor(max_shared_mem_per_mp/allocated_shared_mem), 'shared memory'))

        # Tuples compare by block count first, so min() picks the tightest limit.
        self.limit, self.limited_by = min(limits)
        self.warps_per_mp = self.limit*allocated_warps
        self.occupancy = 100*self.warps_per_mp/physical_limits.warps_per_mp

View File

@@ -72,7 +72,8 @@ int test_all_layouts(int CM, int CN, RefCType & cC, int AM, int AK, RefAType & c
typename matrix_maker<RefBType, viennacl::column_major>::result_type Bcol = matrix_maker<RefBType, viennacl::column_major>::make(BcolTmp, cB); typename matrix_maker<RefBType, viennacl::column_major>::result_type Bcol = matrix_maker<RefBType, viennacl::column_major>::make(BcolTmp, cB);
typename matrix_maker<RefBType, viennacl::column_major>::result_type BTcol = matrix_maker<RefBType, viennacl::column_major>::make(BTcolTmp, cBT); typename matrix_maker<RefBType, viennacl::column_major>::result_type BTcol = matrix_maker<RefBType, viennacl::column_major>::make(BTcolTmp, cBT);
atidlas::matrix_product_parameters parameters_local(1, 8, 16, 16, 4, 2, 6, atidlas::FETCH_FROM_LOCAL, atidlas::FETCH_FROM_LOCAL, 16, 8);
atidlas::matrix_product_parameters parameters_local(1, 8, 32, 16, 32, 32, 4, atidlas::FETCH_FROM_LOCAL, atidlas::FETCH_FROM_GLOBAL_CONTIGUOUS, 32, 4);
atidlas::matrix_product_parameters parameters_global_contiguous(1, 8, 16, 16, 4, 2, 6, atidlas::FETCH_FROM_GLOBAL_CONTIGUOUS, atidlas::FETCH_FROM_GLOBAL_CONTIGUOUS, 0, 0); atidlas::matrix_product_parameters parameters_global_contiguous(1, 8, 16, 16, 4, 2, 6, atidlas::FETCH_FROM_GLOBAL_CONTIGUOUS, atidlas::FETCH_FROM_GLOBAL_CONTIGUOUS, 0, 0);
atidlas::matrix_product_parameters parameters_global_strided(1, 8, 16, 16, 4, 2, 6, atidlas::FETCH_FROM_GLOBAL_STRIDED, atidlas::FETCH_FROM_GLOBAL_STRIDED, 0, 0); atidlas::matrix_product_parameters parameters_global_strided(1, 8, 16, 16, 4, 2, 6, atidlas::FETCH_FROM_GLOBAL_STRIDED, atidlas::FETCH_FROM_GLOBAL_STRIDED, 0, 0);