# triton/autotune/python/autotune.py

from __future__ import division

import argparse
import itertools
import os

from external.configobj import ConfigObj

import pyopencl as cl
import pyviennacl as vcl
from pyviennacl import backend
from pyviennacl import opencl
from pyviennacl import atidlas

import utils
import vclio
import optimize
import sys

# Maps the precision names used in the configuration file to pyviennacl
# scalar types.
DATATYPES = { 'single' : vcl.float32,
              'double' : vcl.float64 }

# Description of every tunable operation, keyed by operation name:
#   'template'        : atidlas template class built for each candidate profile
#   'parameter-names' : names of the template parameters (one per entry of
#                       the list returned by parameter_space for that operation)
#   'perf-index'      : callable receiving [itemsize, sizes, t] and returning
#                       the figure of merit; t is the measurement supplied by
#                       optimize.genetic (presumably a run time - confirm units)
#   'perf-measure'    : label of the unit in which 'perf-index' is expressed
TYPES = { 'vector-axpy': {'template':vcl.atidlas.VectorAxpyTemplate,
                          'parameter-names':['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
                          'perf-index':lambda x: 3*x[0]*x[1][0]/x[2]*1e-9,
                          'perf-measure':'GB/s'},

          'matrix-axpy': {'template':vcl.atidlas.MatrixAxpyTemplate,
                          'parameter-names':['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'num-groups-1', 'fetch'],
                          'perf-index':lambda x: 3*x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
                          'perf-measure':'GB/s'},

          # NOTE(review): this entry shares the 1-D parameter layout of
          # 'vector-axpy', so the size list is expected to hold one dimension
          # only. The index therefore uses x[1][0] alone; multiplying by
          # x[1][1] would raise IndexError for a one-element size list.
          'reduction': {'template':vcl.atidlas.ReductionTemplate,
                        'parameter-names':['simd-width', 'local-size-0', 'num-groups-0', 'fetch'],
                        'perf-index':lambda x: 2*x[0]*x[1][0]/x[2]*1e-9,
                        'perf-measure':'GB/s'},

          'row-wise-reduction': {'template':vcl.atidlas.RowWiseReductionTemplate,
                                'parameter-names':['simd-width', 'local-size-0', 'local-size-1', 'num-groups-0', 'fetch'],
                                'perf-index':lambda x: x[0]*x[1][0]*x[1][1]/x[2]*1e-9,
                                'perf-measure':'GB/s'},

          'matrix-product': {'template':vcl.atidlas.MatrixProductTemplate,
                            'parameter-names':['simd-width', 'local-size-0', 'kL', 'local-size-1', 'mS', 'kS', 'nS', 'A-fetch-policy', 'B-fetch-policy', 'local-fetch-size-0', 'local-fetch-size-1'],
                            'perf-index': lambda x: 2*x[1][0]*x[1][1]*x[1][2]/x[2]*1e-9,
                            'perf-measure': 'GFLOP/s'} }

def parameter_space(operation):
  """Return the candidate values of each tunable parameter of `operation`.

  The result is a list of lists, one inner list per parameter of the
  operation. None is returned when `operation` is not one of the known
  operation names.
  """
  policy = vcl.atidlas.FetchingPolicy
  fetch_policies = [policy.FETCH_FROM_LOCAL,
                    policy.FETCH_FROM_GLOBAL_CONTIGUOUS,
                    policy.FETCH_FROM_GLOBAL_STRIDED]

  simd_widths = [1, 2, 4, 8]
  # Powers of two: 1..2048 for 1-D quantities, 1..128 for 2-D ones.
  powers_1d = [2**exponent for exponent in range(12)]
  powers_2d = [2**exponent for exponent in range(8)]
  unrolled_2d = [2**exponent for exponent in range(8)]

  vector_like = [simd_widths, powers_1d, powers_1d, fetch_policies]
  spaces = (
    ('vector-axpy', vector_like),
    ('reduction', list(vector_like)),
    ('matrix-axpy', [simd_widths, powers_2d, powers_2d, powers_2d, powers_2d, fetch_policies]),
    ('row-wise-reduction', [simd_widths, powers_2d, powers_2d, powers_1d, fetch_policies]),
    ('matrix-product', [simd_widths, powers_2d, powers_2d, powers_2d,
                        unrolled_2d, unrolled_2d, unrolled_2d,
                        fetch_policies, fetch_policies,
                        [0] + powers_2d, [0] + powers_2d]),
  )

  for name, space in spaces:
    if operation == name:
      return space
  return None

def do_tuning(config_fname, spec_fname, viennacl_root):
  """Tune every operation that has a section in the configuration file.

  config_fname  -- path of the configuration file read through ConfigObj
  spec_fname    -- path of the configspec handed to ConfigObj
  viennacl_root -- when non-empty, each successful tuning result is passed,
                   together with this value, to vclio.generate_viennacl_headers

  The operations considered are 'vector-axpy', 'matrix-axpy',
  'row-wise-reduction' and 'matrix-product'. Each operation section supplies
  'devices', 'precision' and 'size' (plus 'layout' for 'row-wise-reduction'
  and 'matrix-product'). The optional top-level 'tmp-folder' entry selects
  where the tuning history files are written; without it the history goes to
  os.devnull.
  """
  config = ConfigObj(config_fname, configspec=spec_fname)
  # Configuration values are either a single item or a list of items:
  # convert every item with T[0] and always hand back a list.
  map_to_list = lambda T: list(map(T[0], T[1] if isinstance(T[1], list) else [T[1]]))

  # Independent of the operation, so it is looked up once.
  tmp_folder = config['tmp-folder'] if 'tmp-folder' in config else ""

  for operation in ['vector-axpy', 'matrix-axpy', 'row-wise-reduction', 'matrix-product']:
    if operation not in config:
      continue

    p = config[operation]
    confdevices = p['devices']
    # A preset is designated by a single value. A list can only be a list of
    # device indices and must not be used as a lookup key in the presets.
    if not isinstance(confdevices, list) and confdevices in utils.DEVICES_PRESETS:
      devices = utils.DEVICES_PRESETS[confdevices]
    else:
      # map_to_list keeps a lone index such as '10' whole instead of
      # iterating over its characters.
      devices = [utils.all_devices[i] for i in map_to_list((int, confdevices))]
    precisions = map_to_list((str, p['precision']))
    datatypes = [DATATYPES[k] for k in precisions]
    s = map_to_list((int, p['size']))

    for datatype, device in itertools.product(datatypes, devices):
      ctx = cl.Context([device])
      ctx = vcl.backend.Context(ctx)
      device = ctx.current_device

      if datatype is vcl.float64 and not device.double_fp_config:
        sys.stderr.write('Warning : The device ' + device.name + ' does not support double precision! Skipping ...\n')
        continue

      def execute(node, other_params):
        """Tune the statement built from `node`; `other_params` are the extra template arguments."""
        profile = TYPES[operation]
        print('-----')
        print(' '.join(map(str, ("Now tuning:", datatype.__name__, '-', operation, '-'.join(other_params), '[' + device.name, '(' + device.platform.name + ')]'))))
        tmp_file = os.path.join(tmp_folder, utils.sanitize_string(device.name) + "-" + datatype.__name__ + "-" + operation + '-'.join(other_params) + ".dat")
        if tmp_folder:
          print('Saving history to ' + tmp_file)
          fname = tmp_file
        else:
          # No history folder configured: discard the history.
          fname = os.devnull
        with open(fname, "w+") as archive:
          with vcl.Statement(node) as statement:
            result = optimize.genetic(statement, ctx, profile['template'],
                                      lambda params: profile['template'](params, *other_params),
                                      profile['parameter-names'], parameter_space(operation),
                                      lambda t: profile['perf-index']([datatype().itemsize, s, t]),
                                      profile['perf-measure'], archive)
          if result and viennacl_root:
            vclio.generate_viennacl_headers(viennacl_root, device, datatype, operation, other_params, result[1])

      if operation == 'vector-axpy':
        x = vcl.Vector(s[0], context=ctx, dtype=datatype)
        y = vcl.Vector(s[0], context=ctx, dtype=datatype)
        execute(vcl.ElementProd(vcl.exp(x + y), vcl.cos(x + y)), ())

      elif operation == 'matrix-axpy':
        A = vcl.Matrix(s, context=ctx, dtype=datatype)
        B = vcl.Matrix(s, context=ctx, dtype=datatype)
        execute(A + B, ())

      elif operation == 'row-wise-reduction':
        layouts = map_to_list((str, p['layout']))
        if 'all' in layouts:
          layouts = ['N', 'T']
        for A_trans in layouts:
          # For the transposed case A is allocated with swapped sizes and used through A.T.
          A = vcl.Matrix(s if A_trans == 'N' else s[::-1], context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
          x = vcl.Vector(s[1] if A_trans == 'N' else s[0], context=ctx, dtype=datatype)
          LHS = A if A_trans == 'N' else A.T
          execute(LHS*x, ())

      elif operation == 'matrix-product':
        layouts = map_to_list((str, p['layout']))
        if 'all' in layouts:
          layouts = ['NN', 'NT', 'TN', 'TT']
        for layout in layouts:
          A_trans = layout[0]
          B_trans = layout[1]

          A = vcl.Matrix((s[0], s[1]) if A_trans == 'N' else (s[1], s[0]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
          B = vcl.Matrix((s[1], s[2]) if B_trans == 'N' else (s[2], s[1]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
          LHS = A if A_trans == 'N' else A.T
          RHS = B if B_trans == 'N' else B.T
          alpha = vcl.HostScalar(1.0, context=ctx, dtype=datatype)
          beta = vcl.HostScalar(1.0, context=ctx, dtype=datatype)
          C = vcl.Matrix((s[0], s[2]), context=ctx, dtype=datatype, layout=vcl.COL_MAJOR)
          execute(vcl.Assign(C, LHS*RHS*alpha + C*beta), (A_trans, B_trans))


def build_parser():
  """Build the command-line parser with the 'list-devices' and 'tune' sub-commands."""
  parser = argparse.ArgumentParser()

  subparsers = parser.add_subparsers(dest='action')
  # Python 3 treats sub-commands as optional by default. Without this, a bare
  # invocation would reach the tuning branch with no 'config' attribute.
  subparsers.required = True

  subparsers.add_parser('list-devices', help='list the devices available')

  tune_parser = subparsers.add_parser('tune', help='tune using a specific configuration file')
  tune_parser.add_argument("--config", default="config.ini", required=False, type=str)
  tune_parser.add_argument("--viennacl-root", default='', required=False, type=str)
  return parser


def main(argv=None):
  """Run the selected sub-command; `argv` defaults to the process arguments."""
  args = build_parser().parse_args(argv)

  if args.action == 'list-devices':
    print("----------------")
    print("Devices available:")
    print("----------------")
    devices = [d for platform in cl.get_platforms() for d in platform.get_devices()]
    for (i, d) in enumerate(devices):
      # One pre-formatted string prints the same on Python 2 and Python 3.
      print('Device %s : %s: %s on %s' % (i, utils.DEVICE_TYPE_PREFIX[d.type].upper(), d.name, d.platform.name))
    print("----------------")
  else:
    print("------")
    print("Auto-tuning")
    print("------")
    do_tuning(args.config, 'config_spec.ini', args.viennacl_root)


if __name__ == "__main__":
  main()