[PYTHON][EXAMPLES] Better einsum example
This commit is contained in:
@@ -1,38 +1,92 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
#!/usr/bin/env python
|
||||
|
||||
import numpy as np
|
||||
from enum import Enum
|
||||
import triton
|
||||
|
||||
# Problem sizes for the einsum check.
batch_dim, ctx_dim = 16, 32
head_dim = 8
state_dim = key_dim = n_keys = 32
# Product of the batch and context sizes.
bs = batch_dim * ctx_dim
|
||||
class MODE(Enum):
    """Identifies which framework backend the script runs with."""

    TF = 1
    TORCH = 2
|
||||
|
||||
# Shapes of the three operands, expressed with the size constants.
x_shape = (bs, state_dim)
qw_shape = (state_dim, head_dim * key_dim)
# The key dimension is split into two halves of key_dim // 2 each.
kw_shape = (head_dim, 2, n_keys, key_dim // 2)
|
||||
# Probe for TensorFlow; select the TF backend only when the import succeeds.
# NOTE(review): this block leaves `mode` unassigned when the import fails.
try:
    import tensorflow as tf
except ModuleNotFoundError:
    pass
else:
    mode = MODE.TF
|
||||
|
||||
def _uniform_f32(shape):
    """Draw a float32 array of the given shape, uniform in [-1, 1)."""
    return np.random.uniform(-1.0, 1.0, shape).astype(np.float32)


# Fixed seed so every run draws the same inputs.
np.random.seed(0)
x = _uniform_f32(x_shape)    # layer input
qw = _uniform_f32(qw_shape)  # query weights
kw = _uniform_f32(kw_shape)  # key weights

# Plain matmul: (bs, state_dim) x (state_dim, head_dim * key_dim)
#            -> (bs, head_dim * key_dim),
# then viewed as (bs, head_dim, 2, key_dim // 2).
q = x.dot(qw).reshape(bs, head_dim, 2, key_dim // 2)
|
||||
# Probe for PyTorch; select the torch backend only when the import succeeds.
# NOTE(review): this block leaves `mode` untouched when the import fails.
try:
    import torch
except ModuleNotFoundError:
    pass
else:
    mode = MODE.TORCH
|
||||
|
||||
# NumPy reference for the query/key product:
#   (bs, head_dim, 2, key_dim//2) * (head_dim, 2, n_keys, key_dim//2)
#     -> (bs, head_dim, 2, n_keys)
#   outer axes: bs, n_keys
#   inner axis: key_dim//2 (summed over)
#   batch axes: head_dim, 2 (key_axis)
qk = np.einsum("bhak,hank->bhan", q, kw)
|
||||
# Each case is [a_shape, b_shape, c_shape, einsum expression].
cases = [
    # Matmul
    [[4, 1024, 1024], [1024, 1024], [4, 1024, 1024], "btc,ck->btk"],
    # Attention
    [[4, 256, 8, 2, 64], [8, 2, 512, 64], [4, 256, 8, 2, 512], "bchak,hank->bchan"],
]
|
||||
|
||||
# Move the operands to the GPU and run the same contraction through Triton.
tq = torch.from_numpy(q).contiguous().cuda()
tkw = torch.from_numpy(kw).contiguous().cuda()
tqk = triton.ops.einsum("bhak,hank->bhan", tq, tkw)

# Report the largest and smallest absolute deviation from the NumPy result.
diff = np.abs(qk - tqk.cpu().numpy())
print(diff.max())
print(diff.min())
|
||||
# The TensorFlow path evaluates tensors through `sess.run`, so create the
# session once up front.
if mode == MODE.TF:
    sess = tf.InteractiveSession()

# For every case: run the einsum forward and backward through Triton on the
# selected backend, print the forward throughput, and print the maximum
# absolute deviation from a NumPy reference for the output and both gradients.
for a_shape, b_shape, c_shape, einsum in cases:

    # Inputs are rounded through float16 and stored as float32.
    A = np.random.uniform(-1.0, 1.0, a_shape).astype(np.float16).astype(np.float32)
    B = np.random.uniform(-1.0, 1.0, b_shape).astype(np.float16).astype(np.float32)
    # E has the output shape and is fed as the incoming gradient of the result.
    E = np.random.uniform(-1.0, 1.0, c_shape).astype(np.float16).astype(np.float32)

    # NOTE(review): the trailing `1` passed to triton.ops.einsum presumably
    # enables benchmarking (bench_registry is read below) -- confirm against
    # the triton.ops.einsum signature.
    if mode == MODE.TF:
        # Execute (tensorflow)
        a = tf.placeholder(tf.float32, a_shape, name="a")
        b = tf.placeholder(tf.float32, b_shape, name="b")
        e = tf.placeholder(tf.float32, c_shape, name="e")
        c = triton.ops.einsum(einsum, a, b, 1)
        da, db = tf.gradients(c, [a, b], e)
        # A, B and E are already float32, so they are fed as-is.
        feed_dict = {a: A, b: B, e: E}
        sess.run(tf.global_variables_initializer())
        result = sess.run([c, da, db], feed_dict=feed_dict)
    elif mode == MODE.TORCH:
        # Execute (torch)
        a = torch.from_numpy(A).cuda()
        b = torch.from_numpy(B).cuda()
        e = torch.from_numpy(E).cuda()
        a.requires_grad_(True)
        b.requires_grad_(True)
        c = triton.ops.einsum(einsum, a, b, 1)
        torch.autograd.backward(c, e)
        da = a.grad
        db = b.grad
        result = [c.cpu().detach().numpy(), da.cpu().detach().numpy(), db.cpu().detach().numpy()]
    else:
        # Fail here with a clear message instead of a NameError on `c` below.
        raise RuntimeError(f"unsupported mode: {mode!r}")

    # benchmark
    # The registries are keyed by the framework tensor, so read them before
    # `c` is rebound to the NumPy result further down.
    nanosec = triton.bench_registry[c]
    ctx = triton.ctx_registry[c]
    # Distinct names keep the operand tensor `b` from being overwritten.
    dim_b, dim_m, dim_n, dim_k = (ctx.bmnk[i] for i in range(4))
    ops = 2. * dim_b * dim_m * dim_n * dim_k
    # Assuming the registry stores nanoseconds (per the variable name),
    # ops / ns is GFLOPS and the 1e-3 factor converts to TFLOPS.
    print('C TFLOPS:', ops / nanosec * 1e-3)
    #print('DA TFLOPS:', ops / triton.bench_registry[da] * 1e-3)
    #print('DB TFLOPS:', ops / triton.bench_registry[db] * 1e-3)

    # test
    e_a = ctx.einsum_a
    e_b = ctx.einsum_b
    e_c = ctx.einsum_c
    C = np.einsum(einsum, A, B)
    # np.einsum pairs each subscript with its operand, so operand order does
    # not affect the result. One expression per gradient therefore covers
    # every transposition layout, including the case where both operands are
    # transposed, which previously left DA and DB unassigned.
    DA = np.einsum(f"{e_c},{e_b}->{e_a}", E, B)
    DB = np.einsum(f"{e_a},{e_c}->{e_b}", A, E)
    c, da, db = result
    print('C diff:', np.abs((C - c)).max())
    print('DA diff:', np.abs((DA - da)).max())
    print('DB diff:', np.abs((DB - db)).max())
|
Reference in New Issue
Block a user