From 9e54a030062cbce99fcc431501871ca94df39855 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Tue, 18 Feb 2020 12:25:05 -0500
Subject: [PATCH] [PYTHON][EXAMPLES] Removed obsolete files

---
 python/examples/attention/bench.py     | 48 -----------------
 python/examples/attention/optimized.py | 50 ------------------
 python/examples/attention/reference.py | 72 --------------------------
 3 files changed, 170 deletions(-)
 delete mode 100644 python/examples/attention/bench.py
 delete mode 100644 python/examples/attention/optimized.py
 delete mode 100644 python/examples/attention/reference.py

diff --git a/python/examples/attention/bench.py b/python/examples/attention/bench.py
deleted file mode 100644
index abd4ed24c..000000000
--- a/python/examples/attention/bench.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import torch
-import numpy as np
-import reference
-import optimized
-from time import time
-
-use_half = True
-def cast(x):
-    if use_half:
-        return x.half()
-    else:
-        return x
-
-# GPU device
-device = torch.device("cuda:0")
-# shapes
-batch, nhead = 8, 28
-dm, dk, dv = 1024, 1024, 1024
-lq, lk, lv = 1024, 1024, 1024
-# initialize tensors
-torch.manual_seed(0)
-np.random.seed(0)
-query = cast(torch.randn(batch, lq, dm)).cuda()
-key = cast(torch.randn(batch, lk, dm)).cuda()
-value = cast(torch.randn(batch, lv, dm)).cuda()
-# initialize layers
-torch.manual_seed(0)
-np.random.seed(0)
-rattn = cast(reference.MultiHeadAttention(nhead, dm, dk, dv).to(device))
-torch.manual_seed(0)
-np.random.seed(0)
-tattn = cast(optimized.MultiHeadAttention(nhead, dm, dk, dv).to(device))
-# test
-routput, _ = rattn(query, key, value)
-toutput, _ = tattn(query, key, value)
-diff = torch.max(torch.abs(routput - toutput))
-assert diff < 1e-2
-# benchmark
-start = time()
-routput, _ = rattn(query, key, value)
-end = time()
-rtime = end - start
-start = time()
-toutput, _ = tattn(query, key, value)
-end = time()
-ttime = end - start
-print(f'Torch: {rtime} s')
-print(f'Triton: {ttime} s')
\ No newline at end of file
diff --git a/python/examples/attention/optimized.py b/python/examples/attention/optimized.py
deleted file mode 100644
index 96cc14262..000000000
--- a/python/examples/attention/optimized.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import numpy as np
-import torch
-import torch.nn as nn
-import triton
-
-class MultiHeadAttention(nn.Module):
-    ''' Multi-Head Attention module '''
-
-    def __init__(self, n_head, d_model, d_k, d_v):
-        super().__init__()
-        self.n_head = n_head
-        self.d_k = d_k
-        self.d_v = d_v
-        # linear layers
-        self.w_qs = nn.Linear(d_model, n_head * d_k)
-        self.w_ks = nn.Linear(d_model, n_head * d_k)
-        self.w_vs = nn.Linear(d_model, n_head * d_v)
-        self.fc = nn.Linear(n_head * d_v, d_model)
-        # initialize weights
-        nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
-        nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
-        nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))
-        nn.init.xavier_normal_(self.fc.weight)
-        # layer normalization
-        self.layer_norm = nn.LayerNorm(d_model)
-
-
-    def forward(self, q, k, v, mask=None):
-        # dimensions
-        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
-        sz_b, len_q, _ = q.size()
-        sz_b, len_k, _ = k.size()
-        sz_b, len_v, _ = v.size()
-        # linear transformations
-        residual = q
-        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
-        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
-        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
-        # scaled dot-product attention
-        attn = triton.ops.einsum('blhk,bthk->hblt', q, k, [n_head, sz_b, len_q, len_k])
-        attn = attn / np.sqrt(d_k)
-        if mask is not None:
-            attn = attn.masked_fill(mask[None], -np.inf)
-        attn = torch.softmax(attn, dim=3)
-        output = triton.ops.einsum('hblt,bthv->blhv', attn, v, [sz_b, len_q, n_head, d_v])
-        output = output.view(sz_b, len_q, -1)
-        output = self.fc(output)
-        # epilogue
-        output = self.layer_norm(output + residual)
-        return output, attn
\ No newline at end of file
diff --git a/python/examples/attention/reference.py b/python/examples/attention/reference.py
deleted file mode 100644
index e60f474f6..000000000
--- a/python/examples/attention/reference.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import numpy as np
-import torch
-import torch.nn as nn
-
-class ScaledDotProductAttention(nn.Module):
-    ''' Scaled Dot-Product Attention '''
-
-    def __init__(self, temperature, attn_dropout=0.1):
-        super().__init__()
-        self.temperature = temperature
-        self.softmax = nn.Softmax(dim=2)
-
-    def forward(self, q, k, v, mask=None):
-        attn = torch.bmm(q, k.transpose(1, 2))
-        attn = attn / self.temperature
-        if mask is not None:
-            attn = attn.masked_fill(mask, -np.inf)
-        attn = self.softmax(attn)
-        output = torch.bmm(attn, v)
-        return output, attn
-
-
-
-class MultiHeadAttention(nn.Module):
-    ''' Multi-Head Attention module '''
-
-    def __init__(self, n_head, d_model, d_k, d_v):
-        super().__init__()
-        self.n_head = n_head
-        self.d_k = d_k
-        self.d_v = d_v
-        # linear layers
-        self.w_qs = nn.Linear(d_model, n_head * d_k)
-        self.w_ks = nn.Linear(d_model, n_head * d_k)
-        self.w_vs = nn.Linear(d_model, n_head * d_v)
-        self.fc = nn.Linear(n_head * d_v, d_model)
-        # initialize weights
-        nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
-        nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
-        nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))
-        nn.init.xavier_normal_(self.fc.weight)
-        # normalization
-        self.layer_norm = nn.LayerNorm(d_model)
-        # scaled dot-product
-        self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
-
-
-    def forward(self, q, k, v, mask=None):
-        # dimensions
-        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
-        sz_b, len_q, _ = q.size()
-        sz_b, len_k, _ = k.size()
-        sz_b, len_v, _ = v.size()
-        # linear transformations
-        residual = q
-        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
-        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
-        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
-        # scaled dot-product attention
-        q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk
-        k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk
-        v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv
-        if mask:
-            mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x ..
-        output, attn = self.attention(q, k, v, mask=mask)
-        # linear transformation
-        output = output.view(n_head, sz_b, len_q, d_v)
-        output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv)
-        output = self.fc(output)
-        # normalization
-        output = self.layer_norm(output + residual)
-        return output, attn
\ No newline at end of file