transformers.py
# import necessary modules
import copy
import torch
import torch.nn as nn
from activation import Softmax
from decoder import Decoder
from encoder import Encoder


# class for the complete Transformer Architecture (TA)
class TransformerNet(nn.Module):
    def __init__(self,
                 d_model,
                 inp_vocab_size,
                 target_vocab_size,
                 input_max_len,
                 target_max_len,
                 n_blocks=6,
                 activation="relu",
                 expansion_factor=4,
                 n_heads=8,
                 dropout_size=None):
"""
Complete Transformer Neural Network
Arguments:
d_model : Embedding Dimension
inp_vocab_size : Vocabulary Size of the Input/Source
target_vocab_size : Vocabulary Size of Target for Projection
input_max_len : Maximum Sequence Length of the input
target_max_len : Maximum Sequence Length of the Target
n_blocks : Number of Encoder/Decoder block for the Model
activation : Activation to use inbetween feed forward layer. default is `relu`
expansion_factor : Determine the Inner Dimension of the feed forward layer
n_heads : Number of Attention Heads
dropout_size : percentage of the layer to drop inbetween the layers to prevent overfitting and stablize the training
"""
        super(TransformerNet, self).__init__()
        # Encoder stack (n_blocks encoder blocks)
        self.encoder = Encoder(
            inp_vocab_size,
            input_max_len,
            embedding_dim=d_model,
            num_blocks=n_blocks,
            expansion_factor=expansion_factor,
            activation=activation,
            num_heads=n_heads,
            dropout=dropout_size
        )
        # Decoder stack (n_blocks decoder blocks)
        self.decoder = Decoder(
            target_vocab_size,
            target_max_len,
            embedding_dim=d_model,
            num_blocks=n_blocks,
            activation=activation,
            expansion_factor=expansion_factor,
            num_heads=n_heads,
            dropout=dropout_size
        )
        self.softmax = Softmax(axis=-1, keepdim=True)
        # Final fully connected layer projecting decoder outputs onto the target vocabulary
        self.fc_out = nn.Linear(d_model, target_vocab_size)
    # build the mask used by the masked multi-head attention (MMHA) in the decoder
    def _create_target_mask(self, tg):
        batch_size, trg_len = tg.shape
        # keep the lower triangle and mask out the upper triangle, so each position
        # can attend only to itself and earlier positions (causal attention)
        trg_mask = torch.tril(torch.ones((batch_size, 1, trg_len, trg_len))).type(torch.bool)
        return trg_mask
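
    # Illustrative example: for trg_len = 3, _create_target_mask yields, for each
    # batch element, the boolean matrix
    #     [[True, False, False],
    #      [True, True,  False],
    #      [True, True,  True ]]
    # i.e. position i may attend to positions 0..i only.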
    def forward(self, input, target):
        """
        Forward pass through the Transformer architecture

        Inputs:
            input  : Input to the encoder
            target : Input to the decoder
                     e.g., for a task like Neural Machine Translation (NMT), `input` is the
                     source-language sequence and `target` is the sequence in the language
                     you want to translate to.

        Returns:
            Probability distribution over the entire target vocabulary
        """
        trg_mask = self._create_target_mask(target)  # mask used for causal attention
        # Forward pass through the encoder stack
        enc_out = self.encoder(input)
        # Forward pass through the decoder stack
        outputs = self.decoder(enc_out, target, trg_mask)
        # Softmax over the linearly projected logits along the final dimension
        output = self.softmax(self.fc_out(outputs))
        return output
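

# Minimal usage sketch (illustrative only): assumes the Encoder/Decoder/Softmax
# implementations imported above accept the arguments shown in __init__, and that
# inputs are integer token indices of shape (batch_size, seq_len). The hyperparameter
# values below are hypothetical, chosen only for a quick smoke test.
if __name__ == "__main__":
    model = TransformerNet(
        d_model=512,
        inp_vocab_size=1000,
        target_vocab_size=1200,
        input_max_len=50,
        target_max_len=60,
        n_blocks=2,
        n_heads=8,
        dropout_size=0.1,
    )
    src = torch.randint(0, 1000, (4, 20))  # dummy source token indices
    tgt = torch.randint(0, 1200, (4, 25))  # dummy target token indices
    out = model(src, tgt)
    # Expected: a probability distribution over the target vocabulary for every target position,
    # i.e. shape (4, 25, 1200), assuming the decoder preserves the target sequence dimension.
    print(out.shape)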