diff --git a/preprocess.py b/preprocess.py
index fd97df5..11e62f0 100644
--- a/preprocess.py
+++ b/preprocess.py
@@ -12,7 +12,8 @@ import tarfile
 import torchtext.data
 import torchtext.datasets
-from torchtext.datasets import TranslationDataset
+#from torchtext.datasets import TranslationDataset
+from torchtext.legacy.datasets import TranslationDataset
 
 import transformer.Constants as Constants
 from learn_bpe import learn_bpe
 from apply_bpe import BPE
@@ -332,5 +333,5 @@ def filter_examples_with_length(x):
 
 
 if __name__ == '__main__':
-    main_wo_bpe()
-    #main()
+    #main_wo_bpe()
+    main()
diff --git a/train.py b/train.py
index f84a4fb..7b9dde3 100644
--- a/train.py
+++ b/train.py
@@ -76,6 +76,7 @@ def train_epoch(model, training_data, optimizer, opt, device, smoothing):
     total_loss, n_word_total, n_word_correct = 0, 0, 0 
 
     desc = '  - (Training)   '
+    # * train on each batch
     for batch in tqdm(training_data, mininterval=2, desc=desc, leave=False):
 
         # prepare data
@@ -89,7 +90,8 @@ def train_epoch(model, training_data, optimizer, opt, device, smoothing):
         # backward and update parameters
         loss, n_correct, n_word = cal_performance(
             pred, gold, opt.trg_pad_idx, smoothing=smoothing) 
-        loss.backward()
+        loss.backward() # * compute the gradients
+        # * update the parameters
         optimizer.step_and_update_lr()
 
         # note keeping
@@ -162,6 +164,7 @@ def print_performances(header, ppl, accu, start_time, lr):
         print('[ Epoch', epoch_i, ']')
 
         start = time.time()
+        # * train for a single epoch
         train_loss, train_accu = train_epoch(
             model, training_data, optimizer, opt, device, smoothing=opt.label_smoothing)
         train_ppl = math.exp(min(train_loss, 100))
@@ -170,6 +173,7 @@ def print_performances(header, ppl, accu, start_time, lr):
         print_performances('Training', train_ppl, train_accu, start, lr)
 
         start = time.time()
+        # * evaluate on the validation set
         valid_loss, valid_accu = eval_epoch(model, validation_data, device, opt)
         valid_ppl = math.exp(min(valid_loss, 100))
         print_performances('Validation', valid_ppl, valid_accu, start, lr)
@@ -278,6 +282,7 @@ def main():
 
     print(opt)
 
+    # * the Transformer hyperparameters are configured here
    transformer = Transformer(
        opt.src_vocab_size,
        opt.trg_vocab_size,
@@ -295,6 +300,7 @@ def main():
        dropout=opt.dropout,
        scale_emb_or_prj=opt.scale_emb_or_prj).to(device)
 
+    # * set up the optimizer with the scheduled learning rate
    optimizer = ScheduledOptim(
        optim.Adam(transformer.parameters(), betas=(0.9, 0.98), eps=1e-09),
        opt.lr_mul, opt.d_model, opt.n_warmup_steps)
diff --git a/transformer/Layers.py b/transformer/Layers.py
index 42665d1..b1aabdf 100644
--- a/transformer/Layers.py
+++ b/transformer/Layers.py
@@ -7,8 +7,9 @@
 __author__ = "Yu-Hsiang Huang"
 
 
-class EncoderLayer(nn.Module):
+class EncoderLayer(nn.Module): 
     ''' Compose with two layers '''
+    # * one encoder layer contains one MHA plus layer norm and a feed-forward block
 
     def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
         super(EncoderLayer, self).__init__()
@@ -21,7 +22,6 @@ def forward(self, enc_input, slf_attn_mask=None):
         enc_output = self.pos_ffn(enc_output)
         return enc_output, enc_slf_attn
 
-
 class DecoderLayer(nn.Module):
     ''' Compose with three layers '''
 
diff --git a/transformer/Models.py b/transformer/Models.py
index 9af9fd6..4ff6690 100644
--- a/transformer/Models.py
+++ b/transformer/Models.py
@@ -42,9 +42,10 @@ def get_position_angle_vec(position):
         return torch.FloatTensor(sinusoid_table).unsqueeze(0)
 
     def forward(self, x):
+        # * the positional encoding needs no gradient, hence the detach
+        # * the input embeddings and the positional encoding are added together here
         return x + self.pos_table[:, :x.size(1)].clone().detach()
 
-
 class Encoder(nn.Module):
     ''' A encoder model with self attention mechanism. '''
 
@@ -57,8 +58,9 @@ def __init__(
         self.src_word_emb = nn.Embedding(n_src_vocab, d_word_vec, padding_idx=pad_idx)
         self.position_enc = PositionalEncoding(d_word_vec, n_position=n_position)
         self.dropout = nn.Dropout(p=dropout)
+        # * looks like the encoder body: multiple EncoderLayers stacked; each EncoderLayer contains MHA plus layer norm
         self.layer_stack = nn.ModuleList([
-            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
+            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout) 
             for _ in range(n_layers)])
         self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
         self.scale_emb = scale_emb
@@ -69,17 +71,18 @@ def forward(self, src_seq, src_mask, return_attns=False):
         enc_slf_attn_list = []
 
         # -- Forward
-        enc_output = self.src_word_emb(src_seq)
+        enc_output = self.src_word_emb(src_seq) # * src_seq holds vocabulary indices; the lookup table returns the corresponding embeddings
         if self.scale_emb:
             enc_output *= self.d_model ** 0.5
         enc_output = self.dropout(self.position_enc(enc_output))
         enc_output = self.layer_norm(enc_output)
 
         for enc_layer in self.layer_stack:
+            # * run each encoder layer; the output of one layer is the input of the next
             enc_output, enc_slf_attn = enc_layer(enc_output, slf_attn_mask=src_mask)
             enc_slf_attn_list += [enc_slf_attn] if return_attns else []
 
-        if return_attns:
+        if return_attns: # * this looks like it is here so the attention weights can be printed out
             return enc_output, enc_slf_attn_list
         return enc_output,
 
@@ -185,7 +188,7 @@ def __init__(
 
 
     def forward(self, src_seq, trg_seq):
-
+        # ? how exactly are these masks used?
         src_mask = get_pad_mask(src_seq, self.src_pad_idx)
         trg_mask = get_pad_mask(trg_seq, self.trg_pad_idx) & get_subsequent_mask(trg_seq)
 
diff --git a/transformer/Modules.py b/transformer/Modules.py
index 3236281..88184a1 100644
--- a/transformer/Modules.py
+++ b/transformer/Modules.py
@@ -18,7 +18,7 @@ def forward(self, q, k, v, mask=None):
 
         if mask is not None:
             attn = attn.masked_fill(mask == 0, -1e9)
-
+        # * looks like there is an extra dropout on the attention weights compared to the original paper
         attn = self.dropout(F.softmax(attn, dim=-1))
         output = torch.matmul(attn, v)
 
diff --git a/transformer/SubLayers.py b/transformer/SubLayers.py
index cf9df8b..e7b4e65 100644
--- a/transformer/SubLayers.py
+++ b/transformer/SubLayers.py
@@ -16,10 +16,11 @@ def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
         self.d_k = d_k
         self.d_v = d_v
 
-        self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
+        # * the per-head projection matrices are concatenated into one weight; d_model is the input embedding size
+        self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False) # note this is only a linear multiplication, no bias
         self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
         self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
-        self.fc = nn.Linear(n_head * d_v, d_model, bias=False)
+        self.fc = nn.Linear(n_head * d_v, d_model, bias=False) # * used to project the concatenated heads back down to d_model
 
         self.attention = ScaledDotProductAttention(temperature=d_k ** 0.5)
 
@@ -28,9 +29,10 @@ def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
 
 
     def forward(self, q, k, v, mask=None):
-
+        # ! in self-attention the q, k, v passed in here are actually all the same matrix: batch_size * seq_length * word_emb_dim
         d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
         sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)
+        # * sz_b means batch_size, len_q means seq_length
 
         residual = q
 
@@ -45,13 +47,18 @@ def forward(self, q, k, v, mask=None):
 
         if mask is not None:
             mask = mask.unsqueeze(1)   # For head axis broadcasting.
-
+        # * compute the attention matmuls
+        # ! q/k/v size = b x n x lq x dv
+        # ! matmul([b x n x lq x dv], [b x n x dv x lq]) = b x n x lq x lq   # q * K^T (k is transposed)
+        # ! matmul([b x n x lq x lq], [b x n x lq x dv]) = b x n x lq x dv   # (q * K^T) * V
+        # e.g. batch_size = 11; seq_len = 6; emb_sz = 4; n_head = 3
+        # torch.rand(batch_size, n_head, seq_len, emb_sz)
         q, attn = self.attention(q, k, v, mask=mask)
 
         # Transpose to move the head dimension back: b x lq x n x dv
         # Combine the last two dimensions to concatenate all the heads together: b x lq x (n*dv)
         q = q.transpose(1, 2).contiguous().view(sz_b, len_q, -1)
-        q = self.dropout(self.fc(q))
+        q = self.dropout(self.fc(q)) # * final output projection + dropout
         q += residual
 
         q = self.layer_norm(q)
diff --git a/transformer_docker_run.sh b/transformer_docker_run.sh
new file mode 100644
index 0000000..337dd5a
--- /dev/null
+++ b/transformer_docker_run.sh
@@ -0,0 +1 @@
+docker run --rm -it -v /Users/bytedance/Desktop/ai_infra/attention-is-all-you-need-pytorch:/transformer ss4g/transformer_env_torch /bin/bash
\ No newline at end of file
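As a quick sanity check on the shape annotations added to MultiHeadAttention.forward above, the standalone snippet below reproduces the matmul shapes with the toy sizes from the comment (batch_size = 11, seq_len = 6, emb_sz = 4, n_head = 3). It is only an illustrative sketch mirroring the annotated steps, not part of the patch:

import torch
import torch.nn.functional as F

# toy sizes from the annotation: b = batch, n = heads, lq = sequence length, dv = per-head dim
b, n, lq, dv = 11, 3, 6, 4

q = torch.rand(b, n, lq, dv)
k = torch.rand(b, n, lq, dv)
v = torch.rand(b, n, lq, dv)

attn = torch.matmul(q / dv ** 0.5, k.transpose(2, 3))  # q * K^T -> b x n x lq x lq
attn = F.softmax(attn, dim=-1)
out = torch.matmul(attn, v)                            # (q * K^T) * V -> b x n x lq x dv

print(attn.shape)  # torch.Size([11, 3, 6, 6])
print(out.shape)   # torch.Size([11, 3, 6, 4])

# move the head dimension back and concatenate heads: b x lq x (n*dv), ready for the final fc
out = out.transpose(1, 2).contiguous().view(b, lq, -1)
print(out.shape)   # torch.Size([11, 6, 12])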