This article is mainly a write-up of the Transformer model from the official TensorFlow tutorial. To read the original, see: https://www.tensorflow.org/tutorials/text/transformer
Transformer Model Structure
Like an ordinary seq2seq model, the Transformer is made up of two parts: an encoder and a decoder. Structurally, however, the Transformer abandons RNN and CNN building blocks entirely and builds the whole network with self-attention.
Positional Encoding
Unlike an RNN, which carries position information naturally through its recurrence, the Transformer injects the position of each time step as an additional input. The positional encoding is generated by alternating sine and cosine functions:
$$PE_{(pos,2i)} = \sin \frac{pos}{10000^{\frac{2i}{d_{model}}}}$$

$$PE_{(pos,2i+1)} = \cos \frac{pos}{10000^{\frac{2i}{d_{model}}}}$$
```python
import numpy as np
import tensorflow as tf

# Positional encoding
def positional_embedding(maxlen, model_size):
    PE = np.zeros((maxlen, model_size))
    for i in range(maxlen):
        for j in range(model_size):
            if j % 2 == 0:
                PE[i, j] = np.sin(i / 10000 ** (j / model_size))
            else:
                PE[i, j] = np.cos(i / 10000 ** ((j - 1) / model_size))
    PE = tf.constant(PE, dtype=tf.float32)
    return PE
```
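As a quick sanity check, the encoding can be inspected directly; the maxlen and model_size values below are arbitrary, illustrative choices:

```python
# Hypothetical usage: one row per position, one column per embedding dimension.
pos_enc = positional_embedding(maxlen=50, model_size=512)
print(pos_enc.shape)  # (50, 512)
```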
For a more detailed discussion of positional encoding, see:
如何理解Transformer论文中的positional encoding,和三角函数有什么关系?
Multi-Head Attention
An attention function can be seen as mapping a query vector and a set of key-value vector pairs to an output vector. The output is a weighted sum of the value vectors, where the weight of each value is computed from the query and the corresponding key. Attention can be written as:
$$att\_output = Attention(Q,K,V) = softmax(\frac{QK^T}{\sqrt{d_k}})V$$

$$MultiHead(Q,K,V) = Concat(head_1,...,head_h)W^O$$

$$head_i = Attention(QW_i^Q, KW_i^K, VW_i^V)$$
In self-attention, Q, K, and V are the same tensor. Multi-head attention lets the model jointly attend to information from different representation subspaces at different positions.
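Before the full multi-head class, here is a minimal standalone sketch of the scaled dot-product attention formula above; the function name and toy shapes are my own and not part of the original tutorial code:

```python
import tensorflow as tf

def scaled_dot_product_attention(q, k, v, mask=None):
    # q: (..., q_len, depth), k: (..., k_len, depth), v: (..., k_len, depth)
    matmul_qk = tf.matmul(q, k, transpose_b=True)    # (..., q_len, k_len)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    score = matmul_qk / tf.math.sqrt(dk)             # scale by sqrt(d_k)
    if mask is not None:
        score += (1 - mask) * -1e9                   # 1 = keep, 0 = mask out
    alpha = tf.nn.softmax(score, axis=-1)            # attention weights
    return tf.matmul(alpha, v)                       # (..., q_len, depth)

# Toy self-attention check: Q = K = V
x = tf.random.normal((2, 5, 64))
print(scaled_dot_product_attention(x, x, x).shape)   # (2, 5, 64)
```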
```python
import tensorflow as tf
from tensorflow import keras

class MultiHeadAttention(keras.Model):
    def __init__(self, model_size, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.head_size = model_size // num_heads
        self.WQ = [keras.layers.Dense(self.head_size) for _ in range(num_heads)]
        self.WK = [keras.layers.Dense(self.head_size) for _ in range(num_heads)]
        self.WV = [keras.layers.Dense(self.head_size) for _ in range(num_heads)]
        self.WO = keras.layers.Dense(model_size)

    def call(self, query, key, value, mask):
        # query shape: (batch, query_len, model_size)
        # key   shape: (batch, key_len, model_size)
        # value shape: (batch, value_len, model_size)
        context_heads = []
        for i in range(self.num_heads):
            q = self.WQ[i](query)
            k = self.WK[i](key)
            v = self.WV[i](value)
            matmul_qk = tf.matmul(q, k, transpose_b=True)
            dk = tf.dtypes.cast(self.head_size, tf.float32)
            # scale matmul_qk by sqrt(d_k)
            score = matmul_qk / tf.math.sqrt(dk)
            if mask is not None:
                # mask is expected to be float: 1 = keep, 0 = mask out
                score += (1 - mask) * -1e9
            alpha = tf.nn.softmax(score, axis=-1)
            context = tf.matmul(alpha, v)
            context_heads.append(context)
        concat_attention = tf.concat(context_heads, axis=2)
        output = self.WO(concat_attention)
        return output
```
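A hypothetical shape check (model_size=512 and num_heads=8 match the base configuration in the paper; the input is random):

```python
mha = MultiHeadAttention(model_size=512, num_heads=8)
x = tf.random.normal((2, 10, 512))   # (batch, seq_len, model_size)
out = mha(x, x, x, mask=None)        # self-attention: query = key = value
print(out.shape)                     # (2, 10, 512)
```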
Encoder
The Encoder is a stack of N = 6 identical layers; each layer corresponds to the unit on the left side of the architecture diagram at the beginning.
Each layer has two sub-layers: multi-head self-attention and a fully connected feed-forward network. The fully connected layer provides the model's non-linear transformation.
Each sub-layer is wrapped with a residual connection and layer normalization:
$$sublayer\_output = LayerNorm(x + SubLayer(x))$$
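A minimal, hypothetical illustration of this Add & Norm pattern, with a Dense layer standing in for an arbitrary sub-layer and arbitrary shapes:

```python
import tensorflow as tf
from tensorflow import keras

x = tf.random.normal((2, 5, 512))        # (batch, seq_len, model_size)
sublayer = keras.layers.Dense(512)       # stand-in for self-attention or the FFN
out = keras.layers.LayerNormalization(epsilon=1e-6)(x + sublayer(x))
print(out.shape)                         # (2, 5, 512)
```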
So the Encoder consists of three main parts:
- Input layer: token embedding + position embedding
- Multi-head attention: multi-head self-attention + add & norm (residual connection + layer normalization)
- Fully connected layer: position-wise feed-forward network + add & norm
position-wise feed-forward network
```python
# Position-wise feed-forward network
class FeedForwardNetwork(keras.Model):
    def __init__(self, dff_size, model_size):
        super(FeedForwardNetwork, self).__init__()
        self.dense1 = keras.layers.Dense(dff_size, activation="relu")
        self.dense2 = keras.layers.Dense(model_size)

    def call(self, x):
        x = self.dense1(x)
        x = self.dense2(x)
        return x
```
Encoder Layer
```python
# Encoder layer
class EncoderLayer(keras.layers.Layer):
    def __init__(self, model_size, num_heads, dff_size, rate=0.1):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(model_size, num_heads)
        self.ffn = FeedForwardNetwork(dff_size, model_size)
        # Layer normalization
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        # Multi-head self-attention
        attn_output = self.attention(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        # Residual connection + layer norm
        out1 = self.layernorm1(x + attn_output)
        # Feed-forward network
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        # Residual connection + layer norm
        out2 = self.layernorm2(out1 + ffn_output)
        return out2
```
Multi-layer Encoder
```python
# Multi-layer Encoder
class Encoder(keras.Model):
    def __init__(self, num_layers, model_size, num_heads, dff_size,
                 vocab_size, maxlen, rate=0.1):
        super(Encoder, self).__init__()
        self.model_size = model_size
        self.num_layers = num_layers
        self.embedding = keras.layers.Embedding(vocab_size, model_size)
        self.pos_embedding = positional_embedding(maxlen, model_size)
        self.encoder_layers = [EncoderLayer(model_size, num_heads, dff_size, rate)
                               for _ in range(num_layers)]
        self.dropout = keras.layers.Dropout(rate)

    def call(self, x, training, padding_mask):
        seq_len = tf.shape(x)[1]
        # Input (token) embedding
        x = self.embedding(x)
        # Add positional embedding, sliced to the actual sequence length
        x += self.pos_embedding[:seq_len, :]
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.encoder_layers[i](x, training, padding_mask)
        return x
```
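A hypothetical shape check for a small encoder stack; all hyperparameters below are made up for illustration:

```python
encoder = Encoder(num_layers=2, model_size=128, num_heads=4,
                  dff_size=512, vocab_size=8000, maxlen=40)
tokens = tf.random.uniform((2, 40), maxval=8000, dtype=tf.int32)  # (batch, seq_len)
enc_out = encoder(tokens, training=False, padding_mask=None)
print(enc_out.shape)  # (2, 40, 128)
```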
Decoder
The Decoder is similar to the Encoder, except that it adds one more attention sub-layer: a Masked Multi-Head Attention over the decoder input. So the Decoder consists of four parts:
- Input layer: token embedding + position embedding
- Masked multi-head layer: masked multi-head self-attention (look-ahead mask) + add & norm
- Multi-head attention layer: multi-head context attention (encoder-decoder attention) + add & norm
- Fully connected layer: position-wise feed-forward network + add & norm
Decoder Layer
```python
# Decoder layer
class DecoderLayer(keras.layers.Layer):
    def __init__(self, model_size, num_heads, dff_size, rate=0.1):
        super(DecoderLayer, self).__init__()
        self.mask_attention = MultiHeadAttention(model_size, num_heads)
        self.attention = MultiHeadAttention(model_size, num_heads)
        self.ffn = FeedForwardNetwork(dff_size, model_size)
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        # Masked multi-head self-attention over the decoder input
        attn_decoder = self.mask_attention(x, x, x, look_ahead_mask)
        out1 = self.layernorm1(x + attn_decoder)
        # Encoder-decoder (context) attention: queries from the decoder,
        # keys/values from the encoder output
        attn_encoder_decoder = self.attention(out1, enc_output, enc_output, padding_mask)
        out2 = self.layernorm2(out1 + attn_encoder_decoder)
        # Position-wise feed-forward network
        ffn_output = self.ffn(out2)
        out3 = self.layernorm3(out2 + ffn_output)
        return out3
```
Multi-layer Decoder
```python
# Multi-layer Decoder
class Decoder(keras.Model):
    def __init__(self, num_layers, model_size, num_heads, dff_size,
                 vocab_size, maxlen, rate=0.1):
        super(Decoder, self).__init__()
        self.model_size = model_size
        self.num_layers = num_layers
        self.embedding = keras.layers.Embedding(vocab_size, model_size)
        self.pos_embedding = positional_embedding(maxlen, model_size)
        self.decoder_layers = [DecoderLayer(model_size, num_heads, dff_size, rate)
                               for _ in range(num_layers)]
        self.dropout = keras.layers.Dropout(rate)

    def call(self, enc_output, x, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        # Input (token) embedding
        x = self.embedding(x)
        # Add positional embedding, sliced to the actual sequence length
        x += self.pos_embedding[:seq_len, :]
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.decoder_layers[i](x, enc_output, training,
                                       look_ahead_mask, padding_mask)
        return x
```
Transformer
The model inputs need some masking: a padding mask for padded positions, and a decoder (look-ahead) mask that prevents attending to future positions.
```python
# Padding mask: 1 for real tokens, 0 for padding.
# The extra axis lets the mask broadcast over the (batch, q_len, k_len) scores.
def padding_mask(seq):
    mask = tf.cast(tf.math.not_equal(seq, 0), tf.float32)
    return mask[:, tf.newaxis, :]

# Decoder (look-ahead) mask: a lower-triangular matrix, 1 where attention is allowed.
def look_ahead_mask(size):
    ahead_mask = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return ahead_mask
```
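A small demonstration of what the two masks look like (1 = attention allowed, 0 = masked out); the toy sequence is made up:

```python
print(padding_mask(tf.constant([[5, 7, 0, 0]])))
# [[[1. 1. 0. 0.]]]  -> shape (1, 1, 4): the two padded positions are masked

print(look_ahead_mask(3))
# [[1. 0. 0.]
#  [1. 1. 0.]
#  [1. 1. 1.]]       -> each position attends only to itself and earlier positions
```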
Combining the Encoder and Decoder into the Transformer:
```python
# Combine the Encoder and Decoder into the Transformer
def transformer(num_layers, model_size, num_heads, dff_size, vocab_size, maxlen):
    enc_inputs = keras.Input(shape=(maxlen,), name="enc_input")
    dec_inputs = keras.Input(shape=(maxlen,), name="dec_input")

    encoder = Encoder(num_layers, model_size, num_heads, dff_size, vocab_size, maxlen)
    decoder = Decoder(num_layers, model_size, num_heads, dff_size, vocab_size, maxlen)
    final_dense = keras.layers.Dense(vocab_size, name="final_output")

    # Masks: hide padding in the encoder input, and future positions in the decoder input
    enc_padding_mask = padding_mask(enc_inputs)
    dec_look_ahead_mask = look_ahead_mask(maxlen)

    enc_output = encoder(enc_inputs, True, enc_padding_mask)
    dec_output = decoder(enc_output, dec_inputs, True, dec_look_ahead_mask, enc_padding_mask)
    final_output = final_dense(dec_output)

    model = keras.models.Model(inputs=[enc_inputs, dec_inputs], outputs=final_output)
    return model
```
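A hypothetical end-to-end check: build a small model and inspect it; the hyperparameters are illustrative, not tuned:

```python
model = transformer(num_layers=2, model_size=128, num_heads=4,
                    dff_size=512, vocab_size=8000, maxlen=40)
model.compile(optimizer="adam",
              loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True))
model.summary()  # output shape: (None, 40, 8000), i.e. logits over the target vocabulary
```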
Applications and More
Machine translation, text generation, recommendation systems, and so on.
References
[1]. Transformer模型原理详解
[2]. Attention机制详解(二)——Self-Attention与Transformer
[3]. Attention Is All You Need
[4]. 理解语言的 Transformer 模型