## 1. 数据技巧

``````def startup(self, args={}):
print('AppExp v0.0.1')
torch.manual_seed(1337)
B, T, C = 4, 8, 2 # B: batch_size；T：序列长度；C：通道数，即词汇维度；
X = torch.randn(B, T, C)
xbow1 = self.sum1(X, B, T, C)
print(xbow1)
xbow2 = self.sum2(X, B, T, C)
rst = torch.allclose(xbow1, xbow2)
print(f'比较结果：xbow1==xbow2 => {rst};')
xbow3 = self.sum3(X, B, T, C)
rst = torch.allclose(xbow1, xbow3)
print(f'xbow1和xbow3是否相等？{rst};')

def sum1(self, X, B, T, C):
    """Causal running mean, computed naively with explicit Python loops:
    xbow[b, t] = mean(X[b, 0..t]) over the time dimension."""
    xbow = torch.zeros((B, T, C))  # "bag of words" accumulator
    for batch in range(B):
        for step in range(T):
            prefix = X[batch, : step + 1]        # (step+1, C): everything up to and including `step`
            xbow[batch, step] = prefix.mean(dim=0)  # average over the visible prefix
    return xbow

def sum2(self, X, B, T, C):
    """Same causal running mean as sum1, vectorized: one matmul with a
    row-normalized lower-triangular weight matrix replaces both loops."""
    ones = torch.ones(T, T)
    wei = torch.tril(ones)  # Note1
    row_totals = wei.sum(dim=1, keepdim=True)  # number of visible steps per row
    wei = wei / row_totals  # Note2
    # (T, T) @ (B, T, C) broadcasts the matmul over the batch dimension
    return wei @ X

def sum3(self, X, B, T, C):
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril==0, float('-inf')) # Note3
wei = F.softmax(wei, dim=-1) # Note4
return wei @ X``````

• Note1：torch.tril保留矩阵的下三角部分、将其余元素置零，得到一个下三角矩阵：
``````tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
[1., 1., 0., 0., 0., 0., 0., 0.],
[1., 1., 1., 0., 0., 0., 0., 0.],
[1., 1., 1., 1., 0., 0., 0., 0.],
[1., 1., 1., 1., 1., 0., 0., 0.],
[1., 1., 1., 1., 1., 1., 0., 0.],
[1., 1., 1., 1., 1., 1., 1., 0.],
[1., 1., 1., 1., 1., 1., 1., 1.]])``````
• Note2：每个元素除以它所在行的和，如下所示：
``````tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
[0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
[0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
[0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
[0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])``````
• 关于 `wei @ X`：这是两个张量相乘，wei的形状为(8, 8)，X的形状为(4, 8, 2)。根据张量乘法的广播规则，wei的(8, 8)与X中每个批次的(8, 2)作传统意义上的矩阵乘法运算，各自形成一个新的(8, 2)，最后再沿批次维叠加成(4, 8, 2)。
• Note3：masked_fill将上三角（即未来位置）填充为-inf，得到：
``````tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
[0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
[0., 0., 0., -inf, -inf, -inf, -inf, -inf],
[0., 0., 0., 0., -inf, -inf, -inf, -inf],
[0., 0., 0., 0., 0., -inf, -inf, -inf],
[0., 0., 0., 0., 0., 0., -inf, -inf],
[0., 0., 0., 0., 0., 0., 0., -inf],
[0., 0., 0., 0., 0., 0., 0., 0.]])``````
• Note4：我们知道 e^{-inf} = 0，而每行其余未被屏蔽的项的值均相同，所以求softmax后，每行在未屏蔽位置上得到均匀的权重，即与Note2处相同的矩阵：
``````tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
[0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
[0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
[0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
[0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])``````

``````git clone https://gitee.com/yt7589/hwcgpt.git
cd hwcgpt
git checkout v0.0.6``````

## 2. 自注意力机制

``````def startup(self, args={}):
print('AppExp v0.0.1')
torch.manual_seed(1337)
B, T, C = 4, 8, AppRegistry.n_embed # B: batch_size；T：序列长度；C：通道数，即词汇维度；
X = torch.randn(B, T, C)
self.self_attention(X, B, T, C)

def self_attention(self, X, B, T, C):
k = W_K(X) # (B, T, h) # Note1
q = W_Q(X) # (B, T, h) # Note2
wei = q @ k.transpose(-2, -1) / (AppRegistry.head_size**0.5) # (B, T, h) @ (B, h, T) => (B, T, T) # Note3
tril = torch.tril(torch.ones(T, T))
wei = F.softmax(wei, dim=-1)
v = W_V(X)
out = wei @ v
print(f'out: {out.shape};')``````

## 3. 自注意力头

``````class Head(nn.Module):
self.W_K = nn.Linear(n_embed, head_size, bias = False)
self.W_Q = nn.Linear(n_embed, head_size, bias = False)
self.W_V = nn.Linear(n_embed, head_size, bias = False)

def forward(self, X):
B, T, C = X.shape
k = self.W_K(X) # (B, T, h)
q = self.W_Q(X) # (B, T, h)
wei = (q @ k.transpose(-2, -1) / (AppRegistry.head_size**0.5)).to(AppRegistry.device) # (B, T, h) @ (B, h, T) => (B, T, T)
tril = torch.tril(torch.ones(T, T)).to(AppRegistry.device)
wei = F.softmax(wei, dim=-1)
v = self.W_V(X)
return wei @ v``````

``````class BigramLanguageModel(nn.Module):
def __init__(self):
super().__init__()
# 词汇数，单词维度
self.token_embedding_table = nn.Embedding(AppRegistry.vocab_size, AppRegistry.n_embed)
self.position_embedding_table = nn.Embedding(AppRegistry.block_size, AppRegistry.n_embed)

def forward(self, idx):
B, T = idx.shape
tok_emb = self.token_embedding_table(idx) # (B, T, C) C=n_embed
pos_emb = self.position_embedding_table(torch.arange(T, device=AppRegistry.device))
x = tok_emb + pos_emb # (B, T, C)
logits = self.lm_head(x) # (B, T, vocab_size)
return logits

def generate(self, idx, max_new_tokens):
for _ in range(max_new_tokens):
idx_cond = idx[:, -AppRegistry.block_size:]
logits = self(idx_cond)
logits = logits[:, -1, :] # (B, T, C) => (B, C)
probs = F.softmax(logits, dim=-1) # (B, C)
idx_next = torch.multinomial(probs, num_samples=1)
idx = torch.cat((idx, idx_next), dim=1)
return idx``````

``````git clone https://gitee.com/yt7589/hwcgpt.git
cd hwcgpt
git checkout v0.0.7``````

## 4. 多头机制

``````class MultiHeadAttention(nn.Module):
self.proj = nn.Linear(AppRegistry.n_embed, AppRegistry.n_embed)

def forward(self, X):
X = torch.cat([h(X) for h in self.heads], dim=-1)
return self.proj(X)``````

``````class BigramLanguageModel(nn.Module):
def __init__(self):
super().__init__()
# 词汇数，单词维度
self.token_embedding_table = nn.Embedding(AppRegistry.vocab_size, AppRegistry.n_embed)
self.position_embedding_table = nn.Embedding(AppRegistry.block_size, AppRegistry.n_embed)

def forward(self, idx):
B, T = idx.shape
tok_emb = self.token_embedding_table(idx) # (B, T, C) C=n_embed
pos_emb = self.position_embedding_table(torch.arange(T, device=AppRegistry.device))
x = tok_emb + pos_emb # (B, T, C)
logits = self.lm_head(x) # (B, T, vocab_size)
return logits``````

``````git clone https://gitee.com/yt7589/hwcgpt.git
cd hwcgpt
git checkout v0.0.8``````

## 5. 添加前向传播网络

``````class FeedForward(nn.Module):
def __init__(self, n_embed):
super(FeedForward, self).__init__()
self.net = nn.Sequential(
nn.Linear(n_embed, 4 * n_embed),
nn.ReLU(),
nn.Linear(4 * n_embed, n_embed),
)

def forward(self, X):
return self.net(X)``````

``````class BigramLanguageModel(nn.Module):
def __init__(self):
super().__init__()
# 词汇数，单词维度
self.token_embedding_table = nn.Embedding(AppRegistry.vocab_size, AppRegistry.n_embed)
self.position_embedding_table = nn.Embedding(AppRegistry.block_size, AppRegistry.n_embed)
self.ffwd = FeedForward(AppRegistry.n_embed)

def forward(self, idx):
B, T = idx.shape
tok_emb = self.token_embedding_table(idx) # (B, T, C) C=n_embed
pos_emb = self.position_embedding_table(torch.arange(T, device=AppRegistry.device))
x = tok_emb + pos_emb # (B, T, C)
x = self.ffwd(x)
logits = self.lm_head(x) # (B, T, vocab_size)
return logits``````

``````git clone https://gitee.com/yt7589/hwcgpt.git
cd hwcgpt
git checkout v0.0.9``````

## 6. 添加Block

``````class TransformerEncoderBlock(nn.Module):
super(TransformerEncoderBlock, self).__init__()
self.ffwd = FeedForward(n_embed)

def forward(self, X):
X = self.sa(X)
X = self.ffwd(X)
return X``````

``````class BigramLanguageModel(nn.Module):
def __init__(self):
super().__init__()
# 词汇数，单词维度
self.token_embedding_table = nn.Embedding(AppRegistry.vocab_size, AppRegistry.n_embed)
self.position_embedding_table = nn.Embedding(AppRegistry.block_size, AppRegistry.n_embed)
self.blocks = nn.Sequential(
)

def forward(self, idx):
B, T = idx.shape
tok_emb = self.token_embedding_table(idx) # (B, T, C) C=n_embed
pos_emb = self.position_embedding_table(torch.arange(T, device=AppRegistry.device))
x = tok_emb + pos_emb # (B, T, C)
x = self.blocks(x)
logits = self.lm_head(x) # (B, T, vocab_size)
return logits``````

``````git clone https://gitee.com/yt7589/hwcgpt.git
cd hwcgpt
git checkout v0.0.10``````

## 7. 添加Residue连接和dropout

``````git clone https://gitee.com/yt7589/hwcgpt.git
cd hwcgpt
git checkout v0.1.0``````