If you're interested, let's figure out together how to get it to spit out all of its code.
import torch
import torch.nn as nn


class TransformerBlock(nn.Module):
    def __init__(self, embedding_dim, num_heads, feedforward_dim, dropout_rate):
        super(TransformerBlock, self).__init__()
        # Multi-head self-attention layer
        self.self_attention = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads, dropout=dropout_rate)
        # Layer normalization after self-attention
        self.layer_norm_1 = nn.LayerNorm(embedding_dim)
        # Feedforward neural network
        self.feedforward = nn.Sequential(
            nn.Linear(embedding_dim, feedforward_dim),
            nn.ReLU(),
            nn.Linear(feedforward_dim, embedding_dim)
        )
        # Layer normalization after feedforward network
        self.layer_norm_2 = nn.LayerNorm(embedding_dim)
        # Dropout layer
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # Multi-head self-attention
        attention_output, _ = self.self_attention(x, x, x)
        # Add residual connection and apply layer normalization
        x = self.layer_norm_1(x + self.dropout(attention_output))
        # Feedforward network
        feedforward_output = self.feedforward(x)
        # Add residual connection and apply layer normalization
        x = self.layer_norm_2(x + self.dropout(feedforward_output))
        return x
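Quick sanity check for the block (my own sketch, not part of the original code, and the dimensions are made-up example values): nn.MultiheadAttention defaults to batch_first=False, so this block expects input shaped (seq_len, batch_size, embedding_dim).

# Hypothetical smoke test for TransformerBlock; all sizes are arbitrary example values
block = TransformerBlock(embedding_dim=64, num_heads=4, feedforward_dim=256, dropout_rate=0.1)
dummy = torch.randn(10, 2, 64)  # (seq_len, batch_size, embedding_dim), MultiheadAttention's default layout
out = block(dummy)
print(out.shape)  # torch.Size([10, 2, 64])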
class GPT3(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, feedforward_dim, num_layers, dropout_rate):
        super(GPT3, self).__init__()
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Transformer layers
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(embedding_dim, num_heads, feedforward_dim, dropout_rate)
            for _ in range(num_layers)
        ])
        # Output layer
        self.output_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_ids):
        # Embedding layer
        embedded_input = self.embedding(input_ids)
        # Pass through transformer layers
        for transformer_block in self.transformer_blocks:
            embedded_input = transformer_block(embedded_input)
        # Output layer
        logits = self.output_layer(embedded_input)
        return logits
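If you want to actually run it, here's a minimal usage sketch; the hyperparameter values below are toy numbers I picked, nothing like real GPT-3 settings. Also note that as written the model has no positional embeddings and no causal attention mask, so it isn't truly autoregressive; treat it as a skeleton of the architecture rather than a working GPT.

# Minimal usage sketch; all hyperparameters are toy values, not real GPT-3 settings
model = GPT3(vocab_size=1000, embedding_dim=64, num_heads=4,
             feedforward_dim=256, num_layers=2, dropout_rate=0.1)
input_ids = torch.randint(0, 1000, (10, 2))  # (seq_len, batch_size), matching MultiheadAttention's default layout
logits = model(input_ids)
print(logits.shape)  # torch.Size([10, 2, 1000]) -- one score per vocabulary token at each position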