In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import math
import re
from collections import Counter
import matplotlib.pyplot as plt
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
Using device: cuda
In [16]:
!wget https://www.manythings.org/anki/cmn-eng.zip -P /content
!unzip -o /content/cmn-eng.zip
--2025-11-16 06:22:51--  https://www.manythings.org/anki/cmn-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1337935 (1.3M) [application/zip]
Saving to: ‘/content/cmn-eng.zip.1’

cmn-eng.zip.1       100%[===================>]   1.28M  1.04MB/s    in 1.2s

2025-11-16 06:22:54 (1.04 MB/s) - ‘/content/cmn-eng.zip.1’ saved [1337935/1337935]

Archive:  cmn-eng.zip
replace cmn.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename:
In [17]:
train_data_pd = pd.read_csv('/content/cmn.txt', sep='\t', header=None, usecols=[0, 1])
train_data_pd.columns = ['english', 'chinese']
train_data = list(zip(train_data_pd['english'], train_data_pd['chinese']))
print(f"Total train_data: {len(train_data)}")
print(f"First 10 data: {train_data[:10]}")
Total train_data: 30979
First 10 data: [('Hi.', '嗨。'), ('Hi.', '你好。'), ('Run.', '你用跑的。'), ('Stay.', '待著。'), ('Stay.', '且慢。'), ('Stop!', '住手!'), ('Wait!', '等等!'), ('Wait!', '等一下!'), ('Begin.', '开始!'), ('Fight.', '開打。')]
In [18]:
!pip install zhconv
Requirement already satisfied: zhconv in /usr/local/lib/python3.12/dist-packages (1.4.3)
In [19]:
import jieba
import zhconv
class Tokenizer:
def __init__(self, language='en'):
self.language = language
self.word2idx = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
self.idx2word = {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>'}
self.vocab_size = 4
def fit(self, texts, max_vocab_size=10000):
word_counts = Counter()
for text in texts:
tokens = self.tokenize(text)
word_counts.update(tokens)
most_common = word_counts.most_common(max_vocab_size - 4)
for word, _ in most_common:
if word not in self.word2idx:
self.word2idx[word] = self.vocab_size
self.idx2word[self.vocab_size] = word
self.vocab_size += 1
def tokenize(self, text):
text = text.lower().strip()
if self.language == 'en':
text = re.sub(r"([.!?])", r" \1", text)
tokens = text.split()
        else:
            # Normalize traditional characters to simplified before segmenting.
            text = zhconv.convert(text, 'zh-cn')
            text = re.sub(r"([。!?,])", r" \1 ", text)
            # Note: jieba also returns the inserted spaces as tokens, which is
            # why the decoded Chinese output below shows spaces around punctuation.
            tokens = list(jieba.cut(text))
return tokens
def encode(self, text, max_len=None):
tokens = self.tokenize(text)
indices = [self.word2idx.get(token, 3) for token in tokens]
if max_len:
if len(indices) < max_len:
indices += [0] * (max_len - len(indices))
else:
indices = indices[:max_len]
return indices
def decode(self, indices):
tokens = [self.idx2word.get(idx, '<UNK>') for idx in indices
if idx not in [0, 1, 2]]
if self.language == 'en':
return ' '.join(tokens)
else:
return ''.join(tokens)
en_tokenizer = Tokenizer('en')
zh_tokenizer = Tokenizer('zh')
en_texts = [pair[0] for pair in train_data]
zh_texts = [pair[1] for pair in train_data]
en_tokenizer.fit(en_texts, max_vocab_size=8000)
zh_tokenizer.fit(zh_texts, max_vocab_size=8000)
print(f"English vocab size: {en_tokenizer.vocab_size}")
print(f"Chinese vocab size: {zh_tokenizer.vocab_size}")
English vocab size: 8000
Chinese vocab size: 8000
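As a quick sanity check (an aside, not part of the original run), the encode/decode round trip can be exercised on a single training pair; any word outside the 8000-word vocabulary maps to <UNK> (id 3):

# Hypothetical round-trip check for both tokenizers.
sample_en, sample_zh = train_data[2]          # ('Run.', '你用跑的。')
en_ids = en_tokenizer.encode(sample_en)
zh_ids = zh_tokenizer.encode(sample_zh)
print(en_ids, '->', en_tokenizer.decode(en_ids))
print(zh_ids, '->', zh_tokenizer.decode(zh_ids))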
In [20]:
class TranslationDataset(Dataset):
def __init__(self, data_pairs, src_tokenizer, tgt_tokenizer, max_len=50):
self.data_pairs = data_pairs
self.src_tokenizer = src_tokenizer
self.tgt_tokenizer = tgt_tokenizer
self.max_len = max_len
def __len__(self):
return len(self.data_pairs)
def __getitem__(self, idx):
src_text, tgt_text = self.data_pairs[idx]
src_indices = self.src_tokenizer.encode(src_text)
src_indices = [1] + src_indices + [2]
tgt_indices = self.tgt_tokenizer.encode(tgt_text)
tgt_indices = [1] + tgt_indices + [2]
        src_len = len(src_indices)
        tgt_len = len(tgt_indices)
        # Pad up to max_len; if a sequence is longer, truncate but keep <EOS>
        # as the final token so the sentence still terminates.
        src_indices = (src_indices + [0] * self.max_len)[:self.max_len]
        tgt_indices = (tgt_indices + [0] * self.max_len)[:self.max_len]
        if src_len > self.max_len:
            src_indices[-1] = 2
        if tgt_len > self.max_len:
            tgt_indices[-1] = 2
return {
'src': torch.tensor(src_indices, dtype=torch.long),
'tgt': torch.tensor(tgt_indices, dtype=torch.long),
'src_len': min(src_len, self.max_len),
'tgt_len': min(tgt_len, self.max_len)
}
# NOTE: the corpus is roughly sorted by sentence length, so this unshuffled
# 95/5 split leaves the longest (hardest) pairs in the validation set.
train_size = int(0.95 * len(train_data))
train_pairs = train_data[:train_size]
val_pairs = train_data[train_size:]
train_dataset = TranslationDataset(train_pairs, en_tokenizer, zh_tokenizer, max_len=50)
val_dataset = TranslationDataset(val_pairs, en_tokenizer, zh_tokenizer, max_len=50)
BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")
Train batches: 460, Val batches: 25
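A one-batch inspection (an assumed check, not from the original notebook) confirms the collated shapes: both src and tgt come out as (BATCH_SIZE, max_len) LongTensors, with the true lengths carried alongside:

# Hypothetical batch inspection.
batch = next(iter(train_loader))
print(batch['src'].shape, batch['tgt'].shape)    # torch.Size([64, 50]) for each
print(batch['src_len'][:5], batch['tgt_len'][:5])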
In [21]:
class PositionalEncoding(nn.Module):
def __init__(self, d_model, max_len=5000):
super().__init__()
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() *
(-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0) # (1, max_len, d_model)
self.register_buffer('pe', pe)
def forward(self, x):
return x + self.pe[:, :x.size(1), :]
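A small property check (assumed, not in the original run): the encoding is shape-preserving, slices the precomputed buffer to the input length, and its values stay within [-1, 1] since they are sines and cosines:

# Hypothetical check of PositionalEncoding.
_pe = PositionalEncoding(d_model=16, max_len=32)
_out = _pe(torch.zeros(1, 10, 16))
print(_out.shape, float(_out.min()), float(_out.max()))  # (1, 10, 16), values in [-1, 1]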
class MultiHeadAttention(nn.Module):
def __init__(self, d_model, num_heads, dropout=0.1):
super().__init__()
assert d_model % num_heads == 0
self.d_model = d_model
self.num_heads = num_heads
self.d_k = d_model // num_heads
self.W_q = nn.Linear(d_model, d_model)
self.W_k = nn.Linear(d_model, d_model)
self.W_v = nn.Linear(d_model, d_model)
self.W_o = nn.Linear(d_model, d_model)
self.dropout = nn.Dropout(dropout)
def scaled_dot_product_attention(self, Q, K, V, mask=None):
attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
if mask is not None:
attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
attn_probs = torch.softmax(attn_scores, dim=-1)
attn_probs = self.dropout(attn_probs)
output = torch.matmul(attn_probs, V)
return output
def split_heads(self, x):
# x: (batch_size, seq_len, d_model)
batch_size, seq_len, d_model = x.size()
return x.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
def combine_heads(self, x):
# x: (batch_size, num_heads, seq_len, d_k)
batch_size, num_heads, seq_len, d_k = x.size()
return x.transpose(1, 2).reshape(batch_size, seq_len, self.d_model)
def forward(self, Q, K, V, mask=None):
Q = self.split_heads(self.W_q(Q))
K = self.split_heads(self.W_k(K))
V = self.split_heads(self.W_v(V))
attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
output = self.W_o(self.combine_heads(attn_output))
return output
class FeedForward(nn.Module):
def __init__(self, d_model, d_ff, dropout=0.1):
super().__init__()
self.linear1 = nn.Linear(d_model, d_ff)
self.linear2 = nn.Linear(d_ff, d_model)
self.dropout = nn.Dropout(dropout)
self.relu = nn.ReLU()
def forward(self, x):
return self.linear2(self.dropout(self.relu(self.linear1(x))))
class EncoderLayer(nn.Module):
"""Encoder Layer"""
def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
super().__init__()
self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
self.feed_forward = FeedForward(d_model, d_ff, dropout)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.dropout = nn.Dropout(dropout)
def forward(self, x, mask):
# Self-attention
attn_output = self.self_attn(x, x, x, mask)
x = self.norm1(x + self.dropout(attn_output))
# Feed-forward
ff_output = self.feed_forward(x)
x = self.norm2(x + self.dropout(ff_output))
return x
class DecoderLayer(nn.Module):
def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
super().__init__()
self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)
self.feed_forward = FeedForward(d_model, d_ff, dropout)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.dropout = nn.Dropout(dropout)
def forward(self, x, enc_output, src_mask, tgt_mask):
# Masked self-attention
attn_output = self.self_attn(x, x, x, tgt_mask)
x = self.norm1(x + self.dropout(attn_output))
# Cross-attention
attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
x = self.norm2(x + self.dropout(attn_output))
# Feed-forward
ff_output = self.feed_forward(x)
x = self.norm3(x + self.dropout(ff_output))
return x
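A minimal shape smoke test for the two sublayer stacks (sizes assumed to match the model built below; mask=None simply skips masking):

# Hypothetical shape test for EncoderLayer / DecoderLayer.
_x = torch.randn(2, 10, 256)
_enc = EncoderLayer(d_model=256, num_heads=8, d_ff=512)
_dec = DecoderLayer(d_model=256, num_heads=8, d_ff=512)
_mem = _enc(_x, mask=None)
print(_mem.shape, _dec(_x, _mem, None, None).shape)  # both torch.Size([2, 10, 256])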
In [22]:
class Transformer(nn.Module):
def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512,
num_heads=8, num_layers=6, d_ff=2048, max_seq_len=100, dropout=0.1):
super().__init__()
self.d_model = d_model
# Embeddings
self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
# Positional Encoding
self.pos_encoding = PositionalEncoding(d_model, max_seq_len)
# Encoder
self.encoder_layers = nn.ModuleList([
EncoderLayer(d_model, num_heads, d_ff, dropout)
for _ in range(num_layers)
])
# Decoder
self.decoder_layers = nn.ModuleList([
DecoderLayer(d_model, num_heads, d_ff, dropout)
for _ in range(num_layers)
])
# Output layer
self.fc = nn.Linear(d_model, tgt_vocab_size)
self.dropout = nn.Dropout(dropout)
def generate_mask(self, src, tgt):
src_mask = (src != 0).unsqueeze(1).unsqueeze(2) # (batch_size, 1, 1, src_len)
tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3) # (batch_size, 1, tgt_len, 1)
seq_len = tgt.size(1)
nopeak_mask = (1 - torch.triu(torch.ones(1, seq_len, seq_len), diagonal=1)).bool()
nopeak_mask = nopeak_mask.to(tgt.device)
tgt_mask = tgt_mask & nopeak_mask
return src_mask, tgt_mask
def forward(self, src, tgt):
src_mask, tgt_mask = self.generate_mask(src, tgt)
# Encoder
src_embedded = self.dropout(self.pos_encoding(
self.encoder_embedding(src) * math.sqrt(self.d_model)))
enc_output = src_embedded
for layer in self.encoder_layers:
enc_output = layer(enc_output, src_mask)
# Decoder
tgt_embedded = self.dropout(self.pos_encoding(
self.decoder_embedding(tgt) * math.sqrt(self.d_model)))
dec_output = tgt_embedded
for layer in self.decoder_layers:
dec_output = layer(dec_output, enc_output, src_mask, tgt_mask)
# Output
output = self.fc(dec_output)
return output
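To see what generate_mask produces, a toy example (assumed sizes) helps: the source mask zeroes padded key positions, while the target mask combines the padding mask with the causal no-peek triangle:

# Hypothetical illustration of the two masks.
_tmp = Transformer(src_vocab_size=10, tgt_vocab_size=10, d_model=32,
                   num_heads=4, num_layers=1, d_ff=64, max_seq_len=8)
_src = torch.tensor([[1, 5, 2, 0, 0]])   # <SOS> w <EOS> <PAD> <PAD>
_tgt = torch.tensor([[1, 6, 7, 2, 0]])
_src_mask, _tgt_mask = _tmp.generate_mask(_src, _tgt)
print(_src_mask.shape, _tgt_mask.shape)   # (1, 1, 1, 5) and (1, 1, 5, 5)
print(_tgt_mask[0, 0].int())              # lower-triangular; the padded query row is all zeros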
In [23]:
from tqdm import tqdm
def train_epoch(model, dataloader, optimizer, criterion, device):
model.train()
total_loss = 0
progress_bar = tqdm(dataloader, desc='Training')
for batch in progress_bar:
src = batch['src'].to(device)
tgt = batch['tgt'].to(device)
# tgt_input excludes last token
# tgt_output excludes first token (<SOS>)
tgt_input = tgt[:, :-1]
tgt_output = tgt[:, 1:]
optimizer.zero_grad()
# Forward pass
outputs = model(src, tgt_input)
# Calculate loss
loss = criterion(
outputs.contiguous().view(-1, outputs.size(-1)),
tgt_output.contiguous().view(-1)
)
# Backward pass
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
total_loss += loss.item()
# Update progress bar
progress_bar.set_postfix({'loss': loss.item()})
return total_loss / len(dataloader)
def evaluate(model, dataloader, criterion, device):
model.eval()
total_loss = 0
progress_bar = tqdm(dataloader, desc='Evaluating')
with torch.no_grad():
for batch in progress_bar:
src = batch['src'].to(device)
tgt = batch['tgt'].to(device)
tgt_input = tgt[:, :-1]
tgt_output = tgt[:, 1:]
outputs = model(src, tgt_input)
loss = criterion(
outputs.contiguous().view(-1, outputs.size(-1)),
tgt_output.contiguous().view(-1)
)
total_loss += loss.item()
# Update progress bar
progress_bar.set_postfix({'loss': loss.item()})
return total_loss / len(dataloader)
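The tgt_input/tgt_output shift in train_epoch implements teacher forcing: the decoder is fed the target up to (but excluding) the last token and trained to predict the same sequence shifted one step left. A tiny illustration with made-up ids:

# Teacher-forcing shift, illustrated (hypothetical token ids).
_t = torch.tensor([[1, 10, 11, 12, 2]])  # <SOS> w1 w2 w3 <EOS>
print(_t[:, :-1])  # decoder input:   [[ 1, 10, 11, 12]]
print(_t[:, 1:])   # training target: [[10, 11, 12,  2]]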
# Initialize model
model = Transformer(
src_vocab_size=en_tokenizer.vocab_size,
tgt_vocab_size=zh_tokenizer.vocab_size,
d_model=256, # Reduced for small dataset
num_heads=8,
num_layers=4, # Fewer layers
d_ff=512,
max_seq_len=50,
dropout=0.1
).to(device)
# Optimizer and loss function
criterion = nn.CrossEntropyLoss(ignore_index=0) # Ignore padding
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
# Learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
factor=0.5, patience=2)
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
Model parameters: 11,423,552
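Aside: the Adam hyperparameters above (betas=(0.9, 0.98), eps=1e-9) are the ones from "Attention Is All You Need", where they are paired with a warmup-then-inverse-square-root learning-rate schedule rather than ReduceLROnPlateau. A minimal sketch of that alternative, assuming 4000 warmup steps and an optimizer built with lr=1.0 so the lambda gives the absolute rate:

# Sketch of the original paper's "Noam" schedule (not used in the run above).
def noam_lambda(step, d_model=256, warmup_steps=4000):
    step = max(step, 1)
    return (d_model ** -0.5) * min(step ** -0.5, step * warmup_steps ** -1.5)

# noam_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=noam_lambda)
# Step it after every optimizer.step(), not once per epoch.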
In [24]:
# Training
NUM_EPOCHS = 30
train_losses = []
val_losses = []
best_val_loss = float('inf')
for epoch in range(NUM_EPOCHS):
train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
val_loss = evaluate(model, val_loader, criterion, device)
train_losses.append(train_loss)
val_losses.append(val_loss)
scheduler.step(val_loss)
if val_loss < best_val_loss:
best_val_loss = val_loss
torch.save(model.state_dict(), 'best_transformer.pth')
print(f'Epoch {epoch+1}/{NUM_EPOCHS}')
print(f'Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}')
print('-' * 50)
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Training Progress')
plt.show()
Per epoch: 460 training batches (~22 it/s) and 25 eval batches (~47 it/s), about 21 s each.
Epoch  1/30  Train Loss: 3.9005 | Val Loss: 4.2946
Epoch  2/30  Train Loss: 3.0703 | Val Loss: 4.0617
Epoch  3/30  Train Loss: 2.7973 | Val Loss: 3.9491
Epoch  4/30  Train Loss: 2.5948 | Val Loss: 3.8160
Epoch  5/30  Train Loss: 2.4321 | Val Loss: 3.7317
Epoch  6/30  Train Loss: 2.2946 | Val Loss: 3.6690
Epoch  7/30  Train Loss: 2.1696 | Val Loss: 3.5938
Epoch  8/30  Train Loss: 2.0589 | Val Loss: 3.5613
Epoch  9/30  Train Loss: 1.9613 | Val Loss: 3.5195
Epoch 10/30  Train Loss: 1.8659 | Val Loss: 3.4952
Epoch 11/30  Train Loss: 1.7810 | Val Loss: 3.4752
Epoch 12/30  Train Loss: 1.7007 | Val Loss: 3.4483
Epoch 13/30  Train Loss: 1.6243 | Val Loss: 3.4287
Epoch 14/30  Train Loss: 1.5552 | Val Loss: 3.3886
Epoch 15/30  Train Loss: 1.4875 | Val Loss: 3.3707
Epoch 16/30  Train Loss: 1.4242 | Val Loss: 3.3600
Epoch 17/30  Train Loss: 1.3662 | Val Loss: 3.3761
Epoch 18/30  Train Loss: 1.3091 | Val Loss: 3.3516
Epoch 19/30  Train Loss: 1.2558 | Val Loss: 3.3754
Epoch 20/30  Train Loss: 1.2045 | Val Loss: 3.3717
Epoch 21/30  Train Loss: 1.1594 | Val Loss: 3.3566
Epoch 22/30  Train Loss: 1.0825 | Val Loss: 3.3529
Epoch 23/30  Train Loss: 1.0526 | Val Loss: 3.3413
Epoch 24/30  Train Loss: 1.0297 | Val Loss: 3.3589
Epoch 25/30  Train Loss: 1.0047 | Val Loss: 3.3636
Epoch 26/30  Train Loss: 0.9848 | Val Loss: 3.3608
Epoch 27/30  Train Loss: 0.9498 | Val Loss: 3.3593
Epoch 28/30  Train Loss: 0.9320 | Val Loss: 3.3656
Epoch 29/30  Train Loss: 0.9223 | Val Loss: 3.3663
Epoch 30/30  Train Loss: 0.9038 | Val Loss: 3.3837
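Note that the loop above saved the best checkpoint (epoch 23, val loss 3.3413), but the model left in memory is the epoch-30 one; reloading the saved weights before decoding recovers the best validation model:

# Restore the checkpoint with the lowest validation loss before inference.
model.load_state_dict(torch.load('best_transformer.pth', map_location=device))
model.eval()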
In [26]:
def translate(model, sentence, src_tokenizer, tgt_tokenizer,
device, beam_width=3, max_len=50):
model.eval()
src_indices = [1] + src_tokenizer.encode(sentence) + [2]
src_tensor = torch.tensor(src_indices, dtype=torch.long).unsqueeze(0).to(device)
# Initialize beam
beams = [([1], 0.0)] # (sequence, score)
for _ in range(max_len):
new_beams = []
for seq, score in beams:
if seq[-1] == 2: # Already ended
new_beams.append((seq, score))
continue
tgt_tensor = torch.tensor(seq, dtype=torch.long).unsqueeze(0).to(device)
with torch.no_grad():
output = model(src_tensor, tgt_tensor)
# Get top-k candidates
log_probs = torch.log_softmax(output[0, -1, :], dim=-1)
top_k_probs, top_k_indices = torch.topk(log_probs, beam_width)
for prob, idx in zip(top_k_probs, top_k_indices):
new_seq = seq + [idx.item()]
new_score = score + prob.item()
new_beams.append((new_seq, new_score))
        # Keep the top beam_width beams. Scores are raw sums of log-probs
        # (no length normalization), which slightly favors shorter outputs.
        beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]
if all(seq[-1] == 2 for seq, _ in beams):
break
best_seq = beams[0][0]
translation = tgt_tokenizer.decode(best_seq)
return translation
test_sentences = [
"Hi.",
"Wait!",
"I love you.",
"Good morning.",
"How are you?",
"I am an engineer",
"Can you speak Chinese",
"Do you like playing guitar?"
]
for sentence in test_sentences:
translation = translate(model, sentence, en_tokenizer, zh_tokenizer, device)
print(f"EN: {sentence}")
print(f"ZH: {translation}")
print()
EN: Hi.
ZH: 你好 。

EN: Wait!
ZH: 等等 !

EN: I love you.
ZH: 我爱你 。

EN: Good morning.
ZH: 早上好 。

EN: How are you?
ZH: 你们好吗 ?

EN: I am an engineer
ZH: 我是一名工程师 。

EN: Can you speak Chinese
ZH: 你会讲中文 ?

EN: Do you like playing guitar?
ZH: 你喜欢吉他吗 ?
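For comparison, a greedy decoder (a minimal sketch reusing the same model; equivalent to beam_width=1 without the bookkeeping) picks the argmax token at each step:

# Hypothetical greedy-decoding baseline.
def translate_greedy(model, sentence, src_tokenizer, tgt_tokenizer, device, max_len=50):
    model.eval()
    src = torch.tensor([[1] + src_tokenizer.encode(sentence) + [2]], device=device)
    seq = [1]                                    # start with <SOS>
    for _ in range(max_len):
        tgt = torch.tensor([seq], device=device)
        with torch.no_grad():
            logits = model(src, tgt)
        next_id = logits[0, -1].argmax().item()  # most likely next token
        seq.append(next_id)
        if next_id == 2:                         # stop at <EOS>
            break
    return tgt_tokenizer.decode(seq)

print(translate_greedy(model, "How are you?", en_tokenizer, zh_tokenizer, device))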