import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import math
import re
from collections import Counter
import matplotlib.pyplot as plt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda

!wget https://www.manythings.org/anki/cmn-eng.zip -P /content
!unzip cmn-eng.zip

--2025-11-16 06:22:51--  https://www.manythings.org/anki/cmn-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1337935 (1.3M) [application/zip]
Saving to: ‘/content/cmn-eng.zip.1’

cmn-eng.zip.1       100%[===================>]   1.28M  1.04MB/s    in 1.2s    

2025-11-16 06:22:54 (1.04 MB/s) - ‘/content/cmn-eng.zip.1’ saved [1337935/1337935]

Archive:  cmn-eng.zip
replace cmn.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename:

train_data_pd = pd.read_csv('/content/cmn.txt', sep='\t', header=None, usecols=[0, 1])
train_data_pd.columns = ['english', 'chinese']
train_data = list(zip(train_data_pd['english'], train_data_pd['chinese']))

print(f"Total train_data: {len(train_data)}")
print(f"First 10 data: {train_data[:10]}")

Total train_data: 30979
First 10 data: [('Hi.', '嗨。'), ('Hi.', '你好。'), ('Run.', '你用跑的。'), ('Stay.', '待著。'), ('Stay.', '且慢。'), ('Stop!', '住手！'), ('Wait!', '等等！'), ('Wait!', '等一下！'), ('Begin.', '开始！'), ('Fight.', '開打。')]

!pip install zhconv

Requirement already satisfied: zhconv in /usr/local/lib/python3.12/dist-packages (1.4.3)

import jieba
import zhconv

class Tokenizer:
  def __init__(self, language='en'):
    self.language = language
    self.word2idx = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
    self.idx2word = {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>'}
    self.vocab_size = 4

  def fit(self, texts, max_vocab_size=10000):
    word_counts = Counter()
    for text in texts:
      tokens = self.tokenize(text)
      word_counts.update(tokens)

    most_common = word_counts.most_common(max_vocab_size - 4)

    for word, _ in most_common:
      if word not in self.word2idx:
        self.word2idx[word] = self.vocab_size
        self.idx2word[self.vocab_size] = word
        self.vocab_size += 1

  def tokenize(self, text):
    text = text.lower().strip()
    if self.language == 'en':
      text = re.sub(r"([.!?])", r" \1", text)
      tokens = text.split()
    else:
      text = zhconv.convert(text, 'zh-cn')
      text = re.sub(r"([。！？，])", r" \1 ", text)
      tokens = list(jieba.cut(text))
    return tokens

  def encode(self, text, max_len=None):
    tokens = self.tokenize(text)
    indices = [self.word2idx.get(token, 3) for token in tokens]

    if max_len:
      if len(indices) < max_len:
        indices += [0] * (max_len - len(indices))
      else:
        indices = indices[:max_len]

    return indices

  def decode(self, indices):
    tokens = [self.idx2word.get(idx, '<UNK>') for idx in indices
             if idx not in [0, 1, 2]]

    if self.language == 'en':
      return ' '.join(tokens)
    else:
      return ''.join(tokens)

en_tokenizer = Tokenizer('en')
zh_tokenizer = Tokenizer('zh')

en_texts = [pair[0] for pair in train_data]
zh_texts = [pair[1] for pair in train_data]

en_tokenizer.fit(en_texts, max_vocab_size=8000)
zh_tokenizer.fit(zh_texts, max_vocab_size=8000)

print(f"English vocab size: {en_tokenizer.vocab_size}")
print(f"Chinese vocab size: {zh_tokenizer.vocab_size}")

English vocab size: 8000
Chinese vocab size: 8000

class TranslationDataset(Dataset):
  def __init__(self, data_pairs, src_tokenizer, tgt_tokenizer, max_len=50):
    self.data_pairs = data_pairs
    self.src_tokenizer = src_tokenizer
    self.tgt_tokenizer = tgt_tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.data_pairs)

  def __getitem__(self, idx):
    src_text, tgt_text = self.data_pairs[idx]

    src_indices = self.src_tokenizer.encode(src_text)
    src_indices = [1] + src_indices + [2]

    tgt_indices = self.tgt_tokenizer.encode(tgt_text)
    tgt_indices = [1] + tgt_indices + [2]

    src_len = len(src_indices)
    tgt_len = len(tgt_indices)

    src_indices += [0] * (self.max_len - len(src_indices))
    tgt_indices += [0] * (self.max_len - len(tgt_indices))

    src_indices = src_indices[:self.max_len]
    tgt_indices = tgt_indices[:self.max_len]

    return {
      'src': torch.tensor(src_indices, dtype=torch.long),
      'tgt': torch.tensor(tgt_indices, dtype=torch.long),
      'src_len': min(src_len, self.max_len),
      'tgt_len': min(tgt_len, self.max_len)
    }

train_size = int(0.95 * len(train_data))
train_pairs = train_data[:train_size]
val_pairs = train_data[train_size:]

train_dataset = TranslationDataset(train_pairs, en_tokenizer, zh_tokenizer, max_len=50)
val_dataset = TranslationDataset(val_pairs, en_tokenizer, zh_tokenizer, max_len=50)

BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")

Train batches: 460, Val batches: 25

class PositionalEncoding(nn.Module):
  def __init__(self, d_model, max_len=5000):
    super().__init__()

    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                        (-math.log(10000.0) / d_model))

    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)

    pe = pe.unsqueeze(0)  # (1, max_len, d_model)
    self.register_buffer('pe', pe)

  def forward(self, x):
    return x + self.pe[:, :x.size(1), :]


class MultiHeadAttention(nn.Module):
  def __init__(self, d_model, num_heads, dropout=0.1):
    super().__init__()
    assert d_model % num_heads == 0

    self.d_model = d_model
    self.num_heads = num_heads
    self.d_k = d_model // num_heads

    self.W_q = nn.Linear(d_model, d_model)
    self.W_k = nn.Linear(d_model, d_model)
    self.W_v = nn.Linear(d_model, d_model)
    self.W_o = nn.Linear(d_model, d_model)

    self.dropout = nn.Dropout(dropout)

  def scaled_dot_product_attention(self, Q, K, V, mask=None):
    attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

    if mask is not None:
      attn_scores = attn_scores.masked_fill(mask == 0, -1e9)

    attn_probs = torch.softmax(attn_scores, dim=-1)
    attn_probs = self.dropout(attn_probs)

    output = torch.matmul(attn_probs, V)
    return output

  def split_heads(self, x):
    # x: (batch_size, seq_len, d_model)
    batch_size, seq_len, d_model = x.size()
    return x.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

  def combine_heads(self, x):
    # x: (batch_size, num_heads, seq_len, d_k)
    batch_size, num_heads, seq_len, d_k = x.size()
    return x.transpose(1, 2).reshape(batch_size, seq_len, self.d_model)

  def forward(self, Q, K, V, mask=None):
    Q = self.split_heads(self.W_q(Q))
    K = self.split_heads(self.W_k(K))
    V = self.split_heads(self.W_v(V))

    attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
    output = self.W_o(self.combine_heads(attn_output))
    return output


class FeedForward(nn.Module):
  def __init__(self, d_model, d_ff, dropout=0.1):
    super().__init__()
    self.linear1 = nn.Linear(d_model, d_ff)
    self.linear2 = nn.Linear(d_ff, d_model)
    self.dropout = nn.Dropout(dropout)
    self.relu = nn.ReLU()

  def forward(self, x):
    return self.linear2(self.dropout(self.relu(self.linear1(x))))


class EncoderLayer(nn.Module):
  """Encoder Layer"""
  def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
    super().__init__()
    self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
    self.feed_forward = FeedForward(d_model, d_ff, dropout)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, mask):
    # Self-attention
    attn_output = self.self_attn(x, x, x, mask)
    x = self.norm1(x + self.dropout(attn_output))

    # Feed-forward
    ff_output = self.feed_forward(x)
    x = self.norm2(x + self.dropout(ff_output))

    return x


class DecoderLayer(nn.Module):
  def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
    super().__init__()
    self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
    self.cross_attn = MultiHeadAttention(d_model, num_heads, dropout)
    self.feed_forward = FeedForward(d_model, d_ff, dropout)

    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.norm3 = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, enc_output, src_mask, tgt_mask):
    # Masked self-attention
    attn_output = self.self_attn(x, x, x, tgt_mask)
    x = self.norm1(x + self.dropout(attn_output))

    # Cross-attention
    attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
    x = self.norm2(x + self.dropout(attn_output))

    # Feed-forward
    ff_output = self.feed_forward(x)
    x = self.norm3(x + self.dropout(ff_output))

    return x

class Transformer(nn.Module):
  def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512,
               num_heads=8, num_layers=6, d_ff=2048, max_seq_len=100, dropout=0.1):
    super().__init__()

    self.d_model = d_model

    # Embeddings
    self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
    self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)

    # Positional Encoding
    self.pos_encoding = PositionalEncoding(d_model, max_seq_len)

    # Encoder
    self.encoder_layers = nn.ModuleList([
      EncoderLayer(d_model, num_heads, d_ff, dropout)
      for _ in range(num_layers)
    ])

    # Decoder
    self.decoder_layers = nn.ModuleList([
      DecoderLayer(d_model, num_heads, d_ff, dropout)
      for _ in range(num_layers)
    ])

    # Output layer
    self.fc = nn.Linear(d_model, tgt_vocab_size)
    self.dropout = nn.Dropout(dropout)

  def generate_mask(self, src, tgt):
    src_mask = (src != 0).unsqueeze(1).unsqueeze(2)  # (batch_size, 1, 1, src_len)
    tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)  # (batch_size, 1, tgt_len, 1)

    seq_len = tgt.size(1)
    nopeak_mask = (1 - torch.triu(torch.ones(1, seq_len, seq_len), diagonal=1)).bool()
    nopeak_mask = nopeak_mask.to(tgt.device)
    tgt_mask = tgt_mask & nopeak_mask

    return src_mask, tgt_mask

  def forward(self, src, tgt):
    src_mask, tgt_mask = self.generate_mask(src, tgt)

    # Encoder
    src_embedded = self.dropout(self.pos_encoding(
      self.encoder_embedding(src) * math.sqrt(self.d_model)))

    enc_output = src_embedded
    for layer in self.encoder_layers:
      enc_output = layer(enc_output, src_mask)

    # Decoder
    tgt_embedded = self.dropout(self.pos_encoding(
      self.decoder_embedding(tgt) * math.sqrt(self.d_model)))

    dec_output = tgt_embedded
    for layer in self.decoder_layers:
      dec_output = layer(dec_output, enc_output, src_mask, tgt_mask)

    # Output
    output = self.fc(dec_output)
    return output

from tqdm import tqdm

def train_epoch(model, dataloader, optimizer, criterion, device):
  model.train()
  total_loss = 0

  progress_bar = tqdm(dataloader, desc='Training')

  for batch in progress_bar:
    src = batch['src'].to(device)
    tgt = batch['tgt'].to(device)

    # tgt_input excludes last token
    # tgt_output excludes first token (<SOS>)
    tgt_input = tgt[:, :-1]
    tgt_output = tgt[:, 1:]

    optimizer.zero_grad()

    # Forward pass
    outputs = model(src, tgt_input)

    # Calculate loss
    loss = criterion(
      outputs.contiguous().view(-1, outputs.size(-1)),
      tgt_output.contiguous().view(-1)
    )

    # Backward pass
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

    total_loss += loss.item()

    # Update progress bar
    progress_bar.set_postfix({'loss': loss.item()})

  return total_loss / len(dataloader)


def evaluate(model, dataloader, criterion, device):
  model.eval()
  total_loss = 0

  progress_bar = tqdm(dataloader, desc='Evaluating')

  with torch.no_grad():
    for batch in progress_bar:
      src = batch['src'].to(device)
      tgt = batch['tgt'].to(device)

      tgt_input = tgt[:, :-1]
      tgt_output = tgt[:, 1:]

      outputs = model(src, tgt_input)
      loss = criterion(
        outputs.contiguous().view(-1, outputs.size(-1)),
        tgt_output.contiguous().view(-1)
      )

      total_loss += loss.item()

      # Update progress bar
      progress_bar.set_postfix({'loss': loss.item()})

  return total_loss / len(dataloader)


# Initialize model
model = Transformer(
  src_vocab_size=en_tokenizer.vocab_size,
  tgt_vocab_size=zh_tokenizer.vocab_size,
  d_model=256,          # Reduced for small dataset
  num_heads=8,
  num_layers=4,         # Fewer layers
  d_ff=512,
  max_seq_len=50,
  dropout=0.1
).to(device)

# Optimizer and loss function
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                factor=0.5, patience=2)

print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

Model parameters: 11,423,552

# Training
NUM_EPOCHS = 30
train_losses = []
val_losses = []
best_val_loss = float('inf')

for epoch in range(NUM_EPOCHS):
  train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
  val_loss = evaluate(model, val_loader, criterion, device)

  train_losses.append(train_loss)
  val_losses.append(val_loss)

  scheduler.step(val_loss)

  if val_loss < best_val_loss:
    best_val_loss = val_loss
    torch.save(model.state_dict(), 'best_transformer.pth')

  print(f'Epoch {epoch+1}/{NUM_EPOCHS}')
  print(f'Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}')
  print('-' * 50)

plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Training Progress')
plt.show()

Training: 100%|██████████| 460/460 [00:21<00:00, 21.48it/s, loss=3.21]
Evaluating: 100%|██████████| 25/25 [00:00<00:00, 44.87it/s, loss=4.89]

Epoch 1/30
Train Loss: 3.9005 | Val Loss: 4.2946
--------------------------------------------------

Training: 100%|██████████| 460/460 [00:20<00:00, 21.93it/s, loss=3.1]
Evaluating: 100%|██████████| 25/25 [00:00<00:00, 47.30it/s, loss=4.62]

Epoch 2/30
Train Loss: 3.0703 | Val Loss: 4.0617
--------------------------------------------------

Training: 100%|██████████| 460/460 [00:20<00:00, 22.09it/s, loss=2.8]
Evaluating: 100%|██████████| 25/25 [00:00<00:00, 49.01it/s, loss=4.58]

Epoch 3/30
Train Loss: 2.7973 | Val Loss: 3.9491
--------------------------------------------------

Training: 100%|██████████| 460/460 [00:20<00:00, 22.09it/s, loss=2.75]
Evaluating: 100%|██████████| 25/25 [00:00<00:00, 47.61it/s, loss=4.33]

Epoch 4/30
Train Loss: 2.5948 | Val Loss: 3.8160
--------------------------------------------------

Training: 100%|██████████| 460/460 [00:20<00:00, 22.12it/s, loss=2.28]
Evaluating: 100%|██████████| 25/25 [00:00<00:00, 46.64it/s, loss=4.3]

Epoch 5/30
Train Loss: 2.4321 | Val Loss: 3.7317
--------------------------------------------------

def translate(model, sentence, src_tokenizer, tgt_tokenizer,
                         device, beam_width=3, max_len=50):
  model.eval()

  src_indices = [1] + src_tokenizer.encode(sentence) + [2]
  src_tensor = torch.tensor(src_indices, dtype=torch.long).unsqueeze(0).to(device)

  # Initialize beam
  beams = [([1], 0.0)]  # (sequence, score)

  for _ in range(max_len):
    new_beams = []

    for seq, score in beams:
      if seq[-1] == 2:  # Already ended
        new_beams.append((seq, score))
        continue

      tgt_tensor = torch.tensor(seq, dtype=torch.long).unsqueeze(0).to(device)

      with torch.no_grad():
        output = model(src_tensor, tgt_tensor)

      # Get top-k candidates
      log_probs = torch.log_softmax(output[0, -1, :], dim=-1)
      top_k_probs, top_k_indices = torch.topk(log_probs, beam_width)

      for prob, idx in zip(top_k_probs, top_k_indices):
        new_seq = seq + [idx.item()]
        new_score = score + prob.item()
        new_beams.append((new_seq, new_score))

    # Keep top beam_width beams
    beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

    if all(seq[-1] == 2 for seq, _ in beams):
      break

  best_seq = beams[0][0]
  translation = tgt_tokenizer.decode(best_seq)
  return translation

test_sentences = [
    "Hi.",
    "Wait!",
    "I love you.",
    "Good morning.",
    "How are you?",
    "I am an engineer",
    "Can you speak Chinese",
    "Do you like playing guitar?"
]

for sentence in test_sentences:
    translation = translate(model, sentence, en_tokenizer, zh_tokenizer, device)
    print(f"EN: {sentence}")
    print(f"ZH: {translation}")
    print()

EN: Hi.
ZH: 你好 。 

EN: Wait!
ZH: 等等 ！ 

EN: I love you.
ZH: 我爱你 。 

EN: Good morning.
ZH: 早上好 。 

EN: How are you?
ZH: 你们好吗 ？ 

EN: I am an engineer
ZH: 我是一名工程师 。 

EN: Can you speak Chinese
ZH: 你会讲中文 ？ 

EN: Do you like playing guitar?
ZH: 你喜欢吉他吗 ？