Transformer Review

No need for many words; here is the code.

```python
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1024):
        super().__init__()
        self.d_model = d_model
        pe = torch.zeros(max_len, d_model)
        for pos in range(max_len):
            for i in range(0, d_model, 2):
                # i already steps by 2, so the exponent is i / d_model;
                # each sin/cos pair shares the same frequency
                pe[pos, i] = math.sin(pos / (10000 ** (i / d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** (i / d_model)))
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # scale the embeddings, then add the (non-trainable) positional encodings
        x = x * math.sqrt(self.d_model)
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len, :]
        return x


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.h = heads
        self.d_k = d_model // heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def attention(self, q, k, v, d_k, mask=None, dropout=None):
        # scaled dot-product attention: softmax(QK^T / sqrt(d_k)) V
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            mask = mask.unsqueeze(1)  # broadcast over the head dimension
            scores = scores.masked_fill(mask == 0, -1e9)
        scores = F.softmax(scores, dim=-1)
        if dropout is not None:
            scores = dropout(scores)
        return torch.matmul(scores, v)

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)
        q = self.q_linear(q)
        k = self.k_linear(k)
        v = self.v_linear(v)
        # split into heads: (batch, seq_len, h, d_k) -> (batch, h, seq_len, d_k)
        q = q.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        k = k.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        v = v.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        scores = self.attention(q, k, v, self.d_k, mask, self.dropout)
        # merge the heads back into (batch, seq_len, d_model)
        concat = scores.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.out(concat)


class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.dropout(F.relu(self.linear1(x)))
        return self.linear2(x)


class NormLayer(nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        self.alpha = nn.Parameter(torch.ones(d_model))
        self.bias = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        # layer norm over the last (feature) dimension with learnable scale/shift
        return self.alpha * (x - x.mean(dim=-1, keepdim=True)) / \
            (x.std(dim=-1, keepdim=True) + self.eps) + self.bias


class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pe = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([EncoderLayer(d_model, heads, dropout) for _ in range(N)])
        self.norm = NormLayer(d_model)

    def forward(self, src, mask):
        x = self.pe(self.embed(src))
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = NormLayer(d_model)
        self.norm_2 = NormLayer(d_model)
        self.attn = MultiHeadAttention(d_model, heads, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        # pre-norm residual blocks: self-attention, then feed-forward
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x


class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pe = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([DecoderLayer(d_model, heads, dropout) for _ in range(N)])
        self.norm = NormLayer(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.pe(self.embed(trg))
        for layer in self.layers:
            x = layer(x, e_outputs, src_mask, trg_mask)
        return self.norm(x)


class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = NormLayer(d_model)
        self.norm_2 = NormLayer(d_model)
        self.norm_3 = NormLayer(d_model)
        self.attn_1 = MultiHeadAttention(d_model, heads, dropout=dropout)
        self.attn_2 = MultiHeadAttention(d_model, heads, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        # masked self-attention, encoder-decoder cross-attention, feed-forward
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x


class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask)
        d_outputs = self.decoder(trg, e_outputs, src_mask, trg_mask)
        return self.out(d_outputs)
```
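A quick smoke test of the model above might look like the following; the vocabulary sizes, batch shapes, and mask construction here are illustrative assumptions, not part of the original post.

```python
# Toy dimensions for a sanity check (all values are illustrative).
src_vocab, trg_vocab = 1000, 1000
model = Transformer(src_vocab, trg_vocab, d_model=512, N=6, heads=8, dropout=0.1)

src = torch.randint(0, src_vocab, (2, 10))   # (batch, src_len)
trg = torch.randint(0, trg_vocab, (2, 12))   # (batch, trg_len)

# Padding mask for the source (nothing padded here) and a causal
# "no-peek" mask for the target; both broadcast over the head dimension.
src_mask = torch.ones(2, 1, 10, dtype=torch.bool)
trg_mask = torch.tril(torch.ones(1, 12, 12)).bool()

logits = model(src, trg, src_mask, trg_mask)
print(logits.shape)  # torch.Size([2, 12, 1000])
```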

June 4, 2025 · 2 min · 622 words · ZhaoYang

DocPuzzle: A Process-Aware Benchmark for Evaluating Realistic Long-Context Reasoning Capabilities

Paper overview — Title: DocPuzzle: A Process-Aware Benchmark for Evaluating Realistic Long-Context Reasoning Capabilities. Main contribution: a new benchmark dedicated to evaluating the long-context reasoning ability of large language models. Data scale: 100 expert-level QA questions covering 5 real-world domains ...

April 6, 2025 · 6 min · 2955 words · ZhaoYang

FollowBench: A Multi-level Fine-grained Constraints Following Benchmark for Large Language Models

Paper overview — Title: FollowBench: A Multi-level Fine-grained Constraints Following Benchmark for Large Language Models. Data scale: 820 carefully designed instructions covering 50+ NLP tasks. Core innovation: the first multi-level, fine-grained evaluation framework for constraint following ...

March 27, 2025 · 9 min · 4458 words · ZhaoYang

WritingBench: A Comprehensive Benchmark for Generative Writing

Paper overview — Title: WritingBench: A Comprehensive Benchmark for Generative Writing. Data scale: 1,239 carefully designed queries spanning 6 core domains and 100 subdomains. Core innovation: the first query-dependent evaluation framework, dynamically generating instance-specific criteria ...

March 27, 2025 · 7 min · 3213 words · ZhaoYang

LongEval: A Comprehensive Analysis of Long-Text Generation Through a Plan-based Paradigm

Paper overview — Title: LongEval: A Comprehensive Analysis of Long-Text Generation Through a Plan-based Paradigm. Institutions: multiple institutions. Data scale: 166 real long-text samples across three domains. Core innovation: the first dual-paradigm evaluation framework for long-text generation (direct generation vs. plan-based generation) ...

March 25, 2025 · 12 min · 5563 words · ZhaoYang

SciQAG: A Framework for Auto-Generated Science Question Answering Dataset with Fine-grained Evaluation

Paper: https://arxiv.org/abs/2405.09939 · Code: https://github.com/MasterAI-EAM/SciQAG/ ...

March 12, 2025 · 6 min · 2560 words · ZhaoYang

TEST-TIME TRAINING ON NEAREST NEIGHBORS FOR LARGE LANGUAGE MODELS

Published at ICLR 2024. Recent work has focused on augmenting retrieval-capable LLMs by adding the retrieved data to the input context. Although this approach works well, the retrieved data must be appended at both training and test time, and because the input length grows linearly with the amount of retrieved data, the Transformer's complexity and computational cost rise sharply. ...
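The title names the alternative: rather than appending neighbors to the prompt, the model is briefly fine-tuned on them at test time. A rough sketch of such a loop is below, assuming a Hugging Face-style causal LM interface and a user-supplied retriever; the function name, hyperparameters, and retriever are placeholders for illustration, not the paper's actual recipe.

```python
import copy
import torch

def test_time_train_on_neighbors(model, tokenizer, retrieve_fn, prompt,
                                 k=8, steps=1, lr=1e-5):
    """Illustrative sketch: adapt a copy of the LM on the k nearest
    neighbors of the prompt before answering it. retrieve_fn, k, steps,
    and lr are placeholder assumptions, not the paper's settings."""
    tuned = copy.deepcopy(model)   # keep the base model untouched
    tuned.train()
    opt = torch.optim.SGD(tuned.parameters(), lr=lr)
    neighbors = retrieve_fn(prompt, k)  # k nearest texts from a datastore
    for _ in range(steps):
        for text in neighbors:
            ids = tokenizer(text, return_tensors="pt").input_ids
            # standard next-token LM loss on the retrieved neighbor
            loss = tuned(ids, labels=ids).loss
            loss.backward()
            opt.step()
            opt.zero_grad()
    tuned.eval()
    return tuned  # use this adapted copy to answer `prompt`
```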

March 12, 2025 · 2 min · 813 words · ZhaoYang