ODYSSEY: Open-World Quadrupeds Exploration and Manipulation for Long-Horizon Tasks

Core summary: This paper presents ODYSSEY, a unified mobile manipulation framework for agile quadruped robots equipped with a robotic arm. The framework targets three core challenges in language-guided, long-horizon, open-world mobile manipulation: first, extending LLM-based planning from tabletop scenes to mobile platforms with egocentric perception and locomotion constraints; second, improving the generalization of manipulation policies to the diverse objects of the open world; and third, achieving both high platform mobility and precise end-effector control in unstructured environments. ODYSSEY addresses these challenges by seamlessly integrating high-level task planning with low-level whole-body control. It comprises a hierarchical planner driven by a vision-language model, a whole-body control policy that adapts to challenging terrain, and the first comprehensive benchmark for evaluating long-horizon mobile manipulation. Through successful sim-to-real transfer, the system demonstrates strong generalization and robustness in real-world deployment. ...

September 15, 2025 · 6 min · 2844 words · ZhaoYang

TOWARDS REALISTIC UAV VISION-LANGUAGE NAVIGATION: PLATFORM, BENCHMARK, AND METHODOLOGY

I. Core idea (in one sentence): This paper argues that current UAV vision-language navigation (VLN) research is held back by its detachment from reality. The authors therefore provide a full-stack solution, spanning a simulation platform, an evaluation benchmark, and an AI model, aiming to systematically push the field toward more realistic and more complex practical applications. ...

September 15, 2025 · 5 min · 2139 words · ZhaoYang

UAV-ON: A Benchmark for Open-World Object Goal Navigation with Aerial Agents

Core summary: For object-goal navigation by aerial agents (e.g., UAVs) in open-world settings, this paper proposes UAV-ON, a large-scale benchmark. Unlike the traditional vision-language navigation paradigm, which relies on detailed step-by-step language instructions, UAV-ON requires agents to navigate autonomously from high-level semantic goals, which better matches the degree of autonomy demanded of UAVs in the real world. The benchmark contains 14 high-fidelity outdoor environments and over 1270 target objects. To gauge its difficulty, the paper implements and evaluates several baselines, including AOA, a zero-shot modular policy built on a large multimodal model. Experiments show that all baselines perform poorly on the task, highlighting the compound challenge of combining aerial navigation with semantic goal understanding. UAV-ON aims to advance scalable, semantics-driven research on autonomous UAV navigation. ...

September 15, 2025 · 7 min · 3227 words · ZhaoYang

VLN-R1: Vision-Language Navigation via Reinforcement Fine-Tuning

I. Paper summary: The researchers trained a large vision-language model (LVLM) to understand natural-language instructions from first-person-view video alone, the way a human would, and to output navigation actions directly in continuous 3D virtual environments. Its core technique is a two-stage training method: imitate first, then trial and error. ...
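As a rough intuition for the "imitate first, then trial and error" recipe, here is a generic two-stage sketch with toy stand-ins. It is not the paper's actual model, data, or reward; the linear policy, fake features, and random reward below are hypothetical placeholders.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Toy stand-ins (hypothetical; the paper uses an LVLM over egocentric video).
N_ACT, D = 4, 32                      # e.g. forward / left / right / stop
policy = nn.Linear(D, N_ACT)          # placeholder for the LVLM action head
opt = torch.optim.Adam(policy.parameters(), lr=1e-3)

# Stage 1: imitation (supervised fine-tuning on expert actions).
for _ in range(100):
    obs = torch.randn(8, D)                    # fake observation features
    expert = torch.randint(0, N_ACT, (8,))     # fake expert action labels
    loss = F.cross_entropy(policy(obs), expert)
    opt.zero_grad()
    loss.backward()
    opt.step()

# Stage 2: trial and error (REINFORCE-style reinforcement fine-tuning).
for _ in range(100):
    obs = torch.randn(8, D)
    dist = torch.distributions.Categorical(logits=policy(obs))
    act = dist.sample()
    reward = torch.randn(8)                    # placeholder task reward
    loss = -(dist.log_prob(act) * reward).mean()   # policy gradient
    opt.zero_grad()
    loss.backward()
    opt.step()
```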

September 15, 2025 · 4 min · 1925 words · ZhaoYang

Transformer Review

No need for many words; straight to the code.

```python
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding added onto scaled token embeddings."""

    def __init__(self, d_model, max_len=1024):
        super().__init__()
        self.d_model = d_model
        pe = torch.zeros(max_len, d_model)
        for pos in range(max_len):
            for i in range(0, d_model, 2):  # assumes even d_model
                # Each sin/cos pair at even index i uses divisor 10000^(i/d_model).
                pe[pos, i] = math.sin(pos / (10000 ** (i / d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** (i / d_model)))
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x * math.sqrt(self.d_model)  # scale embeddings before adding PE
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :]


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.h = heads
        self.d_k = d_model // heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def attention(self, q, k, v, d_k, mask=None, dropout=None):
        # Scaled dot-product attention; scores: (batch, heads, q_len, k_len).
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            mask = mask.unsqueeze(1)  # broadcast across heads
            scores = scores.masked_fill(mask == 0, -1e9)
        scores = F.softmax(scores, dim=-1)
        if dropout is not None:
            scores = dropout(scores)
        return torch.matmul(scores, v)

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)
        # Project, then split into heads: (batch, heads, seq_len, d_k).
        q = self.q_linear(q).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        k = self.k_linear(k).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        scores = self.attention(q, k, v, self.d_k, mask, self.dropout)
        # Merge heads back to (batch, seq_len, d_model).
        concat = scores.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.out(concat)


class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.dropout(F.relu(self.linear1(x)))
        return self.linear2(x)


class NormLayer(nn.Module):
    """Layer normalization with learnable gain (alpha) and bias."""

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        self.alpha = nn.Parameter(torch.ones(d_model))
        self.bias = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        return (self.alpha * (x - x.mean(dim=-1, keepdim=True))
                / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias)


class EncoderLayer(nn.Module):
    """Pre-norm encoder block: self-attention, then feed-forward, each with a residual."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = NormLayer(d_model)
        self.norm_2 = NormLayer(d_model)
        self.attn = MultiHeadAttention(d_model, heads, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x


class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pe = PositionalEncoding(d_model)
        self.layers = nn.ModuleList(
            [EncoderLayer(d_model, heads, dropout) for _ in range(N)])
        self.norm = NormLayer(d_model)

    def forward(self, src, mask):
        x = self.pe(self.embed(src))
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


class DecoderLayer(nn.Module):
    """Pre-norm decoder block: masked self-attention, cross-attention, feed-forward."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = NormLayer(d_model)
        self.norm_2 = NormLayer(d_model)
        self.norm_3 = NormLayer(d_model)
        self.attn_1 = MultiHeadAttention(d_model, heads, dropout=dropout)
        self.attn_2 = MultiHeadAttention(d_model, heads, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x


class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pe = PositionalEncoding(d_model)
        self.layers = nn.ModuleList(
            [DecoderLayer(d_model, heads, dropout) for _ in range(N)])
        self.norm = NormLayer(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.pe(self.embed(trg))
        for layer in self.layers:
            x = layer(x, e_outputs, src_mask, trg_mask)
        return self.norm(x)


class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask)
        d_outputs = self.decoder(trg, e_outputs, src_mask, trg_mask)
        return self.out(d_outputs)
```
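A quick smoke test helps verify the shapes. This is a sketch continuing from the definitions above, with hypothetical hyperparameters: the source mask is a padding mask (all ones here, i.e. no padding) and the target mask is the usual causal lower-triangular mask built with torch.tril.

```python
# Hypothetical hyperparameters for a shape check (not from the original post).
src_vocab, trg_vocab, d_model, N, heads, dropout = 1000, 1000, 512, 2, 8, 0.1
model = Transformer(src_vocab, trg_vocab, d_model, N, heads, dropout)

batch, src_len, trg_len = 2, 10, 9
src = torch.randint(0, src_vocab, (batch, src_len))
trg = torch.randint(0, trg_vocab, (batch, trg_len))

# Padding mask: 1 where a real token is present; shape (batch, 1, src_len)
# broadcasts over query positions inside the attention.
src_mask = torch.ones(batch, 1, src_len, dtype=torch.bool)

# Causal mask: position t may attend only to positions <= t.
trg_mask = torch.tril(torch.ones(trg_len, trg_len, dtype=torch.bool)).unsqueeze(0)

logits = model(src, trg, src_mask, trg_mask)
print(logits.shape)  # torch.Size([2, 9, 1000])
```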

June 4, 2025 · 2 min · 622 words · ZhaoYang

Baoyan (Grad Recommendation) Coding Test: Day 3

The problem list was put together by a friend in the 鼠群 group chat and looks pretty good, so I'll use it to warm up. Link: https://vjudge.net/article/8781 Luogu P1757

June 3, 2025 · 1 min · 70 words · ZhaoYang

Baoyan (Grad Recommendation) Coding Test: Day 2

The problem list was put together by a friend in the 鼠群 group chat and looks pretty good, so I'll use it to warm up. Link: https://vjudge.net/article/8781 I also worked through a 代码随想录 problem on the side: https://kamacoder.com/problempage.php?pid=1046 ...

June 2, 2025 · 2 min · 539 words · ZhaoYang