
Commit 977c934

Authored by 0309hws and kew6688
[Feature] Support Evaluation in VL-LN Bench (#193)
* update vln yaml; fix import agent
* update habitat, using evaluator and config; env and agent is WIP
* add distributed_base evaluator
* Habitat env applied, distributed evaluator applied; clean evaluator and agent created
* fix observation issues
* update new register name; tiny fix on style
* latest tested
* delete temp agent; rename default evaluator for habitat
* update slurm bash
* update readme
* fix init dist print
* fix eval config; fix local rank to rank
* update init distributed mode if condition
* update dist for dlc
* fix bug in evaluator
* [test] dialog+object
* [Feature] Add testing code for VLLN
* fix bugs; refactor env
* update code, merge dev; fix bug
* add raise flag in agent
* update a readme for vlln
* fix bug in dialog evaluator
* add back save video
* fix video save path
* [FIX] vlln bugs
* [FIX] modify the annotations and some small bugs
* [FIX] change some annotations
* update folder structure
* [FIX] delete files
* [FIX] docstrings
* [FIX] fix docstrings
* [FIX] numpy version
* [FIX] fix docstrings.

---------

Co-authored-by: wangyukai <[email protected]>
Co-authored-by: 0309hws <[email protected]>
1 parent aa449bd · commit 977c934

36 files changed: +2853 −65 lines

README.md

Lines changed: 2 additions & 2 deletions
@@ -256,13 +256,13 @@ If you use the specific pretrained models and benchmarks, please kindly cite the
       booktitle={arXiv},
 }
 @misc{wei2025groundslowfastdualsystem,
-      title={Ground Slow, Move Fast: A Dual-System Foundation Model for Generalizable Vision-and-Language Navigation},
+      title={Ground Slow, Move Fast: A Dual-System Foundation Model for Generalizable Vision-and-Language Navigation},
       author={Meng Wei and Chenyang Wan and Jiaqi Peng and Xiqian Yu and Yuqiang Yang and Delin Feng and Wenzhe Cai and Chenming Zhu and Tai Wang and Jiangmiao Pang and Xihui Liu},
       year={2025},
       eprint={2512.08186},
       archivePrefix={arXiv},
       primaryClass={cs.RO},
-      url={https://arxiv.org/abs/2512.08186},
+      url={https://arxiv.org/abs/2512.08186},
 }
 ```

internnav/agent/__init__.py

Lines changed: 3 additions & 8 deletions
@@ -1,13 +1,8 @@
 from internnav.agent.base import Agent
 from internnav.agent.cma_agent import CmaAgent
+from internnav.agent.dialog_agent import DialogAgent
+from internnav.agent.internvla_n1_agent import InternVLAN1Agent
 from internnav.agent.rdp_agent import RdpAgent
 from internnav.agent.seq2seq_agent import Seq2SeqAgent
-from internnav.agent.internvla_n1_agent import InternVLAN1Agent

-__all__ = [
-    'Agent',
-    'CmaAgent',
-    'RdpAgent',
-    'Seq2SeqAgent',
-    'InternVLAN1Agent'
-]
+__all__ = ['Agent', 'DialogAgent', 'CmaAgent', 'RdpAgent', 'Seq2SeqAgent', 'InternVLAN1Agent']

internnav/agent/base.py

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ def decorator(agent_class):
             if agent_type in cls.agents:
                 raise ValueError(f"Agent {agent_type} already registered.")
             cls.agents[agent_type] = agent_class
+            return agent_class

         return decorator
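
The added `return agent_class` matters because Python rebinds a decorated name to whatever the decorator returns; without it, applying `@Agent.register(...)` would leave the agent class bound to None. A minimal sketch of the pattern, using a hypothetical `Registry` stand-in for the project's `Agent` base class (the identical fix lands in internnav/env/base.py below):

class Registry:
    agents = {}

    @classmethod
    def register(cls, agent_type):
        def decorator(agent_class):
            if agent_type in cls.agents:
                raise ValueError(f"Agent {agent_type} already registered.")
            cls.agents[agent_type] = agent_class
            return agent_class  # rebind the decorated name to the class itself

        return decorator


@Registry.register('demo')
class DemoAgent:
    pass


assert DemoAgent is not None  # holds only because the decorator returns the class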

internnav/agent/dialog_agent.py

Lines changed: 477 additions & 0 deletions (new file; large diff not rendered)

internnav/configs/evaluator/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -39,8 +39,8 @@ class MetricCfg(BaseModel):

 class TaskCfg(BaseModel):
     task_name: Optional[str] = None
-    task_settings: Dict[str, Any]
-    scene: SceneCfg
+    task_settings: Dict[str, Any] = None
+    scene: SceneCfg = None
     robot_name: Optional[str] = None
     robot: Optional[RobotCfg] = None
     robot_flash: Optional[bool] = None
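
Giving `task_settings` and `scene` defaults of `None` makes them optional at construction time, so evaluator configs that need neither a scene nor extra task settings can omit them. A minimal sketch of the effect, assuming Pydantic `BaseModel` semantics; `SceneCfg` is stubbed here and the task name in the usage line is made up:

from typing import Any, Dict, Optional

from pydantic import BaseModel


class SceneCfg(BaseModel):  # illustrative stub
    scene_asset_path: Optional[str] = None


class TaskCfg(BaseModel):
    task_name: Optional[str] = None
    task_settings: Dict[str, Any] = None  # defaulted: may now be omitted
    scene: SceneCfg = None                # defaulted: may now be omitted


# Before this change both fields were required and omitting them raised a
# validation error; with defaults, construction succeeds:
cfg = TaskCfg(task_name='habitat_dialog_eval')
print(cfg.task_settings, cfg.scene)  # None None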

internnav/env/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
 from internnav.env.base import Env
+from internnav.env.habitat_env import HabitatEnv
 from internnav.env.internutopia_env import InternutopiaEnv

-__all__ = ['Env', 'InternutopiaEnv']
+__all__ = ['Env', 'InternutopiaEnv', 'HabitatEnv']

internnav/env/base.py

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@ def decorator(env_class):
             if env_type in cls.envs:
                 raise ValueError(f"Env {env_type} already registered.")
             cls.envs[env_type] = env_class
+            return env_class

         return decorator

internnav/habitat_extensions/habitat_env.py renamed to internnav/env/habitat_env.py

Lines changed: 11 additions & 25 deletions
@@ -8,13 +8,13 @@

 @base.Env.register('habitat')
 class HabitatEnv(base.Env):
-    def __init__(self, env_config: EnvCfg, task_config: TaskCfg):
-        """
-        env_settings include:
-        - habitat_config: loaded from get_habitat_config
-        - rank: int, rank index for sharding
-        - world_size: int, total number of ranks
-        """
+    """A lightweight wrapper around `habitat.Env` that adapts Habitat to the project's `base.Env` interface.
+
+    Args:
+        env_config (EnvCfg): Environment configuration.
+        task_config (TaskCfg): Optional task configuration passed to the base environment.
+    """
+    def __init__(self, env_config: EnvCfg, task_config: TaskCfg = None):
         try:
             from habitat import Env
         except ImportError as e:
@@ -23,7 +23,6 @@ def __init__(self, env_config: EnvCfg, task_config: TaskCfg):
             ) from e

         super().__init__(env_config, task_config)
-
         self.config = env_config.env_settings['habitat_config']
         self._env = Env(self.config)

@@ -36,16 +35,14 @@ def __init__(self, env_config: EnvCfg, task_config: TaskCfg):
         self.output_path = env_config.env_settings.get('output_path', './output')

         # generate episodes
-        # self._env.episodes = self._env.episodes[0:1] # for debug
         self.episodes = self.generate_episodes()
-        # print(self.episodes)

     def generate_episodes(self) -> List[Any]:
         """
-        Generate list of episodes for the current split, already:
-        - grouped by scene
-        - filtered by done_res (the path is self.output_path/progress.json)
-        - sharded by (rank, world_size)
+        Generate list of episodes for the current split.
+
+        Returns:
+            List[Any]: A list of episode objects for the current split.
         """
         all_episodes = []

@@ -80,9 +77,6 @@ def generate_episodes(self) -> List[Any]:
         return all_episodes

     def reset(self):
-        """
-        load next episode and return first observation
-        """
         # no more episodes
         if not (0 <= self._current_episode_index < len(self.episodes)):
             self.is_running = False
@@ -94,17 +88,9 @@ def reset(self):

         # Habitat reset
         self._last_obs = self._env.reset()
-
         return self._last_obs

     def step(self, action: List[Any]):
-        """
-        step the environment with given action
-
-        Args: action: List[Any], action for each env in the batch
-
-        Return: obs, reward, done, info
-        """
         obs = self._env.step(action)
         done = self._env.episode_over
         info = self._env.get_metrics()
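
With this refactor, `HabitatEnv` conforms to the project's generic `base.Env` interface, so an evaluator can drive it without Habitat-specific code. A minimal driver sketch, assuming the attributes visible in the diff (`episodes`, `is_running`); `env_config` and `get_action` are illustrative stand-ins, and the `(obs, done, info)` return of `step()` is inferred from the values computed above rather than confirmed by the diff:

# Hypothetical evaluation loop over HabitatEnv episodes.
env = HabitatEnv(env_config)      # task_config now defaults to None
while True:
    obs = env.reset()             # advances to the next episode
    if not env.is_running:        # cleared by reset() once episodes are exhausted
        break
    done = False
    while not done:
        action = get_action(obs)  # e.g. produced by a registered Agent
        obs, done, info = env.step(action)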

internnav/env/utils/dialog_mp3d.py

Lines changed: 196 additions & 0 deletions
@@ -0,0 +1,196 @@
import cv2
import numpy as np


def fill_small_holes(depth_img: np.ndarray, area_thresh: int) -> np.ndarray:
    """
    Identifies regions in the depth image that have a value of 0 and fills them in
    with 1 if the region is smaller than a given area threshold.

    Args:
        depth_img (np.ndarray): The input depth image
        area_thresh (int): The area threshold for filling in holes

    Returns:
        filled_depth_img (np.ndarray): The depth image with small holes filled in
    """
    # Create a binary image where holes are 1 and the rest is 0
    binary_img = np.where(depth_img == 0, 1, 0).astype("uint8")

    # Find contours in the binary image
    contours, _ = cv2.findContours(binary_img, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    filled_holes = np.zeros_like(binary_img)

    for cnt in contours:
        # If the area of the contour is smaller than the threshold
        if cv2.contourArea(cnt) < area_thresh:
            # Fill the contour
            cv2.drawContours(filled_holes, [cnt], 0, 1, -1)

    # Create the filled depth image
    filled_depth_img = np.where(filled_holes == 1, 1, depth_img)

    return filled_depth_img


class MP3DGTPerception:
    """
    Ground-truth perception utility for projecting MP3D object 3D bounding boxes
    into the current camera view to produce per-target semantic masks.

    Args:
        max_depth (float): Maximum metric depth (used for depth rescaling and masking).
        min_depth (float): Minimum metric depth (used for depth rescaling).
        fx (float): Camera focal length in pixels along x.
        fy (float): Camera focal length in pixels along y.
    """

    def __init__(self, max_depth, min_depth, fx, fy):
        self.max_depth = max_depth
        self.min_depth = min_depth
        self.fx = fx
        self.fy = fy

    def predict(self, depth, targets, tf_camera_to_ply, area_threshold=2500):
        """
        Get ground-truth semantic masks for target objects by projecting 3D bboxes into the image.

        Args:
            depth (np.ndarray): Depth image of shape (H, W). Values are assumed to be normalized
                to [0, 1] and will be rescaled to metric depth using
                ``depth * (max_depth - min_depth) + min_depth``.
            targets (np.ndarray): Target 3D axis-aligned bounding boxes of shape (N, 6), formatted
                as ``[min_x, min_y, min_z, max_x, max_y, max_z]`` in the PLY/world frame.
            tf_camera_to_ply (np.ndarray): Homogeneous 4x4 transform from camera frame to the
                PLY/world frame.
            area_threshold (int): Area threshold used by the hole-filling routine for both the
                depth map and the output masks.

        Returns:
            semantic_images (np.ndarray): Binary semantic masks of shape (N, H, W) with dtype
                ``np.uint8`` where 1 indicates pixels belonging to the corresponding target and 0
                otherwise. If no targets are provided, returns an all-zero array of shape (1, H, W).
        """
        # get the point clouds of current frame
        filled_depth = fill_small_holes(depth, area_threshold)
        scaled_depth = filled_depth * (self.max_depth - self.min_depth) + self.min_depth
        mask = scaled_depth < self.max_depth
        point_cloud_camera_frame = get_point_cloud(scaled_depth, mask, self.fx, self.fy)
        point_cloud_ply_frame = transform_points(tf_camera_to_ply, point_cloud_camera_frame)

        # mark the points in the target objects' bboxes
        semantic_images = []
        for target in targets:
            min_x, min_y, min_z = target[:3]
            max_x, max_y, max_z = target[3:]

            in_bbox = (
                (point_cloud_ply_frame[:, 0] >= min_x)
                & (point_cloud_ply_frame[:, 0] <= max_x)
                & (point_cloud_ply_frame[:, 1] >= min_y)
                & (point_cloud_ply_frame[:, 1] <= max_y)
                & (point_cloud_ply_frame[:, 2] >= min_z)
                & (point_cloud_ply_frame[:, 2] <= max_z)
            )
            in_bbox_points = point_cloud_ply_frame[in_bbox]
            semantic_image = np.zeros(depth.shape, dtype=np.uint8)
            if len(in_bbox_points) > 0:
                # map the marked points back to the image to get the semantic map
                in_bbox_camera_frame = inverse_transform_points(tf_camera_to_ply, in_bbox_points)
                in_box_image_coords = project_points_to_image(in_bbox_camera_frame, self.fx, self.fy, depth.shape)
                # Keep only projected coordinates that fall inside the image bounds
                h, w = depth.shape
                in_bounds = (
                    (in_box_image_coords[:, 0] >= 0)
                    & (in_box_image_coords[:, 0] < h)
                    & (in_box_image_coords[:, 1] >= 0)
                    & (in_box_image_coords[:, 1] < w)
                )
                in_box_image_coords = in_box_image_coords[in_bounds]
                semantic_image[in_box_image_coords[:, 0], in_box_image_coords[:, 1]] = 1
            semantic_image = fill_small_holes(semantic_image, area_threshold)
            semantic_images.append(semantic_image)
        if len(semantic_images) > 0:
            semantic_images = np.stack(semantic_images, axis=0)
        else:
            semantic_images = np.zeros((1, depth.shape[0], depth.shape[1]), dtype=np.uint8)
        return semantic_images


def transform_points(transformation_matrix: np.ndarray, points: np.ndarray) -> np.ndarray:
    # Add a homogeneous coordinate of 1 to each point for matrix multiplication
    homogeneous_points = np.hstack((points, np.ones((points.shape[0], 1))))

    # Apply the transformation matrix to the points
    transformed_points = np.dot(transformation_matrix, homogeneous_points.T).T

    # Remove the added homogeneous coordinate and divide by the last coordinate
    return transformed_points[:, :3] / transformed_points[:, 3:]


def get_point_cloud(depth_image: np.ndarray, mask: np.ndarray, fx: float, fy: float) -> np.ndarray:
    """Calculates the 3D coordinates (x, y, z) of points in the depth image based on
    the horizontal field of view (HFOV), the image width and height, the depth values,
    and the pixel x and y coordinates.

    Args:
        depth_image (np.ndarray): 2D depth image.
        mask (np.ndarray): 2D binary mask identifying relevant pixels.
        fx (float): Focal length in the x direction.
        fy (float): Focal length in the y direction.

    Returns:
        cloud (np.ndarray): Array of 3D coordinates (x, y, z) of the points in the image plane.
    """
    v, u = np.where(mask)
    z = depth_image[v, u]
    x = (u - depth_image.shape[1] // 2) * z / fx
    y = (v - depth_image.shape[0] // 2) * z / fy
    cloud = np.stack((x, -y, -z), axis=-1)

    return cloud


def inverse_transform_points(transformation_matrix: np.ndarray, points: np.ndarray) -> np.ndarray:
    """Convert point cloud from episodic coordinate system to camera coordinate system

    Args:
        transformation_matrix (np.ndarray): 4x4 transformation matrix
        points (np.ndarray): Point cloud coordinates (N, 3)

    Returns:
        result_points (np.ndarray): Point cloud coordinates in camera coordinate system (N, 3)
    """
    # Calculate the inverse of the transformation matrix
    inv_matrix = np.linalg.inv(transformation_matrix)

    # Add a homogeneous coordinate of 1 to each point for matrix multiplication
    homogeneous_points = np.hstack((points, np.ones((points.shape[0], 1))))

    # Apply the inverse transformation
    transformed_points = np.dot(inv_matrix, homogeneous_points.T).T

    # Remove the added homogeneous coordinate
    result_points = transformed_points[:, :3] / transformed_points[:, 3:]
    return result_points


def project_points_to_image(points: np.ndarray, fx: float, fy: float, image_shape: tuple) -> np.ndarray:
    """Project points from camera coordinate system to image plane

    Args:
        points (np.ndarray): Points in camera coordinate system (N, 3)
        fx (float): x-axis focal length
        fy (float): y-axis focal length
        image_shape (tuple): Image dimensions (height, width)

    Returns:
        result_points (np.ndarray): Image coordinates (N, 2)
    """
    points = np.stack((points[:, 0], -points[:, 1], -points[:, 2]), axis=-1)
    # Ensure points are in front of the camera
    valid_mask = points[:, 2] > 0  # z > 0

    # Calculate image coordinates
    u = points[:, 0] * fx / points[:, 2] + image_shape[1] // 2
    v = points[:, 1] * fy / points[:, 2] + image_shape[0] // 2

    # Combine coordinates
    image_coords = np.stack((v, u), axis=-1)
    image_coords = image_coords.astype(np.int32)
    # Return valid points only
    result_points = image_coords[valid_mask]
    return result_points
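
`get_point_cloud` and `project_points_to_image` are inverse pinhole mappings: back-projection uses x = (u - W/2) * z / fx and y = (v - H/2) * z / fy, and projection inverts it, with matching sign flips between the image and camera conventions. A quick round-trip sanity check with illustrative intrinsics:

import numpy as np

# Back-project a single pixel, then re-project it; the original (v, u) returns.
fx = fy = 320.0
depth = np.full((480, 640), 2.0, dtype=np.float32)
mask = np.zeros_like(depth, dtype=bool)
mask[100, 200] = True                        # pixel at (v=100, u=200)

cloud = get_point_cloud(depth, mask, fx, fy)
coords = project_points_to_image(cloud, fx, fy, depth.shape)
print(coords)                                # [[100 200]]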

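And an end-to-end sketch of `MP3DGTPerception.predict` on synthetic inputs: a constant normalized depth map, a single axis-aligned target box, and an identity camera-to-PLY transform (all values illustrative, not taken from MP3D data):

import numpy as np

# Constant normalized depth 0.5 rescales to 5 m with these bounds, so all
# back-projected points sit at z = -5 in the PLY frame, inside the box below.
depth = np.full((480, 640), 0.5, dtype=np.float32)        # normalized [0, 1]
targets = np.array([[-1.0, -1.0, -6.0, 1.0, 1.0, -2.0]])  # [min_xyz, max_xyz]
tf_camera_to_ply = np.eye(4)                              # camera frame == PLY frame

perception = MP3DGTPerception(max_depth=10.0, min_depth=0.0, fx=320.0, fy=320.0)
masks = perception.predict(depth, targets, tf_camera_to_ply)
print(masks.shape, bool(masks[0].sum() > 0))              # (1, 480, 640) True
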
internnav/evaluator/__init__.py

Lines changed: 14 additions & 3 deletions
@@ -4,9 +4,20 @@

 # register habitat
 try:
-    import internnav.habitat_extensions  # noqa: F401 # isort: skip
+    import internnav.habitat_extensions.vlln  # noqa: F401 # isort: skip
 except Exception as e:
-    print(f"Warning: ({e}), Habitat Evaluation is not loaded in this runtime. Ignore this if not using Habitat.")
+    print(f"Warning: ({e}), Habitat vlln is not loaded in this runtime. Ignore this if not using Habitat vlln.")
+
+try:
+    import internnav.habitat_extensions.vln  # noqa: F401 # isort: skip
+except Exception as e:
+    print(f"Warning: ({e}), Habitat vln is not loaded in this runtime. Ignore this if not using Habitat vln.")


-__all__ = ['Evaluator', 'DistributedEvaluator', 'VLNDistributedEvaluator', 'HabitatVLNEvaluator']
+__all__ = [
+    'Evaluator',
+    'DistributedEvaluator',
+    'VLNDistributedEvaluator',
+    'HabitatVLNEvaluator',
+    'HabitatDialogEvaluator',
+]
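
The guarded imports above implement an optional-dependency pattern: importing `internnav.habitat_extensions.vlln` / `.vln` runs their registration decorators as a side effect, and a missing Habitat install degrades to a console warning instead of an ImportError for users who never touch Habitat. The same shape in miniature, with a hypothetical module name:

# Optional backend registration; `myproject.optional_backend` is hypothetical.
try:
    import myproject.optional_backend  # noqa: F401  (import runs @register decorators)
except Exception as e:
    print(f"Warning: ({e}), optional backend not loaded. Ignore this if unused.")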
