From 0112c69ed5eb09a8e0e3555c629a218f5c954223 Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Mon, 3 Nov 2025 08:11:18 +0000
Subject: [PATCH 01/16] update vln yaml; fix import agent

---
 scripts/eval/configs/vln_r2r.yaml |  6 +++---
 scripts/eval/start_server.py      | 19 ++-----------------
 setup.cfg                         |  2 +-
 3 files changed, 6 insertions(+), 21 deletions(-)

diff --git a/scripts/eval/configs/vln_r2r.yaml b/scripts/eval/configs/vln_r2r.yaml
index 379f6d7a..ed8361c8 100644
--- a/scripts/eval/configs/vln_r2r.yaml
+++ b/scripts/eval/configs/vln_r2r.yaml
@@ -69,9 +69,9 @@ habitat:
       look_down:
         type: LookDownAction
         agent_index: 0
-        
+
   dataset:
     type: R2RVLN-v1
     split: val_seen
-    scenes_dir: data/scene_data/
-    data_path: data/vln_ce/raw_data/r2r/{split}/{split}.json.gz
\ No newline at end of file
+    scenes_dir: data/scene_data/mp3d_ce
+    data_path: data/vln_ce/raw_data/r2r/{split}/{split}.json.gz
diff --git a/scripts/eval/start_server.py b/scripts/eval/start_server.py
index 5a03cd73..60e60070 100644
--- a/scripts/eval/start_server.py
+++ b/scripts/eval/start_server.py
@@ -5,27 +5,15 @@
 sys.path.append('./src/diffusion-policy')
 
 import argparse
-import glob
 import importlib
 import importlib.util
-import os
 import sys
 
+# Import for agent registry side effects — do not remove
+from internnav.agent import Agent  # noqa: F401
 from internnav.utils import AgentServer
 
 
-# import all agents to register them
-def auto_register_agents(agent_dir: str):
-    # Get all Python files in the agents directory
-    agent_modules = glob.glob(os.path.join(agent_dir, '*.py'))
-
-    # Import each module to trigger the registration
-    for module in agent_modules:
-        if not module.endswith('__init__.py'):  # Avoid importing __init__.py itself
-            module_name = os.path.basename(module)[:-3]  # Remove the .py extension
-            importlib.import_module(f'internnav.agent.{module_name}')  # Replace 'agents' with your module's package
-
-
 def load_eval_cfg(config_path, attr_name='eval_cfg'):
     spec = importlib.util.spec_from_file_location("eval_config_module", config_path)
     config_module = importlib.util.module_from_spec(spec)
@@ -37,9 +25,6 @@ def load_eval_cfg(config_path, attr_name='eval_cfg'):
 if __name__ == '__main__':
     print("Starting Agent Server...")
 
-    print("Registering agents...")
-    auto_register_agents('internnav/agent')
-
     parser = argparse.ArgumentParser()
     parser.add_argument('--host', type=str, default='localhost')
     parser.add_argument(
diff --git a/setup.cfg b/setup.cfg
index 3aeaebe4..b6a3f5b3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -45,4 +45,4 @@ per-file-ignores=*/__init__.py:F401
 ignore=E402,E501,W503,E203,D401,R504,R505,SIM102,SIM117,E711,E226
 max-line-length = 120
 max-complexity = 30
-exclude=_*,.vscode,.git,docs/**,**/test/**,**/lcmtypes/**,*.ipynb,scripts/**,internnav/projects/**
+exclude=_*,.vscode,.git,docs/**,**/test/**,**/lcmtypes/**,*.ipynb

From 05ea2a304b2c7afec6ff93e7cab35004a47b6eae Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Mon, 10 Nov 2025 05:37:18 +0000
Subject: [PATCH 02/16] update habitat, using evaluator and config; env and
 agent is WIP

---
 internnav/configs/evaluator/__init__.py       |   6 +-
 internnav/evaluator/__init__.py               |   2 +
 internnav/evaluator/distributed_base.py       | 131 +++
 internnav/evaluator/habitat_vln_evaluator.py  | 460 +++++-----
 internnav/internnav_habitat/__init__.py       |   2 +
 internnav/internnav_habitat/habitat_env.py    | 126 +++
 .../habitat_n1_agent_temp.py                  | 263 ++++++
 .../habitat_vln_evaluator.py                  | 842 ++++++++++++++++++
 internnav/internnav_habitat/measures.py       | 560 ++++++++++++
 internnav/internnav_habitat/refactor_notes.md |  66 ++
 internnav/internnav_habitat/utils.py          |   0
 scripts/eval/bash/torchrun_eval.sh            |  20 +
 scripts/eval/configs/comm_cfg.py              |  16 +
 scripts/eval/configs/habitat_cfg.py           |  60 ++
 scripts/eval/configs/vln_r2r.yaml             |   2 +-
 scripts/eval/eval.py                          |  20 +-
 16 files changed, 2306 insertions(+), 270 deletions(-)
 create mode 100644 internnav/evaluator/distributed_base.py
 create mode 100644 internnav/internnav_habitat/__init__.py
 create mode 100644 internnav/internnav_habitat/habitat_env.py
 create mode 100644 internnav/internnav_habitat/habitat_n1_agent_temp.py
 create mode 100644 internnav/internnav_habitat/habitat_vln_evaluator.py
 create mode 100644 internnav/internnav_habitat/measures.py
 create mode 100644 internnav/internnav_habitat/refactor_notes.md
 create mode 100644 internnav/internnav_habitat/utils.py
 create mode 100644 scripts/eval/bash/torchrun_eval.sh
 create mode 100644 scripts/eval/configs/comm_cfg.py
 create mode 100644 scripts/eval/configs/habitat_cfg.py

diff --git a/internnav/configs/evaluator/__init__.py b/internnav/configs/evaluator/__init__.py
index ab770c50..27e63a31 100644
--- a/internnav/configs/evaluator/__init__.py
+++ b/internnav/configs/evaluator/__init__.py
@@ -59,9 +59,9 @@ class EvalCfg(BaseModel):
     eval_type: Optional[str] = None
     eval_settings: Optional[Dict[str, Any]] = {}
     agent: Optional[AgentCfg] = None
-    env: EnvCfg
-    task: TaskCfg
-    dataset: EvalDatasetCfg
+    env: Optional[EnvCfg] = None  # default is None, so the annotation must admit None (same form as ``agent``)
+    task: Optional[TaskCfg] = None
+    dataset: Optional[EvalDatasetCfg] = None
 
 
 __all__ = [
diff --git a/internnav/evaluator/__init__.py b/internnav/evaluator/__init__.py
index 88393e50..e831ea56 100644
--- a/internnav/evaluator/__init__.py
+++ b/internnav/evaluator/__init__.py
@@ -1,3 +1,5 @@
+# TODO: revisit; imported only for its side effect of registering the habitat components
+import internnav.internnav_habitat  # noqa: F401
 from internnav.evaluator.base import Evaluator
 from internnav.evaluator.vln_multi_evaluator import VlnMultiEvaluator
 
diff --git a/internnav/evaluator/distributed_base.py b/internnav/evaluator/distributed_base.py
new file mode 100644
index 00000000..32443057
--- /dev/null
+++ b/internnav/evaluator/distributed_base.py
@@ -0,0 +1,131 @@
+import json
+import os
+from datetime import datetime
+
+import torch
+
+from internnav.configs.evaluator import EvalCfg
+from internnav.evaluator.base import Evaluator
+from internnav.utils.dist import dist, get_rank, get_world_size
+
+
+def init_distributed_mode(args):
+    """Fill rank/world-size fields on ``args`` from SLURM or torchrun env vars and init the NCCL group."""
+    from datetime import timedelta  # module scope imports the ``datetime`` class, which has no ``timedelta``
+    if 'SLURM_PROCID' in os.environ:
+        args.rank = int(os.environ['SLURM_PROCID'])
+        args.world_size = int(os.environ['SLURM_NTASKS'])
+
+        num_gpus = torch.cuda.device_count()
+        args.gpu = args.rank % num_gpus  # map the global rank onto a device index of this node
+        args.local_rank = args.gpu
+
+        node_list = os.environ['SLURM_NODELIST']
+        print(f'Node list: {node_list}')
+        # addr = subprocess.getoutput(f'scontrol show hostname {node_list} | head -n1')
+
+        os.environ['MASTER_PORT'] = str(getattr(args, 'port', '29529'))  # 29529 when args has no ``port``
+        # os.environ['MASTER_ADDR'] = addr
+        os.environ['WORLD_SIZE'] = str(args.world_size)
+        os.environ['LOCAL_RANK'] = str(args.gpu)
+        os.environ['RANK'] = str(args.rank)
+    elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
+        args.rank = int(os.environ["RANK"])
+        args.world_size = int(os.environ['WORLD_SIZE'])
+        args.gpu = int(os.environ['LOCAL_RANK'])
+        args.local_rank = args.gpu
+    else:
+        print('Not using distributed mode')
+        args.distributed = False
+        return
+
+    args.distributed = True
+
+    torch.cuda.set_device(args.gpu)
+    args.dist_backend = 'nccl'
+    print('| distributed init (rank {}): {}, gpu {}'.format(args.rank, args.dist_url, args.gpu), flush=True)
+    dist.init_process_group(
+        backend=args.dist_backend,
+        init_method=args.dist_url,
+        world_size=args.world_size,
+        rank=args.rank,
+        timeout=timedelta(0, 7200),  # 2 hours (days=0, seconds=7200)
+    )
+    dist.barrier()  # wait until every rank has joined the group
+
+
+class DistributedEvaluator(Evaluator):
+    """
+    Base class of distributed evaluators: ranks run ``eval_action`` locally, ``eval`` averages metrics across ranks.
+    """
+
+    def __init__(self, cfg: EvalCfg):
+        # distributed setting: log this process's rank / world size / host / master port from the environment
+        import os
+        import socket
+
+        print(
+            f"Rank {os.getenv('RANK')} / {os.getenv('WORLD_SIZE')} on {socket.gethostname()}:{os.getenv('MASTER_PORT')}"
+        )
+        # init_distributed_mode(args)
+        # local_rank = args.local_rank
+        # np.random.seed(local_rank)
+        cfg.env.env_settings['idx'] = get_rank()
+        cfg.env.env_settings['world_size'] = get_world_size()
+
+        # set agent port based on rank: 8000 + rank
+        cfg.agent.agent_settings['port'] = 8000 + get_rank()
+        # start_server(cfg.agent.agent_settings['port'])
+
+        super().__init__(cfg)
+
+    def eval(self):
+        # 1. Every rank runs its own evaluation pass locally
+        local_metrics = self.eval_action()  # dict[str, Tensor], each Tensor of shape [N]
+        # Take the device and the local sample count from the first metric tensor
+        device = next(iter(local_metrics.values())).device
+        local_count = torch.tensor([len(next(iter(local_metrics.values())))], dtype=torch.long, device=device)
+
+        # 2. Global sample count (summed over all ranks)
+        world_size = get_world_size()
+        global_count = local_count.clone()
+        if world_size > 1:
+            dist.all_reduce(global_count, op=dist.ReduceOp.SUM)
+
+        # 3. Global sum / mean for every metric
+        result_all = {}
+        for name, tensor in local_metrics.items():
+            # tensor: [N]
+            local_sum = tensor.sum()
+            global_sum = local_sum.clone()
+            if world_size > 1:
+                dist.all_reduce(global_sum, op=dist.ReduceOp.SUM)
+
+            mean_val = (global_sum / global_count).item()
+            result_all[name] = mean_val
+
+        # 4. Record the global number of episodes
+        result_all["length"] = int(global_count.item())
+
+        # 5. Print on every rank; only rank 0 writes the file
+        print(result_all)
+        if get_rank() == 0:  # NOTE(review): ``self.args`` is not assigned in this class; confirm the base class sets it
+            os.makedirs(self.args.output_path, exist_ok=True)
+            out_path = os.path.join(self.args.output_path, "result.json")
+            with open(out_path, "a") as f:
+                f.write(json.dumps(result_all) + "\n")
+
+        return result_all
+
+    def eval_action(self):
+        """
+        Run the episodes of the current rank and return a dict:
+        {
+            "success": tensor([0., 1., ...], device=...),
+            "spl": tensor([...]),
+            "os": tensor([...]),
+            "ne": tensor([...]),
+            ...
+        }
+        """
+        raise NotImplementedError
diff --git a/internnav/evaluator/habitat_vln_evaluator.py b/internnav/evaluator/habitat_vln_evaluator.py
index 3bf4c54b..e901ae3e 100644
--- a/internnav/evaluator/habitat_vln_evaluator.py
+++ b/internnav/evaluator/habitat_vln_evaluator.py
@@ -26,14 +26,11 @@
 from habitat_baselines.config.default import get_config as get_habitat_config
 from omegaconf import OmegaConf
 from PIL import Image, ImageDraw, ImageFont
-from torch import Tensor
 from transformers.image_utils import to_numpy_array
 
 from internnav.model.utils.vln_utils import (
     chunk_token,
-    image_resize,
     open_image,
-    rho_theta,
     split_and_clean,
     traj_to_actions,
 )
@@ -68,8 +65,8 @@ def __init__(
 
         with habitat.config.read_write(self.config):
             # self.config.habitat.task.measurements.success.success_distance=3.0
-            self.config.habitat.dataset.split = self.split
-            self.config.habitat.task.measurements.update(
+            self.config.habitat.dataset.split = self.split  # refactor: why args and yaml both have split
+            self.config.habitat.task.measurements.update(  # refactor: move to yaml
                 {
                     "top_down_map": TopDownMapMeasurementConfig(
                         map_padding=3,
@@ -104,6 +101,7 @@ def __init__(
         self.model = model
         self.processor = processor
 
+        # refactor: this part used in three places
         prompt = "You are an autonomous navigation assistant. Your task is to <instruction>. Where should you go next to stay on track? Please output the next waypoint\'s coordinates in the image. Please output STOP when you have successfully completed the task."
         answer = ""
         self.conversation = [{"from": "human", "value": prompt}, {"from": "gpt", "value": answer}]
@@ -134,260 +132,12 @@ def __init__(
         self.num_future_steps = args.num_future_steps
         self.num_history = args.num_history
 
-    def preprocess_depth_image_v2(
-        self, depth_image, do_depth_scale=True, depth_scale=1000, target_height=None, target_width=None
-    ):
-        if target_height is None:
-            target_height = self.image_processor.crop_size['height']  # 384
-            target_width = self.image_processor.crop_size['width']  # 384
-
-        resized_depth_image = depth_image.resize((target_width, target_height), Image.NEAREST)
-
-        img = to_numpy_array(resized_depth_image)
-        if do_depth_scale:
-            img = img / depth_scale
-
-        return img, (target_width, target_height)
-
-    def get_intrinsic_matrix(self, sensor_cfg) -> np.ndarray:
-        width = sensor_cfg.width
-        height = sensor_cfg.height
-        fov = sensor_cfg.hfov
-        fx = (width / 2.0) / np.tan(np.deg2rad(fov / 2.0))
-        fy = fx  # Assuming square pixels (fx = fy)
-        cx = (width - 1.0) / 2.0
-        cy = (height - 1.0) / 2.0
-
-        intrinsic_matrix = np.array(
-            [[fx, 0.0, cx, 0.0], [0.0, fy, cy, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]]
-        )
-        return intrinsic_matrix
-
-    def preprocess_instrinsic(self, intrinsic, ori_size, target_size):  # (V, 4, 4) (resize_shape) (h, w)
-        intrinsic = copy.deepcopy(intrinsic)
-        if len(intrinsic.shape) == 2:
-            intrinsic = intrinsic[None, :, :]  # (1, 4, 4) or (B, 4, 4)
-
-        intrinsic[:, 0] /= ori_size[0] / target_size[0]  # width
-        intrinsic[:, 1] /= ori_size[1] / target_size[1]  # height
-
-        # for crop transform
-        intrinsic[:, 0, 2] -= (target_size[0] - target_size[1]) / 2
-
-        if intrinsic.shape[0] == 1:
-            intrinsic = intrinsic.squeeze(0)
-
-        return intrinsic
-
-    def get_axis_align_matrix(self):
-        ma = np.array([[0, 0, 1, 0], [-1, 0, 0, 0], [0, -1, 0, 0], [0, 0, 0, 1]])
-        return ma
-
-    def xyz_yaw_to_tf_matrix(self, xyz: np.ndarray, yaw: float) -> np.ndarray:
-        x, y, z = xyz
-        transformation_matrix = np.array(
-            [
-                [np.cos(yaw), -np.sin(yaw), 0, x],
-                [np.sin(yaw), np.cos(yaw), 0, y],
-                [0, 0, 1, z],
-                [0, 0, 0, 1],
-            ]
-        )
-        return transformation_matrix
-
-    def xyz_pitch_to_tf_matrix(self, xyz: np.ndarray, pitch: float) -> np.ndarray:
-        """Converts a given position and pitch angle to a 4x4 transformation matrix.
-
-        Args:
-            xyz (np.ndarray): A 3D vector representing the position.
-            pitch (float): The pitch angle in radians for y axis.
-        Returns:
-            np.ndarray: A 4x4 transformation matrix.
-        """
-
-        x, y, z = xyz
-        transformation_matrix = np.array(
-            [
-                [np.cos(pitch), 0, np.sin(pitch), x],
-                [0, 1, 0, y],
-                [-np.sin(pitch), 0, np.cos(pitch), z],
-                [0, 0, 0, 1],
-            ]
-        )
-        return transformation_matrix
-
-    def xyz_yaw_pitch_to_tf_matrix(self, xyz: np.ndarray, yaw: float, pitch: float) -> np.ndarray:
-        """Converts a given position and yaw, pitch angles to a 4x4 transformation matrix.
-
-        Args:
-            xyz (np.ndarray): A 3D vector representing the position.
-            yaw (float): The yaw angle in radians.
-            pitch (float): The pitch angle in radians for y axis.
-        Returns:
-            np.ndarray: A 4x4 transformation matrix.
-        """
-        x, y, z = xyz
-        rot1 = self.xyz_yaw_to_tf_matrix(xyz, yaw)[:3, :3]
-        rot2 = self.xyz_pitch_to_tf_matrix(xyz, pitch)[:3, :3]
-        transformation_matrix = np.eye(4)
-        transformation_matrix[:3, :3] = rot1 @ rot2
-        transformation_matrix[:3, 3] = xyz
-        return transformation_matrix
-
-    def pixel_to_gps(self, pixel, depth, intrinsic, tf_camera_to_episodic):
-        '''
-        Args:
-            pixel: (2,) - [u, v] pixel coordinates
-            depth: (H, W) - depth image where depth[v, u] gives depth in meters
-            intrinsic: (4, 4) - camera intrinsic matrix
-            tf_camera_to_episodic: (4, 4) - transformation from camera to episodic frame
-        Returns:
-            (x, y): (x, y) coordinates in the episodic frame
-        '''
-        v, u = pixel
-        z = depth[v, u]
-        print("depthhhhhhhhhhhhhh", z)
-
-        x = (u - intrinsic[0, 2]) * z / intrinsic[0, 0]
-        y = (v - intrinsic[1, 2]) * z / intrinsic[1, 1]
-        point_camera = np.array([x, y, z, 1.0])
-
-        # Transform to episodic frame
-        point_episodic = tf_camera_to_episodic @ point_camera
-        point_episodic = point_episodic[:3] / point_episodic[3]
-
-        x = point_episodic[0]
-        y = point_episodic[1]
-
-        return (x, y)  # same as habitat gps
-
+    # refactor
     def config_env(self) -> Env:
         env = Env(config=self.config)
         # env.episodes = env.episodes[0:1]
         return env
 
-    def dot_matrix_two_dimensional(
-        self,
-        image_or_image_path,
-        save_path=None,
-        dots_size_w=8,
-        dots_size_h=8,
-        save_img=False,
-        font_path='fonts/arial.ttf',
-        pixel_goal=None,
-    ):
-        """
-        takes an original image as input, save the processed image to save_path. Each dot is labeled with two-dimensional Cartesian coordinates (x,y). Suitable for single-image tasks.
-        control args:
-        1. dots_size_w: the number of columns of the dots matrix
-        2. dots_size_h: the number of rows of the dots matrix
-        """
-        with open_image(image_or_image_path) as img:
-            if img.mode != 'RGB':
-                img = img.convert('RGB')
-            draw = ImageDraw.Draw(img, 'RGB')
-
-            width, height = img.size
-            grid_size_w = dots_size_w + 1
-            grid_size_h = dots_size_h + 1
-            cell_width = width / grid_size_w
-            cell_height = height / grid_size_h
-
-            font = ImageFont.truetype(font_path, width // 40)  # Adjust font size if needed; default == width // 40
-
-            target_i = target_j = None
-            if pixel_goal is not None:
-                y_pixel, x_pixel = pixel_goal[0], pixel_goal[1]
-                # Validate pixel coordinates
-                if not (0 <= x_pixel < width and 0 <= y_pixel < height):
-                    raise ValueError(f"pixel_goal {pixel_goal} exceeds image dimensions ({width}x{height})")
-
-                # Convert to grid coordinates
-                target_i = round(x_pixel / cell_width)
-                target_j = round(y_pixel / cell_height)
-
-                # Validate grid bounds
-                if not (1 <= target_i <= dots_size_w and 1 <= target_j <= dots_size_h):
-                    raise ValueError(
-                        f"pixel_goal {pixel_goal} maps to grid ({target_j},{target_i}), "
-                        f"valid range is (1,1)-({dots_size_h},{dots_size_w})"
-                    )
-
-            count = 0
-
-            for j in range(1, grid_size_h):
-                for i in range(1, grid_size_w):
-                    x = int(i * cell_width)
-                    y = int(j * cell_height)
-
-                    pixel_color = img.getpixel((x, y))
-                    # choose a more contrasting color from black and white
-                    if pixel_color[0] + pixel_color[1] + pixel_color[2] >= 255 * 3 / 2:
-                        opposite_color = (0, 0, 0)
-                    else:
-                        opposite_color = (255, 255, 255)
-
-                    if pixel_goal is not None and i == target_i and j == target_j:
-                        opposite_color = (255, 0, 0)  # Red for target
-
-                    circle_radius = width // 240  # Adjust dot size if needed; default == width // 240
-                    draw.ellipse(
-                        [(x - circle_radius, y - circle_radius), (x + circle_radius, y + circle_radius)],
-                        fill=opposite_color,
-                    )
-
-                    text_x, text_y = x + 3, y
-                    count_w = count // dots_size_w
-                    count_h = count % dots_size_w
-                    label_str = f"({count_w+1},{count_h+1})"
-                    draw.text((text_x, text_y), label_str, fill=opposite_color, font=font)
-                    count += 1
-            if save_img:
-                print(">>> dots overlaid image processed, stored in", save_path)
-                img.save(save_path)
-            return img
-
-    def _pointnav(
-        self,
-        goal: np.ndarray,
-        depth: np.ndarray,
-        step_id: int,
-        robot_xy: np.ndarray,
-        robot_heading: float,
-        stop: bool = False,
-    ) -> Tensor:
-        '''
-        Args:
-            goal (np.ndarray): goal position
-            stop (bool): whether to stop
-        Returns:
-            action: action tensor
-        '''
-
-        masks = torch.tensor([step_id != 0], dtype=torch.bool, device="cuda")
-        if not np.array_equal(goal, self._last_goal):
-            if np.linalg.norm(goal - self._last_goal) > 0.1:
-                self._pointnav_policy.reset()
-                print('Pointnav policy reset!')
-                masks = torch.zeros_like(masks)
-            self._last_goal = goal
-        rho, theta = rho_theta(robot_xy, robot_heading, goal)
-        rho_theta_tensor = torch.tensor([[rho, theta]], device="cuda", dtype=torch.float32)
-        obs_pointnav = {
-            "depth": image_resize(
-                depth,
-                (self._pointnav_depth_image_shape[0], self._pointnav_depth_image_shape[1]),
-                channels_last=True,
-                interpolation_mode="area",
-            ),
-            "pointgoal_with_gps_compass": rho_theta_tensor,
-        }
-
-        if rho < self._pointnav_stop_radius and stop:
-            return 0
-        action = self._pointnav_policy.act(obs_pointnav, masks, deterministic=True)
-        return action
-
     def eval_action(self, idx) -> None:  # noqa: C901
         self.model.eval()
         env = self.config_env()
@@ -414,6 +164,7 @@ def eval_action(self, idx) -> None:  # noqa: C901
                         oss.append(res['os'])
                         nes.append(res['ne'])
 
+        # refactor: why group episodes as {scene: [episodes]} when the grouping itself is never actually used?
         for scene in sorted(scene_episode_dict.keys()):
             episodes = scene_episode_dict[scene]
             scene_id = scene.split('/')[-2]
@@ -430,6 +181,7 @@ def eval_action(self, idx) -> None:  # noqa: C901
                 if [scene_id, episode_id, episode_instruction] in done_res:
                     continue
 
+                # refactor env warm up
                 env.current_episode = episode
                 observations = env.reset()
 
@@ -465,6 +217,7 @@ def eval_action(self, idx) -> None:  # noqa: C901
                 local_actions = []
 
                 while not env.episode_over and step_id <= 500:
+                    # refactor agent get action
                     rgb = observations["rgb"]
                     depth = observations["depth"]
                     x, y = observations["gps"]
@@ -743,6 +496,7 @@ def eval_action(self, idx) -> None:  # noqa: C901
 
                     print("step_id", step_id, "action", action)
 
+                    # refactor: core
                     if action == 5:
                         env.step(action)
                         observations = env.step(action)
@@ -802,10 +556,194 @@ def parse_actions(self, output):
         actions = itertools.chain.from_iterable(actions)
         return list(actions)
 
-    def preprocess_qwenvl(self, source):
-        prompt = random.choice(self.conjunctions) + DEFAULT_IMAGE_TOKEN
-        if len(source[0]["value"]) != 0:
-            source[0]["value"] += f" {prompt}."
-        else:
-            source[0]["value"] = f"{prompt}."  # Please output the next waypoint\'s coordinates in the image."
-        return source
+    def preprocess_depth_image_v2(
+        self, depth_image, do_depth_scale=True, depth_scale=1000, target_height=None, target_width=None
+    ):
+        if target_height is None:  # no explicit height: take BOTH sizes from the processor (target_width is overwritten)
+            target_height = self.image_processor.crop_size['height']  # 384
+            target_width = self.image_processor.crop_size['width']  # 384
+        # Resize with nearest-neighbour sampling; the size tuple is passed as (width, height).
+        resized_depth_image = depth_image.resize((target_width, target_height), Image.NEAREST)
+
+        img = to_numpy_array(resized_depth_image)
+        if do_depth_scale:
+            img = img / depth_scale  # divide the raw depth values by depth_scale (default 1000)
+
+        return img, (target_width, target_height)  # array plus the (width, height) it was resized to
+
+    def get_intrinsic_matrix(self, sensor_cfg) -> np.ndarray:  # 4x4 pinhole intrinsics from width/height/hfov
+        width = sensor_cfg.width
+        height = sensor_cfg.height
+        fov = sensor_cfg.hfov  # passed through np.deg2rad below, i.e. treated as degrees
+        fx = (width / 2.0) / np.tan(np.deg2rad(fov / 2.0))  # focal length in pixels from the horizontal FOV
+        fy = fx  # Assuming square pixels (fx = fy)
+        cx = (width - 1.0) / 2.0  # principal point: centre of the pixel grid
+        cy = (height - 1.0) / 2.0
+        # 3x3 camera matrix embedded in a homogeneous 4x4.
+        intrinsic_matrix = np.array(
+            [[fx, 0.0, cx, 0.0], [0.0, fy, cy, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]]
+        )
+        return intrinsic_matrix
+
+    def get_axis_align_matrix(self):
+        ma = np.array([[0, 0, 1, 0], [-1, 0, 0, 0], [0, -1, 0, 0], [0, 0, 0, 1]])  # maps (x, y, z, 1) -> (z, -x, -y, 1)
+        return ma
+
+    def xyz_yaw_to_tf_matrix(self, xyz: np.ndarray, yaw: float) -> np.ndarray:  # 4x4: z-axis rotation by yaw + xyz
+        x, y, z = xyz
+        transformation_matrix = np.array(
+            [
+                [np.cos(yaw), -np.sin(yaw), 0, x],
+                [np.sin(yaw), np.cos(yaw), 0, y],
+                [0, 0, 1, z],  # z is left unchanged by the rotation
+                [0, 0, 0, 1],
+            ]
+        )
+        return transformation_matrix
+
+    def xyz_pitch_to_tf_matrix(self, xyz: np.ndarray, pitch: float) -> np.ndarray:
+        """Build a 4x4 homogeneous transform from a position and a rotation about the y axis.
+
+        Args:
+            xyz (np.ndarray): A 3D vector used as the translation column.
+            pitch (float): Rotation angle about the y axis, in radians.
+        Returns:
+            np.ndarray: A 4x4 matrix with the y-axis rotation in the upper-left 3x3 and xyz as translation.
+        """
+
+        x, y, z = xyz
+        transformation_matrix = np.array(
+            [
+                [np.cos(pitch), 0, np.sin(pitch), x],
+                [0, 1, 0, y],  # y is left unchanged by the rotation
+                [-np.sin(pitch), 0, np.cos(pitch), z],
+                [0, 0, 0, 1],
+            ]
+        )
+        return transformation_matrix
+
+    def xyz_yaw_pitch_to_tf_matrix(self, xyz: np.ndarray, yaw: float, pitch: float) -> np.ndarray:
+        """Build a 4x4 homogeneous transform from a position plus yaw and pitch angles.
+
+        Args:
+            xyz (np.ndarray): A 3D vector used as the translation column.
+            yaw (float): Rotation angle about the z axis, in radians.
+            pitch (float): Rotation angle about the y axis, in radians.
+        Returns:
+            np.ndarray: A 4x4 matrix whose rotation part is (yaw rotation) @ (pitch rotation).
+        """
+        x, y, z = xyz  # values unused below; the unpack only fails early if xyz does not have three entries
+        rot1 = self.xyz_yaw_to_tf_matrix(xyz, yaw)[:3, :3]  # keep only the 3x3 rotation part
+        rot2 = self.xyz_pitch_to_tf_matrix(xyz, pitch)[:3, :3]
+        transformation_matrix = np.eye(4)
+        transformation_matrix[:3, :3] = rot1 @ rot2  # pitch is applied to a vector first, then yaw
+        transformation_matrix[:3, 3] = xyz
+        return transformation_matrix
+
+    def pixel_to_gps(self, pixel, depth, intrinsic, tf_camera_to_episodic):
+        '''
+        Args:
+            pixel: (2,) - [v, u] pixel coordinates (row first, then column), the same order used for depth[v, u]
+            depth: (H, W) - depth image where depth[v, u] gives depth in meters
+            intrinsic: (4, 4) - camera intrinsic matrix
+            tf_camera_to_episodic: (4, 4) - transformation from camera to episodic frame
+        Returns:
+            (x, y): (x, y) coordinates in the episodic frame
+        '''
+        v, u = pixel  # row, column
+        z = depth[v, u]
+        print("depthhhhhhhhhhhhhh", z)  # NOTE(review): looks like leftover debug output
+
+        x = (u - intrinsic[0, 2]) * z / intrinsic[0, 0]  # back-project: (u - cx) * z / fx
+        y = (v - intrinsic[1, 2]) * z / intrinsic[1, 1]  # back-project: (v - cy) * z / fy
+        point_camera = np.array([x, y, z, 1.0])  # homogeneous point in the camera frame
+
+        # Transform to episodic frame
+        point_episodic = tf_camera_to_episodic @ point_camera
+        point_episodic = point_episodic[:3] / point_episodic[3]  # de-homogenise
+
+        x = point_episodic[0]
+        y = point_episodic[1]
+
+        return (x, y)  # same as habitat gps
+
+    def dot_matrix_two_dimensional(
+        self,
+        image_or_image_path,
+        save_path=None,
+        dots_size_w=8,
+        dots_size_h=8,
+        save_img=False,
+        font_path='fonts/arial.ttf',
+        pixel_goal=None,
+    ):
+        """
+        Overlay a grid of dots on an image and return it (written to save_path only when save_img is true). Each dot is labelled "(row,col)", 1-based, counted row by row.
+        pixel_goal, if given, is (y, x); the dot it rounds to is drawn in red. Raises ValueError if it lies outside the image or rounds outside the dot grid.
+        1. dots_size_w: the number of columns of the dots matrix
+        2. dots_size_h: the number of rows of the dots matrix
+        """
+        with open_image(image_or_image_path) as img:
+            if img.mode != 'RGB':
+                img = img.convert('RGB')
+            draw = ImageDraw.Draw(img, 'RGB')
+
+            width, height = img.size
+            grid_size_w = dots_size_w + 1  # N dots split each axis into N + 1 cells
+            grid_size_h = dots_size_h + 1
+            cell_width = width / grid_size_w
+            cell_height = height / grid_size_h
+
+            font = ImageFont.truetype(font_path, width // 40)  # Adjust font size if needed; default == width // 40
+
+            target_i = target_j = None
+            if pixel_goal is not None:
+                y_pixel, x_pixel = pixel_goal[0], pixel_goal[1]  # pixel_goal is ordered (y, x)
+                # Validate pixel coordinates
+                if not (0 <= x_pixel < width and 0 <= y_pixel < height):
+                    raise ValueError(f"pixel_goal {pixel_goal} exceeds image dimensions ({width}x{height})")
+
+                # Convert to grid coordinates
+                target_i = round(x_pixel / cell_width)  # column index of the closest dot
+                target_j = round(y_pixel / cell_height)  # row index of the closest dot
+
+                # Validate grid bounds
+                if not (1 <= target_i <= dots_size_w and 1 <= target_j <= dots_size_h):
+                    raise ValueError(
+                        f"pixel_goal {pixel_goal} maps to grid ({target_j},{target_i}), "
+                        f"valid range is (1,1)-({dots_size_h},{dots_size_w})"
+                    )
+
+            count = 0  # running dot index in row-major order
+
+            for j in range(1, grid_size_h):
+                for i in range(1, grid_size_w):
+                    x = int(i * cell_width)
+                    y = int(j * cell_height)
+
+                    pixel_color = img.getpixel((x, y))
+                    # choose a more contrasting color from black and white
+                    if pixel_color[0] + pixel_color[1] + pixel_color[2] >= 255 * 3 / 2:
+                        opposite_color = (0, 0, 0)
+                    else:
+                        opposite_color = (255, 255, 255)
+
+                    if pixel_goal is not None and i == target_i and j == target_j:
+                        opposite_color = (255, 0, 0)  # Red for target
+
+                    circle_radius = width // 240  # Adjust dot size if needed; default == width // 240
+                    draw.ellipse(
+                        [(x - circle_radius, y - circle_radius), (x + circle_radius, y + circle_radius)],
+                        fill=opposite_color,
+                    )
+
+                    text_x, text_y = x + 3, y  # label starts 3 px to the right of the dot centre
+                    count_w = count // dots_size_w  # zero-based row of this dot (despite the _w suffix)
+                    count_h = count % dots_size_w  # zero-based column of this dot (despite the _h suffix)
+                    label_str = f"({count_w+1},{count_h+1})"
+                    draw.text((text_x, text_y), label_str, fill=opposite_color, font=font)
+                    count += 1
+            if save_img:
+                print(">>> dots overlaid image processed, stored in", save_path)
+                img.save(save_path)
+            return img  # NOTE(review): returned from inside the with-block; confirm open_image leaves it usable
diff --git a/internnav/internnav_habitat/__init__.py b/internnav/internnav_habitat/__init__.py
new file mode 100644
index 00000000..af9bee9e
--- /dev/null
+++ b/internnav/internnav_habitat/__init__.py
@@ -0,0 +1,2 @@
+from internnav.internnav_habitat.habitat_env import HabitatEnv
+from internnav.internnav_habitat.habitat_vln_evaluator import HabitatVlnEvaluator
diff --git a/internnav/internnav_habitat/habitat_env.py b/internnav/internnav_habitat/habitat_env.py
new file mode 100644
index 00000000..6b102153
--- /dev/null
+++ b/internnav/internnav_habitat/habitat_env.py
@@ -0,0 +1,126 @@
+import json
+import os
+from typing import Any, Dict, List, Optional
+
+from internnav.configs.evaluator import EnvCfg, TaskCfg
+from internnav.env import base
+
+
+@base.Env.register('habitat')
+class HabitatEnv(base.Env):
+    """Habitat-backed environment that replays this shard's share of the dataset episodes."""
+
+    def __init__(self, env_config: EnvCfg, task_config: TaskCfg):
+        """
+        Build the habitat env and the list of episodes this shard still has to run.
+
+        env_settings keys read here:
+            - habitat_config: config object handed to ``habitat.Env``
+            - idx / world_size: shard index and shard count (default 0 / 1)
+            - config_path / output_path: fallbacks when the base class does not provide them
+
+        Raises: RuntimeError if habitat cannot be imported.
+        """
+        try:
+            from habitat import Env
+        except ImportError as e:
+            raise RuntimeError(
+                "Habitat modules could not be imported. " "Make sure both repositories are installed and on PYTHONPATH."
+            ) from e
+
+        super().__init__(env_config, task_config)
+
+        self._env_settings = env_config.env_settings
+        self.config = self._env_settings['habitat_config']
+        self.env = Env(self.config)
+
+        # The shard parameters must exist before generate_episodes() reads them.
+        self.index = self._env_settings.get('idx', 0)
+        self.world_size = self._env_settings.get('world_size', 1)
+        # generate_episodes() already walks scenes in sorted order, so no extra sort is done here.
+        self.episodes = self.generate_episodes()
+
+        self._current_episode_index: int = 0
+        self._last_obs: Optional[Dict[str, Any]] = None
+        self.step_id = 0
+        self.is_running = True
+
+    def generate_episodes(self) -> List[Any]:
+        """
+        Return this shard's pending episodes, grouped by scene in sorted scene order.
+
+        Episodes already recorded in ``<output_path>/result.json`` are skipped.
+        """
+        # NOTE(review): config_path / output_path are never assigned in this class; use the
+        # base-class attributes when they exist, otherwise fall back to env_settings.
+        config_path = getattr(self, 'config_path', None) or self._env_settings.get('config_path', '')
+        output_path = getattr(self, 'output_path', None) or self._env_settings.get('output_path')
+        is_objectnav = 'objectnav' in config_path
+
+        scene_episode_dict: Dict[str, List[Any]] = {}
+        for episode in self.env.episodes:
+            scene_episode_dict.setdefault(episode.scene_id, []).append(episode)
+
+        done_res = set()
+        result_file = os.path.join(output_path, 'result.json') if output_path else None
+        if result_file and os.path.exists(result_file):
+            with open(result_file, 'r') as f:
+                for line in f:
+                    res = json.loads(line)
+                    done_res.add((res["scene_id"], res["episode_id"], res["episode_instruction"]))
+
+        # Keep the result separate from the per-scene lists (one shared name returned only the last scene).
+        pending = []
+        for scene in sorted(scene_episode_dict):
+            scene_id = scene.split('/')[-2]
+            for episode in scene_episode_dict[scene][self.index :: self.world_size]:
+                episode_instruction = episode.object_category if is_objectnav else episode.instruction.instruction_text
+                if (scene_id, int(episode.episode_id), episode_instruction) not in done_res:
+                    pending.append(episode)
+        return pending
+
+    def reset(self):
+        """
+        Load the next pending episode and return its first observation.
+
+        Returns None and clears ``is_running`` once every episode has been consumed.
+        """
+        if not (0 <= self._current_episode_index < len(self.episodes)):
+            self.is_running = False
+            return None
+
+        # Manually point habitat at the next episode before resetting it.
+        self.env.current_episode = self.episodes[self._current_episode_index]
+        self._current_episode_index += 1
+
+        self._last_obs = self.env.reset()
+        self.step_id = 0
+        return self._last_obs
+
+    def step(self, action: List[Any]):
+        """
+        Step the habitat env with ``action``.
+
+        Returns: (obs, terminated), where terminated is the env's ``episode_over`` flag.
+        """
+        self._last_obs = self.env.step(action)
+        return self._last_obs, self.env.episode_over
+
+    def close(self):
+        """Close the underlying habitat env."""
+        print('Vln Env close')
+        self.env.close()
+
+    def render(self):
+        self.env.render()
+
+    def get_observation(self) -> Dict[str, Any]:
+        return self.env.get_observations()
+
+    def get_metrics(self) -> Dict[str, Any]:
+        return self.env.get_metrics()
+
+    def sort_episodes_by_scene(self, key_list: List[str]):
+        """Reorder ``self.episodes`` to follow ``key_list`` (episode ids); episodes not listed are dropped."""
+        episode_dict = {ep.episode_id: ep for ep in self.episodes}
+        self.episodes = [episode_dict[key] for key in key_list if key in episode_dict]
diff --git a/internnav/internnav_habitat/habitat_n1_agent_temp.py b/internnav/internnav_habitat/habitat_n1_agent_temp.py
new file mode 100644
index 00000000..598ab25b
--- /dev/null
+++ b/internnav/internnav_habitat/habitat_n1_agent_temp.py
@@ -0,0 +1,263 @@
+import copy
+import itertools
+import os
+import re
+import sys
+import time
+from datetime import datetime
+from pathlib import Path
+
+import numpy as np
+import torch
+
+sys.path.append(str(Path(__file__).parent.parent.parent))
+
+from collections import OrderedDict
+
+from PIL import Image
+from transformers import AutoProcessor
+
+from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM
+from internnav.model.utils.vln_utils import S2Output, split_and_clean, traj_to_actions
+
+DEFAULT_IMAGE_TOKEN = "<image>"
+
+
+class InternVLAN1AsyncAgent:
+    """Dual-system agent: ``step_s2`` generates text/latents with the model, ``step_s1`` turns latents into trajectories."""
+
+    def __init__(self, args):
+        """
+        Load the model and processor and initialise the per-episode state.
+
+        ``args`` must provide device, model_path, mode (only 'dual_system' is accepted), resize_w,
+        resize_h, num_history and plan_step_gap. Raises ValueError for any other mode.
+        """
+        self.device = torch.device(args.device)
+        self.save_dir = "test_data/" + datetime.now().strftime("%Y%m%d_%H%M%S")
+        print(f"args.model_path{args.model_path}")
+
+        if args.mode != 'dual_system':
+            raise ValueError(f"Invalid mode: {args.mode}")
+        # Load straight onto the requested device rather than a hard-coded "cuda".
+        self.model = InternVLAN1ForCausalLM.from_pretrained(
+            args.model_path,
+            torch_dtype=torch.bfloat16,
+            attn_implementation="flash_attention_2",
+            device_map={"": self.device},
+        )
+        self.model.eval()
+        self.model.to(self.device)
+
+        self.processor = AutoProcessor.from_pretrained(args.model_path)
+        self.processor.tokenizer.padding_side = 'left'
+
+        self.resize_w = args.resize_w
+        self.resize_h = args.resize_h
+        self.num_history = args.num_history
+        self.PLAN_STEP_GAP = args.plan_step_gap
+
+        prompt = "You are an autonomous navigation assistant. Your task is to <instruction>. Where should you go next to stay on track? Please output the next waypoint's coordinates in the image. Please output STOP when you have successfully completed the task."
+        answer = ""
+        self.conversation = [{"from": "human", "value": prompt}, {"from": "gpt", "value": answer}]
+        self.conjunctions = [
+            'you can see ',
+            'in front of you is ',
+            'there is ',
+            'you can spot ',
+            'you are toward the ',
+            'ahead of you is ',
+            'in your sight is ',
+        ]
+
+        self.actions2idx = OrderedDict(
+            {
+                'STOP': [0],
+                "↑": [1],
+                "←": [2],
+                "→": [3],
+                "↓": [5],
+            }
+        )
+
+        self._reset_episode_state()
+
+    def _reset_episode_state(self):
+        """Clear every per-episode buffer and cached model output."""
+        self.rgb_list = []
+        self.depth_list = []
+        self.pose_list = []
+        self.episode_idx = 0
+        self.conversation_history = []
+        self.input_images = []
+        self.llm_output = ""
+        self.past_key_values = None
+        self.last_s2_idx = -100
+        self.output_action = None
+        self.output_latent = None
+        self.output_pixel = None
+        self.pixel_goal_rgb = None
+        self.pixel_goal_depth = None
+
+    def reset(self):
+        """Start a new episode: clear the state and create a fresh timestamped debug directory."""
+        self._reset_episode_state()
+        self.save_dir = "test_data/" + datetime.now().strftime("%Y%m%d_%H%M%S")
+        os.makedirs(self.save_dir, exist_ok=True)
+
+    def _debug_path(self, filename):
+        """Return ``save_dir/filename``, creating the directory in case ``reset()`` was never called."""
+        os.makedirs(self.save_dir, exist_ok=True)
+        return f"{self.save_dir}/{filename}"
+
+    def parse_actions(self, output):
+        """Return the flat list of action ids for each action token found in ``output``, in order."""
+        action_patterns = '|'.join(re.escape(action) for action in self.actions2idx)
+        matches = re.findall(action_patterns, output)
+        return list(itertools.chain.from_iterable(self.actions2idx[match] for match in matches))
+
+    def step_no_infer(self, rgb, depth, pose):
+        """Record the resized RGB frame without running the model; ``depth`` and ``pose`` are unused."""
+        image = Image.fromarray(rgb).convert('RGB')
+        image = image.resize((self.resize_w, self.resize_h))
+        self.rgb_list.append(image)
+        image.save(self._debug_path(f"debug_raw_{self.episode_idx:04d}.jpg"))
+        self.episode_idx += 1
+
+    def trajectory_tovw(self, trajectory, kp=1.0):
+        """Turn the last waypoint into (linear, angular) velocity, clipped to [0, 0.5] and [-0.5, 0.5]."""
+        subgoal = trajectory[-1]
+        linear_vel, angular_vel = kp * np.linalg.norm(subgoal[:2]), kp * subgoal[2]
+        linear_vel = np.clip(linear_vel, 0, 0.5)
+        angular_vel = np.clip(angular_vel, -0.5, 0.5)
+        return linear_vel, angular_vel
+
+    def step(self, rgb, depth, pose, instruction, intrinsic, look_down=False):
+        """
+        Advance one frame and return an ``S2Output`` carrying discrete actions or a trajectory.
+
+        ``step_s2`` runs when more than PLAN_STEP_GAP frames passed since its last call, when
+        ``look_down`` is set, or when nothing is cached; otherwise the frame is only recorded.
+        """
+        dual_sys_output = S2Output()
+        no_output_flag = self.output_action is None and self.output_latent is None
+        if (self.episode_idx - self.last_s2_idx > self.PLAN_STEP_GAP) or look_down or no_output_flag:
+            self.output_action, self.output_latent, self.output_pixel = self.step_s2(
+                rgb, depth, pose, instruction, intrinsic, look_down
+            )
+            self.last_s2_idx = self.episode_idx
+            dual_sys_output.output_pixel = self.output_pixel
+            self.pixel_goal_rgb = copy.deepcopy(rgb)
+            self.pixel_goal_depth = copy.deepcopy(depth)
+        else:
+            self.step_no_infer(rgb, depth, pose)
+
+        if self.output_action is not None:
+            dual_sys_output.output_action = copy.deepcopy(self.output_action)
+            self.output_action = None
+        elif self.output_latent is not None:
+            processed_pixel_rgb = np.array(Image.fromarray(self.pixel_goal_rgb).resize((224, 224))) / 255
+            processed_pixel_depth = np.array(Image.fromarray(self.pixel_goal_depth).resize((224, 224)))
+            processed_rgb = np.array(Image.fromarray(rgb).resize((224, 224))) / 255
+            processed_depth = np.array(Image.fromarray(depth).resize((224, 224)))
+            rgb_pair = [torch.from_numpy(processed_pixel_rgb), torch.from_numpy(processed_rgb)]
+            depth_pair = [torch.from_numpy(processed_pixel_depth), torch.from_numpy(processed_depth)]
+            rgbs = torch.stack(rgb_pair).unsqueeze(0).to(self.device)
+            depths = torch.stack(depth_pair).unsqueeze(0).unsqueeze(-1).to(self.device)
+            trajectories = self.step_s1(self.output_latent, rgbs, depths)
+            dual_sys_output.output_trajectory = traj_to_actions(trajectories, use_discrate_action=False)
+
+        return dual_sys_output
+
+    def step_s2(self, rgb, depth, pose, instruction, intrinsic, look_down=False):
+        """
+        Query the model with the current frame and return ``(action_seq, traj_latents, pixel_goal)``.
+
+        A reply with at least two numbers yields ``(None, latents, pixel_goal)``; any other reply
+        is parsed as discrete actions and yields ``(action_seq, None, None)``.
+        """
+        image = Image.fromarray(rgb).convert('RGB')
+        if not look_down:
+            image = image.resize((self.resize_w, self.resize_h))
+            self.rgb_list.append(image)
+            image.save(self._debug_path(f"debug_raw_{self.episode_idx:04d}.jpg"))
+            self.conversation_history = []
+            self.past_key_values = None
+
+            sources = copy.deepcopy(self.conversation)
+            sources[0]["value"] = sources[0]["value"].replace('<instruction>.', instruction)
+            cur_images = self.rgb_list[-1:]
+            if self.episode_idx == 0:
+                history_id = []
+            else:
+                history_id = np.unique(np.linspace(0, self.episode_idx - 1, self.num_history, dtype=np.int32)).tolist()
+                placeholder = (DEFAULT_IMAGE_TOKEN + '\n') * len(history_id)
+                sources[0]["value"] += f' These are your historical observations: {placeholder}.'
+
+            history_id = sorted(history_id)
+            self.input_images = [self.rgb_list[i] for i in history_id] + cur_images
+            input_img_id = 0
+            self.episode_idx += 1
+        else:
+            image.save(self._debug_path(f"debug_raw_{self.episode_idx:04d}_look_down.jpg"))
+            assert self.llm_output != "", "Last llm_output should not be empty when look down"
+            self.input_images.append(image)
+            input_img_id = -1
+            sources = [{"from": "human", "value": ""}, {"from": "gpt", "value": ""}]
+            self.conversation_history.append(
+                {'role': 'assistant', 'content': [{'type': 'text', 'text': self.llm_output}]}
+            )
+
+        prompt = self.conjunctions[0] + DEFAULT_IMAGE_TOKEN
+        sources[0]["value"] += f" {prompt}."
+        prompt_instruction = copy.deepcopy(sources[0]["value"])
+        parts = split_and_clean(prompt_instruction)
+
+        content = []
+        for part in parts:
+            if part == "<image>":
+                content.append({"type": "image", "image": self.input_images[input_img_id]})
+                input_img_id += 1
+            else:
+                content.append({"type": "text", "text": part})
+
+        self.conversation_history.append({'role': 'user', 'content': content})
+
+        text = self.processor.apply_chat_template(self.conversation_history, tokenize=False, add_generation_prompt=True)
+        inputs = self.processor(text=[text], images=self.input_images, return_tensors="pt").to(self.device)
+        t0 = time.time()
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=128,
+                do_sample=False,
+                use_cache=True,
+                past_key_values=self.past_key_values,
+                return_dict_in_generate=True,
+                raw_input_ids=copy.deepcopy(inputs.input_ids),
+            )
+        output_ids = outputs.sequences
+
+        t1 = time.time()
+        self.llm_output = self.processor.tokenizer.decode(
+            output_ids[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
+        )
+        with open(self._debug_path(f"llm_output_{self.episode_idx:04d}.txt"), 'w') as f:
+            f.write(self.llm_output)
+        self.last_output_ids = copy.deepcopy(output_ids[0])
+        self.past_key_values = copy.deepcopy(outputs.past_key_values)
+        print(f"output {self.episode_idx}  {self.llm_output} cost: {t1 - t0}s")
+
+        coord = [int(c) for c in re.findall(r'\d+', self.llm_output)]
+        if len(coord) < 2:
+            # Fewer than two numbers cannot form a pixel goal; read the reply as discrete actions.
+            return self.parse_actions(self.llm_output), None, None
+        pixel_goal = [coord[1], coord[0]]
+        image_grid_thw = torch.cat([thw.unsqueeze(0) for thw in inputs.image_grid_thw], dim=0)
+        with torch.no_grad():
+            traj_latents = self.model.generate_latents(output_ids, inputs.pixel_values, image_grid_thw)
+        return None, traj_latents, pixel_goal
+
+    def step_s1(self, latent, rgb, depth):
+        """Call ``model.generate_traj`` (async mode) on the latent and the stacked RGB/depth tensors."""
+        return self.model.generate_traj(latent, rgb, depth, use_async=True)
diff --git a/internnav/internnav_habitat/habitat_vln_evaluator.py b/internnav/internnav_habitat/habitat_vln_evaluator.py
new file mode 100644
index 00000000..99e8fdc1
--- /dev/null
+++ b/internnav/internnav_habitat/habitat_vln_evaluator.py
@@ -0,0 +1,842 @@
+import argparse
+import json
+import os
+import sys
+
+sys.path.append('./src/diffusion-policy')
+import copy
+import itertools
+import random
+import re
+from collections import OrderedDict
+
+import habitat
+import numpy as np
+import quaternion
+import torch
+import tqdm
+from depth_camera_filtering import filter_depth
+from PIL import Image, ImageDraw, ImageFont
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+from transformers.image_utils import to_numpy_array
+
+# Import for Habitat registry side effects — do not remove
+import internnav.env.utils.habitat_extensions.measures  # noqa: F401
+from internnav.configs.evaluator import EvalCfg
+from internnav.evaluator.base import Evaluator
+from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM
+from internnav.model.utils.vln_utils import (
+    chunk_token,
+    open_image,
+    split_and_clean,
+    traj_to_actions,
+)
+from internnav.utils.dist import dist, get_rank, get_world_size, init_distributed_mode
+
+try:
+    from habitat import Env
+    from habitat.config.default import get_agent_config
+    from habitat.config.default_structured_configs import (
+        CollisionsMeasurementConfig,
+        FogOfWarConfig,
+        TopDownMapMeasurementConfig,
+    )
+    from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower
+    from habitat.utils.visualizations.utils import (
+        images_to_video,
+        observations_to_image,
+    )
+    from habitat_baselines.config.default import get_config as get_habitat_config
+except Exception as e:
+    print("Habitat Error:", e)
+    print("Habitat Evaluation is not loaded.")
+
+
+DEFAULT_IMAGE_TOKEN = "<image>"
+
+
+@Evaluator.register('habitat_vln')
+class HabitatVlnEvaluator(Evaluator):
+    def __init__(self, cfg: EvalCfg):
+        """
+        Set up distributed state, the habitat config, the model and the prompt templates.
+
+        Reads ``cfg.eval_settings`` and ``cfg.env.env_settings['config_path']``; writes 'idx',
+        'world_size' and 'habitat_config' back into ``cfg.env.env_settings``.
+        Raises ValueError if ``mode`` is neither 'dual_system' nor 'system2'.
+        """
+        args = argparse.Namespace(**cfg.eval_settings)
+        self.args = args
+        self.save_video = args.save_video
+
+        # distributed setting
+        import socket
+
+        print(
+            f"Rank {os.getenv('RANK')} / {os.getenv('WORLD_SIZE')} on {socket.gethostname()}:{os.getenv('MASTER_PORT')}"
+        )
+        init_distributed_mode(args)
+        local_rank = args.local_rank
+        # Seed the stdlib RNG too: eval_action picks its prompt conjunction with random.choice.
+        np.random.seed(local_rank)
+        random.seed(local_rank)
+        cfg.env.env_settings['idx'] = get_rank()
+        cfg.env.env_settings['world_size'] = get_world_size()
+
+        self.world_size = get_world_size()
+        self.output_path = args.output_path  # TODO: modify by rank
+        self.epoch = 0
+
+        # create habitat config
+        self.config_path = cfg.env.env_settings['config_path']
+        self.config = get_habitat_config(self.config_path)
+        self.agent_config = get_agent_config(self.config.habitat.simulator)
+        self.sim_sensors_config = self.config.habitat.simulator.agents.main_agent.sim_sensors
+
+        with habitat.config.read_write(self.config):
+            self.config.habitat.task.measurements.update(
+                {
+                    "top_down_map": TopDownMapMeasurementConfig(
+                        map_padding=3,
+                        map_resolution=1024,
+                        draw_source=True,
+                        draw_border=True,
+                        draw_shortest_path=True,
+                        draw_view_points=True,
+                        draw_goal_positions=True,
+                        draw_goal_aabbs=True,
+                        fog_of_war=FogOfWarConfig(
+                            draw=True,
+                            visibility_dist=5.0,
+                            fov=90,
+                        ),
+                    ),
+                    "collisions": CollisionsMeasurementConfig(),
+                }
+            )
+        cfg.env.env_settings['habitat_config'] = self.config
+
+        # init agent and env
+        # super().__init__(cfg)
+
+        # ------------------------------------- model ------------------------------------------
+        processor = AutoProcessor.from_pretrained(args.model_path)
+        processor.tokenizer.padding_side = 'left'
+
+        device = torch.device(f"cuda:{local_rank}")
+        model_classes = {'dual_system': InternVLAN1ForCausalLM, 'system2': Qwen2_5_VLForConditionalGeneration}
+        if args.mode not in model_classes:
+            raise ValueError(f"Invalid mode: {args.mode}")
+        model = model_classes[args.mode].from_pretrained(
+            args.model_path,
+            torch_dtype=torch.bfloat16,
+            attn_implementation="flash_attention_2",
+            device_map={"": device},
+        )
+        model.eval()
+        self.device = device
+
+        # ------------------------------------- old ------------------------------------------
+        self._camera_height = self.sim_sensors_config.rgb_sensor.position[1]
+        self._min_depth = self.sim_sensors_config.depth_sensor.min_depth
+        self._max_depth = self.sim_sensors_config.depth_sensor.max_depth
+
+        camera_fov_rad = np.deg2rad(self.sim_sensors_config.depth_sensor.hfov)
+        self._camera_fov = camera_fov_rad
+        self._fx = self._fy = self.sim_sensors_config.depth_sensor.width / (2 * np.tan(camera_fov_rad / 2))
+
+        self.model = model
+        self.processor = processor
+
+        # refactor: this part used in three places
+        prompt = "You are an autonomous navigation assistant. Your task is to <instruction>. Where should you go next to stay on track? Please output the next waypoint\'s coordinates in the image. Please output STOP when you have successfully completed the task."
+        answer = ""
+        self.conversation = [{"from": "human", "value": prompt}, {"from": "gpt", "value": answer}]
+
+        self.conjunctions = [
+            'you can see ',
+            'in front of you is ',
+            'there is ',
+            'you can spot ',
+            'you are toward the ',
+            'ahead of you is ',
+            'in your sight is ',
+        ]
+
+        self.actions2idx = OrderedDict(
+            {
+                'STOP': [0],
+                "↑": [1],
+                "←": [2],
+                "→": [3],
+                "↓": [5],
+            }
+        )
+
+        self.objectnav_instructions = ["Search for the {target_object}."]
+
+        self.num_frames = args.num_frames
+        self.num_future_steps = args.num_future_steps
+        self.num_history = args.num_history
+        # ------------------------------------- remove ------------------------------------------
+
+    def eval(self):
+        """
+        All-gather per-episode metrics, print their means; rank 0 appends them to result.json as a JSON line.
+        """
+        sucs, spls, oss, nes, ep_num = self.eval_action(self.args.local_rank)
+        world_size = get_world_size()
+        ep_num_all = [torch.zeros_like(ep_num) for _ in range(world_size)]
+        dist.all_gather(ep_num_all, ep_num)
+        sucs_all = [torch.zeros(ep_num_all[i], dtype=sucs.dtype).to(sucs.device) for i in range(world_size)]
+        spls_all = [torch.zeros(ep_num_all[i], dtype=spls.dtype).to(spls.device) for i in range(world_size)]
+        oss_all = [torch.zeros(ep_num_all[i], dtype=oss.dtype).to(oss.device) for i in range(world_size)]
+        nes_all = [torch.zeros(ep_num_all[i], dtype=nes.dtype).to(nes.device) for i in range(world_size)]
+        dist.barrier()
+        dist.all_gather(sucs_all, sucs)
+        dist.all_gather(spls_all, spls)
+        dist.all_gather(oss_all, oss)
+        dist.all_gather(nes_all, nes)
+
+        sucs_all = torch.cat(sucs_all, dim=0)
+        spls_all = torch.cat(spls_all, dim=0)
+        oss_all = torch.cat(oss_all, dim=0)
+        nes_all = torch.cat(nes_all, dim=0)
+        result_all = {
+            "sucs_all": (sum(sucs_all) / len(sucs_all)).item(),
+            "spls_all": (sum(spls_all) / len(spls_all)).item(),
+            "oss_all": (sum(oss_all) / len(oss_all)).item(),
+            "nes_all": (sum(nes_all) / len(nes_all)).item(),
+            'length': len(sucs_all),
+        }
+        print(result_all)
+        if get_rank() == 0:
+            with open(os.path.join(self.args.output_path, 'result.json'), 'a') as f:
+                f.write(json.dumps(result_all) + '\n')  # newline keeps the file parseable line by line
+
+    def _eval_action(self):
+        """Run episodes while ``env.is_running`` (a bool attribute on HabitatEnv, cleared by ``reset()``) is True."""
+        obs, action = self.env.reset(), self.agent.reset()
+        while self.env.is_running:
+            action = self.agent.step(action, obs)
+            obs, terminated = self.env.step(action)
+            if terminated:
+                obs = self.env.reset()
+                self.agent.reset()
+                self.env.update_metric()
+
+    # refactor
+    def config_env(self) -> Env:
+        """Build a ``habitat.Env`` from ``self.config``."""
+        # Debug aid: to run a single episode, assign env.episodes = env.episodes[0:1] before returning.
+        return Env(config=self.config)
+
+    def eval_action(self, idx=0) -> None:  # noqa: C901
+        self.model.eval()
+        env = self.config_env()
+        scene_episode_dict = {}
+        for episode in env.episodes:
+            if episode.scene_id not in scene_episode_dict:
+                scene_episode_dict[episode.scene_id] = []
+            scene_episode_dict[episode.scene_id].append(episode)
+
+        intrinsic_matrix = self.get_intrinsic_matrix(
+            self.config.habitat.simulator.agents.main_agent.sim_sensors.rgb_sensor
+        )
+        sucs, spls, oss, nes = [], [], [], []
+        done_res = []
+
+        if os.path.exists(os.path.join(self.output_path, 'result.json')):
+            with open(os.path.join(self.output_path, 'result.json'), 'r') as f:
+                for line in f.readlines():
+                    res = json.loads(line)
+                    done_res.append([res["scene_id"], res["episode_id"], res["episode_instruction"]])
+                    if get_rank() == 0:  # noqa: F405 TODO this need to keep in evaluator
+                        sucs.append(res['success'])
+                        spls.append(res['spl'])
+                        oss.append(res['os'])
+                        nes.append(res['ne'])
+
+        # refactor: sort to scene: [episode] but nothing actually used
+        for scene in sorted(scene_episode_dict.keys()):
+            episodes = scene_episode_dict[scene]
+            scene_id = scene.split('/')[-2]
+            print(f"scene_id = {scene_id}")
+            process_bar = tqdm.tqdm(range(len(episodes[idx :: self.world_size])), desc=f"scene {scene_id}")
+            for episode in episodes[idx :: self.world_size]:
+                episode_instruction = (
+                    episode.instruction.instruction_text
+                    if 'objectnav' not in self.config_path
+                    else episode.object_category
+                )
+                print("episode start", episode_instruction)
+                episode_id = int(episode.episode_id)
+                if [scene_id, episode_id, episode_instruction] in done_res:
+                    continue
+
+                # refactor env warm up
+                env.current_episode = episode
+                observations = env.reset()
+
+                agent_state = env.sim.get_agent_state()
+                rotation = agent_state.rotation
+                translation = agent_state.position
+                rotation_matrix = quaternion.as_rotation_matrix(rotation)
+                transformation_matrix = np.eye(4)
+                transformation_matrix[:3, :3] = rotation_matrix
+                transformation_matrix[:3, 3] = translation
+
+                agent = ShortestPathFollower(env.sim, 0.25, False)
+
+                os.makedirs(os.path.join(self.output_path, f'check_sim_{self.epoch}'), exist_ok=True)
+                Image.fromarray(observations['rgb']).save(
+                    os.path.join(self.output_path, f'check_sim_{self.epoch}', f'rgb_{idx}.jpg')
+                )
+
+                vis_frames = []
+                step_id = 0
+
+                if self.save_video:
+                    os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
+                initial_height = env.sim.get_agent_state().position[1]
+
+                rgb_list = []
+                action_seq = []
+                output_ids = None
+
+                goal = None
+                action = None
+                messages = []
+                local_actions = []
+
+                while not env.episode_over and step_id <= 500:
+                    # refactor agent get action
+                    rgb = observations["rgb"]
+                    depth = observations["depth"]
+                    x, y = observations["gps"]
+                    camera_yaw = observations["compass"][0]
+                    depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None)
+                    depth = depth * (self._max_depth - self._min_depth) + self._min_depth
+                    depth = depth * 1000
+
+                    agent_state = env.sim.get_agent_state()
+                    height = agent_state.position[1] - initial_height
+                    camera_position = np.array([x, -y, self._camera_height + height])
+                    tf_camera_to_episodic = (
+                        self.xyz_yaw_pitch_to_tf_matrix(camera_position, camera_yaw, np.deg2rad(30))
+                        @ self.get_axis_align_matrix()
+                    )
+
+                    image = Image.fromarray(rgb).convert('RGB')
+                    save_raw_image = image.copy()
+
+                    save_dot = False
+                    if action == 5:
+                        look_down_image = image
+                        save_raw_image = look_down_image.copy()
+                        look_down_depth, resize_shape = self.preprocess_depth_image_v2(
+                            Image.fromarray(depth.astype(np.uint16), mode='I;16'),
+                            do_depth_scale=True,
+                            depth_scale=1000,
+                            target_height=224,
+                            target_width=224,
+                        )
+                        look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float()
+                        look_down_depth[look_down_depth > 5.0] = 5.0
+                    else:
+                        image = image.resize((self.args.resize_w, self.args.resize_h))
+                        rgb_list.append(image)
+
+                        if self.args.mode == 'dual_system':
+                            down_observations = env.step(5)
+                            down_observations = env.step(5)
+
+                            look_down_image = Image.fromarray(down_observations["rgb"]).convert('RGB')
+                            depth = down_observations["depth"]
+                            depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None)
+                            depth = depth * (self._max_depth - self._min_depth) + self._min_depth
+                            depth = depth * 1000
+                            look_down_depth, resize_shape = self.preprocess_depth_image_v2(
+                                Image.fromarray(depth.astype(np.uint16), mode='I;16'),
+                                do_depth_scale=True,
+                                depth_scale=1000,
+                                target_height=224,
+                                target_width=224,
+                            )
+                            look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float()
+                            look_down_depth[look_down_depth > 5.0] = 5.0
+
+                            env.step(4)
+                            env.step(4)
+
+                    info = env.get_metrics()
+
+                    if len(action_seq) == 0 and goal is None:
+                        if action != 5:
+                            sources = copy.deepcopy(self.conversation)
+                            sources[0]["value"] = sources[0]["value"].replace(
+                                '<instruction>.', episode.instruction.instruction_text[:-1]
+                            )
+                            cur_images = rgb_list[-1:]
+                            if step_id == 0:
+                                history_id = []
+                            else:
+                                history_id = np.unique(
+                                    np.linspace(0, step_id - 1, self.num_history, dtype=np.int32)
+                                ).tolist()
+                                placeholder = (DEFAULT_IMAGE_TOKEN + '\n') * len(history_id)
+                                sources[0]["value"] += f' These are your historical observations: {placeholder}.'
+
+                            history_id = sorted(history_id)
+                            print('history_idddddddd', step_id, history_id)
+                            input_images = [rgb_list[i] for i in history_id] + cur_images
+                            input_img_id = 0
+                        else:
+                            assert action == 5
+                            sources = [{"from": "human", "value": ""}, {"from": "gpt", "value": ""}]
+                            input_images += [look_down_image]
+                            # messages.append(
+                            #     {'role': 'assistant', 'content': [{'type': 'text', 'text': llm_outputs}]}  # noqa: F405
+                            # )
+                            input_img_id = -1
+
+                        prompt = random.choice(self.conjunctions) + DEFAULT_IMAGE_TOKEN
+                        sources[0]["value"] += f" {prompt}."
+                        print('sources', step_id, sources)
+                        prompt_instruction = copy.deepcopy(sources[0]["value"])
+                        parts = split_and_clean(prompt_instruction)
+
+                        content = []
+                        for i in range(len(parts)):
+                            if parts[i] == "<image>":
+                                content.append({"type": "image", "image": input_images[input_img_id]})
+                                input_img_id += 1
+                            else:
+                                content.append({"type": "text", "text": parts[i]})
+
+                        messages.append({'role': 'user', 'content': content})
+
+                        print('step_id', step_id, 'messages:', messages)
+
+                        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+                        inputs = self.processor(text=[text], images=input_images, return_tensors="pt").to(
+                            self.model.device
+                        )
+
+                        with torch.no_grad():
+                            output_ids = self.model.generate(**inputs, max_new_tokens=128, do_sample=False)
+
+                        llm_outputs = self.processor.tokenizer.decode(
+                            output_ids[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
+                        )
+                        print('step_id:', step_id, 'output text:', llm_outputs)
+
+                        if bool(re.search(r'\d', llm_outputs)):
+                            forward_action = 0
+                            coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]
+                            pixel_goal = [int(coord[1]), int(coord[0])]
+
+                            goal = self.pixel_to_gps(pixel_goal, depth / 1000, intrinsic_matrix, tf_camera_to_episodic)
+                            print('before', goal, depth.shape)
+                            goal = (transformation_matrix @ np.array([-goal[1], 0, -goal[0], 1]))[:3]
+
+                            if not env.sim.pathfinder.is_navigable(np.array(goal)):
+                                goal = np.array(env.sim.pathfinder.snap_point(np.array(goal)))
+
+                            # look down --> horizontal
+                            env.step(4)
+                            env.step(4)
+
+                            # Forking logic based on mode
+                            if self.args.mode == 'system2':
+                                action = agent.get_next_action(goal)
+                                if action == 0:
+                                    goal = None
+                                    output_ids = None
+                                    action = 2  # random action
+                                    print('conduct a random action 2')
+                                    observations = env.step(action)
+                                    step_id += 1
+                                    messages = []
+                                    continue
+                            else:  # dual-system logic
+                                local_actions = []
+                                pixel_values = inputs.pixel_values
+                                image_grid_thw = torch.cat([thw.unsqueeze(0) for thw in inputs.image_grid_thw], dim=0)
+
+                                with torch.no_grad():
+                                    traj_latents = self.model.generate_latents(output_ids, pixel_values, image_grid_thw)
+
+                                # prepocess align with navdp
+                                image_dp = (
+                                    torch.tensor(np.array(look_down_image.resize((224, 224)))).to(torch.bfloat16) / 255
+                                )
+                                pix_goal_image = copy.copy(image_dp)
+                                images_dp = torch.stack([pix_goal_image, image_dp]).unsqueeze(0).to(self.device)
+                                depth_dp = look_down_depth.unsqueeze(-1).to(torch.bfloat16)
+                                pix_goal_depth = copy.copy(depth_dp)
+                                depths_dp = torch.stack([pix_goal_depth, depth_dp]).unsqueeze(0).to(self.device)
+
+                                with torch.no_grad():
+                                    dp_actions = self.model.generate_traj(
+                                        traj_latents, images_dp, depths_dp, use_async=True
+                                    )
+
+                                random_choice = np.random.choice(dp_actions.shape[0])
+                                if self.args.continuous_traj:
+                                    action_list = traj_to_actions(dp_actions)
+                                    if len(action_list) < 8:
+                                        action_list += [0] * (8 - len(action_list))
+                                else:
+                                    action_list = chunk_token(dp_actions[random_choice])
+
+                                local_actions = action_list
+                                if len(local_actions) >= 4:
+                                    local_actions = local_actions[:4]
+                                action = local_actions[0]
+                                if action == 0:
+                                    goal = None
+                                    output_ids = None
+                                    action = 2  # random action
+                                    print('conduct a random action 2')
+                                    observations = env.step(action)
+                                    step_id += 1
+                                    messages = []
+                                    continue
+
+                            print('predicted goal', pixel_goal, goal, flush=True)
+                        else:
+                            action_seq = self.parse_actions(llm_outputs)
+                            print('actions', action_seq, flush=True)
+
+                    if len(action_seq) != 0:
+                        action = action_seq[0]
+                        action_seq.pop(0)
+                    elif goal is not None:
+                        # Forking logic based on mode
+                        if self.args.mode == 'system2':
+                            action = agent.get_next_action(goal)
+                            action = action.detach().cpu().numpy()[0] if isinstance(action, torch.Tensor) else action
+                            action = action[0] if hasattr(action, "__len__") else action
+                        else:  # dual-system logic
+                            if len(local_actions) == 0:
+                                # navdp
+                                local_actions = []
+                                image_dp = (
+                                    torch.tensor(np.array(look_down_image.resize((224, 224)))).to(torch.bfloat16) / 255
+                                )
+
+                                images_dp = torch.stack([pix_goal_image, image_dp]).unsqueeze(0).to(self.device)
+                                depth_dp = look_down_depth.unsqueeze(-1).to(torch.bfloat16)
+
+                                depths_dp = torch.stack([pix_goal_depth, depth_dp]).unsqueeze(0).to(self.device)
+                                with torch.no_grad():
+                                    dp_actions = self.model.generate_traj(
+                                        traj_latents, images_dp, depths_dp, use_async=True
+                                    )
+
+                                random_choice = np.random.choice(dp_actions.shape[0])
+                                if self.args.continuous_traj:
+                                    action_list = traj_to_actions(dp_actions)
+                                    if len(action_list) < 8:
+                                        action_list += [0] * (8 - len(action_list))
+                                else:
+                                    action_list = chunk_token(dp_actions[random_choice])
+                                print("first action_list", action_list)
+
+                                local_actions = action_list
+                                if len(local_actions) >= 4:
+                                    local_actions = local_actions[:4]
+                                # if len(local_actions) >= 2:
+                                #     local_actions = local_actions[:2]
+
+                                print("local_actions", local_actions)
+
+                                action = local_actions.pop(0)
+                                # navdp
+                            else:
+                                action = local_actions.pop(0)
+
+                        forward_action += 1
+                        print('forward_action', forward_action, flush=True)
+                        if forward_action > 8:
+                            goal = None
+                            output_ids = None
+                            messages = []
+                            step_id += 1
+                            forward_action = 0
+                            local_actions = []
+                            continue
+                        if action == 0:
+                            goal = None
+                            output_ids = None
+                            messages = []
+                            step_id += 1
+                            forward_action = 0
+                            local_actions = []
+                            continue
+                    else:
+                        action = 0
+
+                    if info['top_down_map'] is not None:
+                        if save_dot:
+                            save_raw_image = self.dot_matrix_two_dimensional(
+                                save_raw_image, save_img=False, save_path=f'test_{step_id}.jpg', pixel_goal=pixel_goal
+                            )
+                        frame = observations_to_image({'rgb': np.asarray(save_raw_image)}, info)
+                        vis_frames.append(frame)
+
+                    print("step_id", step_id, "action", action)
+
+                    # refactor: core
+                    if action == 5:
+                        env.step(action)
+                        observations = env.step(action)
+                    else:
+                        observations = env.step(action)
+                        step_id += 1
+                        messages = []
+
+                process_bar.update(1)
+
+                metrics = env.get_metrics()
+                if self.save_video:
+                    images_to_video(
+                        vis_frames,
+                        os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'),
+                        f'{episode_id:04d}',
+                        fps=6,
+                        quality=9,
+                    )
+                vis_frames.clear()
+                sucs.append(metrics['success'])
+                spls.append(metrics['spl'])
+                oss.append(metrics['oracle_success'])
+                nes.append(metrics["distance_to_goal"])
+                print(
+                    f"scene_episode {scene_id}_{episode_id:04d} success: {metrics['success']}, spl: {metrics['spl']}, os: {metrics['oracle_success']}, ne: {metrics['distance_to_goal']}"
+                )
+
+                result = {
+                    "scene_id": scene_id,
+                    "episode_id": episode_id,
+                    "success": metrics["success"],
+                    "spl": metrics["spl"],
+                    "os": metrics['oracle_success'],
+                    "ne": metrics["distance_to_goal"],
+                    "steps": step_id,
+                    "episode_instruction": episode_instruction,
+                }
+
+                with open(os.path.join(self.output_path, 'result.json'), 'a') as f:
+                    f.write(json.dumps(result) + "\n")
+        env.close()
+        return (
+            torch.tensor(sucs).to(self.device),
+            torch.tensor(spls).to(self.device),
+            torch.tensor(oss).to(self.device),
+            torch.tensor(nes).to(self.device),
+            torch.tensor(len(sucs)).to(self.device),
+        )
+
+    def parse_actions(self, output):
+        """Return the flat list of action ids for every known action name found in ``output``, in text order."""
+        # One alternation over the escaped names; dict order decides which alternative the regex tries first.
+        name_regex = re.compile('|'.join(re.escape(name) for name in self.actions2idx))
+        flat_ids = []
+        for hit in name_regex.findall(output):
+            flat_ids.extend(self.actions2idx[hit])  # each name maps to an iterable of ids
+        return flat_ids
+
+    def preprocess_depth_image_v2(
+        self, depth_image, do_depth_scale=True, depth_scale=1000, target_height=None, target_width=None
+    ):
+        """Nearest-resize a depth image; return ``(array, (width, height))``, divided by ``depth_scale`` if requested."""
+        # Default each dimension on its own so that passing only one of them neither drops nor overwrites the other.
+        if target_height is None:
+            target_height = self.image_processor.crop_size['height']  # 384
+        if target_width is None:
+            target_width = self.image_processor.crop_size['width']  # 384
+        # NEAREST picks existing samples, so no depth values are blended across object edges.
+        img = to_numpy_array(depth_image.resize((target_width, target_height), Image.NEAREST))
+        if do_depth_scale:
+            img = img / depth_scale
+        return img, (target_width, target_height)
+
+    def get_intrinsic_matrix(self, sensor_cfg) -> np.ndarray:
+        """Build the 4x4 pinhole intrinsic matrix from ``sensor_cfg.width``, ``.height`` and ``.hfov`` (degrees)."""
+        half_fov = np.deg2rad(sensor_cfg.hfov / 2.0)
+        # A single focal length serves both axes (square pixels); it is derived from the horizontal FOV.
+        focal = (sensor_cfg.width / 2.0) / np.tan(half_fov)
+        center_u = (sensor_cfg.width - 1.0) / 2.0
+        center_v = (sensor_cfg.height - 1.0) / 2.0
+
+        # Start from identity so only the focal and principal-point entries need filling in.
+        camera = np.eye(4)
+        camera[0, 0] = camera[1, 1] = focal
+        camera[0, 2], camera[1, 2] = center_u, center_v
+        return camera
+
+    def get_axis_align_matrix(self):
+        """Return the fixed 4x4 axis remap (used as ``M @ v``): x' = z, y' = -x, z' = -y."""
+        return np.array([[0, 0, 1, 0], [-1, 0, 0, 0], [0, -1, 0, 0], [0, 0, 0, 1]])
+
+    def xyz_yaw_to_tf_matrix(self, xyz: np.ndarray, yaw: float) -> np.ndarray:
+        """Return the 4x4 transform that rotates by ``yaw`` (radians) about the z axis and translates by ``xyz``."""
+        tx, ty, tz = xyz
+        cos_y, sin_y = np.cos(yaw), np.sin(yaw)
+        rows = [
+            [cos_y, -sin_y, 0, tx],
+            [sin_y, cos_y, 0, ty],
+            [0, 0, 1, tz],
+            [0, 0, 0, 1],
+        ]
+        return np.array(rows)
+
+    def xyz_pitch_to_tf_matrix(self, xyz: np.ndarray, pitch: float) -> np.ndarray:
+        """Build the 4x4 transform that rotates about the y axis and translates by ``xyz``.
+
+        Args:
+            xyz (np.ndarray): The three translation components ``(x, y, z)``.
+            pitch (float): Rotation angle about the y axis, in radians.
+        Returns:
+            np.ndarray: The 4x4 homogeneous transformation matrix.
+        """
+
+        tx, ty, tz = xyz
+        cos_p, sin_p = np.cos(pitch), np.sin(pitch)
+        # Rotation about y: the y row and column stay fixed, +sin sits top-right and -sin bottom-left.
+        rows = [
+            [cos_p, 0, sin_p, tx],
+            [0, 1, 0, ty],
+            [-sin_p, 0, cos_p, tz],
+            [0, 0, 0, 1],
+        ]
+        return np.array(rows)
+
+    def xyz_yaw_pitch_to_tf_matrix(self, xyz: np.ndarray, yaw: float, pitch: float) -> np.ndarray:
+        """Compose a yaw rotation and a pitch rotation with a translation into one 4x4 transform.
+
+        Args:
+            xyz (np.ndarray): The three translation components ``(x, y, z)``.
+            yaw (float): Yaw angle in radians, handed to ``xyz_yaw_to_tf_matrix``.
+            pitch (float): Pitch angle in radians, handed to ``xyz_pitch_to_tf_matrix``.
+        Returns:
+            np.ndarray: The 4x4 matrix whose rotation block is ``R_yaw @ R_pitch``.
+        """
+        tx, ty, tz = xyz  # also rejects anything that does not unpack into exactly three components
+        # Only the 3x3 rotation blocks of the helper transforms are used; the translation is set once below.
+        yaw_rot = self.xyz_yaw_to_tf_matrix(xyz, yaw)[:3, :3]
+        pitch_rot = self.xyz_pitch_to_tf_matrix(xyz, pitch)[:3, :3]
+        combined = np.eye(4)
+        combined[:3, :3], combined[:3, 3] = yaw_rot @ pitch_rot, xyz
+        return combined
+
+    def pixel_to_gps(self, pixel, depth, intrinsic, tf_camera_to_episodic):
+        '''Back-project one image pixel through the depth map and express it in the episodic frame.
+
+        Args:
+            pixel: (2,) - [v, u] pixel coordinates, i.e. row first and column second
+            depth: (H, W) - depth image where depth[v, u] gives depth in meters
+            intrinsic: (4, 4) - camera intrinsic matrix (fx, fy, cx, cy are read from it)
+            tf_camera_to_episodic: (4, 4) - transformation from camera to episodic frame
+        Returns:
+            (x, y): the first two coordinates of the point in the episodic frame
+        '''
+        # The order is (row, column): ``pixel`` indexes ``depth`` directly, so callers must not pass [u, v].
+        v, u = pixel
+        z = depth[v, u]
+
+        # Pinhole back-projection of the pixel into the camera frame.
+        cam_x = (u - intrinsic[0, 2]) * z / intrinsic[0, 0]
+        cam_y = (v - intrinsic[1, 2]) * z / intrinsic[1, 1]
+        point_camera = np.array([cam_x, cam_y, z, 1.0])
+
+        # Transform to episodic frame, then normalise by the homogeneous coordinate.
+        point_episodic = tf_camera_to_episodic @ point_camera
+        point_episodic = point_episodic[:3] / point_episodic[3]
+
+        # same as habitat gps
+        return (point_episodic[0], point_episodic[1])
+
+    def dot_matrix_two_dimensional(
+        self,
+        image_or_image_path,
+        save_path=None,
+        dots_size_w=8,
+        dots_size_h=8,
+        save_img=False,
+        font_path='fonts/arial.ttf',
+        pixel_goal=None,
+    ):
+        """Overlay a labelled grid of dots on an image and return the annotated image.
+
+        Every dot is black or white, whichever contrasts more with the pixel beneath it, and carries a
+        ``(row,column)`` label counted from 1. Suitable for single-image tasks.
+
+        Args:
+            image_or_image_path: Whatever ``open_image`` accepts; converted to RGB when it is not already.
+            save_path: Destination used only when ``save_img`` is true.
+            dots_size_w: the number of columns of the dots matrix.
+            dots_size_h: the number of rows of the dots matrix.
+            save_img: When true, the annotated image is saved to ``save_path``.
+            font_path: TrueType font file used for the labels.
+            pixel_goal: Optional ``[y, x]`` pixel; the grid dot it rounds to is drawn in red.
+        Raises:
+            ValueError: If ``pixel_goal`` lies outside the image or rounds to a position outside the dot grid.
+        """
+        with open_image(image_or_image_path) as img:
+            if img.mode != 'RGB':
+                img = img.convert('RGB')
+            draw = ImageDraw.Draw(img, 'RGB')
+
+            width, height = img.size
+            cols_plus_one, rows_plus_one = dots_size_w + 1, dots_size_h + 1
+            cell_width, cell_height = width / cols_plus_one, height / rows_plus_one
+
+            font = ImageFont.truetype(font_path, width // 40)  # Adjust font size if needed; default == width // 40
+            circle_radius = width // 240  # Adjust dot size if needed; default == width // 240
+
+            target_i = target_j = None
+            if pixel_goal is not None:
+                y_pixel, x_pixel = pixel_goal[0], pixel_goal[1]
+                # The raw pixel has to lie inside the image ...
+                if not (0 <= x_pixel < width and 0 <= y_pixel < height):
+                    raise ValueError(f"pixel_goal {pixel_goal} exceeds image dimensions ({width}x{height})")
+
+                # ... it is then rounded to the nearest dot (dot k sits at k * cell size) ...
+                target_i = round(x_pixel / cell_width)
+                target_j = round(y_pixel / cell_height)
+
+                # ... and that dot has to be one of the drawn ones, whose indices start at 1.
+                if not (1 <= target_i <= dots_size_w and 1 <= target_j <= dots_size_h):
+                    raise ValueError(
+                        f"pixel_goal {pixel_goal} maps to grid ({target_j},{target_i}), "
+                        f"valid range is (1,1)-({dots_size_h},{dots_size_w})"
+                    )
+
+            for j in range(1, rows_plus_one):
+                for i in range(1, cols_plus_one):
+                    dot_x, dot_y = int(i * cell_width), int(j * cell_height)
+
+                    # choose the more contrasting colour from black and white, unless this is the goal dot
+                    brightness = sum(img.getpixel((dot_x, dot_y))[:3])
+                    if pixel_goal is not None and i == target_i and j == target_j:
+                        dot_color = (255, 0, 0)  # Red for target
+                    elif brightness >= 255 * 3 / 2:
+                        dot_color = (0, 0, 0)
+                    else:
+                        dot_color = (255, 255, 255)
+
+                    draw.ellipse(
+                        [(dot_x - circle_radius, dot_y - circle_radius), (dot_x + circle_radius, dot_y + circle_radius)],
+                        fill=dot_color,
+                    )
+                    # Row-major position of this dot, written just to the right of it.
+                    draw.text((dot_x + 3, dot_y), f"({j},{i})", fill=dot_color, font=font)
+
+            if save_img:
+                print(">>> dots overlaid image processed, stored in", save_path)
+                img.save(save_path)
+            return img
diff --git a/internnav/internnav_habitat/measures.py b/internnav/internnav_habitat/measures.py
new file mode 100644
index 00000000..5dddcae0
--- /dev/null
+++ b/internnav/internnav_habitat/measures.py
@@ -0,0 +1,560 @@
+from typing import Any, List, Union
+
+import numpy as np
+from habitat.core.embodied_task import EmbodiedTask, Measure
+from habitat.core.registry import registry
+from habitat.core.simulator import Simulator
+from habitat.core.utils import try_cv2_import
+from habitat.tasks.nav.nav import DistanceToGoal
+from numpy import ndarray
+
+cv2 = try_cv2_import()
+
+
+def euclidean_distance(pos_a: Union[List[float], ndarray], pos_b: Union[List[float], ndarray]) -> float:
+    return np.linalg.norm(np.subtract(pos_b, pos_a), ord=2)  # L2 length of the vector from pos_a to pos_b
+
+
+@registry.register_measure
+class PathLength(Measure):
+    """Path Length (PL)
+    PL = sum(euclidean_distance(agent_prev_position, agent_position))
+            over all agent positions, i.e. straight-line steps rather than geodesic ones.
+    """
+
+    cls_uuid: str = "path_length"
+
+    def __init__(self, sim: Simulator, *args: Any, **kwargs: Any):
+        self._sim = sim  # the simulator is the only source of agent positions used here
+        super().__init__(**kwargs)
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return self.cls_uuid
+
+    def reset_metric(self, *args: Any, **kwargs: Any):
+        self._previous_position = self._sim.get_agent_state().position  # anchor for the first step
+        self._metric = 0.0
+
+    def update_metric(self, *args: Any, **kwargs: Any):
+        current_position = self._sim.get_agent_state().position
+        self._metric += euclidean_distance(current_position, self._previous_position)
+        self._previous_position = current_position
+
+
+@registry.register_measure
+class OracleNavigationError(Measure):
+    """Oracle Navigation Error (ONE)
+    ONE = min(distance_to_goal) over every point of the agent path, where distance_to_goal is whatever the
+    DistanceToGoal measure reports at that point.
+    """
+
+    cls_uuid: str = "oracle_navigation_error"
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return self.cls_uuid
+
+    def reset_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any):
+        task.measurements.check_measure_dependencies(self.uuid, [DistanceToGoal.cls_uuid])
+        self._metric = float("inf")  # any real distance beats this sentinel
+        self.update_metric(task=task)  # count the start position as the first path point
+
+    def update_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any):
+        latest = task.measurements.measures[DistanceToGoal.cls_uuid].get_metric()
+        self._metric = latest if latest < self._metric else self._metric
+
+
+@registry.register_measure
+class OracleSuccess(Measure):
+    """Oracle Success Rate (OSR). OSR = I(distance_to_goal < success_distance at any point of the path).
+
+    The metric latches: once the DistanceToGoal measure has reported a value below the threshold it stays 1.0
+    until the next reset.
+    """
+
+    cls_uuid: str = "oracle_success"
+    # Threshold, in the units of DistanceToGoal. It is fixed here; ``config`` is stored but not consulted for it.
+    success_distance: float = 3.0
+
+    def __init__(self, *args: Any, config: Any, **kwargs: Any):
+        self._config = config
+        super().__init__()
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return self.cls_uuid
+
+    def reset_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any):
+        task.measurements.check_measure_dependencies(self.uuid, [DistanceToGoal.cls_uuid])
+        self._metric = 0.0
+        self.update_metric(task=task)  # the start position can already be a success
+
+    def update_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any):
+        d = task.measurements.measures[DistanceToGoal.cls_uuid].get_metric()
+        self._metric = float(self._metric or d < self.success_distance)
+
+
+@registry.register_measure
+class OracleSPL(Measure):
+    """OracleSPL (Oracle Success weighted by Path Length)
+    OracleSPL = the largest value the "spl" measure has reported at any update since the last reset.
+    """
+
+    cls_uuid: str = "oracle_spl"
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return self.cls_uuid
+
+    def reset_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any):
+        task.measurements.check_measure_dependencies(self.uuid, ["spl"])
+        self._metric = 0.0  # running maximum; no update is performed at reset
+
+    def update_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any):
+        latest = task.measurements.measures["spl"].get_metric()
+        self._metric = latest if latest > self._metric else self._metric
+
+
+@registry.register_measure
+class StepsTaken(Measure):
+    """Number of update_metric() calls since the last reset, kept as a float.
+    One call is expected per agent action, so this is the count of actions taken;
+    STOP counts as an action.
+    """
+
+    cls_uuid: str = "steps_taken"
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return self.cls_uuid
+
+    def reset_metric(self, *args: Any, **kwargs: Any):
+        self._metric = 0.0
+
+    def update_metric(self, *args: Any, **kwargs: Any):
+        self._metric = self._metric + 1.0
+
+
+# import gzip
+# import json
+# import pickle
+# from dtw import dtw
+# from fastdtw import fastdtw
+# from habitat.config import Config
+# from utils import maps
+# from habitat_extensions.task import RxRVLNCEDatasetV1
+# from habitat.tasks.nav.nav import DistanceToGoal, Success
+# from habitat.tasks.utils import cartesian_to_polar
+# from habitat.utils.geometry_utils import quaternion_rotate_vector
+# from habitat.utils.visualizations import fog_of_war
+# from habitat.utils.visualizations import maps as habitat_maps
+# from habitat.core.dataset import Episode
+# from habitat.core.embodied_task import Action, EmbodiedTask, Measure
+# from habitat.core.logging import logger
+
+# @registry.register_measure
+# class WaypointRewardMeasure(Measure):
+#     """A reward measure used for training VLN-CE agents via RL."""
+
+#     def __init__(
+#         self, *args: Any, sim: Simulator, config: Config, **kwargs: Any
+#     ) -> None:
+#         self._sim = sim
+#         self._slack_reward = config.slack_reward
+#         self._use_distance_scaled_slack_reward = (
+#             config.use_distance_scaled_slack_reward
+#         )
+#         self._scale_slack_on_prediction = config.scale_slack_on_prediction
+#         self._success_reward = config.success_reward
+#         self._distance_scalar = config.distance_scalar
+#         self._prev_position = None
+#         super().__init__()
+
+#     def reset_metric(
+#         self, *args: Any, task: EmbodiedTask, **kwargs: Any
+#     ) -> None:
+#         task.measurements.check_measure_dependencies(
+#             self.uuid, [DistanceToGoal.cls_uuid, Success.cls_uuid]
+#         )
+#         self._previous_distance_to_goal = task.measurements.measures[
+#             "distance_to_goal"
+#         ].get_metric()
+#         self._metric = 0.0
+#         self._prev_position = np.take(
+#             self._sim.get_agent_state().position, [0, 2]
+#         )
+
+#     def _get_scaled_slack_reward(self, action: Action) -> float:
+#         if isinstance(action["action"], int):
+#             return self._slack_reward
+
+#         if not self._use_distance_scaled_slack_reward:
+#             return self._slack_reward
+
+#         agent_pos = np.take(self._sim.get_agent_state().position, [0, 2])
+#         slack_distance = (
+#             action["action_args"]["r"]
+#             if self._scale_slack_on_prediction and action["action"] != "STOP"
+#             else np.linalg.norm(self._prev_position - agent_pos)
+#         )
+#         scaled_slack_reward = self._slack_reward * slack_distance / 0.25
+#         self._prev_position = agent_pos
+#         return min(self._slack_reward, scaled_slack_reward)
+
+#     def _progress_to_goal(self, task: EmbodiedTask) -> float:
+#         distance_to_goal = task.measurements.measures[
+#             "distance_to_goal"
+#         ].get_metric()
+#         distance_to_goal_delta = (
+#             self._previous_distance_to_goal - distance_to_goal
+#         )
+#         if np.isnan(distance_to_goal_delta) or np.isinf(
+#             distance_to_goal_delta
+#         ):
+#             l = self._sim.get_agent_state().position
+#             logger.error(
+#                 f"\nNaN or inf encountered in distance measure. agent location: {l}",
+#             )
+#             distance_to_goal_delta = -1.0
+#         self._previous_distance_to_goal = distance_to_goal
+#         return self._distance_scalar * distance_to_goal_delta
+
+#     def update_metric(
+#         self, *args: Any, action: Action, task: EmbodiedTask, **kwargs: Any
+#     ) -> None:
+#         reward = self._get_scaled_slack_reward(action)
+#         reward += self._progress_to_goal(task)
+#         reward += (
+#             self._success_reward
+#             * task.measurements.measures["success"].get_metric()
+#         )
+#         self._metric = reward
+
+#     @staticmethod
+#     def _get_uuid(*args: Any, **kwargs: Any) -> str:
+#         return "waypoint_reward_measure"
+
+
+# @registry.register_measure
+# class NDTW(Measure):
+#     """NDTW (Normalized Dynamic Time Warping)
+#     ref: https://arxiv.org/abs/1907.05446
+#     """
+
+#     cls_uuid: str = "ndtw"
+
+#     def __init__(
+#         self, *args: Any, sim: Simulator, config: Config, **kwargs: Any
+#     ):
+#         self._sim = sim
+#         self._config = config
+#         self.dtw_func = fastdtw if config.FDTW else dtw
+
+#         if "{role}" in config.GT_PATH:
+#             self.gt_json = {}
+#             for role in RxRVLNCEDatasetV1.annotation_roles:
+#                 with gzip.open(
+#                     config.GT_PATH.format(split=config.SPLIT, role=role), "rt"
+#                 ) as f:
+#                     self.gt_json.update(json.load(f))
+#         else:
+#             with gzip.open(
+#                 config.GT_PATH.format(split=config.SPLIT), "rt"
+#             ) as f:
+#                 self.gt_json = json.load(f)
+
+#         super().__init__()
+
+#     def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+#         return self.cls_uuid
+
+#     def reset_metric(self, *args: Any, episode, **kwargs: Any):
+#         self.locations = []
+#         self.gt_locations = self.gt_json[episode.episode_id]["locations"]
+#         self.update_metric()
+
+#     def update_metric(self, *args: Any, **kwargs: Any):
+#         current_position = self._sim.get_agent_state().position.tolist()
+#         if len(self.locations) == 0:
+#             self.locations.append(current_position)
+#         else:
+#             if current_position == self.locations[-1]:
+#                 return
+#             self.locations.append(current_position)
+
+#         dtw_distance = self.dtw_func(
+#             self.locations, self.gt_locations, dist=euclidean_distance
+#         )[0]
+
+#         nDTW = np.exp(
+#             -dtw_distance
+#             / (len(self.gt_locations) * self._config.SUCCESS_DISTANCE)
+#         )
+#         self._metric = nDTW
+
+
+# @registry.register_measure
+# class SDTW(Measure):
+#     """SDTW (Success weighted by nDTW)
+#     ref: https://arxiv.org/abs/1907.05446
+#     """
+
+#     cls_uuid: str = "sdtw"
+
+#     def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+#         return self.cls_uuid
+
+#     def reset_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any):
+#         task.measurements.check_measure_dependencies(
+#             self.uuid, [NDTW.cls_uuid, Success.cls_uuid]
+#         )
+#         self.update_metric(task=task)
+
+#     def update_metric(self, *args: Any, task: EmbodiedTask, **kwargs: Any):
+#         ep_success = task.measurements.measures[Success.cls_uuid].get_metric()
+#         nDTW = task.measurements.measures[NDTW.cls_uuid].get_metric()
+#         self._metric = ep_success * nDTW
+
+
+# @registry.register_measure
+# class TopDownMapVLNCE(Measure):
+#     """A top down map that optionally shows VLN-related visual information
+#     such as MP3D node locations and MP3D agent traversals.
+#     """
+
+#     cls_uuid: str = "top_down_map_vlnce"
+
+#     def __init__(
+#         self, *args: Any, sim: Simulator, config: Config, **kwargs: Any
+#     ) -> None:
+#         self._sim = sim
+#         self._config = config
+#         self._step_count = None
+#         self._map_resolution = config.MAP_RESOLUTION
+#         self._previous_xy_location = None
+#         self._top_down_map = None
+#         self._meters_per_pixel = None
+#         self.current_node = ""
+#         with open(self._config.GRAPHS_FILE, "rb") as f:
+#             self._conn_graphs = pickle.load(f)
+#         super().__init__()
+
+#     def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+#         return self.cls_uuid
+
+#     def get_original_map(self) -> ndarray:
+#         top_down_map = habitat_maps.get_topdown_map_from_sim(
+#             self._sim,
+#             map_resolution=self._map_resolution,
+#             draw_border=self._config.DRAW_BORDER,
+#             meters_per_pixel=self._meters_per_pixel,
+#         )
+
+#         self._fog_of_war_mask = None
+#         if self._config.FOG_OF_WAR.DRAW:
+#             self._fog_of_war_mask = np.zeros_like(top_down_map)
+
+#         return top_down_map
+
+#     def reset_metric(
+#         self, *args: Any, episode: Episode, **kwargs: Any
+#     ) -> None:
+#         self._scene_id = episode.scene_id.split("/")[-2]
+#         self._step_count = 0
+#         self._metric = None
+#         self._meters_per_pixel = habitat_maps.calculate_meters_per_pixel(
+#             self._map_resolution, self._sim
+#         )
+#         self._top_down_map = self.get_original_map()
+#         agent_position = self._sim.get_agent_state().position
+#         scene_id = episode.scene_id.split("/")[-1].split(".")[0]
+#         a_x, a_y = habitat_maps.to_grid(
+#             agent_position[2],
+#             agent_position[0],
+#             self._top_down_map.shape[0:2],
+#             sim=self._sim,
+#         )
+#         self._previous_xy_location = (a_y, a_x)
+
+#         if self._config.FOG_OF_WAR.DRAW:
+#             self._fog_of_war_mask = fog_of_war.reveal_fog_of_war(
+#                 self._top_down_map,
+#                 self._fog_of_war_mask,
+#                 np.array([a_x, a_y]),
+#                 self.get_polar_angle(),
+#                 fov=self._config.FOG_OF_WAR.FOV,
+#                 max_line_len=self._config.FOG_OF_WAR.VISIBILITY_DIST
+#                 / habitat_maps.calculate_meters_per_pixel(
+#                     self._map_resolution, sim=self._sim
+#                 ),
+#             )
+
+#         if self._config.DRAW_FIXED_WAYPOINTS:
+#             maps.draw_mp3d_nodes(
+#                 self._top_down_map,
+#                 self._sim,
+#                 episode,
+#                 self._conn_graphs[scene_id],
+#                 self._meters_per_pixel,
+#             )
+
+#         if self._config.DRAW_SHORTEST_PATH:
+#             shortest_path_points = self._sim.get_straight_shortest_path_points(
+#                 agent_position, episode.goals[0].position
+#             )
+#             maps.draw_straight_shortest_path_points(
+#                 self._top_down_map,
+#                 self._sim,
+#                 self._map_resolution,
+#                 shortest_path_points,
+#             )
+
+#         if self._config.DRAW_REFERENCE_PATH:
+#             maps.draw_reference_path(
+#                 self._top_down_map,
+#                 self._sim,
+#                 episode,
+#                 self._map_resolution,
+#                 self._meters_per_pixel,
+#             )
+
+#         # draw source and target points last to avoid overlap
+#         if self._config.DRAW_SOURCE_AND_TARGET:
+#             maps.draw_source_and_target(
+#                 self._top_down_map,
+#                 self._sim,
+#                 episode,
+#                 self._meters_per_pixel,
+#             )
+
+#         # MP3D START NODE
+#         self._nearest_node = maps.get_nearest_node(
+#             self._conn_graphs[scene_id], np.take(agent_position, (0, 2))
+#         )
+#         nn_position = self._conn_graphs[self._scene_id].nodes[
+#             self._nearest_node
+#         ]["position"]
+#         self.s_x, self.s_y = habitat_maps.to_grid(
+#             nn_position[2],
+#             nn_position[0],
+#             self._top_down_map.shape[0:2],
+#             self._sim,
+#         )
+#         self.update_metric()
+
+#     def update_metric(self, *args: Any, **kwargs: Any) -> None:
+#         self._step_count += 1
+#         (
+#             house_map,
+#             map_agent_pos,
+#         ) = self.update_map(self._sim.get_agent_state().position)
+
+#         self._metric = {
+#             "map": house_map,
+#             "fog_of_war_mask": self._fog_of_war_mask,
+#             "agent_map_coord": map_agent_pos,
+#             "agent_angle": self.get_polar_angle(),
+#             "bounds": {
+#                 k: v
+#                 for k, v in zip(
+#                     ["lower", "upper"],
+#                     self._sim.pathfinder.get_bounds(),
+#                 )
+#             },
+#             "meters_per_px": self._meters_per_pixel,
+#         }
+
+#     def get_polar_angle(self) -> float:
+#         agent_state = self._sim.get_agent_state()
+#         # quaternion is in x, y, z, w format
+#         ref_rotation = agent_state.rotation
+
+#         heading_vector = quaternion_rotate_vector(
+#             ref_rotation.inverse(), np.array([0, 0, -1])
+#         )
+
+#         phi = cartesian_to_polar(-heading_vector[2], heading_vector[0])[1]
+#         z_neg_z_flip = np.pi
+#         return np.array(phi) + z_neg_z_flip
+
+#     def update_map(self, agent_position: List[float]) -> None:
+#         a_x, a_y = habitat_maps.to_grid(
+#             agent_position[2],
+#             agent_position[0],
+#             self._top_down_map.shape[0:2],
+#             self._sim,
+#         )
+#         # Don't draw over the source point
+#         gradient_color = 15 + min(
+#             self._step_count * 245 // self._config.MAX_EPISODE_STEPS, 245
+#         )
+#         if self._top_down_map[a_x, a_y] != maps.MAP_SOURCE_POINT_INDICATOR:
+#             maps.drawline(
+#                 self._top_down_map,
+#                 self._previous_xy_location,
+#                 (a_y, a_x),
+#                 gradient_color,
+#                 thickness=int(
+#                     self._map_resolution * 1.4 / maps.MAP_THICKNESS_SCALAR
+#                 ),
+#                 style="filled",
+#             )
+
+#         if self._config.FOG_OF_WAR.DRAW:
+#             self._fog_of_war_mask = fog_of_war.reveal_fog_of_war(
+#                 self._top_down_map,
+#                 self._fog_of_war_mask,
+#                 np.array([a_x, a_y]),
+#                 self.get_polar_angle(),
+#                 self._config.FOG_OF_WAR.FOV,
+#                 max_line_len=self._config.FOG_OF_WAR.VISIBILITY_DIST
+#                 / habitat_maps.calculate_meters_per_pixel(
+#                     self._map_resolution, sim=self._sim
+#                 ),
+#             )
+
+#         point_padding = int(0.2 / self._meters_per_pixel)
+#         prev_nearest_node = self._nearest_node
+#         self._nearest_node = maps.update_nearest_node(
+#             self._conn_graphs[self._scene_id],
+#             self._nearest_node,
+#             np.take(agent_position, (0, 2)),
+#         )
+#         if (
+#             self._nearest_node != prev_nearest_node
+#             and self._config.DRAW_MP3D_AGENT_PATH
+#         ):
+#             nn_position = self._conn_graphs[self._scene_id].nodes[
+#                 self._nearest_node
+#             ]["position"]
+#             (prev_s_x, prev_s_y) = (self.s_x, self.s_y)
+#             self.s_x, self.s_y = habitat_maps.to_grid(
+#                 nn_position[2],
+#                 nn_position[0],
+#                 self._top_down_map.shape[0:2],
+#                 self._sim,
+#             )
+#             self._top_down_map[
+#                 self.s_x
+#                 - int(2.0 / 3.0 * point_padding) : self.s_x
+#                 + int(2.0 / 3.0 * point_padding)
+#                 + 1,
+#                 self.s_y
+#                 - int(2.0 / 3.0 * point_padding) : self.s_y
+#                 + int(2.0 / 3.0 * point_padding)
+#                 + 1,
+#             ] = gradient_color
+
+#             maps.drawline(
+#                 self._top_down_map,
+#                 (prev_s_y, prev_s_x),
+#                 (self.s_y, self.s_x),
+#                 gradient_color,
+#                 thickness=int(
+#                     1.0
+#                     / 2.0
+#                     * np.round(
+#                         self._map_resolution / maps.MAP_THICKNESS_SCALAR
+#                     )
+#                 ),
+#             )
+
+#         self._previous_xy_location = (a_y, a_x)
+#         map_agent_pos = (a_x, a_y)
+#         return self._top_down_map, map_agent_pos
diff --git a/internnav/internnav_habitat/refactor_notes.md b/internnav/internnav_habitat/refactor_notes.md
new file mode 100644
index 00000000..7a197b4c
--- /dev/null
+++ b/internnav/internnav_habitat/refactor_notes.md
@@ -0,0 +1,66 @@
+# Refactoring `habitat_vln_evaluator`
+
+This note explains how to split the current `VLNEvaluator` implementation into the
+framework's `Env` and `Agent` abstractions.
+
+## 1. Construction and configuration (lines 45-135)
+* **Environment**: Habitat config loading, dataset split selection, sensor parameter
+  extraction, and measurement registration belong to the environment setup. These
+  responsibilities configure and own the simulator state and therefore should be
+  moved into a `HabitatVLNEnv` class that extends `Env` (see `internnav/evaluator/habitat_vln_evaluator.py`, lines 45-103).
+* **Agent**: Model handles, prompt bootstrapping, conversation history, action
+  vocabulary, and instruction templates are part of the policy logic and should be
+  carried by a dedicated `HabitatVLNAgent` subclass. These fields initialize the
+  reasoning model rather than the simulator (see `internnav/evaluator/habitat_vln_evaluator.py`, lines 104-135).
+
+## 2. Perception utilities (lines 137-236)
+Depth pre-processing, intrinsic matrix computation, coordinate transforms, and GPS
+projection are tied to the simulator sensor geometry. They should move into the
+`HabitatVLNEnv` so that observation tensors returned to the agent are already in a
+consistent world frame (see `internnav/evaluator/habitat_vln_evaluator.py`, lines 137-236).
+
+## 3. Visualization helper (lines 238-309)
+The dot-matrix overlay operates purely on rendered frames and can stay as an
+environment utility. The helper should become a method of the environment (or a
+separate visualization module) so evaluators can call it regardless of the agent.
+(See `internnav/evaluator/habitat_vln_evaluator.py`, lines 238-309.)
+
+## 4. Low-level point navigation (lines 311-347)
+The `_pointnav` helper controls a waypoint-following controller that consumes
+processed observations and outputs low-level actions. Because it interacts with the
+robot's state (goal resets, depth resizing, point-goal calculation), it fits inside
+the environment. The agent can request point-goal actions through a method such as
+`HabitatVLNEnv.pointnav(goal, depth, ...)` (see `internnav/evaluator/habitat_vln_evaluator.py`, lines 311-347).
+
+## 5. Main evaluation loop (lines 349-520)
+* **Environment**: Episode iteration, resetting, stepping, intrinsic assembly, and
+  metric gathering should be owned by the environment. Wrapping Habitat's episode
+  lifecycle in `HabitatVLNEnv` keeps the evaluator thin and deterministic.
+* **Agent**: Generating waypoint predictions, maintaining conversation turns, and
+  deciding discrete actions are policy responsibilities. The evaluator should ask
+  the new agent for an action by passing observations (RGB, depth, state metadata)
+  returned by the environment wrapper (see `internnav/evaluator/habitat_vln_evaluator.py`, lines 349-520).
+
+## 6. Language and action parsing (lines 522-680)
+Instruction processing (`split_and_clean`, dynamic prompt assembly) and action string
+parsing convert model text into executable commands. These should be encapsulated in
+`HabitatVLNAgent` so the evaluator only receives structured actions (e.g., STOP,
+MOVE, LOOK); see `internnav/evaluator/habitat_vln_evaluator.py`, lines 522-680.
+
+## 7. Metric aggregation and exports (lines 682-745)
+Writing JSON lines, aggregating SPL/OS/NE, and optional video dumping can remain in
+the evaluator, but the raw metrics originate from the environment through
+`HabitatVLNEnv.get_metrics()` and rendering helpers. The evaluator should simply
+post-process the aggregated numbers (see `internnav/evaluator/habitat_vln_evaluator.py`, lines 682-745).
+
+## Resulting structure
+1. **`internnav/env/habitat_vln_env.py`**: wraps Habitat configuration, episode
+   control, sensor processing, point-nav helper, and visualization utilities.
+2. **`internnav/agent/habitat_vln_agent.py`**: encapsulates the vision-language
+   model, prompt management, observation parsing, and action decoding.
+3. **`internnav/evaluator/habitat_vln_evaluator.py`**: becomes a thin coordinator
+   that instantiates the env/agent via the registry, loops over episodes, and logs
+   metrics.
+
+This split brings the Habitat evaluator in line with the existing framework while
+keeping domain-specific functionality in focused components.
diff --git a/internnav/internnav_habitat/utils.py b/internnav/internnav_habitat/utils.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/eval/bash/torchrun_eval.sh b/scripts/eval/bash/torchrun_eval.sh
new file mode 100644
index 00000000..fa99d402
--- /dev/null
+++ b/scripts/eval/bash/torchrun_eval.sh
@@ -0,0 +1,20 @@
+# Use to run distributed eval with 8 GPUs on a single node (must match --nproc_per_node below)
+
+# MID_RUN_NAME="InternVLA-N1"
+# torchrun \
+#   --nproc_per_node=8 \
+#   --master_port=2333 \
+#   scripts/eval/eval.py \
+#     --config scripts/eval/configs/habitat_cfg.py \
+#   > logs/${MID_RUN_NAME}_log.txt 2>&1
+
+# CUDA_VISIBLE_DEVICES=6,7
+MID_RUN_NAME="InternVLA-N1"
+torchrun \
+  --nproc_per_node=8 \
+  --master_port=29501 \
+  scripts/eval/eval_habitat.py \
+    --model_path checkpoints/InternVLA-N1 \
+    --continuous_traj \
+    --output_path logs/habitat/test_new_checkpoint2 \
+  > logs/${MID_RUN_NAME}_old_log1.txt 2>&1
diff --git a/scripts/eval/configs/comm_cfg.py b/scripts/eval/configs/comm_cfg.py
new file mode 100644
index 00000000..3c1b7bbf
--- /dev/null
+++ b/scripts/eval/configs/comm_cfg.py
@@ -0,0 +1,16 @@
+from internnav.configs.agent import AgentCfg
+from internnav.configs.evaluator import EnvCfg, EvalCfg, EvalDatasetCfg, TaskCfg
+
+eval_cfg = EvalCfg(
+    agent=AgentCfg(
+        server_port=8087,
+        model_name='cma',
+        ckpt_path='checkpoints/r2r/fine_tuned/cma_plus',
+        model_settings={},
+    ),
+    env=EnvCfg['internutopia'],
+    task=TaskCfg['vln_pe'],
+    dataset=EvalDatasetCfg['mp3d'],
+    eval_type='internutopia_vln',
+    eval_settings={'save_to_json': False, 'vis_output': True},
+)
diff --git a/scripts/eval/configs/habitat_cfg.py b/scripts/eval/configs/habitat_cfg.py
new file mode 100644
index 00000000..aaa702d0
--- /dev/null
+++ b/scripts/eval/configs/habitat_cfg.py
@@ -0,0 +1,60 @@
+from internnav.configs.agent import AgentCfg
+from internnav.configs.evaluator import EnvCfg, EvalCfg
+
+eval_cfg = EvalCfg(
+    agent=AgentCfg(
+        server_port=8087,
+        model_name='internvla_n1',
+        ckpt_path='',
+        model_settings={
+            'env_num': 1,
+            'sim_num': 1,
+            'model_path': "checkpoints/InternVLA-N1",
+            'camera_intrinsic': [[585.0, 0.0, 320.0], [0.0, 585.0, 240.0], [0.0, 0.0, 1.0]],
+            'width': 640,
+            'height': 480,
+            'hfov': 79,
+            'resize_w': 384,
+            'resize_h': 384,
+            'max_new_tokens': 1024,
+            'num_frames': 32,
+            'num_history': 8,
+            'num_future_steps': 4,
+            'device': 'cuda:0',
+            'predict_step_nums': 32,
+            'continuous_traj': True,
+            # debug
+            'vis_debug': True,  # If vis_debug=True, you can get visualization results
+            'vis_debug_path': './logs/test/vis_debug',
+        },
+    ),
+    env=EnvCfg(
+        env_type='habitat',
+        env_settings={
+            # habitat sim specifications - agent, sensors, tasks, measures etc. are defined in the habitat config file
+            'config_path': 'scripts/eval/configs/vln_r2r.yaml',
+        },
+    ),
+    eval_type='habitat_vln',
+    eval_settings={
+        # all current parse args
+        "local_rank": 0,  # rank of this process within its node (not the node rank)
+        "output_path": "./logs/habitat/test_refactor_debug",  # output directory for logs/results
+        "save_video": False,  # whether to save videos
+        "world_size": 1,  # number of distributed processes
+        "rank": 0,  # rank of current process
+        "gpu": 0,  # gpu id to use
+        "port": "2333",  # communication port
+        "dist_url": "env://",  # url for distributed setup
+        "mode": "dual_system",  # inference mode: dual_system or system2
+        "model_path": "checkpoints/InternVLA-N1",  # path to model checkpoint
+        "num_future_steps": 4,  # number of future steps for prediction
+        "num_frames": 32,  # number of frames used in evaluation
+        "num_history": 8,
+        "resize_w": 384,  # image resize width
+        "resize_h": 384,  # image resize height
+        "predict_step_nums": 32,  # number of steps to predict
+        "continuous_traj": True,  # whether to use continuous trajectory
+        "max_new_tokens": 1024,  # maximum number of tokens for generation
+    },
+)
diff --git a/scripts/eval/configs/vln_r2r.yaml b/scripts/eval/configs/vln_r2r.yaml
index ed8361c8..e6b1a895 100644
--- a/scripts/eval/configs/vln_r2r.yaml
+++ b/scripts/eval/configs/vln_r2r.yaml
@@ -72,6 +72,6 @@ habitat:
 
   dataset:
     type: R2RVLN-v1
-    split: val_seen
+    split: val_unseen
     scenes_dir: data/scene_data/mp3d_ce
     data_path: data/vln_ce/raw_data/r2r/{split}/{split}.json.gz
diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
index 68c507af..79d0cda2 100644
--- a/scripts/eval/eval.py
+++ b/scripts/eval/eval.py
@@ -5,7 +5,6 @@
 import argparse
 import importlib.util
 
-from internnav.configs.evaluator.vln_default_config import get_config
 from internnav.evaluator import Evaluator
 
 # This file is the main file
@@ -19,6 +18,9 @@ def parse_args():
         default='scripts/eval/configs/h1_rdp_cfg.py',
         help='eval config file path, e.g. scripts/eval/configs/h1_cma_cfg.py',
     )
+    parser.add_argument('--port', type=int, default=None)
+    parser.add_argument('--host', type=str, default=None)
+    parser.add_argument('--dist_eval', action="store_true", default=False)
     return parser.parse_args()
 
 
@@ -33,10 +35,18 @@ def load_eval_cfg(config_path, attr_name='eval_cfg'):
 def main():
     args = parse_args()
     evaluator_cfg = load_eval_cfg(args.config, attr_name='eval_cfg')
-    cfg = get_config(evaluator_cfg)
-    print(cfg)
-    evaluator = Evaluator.init(cfg)
-    print(type(evaluator))
+
+    # fill in evaluator default config
+    if evaluator_cfg.eval_type == 'vln_multi':
+        from internnav.configs.evaluator.vln_default_config import get_config
+
+        evaluator_cfg = get_config(evaluator_cfg)
+    elif evaluator_cfg.eval_type == 'habitat_vln':
+        # TODO: add default config
+        pass
+
+    # create evaluator based on sim backend and run eval
+    evaluator = Evaluator.init(evaluator_cfg)
     evaluator.eval()
 
 

From 99024017aded29c1b5d65cd7d0b19de362db3858 Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Tue, 11 Nov 2025 08:31:43 +0000
Subject: [PATCH 03/16] add distributed_base evaluator

---
 internnav/evaluator/__init__.py               |   3 +-
 internnav/evaluator/distributed_base.py       | 213 ++++++++++--------
 .../habitat_vln_evaluator.py                  | 125 +++++-----
 internnav/utils/comm_utils/server.py          |  39 ++++
 scripts/eval/bash/torchrun_eval.sh            |  30 +--
 scripts/eval/configs/habitat_cfg.py           |   1 +
 scripts/eval/eval_habitat.py                  | 127 -----------
 scripts/iros_challenge/start_eval_iros.sh     |   2 +-
 8 files changed, 240 insertions(+), 300 deletions(-)
 delete mode 100644 scripts/eval/eval_habitat.py

diff --git a/internnav/evaluator/__init__.py b/internnav/evaluator/__init__.py
index e831ea56..fda0a88f 100644
--- a/internnav/evaluator/__init__.py
+++ b/internnav/evaluator/__init__.py
@@ -1,6 +1,7 @@
-# register habitat TODO
+# register habitat
 import internnav.internnav_habitat  # noqa: F401
 from internnav.evaluator.base import Evaluator
+from internnav.evaluator.distributed_base import DistributedEvaluator
 from internnav.evaluator.vln_multi_evaluator import VlnMultiEvaluator
 
 __all__ = ['Evaluator', 'VlnMultiEvaluator']
diff --git a/internnav/evaluator/distributed_base.py b/internnav/evaluator/distributed_base.py
index 32443057..a0f06a28 100644
--- a/internnav/evaluator/distributed_base.py
+++ b/internnav/evaluator/distributed_base.py
@@ -1,57 +1,13 @@
+import argparse
 import json
 import os
-from datetime import datetime
 
+import numpy as np
 import torch
 
 from internnav.configs.evaluator import EvalCfg
 from internnav.evaluator.base import Evaluator
-from internnav.utils.dist import dist, get_rank, get_world_size
-
-
-def init_distributed_mode(args):
-    if 'SLURM_PROCID' in os.environ:
-        args.rank = int(os.environ['SLURM_PROCID'])
-        args.world_size = int(os.environ['SLURM_NTASKS'])
-
-        num_gpus = torch.cuda.device_count()
-        args.gpu = args.rank % num_gpus
-        args.local_rank = args.gpu
-
-        node_list = os.environ['SLURM_NODELIST']
-        print(f'Node list: {node_list}')
-        # addr = subprocess.getoutput(f'scontrol show hostname {node_list} | head -n1')
-
-        os.environ['MASTER_PORT'] = str(getattr(args, 'port', '29529'))
-        # os.environ['MASTER_ADDR'] = addr
-        os.environ['WORLD_SIZE'] = str(args.world_size)
-        os.environ['LOCAL_RANK'] = str(args.gpu)
-        os.environ['RANK'] = str(args.rank)
-    elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
-        args.rank = int(os.environ["RANK"])
-        args.world_size = int(os.environ['WORLD_SIZE'])
-        args.gpu = int(os.environ['LOCAL_RANK'])
-        args.local_rank = args.gpu
-    else:
-        print('Not using distributed mode')
-        # setup_for_distributed(is_master=True)  # hack
-        args.distributed = False
-        return
-
-    args.distributed = True
-
-    torch.cuda.set_device(args.gpu)
-    args.dist_backend = 'nccl'
-    print('| distributed init (rank {}): {}, gpu {}'.format(args.rank, args.dist_url, args.gpu), flush=True)
-    dist.init_process_group(
-        backend=args.dist_backend,
-        init_method=args.dist_url,
-        world_size=args.world_size,
-        rank=args.rank,
-        timeout=datetime.timedelta(0, 7200),
-    )
-    dist.barrier()
-    # setup_for_distributed(args.rank == 0)
+from internnav.utils.dist import dist, get_rank, get_world_size, init_distributed_mode
 
 
 class DistributedEvaluator(Evaluator):
@@ -61,53 +17,106 @@ class DistributedEvaluator(Evaluator):
 
     def __init__(self, cfg: EvalCfg):
         # distributed setting
-        import os
         import socket
 
         print(
             f"Rank {os.getenv('RANK')} / {os.getenv('WORLD_SIZE')} on {socket.gethostname()}:{os.getenv('MASTER_PORT')}"
         )
-        # init_distributed_mode(args)
-        # local_rank = args.local_rank
-        # np.random.seed(local_rank)
+
+        args = argparse.Namespace(**cfg.eval_settings)
+        self.args = args
+
+        init_distributed_mode(args)
+
+        self.local_rank = args.local_rank
+        np.random.seed(self.local_rank)
+        self.world_size = get_world_size()
+        self.output_path = args.output_path  # TODO: modify by rank
+
         cfg.env.env_settings['idx'] = get_rank()
         cfg.env.env_settings['world_size'] = get_world_size()
 
+        # -------- initialize agent config (either remote server or local agent) --------
         # set agent port based on rank
-        cfg.agent.agent_settings['port'] = 8000 + get_rank()
+        # cfg.agent.agent_settings['port'] = 8000 + get_rank()
         # start_server(cfg.agent.agent_settings['port'])
 
-        super().__init__(cfg)
+        self.eval_config = cfg
+        # self.env = Env.init(cfg.env, cfg.task)
+        # self.agent = AgentClient(config.agent)
 
     def eval(self):
-        # 1. 每个 rank 本地跑一遍
-        local_metrics = self.eval_action()  # dict[str, Tensor], 每个 Tensor shape [N]
-        # 取出设备 & 本地样本数
-        device = next(iter(local_metrics.values())).device
-        local_count = torch.tensor([len(next(iter(local_metrics.values())))], dtype=torch.long, device=device)
+        """
+        Uniform distributed evaluation pipeline:
+
+        1. Call subclass's eval_action() to get local per-episode tensors.
+        2. Use dist all_gather (+ padding) to build global tensors for each metric.
+        3. Call subclass's calc_metrics(global_metrics) to compute scalar metrics.
+        4. Print + rank 0 writes result.json.
+        """
+        local_metrics = self.eval_action()  # dict[str, Tensor], each [N_local]
+
+        if not local_metrics:
+            raise RuntimeError("eval_action() returned empty metrics dict.")
+
+        first_tensor = next(iter(local_metrics.values()))
+        device = first_tensor.device
+        local_len = first_tensor.shape[0]
 
-        # 2. 全局样本数
         world_size = get_world_size()
-        global_count = local_count.clone()
-        if world_size > 1:
-            dist.all_reduce(global_count, op=dist.ReduceOp.SUM)
-
-        # 3. 对每个 metric 做全局 sum / mean
-        result_all = {}
-        for name, tensor in local_metrics.items():
-            # tensor: [N]
-            local_sum = tensor.sum()
-            global_sum = local_sum.clone()
-            if world_size > 1:
-                dist.all_reduce(global_sum, op=dist.ReduceOp.SUM)
-
-            mean_val = (global_sum / global_count).item()
-            result_all[name] = mean_val
-
-        # 4. 统计全局 episode 数
-        result_all["length"] = int(global_count.item())
-
-        # 5. 打印 + 只在 rank 0 写文件
+
+        # -------- 1) Handle non-distributed / world_size == 1 --------
+        if world_size == 1:
+            global_metrics = {name: tensor.detach().cpu() for name, tensor in local_metrics.items()}
+            total_len = int(local_len)
+        else:
+            # -------- 2) Gather lengths from all ranks --------
+            local_len_t = torch.tensor([local_len], dtype=torch.long, device=device)
+            len_list = [torch.zeros_like(local_len_t) for _ in range(world_size)]
+            dist.all_gather(len_list, local_len_t)
+            lens = torch.stack(len_list).cpu()  # shape [world_size, 1]
+            lens = lens.view(-1)  # [world_size]
+            max_len = int(lens.max().item())
+            total_len = int(lens.sum().item())
+
+            # -------- 3) For each metric, pad + all_gather + unpad --------
+            global_metrics = {}
+            for name, tensor in local_metrics.items():
+                assert tensor.shape[0] == local_len, (
+                    f"Metric {name} length ({tensor.shape[0]}) " f"!= first metric length ({local_len})"
+                )
+
+                # pad to max_len on this rank
+                padded = torch.zeros(
+                    max_len,
+                    dtype=tensor.dtype,
+                    device=device,
+                )
+                padded[:local_len] = tensor
+
+                # gather padded tensors from all ranks
+                gathered = [torch.zeros_like(padded) for _ in range(world_size)]
+                dist.all_gather(gathered, padded)
+
+                # unpad & concat using true lengths
+                parts = []
+                for rank in range(world_size):
+                    cur_len = int(lens[rank].item())
+                    if cur_len > 0:
+                        parts.append(gathered[rank][:cur_len])
+                if parts:
+                    global_tensor = torch.cat(parts, dim=0)
+                else:
+                    # no episodes at all (edge case)
+                    global_tensor = torch.empty(0, dtype=tensor.dtype)
+
+                global_metrics[name] = global_tensor.detach().cpu()
+
+        # -------- 4) Let subclass compute final metrics from global tensors --------
+        result_all = self.calc_metrics(global_metrics)
+        result_all.setdefault("length", total_len)
+
+        # -------- 5) Logging --------
         print(result_all)
         if get_rank() == 0:
             os.makedirs(self.args.output_path, exist_ok=True)
@@ -117,15 +126,43 @@ def eval(self):
 
         return result_all
 
-    def eval_action(self):
+    # ================= ABSTRACT HOOKS =================
+
+    def eval_action(self) -> dict:
+        """
+        Run evaluation on this rank and return per-episode metrics.
+
+        Returns
+        -------
+        dict[str, torch.Tensor]
+            Example:
+            {
+                "sucs": tensor([0., 1., ...], device=...),
+                "spls": tensor([...]),
+                "oss": tensor([...]),
+                "nes": tensor([...]),
+            }
+        """
+        raise NotImplementedError
+
+    def calc_metrics(self, global_metrics: dict) -> dict:
         """
-        跑当前 rank 的 episodes, 返回一个 dict:
-        {
-            "success": tensor([0., 1., ...], device=...),
-            "spl": tensor([...]),
-            "os": tensor([...]),
-            "ne": tensor([...]),
-            ...
-        }
+        Compute final scalar metrics from global per-episode tensors.
+
+        Parameters
+        ----------
+        global_metrics : dict[str, torch.Tensor]
+            For each metric name, a 1-D CPU tensor with all episodes across all ranks.
+            Example:
+                {
+                    "sucs": tensor([...], dtype=torch.float32),
+                    "spls": tensor([...]),
+                    ...
+                }
+
+        Returns
+        -------
+        dict[str, float]
+            Final scalar metrics to log.
         """
         raise NotImplementedError
diff --git a/internnav/internnav_habitat/habitat_vln_evaluator.py b/internnav/internnav_habitat/habitat_vln_evaluator.py
index 99e8fdc1..ef18ec07 100644
--- a/internnav/internnav_habitat/habitat_vln_evaluator.py
+++ b/internnav/internnav_habitat/habitat_vln_evaluator.py
@@ -10,7 +10,6 @@
 import re
 from collections import OrderedDict
 
-import habitat
 import numpy as np
 import quaternion
 import torch
@@ -23,7 +22,7 @@
 # Import for Habitat registry side effects — do not remove
 import internnav.env.utils.habitat_extensions.measures  # noqa: F401
 from internnav.configs.evaluator import EvalCfg
-from internnav.evaluator.base import Evaluator
+from internnav.evaluator import DistributedEvaluator, Evaluator
 from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM
 from internnav.model.utils.vln_utils import (
     chunk_token,
@@ -31,9 +30,9 @@
     split_and_clean,
     traj_to_actions,
 )
-from internnav.utils.dist import dist, get_rank, get_world_size, init_distributed_mode
 
 try:
+    import habitat
     from habitat import Env
     from habitat.config.default import get_agent_config
     from habitat.config.default_structured_configs import (
@@ -56,28 +55,12 @@
 
 
 @Evaluator.register('habitat_vln')
-class HabitatVlnEvaluator(Evaluator):
+class HabitatVlnEvaluator(DistributedEvaluator):
     def __init__(self, cfg: EvalCfg):
         args = argparse.Namespace(**cfg.eval_settings)
         self.args = args
         self.save_video = args.save_video
-
-        # distributed setting
-        import os
-        import socket
-
-        print(
-            f"Rank {os.getenv('RANK')} / {os.getenv('WORLD_SIZE')} on {socket.gethostname()}:{os.getenv('MASTER_PORT')}"
-        )
-        init_distributed_mode(args)
-        local_rank = args.local_rank
-        np.random.seed(local_rank)
-        cfg.env.env_settings['idx'] = get_rank()
-        cfg.env.env_settings['world_size'] = get_world_size()
-
-        self.world_size = get_world_size()
-        self.output_path = args.output_path  # TODO: modify by rank
-        self.epoch = 0
+        self.epoch = args.epoch
 
         # create habitat config
         self.config_path = cfg.env.env_settings['config_path']
@@ -109,13 +92,13 @@ def __init__(self, cfg: EvalCfg):
         cfg.env.env_settings['habitat_config'] = self.config
 
         # init agent and env
-        # super().__init__(cfg)
+        super().__init__(cfg)
 
         # ------------------------------------- model ------------------------------------------
         processor = AutoProcessor.from_pretrained(args.model_path)
         processor.tokenizer.padding_side = 'left'
 
-        device = torch.device(f"cuda:{local_rank}")
+        device = torch.device(f"cuda:{self.local_rank}")
         if args.mode == 'dual_system':
             model = InternVLAN1ForCausalLM.from_pretrained(
                 args.model_path,
@@ -136,15 +119,6 @@ def __init__(self, cfg: EvalCfg):
         model.eval()
         self.device = device
 
-        # ------------------------------------- old ------------------------------------------
-        self._camera_height = self.sim_sensors_config.rgb_sensor.position[1]
-        self._min_depth = self.sim_sensors_config.depth_sensor.min_depth
-        self._max_depth = self.sim_sensors_config.depth_sensor.max_depth
-
-        camera_fov_rad = np.deg2rad(self.sim_sensors_config.depth_sensor.hfov)
-        self._camera_fov = camera_fov_rad
-        self._fx = self._fy = self.sim_sensors_config.depth_sensor.width / (2 * np.tan(camera_fov_rad / 2))
-
         self.model = model
         self.processor = processor
 
@@ -178,41 +152,56 @@ def __init__(self, cfg: EvalCfg):
         self.num_frames = args.num_frames
         self.num_future_steps = args.num_future_steps
         self.num_history = args.num_history
+
+        # ------------------------------------- old ------------------------------------------
+        self._camera_height = self.sim_sensors_config.rgb_sensor.position[1]
+        self._min_depth = self.sim_sensors_config.depth_sensor.min_depth
+        self._max_depth = self.sim_sensors_config.depth_sensor.max_depth
+
+        camera_fov_rad = np.deg2rad(self.sim_sensors_config.depth_sensor.hfov)
+        self._camera_fov = camera_fov_rad
+        self._fx = self._fy = self.sim_sensors_config.depth_sensor.width / (2 * np.tan(camera_fov_rad / 2))
+
         # ------------------------------------- remove ------------------------------------------
 
-    def eval(self):
-        # * 3. do eval
-        sucs, spls, oss, nes, ep_num = self.eval_action(self.args.local_rank)
-        ep_num_all = [torch.zeros_like(ep_num) for _ in range(self.world_size)]
-        # import ipdb; ipdb.set_trace()
-        world_size = get_world_size()
-        dist.all_gather(ep_num_all, ep_num)
-        sucs_all = [torch.zeros(ep_num_all[i], dtype=sucs.dtype).to(sucs.device) for i in range(world_size)]
-        spls_all = [torch.zeros(ep_num_all[i], dtype=spls.dtype).to(spls.device) for i in range(world_size)]
-        oss_all = [torch.zeros(ep_num_all[i], dtype=oss.dtype).to(oss.device) for i in range(world_size)]
-        nes_all = [torch.zeros(ep_num_all[i], dtype=nes.dtype).to(nes.device) for i in range(world_size)]
-        dist.barrier()
-        dist.all_gather(sucs_all, sucs)
-        dist.all_gather(spls_all, spls)
-        dist.all_gather(oss_all, oss)
-        dist.all_gather(nes_all, nes)
-
-        sucs_all = torch.cat(sucs_all, dim=0)
-        spls_all = torch.cat(spls_all, dim=0)
-        oss_all = torch.cat(oss_all, dim=0)
-        nes_all = torch.cat(nes_all, dim=0)
-        result_all = {
-            "sucs_all": (sum(sucs_all) / len(sucs_all)).item(),
-            "spls_all": (sum(spls_all) / len(spls_all)).item(),
-            "oss_all": (sum(oss_all) / len(oss_all)).item(),
-            "nes_all": (sum(nes_all) / len(nes_all)).item(),
-            'length': len(sucs_all),
+    def eval_action(self):
+        """
+        Run local episodes on this rank.
+
+        Returns dict[str, Tensor] on GPU (1D tensors of same length).
+        """
+        # Old behavior was something like:
+        # sucs, spls, oss, nes, ep_num = self.eval_action(self.args.local_rank)
+        # Now just implement the actual eval here and return dict.
+
+        sucs, spls, oss, nes, _ = self._run_local_eval(self.args.local_rank)
+
+        return {
+            "sucs": sucs,  # shape [N_local]
+            "spls": spls,  # shape [N_local]
+            "oss": oss,  # shape [N_local]
+            "nes": nes,  # shape [N_local]
         }
 
-        print(result_all)
-        if get_rank() == 0:
-            with open(os.path.join(self.args.output_path, 'result.json'), 'a') as f:
-                f.write(json.dumps(result_all))
+    def calc_metrics(self, global_metrics: dict) -> dict:
+        """
+        global_metrics["sucs"] etc. are global 1-D CPU tensors with all episodes.
+        """
+        sucs_all = global_metrics["sucs"]
+        spls_all = global_metrics["spls"]
+        oss_all = global_metrics["oss"]
+        nes_all = global_metrics["nes"]
+
+        # avoid /0 if no episodes
+        denom = max(len(sucs_all), 1)
+
+        return {
+            "sucs_all": float(sucs_all.mean().item()) if denom > 0 else 0.0,
+            "spls_all": float(spls_all.mean().item()) if denom > 0 else 0.0,
+            "oss_all": float(oss_all.mean().item()) if denom > 0 else 0.0,
+            "nes_all": float(nes_all.mean().item()) if denom > 0 else 0.0,
+            # "length" will be filled by base class
+        }
 
     def _eval_action(self):
         obs = self.env.reset()
@@ -228,10 +217,10 @@ def _eval_action(self):
     # refactor
     def config_env(self) -> Env:
         env = Env(config=self.config)
-        # env.episodes = env.episodes[0:1]
+        env.episodes = env.episodes[0:2]  # for debug
         return env
 
-    def eval_action(self, idx=0) -> None:  # noqa: C901
+    def _run_local_eval(self, idx=0) -> None:  # noqa: C901
         self.model.eval()
         env = self.config_env()
         scene_episode_dict = {}
@@ -250,8 +239,8 @@ def eval_action(self, idx=0) -> None:  # noqa: C901
             with open(os.path.join(self.output_path, 'result.json'), 'r') as f:
                 for line in f.readlines():
                     res = json.loads(line)
-                    done_res.append([res["scene_id"], res["episode_id"], res["episode_instruction"]])
-                    if get_rank() == 0:  # noqa: F405 TODO this need to keep in evaluator
+                    done_res.append([res["scene_id"], res["episode_id"]])
+                    if idx == 0:  # noqa: F405 TODO this need to keep in evaluator
                         sucs.append(res['success'])
                         spls.append(res['spl'])
                         oss.append(res['os'])
@@ -271,7 +260,7 @@ def eval_action(self, idx=0) -> None:  # noqa: C901
                 )
                 print("episode start", episode_instruction)
                 episode_id = int(episode.episode_id)
-                if [scene_id, episode_id, episode_instruction] in done_res:
+                if [scene_id, episode_id] in done_res:
                     continue
 
                 # refactor env warm up
diff --git a/internnav/utils/comm_utils/server.py b/internnav/utils/comm_utils/server.py
index 2d3bf27f..fbe143b7 100644
--- a/internnav/utils/comm_utils/server.py
+++ b/internnav/utils/comm_utils/server.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import base64
+import multiprocessing
 import pickle
 from typing import Dict
 
@@ -77,3 +78,41 @@ def run(self, reload=False):
             reload=reload,
             reload_dirs=['./internnav/agent/', './internnav/model/'],
         )
+
+
+def start_server(host='localhost', port=8087, dist=False):
+    """
+    Start an agent server in a background (daemon) process.
+
+    Args:
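+        host, port: forwarded to AgentServer(host, port) in the child process
+        dist: if True, the child runs _run_server_dist, which first selects a CUDA device
+
+    Returns:
+        The started multiprocessing.Process (spawn context, daemon=True);
+        the server itself runs inside that child process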
+
+    """
+    ctx = multiprocessing.get_context("spawn")
+    p = ctx.Process(target=_run_server if not dist else _run_server_dist, args=(host, port))
+    p.daemon = True
+    p.start()
+    print(f"Server started on {host}:{port} (pid={p.pid})")
+    return p
+
+
+def _run_server_dist(host='localhost', port=8087):
+    import torch
+
+    from internnav.utils.dist import get_rank
+
+    device_idx = get_rank()
+    torch.cuda.set_device(device_idx)
+    print(f"Server using GPU {device_idx}")
+    server = AgentServer(host, port)
+    server.run()
+
+
+def _run_server(host='localhost', port=8087):
+    server = AgentServer(host, port)
+    server.run()
diff --git a/scripts/eval/bash/torchrun_eval.sh b/scripts/eval/bash/torchrun_eval.sh
index fa99d402..8a68922f 100644
--- a/scripts/eval/bash/torchrun_eval.sh
+++ b/scripts/eval/bash/torchrun_eval.sh
@@ -1,20 +1,20 @@
 # use to run distributed eval with 4 gpus on single node
 
+MID_RUN_NAME="InternVLA-N1"
+torchrun \
+  --nproc_per_node=2 \
+  --master_port=2333 \
+  scripts/eval/eval.py \
+    --config scripts/eval/configs/habitat_cfg.py \
+  > logs/${MID_RUN_NAME}_log.txt 2>&1
+
+# CUDA_VISIBLE_DEVICES=6,7
 # MID_RUN_NAME="InternVLA-N1"
 # torchrun \
 #   --nproc_per_node=8 \
-#   --master_port=2333 \
-#   scripts/eval/eval.py \
-#     --config scripts/eval/configs/habitat_cfg.py \
-#   > logs/${MID_RUN_NAME}_log.txt 2>&1
-
-# CUDA_VISIBLE_DEVICES=6,7
-MID_RUN_NAME="InternVLA-N1"
-torchrun \
-  --nproc_per_node=8 \
-  --master_port=29501 \
-  scripts/eval/eval_habitat.py \
-    --model_path checkpoints/InternVLA-N1 \
-    --continuous_traj \
-    --output_path logs/habitat/test_new_checkpoint2 \
-  > logs/${MID_RUN_NAME}_old_log1.txt 2>&1
+#   --master_port=29501 \
+#   scripts/eval/eval_habitat.py \
+#     --model_path checkpoints/InternVLA-N1 \
+#     --continuous_traj \
+#     --output_path logs/habitat/test_new_checkpoint2 \
+#   > logs/${MID_RUN_NAME}_old_log1.txt 2>&1
diff --git a/scripts/eval/configs/habitat_cfg.py b/scripts/eval/configs/habitat_cfg.py
index aaa702d0..6e3445a7 100644
--- a/scripts/eval/configs/habitat_cfg.py
+++ b/scripts/eval/configs/habitat_cfg.py
@@ -41,6 +41,7 @@
         "local_rank": 0,  # node rank
         "output_path": "./logs/habitat/test_refactor_debug",  # output directory for logs/results
         "save_video": False,  # whether to save videos
+        "epoch": 0,  # epoch number for logging
         "world_size": 1,  # number of distributed processes
         "rank": 0,  # rank of current process
         "gpu": 0,  # gpu id to use
diff --git a/scripts/eval/eval_habitat.py b/scripts/eval/eval_habitat.py
deleted file mode 100644
index e78a8d6f..00000000
--- a/scripts/eval/eval_habitat.py
+++ /dev/null
@@ -1,127 +0,0 @@
-import argparse
-import json
-import os
-import sys
-
-sys.path.append('./src/diffusion-policy')
-
-import numpy as np
-import torch
-from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
-
-# Import for Habitat registry side effects — do not remove
-import internnav.env.utils.habitat_extensions.measures  # noqa: F401
-from internnav.evaluator.habitat_vln_evaluator import VLNEvaluator
-from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM
-from internnav.utils.dist import *
-
-
-def parse_args():
-
-    parser = argparse.ArgumentParser(description='Evaluate InternVLA-N1 on Habitat')
-    parser.add_argument("--mode", default='dual_system', type=str, help="inference mode: dual_system or system2")
-    parser.add_argument("--local_rank", default=0, type=int, help="node rank")
-    parser.add_argument("--model_path", type=str, default="")
-    parser.add_argument("--habitat_config_path", type=str, default='scripts/eval/configs/vln_r2r.yaml')
-    parser.add_argument("--eval_split", type=str, default='val_unseen')
-    parser.add_argument("--output_path", type=str, default='./logs/habitat/test')  #!
-    parser.add_argument("--num_future_steps", type=int, default=4)
-    parser.add_argument("--num_frames", type=int, default=32)
-    parser.add_argument("--save_video", action="store_true", default=False)
-    parser.add_argument("--num_history", type=int, default=8)
-    parser.add_argument("--resize_w", type=int, default=384)
-    parser.add_argument("--resize_h", type=int, default=384)
-    parser.add_argument("--predict_step_nums", type=int, default=16)
-    parser.add_argument("--continuous_traj", action="store_true", default=False)
-    parser.add_argument("--max_new_tokens", type=int, default=1024)
-
-    parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes')
-    parser.add_argument('--rank', default=0, type=int, help='rank')
-    parser.add_argument('--gpu', default=0, type=int, help='gpu')
-    parser.add_argument('--port', default='2333')
-    parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
-    parser.add_argument('--device', default='cuda', help='device to use for training / testing')
-
-    return parser.parse_args()
-
-
-def main():
-    args = parse_args()
-
-    init_distributed_mode(args)
-    local_rank = args.local_rank
-    np.random.seed(local_rank)
-
-    # * 1. Load model and tokenizer. Currently, we support two modes: dual_system and system2 in Habitat.
-    processor = AutoProcessor.from_pretrained(args.model_path)
-    processor.tokenizer.padding_side = 'left'
-
-    device = torch.device(f"cuda:{local_rank}")
-    if args.mode == 'dual_system':
-        model = InternVLAN1ForCausalLM.from_pretrained(
-            args.model_path,
-            torch_dtype=torch.bfloat16,
-            attn_implementation="flash_attention_2",
-            device_map={"": device},
-        )
-    elif args.mode == 'system2':
-        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-            args.model_path,
-            torch_dtype=torch.bfloat16,
-            attn_implementation="flash_attention_2",
-            device_map={"": device},
-        )
-    else:
-        raise ValueError(f"Invalid mode: {args.mode}")
-
-    model.eval()
-    world_size = get_world_size()
-
-    # * 2. initialize evaluator
-    evaluator = VLNEvaluator(
-        config_path=args.habitat_config_path,
-        split=args.eval_split,
-        env_num=world_size,
-        output_path=args.output_path,
-        model=model,
-        processor=processor,
-        epoch=0,
-        args=args,
-    )
-
-    # * 3. do eval
-    sucs, spls, oss, nes, ep_num = evaluator.eval_action(idx=get_rank())
-    ep_num_all = [torch.zeros_like(ep_num) for _ in range(world_size)]
-
-    # import ipdb; ipdb.set_trace()
-    dist.all_gather(ep_num_all, ep_num)
-    sucs_all = [torch.zeros(ep_num_all[i], dtype=sucs.dtype).to(sucs.device) for i in range(world_size)]
-    spls_all = [torch.zeros(ep_num_all[i], dtype=spls.dtype).to(spls.device) for i in range(world_size)]
-    oss_all = [torch.zeros(ep_num_all[i], dtype=oss.dtype).to(oss.device) for i in range(world_size)]
-    nes_all = [torch.zeros(ep_num_all[i], dtype=nes.dtype).to(nes.device) for i in range(world_size)]
-    dist.barrier()
-    dist.all_gather(sucs_all, sucs)
-    dist.all_gather(spls_all, spls)
-    dist.all_gather(oss_all, oss)
-    dist.all_gather(nes_all, nes)
-
-    sucs_all = torch.cat(sucs_all, dim=0)
-    spls_all = torch.cat(spls_all, dim=0)
-    oss_all = torch.cat(oss_all, dim=0)
-    nes_all = torch.cat(nes_all, dim=0)
-    result_all = {
-        "sucs_all": (sum(sucs_all) / len(sucs_all)).item(),
-        "spls_all": (sum(spls_all) / len(spls_all)).item(),
-        "oss_all": (sum(oss_all) / len(oss_all)).item(),
-        "nes_all": (sum(nes_all) / len(nes_all)).item(),
-        'length': len(sucs_all),
-    }
-
-    print(result_all)
-    if get_rank() == 0:
-        with open(os.path.join(args.output_path, f'result.json'), 'a') as f:
-            f.write(json.dumps(result_all))
-
-
-if __name__ == '__main__':
-    main()
diff --git a/scripts/iros_challenge/start_eval_iros.sh b/scripts/iros_challenge/start_eval_iros.sh
index d5f43acb..335b6144 100755
--- a/scripts/iros_challenge/start_eval_iros.sh
+++ b/scripts/iros_challenge/start_eval_iros.sh
@@ -47,7 +47,7 @@ if [ -n "$processes" ]; then
         echo "kill: $pid"
     done
 fi
-python internnav/agent/utils/server.py --config scripts/eval/configs/challenge_cfg.py > "$SERVER_LOG" 2>&1 &
+python scripts/eval/start_server.py --config scripts/eval/configs/challenge_cfg.py > "$SERVER_LOG" 2>&1 &
 
 
 START_COMMAND_KUJIALE="python -u scripts/eval/eval_iros.py --config $CONFIG --default_config scripts/eval/configs/challenge_kujiale_cfg.py --split $SPLIT"

From 0d00014a29b376065cb38db6592c28401019bf24 Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Wed, 12 Nov 2025 05:18:37 +0000
Subject: [PATCH 04/16] Habitat env applied, distributed evaluator applied;
 clean evaluator and agent created

---
 internnav/evaluator/distributed_base.py       |  21 +-
 internnav/internnav_habitat/habitat_env.py    | 104 ++-
 .../habitat_n1_agent_temp.py                  | 871 ++++++++++++++----
 .../habitat_vln_evaluator.py                  | 779 ++++++++--------
 .../habitat_vln_evaluator_clean.py            | 128 +++
 internnav/utils/dist.py                       | 147 ++-
 scripts/eval/bash/torchrun_eval.sh            |   2 +-
 scripts/eval/configs/habitat_cfg.py           |  41 +-
 setup.cfg                                     |   2 +-
 9 files changed, 1346 insertions(+), 749 deletions(-)
 create mode 100644 internnav/internnav_habitat/habitat_vln_evaluator_clean.py

diff --git a/internnav/evaluator/distributed_base.py b/internnav/evaluator/distributed_base.py
index a0f06a28..f30d058a 100644
--- a/internnav/evaluator/distributed_base.py
+++ b/internnav/evaluator/distributed_base.py
@@ -1,4 +1,3 @@
-import argparse
 import json
 import os
 
@@ -6,7 +5,8 @@
 import torch
 
 from internnav.configs.evaluator import EvalCfg
-from internnav.evaluator.base import Evaluator
+from internnav.env import Env
+from internnav.evaluator import Evaluator
 from internnav.utils.dist import dist, get_rank, get_world_size, init_distributed_mode
 
 
@@ -23,17 +23,16 @@ def __init__(self, cfg: EvalCfg):
             f"Rank {os.getenv('RANK')} / {os.getenv('WORLD_SIZE')} on {socket.gethostname()}:{os.getenv('MASTER_PORT')}"
         )
 
-        args = argparse.Namespace(**cfg.eval_settings)
-        self.args = args
+        self.output_path = cfg.eval_settings["output_path"]  # TODO: unsafe for distribution
 
-        init_distributed_mode(args)
+        init_distributed_mode()
 
-        self.local_rank = args.local_rank
+        self.local_rank = get_rank()
         np.random.seed(self.local_rank)
         self.world_size = get_world_size()
-        self.output_path = args.output_path  # TODO: modify by rank
 
-        cfg.env.env_settings['idx'] = get_rank()
+        # the Habitat env also needs the rank and world size to shard the dataset
+        cfg.env.env_settings['local_rank'] = get_rank()
         cfg.env.env_settings['world_size'] = get_world_size()
 
         # -------- initialize agent config (either remote server or local agent) --------
@@ -42,7 +41,7 @@ def __init__(self, cfg: EvalCfg):
         # start_server(cfg.agent.agent_settings['port'])
 
         self.eval_config = cfg
-        # self.env = Env.init(cfg.env, cfg.task)
+        self.env = Env.init(cfg.env, cfg.task)
         # self.agent = AgentClient(config.agent)
 
     def eval(self):
@@ -119,8 +118,8 @@ def eval(self):
         # -------- 5) Logging --------
         print(result_all)
         if get_rank() == 0:
-            os.makedirs(self.args.output_path, exist_ok=True)
-            out_path = os.path.join(self.args.output_path, "result.json")
+            os.makedirs(self.output_path, exist_ok=True)
+            out_path = os.path.join(self.output_path, "result.json")
             with open(out_path, "a") as f:
                 f.write(json.dumps(result_all) + "\n")
 
diff --git a/internnav/internnav_habitat/habitat_env.py b/internnav/internnav_habitat/habitat_env.py
index 6b102153..63769741 100644
--- a/internnav/internnav_habitat/habitat_env.py
+++ b/internnav/internnav_habitat/habitat_env.py
@@ -11,8 +11,9 @@ class HabitatEnv(base.Env):
     def __init__(self, env_config: EnvCfg, task_config: TaskCfg):
         """
         env_settings include:
-            - config_path: str, path to habitat config yaml file
-            - split: str, dataset split to use
+            - habitat_config: loaded from get_habitat_config
+            - local_rank: int, rank index for sharding
+            - world_size: int, total number of ranks
         """
         try:
             from habitat import Env
@@ -24,54 +25,59 @@ def __init__(self, env_config: EnvCfg, task_config: TaskCfg):
         super().__init__(env_config, task_config)
 
         self.config = env_config.env_settings['habitat_config']
-        self.env = Env(self.config)
+        self._env = Env(self.config)
 
-        self.episodes = self.generate_episodes()
-        self.sort_episodes_by_scene()
-
-        self.index = env_config.env_settings.get('idx', 0)
+        self.local_rank = env_config.env_settings.get('local_rank', 0)
         self.world_size = env_config.env_settings.get('world_size', 1)
         self._current_episode_index: int = 0
         self._last_obs: Optional[Dict[str, Any]] = None
 
-        self.step_id = 0
         self.is_running = True
+        self.output_path = env_config.env_settings.get('output_path', './output')
+
+        # generate episodes
+        # self._env.episodes = self._env.episodes[0:1]  # for debug
+        self.episodes = self.generate_episodes()
+        print(self.episodes)
 
     def generate_episodes(self) -> List[Any]:
         """
-        Generate list of episodes for the current split
+        Generate list of episodes for the current split, already:
+        - grouped by scene
+        - filtered by done_res (the path is self.output_path/progress.json)
+        - sharded by (local_rank, world_size)
         """
-        episodes = []
+        all_episodes = []
 
-        # sort episode by scene
-        scene_episode_dict = {}
-        for episode in self.env.episodes:
-            if episode.scene_id not in scene_episode_dict:
-                scene_episode_dict[episode.scene_id] = []
-            scene_episode_dict[episode.scene_id].append(episode)
+        # group episodes by scene
+        scene_episode_dict: Dict[str, List[Any]] = {}
+        for episode in self._env.episodes:
+            scene_episode_dict.setdefault(episode.scene_id, []).append(episode)
 
+        # load done_res
         done_res = set()
-
-        if os.path.exists(os.path.join(self.output_path, 'result.json')):
-            with open(os.path.join(self.output_path, 'result.json'), 'r') as f:
-                for line in f.readlines():
+        result_path = os.path.join(self.output_path, 'progress.json')
+        if os.path.exists(result_path):
+            with open(result_path, 'r') as f:
+                for line in f:
                     res = json.loads(line)
-                    done_res.add((res["scene_id"], res["episode_id"], res["episode_instruction"]))
+                    # only skip if current format has scene_id
+                    if "scene_id" in res:
+                        done_res.add((res["scene_id"], res["episode_id"]))
 
+        # iterate scenes in order, collect all episodes
         for scene in sorted(scene_episode_dict.keys()):
-            episodes = scene_episode_dict[scene]
+            per_scene_eps = scene_episode_dict[scene]
             scene_id = scene.split('/')[-2]
-            for episode in episodes[self.index :: self.world_size]:
-                episode_instruction = (
-                    episode.instruction.instruction_text
-                    if 'objectnav' not in self.config_path
-                    else episode.object_category
-                )
+
+            # shard by rank index / world_size
+            for episode in per_scene_eps[self.local_rank :: self.world_size]:
                 episode_id = int(episode.episode_id)
-                if (scene_id, episode_id, episode_instruction) in done_res:
+                if (scene_id, episode_id) in done_res:
                     continue
-                episodes.append(episode)
-        return episodes
+                all_episodes.append(episode)
+
+        return all_episodes
 
     def reset(self):
         """
@@ -83,12 +89,11 @@ def reset(self):
             return
 
         # Manually set to next episode in habitat
-        self.env.current_episode = self.episodes[self._current_episode_index]
+        self._env.current_episode = self.episodes[self._current_episode_index]
         self._current_episode_index += 1
 
         # Habitat reset
-        self._last_obs = self.env.reset()
-        self.step_id = 0
+        self._last_obs = self._env.reset()
 
         return self._last_obs
 
@@ -98,29 +103,26 @@ def step(self, action: List[Any]):
 
         Args: action: List[Any], action for each env in the batch
 
-        Return: obs, terminated
+        Return: obs, reward, done, info
         """
-        self._last_obs = self.env.step(action)
-        terminated = self.env.episode_over
-        return self._last_obs, terminated
+        obs = self._env.step(action)
+        done = self._env.episode_over
+        info = self._env.get_metrics()
+        reward = info.get('reward', 0.0)
+        return obs, reward, done, info
 
     def close(self):
-        print('Vln Env close')
-        self.env.close()
+        print('Habitat Env close')
+        self._env.close()
 
     def render(self):
-        self.env.render()
+        self._env.render()
 
     def get_observation(self) -> Dict[str, Any]:
-        return self.env.get_observations()
+        return self._env.get_observations()
 
     def get_metrics(self) -> Dict[str, Any]:
-        return self.env.get_metrics()
-
-    def sort_episodes_by_scene(self, key_list: List[str]):
-        sorted_episodes = []
-        episode_dict = {ep.episode_id: ep for ep in self.episodes}
-        for key in key_list:
-            if key in episode_dict:
-                sorted_episodes.append(episode_dict[key])
-        self.episodes = sorted_episodes
+        return self._env.get_metrics()
+
+    def get_current_episode(self):
+        return self._env.current_episode
diff --git a/internnav/internnav_habitat/habitat_n1_agent_temp.py b/internnav/internnav_habitat/habitat_n1_agent_temp.py
index 598ab25b..0cea910a 100644
--- a/internnav/internnav_habitat/habitat_n1_agent_temp.py
+++ b/internnav/internnav_habitat/habitat_n1_agent_temp.py
@@ -3,8 +3,6 @@
 import os
 import re
 import sys
-import time
-from datetime import datetime
 from pathlib import Path
 
 import numpy as np
@@ -15,23 +13,34 @@
 from collections import OrderedDict
 
 from PIL import Image
-from transformers import AutoProcessor
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
 
 from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM
-from internnav.model.utils.vln_utils import S2Output, split_and_clean, traj_to_actions
+from internnav.model.utils.vln_utils import split_and_clean, traj_to_actions
 
 DEFAULT_IMAGE_TOKEN = "<image>"
 
 
-class InternVLAN1AsyncAgent:
-    def __init__(self, args):
-        self.device = torch.device(args.device)
-        self.save_dir = "test_data/" + datetime.now().strftime("%Y%m%d_%H%M%S")
-        print(f"args.model_path{args.model_path}")
+class HabitatAgent:
+    def __init__(self, model, processor, args, device):
+        self.model = model
+        self.processor = processor
+        self.args = args
+        self.device = device
+        # ------------------------------------- model ------------------------------------------
+        processor = AutoProcessor.from_pretrained(args.model_path)
+        processor.tokenizer.padding_side = 'left'
 
-        device = torch.device("cuda")
+        device = torch.device(f"cuda:{self.local_rank}")
         if args.mode == 'dual_system':
-            self.model = InternVLAN1ForCausalLM.from_pretrained(
+            model = InternVLAN1ForCausalLM.from_pretrained(
+                args.model_path,
+                torch_dtype=torch.bfloat16,
+                attn_implementation="flash_attention_2",
+                device_map={"": device},
+            )
+        elif args.mode == 'system2':
+            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                 args.model_path,
                 torch_dtype=torch.bfloat16,
                 attn_implementation="flash_attention_2",
@@ -40,20 +49,17 @@ def __init__(self, args):
         else:
             raise ValueError(f"Invalid mode: {args.mode}")
 
-        self.model.eval()
-        self.model.to(self.device)
+        model.eval()
+        self.device = device
 
-        self.processor = AutoProcessor.from_pretrained(args.model_path)
-        self.processor.tokenizer.padding_side = 'left'
+        self.model = model
+        self.processor = processor
 
-        self.resize_w = args.resize_w
-        self.resize_h = args.resize_h
-        self.num_history = args.num_history
-        self.PLAN_STEP_GAP = args.plan_step_gap
-
-        prompt = "You are an autonomous navigation assistant. Your task is to <instruction>. Where should you go next to stay on track? Please output the next waypoint's coordinates in the image. Please output STOP when you have successfully completed the task."
+        # TODO(refactor): this prompt/conversation setup is duplicated in three places
+        prompt = "You are an autonomous navigation assistant. Your task is to <instruction>. Where should you go next to stay on track? Please output the next waypoint\'s coordinates in the image. Please output STOP when you have successfully completed the task."
         answer = ""
         self.conversation = [{"from": "human", "value": prompt}, {"from": "gpt", "value": answer}]
+
         self.conjunctions = [
             'you can see ',
             'in front of you is ',
@@ -74,190 +80,671 @@ def __init__(self, args):
             }
         )
 
+        self.objectnav_instructions = ["Search for the {target_object}."]
+
+        self.num_frames = args.num_frames
+        self.num_future_steps = args.num_future_steps
+        self.num_history = args.num_history
+
+    def reset(self, episode, env):
+        """Clear all per-episode state."""
         self.rgb_list = []
-        self.depth_list = []
-        self.pose_list = []
-        self.episode_idx = 0
-        self.conversation_history = []
-        self.llm_output = ""
-        self.past_key_values = None
-        self.last_s2_idx = -100
-
-        # output
-        self.output_action = None
-        self.output_latent = None
-        self.output_pixel = None
-        self.pixel_goal_rgb = None
-        self.pixel_goal_depth = None
-
-    def reset(self):
-        self.rgb_list = []
-        self.depth_list = []
-        self.pose_list = []
-        self.episode_idx = 0
-        self.conversation_history = []
-        self.llm_output = ""
-        self.past_key_values = None
-
-        self.output_action = None
-        self.output_latent = None
-        self.output_pixel = None
-        self.pixel_goal_rgb = None
-        self.pixel_goal_depth = None
-
-        self.save_dir = "test_data/" + datetime.now().strftime("%Y%m%d_%H%M%S")
-        os.makedirs(self.save_dir, exist_ok=True)
+        self.action_seq = []
+        self.output_ids = None
+        self.goal = None
+        self.messages = []
+        self.local_actions = []
+        self.forward_action = 0
+
+        # Store the initial agent state and height needed for this episode:
+        self.initial_agent_state = env.get_agent_state()
+        self.initial_height = self.initial_agent_state.position[1]
+
+    def act(self, observations, env, info):
+        """
+        Pure policy step:
+        - given obs (rgb/depth/gps/compass) + optional env/info
+        - update internal state (goal, messages, local_actions, etc.)
+        - return a single action (int)
+        """
+        # 1) unpack obs: rgb, depth, gps, compass, etc.
+        # 2) handle 'look down' case
+        # 3) maybe call LLM to get pixel goal or action_seq
+        # 4) maybe call diffusion policy to get local_actions
+        # 5) choose final `action` (0..5)
+        # 6) return `action`
+        return action
+
+    def _run_local_eval(self, idx=0) -> None:  # noqa: C901
+        """
+        Run local evaluation on this rank.
+
+        Important: when resuming from previous results, read from and append to "self.output_path/progress.json".
+                    After each episode, the result dict is appended to that file as one JSON line.
+                    In Env, episodes are already filtered by this file: tasks with a recorded (scene_id, episode_id) are skipped.
+
+
+        Returns
+        -------
+        dict[str, Tensor]:
+            {
+                "sucs": [N_local],
+                "spls": [N_local],
+                "oss":  [N_local],
+                "nes":  [N_local],
+            }
+        """
+        # Create / get env
+        # self.env = self.env  # HabitatEnv from DistributedEvaluator
+
+        sucs, spls, oss, nes = [], [], [], []
+        self.model.eval()
+
+        # resume from previous results
+        # TODO: reading/writing progress.json is not safe across distributed ranks
+        if os.path.exists(os.path.join(self.output_path, 'progress.json')):
+            with open(os.path.join(self.output_path, 'progress.json'), 'r') as f:
+                for line in f.readlines():
+                    res = json.loads(line)
+                    if "scene_id" not in res:
+                        print("This evaluation has already finished!")
+                        return (
+                            torch.tensor(sucs).to(self.device),
+                            torch.tensor(spls).to(self.device),
+                            torch.tensor(oss).to(self.device),
+                            torch.tensor(nes).to(self.device),
+                            torch.tensor(len(sucs)).to(self.device),
+                        )
+                    if idx == 0:  # noqa: F405 TODO: this check needs to be kept in the evaluator
+                        sucs.append(res['success'])
+                        spls.append(res['spl'])
+                        oss.append(res['os'])
+                        nes.append(res['ne'])
+
+        # Episode loop is now driven by env.reset() + env.is_running
+        process_bar = tqdm.tqdm(total=len(self.env.episodes), desc=f"Eval Epoch {self.epoch} Rank {idx}")
+        while self.env.is_running:
+
+            # ------------ 1. Start of episode ------------
+            observations = self.env.reset()
+            if not self.env.is_running or observations is None:
+                break
+
+            # ---- episode meta (scene_id, episode_id, instruction) ----
+            # we get it from the underlying habitat env
+            episode = self.env.get_current_episode()
+            scene_id = episode.scene_id.split('/')[-2]
+            episode_id = int(episode.episode_id)
+            episode_instruction = (
+                episode.instruction.instruction_text if 'objectnav' not in self.config_path else episode.object_category
+            )
+            print("episode start", episode_instruction)
+
+            agent_state = self.env._env.sim.get_agent_state()
+            rotation = agent_state.rotation
+            translation = agent_state.position
+            rotation_matrix = quaternion.as_rotation_matrix(rotation)
+            transformation_matrix = np.eye(4)
+            transformation_matrix[:3, :3] = rotation_matrix
+            transformation_matrix[:3, 3] = translation
+
+            agent = ShortestPathFollower(self.env._env.sim, 0.25, False)
+
+            os.makedirs(os.path.join(self.output_path, f'check_sim_{self.epoch}'), exist_ok=True)
+            Image.fromarray(observations['rgb']).save(
+                os.path.join(self.output_path, f'check_sim_{self.epoch}', f'rgb_{idx}.jpg')
+            )
+
+            vis_frames = []
+            step_id = 0
+
+            if self.save_video:
+                os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
+            initial_height = self.env._env.sim.get_agent_state().position[1]
+
+            rgb_list = []
+            action_seq = []
+            output_ids = None
+
+            goal = None
+            action = None
+            messages = []
+            local_actions = []
+
+            done = False
+
+            # ---------- 2. Episode step loop -----------
+            while (not done) and (step_id <= self.max_steps_per_episode):
+                # refactor agent get action
+                rgb = observations["rgb"]
+                depth = observations["depth"]
+                x, y = observations["gps"]
+                camera_yaw = observations["compass"][0]
+                depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None)
+                depth = depth * (self._max_depth - self._min_depth) + self._min_depth
+                depth = depth * 1000
+
+                agent_state = self.env._env.sim.get_agent_state()
+                height = agent_state.position[1] - initial_height
+                camera_position = np.array([x, -y, self._camera_height + height])
+                tf_camera_to_episodic = (
+                    self.xyz_yaw_pitch_to_tf_matrix(camera_position, camera_yaw, np.deg2rad(30))
+                    @ self.get_axis_align_matrix()
+                )
+
+                image = Image.fromarray(rgb).convert('RGB')
+                save_raw_image = image.copy()
+
+                save_dot = False
+                if action == 5:
+                    look_down_image = image
+                    save_raw_image = look_down_image.copy()
+                    look_down_depth, resize_shape = self.preprocess_depth_image_v2(
+                        Image.fromarray(depth.astype(np.uint16), mode='I;16'),
+                        do_depth_scale=True,
+                        depth_scale=1000,
+                        target_height=224,
+                        target_width=224,
+                    )
+                    look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float()
+                    look_down_depth[look_down_depth > 5.0] = 5.0
+                else:
+                    image = image.resize((self.args.resize_w, self.args.resize_h))
+                    rgb_list.append(image)
+
+                    if self.args.mode == 'dual_system':
+                        down_observations, _, done, _ = self.env.step(5)
+                        down_observations, _, done, _ = self.env.step(5)
+
+                        look_down_image = Image.fromarray(down_observations["rgb"]).convert('RGB')
+                        depth = down_observations["depth"]
+                        depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None)
+                        depth = depth * (self._max_depth - self._min_depth) + self._min_depth
+                        depth = depth * 1000
+                        look_down_depth, resize_shape = self.preprocess_depth_image_v2(
+                            Image.fromarray(depth.astype(np.uint16), mode='I;16'),
+                            do_depth_scale=True,
+                            depth_scale=1000,
+                            target_height=224,
+                            target_width=224,
+                        )
+                        look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float()
+                        look_down_depth[look_down_depth > 5.0] = 5.0
+
+                        self.env.step(4)
+                        self.env.step(4)
+
+                info = self.env.get_metrics()
+
+                if len(action_seq) == 0 and goal is None:
+                    if action != 5:
+                        sources = copy.deepcopy(self.conversation)
+                        sources[0]["value"] = sources[0]["value"].replace(
+                            '<instruction>.', episode.instruction.instruction_text[:-1]
+                        )
+                        cur_images = rgb_list[-1:]
+                        if step_id == 0:
+                            history_id = []
+                        else:
+                            history_id = np.unique(
+                                np.linspace(0, step_id - 1, self.num_history, dtype=np.int32)
+                            ).tolist()
+                            placeholder = (DEFAULT_IMAGE_TOKEN + '\n') * len(history_id)
+                            sources[0]["value"] += f' These are your historical observations: {placeholder}.'
+
+                        history_id = sorted(history_id)
+                        print('history_idddddddd', step_id, history_id)
+                        input_images = [rgb_list[i] for i in history_id] + cur_images
+                        input_img_id = 0
+                    else:
+                        assert action == 5
+                        sources = [{"from": "human", "value": ""}, {"from": "gpt", "value": ""}]
+                        input_images += [look_down_image]
+                        # messages.append(
+                        #     {'role': 'assistant', 'content': [{'type': 'text', 'text': llm_outputs}]}  # noqa: F405
+                        # )
+                        input_img_id = -1
+
+                    prompt = random.choice(self.conjunctions) + DEFAULT_IMAGE_TOKEN
+                    sources[0]["value"] += f" {prompt}."
+                    print('sources', step_id, sources)
+                    prompt_instruction = copy.deepcopy(sources[0]["value"])
+                    parts = split_and_clean(prompt_instruction)
+
+                    content = []
+                    for i in range(len(parts)):
+                        if parts[i] == "<image>":
+                            content.append({"type": "image", "image": input_images[input_img_id]})
+                            input_img_id += 1
+                        else:
+                            content.append({"type": "text", "text": parts[i]})
+
+                    messages.append({'role': 'user', 'content': content})
+
+                    print('step_id', step_id, 'messages:', messages)
+
+                    text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+                    inputs = self.processor(text=[text], images=input_images, return_tensors="pt").to(self.model.device)
+
+                    with torch.no_grad():
+                        output_ids = self.model.generate(**inputs, max_new_tokens=128, do_sample=False)
+
+                    llm_outputs = self.processor.tokenizer.decode(
+                        output_ids[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
+                    )
+                    print('step_id:', step_id, 'output text:', llm_outputs)
+
+                    if bool(re.search(r'\d', llm_outputs)):
+                        forward_action = 0
+                        coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]
+                        pixel_goal = [int(coord[1]), int(coord[0])]
+
+                        intrinsic_matrix = self.get_intrinsic_matrix(
+                            self.config.habitat.simulator.agents.main_agent.sim_sensors.rgb_sensor
+                        )
+                        goal = self.pixel_to_gps(pixel_goal, depth / 1000, intrinsic_matrix, tf_camera_to_episodic)
+                        print('before', goal, depth.shape)
+                        goal = (transformation_matrix @ np.array([-goal[1], 0, -goal[0], 1]))[:3]
+
+                        if not self.env._env.sim.pathfinder.is_navigable(np.array(goal)):
+                            goal = np.array(self.env._env.sim.pathfinder.snap_point(np.array(goal)))
+
+                        # look down --> horizontal
+                        self.env.step(4)
+                        self.env.step(4)
+
+                        # Forking logic based on mode
+                        if self.args.mode == 'system2':
+                            action = agent.get_next_action(goal)
+                            if action == 0:
+                                goal = None
+                                output_ids = None
+                                action = 2  # fixed fallback action (not sampled)
+                                print('conduct a random action 2')
+                                observations = self.env.step(action)  # FIXME: step() is unpacked as 4 values elsewhere
+                                step_id += 1
+                                messages = []
+                                continue
+                        else:  # dual-system logic
+                            local_actions = []
+                            pixel_values = inputs.pixel_values
+                            image_grid_thw = torch.cat([thw.unsqueeze(0) for thw in inputs.image_grid_thw], dim=0)
+
+                            with torch.no_grad():
+                                traj_latents = self.model.generate_latents(output_ids, pixel_values, image_grid_thw)
+
+                            # preprocess inputs to align with navdp
+                            image_dp = (
+                                torch.tensor(np.array(look_down_image.resize((224, 224)))).to(torch.bfloat16) / 255
+                            )
+                            pix_goal_image = copy.copy(image_dp)
+                            images_dp = torch.stack([pix_goal_image, image_dp]).unsqueeze(0).to(self.device)
+                            depth_dp = look_down_depth.unsqueeze(-1).to(torch.bfloat16)
+                            pix_goal_depth = copy.copy(depth_dp)
+                            depths_dp = torch.stack([pix_goal_depth, depth_dp]).unsqueeze(0).to(self.device)
+
+                            with torch.no_grad():
+                                dp_actions = self.model.generate_traj(
+                                    traj_latents, images_dp, depths_dp, use_async=True
+                                )
+
+                            random_choice = np.random.choice(dp_actions.shape[0])
+                            if self.args.continuous_traj:
+                                action_list = traj_to_actions(dp_actions)
+                                if len(action_list) < 8:
+                                    action_list += [0] * (8 - len(action_list))
+                            else:
+                                action_list = chunk_token(dp_actions[random_choice])
+
+                            local_actions = action_list
+                            if len(local_actions) >= 4:
+                                local_actions = local_actions[:4]
+                            action = local_actions[0]
+                            if action == 0:
+                                goal = None
+                                output_ids = None
+                                action = 2  # fixed fallback action (not sampled)
+                                print('conduct a random action 2')
+                                observations = self.env.step(action)  # FIXME: step() is unpacked as 4 values elsewhere
+                                step_id += 1
+                                messages = []
+                                continue
+
+                        print('predicted goal', pixel_goal, goal, flush=True)
+                    else:
+                        action_seq = self.parse_actions(llm_outputs)
+                        print('actions', action_seq, flush=True)
+
+                if len(action_seq) != 0:
+                    action = action_seq[0]
+                    action_seq.pop(0)
+                elif goal is not None:
+                    # Forking logic based on mode
+                    if self.args.mode == 'system2':
+                        action = agent.get_next_action(goal)
+                        action = action.detach().cpu().numpy()[0] if isinstance(action, torch.Tensor) else action
+                        action = action[0] if hasattr(action, "__len__") else action
+                    else:  # dual-system logic
+                        if len(local_actions) == 0:
+                            # navdp
+                            local_actions = []
+                            image_dp = (
+                                torch.tensor(np.array(look_down_image.resize((224, 224)))).to(torch.bfloat16) / 255
+                            )
+
+                            images_dp = torch.stack([pix_goal_image, image_dp]).unsqueeze(0).to(self.device)
+                            depth_dp = look_down_depth.unsqueeze(-1).to(torch.bfloat16)
+
+                            depths_dp = torch.stack([pix_goal_depth, depth_dp]).unsqueeze(0).to(self.device)
+                            with torch.no_grad():
+                                dp_actions = self.model.generate_traj(
+                                    traj_latents, images_dp, depths_dp, use_async=True
+                                )
+
+                            random_choice = np.random.choice(dp_actions.shape[0])
+                            if self.args.continuous_traj:
+                                action_list = traj_to_actions(dp_actions)
+                                if len(action_list) < 8:
+                                    action_list += [0] * (8 - len(action_list))
+                            else:
+                                action_list = chunk_token(dp_actions[random_choice])
+                            print("first action_list", action_list)
+
+                            local_actions = action_list
+                            if len(local_actions) >= 4:
+                                local_actions = local_actions[:4]
+                            # if len(local_actions) >= 2:
+                            #     local_actions = local_actions[:2]
+
+                            print("local_actions", local_actions)
+
+                            action = local_actions.pop(0)
+                            # navdp
+                        else:
+                            action = local_actions.pop(0)
+
+                    forward_action += 1
+                    print('forward_action', forward_action, flush=True)
+                    if forward_action > 8:
+                        goal = None
+                        output_ids = None
+                        messages = []
+                        step_id += 1
+                        forward_action = 0
+                        local_actions = []
+                        continue
+                    if action == 0:
+                        goal = None
+                        output_ids = None
+                        messages = []
+                        step_id += 1
+                        forward_action = 0
+                        local_actions = []
+                        continue
+                else:
+                    action = 0
+
+                if info['top_down_map'] is not None:
+                    if save_dot:
+                        save_raw_image = self.dot_matrix_two_dimensional(
+                            save_raw_image, save_img=False, save_path=f'test_{step_id}.jpg', pixel_goal=pixel_goal
+                        )
+                    if self.save_video:
+                        frame = observations_to_image({'rgb': np.asarray(save_raw_image)}, info)
+                        vis_frames.append(frame)
+
+                print("step_id", step_id, "action", action)
+
+                # refactor: core
+                if action == 5:
+                    self.env.step(action)
+                    observations, _, done, _ = self.env.step(action)
+                else:
+                    observations, _, done, _ = self.env.step(action)
+                    step_id += 1
+                    messages = []
+
+            # ---------- 3. End of episode -----------
+            # Update result and write progress to the output_path/progress.json
+
+            process_bar.update(1)
+
+            # After the episode finishes, collect metrics:
+            metrics = self.env.get_metrics()
+
+            sucs.append(metrics['success'])
+            spls.append(metrics['spl'])
+            oss.append(metrics['oracle_success'])
+            nes.append(metrics["distance_to_goal"])
+
+            print(
+                f"scene_episode {scene_id}_{episode_id:04d} success: {metrics['success']}, "
+                f"spl: {metrics['spl']}, os: {metrics['oracle_success']}, "
+                f"ne: {metrics['distance_to_goal']}"
+            )
+
+            # Append this episode's result to progress.json (still per-rank)
+            result = {
+                "scene_id": scene_id,
+                "episode_id": episode_id,
+                "success": metrics["success"],
+                "spl": metrics["spl"],
+                "os": metrics['oracle_success'],
+                "ne": metrics["distance_to_goal"],
+                "steps": step_id,
+                "episode_instruction": episode_instruction,
+            }
+            os.makedirs(self.output_path, exist_ok=True)
+            with open(os.path.join(self.output_path, 'progress.json'), 'a') as f:
+                f.write(json.dumps(result) + "\n")
+
+        self.env.close()
+
+        return {
+            "sucs": torch.tensor(sucs, device=self.device),
+            "spls": torch.tensor(spls, device=self.device),
+            "oss": torch.tensor(oss, device=self.device),
+            "nes": torch.tensor(nes, device=self.device),
+        }
 
     def parse_actions(self, output):
         action_patterns = '|'.join(re.escape(action) for action in self.actions2idx)
+        # import ipdb; ipdb.set_trace()
         regex = re.compile(action_patterns)
         matches = regex.findall(output)
         actions = [self.actions2idx[match] for match in matches]
         actions = itertools.chain.from_iterable(actions)
         return list(actions)
 
-    def step_no_infer(self, rgb, depth, pose):
-        image = Image.fromarray(rgb).convert('RGB')
-        image = image.resize((self.resize_w, self.resize_h))
-        self.rgb_list.append(image)
-        image.save(f"{self.save_dir}/debug_raw_{self.episode_idx: 04d}.jpg")
-        self.episode_idx += 1
-
-    def trajectory_tovw(self, trajectory, kp=1.0):
-        subgoal = trajectory[-1]
-        linear_vel, angular_vel = kp * np.linalg.norm(subgoal[:2]), kp * subgoal[2]
-        linear_vel = np.clip(linear_vel, 0, 0.5)
-        angular_vel = np.clip(angular_vel, -0.5, 0.5)
-        return linear_vel, angular_vel
-
-    def step(self, rgb, depth, pose, instruction, intrinsic, look_down=False):
-        dual_sys_output = S2Output()
-        no_output_flag = self.output_action is None and self.output_latent is None
-        if (self.episode_idx - self.last_s2_idx > self.PLAN_STEP_GAP) or look_down or no_output_flag:
-            self.output_action, self.output_latent, self.output_pixel = self.step_s2(
-                rgb, depth, pose, instruction, intrinsic, look_down
-            )
-            self.last_s2_idx = self.episode_idx
-            dual_sys_output.output_pixel = self.output_pixel
-            self.pixel_goal_rgb = copy.deepcopy(rgb)
-            self.pixel_goal_depth = copy.deepcopy(depth)
-        else:
-            self.step_no_infer(rgb, depth, pose)
-
-        if self.output_action is not None:
-            dual_sys_output.output_action = copy.deepcopy(self.output_action)
-            self.output_action = None
-        elif self.output_latent is not None:
-            processed_pixel_rgb = np.array(Image.fromarray(self.pixel_goal_rgb).resize((224, 224))) / 255
-            processed_pixel_depth = np.array(Image.fromarray(self.pixel_goal_depth).resize((224, 224)))
-            processed_rgb = np.array(Image.fromarray(rgb).resize((224, 224))) / 255
-            processed_depth = np.array(Image.fromarray(depth).resize((224, 224)))
-            rgbs = (
-                torch.stack([torch.from_numpy(processed_pixel_rgb), torch.from_numpy(processed_rgb)])
-                .unsqueeze(0)
-                .to(self.device)
-            )
-            depths = (
-                torch.stack([torch.from_numpy(processed_pixel_depth), torch.from_numpy(processed_depth)])
-                .unsqueeze(0)
-                .unsqueeze(-1)
-                .to(self.device)
-            )
-            trajectories = self.step_s1(self.output_latent, rgbs, depths)
+    def preprocess_depth_image_v2(
+        self, depth_image, do_depth_scale=True, depth_scale=1000, target_height=None, target_width=None
+    ):
+        if target_height is None:
+            target_height = self.image_processor.crop_size['height']  # 384
+            target_width = self.image_processor.crop_size['width']  # 384
 
-            dual_sys_output.output_trajectory = traj_to_actions(trajectories, use_discrate_action=False)
+        resized_depth_image = depth_image.resize((target_width, target_height), Image.NEAREST)
 
-        return dual_sys_output
+        img = to_numpy_array(resized_depth_image)
+        if do_depth_scale:
+            img = img / depth_scale
 
-    def step_s2(self, rgb, depth, pose, instruction, intrinsic, look_down=False):
-        image = Image.fromarray(rgb).convert('RGB')
-        if not look_down:
-            image = image.resize((self.resize_w, self.resize_h))
-            self.rgb_list.append(image)
-            image.save(f"{self.save_dir}/debug_raw_{self.episode_idx: 04d}.jpg")
-        else:
-            image.save(f"{self.save_dir}/debug_raw_{self.episode_idx: 04d}_look_down.jpg")
-        if not look_down:
-            self.conversation_history = []
-            self.past_key_values = None
-
-            sources = copy.deepcopy(self.conversation)
-            sources[0]["value"] = sources[0]["value"].replace('<instruction>.', instruction)
-            cur_images = self.rgb_list[-1:]
-            if self.episode_idx == 0:
-                history_id = []
-            else:
-                history_id = np.unique(np.linspace(0, self.episode_idx - 1, self.num_history, dtype=np.int32)).tolist()
-                placeholder = (DEFAULT_IMAGE_TOKEN + '\n') * len(history_id)
-                sources[0]["value"] += f' These are your historical observations: {placeholder}.'
-
-            history_id = sorted(history_id)
-            self.input_images = [self.rgb_list[i] for i in history_id] + cur_images
-            input_img_id = 0
-            self.episode_idx += 1
-        else:
-            self.input_images.append(image)
-            input_img_id = -1
-            assert self.llm_output != "", "Last llm_output should not be empty when look down"
-            sources = [{"from": "human", "value": ""}, {"from": "gpt", "value": ""}]
-            self.conversation_history.append(
-                {'role': 'assistant', 'content': [{'type': 'text', 'text': self.llm_output}]}
-            )
+        return img, (target_width, target_height)
 
-        prompt = self.conjunctions[0] + DEFAULT_IMAGE_TOKEN
-        sources[0]["value"] += f" {prompt}."
-        prompt_instruction = copy.deepcopy(sources[0]["value"])
-        parts = split_and_clean(prompt_instruction)
-
-        content = []
-        for i in range(len(parts)):
-            if parts[i] == "<image>":
-                content.append({"type": "image", "image": self.input_images[input_img_id]})
-                input_img_id += 1
-            else:
-                content.append({"type": "text", "text": parts[i]})
-
-        self.conversation_history.append({'role': 'user', 'content': content})
-
-        text = self.processor.apply_chat_template(self.conversation_history, tokenize=False, add_generation_prompt=True)
-
-        inputs = self.processor(text=[text], images=self.input_images, return_tensors="pt").to(self.device)
-        t0 = time.time()
-        with torch.no_grad():
-            outputs = self.model.generate(
-                **inputs,
-                max_new_tokens=128,
-                do_sample=False,
-                use_cache=True,
-                past_key_values=self.past_key_values,
-                return_dict_in_generate=True,
-                raw_input_ids=copy.deepcopy(inputs.input_ids),
-            )
-        output_ids = outputs.sequences
+    def get_intrinsic_matrix(self, sensor_cfg) -> np.ndarray:  # 4x4 pinhole intrinsics from sensor_cfg
+        width = sensor_cfg.width
+        height = sensor_cfg.height
+        fov = sensor_cfg.hfov  # np.deg2rad below treats hfov as degrees
+        fx = (width / 2.0) / np.tan(np.deg2rad(fov / 2.0))  # focal length in pixels from the horizontal FOV
+        fy = fx  # Assuming square pixels (fx = fy)
+        cx = (width - 1.0) / 2.0  # principal point at the centre of pixel indices 0..width-1
+        cy = (height - 1.0) / 2.0  # same convention vertically
 
-        t1 = time.time()
-        self.llm_output = self.processor.tokenizer.decode(
-            output_ids[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
+        intrinsic_matrix = np.array(
+            [[fx, 0.0, cx, 0.0], [0.0, fy, cy, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]]
         )
-        with open(f"{self.save_dir}/llm_output_{self.episode_idx: 04d}.txt", 'w') as f:
-            f.write(self.llm_output)
-        self.last_output_ids = copy.deepcopy(output_ids[0])
-        self.past_key_values = copy.deepcopy(outputs.past_key_values)
-        print(f"output {self.episode_idx}  {self.llm_output} cost: {t1 - t0}s")
-        if bool(re.search(r'\d', self.llm_output)):
-            coord = [int(c) for c in re.findall(r'\d+', self.llm_output)]
-            pixel_goal = [int(coord[1]), int(coord[0])]
-            image_grid_thw = torch.cat([thw.unsqueeze(0) for thw in inputs.image_grid_thw], dim=0)
-            pixel_values = inputs.pixel_values
-            t0 = time.time()
-            with torch.no_grad():
-                traj_latents = self.model.generate_latents(output_ids, pixel_values, image_grid_thw)
-                return None, traj_latents, pixel_goal
-
-        else:
-            action_seq = self.parse_actions(self.llm_output)
-            return action_seq, None, None
-
-    def step_s1(self, latent, rgb, depth):
-        all_trajs = self.model.generate_traj(latent, rgb, depth, use_async=True)
-        return all_trajs
+        return intrinsic_matrix
+
+    def get_axis_align_matrix(self):  # constant 4x4 axis permutation; reads no instance state
+        ma = np.array([[0, 0, 1, 0], [-1, 0, 0, 0], [0, -1, 0, 0], [0, 0, 0, 1]])  # (x, y, z, 1) -> (z, -x, -y, 1)
+        return ma
+
+    def xyz_yaw_to_tf_matrix(self, xyz: np.ndarray, yaw: float) -> np.ndarray:
+        x, y, z = xyz  # translation components; xyz must unpack into exactly three values
+        transformation_matrix = np.array(
+            [
+                [np.cos(yaw), -np.sin(yaw), 0, x],  # rows 0-1: rotation about the z axis by yaw (radians)
+                [np.sin(yaw), np.cos(yaw), 0, y],
+                [0, 0, 1, z],  # z is left unrotated
+                [0, 0, 0, 1],  # homogeneous row
+            ]
+        )
+        return transformation_matrix
+
+    def xyz_pitch_to_tf_matrix(self, xyz: np.ndarray, pitch: float) -> np.ndarray:
+        """Build a 4x4 homogeneous transform: a rotation about the y axis plus a translation.
+
+        Args:
+            xyz (np.ndarray): Three components (x, y, z) placed in the translation column.
+            pitch (float): Rotation angle about the y axis, in radians (fed to np.cos / np.sin).
+        Returns:
+            np.ndarray: A 4x4 matrix of the form [[R_y(pitch), xyz], [0, 0, 0, 1]].
+        """
+
+        x, y, z = xyz
+        transformation_matrix = np.array(
+            [
+                [np.cos(pitch), 0, np.sin(pitch), x],
+                [0, 1, 0, y],
+                [-np.sin(pitch), 0, np.cos(pitch), z],
+                [0, 0, 0, 1],
+            ]
+        )
+        return transformation_matrix
+
+    def xyz_yaw_pitch_to_tf_matrix(self, xyz: np.ndarray, yaw: float, pitch: float) -> np.ndarray:
+        """Build a 4x4 homogeneous transform from a position, a yaw angle and a pitch angle.
+
+        Args:
+            xyz (np.ndarray): Three components (x, y, z) used as the translation column.
+            yaw (float): Rotation angle about the z axis, in radians.
+            pitch (float): Rotation angle about the y axis, in radians.
+        Returns:
+            np.ndarray: A 4x4 matrix whose rotation part is R_z(yaw) @ R_y(pitch).
+        """
+        # Only the 3x3 rotation parts of the helper results are kept; the translation is set below.
+        rot1 = self.xyz_yaw_to_tf_matrix(xyz, yaw)[:3, :3]
+        rot2 = self.xyz_pitch_to_tf_matrix(xyz, pitch)[:3, :3]
+        transformation_matrix = np.eye(4)
+        transformation_matrix[:3, :3] = rot1 @ rot2
+        transformation_matrix[:3, 3] = xyz
+        return transformation_matrix
+
+    def pixel_to_gps(self, pixel, depth, intrinsic, tf_camera_to_episodic):
+        '''Back-project a pixel and its depth through the camera model into the episodic frame.
+
+        Args:
+            pixel: (2,) - [v, u] pixel coordinates (row first, then column), matching depth[v, u]
+            depth: (H, W) - depth image where depth[v, u] gives depth in meters
+            intrinsic: (4, 4) - camera intrinsic matrix
+            tf_camera_to_episodic: (4, 4) - transformation from camera to episodic frame
+        Returns:
+            (x, y): (x, y) coordinates in the episodic frame
+        '''
+        v, u = pixel
+        z = depth[v, u]
+
+        x = (u - intrinsic[0, 2]) * z / intrinsic[0, 0]
+        y = (v - intrinsic[1, 2]) * z / intrinsic[1, 1]
+        point_camera = np.array([x, y, z, 1.0])
+
+        # Transform to episodic frame
+        point_episodic = tf_camera_to_episodic @ point_camera
+        point_episodic = point_episodic[:3] / point_episodic[3]
+
+        x = point_episodic[0]
+        y = point_episodic[1]
+
+        return (x, y)  # same as habitat gps
+
+    def dot_matrix_two_dimensional(
+        self,
+        image_or_image_path,
+        save_path=None,
+        dots_size_w=8,
+        dots_size_h=8,
+        save_img=False,
+        font_path='fonts/arial.ttf',
+        pixel_goal=None,
+    ):
+        """
+        Overlay a grid of labelled dots on an image and return the drawn image; it is written to save_path only when save_img is true. Each dot is labelled "(row,col)", counted from 1.
+        control args:
+        1. dots_size_w: the number of columns of the dots matrix
+        2. dots_size_h: the number of rows of the dots matrix; pixel_goal, if given, is (y, x) and its nearest dot is drawn in red
+        """
+        with open_image(image_or_image_path) as img:
+            if img.mode != 'RGB':
+                img = img.convert('RGB')
+            draw = ImageDraw.Draw(img, 'RGB')
+
+            width, height = img.size
+            grid_size_w = dots_size_w + 1
+            grid_size_h = dots_size_h + 1
+            cell_width = width / grid_size_w
+            cell_height = height / grid_size_h
+
+            font = ImageFont.truetype(font_path, width // 40)  # Adjust font size if needed; default == width // 40
+
+            target_i = target_j = None
+            if pixel_goal is not None:
+                y_pixel, x_pixel = pixel_goal[0], pixel_goal[1]  # pixel_goal is ordered (y, x)
+                # Validate pixel coordinates
+                if not (0 <= x_pixel < width and 0 <= y_pixel < height):
+                    raise ValueError(f"pixel_goal {pixel_goal} exceeds image dimensions ({width}x{height})")
+
+                # Convert to grid coordinates
+                target_i = round(x_pixel / cell_width)
+                target_j = round(y_pixel / cell_height)
+
+                # Validate grid bounds
+                if not (1 <= target_i <= dots_size_w and 1 <= target_j <= dots_size_h):
+                    raise ValueError(
+                        f"pixel_goal {pixel_goal} maps to grid ({target_j},{target_i}), "
+                        f"valid range is (1,1)-({dots_size_h},{dots_size_w})"
+                    )
+
+            count = 0
+
+            for j in range(1, grid_size_h):
+                for i in range(1, grid_size_w):
+                    x = int(i * cell_width)
+                    y = int(j * cell_height)
+
+                    pixel_color = img.getpixel((x, y))
+                    # choose a more contrasting color from black and white
+                    if pixel_color[0] + pixel_color[1] + pixel_color[2] >= 255 * 3 / 2:
+                        opposite_color = (0, 0, 0)
+                    else:
+                        opposite_color = (255, 255, 255)
+
+                    if pixel_goal is not None and i == target_i and j == target_j:
+                        opposite_color = (255, 0, 0)  # Red for target
+
+                    circle_radius = width // 240  # Adjust dot size if needed; default == width // 240
+                    draw.ellipse(
+                        [(x - circle_radius, y - circle_radius), (x + circle_radius, y + circle_radius)],
+                        fill=opposite_color,
+                    )
+
+                    text_x, text_y = x + 3, y
+                    count_w = count // dots_size_w  # 0-based row index (equals j - 1)
+                    count_h = count % dots_size_w  # 0-based column index (equals i - 1)
+                    label_str = f"({count_w+1},{count_h+1})"
+                    draw.text((text_x, text_y), label_str, fill=opposite_color, font=font)
+                    count += 1
+            if save_img:
+                print(">>> dots overlaid image processed, stored in", save_path)
+                img.save(save_path)
+            return img
diff --git a/internnav/internnav_habitat/habitat_vln_evaluator.py b/internnav/internnav_habitat/habitat_vln_evaluator.py
index ef18ec07..ba86d6db 100644
--- a/internnav/internnav_habitat/habitat_vln_evaluator.py
+++ b/internnav/internnav_habitat/habitat_vln_evaluator.py
@@ -33,7 +33,6 @@
 
 try:
     import habitat
-    from habitat import Env
     from habitat.config.default import get_agent_config
     from habitat.config.default_structured_configs import (
         CollisionsMeasurementConfig,
@@ -41,10 +40,7 @@
         TopDownMapMeasurementConfig,
     )
     from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower
-    from habitat.utils.visualizations.utils import (
-        images_to_video,
-        observations_to_image,
-    )
+    from habitat.utils.visualizations.utils import observations_to_image
     from habitat_baselines.config.default import get_config as get_habitat_config
 except Exception as e:
     print("Habitat Error:", e)
@@ -58,9 +54,10 @@
 class HabitatVlnEvaluator(DistributedEvaluator):
     def __init__(self, cfg: EvalCfg):
         args = argparse.Namespace(**cfg.eval_settings)
-        self.args = args
         self.save_video = args.save_video
         self.epoch = args.epoch
+        self.max_steps_per_episode = args.max_steps_per_episode
+        self.output_path = args.output_path
 
         # create habitat config
         self.config_path = cfg.env.env_settings['config_path']
@@ -90,31 +87,34 @@ def __init__(self, cfg: EvalCfg):
                 }
             )
         cfg.env.env_settings['habitat_config'] = self.config
+        cfg.env.env_settings['output_path'] = self.output_path
 
         # init agent and env
         super().__init__(cfg)
 
         # ------------------------------------- model ------------------------------------------
-        processor = AutoProcessor.from_pretrained(args.model_path)
+        self.model_args = argparse.Namespace(**cfg.agent.model_settings)
+
+        processor = AutoProcessor.from_pretrained(self.model_args.model_path)
         processor.tokenizer.padding_side = 'left'
 
         device = torch.device(f"cuda:{self.local_rank}")
-        if args.mode == 'dual_system':
+        if self.model_args.mode == 'dual_system':
             model = InternVLAN1ForCausalLM.from_pretrained(
-                args.model_path,
+                self.model_args.model_path,
                 torch_dtype=torch.bfloat16,
                 attn_implementation="flash_attention_2",
                 device_map={"": device},
             )
-        elif args.mode == 'system2':
+        elif self.model_args.mode == 'system2':
             model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-                args.model_path,
+                self.model_args.model_path,
                 torch_dtype=torch.bfloat16,
                 attn_implementation="flash_attention_2",
                 device_map={"": device},
             )
         else:
-            raise ValueError(f"Invalid mode: {args.mode}")
+            raise ValueError(f"Invalid mode: {self.model_args.mode}")
 
         model.eval()
         self.device = device
@@ -149,9 +149,9 @@ def __init__(self, cfg: EvalCfg):
 
         self.objectnav_instructions = ["Search for the {target_object}."]
 
-        self.num_frames = args.num_frames
-        self.num_future_steps = args.num_future_steps
-        self.num_history = args.num_history
+        self.num_frames = self.model_args.num_frames
+        self.num_future_steps = self.model_args.num_future_steps
+        self.num_history = self.model_args.num_history
 
         # ------------------------------------- old ------------------------------------------
         self._camera_height = self.sim_sensors_config.rgb_sensor.position[1]
@@ -162,8 +162,6 @@ def __init__(self, cfg: EvalCfg):
         self._camera_fov = camera_fov_rad
         self._fx = self._fy = self.sim_sensors_config.depth_sensor.width / (2 * np.tan(camera_fov_rad / 2))
 
-        # ------------------------------------- remove ------------------------------------------
-
     def eval_action(self):
         """
         Run local episodes on this rank.
@@ -171,10 +169,10 @@ def eval_action(self):
         Returns dict[str, Tensor] on GPU (1D tensors of same length).
         """
         # Old behavior was something like:
-        # sucs, spls, oss, nes, ep_num = self.eval_action(self.args.local_rank)
+        # sucs, spls, oss, nes, ep_num = self.eval_action(self.local_rank)
         # Now just implement the actual eval here and return dict.
 
-        sucs, spls, oss, nes, _ = self._run_local_eval(self.args.local_rank)
+        sucs, spls, oss, nes, _ = self._run_local_eval(self.local_rank)
 
         return {
             "sucs": sucs,  # shape [N_local]
@@ -203,126 +201,153 @@ def calc_metrics(self, global_metrics: dict) -> dict:
             # "length" will be filled by base class
         }
 
-    def _eval_action(self):
-        obs = self.env.reset()
-        action = self.agent.reset()
-        while not self.env.is_running():
-            action = self.agent.step(action, obs)
-            obs, terminated = self.env.step(action)
-            if terminated:
-                obs = self.env.reset()
-                self.agent.reset()
-                self.env.update_metric()
-
-    # refactor
-    def config_env(self) -> Env:
-        env = Env(config=self.config)
-        env.episodes = env.episodes[0:2]  # for debug
-        return env
-
     def _run_local_eval(self, idx=0) -> None:  # noqa: C901
-        self.model.eval()
-        env = self.config_env()
-        scene_episode_dict = {}
-        for episode in env.episodes:
-            if episode.scene_id not in scene_episode_dict:
-                scene_episode_dict[episode.scene_id] = []
-            scene_episode_dict[episode.scene_id].append(episode)
-
-        intrinsic_matrix = self.get_intrinsic_matrix(
-            self.config.habitat.simulator.agents.main_agent.sim_sensors.rgb_sensor
-        )
+        """
+        Run local evaluation on this rank.
+
+        Important: if resuming from previous results, need to read from / write to "self.output_path/progress.json".
+                    For each episode, save the result dict in jsonl format to that file.
+                    In Env, the episodes are already filtered by this file, tasks that have the same (scene_id, episode_id) are skipped.
+
+
+        Returns
+        -------
+        dict[str, Tensor]:
+            {
+                "sucs": [N_local],
+                "spls": [N_local],
+                "oss":  [N_local],
+                "nes":  [N_local],
+            }
+        """
+        # Create / get env
+        # self.env = self.env  # HabitatEnv from DistributedEvaluator
+
         sucs, spls, oss, nes = [], [], [], []
-        done_res = []
+        self.model.eval()
 
-        if os.path.exists(os.path.join(self.output_path, 'result.json')):
-            with open(os.path.join(self.output_path, 'result.json'), 'r') as f:
+        # resume from previous results
+        # TODO: Current read write op is not distributed safe
+        if os.path.exists(os.path.join(self.output_path, 'progress.json')):
+            with open(os.path.join(self.output_path, 'progress.json'), 'r') as f:
                 for line in f.readlines():
                     res = json.loads(line)
-                    done_res.append([res["scene_id"], res["episode_id"]])
+                    if "scene_id" not in res:
+                        print("This evaluation has already finished!")
+                        return (
+                            torch.tensor(sucs).to(self.device),
+                            torch.tensor(spls).to(self.device),
+                            torch.tensor(oss).to(self.device),
+                            torch.tensor(nes).to(self.device),
+                            torch.tensor(len(sucs)).to(self.device),
+                        )
                     if idx == 0:  # noqa: F405 TODO this need to keep in evaluator
                         sucs.append(res['success'])
                         spls.append(res['spl'])
                         oss.append(res['os'])
                         nes.append(res['ne'])
 
-        # refactor: sort to scene: [episode] but nothing actually used
-        for scene in sorted(scene_episode_dict.keys()):
-            episodes = scene_episode_dict[scene]
-            scene_id = scene.split('/')[-2]
-            print(f"scene_id = {scene_id}")
-            process_bar = tqdm.tqdm(range(len(episodes[idx :: self.world_size])), desc=f"scene {scene_id}")
-            for episode in episodes[idx :: self.world_size]:
-                episode_instruction = (
-                    episode.instruction.instruction_text
-                    if 'objectnav' not in self.config_path
-                    else episode.object_category
-                )
-                print("episode start", episode_instruction)
-                episode_id = int(episode.episode_id)
-                if [scene_id, episode_id] in done_res:
-                    continue
-
-                # refactor env warm up
-                env.current_episode = episode
-                observations = env.reset()
-
-                agent_state = env.sim.get_agent_state()
-                rotation = agent_state.rotation
-                translation = agent_state.position
-                rotation_matrix = quaternion.as_rotation_matrix(rotation)
-                transformation_matrix = np.eye(4)
-                transformation_matrix[:3, :3] = rotation_matrix
-                transformation_matrix[:3, 3] = translation
-
-                agent = ShortestPathFollower(env.sim, 0.25, False)
-
-                os.makedirs(os.path.join(self.output_path, f'check_sim_{self.epoch}'), exist_ok=True)
-                Image.fromarray(observations['rgb']).save(
-                    os.path.join(self.output_path, f'check_sim_{self.epoch}', f'rgb_{idx}.jpg')
+        # Episode loop is now driven by env.reset() + env.is_running
+        process_bar = tqdm.tqdm(total=len(self.env.episodes), desc=f"Eval Epoch {self.epoch} Rank {idx}")
+        while self.env.is_running:
+
+            # ------------ 1. Start of episode ------------
+            observations = self.env.reset()
+            if not self.env.is_running or observations is None:
+                break
+
+            # ---- episode meta (scene_id, episode_id, instruction) ----
+            # we get it from the underlying habitat env
+            episode = self.env.get_current_episode()
+            scene_id = episode.scene_id.split('/')[-2]
+            episode_id = int(episode.episode_id)
+            episode_instruction = (
+                episode.instruction.instruction_text if 'objectnav' not in self.config_path else episode.object_category
+            )
+            print("episode start", episode_instruction)
+
+            agent_state = self.env._env.sim.get_agent_state()
+            rotation = agent_state.rotation
+            translation = agent_state.position
+            rotation_matrix = quaternion.as_rotation_matrix(rotation)
+            transformation_matrix = np.eye(4)
+            transformation_matrix[:3, :3] = rotation_matrix
+            transformation_matrix[:3, 3] = translation
+
+            agent = ShortestPathFollower(self.env._env.sim, 0.25, False)
+
+            # save first frame per rank to validate sim quality
+            os.makedirs(os.path.join(self.output_path, f'check_sim_{self.epoch}'), exist_ok=True)
+            Image.fromarray(observations['rgb']).save(
+                os.path.join(self.output_path, f'check_sim_{self.epoch}', f'rgb_{idx}.jpg')
+            )
+
+            vis_frames = []
+            step_id = 0
+
+            if self.save_video:
+                os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
+            initial_height = self.env._env.sim.get_agent_state().position[1]
+
+            rgb_list = []
+            action_seq = []
+            output_ids = None
+
+            goal = None
+            action = None
+            messages = []
+            local_actions = []
+
+            done = False
+
+            # ---------- 2. Episode step loop -----------
+            while (not done) and (step_id <= self.max_steps_per_episode):
+                # refactor agent get action
+                rgb = observations["rgb"]
+                depth = observations["depth"]
+                x, y = observations["gps"]
+                camera_yaw = observations["compass"][0]
+                depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None)
+                depth = depth * (self._max_depth - self._min_depth) + self._min_depth
+                depth = depth * 1000
+
+                agent_state = self.env._env.sim.get_agent_state()
+                height = agent_state.position[1] - initial_height
+                camera_position = np.array([x, -y, self._camera_height + height])
+                tf_camera_to_episodic = (
+                    self.xyz_yaw_pitch_to_tf_matrix(camera_position, camera_yaw, np.deg2rad(30))
+                    @ self.get_axis_align_matrix()
                 )
 
-                vis_frames = []
-                step_id = 0
-
-                if self.save_video:
-                    os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
-                initial_height = env.sim.get_agent_state().position[1]
-
-                rgb_list = []
-                action_seq = []
-                output_ids = None
-
-                goal = None
-                action = None
-                messages = []
-                local_actions = []
-
-                while not env.episode_over and step_id <= 500:
-                    # refactor agent get action
-                    rgb = observations["rgb"]
-                    depth = observations["depth"]
-                    x, y = observations["gps"]
-                    camera_yaw = observations["compass"][0]
-                    depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None)
-                    depth = depth * (self._max_depth - self._min_depth) + self._min_depth
-                    depth = depth * 1000
-
-                    agent_state = env.sim.get_agent_state()
-                    height = agent_state.position[1] - initial_height
-                    camera_position = np.array([x, -y, self._camera_height + height])
-                    tf_camera_to_episodic = (
-                        self.xyz_yaw_pitch_to_tf_matrix(camera_position, camera_yaw, np.deg2rad(30))
-                        @ self.get_axis_align_matrix()
+                image = Image.fromarray(rgb).convert('RGB')
+                save_raw_image = image.copy()
+
+                save_dot = False
+                if action == 5:
+                    look_down_image = image
+                    save_raw_image = look_down_image.copy()
+                    look_down_depth, resize_shape = self.preprocess_depth_image_v2(
+                        Image.fromarray(depth.astype(np.uint16), mode='I;16'),
+                        do_depth_scale=True,
+                        depth_scale=1000,
+                        target_height=224,
+                        target_width=224,
                     )
-
-                    image = Image.fromarray(rgb).convert('RGB')
-                    save_raw_image = image.copy()
-
-                    save_dot = False
-                    if action == 5:
-                        look_down_image = image
-                        save_raw_image = look_down_image.copy()
+                    look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float()
+                    look_down_depth[look_down_depth > 5.0] = 5.0
+                else:
+                    image = image.resize((self.model_args.resize_w, self.model_args.resize_h))
+                    rgb_list.append(image)
+
+                    if self.model_args.mode == 'dual_system':
+                        down_observations, _, done, _ = self.env.step(5)
+                        down_observations, _, done, _ = self.env.step(5)
+
+                        look_down_image = Image.fromarray(down_observations["rgb"]).convert('RGB')
+                        depth = down_observations["depth"]
+                        depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None)
+                        depth = depth * (self._max_depth - self._min_depth) + self._min_depth
+                        depth = depth * 1000
                         look_down_depth, resize_shape = self.preprocess_depth_image_v2(
                             Image.fromarray(depth.astype(np.uint16), mode='I;16'),
                             do_depth_scale=True,
@@ -332,295 +357,277 @@ def _run_local_eval(self, idx=0) -> None:  # noqa: C901
                         )
                         look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float()
                         look_down_depth[look_down_depth > 5.0] = 5.0
-                    else:
-                        image = image.resize((self.args.resize_w, self.args.resize_h))
-                        rgb_list.append(image)
-
-                        if self.args.mode == 'dual_system':
-                            down_observations = env.step(5)
-                            down_observations = env.step(5)
-
-                            look_down_image = Image.fromarray(down_observations["rgb"]).convert('RGB')
-                            depth = down_observations["depth"]
-                            depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None)
-                            depth = depth * (self._max_depth - self._min_depth) + self._min_depth
-                            depth = depth * 1000
-                            look_down_depth, resize_shape = self.preprocess_depth_image_v2(
-                                Image.fromarray(depth.astype(np.uint16), mode='I;16'),
-                                do_depth_scale=True,
-                                depth_scale=1000,
-                                target_height=224,
-                                target_width=224,
-                            )
-                            look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float()
-                            look_down_depth[look_down_depth > 5.0] = 5.0
 
-                            env.step(4)
-                            env.step(4)
+                        self.env.step(4)
+                        self.env.step(4)
 
-                    info = env.get_metrics()
+                info = self.env.get_metrics()
 
-                    if len(action_seq) == 0 and goal is None:
-                        if action != 5:
-                            sources = copy.deepcopy(self.conversation)
-                            sources[0]["value"] = sources[0]["value"].replace(
-                                '<instruction>.', episode.instruction.instruction_text[:-1]
-                            )
-                            cur_images = rgb_list[-1:]
-                            if step_id == 0:
-                                history_id = []
-                            else:
-                                history_id = np.unique(
-                                    np.linspace(0, step_id - 1, self.num_history, dtype=np.int32)
-                                ).tolist()
-                                placeholder = (DEFAULT_IMAGE_TOKEN + '\n') * len(history_id)
-                                sources[0]["value"] += f' These are your historical observations: {placeholder}.'
-
-                            history_id = sorted(history_id)
-                            print('history_idddddddd', step_id, history_id)
-                            input_images = [rgb_list[i] for i in history_id] + cur_images
-                            input_img_id = 0
+                if len(action_seq) == 0 and goal is None:
+                    if action != 5:
+                        sources = copy.deepcopy(self.conversation)
+                        sources[0]["value"] = sources[0]["value"].replace(
+                            '<instruction>.', episode.instruction.instruction_text[:-1]
+                        )
+                        cur_images = rgb_list[-1:]
+                        if step_id == 0:
+                            history_id = []
                         else:
-                            assert action == 5
-                            sources = [{"from": "human", "value": ""}, {"from": "gpt", "value": ""}]
-                            input_images += [look_down_image]
-                            # messages.append(
-                            #     {'role': 'assistant', 'content': [{'type': 'text', 'text': llm_outputs}]}  # noqa: F405
-                            # )
-                            input_img_id = -1
-
-                        prompt = random.choice(self.conjunctions) + DEFAULT_IMAGE_TOKEN
-                        sources[0]["value"] += f" {prompt}."
-                        print('sources', step_id, sources)
-                        prompt_instruction = copy.deepcopy(sources[0]["value"])
-                        parts = split_and_clean(prompt_instruction)
-
-                        content = []
-                        for i in range(len(parts)):
-                            if parts[i] == "<image>":
-                                content.append({"type": "image", "image": input_images[input_img_id]})
-                                input_img_id += 1
-                            else:
-                                content.append({"type": "text", "text": parts[i]})
+                            history_id = np.unique(
+                                np.linspace(0, step_id - 1, self.num_history, dtype=np.int32)
+                            ).tolist()
+                            placeholder = (DEFAULT_IMAGE_TOKEN + '\n') * len(history_id)
+                            sources[0]["value"] += f' These are your historical observations: {placeholder}.'
+
+                        history_id = sorted(history_id)
+                        print('history_idddddddd', step_id, history_id)
+                        input_images = [rgb_list[i] for i in history_id] + cur_images
+                        input_img_id = 0
+                    else:
+                        assert action == 5
+                        sources = [{"from": "human", "value": ""}, {"from": "gpt", "value": ""}]
+                        input_images += [look_down_image]
+                        # messages.append(
+                        #     {'role': 'assistant', 'content': [{'type': 'text', 'text': llm_outputs}]}  # noqa: F405
+                        # )
+                        input_img_id = -1
+
+                    prompt = random.choice(self.conjunctions) + DEFAULT_IMAGE_TOKEN
+                    sources[0]["value"] += f" {prompt}."
+                    print('sources', step_id, sources)
+                    prompt_instruction = copy.deepcopy(sources[0]["value"])
+                    parts = split_and_clean(prompt_instruction)
+
+                    content = []
+                    for i in range(len(parts)):
+                        if parts[i] == "<image>":
+                            content.append({"type": "image", "image": input_images[input_img_id]})
+                            input_img_id += 1
+                        else:
+                            content.append({"type": "text", "text": parts[i]})
 
-                        messages.append({'role': 'user', 'content': content})
+                    messages.append({'role': 'user', 'content': content})
 
-                        print('step_id', step_id, 'messages:', messages)
+                    print('step_id', step_id, 'messages:', messages)
 
-                        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+                    text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
-                        inputs = self.processor(text=[text], images=input_images, return_tensors="pt").to(
-                            self.model.device
-                        )
+                    inputs = self.processor(text=[text], images=input_images, return_tensors="pt").to(self.model.device)
 
-                        with torch.no_grad():
-                            output_ids = self.model.generate(**inputs, max_new_tokens=128, do_sample=False)
+                    with torch.no_grad():
+                        output_ids = self.model.generate(**inputs, max_new_tokens=128, do_sample=False)
+
+                    llm_outputs = self.processor.tokenizer.decode(
+                        output_ids[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
+                    )
+                    print('step_id:', step_id, 'output text:', llm_outputs)
 
-                        llm_outputs = self.processor.tokenizer.decode(
-                            output_ids[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
+                    if bool(re.search(r'\d', llm_outputs)):
+                        forward_action = 0
+                        coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]
+                        pixel_goal = [int(coord[1]), int(coord[0])]
+
+                        intrinsic_matrix = self.get_intrinsic_matrix(
+                            self.config.habitat.simulator.agents.main_agent.sim_sensors.rgb_sensor
                         )
-                        print('step_id:', step_id, 'output text:', llm_outputs)
-
-                        if bool(re.search(r'\d', llm_outputs)):
-                            forward_action = 0
-                            coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]
-                            pixel_goal = [int(coord[1]), int(coord[0])]
-
-                            goal = self.pixel_to_gps(pixel_goal, depth / 1000, intrinsic_matrix, tf_camera_to_episodic)
-                            print('before', goal, depth.shape)
-                            goal = (transformation_matrix @ np.array([-goal[1], 0, -goal[0], 1]))[:3]
-
-                            if not env.sim.pathfinder.is_navigable(np.array(goal)):
-                                goal = np.array(env.sim.pathfinder.snap_point(np.array(goal)))
-
-                            # look down --> horizontal
-                            env.step(4)
-                            env.step(4)
-
-                            # Forking logic based on mode
-                            if self.args.mode == 'system2':
-                                action = agent.get_next_action(goal)
-                                if action == 0:
-                                    goal = None
-                                    output_ids = None
-                                    action = 2  # random action
-                                    print('conduct a random action 2')
-                                    observations = env.step(action)
-                                    step_id += 1
-                                    messages = []
-                                    continue
-                            else:  # dual-system logic
-                                local_actions = []
-                                pixel_values = inputs.pixel_values
-                                image_grid_thw = torch.cat([thw.unsqueeze(0) for thw in inputs.image_grid_thw], dim=0)
-
-                                with torch.no_grad():
-                                    traj_latents = self.model.generate_latents(output_ids, pixel_values, image_grid_thw)
-
-                                # prepocess align with navdp
-                                image_dp = (
-                                    torch.tensor(np.array(look_down_image.resize((224, 224)))).to(torch.bfloat16) / 255
-                                )
-                                pix_goal_image = copy.copy(image_dp)
-                                images_dp = torch.stack([pix_goal_image, image_dp]).unsqueeze(0).to(self.device)
-                                depth_dp = look_down_depth.unsqueeze(-1).to(torch.bfloat16)
-                                pix_goal_depth = copy.copy(depth_dp)
-                                depths_dp = torch.stack([pix_goal_depth, depth_dp]).unsqueeze(0).to(self.device)
-
-                                with torch.no_grad():
-                                    dp_actions = self.model.generate_traj(
-                                        traj_latents, images_dp, depths_dp, use_async=True
-                                    )
-
-                                random_choice = np.random.choice(dp_actions.shape[0])
-                                if self.args.continuous_traj:
-                                    action_list = traj_to_actions(dp_actions)
-                                    if len(action_list) < 8:
-                                        action_list += [0] * (8 - len(action_list))
-                                else:
-                                    action_list = chunk_token(dp_actions[random_choice])
-
-                                local_actions = action_list
-                                if len(local_actions) >= 4:
-                                    local_actions = local_actions[:4]
-                                action = local_actions[0]
-                                if action == 0:
-                                    goal = None
-                                    output_ids = None
-                                    action = 2  # random action
-                                    print('conduct a random action 2')
-                                    observations = env.step(action)
-                                    step_id += 1
-                                    messages = []
-                                    continue
-
-                            print('predicted goal', pixel_goal, goal, flush=True)
-                        else:
-                            action_seq = self.parse_actions(llm_outputs)
-                            print('actions', action_seq, flush=True)
+                        goal = self.pixel_to_gps(pixel_goal, depth / 1000, intrinsic_matrix, tf_camera_to_episodic)
+                        print('before', goal, depth.shape)
+                        goal = (transformation_matrix @ np.array([-goal[1], 0, -goal[0], 1]))[:3]
+
+                        if not self.env._env.sim.pathfinder.is_navigable(np.array(goal)):
+                            goal = np.array(self.env._env.sim.pathfinder.snap_point(np.array(goal)))
+
+                        # look down --> horizontal
+                        self.env.step(4)
+                        self.env.step(4)
 
-                    if len(action_seq) != 0:
-                        action = action_seq[0]
-                        action_seq.pop(0)
-                    elif goal is not None:
                         # Forking logic based on mode
-                        if self.args.mode == 'system2':
+                        if self.model_args.mode == 'system2':
                             action = agent.get_next_action(goal)
-                            action = action.detach().cpu().numpy()[0] if isinstance(action, torch.Tensor) else action
-                            action = action[0] if hasattr(action, "__len__") else action
+                            if action == 0:
+                                goal = None
+                                output_ids = None
+                                action = 2  # random action
+                                print('conduct a random action 2')
+                                observations = self.env.step(action)
+                                step_id += 1
+                                messages = []
+                                continue
                         else:  # dual-system logic
-                            if len(local_actions) == 0:
-                                # navdp
-                                local_actions = []
-                                image_dp = (
-                                    torch.tensor(np.array(look_down_image.resize((224, 224)))).to(torch.bfloat16) / 255
+                            local_actions = []
+                            pixel_values = inputs.pixel_values
+                            image_grid_thw = torch.cat([thw.unsqueeze(0) for thw in inputs.image_grid_thw], dim=0)
+
+                            with torch.no_grad():
+                                traj_latents = self.model.generate_latents(output_ids, pixel_values, image_grid_thw)
+
+                            # prepocess align with navdp
+                            image_dp = (
+                                torch.tensor(np.array(look_down_image.resize((224, 224)))).to(torch.bfloat16) / 255
+                            )
+                            pix_goal_image = copy.copy(image_dp)
+                            images_dp = torch.stack([pix_goal_image, image_dp]).unsqueeze(0).to(self.device)
+                            depth_dp = look_down_depth.unsqueeze(-1).to(torch.bfloat16)
+                            pix_goal_depth = copy.copy(depth_dp)
+                            depths_dp = torch.stack([pix_goal_depth, depth_dp]).unsqueeze(0).to(self.device)
+
+                            with torch.no_grad():
+                                dp_actions = self.model.generate_traj(
+                                    traj_latents, images_dp, depths_dp, use_async=True
                                 )
 
-                                images_dp = torch.stack([pix_goal_image, image_dp]).unsqueeze(0).to(self.device)
-                                depth_dp = look_down_depth.unsqueeze(-1).to(torch.bfloat16)
-
-                                depths_dp = torch.stack([pix_goal_depth, depth_dp]).unsqueeze(0).to(self.device)
-                                with torch.no_grad():
-                                    dp_actions = self.model.generate_traj(
-                                        traj_latents, images_dp, depths_dp, use_async=True
-                                    )
-
-                                random_choice = np.random.choice(dp_actions.shape[0])
-                                if self.args.continuous_traj:
-                                    action_list = traj_to_actions(dp_actions)
-                                    if len(action_list) < 8:
-                                        action_list += [0] * (8 - len(action_list))
-                                else:
-                                    action_list = chunk_token(dp_actions[random_choice])
-                                print("first action_list", action_list)
-
-                                local_actions = action_list
-                                if len(local_actions) >= 4:
-                                    local_actions = local_actions[:4]
-                                # if len(local_actions) >= 2:
-                                #     local_actions = local_actions[:2]
-
-                                print("local_actions", local_actions)
-
-                                action = local_actions.pop(0)
-                                # navdp
+                            random_choice = np.random.choice(dp_actions.shape[0])
+                            if self.model_args.continuous_traj:
+                                action_list = traj_to_actions(dp_actions)
+                                if len(action_list) < 8:
+                                    action_list += [0] * (8 - len(action_list))
                             else:
-                                action = local_actions.pop(0)
-
-                        forward_action += 1
-                        print('forward_action', forward_action, flush=True)
-                        if forward_action > 8:
-                            goal = None
-                            output_ids = None
-                            messages = []
-                            step_id += 1
-                            forward_action = 0
-                            local_actions = []
-                            continue
-                        if action == 0:
-                            goal = None
-                            output_ids = None
-                            messages = []
-                            step_id += 1
-                            forward_action = 0
-                            local_actions = []
-                            continue
+                                action_list = chunk_token(dp_actions[random_choice])
+
+                            local_actions = action_list
+                            if len(local_actions) >= 4:
+                                local_actions = local_actions[:4]
+                            action = local_actions[0]
+                            if action == 0:
+                                goal = None
+                                output_ids = None
+                                action = 2  # random action
+                                print('conduct a random action 2')
+                                observations = self.env.step(action)
+                                step_id += 1
+                                messages = []
+                                continue
+
+                        print('predicted goal', pixel_goal, goal, flush=True)
                     else:
-                        action = 0
-
-                    if info['top_down_map'] is not None:
-                        if save_dot:
-                            save_raw_image = self.dot_matrix_two_dimensional(
-                                save_raw_image, save_img=False, save_path=f'test_{step_id}.jpg', pixel_goal=pixel_goal
+                        action_seq = self.parse_actions(llm_outputs)
+                        print('actions', action_seq, flush=True)
+
+                if len(action_seq) != 0:
+                    action = action_seq[0]
+                    action_seq.pop(0)
+                elif goal is not None:
+                    # Forking logic based on mode
+                    if self.model_args.mode == 'system2':
+                        action = agent.get_next_action(goal)
+                        action = action.detach().cpu().numpy()[0] if isinstance(action, torch.Tensor) else action
+                        action = action[0] if hasattr(action, "__len__") else action
+                    else:  # dual-system logic
+                        if len(local_actions) == 0:
+                            # navdp
+                            local_actions = []
+                            image_dp = (
+                                torch.tensor(np.array(look_down_image.resize((224, 224)))).to(torch.bfloat16) / 255
                             )
-                        frame = observations_to_image({'rgb': np.asarray(save_raw_image)}, info)
-                        vis_frames.append(frame)
 
-                    print("step_id", step_id, "action", action)
+                            images_dp = torch.stack([pix_goal_image, image_dp]).unsqueeze(0).to(self.device)
+                            depth_dp = look_down_depth.unsqueeze(-1).to(torch.bfloat16)
 
-                    # refactor: core
-                    if action == 5:
-                        env.step(action)
-                        observations = env.step(action)
-                    else:
-                        observations = env.step(action)
+                            depths_dp = torch.stack([pix_goal_depth, depth_dp]).unsqueeze(0).to(self.device)
+                            with torch.no_grad():
+                                dp_actions = self.model.generate_traj(
+                                    traj_latents, images_dp, depths_dp, use_async=True
+                                )
+
+                            random_choice = np.random.choice(dp_actions.shape[0])
+                            if self.model_args.continuous_traj:
+                                action_list = traj_to_actions(dp_actions)
+                                if len(action_list) < 8:
+                                    action_list += [0] * (8 - len(action_list))
+                            else:
+                                action_list = chunk_token(dp_actions[random_choice])
+                            print("first action_list", action_list)
+
+                            local_actions = action_list
+                            if len(local_actions) >= 4:
+                                local_actions = local_actions[:4]
+                            # if len(local_actions) >= 2:
+                            #     local_actions = local_actions[:2]
+
+                            print("local_actions", local_actions)
+
+                            action = local_actions.pop(0)
+                            # navdp
+                        else:
+                            action = local_actions.pop(0)
+
+                    forward_action += 1
+                    print('forward_action', forward_action, flush=True)
+                    if forward_action > 8:
+                        goal = None
+                        output_ids = None
+                        messages = []
                         step_id += 1
+                        forward_action = 0
+                        local_actions = []
+                        continue
+                    if action == 0:
+                        goal = None
+                        output_ids = None
                         messages = []
+                        step_id += 1
+                        forward_action = 0
+                        local_actions = []
+                        continue
+                else:
+                    action = 0
+
+                if info['top_down_map'] is not None:
+                    if save_dot:
+                        save_raw_image = self.dot_matrix_two_dimensional(
+                            save_raw_image, save_img=False, save_path=f'test_{step_id}.jpg', pixel_goal=pixel_goal
+                        )
+                    if self.save_video:
+                        frame = observations_to_image({'rgb': np.asarray(save_raw_image)}, info)
+                        vis_frames.append(frame)
 
-                process_bar.update(1)
+                print("step_id", step_id, "action", action)
 
-                metrics = env.get_metrics()
-                if self.save_video:
-                    images_to_video(
-                        vis_frames,
-                        os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'),
-                        f'{episode_id:04d}',
-                        fps=6,
-                        quality=9,
-                    )
-                vis_frames.clear()
-                sucs.append(metrics['success'])
-                spls.append(metrics['spl'])
-                oss.append(metrics['oracle_success'])
-                nes.append(metrics["distance_to_goal"])
-                print(
-                    f"scene_episode {scene_id}_{episode_id:04d} success: {metrics['success']}, spl: {metrics['spl']}, os: {metrics['oracle_success']}, ne: {metrics['distance_to_goal']}"
-                )
+                # refactor: core
+                if action == 5:
+                    self.env.step(action)
+                    observations, _, done, _ = self.env.step(action)
+                else:
+                    observations, _, done, _ = self.env.step(action)
+                    step_id += 1
+                    messages = []
 
-                result = {
-                    "scene_id": scene_id,
-                    "episode_id": episode_id,
-                    "success": metrics["success"],
-                    "spl": metrics["spl"],
-                    "os": metrics['oracle_success'],
-                    "ne": metrics["distance_to_goal"],
-                    "steps": step_id,
-                    "episode_instruction": episode_instruction,
-                }
+            # ---------- 3. End of episode -----------
+            # Update result and write progress to the output_path/progress.json
+
+            process_bar.update(1)
+
+            # After the episode finishes, collect metrics:
+            metrics = self.env.get_metrics()
+
+            sucs.append(metrics['success'])
+            spls.append(metrics['spl'])
+            oss.append(metrics['oracle_success'])
+            nes.append(metrics["distance_to_goal"])
+
+            print(
+                f"scene_episode {scene_id}_{episode_id:04d} success: {metrics['success']}, "
+                f"spl: {metrics['spl']}, os: {metrics['oracle_success']}, "
+                f"ne: {metrics['distance_to_goal']}"
+            )
+
+            # Write per-episode result.json entry (still per-rank)
+            result = {
+                "scene_id": scene_id,
+                "episode_id": episode_id,
+                "success": metrics["success"],
+                "spl": metrics["spl"],
+                "os": metrics['oracle_success'],
+                "ne": metrics["distance_to_goal"],
+                "steps": step_id,
+                "episode_instruction": episode_instruction,
+            }
+            os.makedirs(self.output_path, exist_ok=True)
+            with open(os.path.join(self.output_path, 'progress.json'), 'a') as f:
+                f.write(json.dumps(result) + "\n")
+
+        self.env.close()
 
-                with open(os.path.join(self.output_path, 'result.json'), 'a') as f:
-                    f.write(json.dumps(result) + "\n")
-        env.close()
         return (
             torch.tensor(sucs).to(self.device),
             torch.tensor(spls).to(self.device),
diff --git a/internnav/internnav_habitat/habitat_vln_evaluator_clean.py b/internnav/internnav_habitat/habitat_vln_evaluator_clean.py
new file mode 100644
index 00000000..ff5e60ba
--- /dev/null
+++ b/internnav/internnav_habitat/habitat_vln_evaluator_clean.py
@@ -0,0 +1,128 @@
+import argparse
+import sys
+
+sys.path.append('./src/diffusion-policy')
+
+
+# Import for Habitat registry side effects — do not remove
+import internnav.env.utils.habitat_extensions.measures  # noqa: F401
+from internnav.configs.evaluator import EvalCfg
+from internnav.evaluator import DistributedEvaluator, Evaluator
+
+try:
+    import habitat
+    from habitat.config.default import get_agent_config
+    from habitat.config.default_structured_configs import (
+        CollisionsMeasurementConfig,
+        FogOfWarConfig,
+        TopDownMapMeasurementConfig,
+    )
+    from habitat_baselines.config.default import get_config as get_habitat_config
+except Exception as e:
+    print("Habitat Error:", e)
+    print("Habitat Evaluation is not loaded.")
+
+
+DEFAULT_IMAGE_TOKEN = "<image>"
+
+
+@Evaluator.register('habitat_vln')
+class HabitatVlnEvaluator(DistributedEvaluator):
+    def __init__(self, cfg: EvalCfg):
+        args = argparse.Namespace(**cfg.eval_settings)  # eval_settings must provide the four keys read below
+        self.args = args
+        self.save_video = args.save_video
+        self.epoch = args.epoch
+        self.max_steps_per_episode = args.max_steps_per_episode
+        self.output_path = args.output_path
+        # NOTE: the habitat names used below come from the guarded import at module top; if it failed they are undefined.
+        # Load the Habitat config from env_settings['config_path'] and keep handles to its agent and sensor sections.
+        self.config_path = cfg.env.env_settings['config_path']
+        self.config = get_habitat_config(self.config_path)
+        self.agent_config = get_agent_config(self.config.habitat.simulator)
+        self.sim_sensors_config = self.config.habitat.simulator.agents.main_agent.sim_sensors
+
+        with habitat.config.read_write(self.config):  # add top_down_map and collisions to the task measurements
+            self.config.habitat.task.measurements.update(
+                {
+                    "top_down_map": TopDownMapMeasurementConfig(
+                        map_padding=3,
+                        map_resolution=1024,
+                        draw_source=True,
+                        draw_border=True,
+                        draw_shortest_path=True,
+                        draw_view_points=True,
+                        draw_goal_positions=True,
+                        draw_goal_aabbs=True,
+                        fog_of_war=FogOfWarConfig(
+                            draw=True,
+                            visibility_dist=5.0,
+                            fov=90,
+                        ),
+                    ),
+                    "collisions": CollisionsMeasurementConfig(),
+                }
+            )
+        cfg.env.env_settings['habitat_config'] = self.config
+        cfg.env.env_settings['output_path'] = self.output_path
+
+        # init agent and env (base class); called only after cfg.env.env_settings has been filled in above
+        super().__init__(cfg)
+
+    def eval_action(self):
+        """
+        Run local episodes on this rank until the env stops running.
+
+        Returns a dict of four equal-length Python lists (sucs, spls, oss, nes), one entry per episode.
+        """
+        sucs, spls, oss, nes = [], [], [], []
+        env = self.env
+
+        while env.is_running:
+            obs = env.reset()
+            if not env.is_running or obs is None:
+                break  # presumably reset() found no further episode — TODO confirm
+
+            episode = env.env.current_episode
+            self.agent.reset(episode, env)
+            # NOTE(review): "<=" below allows max_steps_per_episode + 1 steps per episode; confirm this is intended.
+            done = False
+            step_id = 0
+            while not done and step_id <= self.max_steps_per_episode:
+                action = self.agent.act(obs, env, info=None)
+                obs, reward, done, info = env.step(action)
+                step_id += 1
+
+            m = env.get_metrics()
+            sucs.append(m["success"])
+            spls.append(m["spl"])
+            oss.append(m["oracle_success"])
+            nes.append(m["distance_to_goal"])
+
+        env.close()
+        return {
+            "sucs": sucs,  # length N_local
+            "spls": spls,  # length N_local
+            "oss": oss,  # length N_local
+            "nes": nes,  # length N_local
+        }
+
+    def calc_metrics(self, global_metrics: dict) -> dict:
+        """Average the gathered per-episode metrics; every mean is 0.0 when no episode ran.
+
+        global_metrics["sucs"] etc. are global 1-D CPU tensors with all episodes.
+        """
+        sucs_all = global_metrics["sucs"]
+        spls_all = global_metrics["spls"]
+        oss_all = global_metrics["oss"]
+        nes_all = global_metrics["nes"]
+
+        # Guard on the real episode count: .mean() of an empty tensor is NaN, not 0.
+        has_episodes = len(sucs_all) > 0
+        return {
+            "sucs_all": float(sucs_all.mean().item()) if has_episodes else 0.0,
+            "spls_all": float(spls_all.mean().item()) if has_episodes else 0.0,
+            "oss_all": float(oss_all.mean().item()) if has_episodes else 0.0,
+            "nes_all": float(nes_all.mean().item()) if has_episodes else 0.0,
+            # "length" will be filled by base class
+        }
diff --git a/internnav/utils/dist.py b/internnav/utils/dist.py
index 7994e255..82c14a31 100644
--- a/internnav/utils/dist.py
+++ b/internnav/utils/dist.py
@@ -1,13 +1,13 @@
-import os
-import time
 import builtins
 import datetime
+import os
 import subprocess
+import time
+from collections import defaultdict, deque
 
 import torch
 import torch.distributed as dist
 
-from collections import defaultdict, deque
 
 class SmoothedValue(object):
     def __init__(self, window_size=20, fmt=None):
@@ -60,11 +60,8 @@ def value(self):
 
     def __str__(self):
         return self.fmt.format(
-            median=self.median,
-            avg=self.avg,
-            global_avg=self.global_avg,
-            max=self.max,
-            value=self.value)
+            median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value
+        )
 
 
 class MetricLogger(object):
@@ -86,15 +83,12 @@ def __getattr__(self, attr):
             return self.meters[attr]
         if attr in self.__dict__:
             return self.__dict__[attr]
-        raise AttributeError("'{}' object has no attribute '{}'".format(
-            type(self).__name__, attr))
+        raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr))
 
     def __str__(self):
         loss_str = []
         for name, meter in self.meters.items():
-            loss_str.append(
-                "{}: {}".format(name, str(meter))
-            )
+            loss_str.append("{}: {}".format(name, str(meter)))
         return self.delimiter.join(loss_str)
 
     def synchronize_between_processes(self):
@@ -113,14 +107,7 @@ def log_every(self, iterable, print_freq, header=None):
         iter_time = SmoothedValue(fmt='{avg:.4f}')
         data_time = SmoothedValue(fmt='{avg:.4f}')
         space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
-        log_msg = [
-            header,
-            '[{0' + space_fmt + '}/{1}]',
-            'eta: {eta}',
-            '{meters}',
-            'time: {time}',
-            'data: {data}'
-        ]
+        log_msg = [header, '[{0' + space_fmt + '}/{1}]', 'eta: {eta}', '{meters}', 'time: {time}', 'data: {data}']
         if torch.cuda.is_available():
             log_msg.append('max mem: {memory:.0f}')
         log_msg = self.delimiter.join(log_msg)
@@ -133,22 +120,28 @@ def log_every(self, iterable, print_freq, header=None):
                 eta_seconds = iter_time.global_avg * (len(iterable) - i)
                 eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                 if torch.cuda.is_available():
-                    print(log_msg.format(
-                        i, len(iterable), eta=eta_string,
-                        meters=str(self),
-                        time=str(iter_time), data=str(data_time),
-                        memory=torch.cuda.max_memory_allocated() / MB))
+                    print(
+                        log_msg.format(
+                            i,
+                            len(iterable),
+                            eta=eta_string,
+                            meters=str(self),
+                            time=str(iter_time),
+                            data=str(data_time),
+                            memory=torch.cuda.max_memory_allocated() / MB,
+                        )
+                    )
                 else:
-                    print(log_msg.format(
-                        i, len(iterable), eta=eta_string,
-                        meters=str(self),
-                        time=str(iter_time), data=str(data_time)))
+                    print(
+                        log_msg.format(
+                            i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time)
+                        )
+                    )
             i += 1
             end = time.time()
         total_time = time.time() - start_time
         total_time_str = str(datetime.timedelta(seconds=int(total_time)))
-        print('{} Total time: {} ({:.4f} s / it)'.format(
-            header, total_time_str, total_time / len(iterable)))
+        print('{} Total time: {} ({:.4f} s / it)'.format(header, total_time_str, total_time / len(iterable)))
 
 
 def setup_for_distributed(is_master):
@@ -197,57 +190,51 @@ def save_on_master(*args, **kwargs):
         torch.save(*args, **kwargs)
 
 
-def init_distributed_mode(args):
-    if 'SLURM_PROCID' in os.environ:
-        args.rank = int(os.environ['SLURM_PROCID'])
-        args.world_size = int(os.environ['SLURM_NTASKS'])
-        
+def init_distributed_mode(port=29529, backend="nccl", timeout_hours=2):
+    """Init torch.distributed via env://: torchrun env vars first, then SLURM-derived ones; otherwise skip init."""
+    if all(k in os.environ for k in ["RANK", "WORLD_SIZE", "LOCAL_RANK", "MASTER_ADDR", "MASTER_PORT"]):
+        rank = int(os.environ["RANK"])
+        world_size = int(os.environ["WORLD_SIZE"])
+        local_rank = int(os.environ["LOCAL_RANK"])
+
+    # SLURM path: derive env then fall back to env://
+    elif "SLURM_PROCID" in os.environ:
+        rank = int(os.environ["SLURM_PROCID"])
+        world_size = int(os.environ["SLURM_NTASKS"])
         num_gpus = torch.cuda.device_count()
-        args.gpu = args.rank % num_gpus
-        args.local_rank = args.gpu
-
-        node_list = os.environ['SLURM_NODELIST']
-        print(f'Node list: {node_list}')
-        addr = subprocess.getoutput(f'scontrol show hostname {node_list} | head -n1')
-
-        os.environ['MASTER_PORT'] = str(getattr(args, 'port', '29529'))
-        os.environ['MASTER_ADDR'] = addr
-        os.environ['WORLD_SIZE'] = str(args.world_size)
-        os.environ['LOCAL_RANK'] = str(args.gpu)
-        os.environ['RANK'] = str(args.rank)
-    elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
-        args.rank = int(os.environ["RANK"])
-        args.world_size = int(os.environ['WORLD_SIZE'])
-        args.gpu = int(os.environ['LOCAL_RANK'])
-        args.local_rank = args.gpu
+        local_rank = rank % max(1, num_gpus)
+
+        # pick first node as master
+        nodelist = os.environ["SLURM_NODELIST"]
+        master_addr = subprocess.getoutput(f"scontrol show hostname {nodelist} | head -n1")
+        os.environ.setdefault("MASTER_ADDR", master_addr)
+        os.environ.setdefault("MASTER_PORT", str(port))
+        os.environ["RANK"] = str(rank)
+        os.environ["WORLD_SIZE"] = str(world_size)
+        os.environ["LOCAL_RANK"] = str(local_rank)
+
     else:
-        print('Not using distributed mode')
-        setup_for_distributed(is_master=True)  # hack
-        args.distributed = False
+        print("Not using distributed mode")
+        setup_for_distributed(is_master=True)
         return
 
-    args.distributed = True
+    if torch.cuda.is_available():  # bind the device before NCCL init; skipped for CPU-only backends (gloo)
+        torch.cuda.set_device(local_rank)
+
+    dist.init_process_group(backend=backend, init_method="env://", timeout=datetime.timedelta(hours=timeout_hours))
+    setup_for_distributed(dist.get_rank() == 0)
 
-    torch.cuda.set_device(args.gpu)
-    args.dist_backend = 'nccl'
-    print('| distributed init (rank {}): {}, gpu {}'.format(args.rank, args.dist_url, args.gpu), flush=True)
-    dist.init_process_group(backend=args.dist_backend,
-                            init_method=args.dist_url,
-                            world_size=args.world_size,
-                            rank=args.rank,
-                            timeout=datetime.timedelta(0, 7200))
-    dist.barrier()
-    setup_for_distributed(args.rank == 0)
 
 def save_model(args, epoch, model_without_ddp, optimizer, checkpoint_path):
     to_save = {
-                'model': model_without_ddp.state_dict(),
-                'optimizer': optimizer.state_dict(),
-                'epoch': epoch,
-                'args': args,
-            }
+        'model': model_without_ddp.state_dict(),
+        'optimizer': optimizer.state_dict(),
+        'epoch': epoch,
+        'args': args,
+    }
     save_on_master(to_save, checkpoint_path)
 
+
 def all_reduce_mean(x):
     world_size = get_world_size()
     if world_size > 1:
@@ -257,11 +244,16 @@ def all_reduce_mean(x):
         return x_reduce.item()
     else:
         return x
-    
+
+
 def fsdp_auto_wrap_policy(model, transformer_layer_names):
     import functools
 
-    from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy
+    from torch.distributed.fsdp.wrap import (
+        _or_policy,
+        lambda_auto_wrap_policy,
+        transformer_auto_wrap_policy,
+    )
 
     def lambda_policy_fn(module):
         if (
@@ -274,9 +266,8 @@ def lambda_policy_fn(module):
 
     lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn)
     transformer_wrap_policy = functools.partial(
-        transformer_auto_wrap_policy,
-        transformer_layer_cls=set(transformer_layer_names)
+        transformer_auto_wrap_policy, transformer_layer_cls=set(transformer_layer_names)
     )
 
     auto_wrap_policy = functools.partial(_or_policy, policies=[lambda_policy, transformer_wrap_policy])
-    return auto_wrap_policy
\ No newline at end of file
+    return auto_wrap_policy
diff --git a/scripts/eval/bash/torchrun_eval.sh b/scripts/eval/bash/torchrun_eval.sh
index 8a68922f..ae3993c4 100644
--- a/scripts/eval/bash/torchrun_eval.sh
+++ b/scripts/eval/bash/torchrun_eval.sh
@@ -2,7 +2,7 @@
 
 MID_RUN_NAME="InternVLA-N1"
 torchrun \
-  --nproc_per_node=2 \
+  --nproc_per_node=1 \
   --master_port=2333 \
   scripts/eval/eval.py \
     --config scripts/eval/configs/habitat_cfg.py \
diff --git a/scripts/eval/configs/habitat_cfg.py b/scripts/eval/configs/habitat_cfg.py
index 6e3445a7..18a86805 100644
--- a/scripts/eval/configs/habitat_cfg.py
+++ b/scripts/eval/configs/habitat_cfg.py
@@ -7,25 +7,16 @@
         model_name='internvla_n1',
         ckpt_path='',
         model_settings={
-            'env_num': 1,
-            'sim_num': 1,
-            'model_path': "checkpoints/InternVLA-N1",
-            'camera_intrinsic': [[585.0, 0.0, 320.0], [0.0, 585.0, 240.0], [0.0, 0.0, 1.0]],
-            'width': 640,
-            'height': 480,
-            'hfov': 79,
-            'resize_w': 384,
-            'resize_h': 384,
-            'max_new_tokens': 1024,
-            'num_frames': 32,
-            'num_history': 8,
-            'num_future_steps': 4,
-            'device': 'cuda:0',
-            'predict_step_nums': 32,
-            'continuous_traj': True,
-            # debug
-            'vis_debug': True,  # If vis_debug=True, you can get visualization results
-            'vis_debug_path': './logs/test/vis_debug',
+            "mode": "dual_system",  # inference mode: dual_system or system2
+            "model_path": "checkpoints/InternVLA-N1",  # path to model checkpoint
+            "num_future_steps": 4,  # number of future steps for prediction
+            "num_frames": 32,  # number of frames used in evaluation
+            "num_history": 8,
+            "resize_w": 384,  # image resize width
+            "resize_h": 384,  # image resize height
+            "predict_step_nums": 32,  # number of steps to predict
+            "continuous_traj": True,  # whether to use continuous trajectory
+            "max_new_tokens": 1024,  # maximum number of tokens for generation
         },
     ),
     env=EnvCfg(
@@ -42,20 +33,12 @@
         "output_path": "./logs/habitat/test_refactor_debug",  # output directory for logs/results
         "save_video": False,  # whether to save videos
         "epoch": 0,  # epoch number for logging
+        "max_steps_per_episode": 500,  # maximum steps per episode
+        # distributed settings
         "world_size": 1,  # number of distributed processes
         "rank": 0,  # rank of current process
         "gpu": 0,  # gpu id to use
         "port": "2333",  # communication port
         "dist_url": "env://",  # url for distributed setup
-        "mode": "dual_system",  # inference mode: dual_system or system2
-        "model_path": "checkpoints/InternVLA-N1",  # path to model checkpoint
-        "num_future_steps": 4,  # number of future steps for prediction
-        "num_frames": 32,  # number of frames used in evaluation
-        "num_history": 8,
-        "resize_w": 384,  # image resize width
-        "resize_h": 384,  # image resize height
-        "predict_step_nums": 32,  # number of steps to predict
-        "continuous_traj": True,  # whether to use continuous trajectory
-        "max_new_tokens": 1024,  # maximum number of tokens for generation
     },
 )
diff --git a/setup.cfg b/setup.cfg
index b6a3f5b3..0bbee65e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -45,4 +45,4 @@ per-file-ignores=*/__init__.py:F401
 ignore=E402,E501,W503,E203,D401,R504,R505,SIM102,SIM117,E711,E226
 max-line-length = 120
 max-complexity = 30
-exclude=_*,.vscode,.git,docs/**,**/test/**,**/lcmtypes/**,*.ipynb
+exclude=_*,.vscode,.git,docs/**,**/test/**,**/lcmtypes/**,*.ipynb,internnav/internnav_habitat/habitat_n1_agent_temp.py

From 7e25e7243da20e78dc5625b36753775c01cf7e87 Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Wed, 12 Nov 2025 07:06:20 +0000
Subject: [PATCH 05/16] fix observation issues

---
 internnav/evaluator/__init__.py               |   5 +-
 internnav/evaluator/distributed_base.py       |  28 +-
 internnav/evaluator/habitat_vln_evaluator.py  | 749 ------------------
 .../habitat_vln_evaluator.py                  |  11 +-
 ..._clean.py => habitat_vln_evaluator_new.py} |   0
 scripts/eval/bash/torchrun_eval.sh            |   2 +-
 scripts/eval/configs/habitat_cfg.py           |   4 +-
 setup.cfg                                     |   2 +-
 8 files changed, 34 insertions(+), 767 deletions(-)
 delete mode 100644 internnav/evaluator/habitat_vln_evaluator.py
 rename internnav/internnav_habitat/{habitat_vln_evaluator_clean.py => habitat_vln_evaluator_new.py} (100%)

diff --git a/internnav/evaluator/__init__.py b/internnav/evaluator/__init__.py
index fda0a88f..07a72080 100644
--- a/internnav/evaluator/__init__.py
+++ b/internnav/evaluator/__init__.py
@@ -1,7 +1,8 @@
-# register habitat
-import internnav.internnav_habitat  # noqa: F401
 from internnav.evaluator.base import Evaluator
 from internnav.evaluator.distributed_base import DistributedEvaluator
 from internnav.evaluator.vln_multi_evaluator import VlnMultiEvaluator
 
+# register habitat
+import internnav.internnav_habitat  # noqa: F401 # isort: skip
+
 __all__ = ['Evaluator', 'VlnMultiEvaluator']
diff --git a/internnav/evaluator/distributed_base.py b/internnav/evaluator/distributed_base.py
index f30d058a..94bfc06e 100644
--- a/internnav/evaluator/distributed_base.py
+++ b/internnav/evaluator/distributed_base.py
@@ -13,9 +13,14 @@
 class DistributedEvaluator(Evaluator):
     """
     Base class of distributed evaluators.
+
+    Args:
+        cfg (EvalCfg): evaluation configuration
+        init_env (bool): whether to initialize the environment
+        init_agent (bool): whether to initialize the agent
     """
 
-    def __init__(self, cfg: EvalCfg):
+    def __init__(self, cfg: EvalCfg, init_env: bool = True, init_agent: bool = True):
         # distributed setting
         import socket
 
@@ -35,14 +40,23 @@ def __init__(self, cfg: EvalCfg):
         cfg.env.env_settings['local_rank'] = get_rank()
         cfg.env.env_settings['world_size'] = get_world_size()
 
+        self.eval_config = cfg
+
+        if init_env:
+            self.env = Env.init(cfg.env, cfg.task)
+
         # -------- initialize agent config (either remote server or local agent) --------
-        # set agent port based on rank
-        # cfg.agent.agent_settings['port'] = 8000 + get_rank()
-        # start_server(cfg.agent.agent_settings['port'])
+        if init_agent:
+            if cfg.remote_agent:
+                # set agent port based on rank
+                from internnav.utils import AgentClient
 
-        self.eval_config = cfg
-        self.env = Env.init(cfg.env, cfg.task)
-        # self.agent = AgentClient(config.agent)
+                cfg.agent.agent_settings['port'] = 8000 + get_rank()
+                self.agent = AgentClient(cfg.agent)
+            else:
+                from internnav.agent import Agent
+
+                self.agent = Agent(cfg.agent)
 
     def eval(self):
         """
diff --git a/internnav/evaluator/habitat_vln_evaluator.py b/internnav/evaluator/habitat_vln_evaluator.py
deleted file mode 100644
index e901ae3e..00000000
--- a/internnav/evaluator/habitat_vln_evaluator.py
+++ /dev/null
@@ -1,749 +0,0 @@
-import argparse
-import copy
-import itertools
-import json
-import os
-import random
-import re
-from collections import OrderedDict
-from typing import Any
-
-import habitat
-import numpy as np
-import quaternion
-import torch
-import tqdm
-from depth_camera_filtering import filter_depth
-from habitat import Env
-from habitat.config.default import get_agent_config
-from habitat.config.default_structured_configs import (
-    CollisionsMeasurementConfig,
-    FogOfWarConfig,
-    TopDownMapMeasurementConfig,
-)
-from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower
-from habitat.utils.visualizations.utils import images_to_video, observations_to_image
-from habitat_baselines.config.default import get_config as get_habitat_config
-from omegaconf import OmegaConf
-from PIL import Image, ImageDraw, ImageFont
-from transformers.image_utils import to_numpy_array
-
-from internnav.model.utils.vln_utils import (
-    chunk_token,
-    open_image,
-    split_and_clean,
-    traj_to_actions,
-)
-from internnav.utils.dist import *  # noqa: F403
-
-DEFAULT_IMAGE_TOKEN = "<image>"
-
-
-class VLNEvaluator:
-    def __init__(
-        self,
-        config_path: str,
-        split: str = "val_seen",
-        env_num: int = 1,
-        output_path: str = None,
-        model: Any = None,
-        processor: Any = None,
-        epoch: int = 0,
-        args: argparse.Namespace = None,
-    ):
-        self.args = args
-        self.device = torch.device('cuda')
-        self.split = split
-        self.env_num = env_num
-        self.save_video = args.save_video
-        self.output_path = output_path
-        self.epoch = epoch
-        self.config_path = config_path
-        self.config = get_habitat_config(config_path)
-        self.agent_config = get_agent_config(self.config.habitat.simulator)
-        self.sim_sensors_config = self.config.habitat.simulator.agents.main_agent.sim_sensors
-
-        with habitat.config.read_write(self.config):
-            # self.config.habitat.task.measurements.success.success_distance=3.0
-            self.config.habitat.dataset.split = self.split  # refactor: why args and yaml both have split
-            self.config.habitat.task.measurements.update(  # refactor: move to yaml
-                {
-                    "top_down_map": TopDownMapMeasurementConfig(
-                        map_padding=3,
-                        map_resolution=1024,
-                        draw_source=True,
-                        draw_border=True,
-                        draw_shortest_path=True,
-                        draw_view_points=True,
-                        draw_goal_positions=True,
-                        draw_goal_aabbs=True,
-                        fog_of_war=FogOfWarConfig(
-                            draw=True,
-                            visibility_dist=5.0,
-                            fov=90,
-                        ),
-                    ),
-                    "collisions": CollisionsMeasurementConfig(),
-                }
-            )
-
-        print(f"config = {type(self.config)}")
-        print(OmegaConf.to_yaml(self.config))
-
-        self._camera_height = self.sim_sensors_config.rgb_sensor.position[1]
-        self._min_depth = self.sim_sensors_config.depth_sensor.min_depth
-        self._max_depth = self.sim_sensors_config.depth_sensor.max_depth
-
-        camera_fov_rad = np.deg2rad(self.sim_sensors_config.depth_sensor.hfov)
-        self._camera_fov = camera_fov_rad
-        self._fx = self._fy = self.sim_sensors_config.depth_sensor.width / (2 * np.tan(camera_fov_rad / 2))
-
-        self.model = model
-        self.processor = processor
-
-        # refactor: this part used in three places
-        prompt = "You are an autonomous navigation assistant. Your task is to <instruction>. Where should you go next to stay on track? Please output the next waypoint\'s coordinates in the image. Please output STOP when you have successfully completed the task."
-        answer = ""
-        self.conversation = [{"from": "human", "value": prompt}, {"from": "gpt", "value": answer}]
-
-        self.conjunctions = [
-            'you can see ',
-            'in front of you is ',
-            'there is ',
-            'you can spot ',
-            'you are toward the ',
-            'ahead of you is ',
-            'in your sight is ',
-        ]
-
-        self.actions2idx = OrderedDict(
-            {
-                'STOP': [0],
-                "↑": [1],
-                "←": [2],
-                "→": [3],
-                "↓": [5],
-            }
-        )
-
-        self.objectnav_instructions = ["Search for the {target_object}."]
-
-        self.num_frames = args.num_frames
-        self.num_future_steps = args.num_future_steps
-        self.num_history = args.num_history
-
-    # refactor
-    def config_env(self) -> Env:
-        env = Env(config=self.config)
-        # env.episodes = env.episodes[0:1]
-        return env
-
-    def eval_action(self, idx) -> None:  # noqa: C901
-        self.model.eval()
-        env = self.config_env()
-        scene_episode_dict = {}
-        for episode in env.episodes:
-            if episode.scene_id not in scene_episode_dict:
-                scene_episode_dict[episode.scene_id] = []
-            scene_episode_dict[episode.scene_id].append(episode)
-
-        intrinsic_matrix = self.get_intrinsic_matrix(
-            self.config.habitat.simulator.agents.main_agent.sim_sensors.rgb_sensor
-        )
-        sucs, spls, oss, nes = [], [], [], []
-        done_res = []
-
-        if os.path.exists(os.path.join(self.output_path, 'result.json')):
-            with open(os.path.join(self.output_path, 'result.json'), 'r') as f:
-                for line in f.readlines():
-                    res = json.loads(line)
-                    done_res.append([res["scene_id"], res["episode_id"], res["episode_instruction"]])
-                    if get_rank() == 0:  # noqa: F405
-                        sucs.append(res['success'])
-                        spls.append(res['spl'])
-                        oss.append(res['os'])
-                        nes.append(res['ne'])
-
-        # refactor: why sort to scene: [episode] but nothing actually used
-        for scene in sorted(scene_episode_dict.keys()):
-            episodes = scene_episode_dict[scene]
-            scene_id = scene.split('/')[-2]
-            print(f"scene_id = {scene_id}")
-            process_bar = tqdm.tqdm(range(len(episodes[idx :: self.env_num])), desc=f"scene {scene_id}")
-            for episode in episodes[idx :: self.env_num]:
-                episode_instruction = (
-                    episode.instruction.instruction_text
-                    if 'objectnav' not in self.config_path
-                    else episode.object_category
-                )
-                print("episode start", episode_instruction)
-                episode_id = int(episode.episode_id)
-                if [scene_id, episode_id, episode_instruction] in done_res:
-                    continue
-
-                # refactor env warm up
-                env.current_episode = episode
-                observations = env.reset()
-
-                agent_state = env.sim.get_agent_state()
-                rotation = agent_state.rotation
-                translation = agent_state.position
-                rotation_matrix = quaternion.as_rotation_matrix(rotation)
-                transformation_matrix = np.eye(4)
-                transformation_matrix[:3, :3] = rotation_matrix
-                transformation_matrix[:3, 3] = translation
-
-                agent = ShortestPathFollower(env.sim, 0.25, False)
-
-                os.makedirs(os.path.join(self.output_path, f'check_sim_{self.epoch}'), exist_ok=True)
-                Image.fromarray(observations['rgb']).save(
-                    os.path.join(self.output_path, f'check_sim_{self.epoch}', f'rgb_{idx}.jpg')
-                )
-
-                vis_frames = []
-                step_id = 0
-
-                if self.save_video:
-                    os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
-                initial_height = env.sim.get_agent_state().position[1]
-
-                rgb_list = []
-                action_seq = []
-                output_ids = None
-
-                goal = None
-                action = None
-                messages = []
-                local_actions = []
-
-                while not env.episode_over and step_id <= 500:
-                    # refactor agent get action
-                    rgb = observations["rgb"]
-                    depth = observations["depth"]
-                    x, y = observations["gps"]
-                    camera_yaw = observations["compass"][0]
-                    depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None)
-                    depth = depth * (self._max_depth - self._min_depth) + self._min_depth
-                    depth = depth * 1000
-
-                    agent_state = env.sim.get_agent_state()
-                    height = agent_state.position[1] - initial_height
-                    camera_position = np.array([x, -y, self._camera_height + height])
-                    tf_camera_to_episodic = (
-                        self.xyz_yaw_pitch_to_tf_matrix(camera_position, camera_yaw, np.deg2rad(30))
-                        @ self.get_axis_align_matrix()
-                    )
-
-                    image = Image.fromarray(rgb).convert('RGB')
-                    save_raw_image = image.copy()
-
-                    save_dot = False
-                    if action == 5:
-                        look_down_image = image
-                        save_raw_image = look_down_image.copy()
-                        look_down_depth, resize_shape = self.preprocess_depth_image_v2(
-                            Image.fromarray(depth.astype(np.uint16), mode='I;16'),
-                            do_depth_scale=True,
-                            depth_scale=1000,
-                            target_height=224,
-                            target_width=224,
-                        )
-                        look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float()
-                        look_down_depth[look_down_depth > 5.0] = 5.0
-                    else:
-                        image = image.resize((self.args.resize_w, self.args.resize_h))
-                        rgb_list.append(image)
-
-                        if self.args.mode == 'dual_system':
-                            down_observations = env.step(5)
-                            down_observations = env.step(5)
-
-                            look_down_image = Image.fromarray(down_observations["rgb"]).convert('RGB')
-                            depth = down_observations["depth"]
-                            depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None)
-                            depth = depth * (self._max_depth - self._min_depth) + self._min_depth
-                            depth = depth * 1000
-                            look_down_depth, resize_shape = self.preprocess_depth_image_v2(
-                                Image.fromarray(depth.astype(np.uint16), mode='I;16'),
-                                do_depth_scale=True,
-                                depth_scale=1000,
-                                target_height=224,
-                                target_width=224,
-                            )
-                            look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float()
-                            look_down_depth[look_down_depth > 5.0] = 5.0
-
-                            env.step(4)
-                            env.step(4)
-
-                    info = env.get_metrics()
-
-                    if len(action_seq) == 0 and goal is None:
-                        if action != 5:
-                            sources = copy.deepcopy(self.conversation)
-                            sources[0]["value"] = sources[0]["value"].replace(
-                                '<instruction>.', episode.instruction.instruction_text[:-1]
-                            )
-                            cur_images = rgb_list[-1:]
-                            if step_id == 0:
-                                history_id = []
-                            else:
-                                history_id = np.unique(
-                                    np.linspace(0, step_id - 1, self.num_history, dtype=np.int32)
-                                ).tolist()
-                                placeholder = (DEFAULT_IMAGE_TOKEN + '\n') * len(history_id)
-                                sources[0]["value"] += f' These are your historical observations: {placeholder}.'
-
-                            history_id = sorted(history_id)
-                            print('history_idddddddd', step_id, history_id)
-                            input_images = [rgb_list[i] for i in history_id] + cur_images
-                            input_img_id = 0
-                        else:
-                            assert action == 5
-                            sources = [{"from": "human", "value": ""}, {"from": "gpt", "value": ""}]
-                            input_images += [look_down_image]
-                            messages.append(
-                                {'role': 'assistant', 'content': [{'type': 'text', 'text': llm_outputs}]}  # noqa: F405
-                            )
-                            input_img_id = -1
-
-                        prompt = random.choice(self.conjunctions) + DEFAULT_IMAGE_TOKEN
-                        sources[0]["value"] += f" {prompt}."
-                        print('sources', step_id, sources)
-                        prompt_instruction = copy.deepcopy(sources[0]["value"])
-                        parts = split_and_clean(prompt_instruction)
-
-                        content = []
-                        for i in range(len(parts)):
-                            if parts[i] == "<image>":
-                                content.append({"type": "image", "image": input_images[input_img_id]})
-                                input_img_id += 1
-                            else:
-                                content.append({"type": "text", "text": parts[i]})
-
-                        messages.append({'role': 'user', 'content': content})
-
-                        print('step_id', step_id, 'messages:', messages)
-
-                        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-                        inputs = self.processor(text=[text], images=input_images, return_tensors="pt").to(
-                            self.model.device
-                        )
-
-                        with torch.no_grad():
-                            output_ids = self.model.generate(**inputs, max_new_tokens=128, do_sample=False)
-
-                        llm_outputs = self.processor.tokenizer.decode(
-                            output_ids[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
-                        )
-                        print('step_id:', step_id, 'output text:', llm_outputs)
-
-                        if bool(re.search(r'\d', llm_outputs)):
-                            forward_action = 0
-                            coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]
-                            pixel_goal = [int(coord[1]), int(coord[0])]
-
-                            goal = self.pixel_to_gps(pixel_goal, depth / 1000, intrinsic_matrix, tf_camera_to_episodic)
-                            print('before', goal, depth.shape)
-                            goal = (transformation_matrix @ np.array([-goal[1], 0, -goal[0], 1]))[:3]
-
-                            if not env.sim.pathfinder.is_navigable(np.array(goal)):
-                                goal = np.array(env.sim.pathfinder.snap_point(np.array(goal)))
-
-                            # look down --> horizontal
-                            env.step(4)
-                            env.step(4)
-
-                            # Forking logic based on mode
-                            if self.args.mode == 'system2':
-                                action = agent.get_next_action(goal)
-                                if action == 0:
-                                    goal = None
-                                    output_ids = None
-                                    action = 2  # random action
-                                    print('conduct a random action 2')
-                                    observations = env.step(action)
-                                    step_id += 1
-                                    messages = []
-                                    continue
-                            else:  # dual-system logic
-                                local_actions = []
-                                pixel_values = inputs.pixel_values
-                                image_grid_thw = torch.cat([thw.unsqueeze(0) for thw in inputs.image_grid_thw], dim=0)
-
-                                with torch.no_grad():
-                                    traj_latents = self.model.generate_latents(output_ids, pixel_values, image_grid_thw)
-
-                                # prepocess align with navdp
-                                image_dp = (
-                                    torch.tensor(np.array(look_down_image.resize((224, 224)))).to(torch.bfloat16) / 255
-                                )
-                                pix_goal_image = copy.copy(image_dp)
-                                images_dp = torch.stack([pix_goal_image, image_dp]).unsqueeze(0).to(self.device)
-                                depth_dp = look_down_depth.unsqueeze(-1).to(torch.bfloat16)
-                                pix_goal_depth = copy.copy(depth_dp)
-                                depths_dp = torch.stack([pix_goal_depth, depth_dp]).unsqueeze(0).to(self.device)
-
-                                with torch.no_grad():
-                                    dp_actions = self.model.generate_traj(
-                                        traj_latents, images_dp, depths_dp, use_async=True
-                                    )
-
-                                random_choice = np.random.choice(dp_actions.shape[0])
-                                if self.args.continuous_traj:
-                                    action_list = traj_to_actions(dp_actions)
-                                    if len(action_list) < 8:
-                                        action_list += [0] * (8 - len(action_list))
-                                else:
-                                    action_list = chunk_token(dp_actions[random_choice])
-
-                                local_actions = action_list
-                                if len(local_actions) >= 4:
-                                    local_actions = local_actions[:4]
-                                action = local_actions[0]
-                                if action == 0:
-                                    goal = None
-                                    output_ids = None
-                                    action = 2  # random action
-                                    print('conduct a random action 2')
-                                    observations = env.step(action)
-                                    step_id += 1
-                                    messages = []
-                                    continue
-
-                            print('predicted goal', pixel_goal, goal, flush=True)
-                        else:
-                            action_seq = self.parse_actions(llm_outputs)
-                            print('actions', action_seq, flush=True)
-
-                    if len(action_seq) != 0:
-                        action = action_seq[0]
-                        action_seq.pop(0)
-                    elif goal is not None:
-                        # Forking logic based on mode
-                        if self.args.mode == 'system2':
-                            action = agent.get_next_action(goal)
-                            action = action.detach().cpu().numpy()[0] if isinstance(action, torch.Tensor) else action
-                            action = action[0] if hasattr(action, "__len__") else action
-                        else:  # dual-system logic
-                            if len(local_actions) == 0:
-                                # navdp
-                                local_actions = []
-                                image_dp = (
-                                    torch.tensor(np.array(look_down_image.resize((224, 224)))).to(torch.bfloat16) / 255
-                                )
-
-                                images_dp = torch.stack([pix_goal_image, image_dp]).unsqueeze(0).to(self.device)
-                                depth_dp = look_down_depth.unsqueeze(-1).to(torch.bfloat16)
-
-                                depths_dp = torch.stack([pix_goal_depth, depth_dp]).unsqueeze(0).to(self.device)
-                                with torch.no_grad():
-                                    dp_actions = self.model.generate_traj(
-                                        traj_latents, images_dp, depths_dp, use_async=True
-                                    )
-
-                                random_choice = np.random.choice(dp_actions.shape[0])
-                                if self.args.continuous_traj:
-                                    action_list = traj_to_actions(dp_actions)
-                                    if len(action_list) < 8:
-                                        action_list += [0] * (8 - len(action_list))
-                                else:
-                                    action_list = chunk_token(dp_actions[random_choice])
-                                print("first action_list", action_list)
-
-                                local_actions = action_list
-                                if len(local_actions) >= 4:
-                                    local_actions = local_actions[:4]
-                                # if len(local_actions) >= 2:
-                                #     local_actions = local_actions[:2]
-
-                                print("local_actions", local_actions)
-
-                                action = local_actions.pop(0)
-                                # navdp
-                            else:
-                                action = local_actions.pop(0)
-
-                        forward_action += 1
-                        print('forward_action', forward_action, flush=True)
-                        if forward_action > 8:
-                            goal = None
-                            output_ids = None
-                            messages = []
-                            step_id += 1
-                            forward_action = 0
-                            local_actions = []
-                            continue
-                        if action == 0:
-                            goal = None
-                            output_ids = None
-                            messages = []
-                            step_id += 1
-                            forward_action = 0
-                            local_actions = []
-                            continue
-                    else:
-                        action = 0
-
-                    if info['top_down_map'] is not None:
-                        if save_dot:
-                            save_raw_image = self.dot_matrix_two_dimensional(
-                                save_raw_image, save_img=False, save_path=f'test_{step_id}.jpg', pixel_goal=pixel_goal
-                            )
-                        frame = observations_to_image({'rgb': np.asarray(save_raw_image)}, info)
-                        vis_frames.append(frame)
-
-                    print("step_id", step_id, "action", action)
-
-                    # refactor: core
-                    if action == 5:
-                        env.step(action)
-                        observations = env.step(action)
-                    else:
-                        observations = env.step(action)
-                        step_id += 1
-                        messages = []
-
-                process_bar.update(1)
-
-                metrics = env.get_metrics()
-                if self.save_video:
-                    images_to_video(
-                        vis_frames,
-                        os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'),
-                        f'{episode_id:04d}',
-                        fps=6,
-                        quality=9,
-                    )
-                vis_frames.clear()
-                sucs.append(metrics['success'])
-                spls.append(metrics['spl'])
-                oss.append(metrics['oracle_success'])
-                nes.append(metrics["distance_to_goal"])
-                print(
-                    f"scene_episode {scene_id}_{episode_id:04d} success: {metrics['success']}, spl: {metrics['spl']}, os: {metrics['oracle_success']}, ne: {metrics['distance_to_goal']}"
-                )
-
-                result = {
-                    "scene_id": scene_id,
-                    "episode_id": episode_id,
-                    "success": metrics["success"],
-                    "spl": metrics["spl"],
-                    "os": metrics['oracle_success'],
-                    "ne": metrics["distance_to_goal"],
-                    "steps": step_id,
-                    "episode_instruction": episode_instruction,
-                }
-
-                with open(os.path.join(self.output_path, 'result.json'), 'a') as f:
-                    f.write(json.dumps(result) + "\n")
-        env.close()
-        return (
-            torch.tensor(sucs).to(self.device),
-            torch.tensor(spls).to(self.device),
-            torch.tensor(oss).to(self.device),
-            torch.tensor(nes).to(self.device),
-            torch.tensor(len(sucs)).to(self.device),
-        )
-
-    def parse_actions(self, output):
-        action_patterns = '|'.join(re.escape(action) for action in self.actions2idx)
-        # import ipdb; ipdb.set_trace()
-        regex = re.compile(action_patterns)
-        matches = regex.findall(output)
-        actions = [self.actions2idx[match] for match in matches]
-        actions = itertools.chain.from_iterable(actions)
-        return list(actions)
-
-    def preprocess_depth_image_v2(
-        self, depth_image, do_depth_scale=True, depth_scale=1000, target_height=None, target_width=None
-    ):
-        if target_height is None:
-            target_height = self.image_processor.crop_size['height']  # 384
-            target_width = self.image_processor.crop_size['width']  # 384
-
-        resized_depth_image = depth_image.resize((target_width, target_height), Image.NEAREST)
-
-        img = to_numpy_array(resized_depth_image)
-        if do_depth_scale:
-            img = img / depth_scale
-
-        return img, (target_width, target_height)
-
-    def get_intrinsic_matrix(self, sensor_cfg) -> np.ndarray:
-        width = sensor_cfg.width
-        height = sensor_cfg.height
-        fov = sensor_cfg.hfov
-        fx = (width / 2.0) / np.tan(np.deg2rad(fov / 2.0))
-        fy = fx  # Assuming square pixels (fx = fy)
-        cx = (width - 1.0) / 2.0
-        cy = (height - 1.0) / 2.0
-
-        intrinsic_matrix = np.array(
-            [[fx, 0.0, cx, 0.0], [0.0, fy, cy, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]]
-        )
-        return intrinsic_matrix
-
-    def get_axis_align_matrix(self):
-        ma = np.array([[0, 0, 1, 0], [-1, 0, 0, 0], [0, -1, 0, 0], [0, 0, 0, 1]])
-        return ma
-
-    def xyz_yaw_to_tf_matrix(self, xyz: np.ndarray, yaw: float) -> np.ndarray:
-        x, y, z = xyz
-        transformation_matrix = np.array(
-            [
-                [np.cos(yaw), -np.sin(yaw), 0, x],
-                [np.sin(yaw), np.cos(yaw), 0, y],
-                [0, 0, 1, z],
-                [0, 0, 0, 1],
-            ]
-        )
-        return transformation_matrix
-
-    def xyz_pitch_to_tf_matrix(self, xyz: np.ndarray, pitch: float) -> np.ndarray:
-        """Converts a given position and pitch angle to a 4x4 transformation matrix.
-
-        Args:
-            xyz (np.ndarray): A 3D vector representing the position.
-            pitch (float): The pitch angle in radians for y axis.
-        Returns:
-            np.ndarray: A 4x4 transformation matrix.
-        """
-
-        x, y, z = xyz
-        transformation_matrix = np.array(
-            [
-                [np.cos(pitch), 0, np.sin(pitch), x],
-                [0, 1, 0, y],
-                [-np.sin(pitch), 0, np.cos(pitch), z],
-                [0, 0, 0, 1],
-            ]
-        )
-        return transformation_matrix
-
-    def xyz_yaw_pitch_to_tf_matrix(self, xyz: np.ndarray, yaw: float, pitch: float) -> np.ndarray:
-        """Converts a given position and yaw, pitch angles to a 4x4 transformation matrix.
-
-        Args:
-            xyz (np.ndarray): A 3D vector representing the position.
-            yaw (float): The yaw angle in radians.
-            pitch (float): The pitch angle in radians for y axis.
-        Returns:
-            np.ndarray: A 4x4 transformation matrix.
-        """
-        x, y, z = xyz
-        rot1 = self.xyz_yaw_to_tf_matrix(xyz, yaw)[:3, :3]
-        rot2 = self.xyz_pitch_to_tf_matrix(xyz, pitch)[:3, :3]
-        transformation_matrix = np.eye(4)
-        transformation_matrix[:3, :3] = rot1 @ rot2
-        transformation_matrix[:3, 3] = xyz
-        return transformation_matrix
-
-    def pixel_to_gps(self, pixel, depth, intrinsic, tf_camera_to_episodic):
-        '''
-        Args:
-            pixel: (2,) - [u, v] pixel coordinates
-            depth: (H, W) - depth image where depth[v, u] gives depth in meters
-            intrinsic: (4, 4) - camera intrinsic matrix
-            tf_camera_to_episodic: (4, 4) - transformation from camera to episodic frame
-        Returns:
-            (x, y): (x, y) coordinates in the episodic frame
-        '''
-        v, u = pixel
-        z = depth[v, u]
-        print("depthhhhhhhhhhhhhh", z)
-
-        x = (u - intrinsic[0, 2]) * z / intrinsic[0, 0]
-        y = (v - intrinsic[1, 2]) * z / intrinsic[1, 1]
-        point_camera = np.array([x, y, z, 1.0])
-
-        # Transform to episodic frame
-        point_episodic = tf_camera_to_episodic @ point_camera
-        point_episodic = point_episodic[:3] / point_episodic[3]
-
-        x = point_episodic[0]
-        y = point_episodic[1]
-
-        return (x, y)  # same as habitat gps
-
-    def dot_matrix_two_dimensional(
-        self,
-        image_or_image_path,
-        save_path=None,
-        dots_size_w=8,
-        dots_size_h=8,
-        save_img=False,
-        font_path='fonts/arial.ttf',
-        pixel_goal=None,
-    ):
-        """
-        takes an original image as input, save the processed image to save_path. Each dot is labeled with two-dimensional Cartesian coordinates (x,y). Suitable for single-image tasks.
-        control args:
-        1. dots_size_w: the number of columns of the dots matrix
-        2. dots_size_h: the number of rows of the dots matrix
-        """
-        with open_image(image_or_image_path) as img:
-            if img.mode != 'RGB':
-                img = img.convert('RGB')
-            draw = ImageDraw.Draw(img, 'RGB')
-
-            width, height = img.size
-            grid_size_w = dots_size_w + 1
-            grid_size_h = dots_size_h + 1
-            cell_width = width / grid_size_w
-            cell_height = height / grid_size_h
-
-            font = ImageFont.truetype(font_path, width // 40)  # Adjust font size if needed; default == width // 40
-
-            target_i = target_j = None
-            if pixel_goal is not None:
-                y_pixel, x_pixel = pixel_goal[0], pixel_goal[1]
-                # Validate pixel coordinates
-                if not (0 <= x_pixel < width and 0 <= y_pixel < height):
-                    raise ValueError(f"pixel_goal {pixel_goal} exceeds image dimensions ({width}x{height})")
-
-                # Convert to grid coordinates
-                target_i = round(x_pixel / cell_width)
-                target_j = round(y_pixel / cell_height)
-
-                # Validate grid bounds
-                if not (1 <= target_i <= dots_size_w and 1 <= target_j <= dots_size_h):
-                    raise ValueError(
-                        f"pixel_goal {pixel_goal} maps to grid ({target_j},{target_i}), "
-                        f"valid range is (1,1)-({dots_size_h},{dots_size_w})"
-                    )
-
-            count = 0
-
-            for j in range(1, grid_size_h):
-                for i in range(1, grid_size_w):
-                    x = int(i * cell_width)
-                    y = int(j * cell_height)
-
-                    pixel_color = img.getpixel((x, y))
-                    # choose a more contrasting color from black and white
-                    if pixel_color[0] + pixel_color[1] + pixel_color[2] >= 255 * 3 / 2:
-                        opposite_color = (0, 0, 0)
-                    else:
-                        opposite_color = (255, 255, 255)
-
-                    if pixel_goal is not None and i == target_i and j == target_j:
-                        opposite_color = (255, 0, 0)  # Red for target
-
-                    circle_radius = width // 240  # Adjust dot size if needed; default == width // 240
-                    draw.ellipse(
-                        [(x - circle_radius, y - circle_radius), (x + circle_radius, y + circle_radius)],
-                        fill=opposite_color,
-                    )
-
-                    text_x, text_y = x + 3, y
-                    count_w = count // dots_size_w
-                    count_h = count % dots_size_w
-                    label_str = f"({count_w+1},{count_h+1})"
-                    draw.text((text_x, text_y), label_str, fill=opposite_color, font=font)
-                    count += 1
-            if save_img:
-                print(">>> dots overlaid image processed, stored in", save_path)
-                img.save(save_path)
-            return img
diff --git a/internnav/internnav_habitat/habitat_vln_evaluator.py b/internnav/internnav_habitat/habitat_vln_evaluator.py
index ba86d6db..9ed4c487 100644
--- a/internnav/internnav_habitat/habitat_vln_evaluator.py
+++ b/internnav/internnav_habitat/habitat_vln_evaluator.py
@@ -19,8 +19,6 @@
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
 from transformers.image_utils import to_numpy_array
 
-# Import for Habitat registry side effects — do not remove
-import internnav.env.utils.habitat_extensions.measures  # noqa: F401
 from internnav.configs.evaluator import EvalCfg
 from internnav.evaluator import DistributedEvaluator, Evaluator
 from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM
@@ -42,6 +40,9 @@
     from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower
     from habitat.utils.visualizations.utils import observations_to_image
     from habitat_baselines.config.default import get_config as get_habitat_config
+
+    # Import for Habitat registry side effects — do not remove
+    import internnav.internnav_habitat.measures  # noqa: F401 # isort: skip
 except Exception as e:
     print("Habitat Error:", e)
     print("Habitat Evaluation is not loaded.")
@@ -90,7 +91,7 @@ def __init__(self, cfg: EvalCfg):
         cfg.env.env_settings['output_path'] = self.output_path
 
         # init agent and env
-        super().__init__(cfg)
+        super().__init__(cfg, init_agent=False)
 
         # ------------------------------------- model ------------------------------------------
         self.model_args = argparse.Namespace(**cfg.agent.model_settings)
@@ -449,7 +450,7 @@ def _run_local_eval(self, idx=0) -> None:  # noqa: C901
                                 output_ids = None
                                 action = 2  # random action
                                 print('conduct a random action 2')
-                                observations = self.env.step(action)
+                                observations, _, done, _ = self.env.step(action)
                                 step_id += 1
                                 messages = []
                                 continue
@@ -493,7 +494,7 @@ def _run_local_eval(self, idx=0) -> None:  # noqa: C901
                                 output_ids = None
                                 action = 2  # random action
                                 print('conduct a random action 2')
-                                observations = self.env.step(action)
+                                observations, _, done, _ = self.env.step(action)
                                 step_id += 1
                                 messages = []
                                 continue
diff --git a/internnav/internnav_habitat/habitat_vln_evaluator_clean.py b/internnav/internnav_habitat/habitat_vln_evaluator_new.py
similarity index 100%
rename from internnav/internnav_habitat/habitat_vln_evaluator_clean.py
rename to internnav/internnav_habitat/habitat_vln_evaluator_new.py
diff --git a/scripts/eval/bash/torchrun_eval.sh b/scripts/eval/bash/torchrun_eval.sh
index ae3993c4..00303c5a 100644
--- a/scripts/eval/bash/torchrun_eval.sh
+++ b/scripts/eval/bash/torchrun_eval.sh
@@ -2,7 +2,7 @@
 
 MID_RUN_NAME="InternVLA-N1"
 torchrun \
-  --nproc_per_node=1 \
+  --nproc_per_node=8 \
   --master_port=2333 \
   scripts/eval/eval.py \
     --config scripts/eval/configs/habitat_cfg.py \
diff --git a/scripts/eval/configs/habitat_cfg.py b/scripts/eval/configs/habitat_cfg.py
index 18a86805..33cf6b19 100644
--- a/scripts/eval/configs/habitat_cfg.py
+++ b/scripts/eval/configs/habitat_cfg.py
@@ -29,12 +29,12 @@
     eval_type='habitat_vln',
     eval_settings={
         # all current parse args
-        "local_rank": 0,  # node rank
-        "output_path": "./logs/habitat/test_refactor_debug",  # output directory for logs/results
+        "output_path": "./logs/habitat/test_refactor_0d00014",  # output directory for logs/results
         "save_video": False,  # whether to save videos
         "epoch": 0,  # epoch number for logging
         "max_steps_per_episode": 500,  # maximum steps per episode
         # distributed settings
+        "local_rank": 0,  # node rank
         "world_size": 1,  # number of distributed processes
         "rank": 0,  # rank of current process
         "gpu": 0,  # gpu id to use
diff --git a/setup.cfg b/setup.cfg
index 0bbee65e..baadd6ae 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -10,7 +10,7 @@ extra_standard_library = pkg_resources,setuptools
 known_first_party = internutopia, internutopia_extension, grevaluator, grbench, grmodel
 no_lines_before = STDLIB,LOCALFOLDER
 default_section = THIRDPARTY
-skip_glob = internutopia/*, internutopia_extension/*
+skip_glob = internutopia/*, internutopia_extension/*, internnav/scripts/eval/configs/*
 
 
 # ignore-words-list needs to be lowercase format. For example, if we want to

From 2b0eb8b9125708be8d86d681cd1243c0f25f0ee8 Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Wed, 12 Nov 2025 09:29:05 +0000
Subject: [PATCH 06/16] update new register name; tiny fix on style

---
 internnav/internnav_habitat/habitat_env.py       |  2 +-
 .../internnav_habitat/habitat_n1_agent_temp.py   |  3 ++-
 .../habitat_vln_evaluator_new.py                 |  2 +-
 scripts/eval/configs/comm_cfg.py                 | 16 ----------------
 scripts/iros_challenge/start_eval_iros.sh        |  2 +-
 5 files changed, 5 insertions(+), 20 deletions(-)
 delete mode 100644 scripts/eval/configs/comm_cfg.py

diff --git a/internnav/internnav_habitat/habitat_env.py b/internnav/internnav_habitat/habitat_env.py
index 63769741..b680e94f 100644
--- a/internnav/internnav_habitat/habitat_env.py
+++ b/internnav/internnav_habitat/habitat_env.py
@@ -38,7 +38,7 @@ def __init__(self, env_config: EnvCfg, task_config: TaskCfg):
         # generate episodes
         # self._env.episodes = self._env.episodes[0:1]  # for debug
         self.episodes = self.generate_episodes()
-        print(self.episodes)
+        # print(self.episodes)
 
     def generate_episodes(self) -> List[Any]:
         """
diff --git a/internnav/internnav_habitat/habitat_n1_agent_temp.py b/internnav/internnav_habitat/habitat_n1_agent_temp.py
index 0cea910a..0d525245 100644
--- a/internnav/internnav_habitat/habitat_n1_agent_temp.py
+++ b/internnav/internnav_habitat/habitat_n1_agent_temp.py
@@ -21,7 +21,8 @@
 DEFAULT_IMAGE_TOKEN = "<image>"
 
 
-class HabitatAgent:
+@Agent.register("N1")
+class HabitatAgent(Agent):
     def __init__(self, model, processor, args, device):
         self.model = model
         self.processor = processor
diff --git a/internnav/internnav_habitat/habitat_vln_evaluator_new.py b/internnav/internnav_habitat/habitat_vln_evaluator_new.py
index ff5e60ba..2f652367 100644
--- a/internnav/internnav_habitat/habitat_vln_evaluator_new.py
+++ b/internnav/internnav_habitat/habitat_vln_evaluator_new.py
@@ -26,7 +26,7 @@
 DEFAULT_IMAGE_TOKEN = "<image>"
 
 
-@Evaluator.register('habitat_vln')
+@Evaluator.register('habitat_vlln')
 class HabitatVlnEvaluator(DistributedEvaluator):
     def __init__(self, cfg: EvalCfg):
         args = argparse.Namespace(**cfg.eval_settings)
diff --git a/scripts/eval/configs/comm_cfg.py b/scripts/eval/configs/comm_cfg.py
deleted file mode 100644
index 3c1b7bbf..00000000
--- a/scripts/eval/configs/comm_cfg.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from internnav.configs.agent import AgentCfg
-from internnav.configs.evaluator import EnvCfg, EvalCfg, EvalDatasetCfg, TaskCfg
-
-eval_cfg = EvalCfg(
-    agent=AgentCfg(
-        server_port=8087,
-        model_name='cma',
-        ckpt_path='checkpoints/r2r/fine_tuned/cma_plus',
-        model_settings={},
-    ),
-    env=EnvCfg['internutopia'],
-    task=TaskCfg['vln_pe'],
-    dataset=EvalDatasetCfg['mp3d'],
-    eval_type='internutopia_vln',
-    eval_settings={'save_to_json': False, 'vis_output': True},
-)
diff --git a/scripts/iros_challenge/start_eval_iros.sh b/scripts/iros_challenge/start_eval_iros.sh
index 335b6144..57012414 100755
--- a/scripts/iros_challenge/start_eval_iros.sh
+++ b/scripts/iros_challenge/start_eval_iros.sh
@@ -40,7 +40,7 @@ mkdir -p logs
 SERVER_LOG="logs/${CONFIG_PREFIX}_server.log"
 EVAL_LOG="logs/${CONFIG_PREFIX}_eval.log"
 
-processes=$(ps -ef | grep 'internnav/agent/utils/server.py' | grep -v grep | awk '{print $2}')
+processes=$(ps -ef | grep 'scripts/eval/start_server.py' | grep -v grep | awk '{print $2}')
 if [ -n "$processes" ]; then
     for pid in $processes; do
         kill -9 $pid

From b414ba3ddccd8f9fb5c1940748411accc24d4744 Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Wed, 12 Nov 2025 09:47:09 +0000
Subject: [PATCH 07/16] latest tested

---
 .../{habitat_n1_agent_temp.py => habitat_n1_agent.py}           | 0
 .../{habitat_vln_evaluator_new.py => habitat_vlln_evaluator.py} | 0
 internnav/internnav_habitat/utils.py                            | 0
 scripts/eval/configs/habitat_cfg.py                             | 2 +-
 setup.cfg                                                       | 2 +-
 5 files changed, 2 insertions(+), 2 deletions(-)
 rename internnav/internnav_habitat/{habitat_n1_agent_temp.py => habitat_n1_agent.py} (100%)
 rename internnav/internnav_habitat/{habitat_vln_evaluator_new.py => habitat_vlln_evaluator.py} (100%)
 delete mode 100644 internnav/internnav_habitat/utils.py

diff --git a/internnav/internnav_habitat/habitat_n1_agent_temp.py b/internnav/internnav_habitat/habitat_n1_agent.py
similarity index 100%
rename from internnav/internnav_habitat/habitat_n1_agent_temp.py
rename to internnav/internnav_habitat/habitat_n1_agent.py
diff --git a/internnav/internnav_habitat/habitat_vln_evaluator_new.py b/internnav/internnav_habitat/habitat_vlln_evaluator.py
similarity index 100%
rename from internnav/internnav_habitat/habitat_vln_evaluator_new.py
rename to internnav/internnav_habitat/habitat_vlln_evaluator.py
diff --git a/internnav/internnav_habitat/utils.py b/internnav/internnav_habitat/utils.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/scripts/eval/configs/habitat_cfg.py b/scripts/eval/configs/habitat_cfg.py
index 33cf6b19..de96107b 100644
--- a/scripts/eval/configs/habitat_cfg.py
+++ b/scripts/eval/configs/habitat_cfg.py
@@ -29,7 +29,7 @@
     eval_type='habitat_vln',
     eval_settings={
         # all current parse args
-        "output_path": "./logs/habitat/test_refactor_0d00014",  # output directory for logs/results
+        "output_path": "./logs/habitat/test_refactor_7e25e72",  # output directory for logs/results
         "save_video": False,  # whether to save videos
         "epoch": 0,  # epoch number for logging
         "max_steps_per_episode": 500,  # maximum steps per episode
diff --git a/setup.cfg b/setup.cfg
index baadd6ae..3acbb939 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -45,4 +45,4 @@ per-file-ignores=*/__init__.py:F401
 ignore=E402,E501,W503,E203,D401,R504,R505,SIM102,SIM117,E711,E226
 max-line-length = 120
 max-complexity = 30
-exclude=_*,.vscode,.git,docs/**,**/test/**,**/lcmtypes/**,*.ipynb,internnav/internnav_habitat/habitat_n1_agent_temp.py
+exclude=_*,.vscode,.git,docs/**,**/test/**,**/lcmtypes/**,*.ipynb,internnav/internnav_habitat/habitat_n1_agent.py

From 99adf731eab7f88e85d89f895641d0ef90ea4ba0 Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Wed, 12 Nov 2025 09:49:43 +0000
Subject: [PATCH 08/16] delete temp agent; rename default evaluator for habitat

---
 ...luator.py => habitat_default_evaluator.py} |   0
 .../internnav_habitat/habitat_n1_agent.py     | 751 ------------------
 setup.cfg                                     |   2 +-
 3 files changed, 1 insertion(+), 752 deletions(-)
 rename internnav/internnav_habitat/{habitat_vlln_evaluator.py => habitat_default_evaluator.py} (100%)
 delete mode 100644 internnav/internnav_habitat/habitat_n1_agent.py

diff --git a/internnav/internnav_habitat/habitat_vlln_evaluator.py b/internnav/internnav_habitat/habitat_default_evaluator.py
similarity index 100%
rename from internnav/internnav_habitat/habitat_vlln_evaluator.py
rename to internnav/internnav_habitat/habitat_default_evaluator.py
diff --git a/internnav/internnav_habitat/habitat_n1_agent.py b/internnav/internnav_habitat/habitat_n1_agent.py
deleted file mode 100644
index 0d525245..00000000
--- a/internnav/internnav_habitat/habitat_n1_agent.py
+++ /dev/null
@@ -1,751 +0,0 @@
-import copy
-import itertools
-import os
-import re
-import sys
-from pathlib import Path
-
-import numpy as np
-import torch
-
-sys.path.append(str(Path(__file__).parent.parent.parent))
-
-from collections import OrderedDict
-
-from PIL import Image
-from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
-
-from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM
-from internnav.model.utils.vln_utils import split_and_clean, traj_to_actions
-
-DEFAULT_IMAGE_TOKEN = "<image>"
-
-
-@Agent.register("N1")
-class HabitatAgent(Agent):
-    def __init__(self, model, processor, args, device):
-        self.model = model
-        self.processor = processor
-        self.args = args
-        self.device = device
-        # ------------------------------------- model ------------------------------------------
-        processor = AutoProcessor.from_pretrained(args.model_path)
-        processor.tokenizer.padding_side = 'left'
-
-        device = torch.device(f"cuda:{self.local_rank}")
-        if args.mode == 'dual_system':
-            model = InternVLAN1ForCausalLM.from_pretrained(
-                args.model_path,
-                torch_dtype=torch.bfloat16,
-                attn_implementation="flash_attention_2",
-                device_map={"": device},
-            )
-        elif args.mode == 'system2':
-            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-                args.model_path,
-                torch_dtype=torch.bfloat16,
-                attn_implementation="flash_attention_2",
-                device_map={"": device},
-            )
-        else:
-            raise ValueError(f"Invalid mode: {args.mode}")
-
-        model.eval()
-        self.device = device
-
-        self.model = model
-        self.processor = processor
-
-        # refactor: this part used in three places
-        prompt = "You are an autonomous navigation assistant. Your task is to <instruction>. Where should you go next to stay on track? Please output the next waypoint\'s coordinates in the image. Please output STOP when you have successfully completed the task."
-        answer = ""
-        self.conversation = [{"from": "human", "value": prompt}, {"from": "gpt", "value": answer}]
-
-        self.conjunctions = [
-            'you can see ',
-            'in front of you is ',
-            'there is ',
-            'you can spot ',
-            'you are toward the ',
-            'ahead of you is ',
-            'in your sight is ',
-        ]
-
-        self.actions2idx = OrderedDict(
-            {
-                'STOP': [0],
-                "↑": [1],
-                "←": [2],
-                "→": [3],
-                "↓": [5],
-            }
-        )
-
-        self.objectnav_instructions = ["Search for the {target_object}."]
-
-        self.num_frames = args.num_frames
-        self.num_future_steps = args.num_future_steps
-        self.num_history = args.num_history
-
-    def reset(self, episode, env):
-        """Clear all per-episode state."""
-        self.rgb_list = []
-        self.action_seq = []
-        self.output_ids = None
-        self.goal = None
-        self.messages = []
-        self.local_actions = []
-        self.forward_action = 0
-
-        # maybe store initial transforms you need for this ep:
-        self.initial_agent_state = env.get_agent_state()
-        self.initial_height = self.initial_agent_state.position[1]
-
-    def act(self, observations, env, info):
-        """
-        Pure policy step:
-        - given obs (rgb/depth/gps/compass) + optional env/info
-        - update internal state (goal, messages, local_actions, etc.)
-        - return a single action (int)
-        """
-        # 1) unpack obs: rgb, depth, gps, compass, etc.
-        # 2) handle 'look down' case
-        # 3) maybe call LLM to get pixel goal or action_seq
-        # 4) maybe call diffusion policy to get local_actions
-        # 5) choose final `action` (0..5)
-        # 6) return `action`
-        return action
-
-    def _run_local_eval(self, idx=0) -> None:  # noqa: C901
-        """
-        Run local evaluation on this rank.
-
-        Important: if resuming from previous results, need to read from / write to "self.output_path/progress.json".
-                    For each episode, save the result dict in jsonl format to that file.
-                    In Env, the episodes are already filtered by this file, tasks that have the same (scene_id, episode_id) are skipped.
-
-
-        Returns
-        -------
-        dict[str, Tensor]:
-            {
-                "sucs": [N_local],
-                "spls": [N_local],
-                "oss":  [N_local],
-                "nes":  [N_local],
-            }
-        """
-        # Create / get env
-        # self.env = self.env  # HabitatEnv from DistributedEvaluator
-
-        sucs, spls, oss, nes = [], [], [], []
-        self.model.eval()
-
-        # resume from previous results
-        # TODO: Current read write op is not distributed safe
-        if os.path.exists(os.path.join(self.output_path, 'progress.json')):
-            with open(os.path.join(self.output_path, 'progress.json'), 'r') as f:
-                for line in f.readlines():
-                    res = json.loads(line)
-                    if "scene_id" not in res:
-                        print("This evaluation has already finished!")
-                        return (
-                            torch.tensor(sucs).to(self.device),
-                            torch.tensor(spls).to(self.device),
-                            torch.tensor(oss).to(self.device),
-                            torch.tensor(nes).to(self.device),
-                            torch.tensor(len(sucs)).to(self.device),
-                        )
-                    if idx == 0:  # noqa: F405 TODO this need to keep in evaluator
-                        sucs.append(res['success'])
-                        spls.append(res['spl'])
-                        oss.append(res['os'])
-                        nes.append(res['ne'])
-
-        # Episode loop is now driven by env.reset() + env.is_running
-        process_bar = tqdm.tqdm(total=len(self.env.episodes), desc=f"Eval Epoch {self.epoch} Rank {idx}")
-        while self.env.is_running:
-
-            # ------------ 1. Start of episode ------------
-            observations = self.env.reset()
-            if not self.env.is_running or observations is None:
-                break
-
-            # ---- episode meta (scene_id, episode_id, instruction) ----
-            # we get it from the underlying habitat env
-            episode = self.env.get_current_episode()
-            scene_id = episode.scene_id.split('/')[-2]
-            episode_id = int(episode.episode_id)
-            episode_instruction = (
-                episode.instruction.instruction_text if 'objectnav' not in self.config_path else episode.object_category
-            )
-            print("episode start", episode_instruction)
-
-            agent_state = self.env._env.sim.get_agent_state()
-            rotation = agent_state.rotation
-            translation = agent_state.position
-            rotation_matrix = quaternion.as_rotation_matrix(rotation)
-            transformation_matrix = np.eye(4)
-            transformation_matrix[:3, :3] = rotation_matrix
-            transformation_matrix[:3, 3] = translation
-
-            agent = ShortestPathFollower(self.env._env.sim, 0.25, False)
-
-            os.makedirs(os.path.join(self.output_path, f'check_sim_{self.epoch}'), exist_ok=True)
-            Image.fromarray(observations['rgb']).save(
-                os.path.join(self.output_path, f'check_sim_{self.epoch}', f'rgb_{idx}.jpg')
-            )
-
-            vis_frames = []
-            step_id = 0
-
-            if self.save_video:
-                os.makedirs(os.path.join(self.output_path, f'vis_{self.epoch}', f'{scene_id}'), exist_ok=True)
-            initial_height = self.env._env.sim.get_agent_state().position[1]
-
-            rgb_list = []
-            action_seq = []
-            output_ids = None
-
-            goal = None
-            action = None
-            messages = []
-            local_actions = []
-
-            done = False
-
-            # ---------- 2. Episode step loop -----------
-            while (not done) and (step_id <= self.max_steps_per_episode):
-                # refactor agent get action
-                rgb = observations["rgb"]
-                depth = observations["depth"]
-                x, y = observations["gps"]
-                camera_yaw = observations["compass"][0]
-                depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None)
-                depth = depth * (self._max_depth - self._min_depth) + self._min_depth
-                depth = depth * 1000
-
-                agent_state = self.env._env.sim.get_agent_state()
-                height = agent_state.position[1] - initial_height
-                camera_position = np.array([x, -y, self._camera_height + height])
-                tf_camera_to_episodic = (
-                    self.xyz_yaw_pitch_to_tf_matrix(camera_position, camera_yaw, np.deg2rad(30))
-                    @ self.get_axis_align_matrix()
-                )
-
-                image = Image.fromarray(rgb).convert('RGB')
-                save_raw_image = image.copy()
-
-                save_dot = False
-                if action == 5:
-                    look_down_image = image
-                    save_raw_image = look_down_image.copy()
-                    look_down_depth, resize_shape = self.preprocess_depth_image_v2(
-                        Image.fromarray(depth.astype(np.uint16), mode='I;16'),
-                        do_depth_scale=True,
-                        depth_scale=1000,
-                        target_height=224,
-                        target_width=224,
-                    )
-                    look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float()
-                    look_down_depth[look_down_depth > 5.0] = 5.0
-                else:
-                    image = image.resize((self.args.resize_w, self.args.resize_h))
-                    rgb_list.append(image)
-
-                    if self.args.mode == 'dual_system':
-                        down_observations, _, done, _ = self.env.step(5)
-                        down_observations, _, done, _ = self.env.step(5)
-
-                        look_down_image = Image.fromarray(down_observations["rgb"]).convert('RGB')
-                        depth = down_observations["depth"]
-                        depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None)
-                        depth = depth * (self._max_depth - self._min_depth) + self._min_depth
-                        depth = depth * 1000
-                        look_down_depth, resize_shape = self.preprocess_depth_image_v2(
-                            Image.fromarray(depth.astype(np.uint16), mode='I;16'),
-                            do_depth_scale=True,
-                            depth_scale=1000,
-                            target_height=224,
-                            target_width=224,
-                        )
-                        look_down_depth = torch.as_tensor(np.ascontiguousarray(look_down_depth)).float()
-                        look_down_depth[look_down_depth > 5.0] = 5.0
-
-                        self.env.step(4)
-                        self.env.step(4)
-
-                info = self.env.get_metrics()
-
-                if len(action_seq) == 0 and goal is None:
-                    if action != 5:
-                        sources = copy.deepcopy(self.conversation)
-                        sources[0]["value"] = sources[0]["value"].replace(
-                            '<instruction>.', episode.instruction.instruction_text[:-1]
-                        )
-                        cur_images = rgb_list[-1:]
-                        if step_id == 0:
-                            history_id = []
-                        else:
-                            history_id = np.unique(
-                                np.linspace(0, step_id - 1, self.num_history, dtype=np.int32)
-                            ).tolist()
-                            placeholder = (DEFAULT_IMAGE_TOKEN + '\n') * len(history_id)
-                            sources[0]["value"] += f' These are your historical observations: {placeholder}.'
-
-                        history_id = sorted(history_id)
-                        print('history_idddddddd', step_id, history_id)
-                        input_images = [rgb_list[i] for i in history_id] + cur_images
-                        input_img_id = 0
-                    else:
-                        assert action == 5
-                        sources = [{"from": "human", "value": ""}, {"from": "gpt", "value": ""}]
-                        input_images += [look_down_image]
-                        # messages.append(
-                        #     {'role': 'assistant', 'content': [{'type': 'text', 'text': llm_outputs}]}  # noqa: F405
-                        # )
-                        input_img_id = -1
-
-                    prompt = random.choice(self.conjunctions) + DEFAULT_IMAGE_TOKEN
-                    sources[0]["value"] += f" {prompt}."
-                    print('sources', step_id, sources)
-                    prompt_instruction = copy.deepcopy(sources[0]["value"])
-                    parts = split_and_clean(prompt_instruction)
-
-                    content = []
-                    for i in range(len(parts)):
-                        if parts[i] == "<image>":
-                            content.append({"type": "image", "image": input_images[input_img_id]})
-                            input_img_id += 1
-                        else:
-                            content.append({"type": "text", "text": parts[i]})
-
-                    messages.append({'role': 'user', 'content': content})
-
-                    print('step_id', step_id, 'messages:', messages)
-
-                    text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-                    inputs = self.processor(text=[text], images=input_images, return_tensors="pt").to(self.model.device)
-
-                    with torch.no_grad():
-                        output_ids = self.model.generate(**inputs, max_new_tokens=128, do_sample=False)
-
-                    llm_outputs = self.processor.tokenizer.decode(
-                        output_ids[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
-                    )
-                    print('step_id:', step_id, 'output text:', llm_outputs)
-
-                    if bool(re.search(r'\d', llm_outputs)):
-                        forward_action = 0
-                        coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]
-                        pixel_goal = [int(coord[1]), int(coord[0])]
-
-                        intrinsic_matrix = self.get_intrinsic_matrix(
-                            self.config.habitat.simulator.agents.main_agent.sim_sensors.rgb_sensor
-                        )
-                        goal = self.pixel_to_gps(pixel_goal, depth / 1000, intrinsic_matrix, tf_camera_to_episodic)
-                        print('before', goal, depth.shape)
-                        goal = (transformation_matrix @ np.array([-goal[1], 0, -goal[0], 1]))[:3]
-
-                        if not self.env._env.sim.pathfinder.is_navigable(np.array(goal)):
-                            goal = np.array(self.env._env.sim.pathfinder.snap_point(np.array(goal)))
-
-                        # look down --> horizontal
-                        self.env.step(4)
-                        self.env.step(4)
-
-                        # Forking logic based on mode
-                        if self.args.mode == 'system2':
-                            action = agent.get_next_action(goal)
-                            if action == 0:
-                                goal = None
-                                output_ids = None
-                                action = 2  # random action
-                                print('conduct a random action 2')
-                                observations = self.env.step(action)
-                                step_id += 1
-                                messages = []
-                                continue
-                        else:  # dual-system logic
-                            local_actions = []
-                            pixel_values = inputs.pixel_values
-                            image_grid_thw = torch.cat([thw.unsqueeze(0) for thw in inputs.image_grid_thw], dim=0)
-
-                            with torch.no_grad():
-                                traj_latents = self.model.generate_latents(output_ids, pixel_values, image_grid_thw)
-
-                            # prepocess align with navdp
-                            image_dp = (
-                                torch.tensor(np.array(look_down_image.resize((224, 224)))).to(torch.bfloat16) / 255
-                            )
-                            pix_goal_image = copy.copy(image_dp)
-                            images_dp = torch.stack([pix_goal_image, image_dp]).unsqueeze(0).to(self.device)
-                            depth_dp = look_down_depth.unsqueeze(-1).to(torch.bfloat16)
-                            pix_goal_depth = copy.copy(depth_dp)
-                            depths_dp = torch.stack([pix_goal_depth, depth_dp]).unsqueeze(0).to(self.device)
-
-                            with torch.no_grad():
-                                dp_actions = self.model.generate_traj(
-                                    traj_latents, images_dp, depths_dp, use_async=True
-                                )
-
-                            random_choice = np.random.choice(dp_actions.shape[0])
-                            if self.args.continuous_traj:
-                                action_list = traj_to_actions(dp_actions)
-                                if len(action_list) < 8:
-                                    action_list += [0] * (8 - len(action_list))
-                            else:
-                                action_list = chunk_token(dp_actions[random_choice])
-
-                            local_actions = action_list
-                            if len(local_actions) >= 4:
-                                local_actions = local_actions[:4]
-                            action = local_actions[0]
-                            if action == 0:
-                                goal = None
-                                output_ids = None
-                                action = 2  # random action
-                                print('conduct a random action 2')
-                                observations = self.env.step(action)
-                                step_id += 1
-                                messages = []
-                                continue
-
-                        print('predicted goal', pixel_goal, goal, flush=True)
-                    else:
-                        action_seq = self.parse_actions(llm_outputs)
-                        print('actions', action_seq, flush=True)
-
-                if len(action_seq) != 0:
-                    action = action_seq[0]
-                    action_seq.pop(0)
-                elif goal is not None:
-                    # Forking logic based on mode
-                    if self.args.mode == 'system2':
-                        action = agent.get_next_action(goal)
-                        action = action.detach().cpu().numpy()[0] if isinstance(action, torch.Tensor) else action
-                        action = action[0] if hasattr(action, "__len__") else action
-                    else:  # dual-system logic
-                        if len(local_actions) == 0:
-                            # navdp
-                            local_actions = []
-                            image_dp = (
-                                torch.tensor(np.array(look_down_image.resize((224, 224)))).to(torch.bfloat16) / 255
-                            )
-
-                            images_dp = torch.stack([pix_goal_image, image_dp]).unsqueeze(0).to(self.device)
-                            depth_dp = look_down_depth.unsqueeze(-1).to(torch.bfloat16)
-
-                            depths_dp = torch.stack([pix_goal_depth, depth_dp]).unsqueeze(0).to(self.device)
-                            with torch.no_grad():
-                                dp_actions = self.model.generate_traj(
-                                    traj_latents, images_dp, depths_dp, use_async=True
-                                )
-
-                            random_choice = np.random.choice(dp_actions.shape[0])
-                            if self.args.continuous_traj:
-                                action_list = traj_to_actions(dp_actions)
-                                if len(action_list) < 8:
-                                    action_list += [0] * (8 - len(action_list))
-                            else:
-                                action_list = chunk_token(dp_actions[random_choice])
-                            print("first action_list", action_list)
-
-                            local_actions = action_list
-                            if len(local_actions) >= 4:
-                                local_actions = local_actions[:4]
-                            # if len(local_actions) >= 2:
-                            #     local_actions = local_actions[:2]
-
-                            print("local_actions", local_actions)
-
-                            action = local_actions.pop(0)
-                            # navdp
-                        else:
-                            action = local_actions.pop(0)
-
-                    forward_action += 1
-                    print('forward_action', forward_action, flush=True)
-                    if forward_action > 8:
-                        goal = None
-                        output_ids = None
-                        messages = []
-                        step_id += 1
-                        forward_action = 0
-                        local_actions = []
-                        continue
-                    if action == 0:
-                        goal = None
-                        output_ids = None
-                        messages = []
-                        step_id += 1
-                        forward_action = 0
-                        local_actions = []
-                        continue
-                else:
-                    action = 0
-
-                if info['top_down_map'] is not None:
-                    if save_dot:
-                        save_raw_image = self.dot_matrix_two_dimensional(
-                            save_raw_image, save_img=False, save_path=f'test_{step_id}.jpg', pixel_goal=pixel_goal
-                        )
-                    if self.save_video:
-                        frame = observations_to_image({'rgb': np.asarray(save_raw_image)}, info)
-                        vis_frames.append(frame)
-
-                print("step_id", step_id, "action", action)
-
-                # refactor: core
-                if action == 5:
-                    self.env.step(action)
-                    observations, _, done, _ = self.env.step(action)
-                else:
-                    observations, _, done, _ = self.env.step(action)
-                    step_id += 1
-                    messages = []
-
-            # ---------- 3. End of episode -----------
-            # Update result and write progress to the output_path/progress.json
-
-            process_bar.update(1)
-
-            # After the episode finishes, collect metrics:
-            metrics = self.env.get_metrics()
-
-            sucs.append(metrics['success'])
-            spls.append(metrics['spl'])
-            oss.append(metrics['oracle_success'])
-            nes.append(metrics["distance_to_goal"])
-
-            print(
-                f"scene_episode {scene_id}_{episode_id:04d} success: {metrics['success']}, "
-                f"spl: {metrics['spl']}, os: {metrics['oracle_success']}, "
-                f"ne: {metrics['distance_to_goal']}"
-            )
-
-            # Write per-episode result.json entry (still per-rank)
-            result = {
-                "scene_id": scene_id,
-                "episode_id": episode_id,
-                "success": metrics["success"],
-                "spl": metrics["spl"],
-                "os": metrics['oracle_success'],
-                "ne": metrics["distance_to_goal"],
-                "steps": step_id,
-                "episode_instruction": episode_instruction,
-            }
-            os.makedirs(self.output_path, exist_ok=True)
-            with open(os.path.join(self.output_path, 'progress.json'), 'a') as f:
-                f.write(json.dumps(result) + "\n")
-
-        self.env.close()
-
-        return {
-            "sucs": torch.tensor(sucs, device=self.device),
-            "spls": torch.tensor(spls, device=self.device),
-            "oss": torch.tensor(oss, device=self.device),
-            "nes": torch.tensor(nes, device=self.device),
-        }
-
-    def parse_actions(self, output):
-        action_patterns = '|'.join(re.escape(action) for action in self.actions2idx)
-        # import ipdb; ipdb.set_trace()
-        regex = re.compile(action_patterns)
-        matches = regex.findall(output)
-        actions = [self.actions2idx[match] for match in matches]
-        actions = itertools.chain.from_iterable(actions)
-        return list(actions)
-
-    def preprocess_depth_image_v2(
-        self, depth_image, do_depth_scale=True, depth_scale=1000, target_height=None, target_width=None
-    ):
-        if target_height is None:
-            target_height = self.image_processor.crop_size['height']  # 384
-            target_width = self.image_processor.crop_size['width']  # 384
-
-        resized_depth_image = depth_image.resize((target_width, target_height), Image.NEAREST)
-
-        img = to_numpy_array(resized_depth_image)
-        if do_depth_scale:
-            img = img / depth_scale
-
-        return img, (target_width, target_height)
-
-    def get_intrinsic_matrix(self, sensor_cfg) -> np.ndarray:
-        width = sensor_cfg.width
-        height = sensor_cfg.height
-        fov = sensor_cfg.hfov
-        fx = (width / 2.0) / np.tan(np.deg2rad(fov / 2.0))
-        fy = fx  # Assuming square pixels (fx = fy)
-        cx = (width - 1.0) / 2.0
-        cy = (height - 1.0) / 2.0
-
-        intrinsic_matrix = np.array(
-            [[fx, 0.0, cx, 0.0], [0.0, fy, cy, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 0.0, 1.0]]
-        )
-        return intrinsic_matrix
-
-    def get_axis_align_matrix(self):
-        ma = np.array([[0, 0, 1, 0], [-1, 0, 0, 0], [0, -1, 0, 0], [0, 0, 0, 1]])
-        return ma
-
-    def xyz_yaw_to_tf_matrix(self, xyz: np.ndarray, yaw: float) -> np.ndarray:
-        x, y, z = xyz
-        transformation_matrix = np.array(
-            [
-                [np.cos(yaw), -np.sin(yaw), 0, x],
-                [np.sin(yaw), np.cos(yaw), 0, y],
-                [0, 0, 1, z],
-                [0, 0, 0, 1],
-            ]
-        )
-        return transformation_matrix
-
-    def xyz_pitch_to_tf_matrix(self, xyz: np.ndarray, pitch: float) -> np.ndarray:
-        """Converts a given position and pitch angle to a 4x4 transformation matrix.
-
-        Args:
-            xyz (np.ndarray): A 3D vector representing the position.
-            pitch (float): The pitch angle in radians for y axis.
-        Returns:
-            np.ndarray: A 4x4 transformation matrix.
-        """
-
-        x, y, z = xyz
-        transformation_matrix = np.array(
-            [
-                [np.cos(pitch), 0, np.sin(pitch), x],
-                [0, 1, 0, y],
-                [-np.sin(pitch), 0, np.cos(pitch), z],
-                [0, 0, 0, 1],
-            ]
-        )
-        return transformation_matrix
-
-    def xyz_yaw_pitch_to_tf_matrix(self, xyz: np.ndarray, yaw: float, pitch: float) -> np.ndarray:
-        """Converts a given position and yaw, pitch angles to a 4x4 transformation matrix.
-
-        Args:
-            xyz (np.ndarray): A 3D vector representing the position.
-            yaw (float): The yaw angle in radians.
-            pitch (float): The pitch angle in radians for y axis.
-        Returns:
-            np.ndarray: A 4x4 transformation matrix.
-        """
-        x, y, z = xyz
-        rot1 = self.xyz_yaw_to_tf_matrix(xyz, yaw)[:3, :3]
-        rot2 = self.xyz_pitch_to_tf_matrix(xyz, pitch)[:3, :3]
-        transformation_matrix = np.eye(4)
-        transformation_matrix[:3, :3] = rot1 @ rot2
-        transformation_matrix[:3, 3] = xyz
-        return transformation_matrix
-
-    def pixel_to_gps(self, pixel, depth, intrinsic, tf_camera_to_episodic):
-        '''
-        Args:
-            pixel: (2,) - [u, v] pixel coordinates
-            depth: (H, W) - depth image where depth[v, u] gives depth in meters
-            intrinsic: (4, 4) - camera intrinsic matrix
-            tf_camera_to_episodic: (4, 4) - transformation from camera to episodic frame
-        Returns:
-            (x, y): (x, y) coordinates in the episodic frame
-        '''
-        v, u = pixel
-        z = depth[v, u]
-        print("depthhhhhhhhhhhhhh", z)
-
-        x = (u - intrinsic[0, 2]) * z / intrinsic[0, 0]
-        y = (v - intrinsic[1, 2]) * z / intrinsic[1, 1]
-        point_camera = np.array([x, y, z, 1.0])
-
-        # Transform to episodic frame
-        point_episodic = tf_camera_to_episodic @ point_camera
-        point_episodic = point_episodic[:3] / point_episodic[3]
-
-        x = point_episodic[0]
-        y = point_episodic[1]
-
-        return (x, y)  # same as habitat gps
-
-    def dot_matrix_two_dimensional(
-        self,
-        image_or_image_path,
-        save_path=None,
-        dots_size_w=8,
-        dots_size_h=8,
-        save_img=False,
-        font_path='fonts/arial.ttf',
-        pixel_goal=None,
-    ):
-        """
-        takes an original image as input, save the processed image to save_path. Each dot is labeled with two-dimensional Cartesian coordinates (x,y). Suitable for single-image tasks.
-        control args:
-        1. dots_size_w: the number of columns of the dots matrix
-        2. dots_size_h: the number of rows of the dots matrix
-        """
-        with open_image(image_or_image_path) as img:
-            if img.mode != 'RGB':
-                img = img.convert('RGB')
-            draw = ImageDraw.Draw(img, 'RGB')
-
-            width, height = img.size
-            grid_size_w = dots_size_w + 1
-            grid_size_h = dots_size_h + 1
-            cell_width = width / grid_size_w
-            cell_height = height / grid_size_h
-
-            font = ImageFont.truetype(font_path, width // 40)  # Adjust font size if needed; default == width // 40
-
-            target_i = target_j = None
-            if pixel_goal is not None:
-                y_pixel, x_pixel = pixel_goal[0], pixel_goal[1]
-                # Validate pixel coordinates
-                if not (0 <= x_pixel < width and 0 <= y_pixel < height):
-                    raise ValueError(f"pixel_goal {pixel_goal} exceeds image dimensions ({width}x{height})")
-
-                # Convert to grid coordinates
-                target_i = round(x_pixel / cell_width)
-                target_j = round(y_pixel / cell_height)
-
-                # Validate grid bounds
-                if not (1 <= target_i <= dots_size_w and 1 <= target_j <= dots_size_h):
-                    raise ValueError(
-                        f"pixel_goal {pixel_goal} maps to grid ({target_j},{target_i}), "
-                        f"valid range is (1,1)-({dots_size_h},{dots_size_w})"
-                    )
-
-            count = 0
-
-            for j in range(1, grid_size_h):
-                for i in range(1, grid_size_w):
-                    x = int(i * cell_width)
-                    y = int(j * cell_height)
-
-                    pixel_color = img.getpixel((x, y))
-                    # choose a more contrasting color from black and white
-                    if pixel_color[0] + pixel_color[1] + pixel_color[2] >= 255 * 3 / 2:
-                        opposite_color = (0, 0, 0)
-                    else:
-                        opposite_color = (255, 255, 255)
-
-                    if pixel_goal is not None and i == target_i and j == target_j:
-                        opposite_color = (255, 0, 0)  # Red for target
-
-                    circle_radius = width // 240  # Adjust dot size if needed; default == width // 240
-                    draw.ellipse(
-                        [(x - circle_radius, y - circle_radius), (x + circle_radius, y + circle_radius)],
-                        fill=opposite_color,
-                    )
-
-                    text_x, text_y = x + 3, y
-                    count_w = count // dots_size_w
-                    count_h = count % dots_size_w
-                    label_str = f"({count_w+1},{count_h+1})"
-                    draw.text((text_x, text_y), label_str, fill=opposite_color, font=font)
-                    count += 1
-            if save_img:
-                print(">>> dots overlaid image processed, stored in", save_path)
-                img.save(save_path)
-            return img
diff --git a/setup.cfg b/setup.cfg
index 3acbb939..dd5d8f9e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -45,4 +45,4 @@ per-file-ignores=*/__init__.py:F401
 ignore=E402,E501,W503,E203,D401,R504,R505,SIM102,SIM117,E711,E226
 max-line-length = 120
 max-complexity = 30
-exclude=_*,.vscode,.git,docs/**,**/test/**,**/lcmtypes/**,*.ipynb,internnav/internnav_habitat/habitat_n1_agent.py
+exclude=_*,.vscode,.git,docs/**,**/test/**,**/lcmtypes/**,*.ipynb

From 75b38a7fc7c70e7f1438949d679412ddbe9bc129 Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Wed, 12 Nov 2025 09:51:15 +0000
Subject: [PATCH 09/16] update slurm bash

---
 scripts/eval/bash/eval_dual_system.sh | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/scripts/eval/bash/eval_dual_system.sh b/scripts/eval/bash/eval_dual_system.sh
index ef4be1eb..56e4b40e 100755
--- a/scripts/eval/bash/eval_dual_system.sh
+++ b/scripts/eval/bash/eval_dual_system.sh
@@ -12,8 +12,6 @@ srun -p efm_t \
     --ntasks-per-node=8 \
     --cpus-per-task=16 \
     --kill-on-bad-exit=1 \
-    python scripts/eval/eval_habitat.py \
-    --model_path checkpoints/${MID_RUN_NAME} \
-    --predict_step_nums 32 \
-    --continuous_traj \
-    --output_path results/$MID_RUN_NAME/val_unseen_32traj_8steps \
+    python scripts/eval/eval.py \
+        --config scripts/eval/configs/habitat_cfg.py \
+    > logs/${MID_RUN_NAME}_log.txt 2>&1

From 08bb9c33f4e8058945475e92185e91be140957c2 Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Wed, 12 Nov 2025 10:03:53 +0000
Subject: [PATCH 10/16] update readme

---
 internnav/internnav_habitat/README.md | 135 ++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/internnav/internnav_habitat/README.md b/internnav/internnav_habitat/README.md
index 03779ded..b7c5d730 100644
--- a/internnav/internnav_habitat/README.md
+++ b/internnav/internnav_habitat/README.md
@@ -1 +1,136 @@
 # Habitat in InternNav
+
+This package adapts [Meta AI Habitat](https://aihabitat.org) environments and
+metrics so they can be used from InternNav's evaluation framework. It provides
+an environment wrapper, custom measurements, and evaluator implementations that
+bridge Habitat simulations with InternNav agents and distributed evaluation
+utilities.
+
+## Package structure
+
+```
+internnav_habitat/
+├── __init__.py
+├── habitat_env.py
+├── habitat_default_evaluator.py
+├── habitat_vln_evaluator.py
+├── measures.py
+└── refactor_notes.md
+```
+
+* `__init__.py` re-exports the public entry points for the environment and the
+  VLN evaluator so they can be imported as
+  `from internnav.internnav_habitat import HabitatEnv`.
+* `habitat_env.py` implements the `Env` subclass that wraps Habitat's
+  `Env` object. It bootstraps episodes, handles sharding across distributed
+  ranks, and adapts Habitat's observations to InternNav's expectations.
+* `habitat_default_evaluator.py` contains a lightweight evaluator that runs a
+  conventional Habitat agent inside the InternNav evaluator loop.
+* `habitat_vln_evaluator.py` is the task-specific evaluator used for
+  Vision-and-Language Navigation (VLN). It loads InternNav vision-language models,
+  orchestrates inference, and logs results during distributed evaluation.
+* `measures.py` registers additional Habitat measurements (path length,
+  oracle metrics, step counts) that are required by the evaluators.
+
+The `refactor_notes.md` file captures design notes and TODOs collected during a
+previous refactoring pass.
+
+## Habitat environment wrapper
+
+`HabitatEnv` is registered under the key `"habitat"` via the shared
+`Env.register` decorator. When InternNav builds an environment from an
+`EnvCfg`, the wrapper:
+
+1. Imports and instantiates the Habitat `Env` using the configuration object
+   provided in `env_settings['habitat_config']`.
+2. Stores the distributed context (`local_rank`, `world_size`) and any output
+   directory override (`output_path`).
+3. Pre-computes the episode list by grouping Habitat episodes by scene,
+   filtering completed episodes via `progress.json`, and sharding the remaining
+   work by rank.
+4. Implements the standard reset/step/close/render accessors expected by the
+   InternNav `Env` base class while delegating to the underlying Habitat
+   simulator.
+
+This design keeps the Habitat-specific logic isolated from the rest of the
+framework and ensures that distributed evaluation proceeds deterministically
+across ranks.
+
+## Evaluation pipeline
+
+InternNav evaluators extend the shared `DistributedEvaluator` base class, which
+handles distributed initialization, environment instantiation, metric
+aggregation, and result logging. The Habitat integration provides two
+implementations:
+
+### `HabitatVlnEvaluator`
+
+The VLN evaluator (`habitat_vln_evaluator.py`) is responsible for coordinating
+model inference in Habitat scenes.
+
+* **Configuration:** During initialization the evaluator reads an `EvalCfg`
+  whose `env.env_settings['config_path']` points to a Habitat YAML file. The
+  config is loaded with Habitat's baseline utilities, sensor intrinsics are
+  cached, and custom measurements (`top_down_map`, `collisions`) are enabled.
+* **Environment binding:** The Habitat configuration is injected back into the
+  `EnvCfg` so the shared `DistributedEvaluator` base class can create the
+  `HabitatEnv` wrapper with the correct settings.
+* **Model loading:** Depending on `cfg.agent.model_settings.mode`, the evaluator
+  loads either the InternVLA dual-system model or a Qwen2.5-VL model using
+  Hugging Face Transformers. The processor is configured with left padding and
+  the model is moved to the rank-local GPU.
+* **Episode loop:**
+  1. `HabitatEnv.reset()` advances to the next episode and returns the first
+     observation.
+  2. The evaluator reads episode metadata (scene, instruction) from Habitat,
+     constructs prompt messages, and collects RGB/depth history for the
+     language model.
+  3. Visual inputs are prepared (resizing, optional look-down depth capture) and
+     depth maps are filtered through `filter_depth` to remove sensor noise.
+  4. The evaluator queries the loaded model for the next action sequence,
+     translates model tokens to Habitat actions via `traj_to_actions`, and
+     steps the environment.
+  5. Per-episode metrics (`success`, `SPL`, oracle success, navigation error)
+     are appended and checkpointed to `progress.json` for resumability.
+* **Aggregation:** After all ranks finish, inherited utilities gather per-rank
+  tensors, compute global averages, and write `result.json` in
+  `output_path`.
+
+### Default evaluator (baseline)
+
+The default evaluator in `habitat_default_evaluator.py` offers a simpler loop
+where a pre-built InternNav agent interacts with the Habitat environment.
+InternNav's agent abstraction is reset with each new Habitat episode, and
+per-step actions are produced via `agent.act()`. The evaluator records the same
+metrics as the VLN evaluator, making it useful for baselines or sanity checks.
+
+## Custom Habitat measurements
+
+`measures.py` registers a suite of metrics with Habitat's registry so that they
+are available in the Habitat configuration:
+
+* `PathLength`: cumulative Euclidean distance traveled by the agent.
+* `OracleNavigationError`: minimum geodesic distance to the goal along the
+  trajectory.
+* `OracleSuccess`: binary success metric derived from oracle navigation error
+  relative to a goal radius (default 3.0 meters).
+* `OracleSPL`: best Success weighted by Path Length value observed during the
+  trajectory.
+* `StepsTaken`: number of actions issued by the agent, including STOP.
+
+These metrics complement Habitat's built-in success and SPL scores, allowing
+InternNav to report a richer set of statistics.
+
+## Extending the integration
+
+* **Adding evaluators:** Subclass `DistributedEvaluator`, supply
+  Habitat-specific initialization similar to `HabitatVlnEvaluator`, and
+  implement `eval_action` and `calc_metrics`.
+* **Custom sensors or observations:** Augment the Habitat YAML configuration and
+  update `HabitatEnv` or the evaluator to consume the new observation keys.
+* **Additional metrics:** Register new measures in `measures.py` and enable them
+  in the Habitat config via `config.habitat.task.measurements.update(...)`.
+
+By centralizing Habitat-specific logic in this package, InternNav can swap in
+other simulators or extend Habitat support without touching the rest of the
+training and evaluation stack.

From cde84b3933f55ba2ac2ba71b07b2daf9ec020998 Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Thu, 13 Nov 2025 03:49:13 +0000
Subject: [PATCH 11/16] fix init dist print

---
 internnav/evaluator/distributed_base.py | 11 ++---------
 internnav/utils/dist.py                 |  9 +++++++--
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/internnav/evaluator/distributed_base.py b/internnav/evaluator/distributed_base.py
index 94bfc06e..4327c8a4 100644
--- a/internnav/evaluator/distributed_base.py
+++ b/internnav/evaluator/distributed_base.py
@@ -22,19 +22,12 @@ class DistributedEvaluator(Evaluator):
 
     def __init__(self, cfg: EvalCfg, init_env: bool = True, init_agent: bool = True):
         # distributed setting
-        import socket
-
-        print(
-            f"Rank {os.getenv('RANK')} / {os.getenv('WORLD_SIZE')} on {socket.gethostname()}:{os.getenv('MASTER_PORT')}"
-        )
-
-        self.output_path = cfg.eval_settings["output_path"]  # TODO: unsafe for distribution
-
-        init_distributed_mode()
+        init_distributed_mode(dist_url=cfg.eval_settings['dist_url'], port=cfg.eval_settings['port'])
 
         self.local_rank = get_rank()
         np.random.seed(self.local_rank)
         self.world_size = get_world_size()
+        self.output_path = cfg.eval_settings["output_path"]  # TODO: unsafe for distribution
 
         # habitat env also need rank to split dataset
         cfg.env.env_settings['local_rank'] = get_rank()
diff --git a/internnav/utils/dist.py b/internnav/utils/dist.py
index 82c14a31..7a65599a 100644
--- a/internnav/utils/dist.py
+++ b/internnav/utils/dist.py
@@ -190,7 +190,7 @@ def save_on_master(*args, **kwargs):
         torch.save(*args, **kwargs)
 
 
-def init_distributed_mode(port=29529, backend="nccl", timeout_hours=2):
+def init_distributed_mode(dist_url="env://", port=29529, backend="nccl", timeout_hours=2):
     # Fast-path: torchrun provides these
     if all(k in os.environ for k in ["RANK", "WORLD_SIZE", "LOCAL_RANK", "MASTER_ADDR", "MASTER_PORT"]):
         rank = int(os.environ["RANK"])
@@ -218,10 +218,15 @@ def init_distributed_mode(port=29529, backend="nccl", timeout_hours=2):
         setup_for_distributed(is_master=True)
         return
 
+    import socket
+
+    print(f"Rank {os.getenv('RANK')} / {os.getenv('WORLD_SIZE')} on {socket.gethostname()}:{os.getenv('MASTER_PORT')}")
+
     # Device selection must happen before NCCL init
     torch.cuda.set_device(local_rank)
 
-    dist.init_process_group(backend=backend, init_method="env://", timeout=datetime.timedelta(hours=timeout_hours))
+    dist.init_process_group(backend=backend, init_method=dist_url, timeout=datetime.timedelta(hours=timeout_hours))
+    dist.barrier()
     setup_for_distributed(dist.get_rank() == 0)
 
 

From c89723d305a74815faba7273cd87e4ed7b08b77c Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Thu, 13 Nov 2025 07:20:25 +0000
Subject: [PATCH 12/16] fix eval config; fix local rank to rank

---
 internnav/evaluator/distributed_base.py           |  6 +++---
 internnav/internnav_habitat/habitat_env.py        |  8 ++++----
 .../internnav_habitat/habitat_vln_evaluator.py    | 15 +++++++--------
 internnav/utils/dist.py                           |  6 +++++-
 scripts/eval/configs/h1_cma_cfg.py                |  3 ++-
 scripts/eval/configs/h1_internvla_n1_cfg.py       |  2 ++
 scripts/eval/configs/h1_rdp_cfg.py                |  2 ++
 scripts/eval/configs/h1_seq2seq_cfg.py            |  2 ++
 8 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/internnav/evaluator/distributed_base.py b/internnav/evaluator/distributed_base.py
index 4327c8a4..c284edc0 100644
--- a/internnav/evaluator/distributed_base.py
+++ b/internnav/evaluator/distributed_base.py
@@ -22,15 +22,15 @@ class DistributedEvaluator(Evaluator):
 
     def __init__(self, cfg: EvalCfg, init_env: bool = True, init_agent: bool = True):
         # distributed setting
-        init_distributed_mode(dist_url=cfg.eval_settings['dist_url'], port=cfg.eval_settings['port'])
+        self.local_rank = init_distributed_mode(dist_url=cfg.eval_settings['dist_url'], port=cfg.eval_settings['port'])
 
-        self.local_rank = get_rank()
+        self.rank = get_rank()
         np.random.seed(self.local_rank)
         self.world_size = get_world_size()
         self.output_path = cfg.eval_settings["output_path"]  # TODO: unsafe for distribution
 
         # habitat env also need rank to split dataset
-        cfg.env.env_settings['local_rank'] = get_rank()
+        cfg.env.env_settings['rank'] = get_rank()
         cfg.env.env_settings['world_size'] = get_world_size()
 
         self.eval_config = cfg
diff --git a/internnav/internnav_habitat/habitat_env.py b/internnav/internnav_habitat/habitat_env.py
index b680e94f..1b0f3f43 100644
--- a/internnav/internnav_habitat/habitat_env.py
+++ b/internnav/internnav_habitat/habitat_env.py
@@ -12,7 +12,7 @@ def __init__(self, env_config: EnvCfg, task_config: TaskCfg):
         """
         env_settings include:
             - habitat_config: loaded from get_habitat_config
-            - local_rank: int, rank index for sharding
+            - rank: int, rank index for sharding
             - world_size: int, total number of ranks
         """
         try:
@@ -27,7 +27,7 @@ def __init__(self, env_config: EnvCfg, task_config: TaskCfg):
         self.config = env_config.env_settings['habitat_config']
         self._env = Env(self.config)
 
-        self.local_rank = env_config.env_settings.get('local_rank', 0)
+        self.rank = env_config.env_settings.get('rank', 0)
         self.world_size = env_config.env_settings.get('world_size', 1)
         self._current_episode_index: int = 0
         self._last_obs: Optional[Dict[str, Any]] = None
@@ -45,7 +45,7 @@ def generate_episodes(self) -> List[Any]:
         Generate list of episodes for the current split, already:
         - grouped by scene
         - filtered by done_res (the path is self.output_path/progress.json)
-        - sharded by (local_rank, world_size)
+        - sharded by (rank, world_size)
         """
         all_episodes = []
 
@@ -71,7 +71,7 @@ def generate_episodes(self) -> List[Any]:
             scene_id = scene.split('/')[-2]
 
             # shard by rank index / world_size
-            for episode in per_scene_eps[self.local_rank :: self.world_size]:
+            for episode in per_scene_eps[self.rank :: self.world_size]:
                 episode_id = int(episode.episode_id)
                 if (scene_id, episode_id) in done_res:
                     continue
diff --git a/internnav/internnav_habitat/habitat_vln_evaluator.py b/internnav/internnav_habitat/habitat_vln_evaluator.py
index 9ed4c487..9fdd0152 100644
--- a/internnav/internnav_habitat/habitat_vln_evaluator.py
+++ b/internnav/internnav_habitat/habitat_vln_evaluator.py
@@ -44,8 +44,7 @@
     # Import for Habitat registry side effects — do not remove
     import internnav.internnav_habitat.measures  # noqa: F401 # isort: skip
 except Exception as e:
-    print("Habitat Error:", e)
-    print("Habitat Evaluation is not loaded.")
+    print(f"Warning: ({e}), Habitat Evaluation is not loaded in this runtime. Ignore this if not using Habitat.")
 
 
 DEFAULT_IMAGE_TOKEN = "<image>"
@@ -170,10 +169,10 @@ def eval_action(self):
         Returns dict[str, Tensor] on GPU (1D tensors of same length).
         """
         # Old behavior was something like:
-        # sucs, spls, oss, nes, ep_num = self.eval_action(self.local_rank)
+        # sucs, spls, oss, nes, ep_num = self.eval_action(self.rank)
         # Now just implement the actual eval here and return dict.
 
-        sucs, spls, oss, nes, _ = self._run_local_eval(self.local_rank)
+        sucs, spls, oss, nes, _ = self._run_local_eval(self.rank)
 
         return {
             "sucs": sucs,  # shape [N_local]
@@ -202,7 +201,7 @@ def calc_metrics(self, global_metrics: dict) -> dict:
             # "length" will be filled by base class
         }
 
-    def _run_local_eval(self, idx=0) -> None:  # noqa: C901
+    def _run_local_eval(self) -> None:  # noqa: C901
         """
         Run local evaluation on this rank.
 
@@ -242,14 +241,14 @@ def _run_local_eval(self, idx=0) -> None:  # noqa: C901
                             torch.tensor(nes).to(self.device),
                             torch.tensor(len(sucs)).to(self.device),
                         )
-                    if idx == 0:  # noqa: F405 TODO this need to keep in evaluator
+                    if self.rank == 0:  # noqa: F405 TODO this need to keep in evaluator
                         sucs.append(res['success'])
                         spls.append(res['spl'])
                         oss.append(res['os'])
                         nes.append(res['ne'])
 
         # Episode loop is now driven by env.reset() + env.is_running
-        process_bar = tqdm.tqdm(total=len(self.env.episodes), desc=f"Eval Epoch {self.epoch} Rank {idx}")
+        process_bar = tqdm.tqdm(total=len(self.env.episodes), desc=f"Eval Epoch {self.epoch} Rank {self.rank}")
         while self.env.is_running:
 
             # ------------ 1. Start of episode ------------
@@ -280,7 +279,7 @@ def _run_local_eval(self, idx=0) -> None:  # noqa: C901
             # save first frame per rank to validate sim quality
             os.makedirs(os.path.join(self.output_path, f'check_sim_{self.epoch}'), exist_ok=True)
             Image.fromarray(observations['rgb']).save(
-                os.path.join(self.output_path, f'check_sim_{self.epoch}', f'rgb_{idx}.jpg')
+                os.path.join(self.output_path, f'check_sim_{self.epoch}', f'rgb_{self.rank}.jpg')
             )
 
             vis_frames = []
diff --git a/internnav/utils/dist.py b/internnav/utils/dist.py
index 7a65599a..81634deb 100644
--- a/internnav/utils/dist.py
+++ b/internnav/utils/dist.py
@@ -221,13 +221,17 @@ def init_distributed_mode(dist_url="env://", port=29529, backend="nccl", timeout
     import socket
 
     print(f"Rank {os.getenv('RANK')} / {os.getenv('WORLD_SIZE')} on {socket.gethostname()}:{os.getenv('MASTER_PORT')}")
+    print('| distributed init (rank {}): {}, gpu {}'.format(rank, dist_url, local_rank), flush=True)
 
     # Device selection must happen before NCCL init
     torch.cuda.set_device(local_rank)
 
-    dist.init_process_group(backend=backend, init_method=dist_url, timeout=datetime.timedelta(hours=timeout_hours))
+    dist.init_process_group(
+        backend=backend, init_method=dist_url, world_size=world_size, rank=rank, timeout=datetime.timedelta(0, 7200)
+    )
     dist.barrier()
     setup_for_distributed(dist.get_rank() == 0)
+    return local_rank
 
 
 def save_model(args, epoch, model_without_ddp, optimizer, checkpoint_path):
diff --git a/scripts/eval/configs/h1_cma_cfg.py b/scripts/eval/configs/h1_cma_cfg.py
index 6b27ee93..eae6a511 100644
--- a/scripts/eval/configs/h1_cma_cfg.py
+++ b/scripts/eval/configs/h1_cma_cfg.py
@@ -47,5 +47,6 @@
             'filter_stairs': False,
         },
     ),
-    eval_settings={'save_to_json': False, 'vis_output': True},
+    eval_type='vln_multi',
+    eval_settings={'save_to_json': True, 'vis_output': False},
 )
diff --git a/scripts/eval/configs/h1_internvla_n1_cfg.py b/scripts/eval/configs/h1_internvla_n1_cfg.py
index 90a801cf..e6411e7c 100644
--- a/scripts/eval/configs/h1_internvla_n1_cfg.py
+++ b/scripts/eval/configs/h1_internvla_n1_cfg.py
@@ -69,4 +69,6 @@
             # 'selected_scans': ['8194nk5LbLH', 'pLe4wQe7qrG'],
         },
     ),
+    eval_type='vln_multi',
+    eval_settings={'save_to_json': True, 'vis_output': False},
 )
diff --git a/scripts/eval/configs/h1_rdp_cfg.py b/scripts/eval/configs/h1_rdp_cfg.py
index a6380c85..a7e02542 100644
--- a/scripts/eval/configs/h1_rdp_cfg.py
+++ b/scripts/eval/configs/h1_rdp_cfg.py
@@ -45,4 +45,6 @@
             'filter_stairs': False,
         },
     ),
+    eval_type='vln_multi',
+    eval_settings={'save_to_json': True, 'vis_output': False},
 )
diff --git a/scripts/eval/configs/h1_seq2seq_cfg.py b/scripts/eval/configs/h1_seq2seq_cfg.py
index 2934e8e7..4fd9c19b 100644
--- a/scripts/eval/configs/h1_seq2seq_cfg.py
+++ b/scripts/eval/configs/h1_seq2seq_cfg.py
@@ -45,4 +45,6 @@
             'filter_stairs': False,
         },
     ),
+    eval_type='vln_multi',
+    eval_settings={'save_to_json': True, 'vis_output': False},
 )

From 783627689df58c53f1600877f38c6afec0edf016 Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Thu, 13 Nov 2025 07:45:33 +0000
Subject: [PATCH 13/16] update init distributed mode if condition

---
 internnav/evaluator/distributed_base.py |  2 +-
 internnav/utils/dist.py                 | 20 +++++++++++---------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/internnav/evaluator/distributed_base.py b/internnav/evaluator/distributed_base.py
index c284edc0..91eeb588 100644
--- a/internnav/evaluator/distributed_base.py
+++ b/internnav/evaluator/distributed_base.py
@@ -23,9 +23,9 @@ class DistributedEvaluator(Evaluator):
     def __init__(self, cfg: EvalCfg, init_env: bool = True, init_agent: bool = True):
         # distributed setting
         self.local_rank = init_distributed_mode(dist_url=cfg.eval_settings['dist_url'], port=cfg.eval_settings['port'])
+        np.random.seed(self.local_rank)
 
         self.rank = get_rank()
-        np.random.seed(self.local_rank)
         self.world_size = get_world_size()
         self.output_path = cfg.eval_settings["output_path"]  # TODO: unsafe for distribution
 
diff --git a/internnav/utils/dist.py b/internnav/utils/dist.py
index 81634deb..1d68832b 100644
--- a/internnav/utils/dist.py
+++ b/internnav/utils/dist.py
@@ -191,14 +191,8 @@ def save_on_master(*args, **kwargs):
 
 
 def init_distributed_mode(dist_url="env://", port=29529, backend="nccl", timeout_hours=2):
-    # Fast-path: torchrun provides these
-    if all(k in os.environ for k in ["RANK", "WORLD_SIZE", "LOCAL_RANK", "MASTER_ADDR", "MASTER_PORT"]):
-        rank = int(os.environ["RANK"])
-        world_size = int(os.environ["WORLD_SIZE"])
-        local_rank = int(os.environ["LOCAL_RANK"])
-
     # SLURM path: derive env then fall back to env://
-    elif "SLURM_PROCID" in os.environ:
+    if "SLURM_PROCID" in os.environ:
         rank = int(os.environ["SLURM_PROCID"])
         world_size = int(os.environ["SLURM_NTASKS"])
         num_gpus = torch.cuda.device_count()
@@ -206,13 +200,21 @@ def init_distributed_mode(dist_url="env://", port=29529, backend="nccl", timeout
 
         # pick first node as master
         nodelist = os.environ["SLURM_NODELIST"]
+        print(f'Node list: {nodelist}')
         master_addr = subprocess.getoutput(f"scontrol show hostname {nodelist} | head -n1")
-        os.environ.setdefault("MASTER_ADDR", master_addr)
-        os.environ.setdefault("MASTER_PORT", str(port))
+
+        os.environ["MASTER_ADDR"] = master_addr
+        os.environ["MASTER_PORT"] = str(port)
         os.environ["RANK"] = str(rank)
         os.environ["WORLD_SIZE"] = str(world_size)
         os.environ["LOCAL_RANK"] = str(local_rank)
 
+    # Fast-path: torchrun provides these
+    elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
+        rank = int(os.environ["RANK"])
+        world_size = int(os.environ["WORLD_SIZE"])
+        local_rank = int(os.environ["LOCAL_RANK"])
+
     else:
         print("Not using distributed mode")
         setup_for_distributed(is_master=True)

From dac13e1bcf463a514e37c123d81915ae7b17eadd Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Thu, 13 Nov 2025 08:48:56 +0000
Subject: [PATCH 14/16] update dist for dlc

---
 internnav/utils/dist.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/internnav/utils/dist.py b/internnav/utils/dist.py
index 1d68832b..3f7f1d0b 100644
--- a/internnav/utils/dist.py
+++ b/internnav/utils/dist.py
@@ -213,7 +213,14 @@ def init_distributed_mode(dist_url="env://", port=29529, backend="nccl", timeout
     elif 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
         rank = int(os.environ["RANK"])
         world_size = int(os.environ["WORLD_SIZE"])
-        local_rank = int(os.environ["LOCAL_RANK"])
+        if "LOCAL_RANK" in os.environ:
+            local_rank = int(os.environ["LOCAL_RANK"])
+        elif "RANK" in os.environ:
+            # fallback: no LOCAL_RANK set, derive it from the global rank and the visible GPU count
+            num_gpus = torch.cuda.device_count()
+            local_rank = rank % max(1, num_gpus)
+        else:
+            local_rank = 0
 
     else:
         print("Not using distributed mode")

From d8734c746d0a3158e3c6b859bb71168fd14f31ff Mon Sep 17 00:00:00 2001
From: wangyukai <kewa990809@gmail.com>
Date: Thu, 13 Nov 2025 10:06:20 +0000
Subject: [PATCH 15/16] fix bug in evaluator

---
 internnav/internnav_habitat/habitat_vln_evaluator.py | 3 +--
 scripts/eval/configs/habitat_cfg.py                  | 6 +-----
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/internnav/internnav_habitat/habitat_vln_evaluator.py b/internnav/internnav_habitat/habitat_vln_evaluator.py
index 9fdd0152..0be81f43 100644
--- a/internnav/internnav_habitat/habitat_vln_evaluator.py
+++ b/internnav/internnav_habitat/habitat_vln_evaluator.py
@@ -153,7 +153,6 @@ def __init__(self, cfg: EvalCfg):
         self.num_future_steps = self.model_args.num_future_steps
         self.num_history = self.model_args.num_history
 
-        # ------------------------------------- old ------------------------------------------
         self._camera_height = self.sim_sensors_config.rgb_sensor.position[1]
         self._min_depth = self.sim_sensors_config.depth_sensor.min_depth
         self._max_depth = self.sim_sensors_config.depth_sensor.max_depth
@@ -172,7 +171,7 @@ def eval_action(self):
         # sucs, spls, oss, nes, ep_num = self.eval_action(self.rank)
         # Now just implement the actual eval here and return dict.
 
-        sucs, spls, oss, nes, _ = self._run_local_eval(self.rank)
+        sucs, spls, oss, nes, _ = self._run_local_eval()
 
         return {
             "sucs": sucs,  # shape [N_local]
diff --git a/scripts/eval/configs/habitat_cfg.py b/scripts/eval/configs/habitat_cfg.py
index de96107b..a079c0ca 100644
--- a/scripts/eval/configs/habitat_cfg.py
+++ b/scripts/eval/configs/habitat_cfg.py
@@ -29,15 +29,11 @@
     eval_type='habitat_vln',
     eval_settings={
         # all current parse args
-        "output_path": "./logs/habitat/test_refactor_7e25e72",  # output directory for logs/results
+        "output_path": "./logs/habitat/test",  # output directory for logs/results
         "save_video": False,  # whether to save videos
         "epoch": 0,  # epoch number for logging
         "max_steps_per_episode": 500,  # maximum steps per episode
         # distributed settings
-        "local_rank": 0,  # node rank
-        "world_size": 1,  # number of distributed processes
-        "rank": 0,  # rank of current process
-        "gpu": 0,  # gpu id to use
         "port": "2333",  # communication port
         "dist_url": "env://",  # url for distributed setup
     },

From fb21071cf02ab2fbe949da020d76583999851b88 Mon Sep 17 00:00:00 2001
From: 0309hws <huangwensi@pjlab.org.cn>
Date: Wed, 10 Dec 2025 18:49:21 +0800
Subject: [PATCH 16/16] [test] dialog+object

---
 internnav/agent/__init__.py                   |   2 +
 internnav/agent/base.py                       |   1 +
 internnav/agent/dialog_agent.py               | 506 ++++++++++++++++++
 internnav/configs/evaluator/__init__.py       |   5 +-
 internnav/env/__init__.py                     |   3 +-
 internnav/env/base.py                         |   1 +
 internnav/env/dialog_mp3d.py                  | 173 ++++++
 .../{internnav_habitat => env}/habitat_env.py |  43 +-
 internnav/evaluator/distributed_base.py       |   8 +-
 internnav/internnav_habitat/__init__.py       |   2 +-
 internnav/internnav_habitat/dialog_dataset.py |  83 +++
 .../internnav_habitat/dialog_episodes.py      |  36 ++
 internnav/internnav_habitat/dialog_utils.py   | 151 ++++++
 .../habitat_dialog_evaluator.py               | 458 ++++++++++++++++
 .../internnav_habitat/simple_npc/__init__.py  |   0
 .../internnav_habitat/simple_npc/prompt.py    |  86 +++
 .../simple_npc/scene_summary                  |   1 +
 .../simple_npc/simple_npc.py                  | 149 ++++++
 scripts/eval/bash/srun_eval_dialog.sh         |  15 +
 scripts/eval/bash/srun_eval_object.sh         |  15 +
 scripts/eval/configs/gen_videos.yaml          |  64 +++
 scripts/eval/configs/habitat_dialog_cfg.py    |  61 +++
 scripts/eval/configs/habitat_object_cfg.py    |  61 +++
 scripts/eval/configs/instance_dialog.yaml     |  79 +++
 scripts/eval/configs/objectnav_hm3d.yaml      |  83 +++
 25 files changed, 2076 insertions(+), 10 deletions(-)
 create mode 100644 internnav/agent/dialog_agent.py
 create mode 100644 internnav/env/dialog_mp3d.py
 rename internnav/{internnav_habitat => env}/habitat_env.py (64%)
 create mode 100644 internnav/internnav_habitat/dialog_dataset.py
 create mode 100644 internnav/internnav_habitat/dialog_episodes.py
 create mode 100644 internnav/internnav_habitat/dialog_utils.py
 create mode 100644 internnav/internnav_habitat/habitat_dialog_evaluator.py
 create mode 100644 internnav/internnav_habitat/simple_npc/__init__.py
 create mode 100644 internnav/internnav_habitat/simple_npc/prompt.py
 create mode 120000 internnav/internnav_habitat/simple_npc/scene_summary
 create mode 100644 internnav/internnav_habitat/simple_npc/simple_npc.py
 create mode 100755 scripts/eval/bash/srun_eval_dialog.sh
 create mode 100755 scripts/eval/bash/srun_eval_object.sh
 create mode 100644 scripts/eval/configs/gen_videos.yaml
 create mode 100755 scripts/eval/configs/habitat_dialog_cfg.py
 create mode 100755 scripts/eval/configs/habitat_object_cfg.py
 create mode 100644 scripts/eval/configs/instance_dialog.yaml
 create mode 100644 scripts/eval/configs/objectnav_hm3d.yaml

diff --git a/internnav/agent/__init__.py b/internnav/agent/__init__.py
index 74aa4bcd..8eba1daa 100644
--- a/internnav/agent/__init__.py
+++ b/internnav/agent/__init__.py
@@ -1,4 +1,5 @@
 from internnav.agent.base import Agent
+from internnav.agent.dialog_agent import DialogAgent
 from internnav.agent.cma_agent import CmaAgent
 from internnav.agent.rdp_agent import RdpAgent
 from internnav.agent.seq2seq_agent import Seq2SeqAgent
@@ -6,6 +7,7 @@
 
 __all__ = [
     'Agent',
+    'DialogAgent',
     'CmaAgent',
     'RdpAgent',
     'Seq2SeqAgent',
diff --git a/internnav/agent/base.py b/internnav/agent/base.py
index a68626f6..02a566d5 100644
--- a/internnav/agent/base.py
+++ b/internnav/agent/base.py
@@ -25,6 +25,7 @@ def decorator(agent_class):
             if agent_type in cls.agents:
                 raise ValueError(f"Agent {agent_type} already registered.")
             cls.agents[agent_type] = agent_class
+            return agent_class
 
         return decorator
 
diff --git a/internnav/agent/dialog_agent.py b/internnav/agent/dialog_agent.py
new file mode 100644
index 00000000..c6e95e57
--- /dev/null
+++ b/internnav/agent/dialog_agent.py
@@ -0,0 +1,506 @@
+import os
+import re
+import time
+import copy
+import random
+import argparse
+import itertools
+import numpy as np
+from typing import Any, Dict
+from collections import OrderedDict
+import quaternion
+from PIL import Image, ImageDraw, ImageFont
+
+import torch
+from transformers import (
+AutoTokenizer,
+AutoProcessor,
+Qwen2_5_VLForConditionalGeneration,
+)
+
+from internnav.agent import Agent
+from internnav.configs.agent import AgentCfg
+from internnav.configs.evaluator import TaskCfg
+
+try:
+    import habitat
+    from habitat.config.default import get_agent_config
+    from habitat.config.default_structured_configs import (
+        CollisionsMeasurementConfig,
+        FogOfWarConfig,
+        TopDownMapMeasurementConfig,
+    )
+    from habitat.utils.visualizations.utils import observations_to_image
+    from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower
+    from depth_camera_filtering import filter_depth
+except Exception as e:
+    print(f"Warning: ({e}), Habitat Evaluation is not loaded in this runtime. Ignore this if not using Habitat.")
+
+DEFAULT_IMAGE_TOKEN = "<image>"
+
+def split_and_clean(text):
+    """Split ``text`` on ``<image>`` markers, keeping each marker as its own item.
+
+    Text between markers has every newline removed and surrounding whitespace
+    stripped; pieces that end up empty are dropped from the returned list.
+    """
+    results = []
+    for part in re.split(r'(<image>)', text):  # capture group keeps the delimiter
+        if part != '<image>':
+            # Plain text: drop newlines, then trim both ends.
+            part = part.replace('\n', '').strip()
+        if part:  # skip empty strings ('<image>' itself is always kept)
+            results.append(part)
+    return results
+    
+@Agent.register('dialog')
+class DialogAgent(Agent):
+    """
+    agent template, override the functions for custom policy
+    """
+
+    def __init__(self, agent_config: AgentCfg, task_config: TaskCfg, local_rank: int):
+        """Load the processor and VLM onto ``cuda:local_rank`` and build the prompt templates."""
+        self.agent_config = agent_config
+        self.task_config = task_config
+
+        # sensor config (read from model_settings['sim_sensors_config'])
+        self.sim_sensors_config = self.agent_config.model_settings['sim_sensors_config']
+        self._camera_height = self.sim_sensors_config.rgb_sensor.position[1]
+        self._min_depth = self.sim_sensors_config.depth_sensor.min_depth
+        self._max_depth = self.sim_sensors_config.depth_sensor.max_depth
+        self._camera_fov = np.deg2rad(self.sim_sensors_config.depth_sensor.hfov)
+        self._fx = self._fy = self.sim_sensors_config.depth_sensor.width / (2 * np.tan(self._camera_fov / 2))
+
+        # model: expose every model_settings key as an attribute
+        self.model_args = argparse.Namespace(**self.agent_config.model_settings)
+
+        self.task = self.model_args.task
+        self.append_look_down = self.model_args.append_look_down
+        self.resize_h = self.model_args.resize_h
+        self.resize_w = self.model_args.resize_w
+        # Processor checkpoint: optional model_settings['processor_path'], else the historical default path.
+        tokenizer = AutoTokenizer.from_pretrained(self.model_args.model_path, use_fast=True)
+        processor = AutoProcessor.from_pretrained(getattr(self.model_args, 'processor_path', "/mnt/inspurfs/efm_t/weimeng/Qwen2.5-VL-7B-Instruct"))
+        processor.tokenizer = tokenizer
+        processor.tokenizer.padding_side = 'left'
+
+        self.device = torch.device('cuda', local_rank)
+        if self.model_args.mode == 'dual_system':  # NOTE(review): InternVLAN1ForCausalLM is not imported in this file
+            model = InternVLAN1ForCausalLM.from_pretrained(
+                self.model_args.model_path,
+                torch_dtype=torch.bfloat16,
+                attn_implementation="flash_attention_2",
+                device_map={"": self.device},
+            )
+        elif self.model_args.mode == 'system2':
+            model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                self.model_args.model_path,
+                torch_dtype=torch.bfloat16,
+                attn_implementation="flash_attention_2",
+                device_map={"": self.device},
+            )
+        else:
+            raise ValueError(f"Invalid mode: {self.model_args.mode}")
+
+        model.eval()
+
+        self.model = model
+        self.processor = processor
+        self.num_history = self.model_args.num_history
+
+        # prompt: the dialog variant additionally tells the model it may talk to an oracle
+        if 'dialog' in self.task_config.task_name or self.agent_config.model_settings['dialog_enabled']:
+            prompt = f"You are an autonomous navigation assistant. Your task is to <instruction>. Where should you go next to stay on track? There is an oracle can help you to complete the task in current environment, you can either choose to move or talk. If choosing to talk, please say something that can help you better to find the target object. If choosing to move, when you want to output a waypoint you need to TILT DOWN (↓) by 30 degrees then output the next waypoint\'s coordinates in the image. In case the next waypoint is out of view, utilize the turn actions: TURN LEFT (←) or TURN RIGHT (→) by 15 degrees. Please output STOP when you have successfully completed the task."
+        else:
+            prompt = f"You are an autonomous navigation assistant. Your task is to <instruction>. Where should you go next to stay on track? When you want to output a waypoint you need to TILT DOWN (↓) by 30 degrees then output the next waypoint\'s coordinates in the image. In case the next waypoint is out of view, utilize the turn actions: TURN LEFT (←) or TURN RIGHT (→) by 15 degrees. Please output STOP when you have successfully completed the task."
+        self.conversation = [{"from": "human", "value": prompt}, {"from": "gpt", "value": ""}]
+
+        self.conjunctions = [
+            'you can see ',
+            'in front of you is ',
+            'there is ',
+            'you can spot ',
+            'you are toward the ',
+            'ahead of you is ',
+            'in your sight is ',
+        ]
+
+        self.actions2idx = OrderedDict({
+            'STOP': [0],
+            "↑": [1],
+            "←": [2],
+            "→": [3],
+            "↓": [5],
+        })
+        
+    def convert_input(self, obs, info):
+        """Cache depth, camera pose and RGB history from the newest observation; returns ``obs`` unchanged."""
+        # update new information after env.step
+        depth = obs["depth"]
+        depth = filter_depth(depth.reshape(depth.shape[:2]), blur_type=None)
+        depth = depth * (self._max_depth - self._min_depth) + self._min_depth
+        self.depth = depth * 1000  # stored x1000; convert_output divides by 1000 again before use
+
+        image = Image.fromarray(obs["rgb"]).convert('RGB')  # raw observation image
+        x, y = obs["gps"]
+        camera_yaw = obs["compass"][0]
+        agent_state = info['agent state']
+        height = agent_state.position[1] - self.initial_height  # height change since reset()
+        # Habitat GPS makes west negative, so flip y
+        camera_position = np.array([x, -y, self._camera_height + height])
+        # camera -> episodic transform, built with a fixed 30 degree pitch
+        self.tf_camera_to_episodic = (
+            self.xyz_yaw_pitch_to_tf_matrix(camera_position, camera_yaw, np.deg2rad(30)) @ self.get_axis_align_matrix()
+        )
+
+        if self.last_action == 5:
+            self.look_down_image = image
+            self.save_raw_image = self.look_down_image.copy()
+        elif self.last_action != 6:
+            image = image.resize((self.resize_w, self.resize_h))
+            self.rgb_list.append(image)
+        return obs
+
+    def convert_output(self, env, llm_outputs: str):
+        """Turn the raw model text into a talk action, a pixel goal, or a queued action sequence.
+
+        Returns 6 for ``<talk>``, 0 when the pixel goal is unusable, 2 when the path follower has
+        no move toward the new goal, and ``None`` when ``step`` should consume ``goal``/``action_seq``.
+        """
+        if '<talk>' in llm_outputs:
+            self.question = llm_outputs.replace('<talk>', '')
+            return 6
+        if not re.search(r'\d', llm_outputs):  # no digits: the reply is a list of discrete actions
+            self.action_seq = self.parse_actions(llm_outputs)
+            print('actions', self.action_seq, flush=True)
+            return None
+        self.forward_action = 0
+        coord = [int(c) for c in re.findall(r'\d+', llm_outputs)]
+        print('coords:', coord)
+        try:
+            pixel_goal = [coord[1], coord[0]]  # swapped so that pixel_to_gps receives (v, u)
+        except IndexError:
+            print("Invalid pixel goal: len(coor)!=2")
+            return 0
+        # trans pixel goal to global goal
+        try:
+            self.goal = self.pixel_to_gps(pixel_goal, self.depth / 1000, self.intrinsic_matrix, self.tf_camera_to_episodic)
+        except Exception:  # best-effort as before, but no longer swallows KeyboardInterrupt/SystemExit
+            print("Invalid pixel goal: out of image size range")
+            return 0
+        self.goal = (self.transformation_matrix @ np.array([-self.goal[1], 0, -self.goal[0], 1]))[:3]
+        if not env._env.sim.pathfinder.is_navigable(np.array(self.goal)):
+            self.goal = np.array(env._env.sim.pathfinder.snap_point(np.array(self.goal)))
+        # paint pixel goal (save_raw_image stays None until a look-down frame has been captured)
+        if self.save_raw_image is not None:
+            draw = ImageDraw.Draw(self.save_raw_image, 'RGB')
+            x, y, r = pixel_goal[0], pixel_goal[1], 2
+            draw.ellipse([(x - r, y - r), (x + r, y + r)], fill=(255, 0, 0))
+        # look down --> horizontal
+        env.step(4)
+        env.step(4)
+        if self.append_look_down and self.look_down_image is not None:
+            self.prev_look_image = self.look_down_image.resize((self.resize_w, self.resize_h))
+        action = self.agent.get_next_action(self.goal)
+        if action == 0:
+            self.goal = None
+            self.messages = []
+            print('conduct a random action 2')
+            self.last_action = 2
+            return 2
+        print('predicted goal', pixel_goal, self.goal, flush=True)
+
+    def inference(self, obs, info):
+        """Extend ``self.messages`` with the newest turn and return the model's decoded reply (``do_sample=False``).
+        Branches on ``self.last_action``: 6 = a talk turn just happened, 5 = a look-down just happened, else a fresh prompt."""
+        if self.last_action == 6:
+            self.dialogs.append({'role': 'navigator', 'message': self.question, 'true_idx': self.step_id})
+            self.dialogs.append({'role': 'oracle', 'message': obs['npc_answer'], 'true_idx': self.step_id})
+            self.messages.append({
+                'role': 'assistant',
+                'content': [
+                    {
+                        'type': 'text',
+                        'text': self.last_llm_outputs
+                    }
+                ]
+            })
+            self.messages.append({
+                            'role': 'user',
+                            'content': [
+                                {
+                                    'type': 'text',
+                                    'text': obs['npc_answer']
+                                }
+                            ]
+                        })
+        elif self.last_action == 5:
+            sources = [{"from": "human", "value": ""}, {"from": "gpt", "value": ""}]
+            self.input_images += [self.look_down_image]
+            self.messages.append({
+                            'role': 'assistant',
+                            'content': [
+                                {
+                                    'type': 'text',
+                                    'text': self.last_llm_outputs
+                                }
+                            ]
+                        })
+            input_img_id = -1  # the prompt's single <image> maps to the look-down frame appended above
+        else:
+            sources = copy.deepcopy(self.conversation)
+            sources[0]["value"] = sources[0]["value"].replace('<instruction>', info['episode_instruction'])
+            cur_images = self.rgb_list[-1:]   # current observation
+            if self.step_id == 0:
+                history_id = []
+            else:
+                history_id = np.unique(np.linspace(0, self.step_id - 1, self.num_history, dtype=np.int32)).tolist()
+                # add dialog history: steps at which a dialog happened are always part of the history
+                dialogs_idx = np.sort(list(set([i['true_idx'] for i in self.dialogs]))).tolist()
+                history_id = np.sort(
+                    np.unique(
+                        np.concatenate([
+                            history_id,
+                            dialogs_idx
+                        ]).astype(np.int32)
+                    )
+                ).tolist()
+                placeholder = [''] * (len(history_id)+1)
+                for n in dialogs_idx:
+                    pos = history_id.index(n)
+                    output = ""
+                    for dialog in self.dialogs:
+                        if dialog['true_idx'] == n:
+                            output += f"<|{dialog['role']}|>{dialog['message']}"
+                    placeholder[pos+1] = "<|dialog_start|>" + output + "<|dialog_end|>"
+                # add image history: joining len(history_id)+1 entries yields one <image> token per history frame
+                placeholder = (DEFAULT_IMAGE_TOKEN + '\n').join(placeholder)
+                sources[0]["value"] += f' These are your historical observations: {placeholder}.'
+                if self.append_look_down:
+                    if self.prev_look_image is not None:
+                        sources[0]["value"] += f' Your previous look down image is:{DEFAULT_IMAGE_TOKEN}.'
+                    else:
+                        sources[0]["value"] += f' Your previous look down image is not here.'
+            history_id = sorted(history_id)
+            print('history_id', self.step_id, history_id)
+            # prepare images in the same order as the <image> tokens in the prompt
+            if self.append_look_down:
+                if self.prev_look_image is not None:
+                    self.input_images = [self.rgb_list[i] for i in history_id] + [self.prev_look_image] + cur_images
+                else:
+                    self.input_images = [self.rgb_list[i] for i in history_id] + cur_images
+            else:
+                self.input_images = [self.rgb_list[i] for i in history_id] + cur_images
+            input_img_id = 0
+
+        if self.last_action != 6:
+            # prompt text: a randomly chosen lead-in phrase followed by the current image token
+            prompt = random.choice(self.conjunctions) + DEFAULT_IMAGE_TOKEN
+            sources[0]["value"] += f" {prompt}."
+            prompt_instruction = copy.deepcopy(sources[0]["value"])
+            # prompt images: replace each <image> token with the next entry of self.input_images
+            parts = split_and_clean(prompt_instruction)
+            content = []
+            for i in range (len(parts)):
+                if parts[i] == "<image>":
+                    content.append({"type": "image", "image": self.input_images[input_img_id]})
+                    input_img_id +=1
+                else:
+                    content.append({"type": "text", "text": parts[i]})
+            self.messages.append({
+                                'role': 'user',
+                                'content': content
+                            })
+        # inference
+        text = self.processor.apply_chat_template(
+            self.messages,tokenize=False, add_generation_prompt=True
+        )
+        print('step_id', self.step_id, ' ', text)
+        # for image_idx, input_image in enumerate(self.input_images):
+        #     input_image.save(os.path.join('/'.join(info['output_path'].split('/')[:-3]), 'debug_images', f'image_{image_idx}.jpg'))
+        inputs = self.processor(text=[text], images=self.input_images, return_tensors="pt").to(self.model.device)
+        with torch.no_grad():
+            output_ids = self.model.generate(**inputs, max_new_tokens=self.agent_config.model_settings['max_new_tokens'], do_sample=False)
+        llm_outputs = self.processor.tokenizer.decode(output_ids[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+        print('step_id:', self.step_id, 'output text:', llm_outputs)
+        return llm_outputs
+
+    def step(self, obs: Dict[str, Any], env, info):
+        """Return the next action id: query the model only when no queued actions and no goal remain."""
+        print(f'{self.agent_config.model_name} Agent step')
+        start = time.time()
+        self.step_id = info['step']
+        obs = self.convert_input(obs, info)  # convert obs to model input
+        if len(self.action_seq) == 0 and self.goal is None:
+            llm_outputs = self.inference(obs, info)
+            self.last_llm_outputs = llm_outputs
+            action = self.convert_output(env, llm_outputs)
+            with open(info['output_path'], 'a') as f:
+                f.write(str(self.step_id) + " " + llm_outputs + "\n")
+        else:
+            action = None
+
+        if action is None:
+            if len(self.action_seq) != 0:
+                action = self.action_seq.pop(0)
+            elif self.goal is not None:
+                action = self.agent.get_next_action(self.goal)
+                action = action.detach().cpu().numpy()[0] if isinstance(action, torch.Tensor) else action
+                action = action[0] if hasattr(action, "__len__") else action
+
+                self.forward_action +=1
+                print('forward_action', self.forward_action, flush=True)
+                if self.forward_action > 8:
+                    self.goal =  None
+                    self.messages = []
+                    self.forward_action =0
+                    end = time.time()
+                    print(f'time: {round(end-start, 4)}s')
+                    return 7  # NOTE(review): returns without updating self.last_action -- confirm intended
+                if action == 0:
+                    self.goal = None
+                    self.messages = []
+                    end = time.time()
+                    print(f'time: {round(end-start, 4)}s')
+                    return 7  # NOTE(review): same early return as above; meaning of 7 is defined by the caller
+            else:
+                action = 0
+
+        end = time.time()
+        print(f'time: {round(end-start, 4)}s')
+        self.last_action = action
+        return action
+
+    def reset(self, env):
+        """Re-create the path follower, capture the start pose and clear all per-episode state."""
+        self.intrinsic_matrix = self.get_intrinsic_matrix(self.sim_sensors_config.rgb_sensor)
+        sim = env._env.sim
+        self.agent = ShortestPathFollower(sim, 0.25, False)
+
+        # Start pose as a 4x4 matrix: agent rotation in the top-left 3x3, agent position in the last column.
+        start_state = sim.get_agent_state()
+        pose = np.eye(4)
+        pose[:3, :3] = quaternion.as_rotation_matrix(start_state.rotation)
+        pose[:3, 3] = start_state.position
+        self.transformation_matrix = pose
+        self.initial_height = start_state.position[1]  # initial height
+
+        # Per-episode buffers used by the model loop.
+        self.last_action = None
+        self.messages = []
+        self.rgb_list = []
+        self.action_seq = []
+        self.goal = None
+        self.prev_look_image = None
+        self.look_down_image = None
+        self.dialogs = []
+        self.save_raw_image = None  # copy of the look-down frame kept for saving
+
+
+    def get_intrinsic_matrix(self, sensor_cfg) -> np.ndarray:
+        """Build a 4x4 pinhole intrinsic matrix from ``sensor_cfg`` (``width``, ``height``, ``hfov`` in degrees).
+
+        The same focal length is used for x and y (square pixels assumed) and the
+        principal point sits at the image centre.
+        """
+        half_fov = np.deg2rad(sensor_cfg.hfov / 2.0)
+        focal = (sensor_cfg.width / 2.0) / np.tan(half_fov)
+
+        intrinsic = np.eye(4)
+        intrinsic[0, 0] = focal
+        intrinsic[1, 1] = focal
+        intrinsic[0, 2] = (sensor_cfg.width - 1.0) / 2.0
+        intrinsic[1, 2] = (sensor_cfg.height - 1.0) / 2.0
+
+        return intrinsic
+
+    def get_axis_align_matrix(self):
+        # Constant 4x4 axis permutation; convert_input right-multiplies the camera pose by it.
+        return np.array([[0, 0, 1, 0], [-1, 0, 0, 0], [0, -1, 0, 0], [0, 0, 0, 1]])
+    
+    def xyz_yaw_to_tf_matrix(self, xyz: np.ndarray, yaw: float) -> np.ndarray:
+        """Return the 4x4 transform with translation ``xyz`` and a rotation of ``yaw`` radians about z."""
+        tx, ty, tz = xyz
+        cos_yaw, sin_yaw = np.cos(yaw), np.sin(yaw)
+        rows = [
+            [cos_yaw, -sin_yaw, 0, tx],
+            [sin_yaw, cos_yaw, 0, ty],
+            [0, 0, 1, tz],
+            [0, 0, 0, 1],
+        ]
+        return np.array(rows)
+
+    def xyz_pitch_to_tf_matrix(self, xyz: np.ndarray, pitch: float) -> np.ndarray:
+        """Build a 4x4 transform from a position and a rotation about the y axis.
+
+        Args:
+            xyz (np.ndarray): A 3D vector representing the position.
+            pitch (float): Rotation angle about the y axis, in radians.
+        Returns:
+            np.ndarray: A 4x4 transformation matrix.
+        """
+
+        tx, ty, tz = xyz
+        cos_pitch = np.cos(pitch)
+        sin_pitch = np.sin(pitch)
+        rows = [
+            [cos_pitch, 0, sin_pitch, tx],
+            [0, 1, 0, ty],
+            [-sin_pitch, 0, cos_pitch, tz],
+            [0, 0, 0, 1],
+        ]
+        return np.array(rows)
+    
+    def xyz_yaw_pitch_to_tf_matrix(self, xyz: np.ndarray, yaw: float, pitch: float) -> np.ndarray:
+        """Compose a yaw rotation and a pitch rotation into one 4x4 pose located at ``xyz``.
+
+        Args:
+            xyz (np.ndarray): A 3D vector representing the position.
+            yaw (float): The yaw angle in radians (rotation about z).
+            pitch (float): The pitch angle in radians (rotation about y).
+        Returns:
+            np.ndarray: 4x4 matrix whose rotation block is ``R_yaw @ R_pitch`` and whose
+            translation column is ``xyz``.
+        """
+        yaw_rotation = self.xyz_yaw_to_tf_matrix(xyz, yaw)[:3, :3]
+        pitch_rotation = self.xyz_pitch_to_tf_matrix(xyz, pitch)[:3, :3]
+        pose = np.eye(4)
+        pose[:3, :3] = yaw_rotation @ pitch_rotation
+        pose[:3, 3] = xyz
+        return pose
+
+    def pixel_to_gps(self, pixel, depth, intrinsic, tf_camera_to_episodic):
+        '''Back-project one pixel through the depth map and express it in the episodic frame.
+        Args:
+            pixel: (2,) - [v, u] pixel coordinates, i.e. (row, column) -- unpacked as ``v, u`` below
+            depth: (H, W) - depth image where depth[v, u] gives depth in meters
+            intrinsic: (4, 4) - camera intrinsic matrix
+            tf_camera_to_episodic: (4, 4) - transformation from camera to episodic frame
+        Returns:
+            (x, y): (x, y) coordinates in the episodic frame
+        '''
+        v, u = pixel
+        z = depth[v, u]
+        print("depth", z)
+
+        x = (u - intrinsic[0, 2]) * z / intrinsic[0, 0]
+        y = (v - intrinsic[1, 2]) * z / intrinsic[1, 1]
+        point_camera = np.array([x, y, z, 1.0])
+
+        # Transform to episodic frame
+        point_episodic = tf_camera_to_episodic @ point_camera
+        point_episodic = point_episodic[:3] / point_episodic[3]
+
+        x = point_episodic[0]
+        y = point_episodic[1]
+
+        return (x, y) # same as habitat gps
+
+    def parse_actions(self, output):
+        """Return the action ids for every action symbol found in ``output``, in order of appearance."""
+        symbol_re = re.compile('|'.join(map(re.escape, self.actions2idx)))
+        action_ids = []
+        for symbol in symbol_re.findall(output):
+            action_ids.extend(self.actions2idx[symbol])
+        return action_ids
\ No newline at end of file
diff --git a/internnav/configs/evaluator/__init__.py b/internnav/configs/evaluator/__init__.py
index 27e63a31..fc3f7e7c 100644
--- a/internnav/configs/evaluator/__init__.py
+++ b/internnav/configs/evaluator/__init__.py
@@ -39,8 +39,8 @@ class MetricCfg(BaseModel):
 
 class TaskCfg(BaseModel):
     task_name: Optional[str] = None
-    task_settings: Dict[str, Any]
-    scene: SceneCfg
+    task_settings: Dict[str, Any] = None
+    scene: SceneCfg = None
     robot_name: Optional[str] = None
     robot: Optional[RobotCfg] = None
     robot_flash: Optional[bool] = None
@@ -56,6 +56,7 @@ class EvalDatasetCfg(BaseModel):
 
 
 class EvalCfg(BaseModel):
+    remote_agent: Optional[bool] = None
     eval_type: Optional[str] = None
     eval_settings: Optional[Dict[str, Any]] = {}
     agent: Optional[AgentCfg] = None
diff --git a/internnav/env/__init__.py b/internnav/env/__init__.py
index 798723ec..3342c595 100644
--- a/internnav/env/__init__.py
+++ b/internnav/env/__init__.py
@@ -1,4 +1,5 @@
 from internnav.env.base import Env
 from internnav.env.internutopia_env import InternutopiaEnv
+from internnav.env.habitat_env import HabitatEnv
 
-__all__ = ['Env', 'InternutopiaEnv']
+__all__ = ['Env', 'InternutopiaEnv', 'HabitatEnv']
diff --git a/internnav/env/base.py b/internnav/env/base.py
index a5b9d6ee..5e2a0199 100644
--- a/internnav/env/base.py
+++ b/internnav/env/base.py
@@ -42,6 +42,7 @@ def decorator(env_class):
             if env_type in cls.envs:
                 raise ValueError(f"Env {env_type} already registered.")
             cls.envs[env_type] = env_class
+            return env_class
 
         return decorator
 
diff --git a/internnav/env/dialog_mp3d.py b/internnav/env/dialog_mp3d.py
new file mode 100644
index 00000000..33b140ee
--- /dev/null
+++ b/internnav/env/dialog_mp3d.py
@@ -0,0 +1,173 @@
+import cv2
+import numpy as np
+from typing import List, Tuple, Union
+
+def fill_small_holes(depth_img: np.ndarray, area_thresh: int) -> np.ndarray:
+    """
+    Set small zero-valued regions of a depth image to 1.
+
+    Zero pixels are grouped into contours; each contour whose area is below
+    ``area_thresh`` is filled, and every filled pixel is written as 1 in the result.
+    All other pixels keep their input value.
+
+    Args:
+        depth_img (np.ndarray): The input depth image
+        area_thresh (int): Contours with an area strictly below this are filled
+
+    Returns:
+        np.ndarray: The depth image with small holes set to 1
+    """
+    # Holes are exactly the zero-valued pixels
+    hole_mask = (depth_img == 0).astype("uint8")
+
+    # Trace the outline of every hole region
+    contours, _ = cv2.findContours(hole_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+
+    # Paint each small contour (thickness -1 means filled) onto a blank canvas,
+    # one contour per call
+    fill_canvas = np.zeros_like(hole_mask)
+    small_contours = [c for c in contours if cv2.contourArea(c) < area_thresh]
+    for contour in small_contours:
+        cv2.drawContours(fill_canvas, [contour], 0, 1, -1)
+
+    # Painted pixels become 1; everything else is taken from the input
+    return np.where(fill_canvas == 1, 1, depth_img)
+
+class MP3DGTPerception:
+    """Ground-truth target segmentation computed from a depth frame and target bounding boxes."""
+    def __init__(self, max_depth, min_depth, fx, fy):
+        self.max_depth = max_depth
+        self.min_depth = min_depth
+        self.fx = fx
+        self.fy = fy
+
+    def predict(self, depth, targets, tf_camera_to_ply, area_threshold=2500):
+        '''
+        Get the gt semantic map of the target objects.
+
+        depth: (H, W) current depth frame; it is rescaled here as depth * (max_depth - min_depth) + min_depth
+        targets: (N, 6) bboxes of the target objects, first 3 are coordinates of min corner, last 3 are coordinates of max corner
+        tf_camera_to_ply: (4, 4) transform from the camera frame to the frame the bboxes are expressed in
+        area_threshold: int, hole-filling threshold forwarded to fill_small_holes
+        return: (N, H, W) uint8 gt semantic map of the target objects; (1, H, W) zeros when targets is empty
+        '''
+        # get the point clouds of current frame
+        filled_depth = fill_small_holes(depth, area_threshold)
+        scaled_depth = filled_depth * (self.max_depth - self.min_depth) + self.min_depth
+        in_range = scaled_depth < self.max_depth
+        point_cloud_camera_frame = get_point_cloud(scaled_depth, in_range, self.fx, self.fy)
+        point_cloud_ply_frame = transform_points(tf_camera_to_ply, point_cloud_camera_frame)
+
+        # mark the points in the target objects' bboxes
+        semantic_images = []
+        for target in targets:
+            min_x, min_y, min_z = target[:3]
+            max_x, max_y, max_z = target[3:]
+
+            in_bbox = (
+                (point_cloud_ply_frame[:, 0] >= min_x) &
+                (point_cloud_ply_frame[:, 0] <= max_x) &
+                (point_cloud_ply_frame[:, 1] >= min_y) &
+                (point_cloud_ply_frame[:, 1] <= max_y) &
+                (point_cloud_ply_frame[:, 2] >= min_z) &
+                (point_cloud_ply_frame[:, 2] <= max_z)
+            )
+            in_bbox_points = point_cloud_ply_frame[in_bbox]
+            semantic_image = np.zeros(depth.shape, dtype=np.uint8)
+            if len(in_bbox_points) > 0:
+                # map the marked points back to the image to get the semantic map
+                in_bbox_camera_frame = inverse_transform_points(tf_camera_to_ply, in_bbox_points)
+                coords = project_points_to_image(in_bbox_camera_frame, self.fx, self.fy, depth.shape)
+                # Keep only projections inside the frame: negative indices would silently wrap
+                # around and too-large ones would raise IndexError.
+                rows, cols = coords[:, 0], coords[:, 1]
+                inside = (rows >= 0) & (rows < depth.shape[0]) & (cols >= 0) & (cols < depth.shape[1])
+                semantic_image[rows[inside], cols[inside]] = 1
+                semantic_image = fill_small_holes(semantic_image, area_threshold)
+            semantic_images.append(semantic_image)
+        if len(semantic_images) > 0:
+            semantic_images = np.stack(semantic_images, axis=0)
+        else:
+            semantic_images = np.zeros((1, depth.shape[0], depth.shape[1]), dtype=np.uint8)
+        return semantic_images
+
+def transform_points(transformation_matrix: np.ndarray, points: np.ndarray) -> np.ndarray:
+    """Apply a 4x4 homogeneous transform to an (N, 3) array of points and return (N, 3)."""
+    count = points.shape[0]
+    # Append w = 1 so the translation column takes part in the matrix product.
+    homogeneous = np.concatenate([points, np.ones((count, 1))], axis=1)
+    mapped = (transformation_matrix @ homogeneous.T).T
+    # Divide by w, then drop it.
+    w = mapped[:, 3:]
+    return mapped[:, :3] / w
+
+
+def get_point_cloud(depth_image: np.ndarray, mask: np.ndarray, fx: float, fy: float) -> np.ndarray:
+    """Back-project the masked pixels of a depth image into 3D camera-frame points.
+
+    The principal point is the integer image centre (``width // 2``, ``height // 2``).
+    Image y and depth are negated in the output, so each row is ``(x, -y, -z)``.
+
+    Args:
+        depth_image (np.ndarray): 2D depth image.
+        mask (np.ndarray): 2D mask; only pixels where it is non-zero are used.
+        fx (float): Focal length in the x direction.
+        fy (float): Focal length in the y direction.
+
+    Returns:
+        np.ndarray: (N, 3) array with one point per selected pixel.
+    """
+    rows, cols = np.nonzero(mask)
+    depths = depth_image[rows, cols]
+    lateral = (cols - depth_image.shape[1] // 2) * depths / fx
+    vertical = (rows - depth_image.shape[0] // 2) * depths / fy
+
+    return np.stack((lateral, -vertical, -depths), axis=-1)
+
+def inverse_transform_points(transformation_matrix: np.ndarray, points: np.ndarray) -> np.ndarray:
+    """Map points back through the inverse of a 4x4 homogeneous transform.
+
+    Args:
+        transformation_matrix (np.ndarray): 4x4 transformation matrix (must be invertible)
+        points (np.ndarray): Point cloud coordinates (N, 3) in the transform's target frame
+
+    Returns:
+        np.ndarray: Point cloud coordinates (N, 3) in the transform's source frame
+    """
+    inverse = np.linalg.inv(transformation_matrix)
+
+    # Lift to homogeneous coordinates (w = 1) so the translation is applied too
+    ones = np.ones((points.shape[0], 1))
+    homogeneous = np.concatenate([points, ones], axis=1)
+
+    # Row-wise application of the inverse transform
+    mapped = (inverse @ homogeneous.T).T
+
+    # Divide by w and drop it
+    return mapped[:, :3] / mapped[:, 3:]
+
+def project_points_to_image(points: np.ndarray, fx: float, fy: float, image_shape: tuple) -> np.ndarray:
+    """Project camera-frame points onto the image plane as integer (row, column) pairs.
+
+    Points use the ``(x, -y, -z)`` convention produced by ``get_point_cloud``; y and z are
+    negated back before projecting. Points that are not strictly in front of the camera are
+    dropped *before* the perspective divide, so no division by zero takes place.
+
+    Args:
+        points (np.ndarray): Points in camera coordinate system (N, 3)
+        fx (float): x-axis focal length
+        fy (float): y-axis focal length
+        image_shape (tuple): Image dimensions (height, width)
+
+    Returns:
+        np.ndarray: (M, 2) int32 image coordinates ordered (v, u), M <= N. Coordinates may
+        fall outside the image; callers must bounds-check them.
+    """
+    flipped = np.stack((points[:, 0], -points[:, 1], -points[:, 2]), axis=-1)
+    # Keep only points in front of the camera (z > 0)
+    front = flipped[flipped[:, 2] > 0]
+
+    # Pinhole projection around the integer image centre
+    u = front[:, 0] * fx / front[:, 2] + image_shape[1] // 2
+    v = front[:, 1] * fy / front[:, 2] + image_shape[0] // 2
+    return np.stack((v, u), axis=-1).astype(np.int32)
\ No newline at end of file
diff --git a/internnav/internnav_habitat/habitat_env.py b/internnav/env/habitat_env.py
similarity index 64%
rename from internnav/internnav_habitat/habitat_env.py
rename to internnav/env/habitat_env.py
index 1b0f3f43..abccca00 100644
--- a/internnav/internnav_habitat/habitat_env.py
+++ b/internnav/env/habitat_env.py
@@ -1,14 +1,19 @@
 import json
 import os
+import quaternion
+import numpy as np
 from typing import Any, Dict, List, Optional
 
+from habitat.config.default import get_agent_config
+
+from depth_camera_filtering import filter_depth
 from internnav.configs.evaluator import EnvCfg, TaskCfg
 from internnav.env import base
-
+from .dialog_mp3d import MP3DGTPerception
 
 @base.Env.register('habitat')
 class HabitatEnv(base.Env):
-    def __init__(self, env_config: EnvCfg, task_config: TaskCfg):
+    def __init__(self, env_config: EnvCfg, task_config: TaskCfg= None):
         """
         env_settings include:
             - habitat_config: loaded from get_habitat_config
@@ -23,7 +28,6 @@ def __init__(self, env_config: EnvCfg, task_config: TaskCfg):
             ) from e
 
         super().__init__(env_config, task_config)
-
         self.config = env_config.env_settings['habitat_config']
         self._env = Env(self.config)
 
@@ -35,6 +39,14 @@ def __init__(self, env_config: EnvCfg, task_config: TaskCfg):
         self.is_running = True
         self.output_path = env_config.env_settings.get('output_path', './output')
 
+        agent_config = get_agent_config(self.config.simulator)
+        self.min_depth = agent_config.sim_sensors.depth_sensor.min_depth
+        self.max_depth = agent_config.sim_sensors.depth_sensor.max_depth
+        self._camera_fov = np.deg2rad(agent_config.sim_sensors.depth_sensor.hfov)
+        self._fx = self._fy = agent_config.sim_sensors.depth_sensor.width / (2 * np.tan(self._camera_fov / 2))
+        self._camera_height = agent_config.sim_sensors.rgb_sensor.position[1]
+        self.segmentation = MP3DGTPerception(self.max_depth, self.min_depth, self._fx, self._fy)
+
         # generate episodes
         # self._env.episodes = self._env.episodes[0:1]  # for debug
         self.episodes = self.generate_episodes()
@@ -94,7 +106,8 @@ def reset(self):
 
         # Habitat reset
         self._last_obs = self._env.reset()
-
+        if "instance" in (getattr(self.task_config, "task_name", None) or ""):  # task_config defaults to None
+            self._last_obs['semantic'] = self.get_semantic(self._last_obs)
         return self._last_obs
 
     def step(self, action: List[Any]):
@@ -106,6 +119,8 @@ def step(self, action: List[Any]):
         Return: obs, reward, done, info
         """
         obs = self._env.step(action)
+        if "instance" in (getattr(self.task_config, "task_name", None) or ""):  # task_config defaults to None
+            obs['semantic'] = self.get_semantic(obs)
         done = self._env.episode_over
         info = self._env.get_metrics()
         reward = info.get('reward', 0.0)
@@ -126,3 +141,23 @@ def get_metrics(self) -> Dict[str, Any]:
 
     def get_current_episode(self):
         return self._env.current_episode
+
+    def get_tf_episodic_to_global(self):
+        """Return the agent's current pose as a 4x4 homogeneous transform."""
+        state = self._env.sim.get_agent_state()
+        pose = np.eye(4)
+        # Upper-left 3x3 is the rotation built from the agent's quaternion.
+        pose[:3, :3] = quaternion.as_rotation_matrix(state.rotation)
+        # Last column holds the agent position.
+        pose[:3, 3] = state.position
+        return pose
+
+    def get_semantic(self, obs: dict):
+        """Run ``self.segmentation`` on the filtered depth frame for the current episode's goal boxes."""
+        episode = self.get_current_episode()
+        boxes = [episode.goals[i].bbox for i in range(len(episode.instruction.instance_id))]
+        targets = np.array([[b[0], min(-b[2], -b[5]), b[1], b[3], max(-b[5], -b[2]), b[4]] for b in boxes])
+        depth = filter_depth(obs["depth"].reshape(obs["depth"].shape[:2]), blur_type=None)
+        pose = self.get_tf_episodic_to_global()
+        pose[1, 3] = self._camera_height + self._env.sim.get_agent_state().position[1]
+        return self.segmentation.predict(depth, targets, np.array([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]) @ pose)
\ No newline at end of file
diff --git a/internnav/evaluator/distributed_base.py b/internnav/evaluator/distributed_base.py
index 91eeb588..0f29af4f 100644
--- a/internnav/evaluator/distributed_base.py
+++ b/internnav/evaluator/distributed_base.py
@@ -47,9 +47,13 @@ def __init__(self, cfg: EvalCfg, init_env: bool = True, init_agent: bool = True)
                 cfg.agent.agent_settings['port'] = 8000 + get_rank()
                 self.agent = AgentClient(cfg.agent)
             else:
-                from internnav.agent import Agent
+                if cfg.agent.model_name == 'dialog':
+                    from internnav.agent import DialogAgent
+                    self.agent = DialogAgent(cfg.agent, cfg.task, self.rank)
+                else:
+                    from internnav.agent import Agent
 
-                self.agent = Agent(cfg.agent)
+                    self.agent = Agent(cfg.agent)
 
     def eval(self):
         """
diff --git a/internnav/internnav_habitat/__init__.py b/internnav/internnav_habitat/__init__.py
index af9bee9e..fc5a8cf4 100644
--- a/internnav/internnav_habitat/__init__.py
+++ b/internnav/internnav_habitat/__init__.py
@@ -1,2 +1,2 @@
-from internnav.internnav_habitat.habitat_env import HabitatEnv
 from internnav.internnav_habitat.habitat_vln_evaluator import HabitatVlnEvaluator
+from internnav.internnav_habitat.habitat_dialog_evaluator import HabitatDialogEvaluator
diff --git a/internnav/internnav_habitat/dialog_dataset.py b/internnav/internnav_habitat/dialog_dataset.py
new file mode 100644
index 00000000..ef4dcc4b
--- /dev/null
+++ b/internnav/internnav_habitat/dialog_dataset.py
@@ -0,0 +1,83 @@
+
+import gzip
+import json
+import os
+import attr
+from typing import TYPE_CHECKING, List, Optional
+
+from habitat.core.dataset import Dataset
+from habitat.core.registry import registry
+from habitat.datasets.utils import VocabDict
+from .dialog_episodes import DialogEpisode, DialogGoal, DialogViewLocation, AgentPosition
+if TYPE_CHECKING:
+    from omegaconf import DictConfig
+
+
+DEFAULT_SCENE_PATH_PREFIX = "data/scene_datasets/"
+
+@attr.s(auto_attribs=True, kw_only=True)
+class DialogInstructionData:  # Typed record built from an episode's raw "instruction" dict.
+    task_type: str
+    instruction_text: str
+    instance_id: List[str]  # presumably one id per goal of the episode — verify against the dataset
+    instruction_info: Optional[List[str]] = None
+
+@registry.register_dataset(name="dialog")
+class DialogDatasetV1(Dataset):
+    r"""Habitat dataset, registered under the name "dialog", whose episodes are
+    :class:`DialogEpisode` objects parsed from a gzip-compressed JSON file.
+    """
+
+    episodes: List[DialogEpisode]
+    instruction_vocab: VocabDict  # declared only; from_json() does not populate it
+
+    @staticmethod
+    def check_config_paths_exist(config: "DictConfig") -> bool:
+        return os.path.exists(
+            config.data_path.format(split=config.split)
+        ) and os.path.exists(config.scenes_dir)  # both the split file and the scenes dir must exist
+
+    def __init__(self, config: Optional["DictConfig"] = None) -> None:
+        self.episodes = []
+
+        if config is None:  # no config: leave the dataset empty
+            return
+
+        dataset_filename = config.data_path.format(split=config.split)
+        with gzip.open(dataset_filename, "rt") as f:
+            self.from_json(f.read(), scenes_dir=config.scenes_dir)
+
+        self.episodes = list(  # keep only episodes accepted by the content-scenes filter
+            filter(self.build_content_scenes_filter(config), self.episodes)
+        )
+
+    def from_json(
+        self, json_str: str, scenes_dir: Optional[str] = None
+    ) -> None:
+        """Parse ``json_str`` and append its episodes to ``self.episodes``."""
+        deserialized = json.loads(json_str)
+        # The instruction vocabulary is not loaded here (disabled code kept for reference):
+        #     self.instruction_vocab = VocabDict(
+        #         word_list=deserialized["instruction_vocab"]["word_list"])
+        if "category_to_task_category_id" in deserialized:
+            self.category_to_task_category_id = deserialized["category_to_task_category_id"]
+
+        for episode in deserialized["episodes"]:
+            episode = DialogEpisode(**episode)
+
+            if scenes_dir is not None:
+                if episode.scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX):
+                    episode.scene_id = episode.scene_id[
+                        len(DEFAULT_SCENE_PATH_PREFIX) :
+                    ]
+                # Re-root the (possibly prefix-stripped) scene id under scenes_dir.
+                episode.scene_id = os.path.join(scenes_dir, episode.scene_id)
+            episode.instruction = DialogInstructionData(**episode.instruction)  # raw dict -> typed record
+            for g_index, goal in enumerate(episode.goals):
+                view_points = []
+                for view_point in goal['view_points']:
+                    view_point = DialogViewLocation(**{'agent_state': AgentPosition(**view_point['agent_state'])})
+                    view_points.append(view_point)
+                goal['view_points'] = view_points
+                episode.goals[g_index] = DialogGoal(**goal)  # replace the raw goal dict in place
+            self.episodes.append(episode)
\ No newline at end of file
diff --git a/internnav/internnav_habitat/dialog_episodes.py b/internnav/internnav_habitat/dialog_episodes.py
new file mode 100644
index 00000000..e7d2409c
--- /dev/null
+++ b/internnav/internnav_habitat/dialog_episodes.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+import numpy as np
+from typing import List, Optional, Union
+
+import attr
+from habitat.core.dataset import Episode
+from habitat.core.utils import not_none_validator
+from habitat.tasks.nav.nav import NavigationGoal
+
+@attr.s(auto_attribs=True)
+class AgentPosition:  # Minimal agent state that carries only a position.
+    position: Union[None, List[float], np.ndarray]
+
+@attr.s(auto_attribs=True)
+class DialogViewLocation:  # One view point of a goal, described by the agent state at that location.
+    agent_state: AgentPosition
+
+@attr.s(auto_attribs=True, kw_only=True)
+class DialogGoal(NavigationGoal):
+    r"""Navigation goal extended with an optional bounding box and view points."""
+
+    position: List[float] = attr.ib(default=None, validator=not_none_validator)
+    radius: Optional[float] = None
+    bbox: Optional[List[float]] = None  # presumably six box coordinates — verify against the dataset
+    view_points: Optional[List[DialogViewLocation]] = None
+    
+@attr.s(auto_attribs=True, kw_only=True)
+class DialogEpisode(Episode):
+    # Episode carrying dialog goals plus optional instruction/frames payloads.
+    object_category: Optional[str] = None
+    goals: List[DialogGoal] = attr.ib(
+        default=None, validator=not_none_validator, on_setattr=Episode._reset_shortest_path_cache_hook
+    )
+    # Factory defaults: a literal ``[]`` would be one list object shared by every episode.
+    instruction: Optional[dict] = attr.Factory(list)
+    frames: Optional[int] = attr.Factory(list)
\ No newline at end of file
diff --git a/internnav/internnav_habitat/dialog_utils.py b/internnav/internnav_habitat/dialog_utils.py
new file mode 100644
index 00000000..d3140ad9
--- /dev/null
+++ b/internnav/internnav_habitat/dialog_utils.py
@@ -0,0 +1,151 @@
+import inspect
+import numpy as np
+import quaternion
+import cv2
+import os
+import re
+from typing import Dict, Optional, Sequence, List, Tuple, Any, Union
+
+import habitat_sim
+from habitat_baselines.config.default import get_config as get_habitat_config
+
+from omegaconf import OmegaConf, DictConfig, open_dict
+from npc.utils.get_description import get_path_description, get_path_description_without_additional_info
+
+DEFAULT_IMAGE_TOKEN = "<image>"
+
+def get_config(
+    habitat_config_path: str,
+    baseline_config_path: str,
+    opts: Optional[list] = None,
+    configs_dir: str = os.path.dirname(inspect.getabsfile(inspect.currentframe())),
+) -> DictConfig:
+    """
+    Compose a habitat_baselines config and merge a second yaml file on top of it.
+    :param habitat_config_path: path to the yaml config file passed to habitat_baselines ``get_config``.
+    :param baseline_config_path: yaml file loaded with OmegaConf and merged over the habitat config.
+    :param opts: list of config overrides. For example, :py:`opts=["habitat_baselines.trainer_name=ddppo"]`.
+    :param configs_dir: path to the config files root directory (defaults to the directory of this module).
+    :return: composed config object.
+    """
+    habitat_config = get_habitat_config(
+        habitat_config_path, overrides=opts, configs_dir=configs_dir
+    )
+    baseline_config = OmegaConf.load(baseline_config_path)
+    
+    with open_dict(habitat_config):
+        config = OmegaConf.merge(habitat_config, baseline_config)
+    
+    return config
+
+def calculate_path_length(path):
+    """Return cumulative path lengths: entry ``i`` is the distance travelled from ``path[0]`` to ``path[i]``."""
+    segment_lengths = [np.linalg.norm(np.array(b) - np.array(a)) for a, b in zip(path, path[1:])]
+    # The leading 0 is the (zero) distance to the first waypoint itself.
+    return [0] + list(np.cumsum(segment_lengths))
+
+def get_shortest_path(env, start_position, target_position):
+    """
+    Find the shortest path between two positions in a habitat environment.
+    
+    Args:
+        env: habitat environment instance
+        start_position: start coordinates, numpy array [x, y, z]
+        target_position: target coordinates, numpy array [x, y, z]
+        
+    Returns:
+        path: list of path points
+        success: whether a valid path was found
+    """
+    # Create the path-planning request
+    shortest_path = habitat_sim.ShortestPath()
+    shortest_path.requested_start = start_position
+    shortest_path.requested_end = target_position
+    
+    # Compute the shortest path
+    success = env.sim.pathfinder.find_path(
+        shortest_path
+    )
+    return shortest_path.points, success
+
+def get_navigable_path(env, start_position, target_positions: list, object_info: dict):
+    """Return ``(path, True)`` for the first reachable target, or ``([], False)`` if none is reachable."""
+    origin = [float(coord) for coord in start_position]
+    def _distance_to_object(candidate):
+        return np.linalg.norm(np.array(candidate['agent_state']['position']) - np.array(object_info['position']))
+    # Try the candidates closest to the object first; the caller's list is not modified.
+    for candidate in sorted(target_positions, key=_distance_to_object):
+        path, found = get_shortest_path(env, origin, candidate['agent_state']['position'])
+        if found:
+            return path, True
+    return [], False
+
+def get_path_description_(env, object_dict, region_dict):
+    """Describe the start of the shortest path to goal 0; return ``(description, total_length)`` or ``(None, 0)``."""
+    goal_path, success = get_navigable_path(env, env.sim.get_agent_state().position, [{'agent_state': {'position': vp.agent_state.position}} for vp in env.current_episode.goals[0].view_points], {'position': env.current_episode.goals[0].position})
+    if not success or len(np.unique(goal_path, axis=0)) == 1:
+        print('no shortest path')
+        return None, 0
+    path_length = calculate_path_length(goal_path)
+    pl = path_length[-1]
+    # Keep waypoints whose cumulative distance is below 4; if only the start qualifies, keep the whole path.
+    goal_index = max([i for i, c in enumerate(path_length) if c < 4])
+    if goal_index == 0:
+        goal_index = len(goal_path)-1
+    questioned_path = goal_path[:goal_index+1]
+    current_yaw = 2 * np.arctan2(env.sim.get_agent_state().rotation.y, env.sim.get_agent_state().rotation.w)
+    _, idx = np.unique(questioned_path, axis=0, return_index=True)
+    questioned_path = list(np.array(questioned_path)[np.sort(idx)])  # drop duplicate waypoints, keep order
+    try:
+        path_description, _ = get_path_description(quaternion.from_euler_angles([0, current_yaw, 0]), questioned_path, object_dict, region_dict, return_finish=False, height_list=[env.sim.get_agent_state().position[1]]*len(questioned_path))
+    except Exception:  # fall back to the plain description; KeyboardInterrupt/SystemExit now propagate
+        path_description, _ = get_path_description_without_additional_info(quaternion.from_euler_angles([0, current_yaw, 0]), questioned_path, height_list=[env.sim.get_agent_state().position[1]]*len(questioned_path))
+    return path_description, pl
+
+def unify_to_first(
+    vis_frames,
+    method: str = "resize",         # "resize" or "letterbox"
+    pad_color=(0, 0, 0),            # fill colour used by letterbox (B,G,R)
+    assume_rgb: bool = True         # intended RGB->BGR hint for later OpenCV video writing; not read in this body
+):
+    assert len(vis_frames) > 0, "vis_frames 为空"
+    h0, w0 = vis_frames[0].shape[:2]
+    out = []
+    # Every frame is converted to 3-channel uint8 and brought to the (h0, w0) size of the first frame.
+    for i, f in enumerate(vis_frames):
+        f = np.asarray(f)
+
+        # Ensure three channels
+        if f.ndim == 2:  # grayscale -> 3 channels
+            f = np.stack([f]*3, axis=2)
+        if f.shape[2] > 3:
+            f = f[:, :, :3]  # keep only the first three channels
+
+        # Normalise dtype: convert to uint8
+        if f.dtype != np.uint8:
+            # Values that look like [0,1] floats are scaled by 255; otherwise they are clipped to 0-255
+            fmax = float(np.nanmax(f)) if f.size else 1.0
+            f = (f * 255.0) if fmax <= 1.5 else np.clip(f, 0, 255)
+            f = f.astype(np.uint8)
+
+        h, w = f.shape[:2]
+        if (h, w) == (h0, w0):
+            out.append(np.ascontiguousarray(f))
+            continue
+
+        if method == "letterbox":
+            # Aspect-preserving resize, then paste centred onto a padded canvas
+            scale = min(w0 / w, h0 / h)
+            nw, nh = int(round(w * scale)), int(round(h * scale))
+            resized = cv2.resize(f, (nw, nh), interpolation=cv2.INTER_AREA if scale < 1 else cv2.INTER_LINEAR)
+            canvas = np.full((h0, w0, 3), pad_color, dtype=np.uint8)
+            top, left = (h0 - nh) // 2, (w0 - nw) // 2
+            canvas[top:top+nh, left:left+nw] = resized
+            f_out = canvas
+        else:
+            # Stretch directly to the target size
+            f_out = cv2.resize(f, (w0, h0), interpolation=cv2.INTER_AREA if (h*w) > (h0*w0) else cv2.INTER_LINEAR)
+
+        out.append(np.ascontiguousarray(f_out))
+
+    return out
\ No newline at end of file
diff --git a/internnav/internnav_habitat/habitat_dialog_evaluator.py b/internnav/internnav_habitat/habitat_dialog_evaluator.py
new file mode 100644
index 00000000..6558685f
--- /dev/null
+++ b/internnav/internnav_habitat/habitat_dialog_evaluator.py
@@ -0,0 +1,458 @@
+import argparse
+import json
+import os
+import sys
+
+import copy
+import itertools
+import random
+import re
+from collections import OrderedDict
+
+import numpy as np
+import quaternion
+import torch
+import tqdm
+from depth_camera_filtering import filter_depth
+from PIL import Image, ImageDraw, ImageFont
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+from transformers.image_utils import to_numpy_array
+
+from internnav.configs.evaluator import EvalCfg
+from internnav.evaluator import DistributedEvaluator, Evaluator
+from internnav.model.basemodel.internvla_n1.internvla_n1 import InternVLAN1ForCausalLM
+from internnav.model.utils.vln_utils import (
+    chunk_token,
+    open_image,
+    split_and_clean,
+    traj_to_actions,
+)
+from internnav.agent.dialog_agent import DialogAgent
+from internnav.internnav_habitat.dialog_utils import get_config, get_path_description_
+from internnav.internnav_habitat.simple_npc.simple_npc import SimpleNPC
+
+try:
+    import habitat
+    from habitat.config.default import get_agent_config
+    from habitat.config.default_structured_configs import (
+        CollisionsMeasurementConfig,
+        FogOfWarConfig,
+        TopDownMapMeasurementConfig,
+    )
+    from habitat.utils.visualizations.utils import observations_to_image
+
+    # Import for Habitat registry side effects — do not remove
+    import internnav.internnav_habitat.measures  # noqa: F401 
+    from internnav.internnav_habitat.dialog_dataset import DialogDatasetV1
+    # isort: skip
+except Exception as e:
+    print(f"Warning: ({e}), Habitat Evaluation is not loaded in this runtime. Ignore this if not using Habitat.")
+
+DEFAULT_IMAGE_TOKEN = "<image>"
+
+@Evaluator.register('habitat_dialog')
+class HabitatDialogEvaluator(DistributedEvaluator):
+    def __init__(self, cfg: EvalCfg):
+        args = argparse.Namespace(**cfg.eval_settings)  # attribute-style access to the eval settings
+        self.epoch = args.epoch
+        self.max_steps_per_episode = args.max_steps_per_episode
+        self.scene_summary = args.scene_summary  # root dir of per-scene object_dict.json / region_dict.json
+        self.output_path = args.output_path
+        # task name and dialog options
+        self.task = cfg.task.task_name
+        self.turn = args.turn  # question limit per episode, compared against len(agent.dialogs)/2
+        self.dialog_enabled = cfg.agent.model_settings['dialog_enabled']
+        self.save_video = args.save_video
+
+        self.npc = SimpleNPC(
+                max_interaction_turn=10,
+                model_name=args.model_name,
+                openai_api_key=args.openai_api_key,
+                base_url=args.base_url,
+            )
+
+        # create habitat config (habitat yaml merged with the baseline yaml)
+        self.config_path = cfg.env.env_settings['habitat_config_path']
+        self.config = get_config(self.config_path, cfg.env.env_settings['baseline_config_path'])
+
+        with habitat.config.read_write(self.config):
+            self.config.exp.task = self.task
+            self.config.habitat.dataset.split = args.eval_split
+            self.config.habitat.task.measurements.update(
+                {
+                    "top_down_map": TopDownMapMeasurementConfig(
+                        map_padding=3,
+                        map_resolution=1024,
+                        draw_source=True,
+                        draw_border=True,
+                        draw_shortest_path=True,
+                        draw_view_points=True,
+                        draw_goal_positions=True,
+                        draw_goal_aabbs=True,
+                        fog_of_war=FogOfWarConfig(
+                            draw=True,
+                            visibility_dist=5.0,
+                            fov=90,
+                        ),
+                    ),
+                    "collisions": CollisionsMeasurementConfig(),
+                }
+            )
+        cfg.env.env_settings['habitat_config'] = self.config.habitat  # handed to the env via env_settings
+        cfg.env.env_settings['output_path'] = self.output_path
+
+        # init agent and env (the agent also receives the task name and sensor config)
+        cfg.agent.model_settings['task'] = self.task
+        cfg.agent.model_settings['sim_sensors_config'] = self.config.habitat.simulator.agents.main_agent.sim_sensors
+        self.objectnav_instruction = "search for {target_object}."
+        super().__init__(cfg, init_agent=True, init_env=True)
+
+    def eval_action(self):
+        """
+        Run local episodes on this rank, appending one JSON line per episode to result.json.
+
+        Returns a tuple of tensors on self.device: (sucs, spls, oss, nes, episode_count).
+        """
+        sucs, spls, oss, nes = [], [], [], []
+        done_res = []
+        if os.path.exists(os.path.join(self.output_path, f'result.json')):
+            with open(os.path.join(self.output_path, f'result.json'),'r') as f:
+                for line in f.readlines():
+                    res = json.loads(line)
+                    done_res.append([res["scene_id"], res["episode_id"], res["episode_instruction"]])
+                    sucs.append(res['success'])
+                    spls.append(res['spl'])
+                    oss.append(res['os'])
+                    nes.append(res['ne'])
+        env = self.env
+
+        while env.is_running:
+            obs = env.reset()
+            if not env.is_running or obs is None:
+                break
+
+            # build the (scene, episode, instruction) key used to skip episodes already in result.json
+            episode = env._env.current_episode
+            scene_id = episode.scene_id.split('/')[-2]
+            if 'coin' in self.task:
+                episode_instruction = self.objectnav_instruction.format(target_object=episode.object_category.replace('_', ' '))+", "+episode.instruction
+            elif 'objectnav' in self.task:
+                episode_instruction = self.objectnav_instruction.format(target_object=episode.object_category.replace('_', ' '))
+            else:    
+                episode_instruction = episode.instruction.instruction_text[:-1]
+            episode_id = int(episode.episode_id)
+            if [scene_id, episode_id, episode_instruction] in done_res:
+                continue
+            # make directories
+            os.makedirs(os.path.join(self.output_path, 'check_sim'), exist_ok=True)
+            Image.fromarray(obs['rgb']).save(os.path.join(self.output_path, 'check_sim', f'rgb_{self.rank}.jpg'))
+            os.makedirs(os.path.join(self.output_path, 'action', f'{scene_id}'), exist_ok=True)
+            # NOTE(review): object_dict/region_dict are only loaded below when dialog is enabled, yet the
+            # action == 6 branch reads them unconditionally — confirm action 6 cannot occur otherwise.
+            if self.save_video:
+                os.makedirs(os.path.join(self.output_path, 'vis', f'{scene_id}'), exist_ok=True)
+            
+            # get agent ready
+            self.agent.reset(env)
+
+            # info for npc
+            if 'dialog' in self.task or self.dialog_enabled: # gt of env for npc
+                with open(os.path.join(self.scene_summary, scene_id, 'object_dict.json'), 'r', encoding='utf-8') as f:
+                    object_dict = json.load(f)
+                with open(os.path.join(self.scene_summary, scene_id, 'region_dict.json'), 'r', encoding='utf-8') as f:
+                    region_dict = json.load(f)
+
+            # initialization
+            step_id = 0
+
+            path_list = []
+            vis_frames = []
+            action_list = [] # params for saving results
+
+            while not env._env.episode_over and step_id <= self.max_steps_per_episode:
+                agent_state = env._env.sim.get_agent_state()
+                path_list.append(agent_state.position.tolist())
+                info = {'step': step_id, 'agent state': agent_state, 'episode_instruction': episode_instruction, 'output_path': os.path.join(self.output_path, 'action', f'{scene_id}', f'{episode_id}.txt'), 'info': env.get_metrics()}
+                action = self.agent.step(obs, env, info=info)
+                print("step_id", step_id, "action", action)
+                action_list.append(action)
+                if action in [0, 1, 2, 3]:  # a single simulator step, counted in step_id
+                    obs, reward, done, info = env.step(action)
+                elif action == 5:  # stepped twice in the simulator; not counted in step_id
+                    env.step(action)
+                    obs, reward, done, info = env.step(action)
+                    continue
+                elif action == 6:  # question for the NPC; no simulator step, not counted in step_id
+                    if len(self.agent.dialogs)/2>=self.turn:
+                        npc_answer = 'Sorry, you have reached the question limit. No further answers are available.'
+                    else: 
+                        path_description, pl = get_path_description_(env._env, object_dict, region_dict)
+                        task_finish = obs['semantic'][0].sum() > 0 and pl < 3
+                        npc_answer = self.npc.answer_question(
+                                                                question=self.agent.question,
+                                                                instance_id=env._env.current_episode.instruction.instance_id[0],
+                                                                object_dict=object_dict,
+                                                                task_done=task_finish,
+                                                                path_description=path_description,
+                                                                mode="two_turn",
+                                                            )
+                    if npc_answer is None:
+                        npc_answer = 'Sorry, I can not answer your question now.'
+                    
+                    with open(os.path.join(self.output_path, 'action', f'{scene_id}', f'{episode_id}.txt'), 'a') as f:
+                        f.write(npc_answer + "\n")
+                    obs['npc_answer'] = npc_answer
+                    continue
+
+                step_id += 1
+                self.agent.messages = []
+
+            m = env.get_metrics()
+            sucs.append(m["success"])
+            spls.append(m["spl"])
+            oss.append(m["oracle_success"])
+            nes.append(m["distance_to_goal"])
+            result = {
+                "scene_id": scene_id,
+                "episode_id": episode_id,
+                "success": m["success"],
+                "spl": m["spl"],
+                "os": m['oracle_success'],
+                "ne": m["distance_to_goal"],
+                "steps": step_id,
+                "episode_instruction": episode_instruction,
+                "path": path_list,
+                "action": action_list,
+                "object_category": episode.object_category if 'vln' not in self.task else ''
+            }
+            with open(os.path.join(self.output_path, f'result.json'), 'a') as f:
+                f.write(json.dumps(result) + "\n")
+
+        env.close()
+        return (
+            torch.tensor(sucs).to(self.device),
+            torch.tensor(spls).to(self.device),
+            torch.tensor(oss).to(self.device),
+            torch.tensor(nes).to(self.device),
+            torch.tensor(len(sucs)).to(self.device),
+        )
+
+    def calc_metrics(self, global_metrics: dict) -> dict:
+        """
+        Average global_metrics["sucs"/"spls"/"oss"/"nes"] (global 1-D CPU tensors over all episodes); 0.0 each if empty.
+        """
+        sucs_all = global_metrics["sucs"]
+        spls_all = global_metrics["spls"]
+        oss_all = global_metrics["oss"]
+        nes_all = global_metrics["nes"]
+
+        # Guard on the real episode count: mean() of an empty tensor is NaN.
+        has_episodes = len(sucs_all) > 0
+
+        return {
+            "sucs_all": float(sucs_all.mean().item()) if has_episodes else 0.0,
+            "spls_all": float(spls_all.mean().item()) if has_episodes else 0.0,
+            "oss_all": float(oss_all.mean().item()) if has_episodes else 0.0,
+            "nes_all": float(nes_all.mean().item()) if has_episodes else 0.0,
+            # "length" will be filled by base class
+        }
+
+    def parse_actions(self, output):
+        """Map every known action string found in ``output`` to its ids, flattened in order of appearance."""
+        # NOTE(review): self.actions2idx is not assigned in this class's own __init__ — confirm where it is set.
+        known = self.actions2idx
+        matcher = re.compile('|'.join(re.escape(name) for name in known))
+        # Each match maps to a sequence of ids; flatten them into one list.
+        return [idx for hit in matcher.findall(output) for idx in known[hit]]
+
+    def preprocess_depth_image_v2(
+        self, depth_image, do_depth_scale=True, depth_scale=1000, target_height=None, target_width=None
+    ):
+        """Resize a depth image (nearest neighbour), optionally divide by ``depth_scale``; return ``(array, (width, height))``."""
+        if target_height is None:
+            # Unchanged behaviour: without a height, both dimensions come from the processor's crop size.
+            target_height = self.image_processor.crop_size['height']  # 384
+            target_width = self.image_processor.crop_size['width']  # 384
+        elif target_width is None:  # only a height was given: previously crashed in resize() with width=None
+            target_width = self.image_processor.crop_size['width']
+        img = to_numpy_array(depth_image.resize((target_width, target_height), Image.NEAREST))
+        if do_depth_scale:
+            img = img / depth_scale
+        return img, (target_width, target_height)
+
+    def get_intrinsic_matrix(self, sensor_cfg) -> np.ndarray:
+        """Build a 4x4 pinhole intrinsic matrix from a sensor's width, height and horizontal FOV (degrees)."""
+        half_fov = np.deg2rad(sensor_cfg.hfov / 2.0)
+        # Same focal length on both axes (square pixels).
+        focal = (sensor_cfg.width / 2.0) / np.tan(half_fov)
+        # Principal point at the centre of the pixel grid.
+        cx = (sensor_cfg.width - 1.0) / 2.0
+        cy = (sensor_cfg.height - 1.0) / 2.0
+
+        intrinsic = np.eye(4)
+        intrinsic[0, 0] = intrinsic[1, 1] = focal
+        intrinsic[0, 2], intrinsic[1, 2] = cx, cy
+        return intrinsic
+
+    def get_axis_align_matrix(self):
+        """Return the fixed 4x4 axis-permutation matrix mapping (x, y, z) to (z, -x, -y)."""
+        return np.array([[0, 0, 1, 0], [-1, 0, 0, 0], [0, -1, 0, 0], [0, 0, 0, 1]])
+
+    def xyz_yaw_to_tf_matrix(self, xyz: np.ndarray, yaw: float) -> np.ndarray:
+        """Return a 4x4 transform: rotation by ``yaw`` (radians) about the z axis plus translation ``xyz``."""
+        tx, ty, tz = xyz
+        c, s = np.cos(yaw), np.sin(yaw)
+        rows = [
+            [c, -s, 0, tx],
+            [s, c, 0, ty],
+            [0, 0, 1, tz],
+            [0, 0, 0, 1],
+        ]
+        return np.array(rows)
+
+    def xyz_pitch_to_tf_matrix(self, xyz: np.ndarray, pitch: float) -> np.ndarray:
+        """Build a 4x4 transform that rotates about the y axis and translates by ``xyz``.
+
+        Args:
+            xyz (np.ndarray): A 3D vector representing the position.
+            pitch (float): Rotation angle about the y axis, in radians.
+        Returns:
+            np.ndarray: A 4x4 transformation matrix.
+        """
+
+        tx, ty, tz = xyz
+        c, s = np.cos(pitch), np.sin(pitch)
+        # Rotation block in the upper-left 3x3, translation in the last column.
+        rows = [
+            [c, 0, s, tx],
+            [0, 1, 0, ty],
+            [-s, 0, c, tz],
+            [0, 0, 0, 1],
+        ]
+        return np.array(rows)
+
+    def xyz_yaw_pitch_to_tf_matrix(self, xyz: np.ndarray, yaw: float, pitch: float) -> np.ndarray:
+        """Combine a yaw rotation, a pitch rotation and a translation into one 4x4 transform.
+
+        Args:
+            xyz (np.ndarray): A 3D vector representing the position.
+            yaw (float): The yaw angle in radians.
+            pitch (float): The pitch angle in radians for y axis.
+        Returns:
+            np.ndarray: A 4x4 transformation matrix.
+        """
+        # Only the rotation parts of the helper transforms are used; the rotation is yaw @ pitch.
+        yaw_rot = self.xyz_yaw_to_tf_matrix(xyz, yaw)[:3, :3]
+        pitch_rot = self.xyz_pitch_to_tf_matrix(xyz, pitch)[:3, :3]
+        tf = np.eye(4)
+        tf[:3, :3] = yaw_rot @ pitch_rot
+        tf[:3, 3] = xyz
+        return tf
+
+    def pixel_to_gps(self, pixel, depth, intrinsic, tf_camera_to_episodic):
+        """Back-project one pixel with its depth and express it in the episodic frame.
+
+        Args:
+            pixel: (2,) - [v, u] pixel coordinates, i.e. (row, column) as used to index ``depth``
+            depth: (H, W) - depth image where depth[v, u] gives depth in meters
+            intrinsic: camera intrinsic matrix; only entries [0, 0], [1, 1], [0, 2] and [1, 2] are read
+            tf_camera_to_episodic: (4, 4) - transformation from camera to episodic frame
+        Returns:
+            (x, y): (x, y) coordinates in the episodic frame
+        """
+        v, u = pixel  # row first, then column -- NOT (u, v)
+        z = depth[v, u]
+
+        # Pinhole back-projection into the camera frame, as a homogeneous point.
+        x = (u - intrinsic[0, 2]) * z / intrinsic[0, 0]
+        y = (v - intrinsic[1, 2]) * z / intrinsic[1, 1]
+        point_camera = np.array([x, y, z, 1.0])
+
+        # Transform to episodic frame
+        point_episodic = tf_camera_to_episodic @ point_camera
+        point_episodic = point_episodic[:3] / point_episodic[3]
+
+        # Only the first two components are returned; the third one is dropped.
+        x, y = point_episodic[0], point_episodic[1]
+        return (x, y)  # same as habitat gps
+
+    def dot_matrix_two_dimensional(
+        self,
+        image_or_image_path,
+        save_path=None,
+        dots_size_w=8,
+        dots_size_h=8,
+        save_img=False,
+        font_path='fonts/arial.ttf',
+        pixel_goal=None,
+    ):
+        """Overlay a labelled grid of dots on an image.
+
+        Dots sit at the interior crossings of a (dots_size_w + 1) x (dots_size_h + 1) cell grid and
+        each is labelled "(row,col)", 1-based from the top-left dot. A dot and its label are drawn
+        black or white, whichever contrasts with the pixel under the dot.
+
+        Args:
+            image_or_image_path: passed to ``open_image``.
+            save_path: destination used by ``img.save`` when ``save_img`` is true.
+            dots_size_w: the number of columns of the dots matrix.
+            dots_size_h: the number of rows of the dots matrix.
+            save_img: when true, print a notice and save the annotated image to ``save_path``.
+            font_path: font file handed to ``ImageFont.truetype`` for the labels.
+            pixel_goal: optional [row, col] pixel; the dot it rounds to is drawn red.
+        Returns:
+            The annotated RGB image.
+        Raises:
+            ValueError: if ``pixel_goal`` is outside the image or rounds to a position outside the dot grid.
+        """
+        with open_image(image_or_image_path) as img:
+            if img.mode != 'RGB':
+                img = img.convert('RGB')
+            draw = ImageDraw.Draw(img, 'RGB')
+
+            width, height = img.size
+            cell_width = width / (dots_size_w + 1)
+            cell_height = height / (dots_size_h + 1)
+
+            font = ImageFont.truetype(font_path, width // 40)  # Adjust font size if needed; default == width // 40
+            circle_radius = width // 240  # Adjust dot size if needed; default == width // 240
+
+            goal_col = goal_row = None
+            if pixel_goal is not None:
+                y_pixel, x_pixel = pixel_goal[0], pixel_goal[1]
+                if not (0 <= x_pixel < width and 0 <= y_pixel < height):
+                    raise ValueError(f"pixel_goal {pixel_goal} exceeds image dimensions ({width}x{height})")
+
+                # Snap the goal pixel to the nearest dot, expressed in grid coordinates.
+                goal_col = round(x_pixel / cell_width)
+                goal_row = round(y_pixel / cell_height)
+                if not (1 <= goal_col <= dots_size_w and 1 <= goal_row <= dots_size_h):
+                    raise ValueError(
+                        f"pixel_goal {pixel_goal} maps to grid ({goal_row},{goal_col}), "
+                        f"valid range is (1,1)-({dots_size_h},{dots_size_w})"
+                    )
+
+            for row in range(1, dots_size_h + 1):
+                for col in range(1, dots_size_w + 1):
+                    x = int(col * cell_width)
+                    y = int(row * cell_height)
+
+                    pixel_color = img.getpixel((x, y))
+                    # choose a more contrasting color from black and white; the goal dot overrides it
+                    if pixel_goal is not None and col == goal_col and row == goal_row:
+                        dot_color = (255, 0, 0)  # Red for target
+                    elif pixel_color[0] + pixel_color[1] + pixel_color[2] >= 255 * 3 / 2:
+                        dot_color = (0, 0, 0)
+                    else:
+                        dot_color = (255, 255, 255)
+
+                    draw.ellipse(
+                        [(x - circle_radius, y - circle_radius), (x + circle_radius, y + circle_radius)],
+                        fill=dot_color,
+                    )
+                    # Dots are visited row by row, so the label is simply the current (row, col).
+                    draw.text((x + 3, y), f"({row},{col})", fill=dot_color, font=font)
+            if save_img:
+                print(">>> dots overlaid image processed, stored in", save_path)
+                img.save(save_path)
+            return img
diff --git a/internnav/internnav_habitat/simple_npc/__init__.py b/internnav/internnav_habitat/simple_npc/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/internnav/internnav_habitat/simple_npc/prompt.py b/internnav/internnav_habitat/simple_npc/prompt.py
new file mode 100644
index 00000000..9a7f962a
--- /dev/null
+++ b/internnav/internnav_habitat/simple_npc/prompt.py
@@ -0,0 +1,86 @@
+TEMPLATE = {'one_turn_prompt': """
+You are a helpful assistant in helping agent to finish its navigation task.
+
+## Here is the ground truth information you know more than the agent
+'TASK DONE' shows if the agent has finished the task, if it is false, you need to know that the agent hasn't found the goal object.
+'GOAL INFORMATION' shows the goal object's information.
+'CORRECT PATH' shows the correct path description to the goal object.
+
+TASK DONE: 
+{task_done}
+
+GOAL INFORMATION: 
+{goal_information}
+
+CORRECT PATH:
+{path_description}
+
+## Some constraints you MUST follow:
+1. Only output the answer to the question.
+2. Don't be verbose.
+
+## Here is the question you need to answer
+QUESTION: {question}
+""",
+# Classification prompt: the reply names the question type (disambiguation, path, information). Placeholder: {question}.
+"two_turn_prompt_0": """
+You are a helpful assistant in helping agent to finish its navigation task. You will be given a question among the following three types:
+1. Disambiguation: This question is asked to check whether the agent has found the goal object. Like "Is it the object you are looking for?"
+2. Path: This question is asked to get the path to the goal object. Like "Where should I go now?"
+3. Information: This question is asked to get more information about the goal object. Like "Where is the goal object?", "What is the color of the goal object?"
+
+You need to classify the question into one of the three types. Only output the name of the type(disambiguation, path, information). Don't be verbose.
+
+## Here is the question you need to answer
+QUESTION: {question}
+""",
+# Answers a question from the goal information only. Placeholders: {goal_information}, {question}.
+"two_turn_prompt_1": """
+You are a helpful assistant in answering the question. Here follows the ground truth information about the goal object. You need to answer the question based on the ground truth information.
+
+## Here is the ground truth information about the goal object
+GOAL INFORMATION: 
+{goal_information}  
+
+## Here is the question you need to answer
+QUESTION: {question}
+"""}
+
+DISAMBIGUATION_PROMPT = {  # canned replies to "is this the goal?" questions, keyed by 'yes' / 'no'
+    'yes':[  # phrasings that confirm the agent is at the right place
+        "Yes, you are in the correct position.",
+        "That's right, you are at the intended location.",
+        "Yes, you have reached the right spot.",
+        "Correct, you are in the proper place.",
+        "Yes, you are exactly where you need to be.",
+        "Yes, you are aligned correctly.",
+        "Yes, you are positioned accurately.",
+        "Everything looks good, you are at the correct location.",
+        "You are in the right area.",
+        "Yes, you are currently at the correct position.",
+        "That's perfect, you are in the right spot.",
+        "Yes, your position is accurate.",
+        "You have reached the proper location.",
+        "Yes, you are at the specified position.",
+        "Everything is aligned properly, you're in the correct spot.",
+        "Yes, you are where you should be.",
+        "Yes, this is the right place."
+    ],
+    'no':[  # phrasings that tell the agent it is not at the right place
+        "This is not the intended location.",
+        "You are not in the proper place.",
+        "No, you are not where you need to be.",
+        "No, you are not aligned correctly.",
+        "No, you are positioned incorrectly.",
+        "You are not at the correct location.",
+        "No, you are situated incorrectly.",
+        "You are in the wrong area.",
+        "No, you are not currently at the correct position.",
+        "That's not the right spot.",
+        "No, you are not at the intended destination.",
+        "Your position is inaccurate.",
+        "You haven't reached the proper location.",
+        "No, you are not at the specified position.",
+        "The alignment is off, you are in the wrong spot.",
+        "This is not the right place."
+    ]}
\ No newline at end of file
diff --git a/internnav/internnav_habitat/simple_npc/scene_summary b/internnav/internnav_habitat/simple_npc/scene_summary
new file mode 120000
index 00000000..ef6d02ee
--- /dev/null
+++ b/internnav/internnav_habitat/simple_npc/scene_summary
@@ -0,0 +1 @@
+/mnt/petrelfs/huangwensi/vln_llava/data/scene_summary
\ No newline at end of file
diff --git a/internnav/internnav_habitat/simple_npc/simple_npc.py b/internnav/internnav_habitat/simple_npc/simple_npc.py
new file mode 100644
index 00000000..4976bb40
--- /dev/null
+++ b/internnav/internnav_habitat/simple_npc/simple_npc.py
@@ -0,0 +1,196 @@
+import base64
+import random
+
+from .prompt import DISAMBIGUATION_PROMPT, TEMPLATE
+
+
+class SimpleNPC:
+    """LLM-backed oracle that answers a navigation agent's questions from ground-truth goal data.
+
+    Answers come from an OpenAI-compatible chat-completions client, either with a single request
+    ('one_turn') or by classifying the question first ('two_turn').
+    """
+
+    # Region label -> room name used in prompts. Keys are matched verbatim, including the
+    # "toliet"/"dinning" spellings -- presumably they mirror the annotation labels; verify before editing.
+    _ROOM_NAMES = {
+        "living region": "living room",
+        "stair region": "stairs",
+        "bathing region": "bathroom",
+        "storage region": "storage room",
+        "study region": "study room",
+        "cooking region": "kitchen",
+        "sports region": "sports room",
+        "corridor region": "corridor",
+        "toliet region": "toilet",
+        "dinning region": "dining room",
+        "resting region": "resting room",
+        "open area region": "open area",
+        "other region": "area",
+    }
+    # Keys of `unique_description` that are copied into the goal information when non-empty.
+    _GOAL_ATTRIBUTES = ('color', 'texture', 'material', 'shape', 'placement')
+
+    def __init__(
+        self,
+        max_interaction_turn: int,
+        model_name: str,
+        openai_api_key: str,
+        base_url: str = None,
+    ) -> None:
+        """Store the settings and try to create the chat client.
+
+        Args:
+            max_interaction_turn: maximum number of request attempts made by ``ask_directly``.
+            model_name: model identifier sent with every request.
+            openai_api_key: path of a UTF-8 text file containing the API key (not the key itself).
+            base_url: optional endpoint URL passed to the OpenAI client.
+
+        A missing ``openai`` package or a failing client construction is only printed; ``self.llm``
+        then stays None and ``ask_directly`` returns None.
+        """
+        self.model_name = model_name
+        self.max_turn = max_interaction_turn
+        self.history_messages = []
+        self.llm = None  # defined up front so a failed setup cannot leave the instance half-built
+        try:
+            from openai import OpenAI
+        except ModuleNotFoundError:
+            print('ModuleNotFoundError: No module named \'openai\'. Please install it first.')
+            return
+        with open(openai_api_key, 'r', encoding='utf-8') as file:
+            api_key = file.read().strip()
+        try:
+            self.llm = OpenAI(api_key=api_key, base_url=base_url)
+        except Exception as e:
+            print(f'Failed to initialize OpenAI: {e}')
+
+    def get_room_name(self, room):
+        """Return the prompt room name for a region label; raises KeyError for an unknown label."""
+        return self._ROOM_NAMES[room]
+
+    def _goal_information(self, object_dict: dict, instance_id: str) -> str:
+        """Render the goal's room, attributes, nearby objects and caption, one item per line."""
+        goal = object_dict[instance_id]
+        lines = ['room: ' + self.get_room_name(goal['room'])]
+        lines += [
+            f'{name.lower()}: {value.lower()}'
+            for name, value in goal['unique_description'].items()
+            if name in self._GOAL_ATTRIBUTES and len(value) > 0
+        ]
+        nearby_objects = [
+            object_dict[obj]['unique_description']['fine grained category'].lower()
+            for obj in goal['nearby_objects']
+            if obj in object_dict and isinstance(object_dict[obj]['unique_description'], dict)
+        ]
+        if nearby_objects:
+            lines.append('nearby objects: ' + ','.join(nearby_objects))
+        # Joining with newlines keeps the caption from being glued onto the previous entry.
+        lines.append('whole description: ' + goal['caption'])
+        return '\n'.join(lines)
+
+    def answer_question(
+        self,
+        question: str,
+        instance_id: str,
+        object_dict: dict,
+        task_done: bool,
+        path_description: str,
+        mode: str,
+    ):
+        """Answer one question asked by the agent.
+
+        Args:
+            question: the agent's question.
+            instance_id: key of the goal object in ``object_dict``.
+            object_dict: per-object ground truth (room, unique_description, nearby_objects, caption).
+            task_done: whether the agent has finished the task.
+            path_description: description of the correct path to the goal object.
+            mode: 'one_turn' (one request) or 'two_turn' (classify the question, then answer it).
+        Returns:
+            The answer text, or None when no reply could be obtained from the LLM.
+        Raises:
+            ValueError: if ``mode`` is neither 'one_turn' nor 'two_turn'.
+        """
+        if mode == 'one_turn':
+            return self.ask_directly(
+                template_type="one_turn_prompt",
+                question=question,
+                goal_information=self._goal_information(object_dict, instance_id),
+                path_description=path_description,
+                task_done=task_done,
+            )
+        if mode != 'two_turn':
+            raise ValueError(f"Invalid mode: {mode}")
+
+        question_type = self.ask_directly(template_type="two_turn_prompt_0", question=question)
+        if question_type is None:  # classification failed; there is nothing to dispatch on
+            return None
+        question_type_lower = question_type.lower()
+        if 'path' in question_type_lower:
+            return path_description
+        if 'disambiguation' in question_type_lower:
+            return random.choice(DISAMBIGUATION_PROMPT['yes' if task_done else 'no'])
+        if 'information' in question_type_lower:
+            # NOTE(review): the one-turn prompt is asked twice (the first reply is fed back in as the path
+            # description) and TEMPLATE['two_turn_prompt_1'] is never used -- behavior kept; confirm intent.
+            draft = self.ask_directly(
+                template_type="one_turn_prompt",
+                question=question,
+                goal_information=self._goal_information(object_dict, instance_id),
+                path_description=path_description,
+                task_done=task_done,
+            )
+            return self.answer_question(question, instance_id, object_dict, task_done, draft, 'one_turn')
+        return question_type  # unrecognized class: the classifier's raw reply is returned as before
+
+    def ask_directly(self, template_type, **kwargs):
+        """Fill a prompt template and send it as a single user message.
+
+        Args:
+            template_type: key into ``TEMPLATE``.
+            **kwargs: values for the template placeholders. An optional ``images`` entry holds buffers
+                exposing ``getvalue()``; each is attached as a base64 PNG data URL and then closed.
+        Returns:
+            The reply text, or None if there is no client or every one of the ``max_turn`` attempts failed.
+        Raises:
+            ValueError: if ``template_type`` is not a key of ``TEMPLATE``.
+        """
+        prompt = TEMPLATE.get(template_type, None)
+        if prompt is None:
+            raise ValueError(f"Template type '{template_type}' not found.")
+        content = [{'type': 'text', 'text': prompt.format(**kwargs)}]
+        image_bufs = kwargs.get('images', None)
+        for im_id, image_buf in enumerate(image_bufs if image_bufs is not None else ()):
+            img_encoded = base64.b64encode(image_buf.getvalue()).decode('utf-8')
+            image_buf.close()
+            content.append(
+                {
+                    'type': 'image_url',
+                    'image_url': {'url': f'data:image/png;base64,{img_encoded}', 'detail': 'high'},
+                    'index': im_id,
+                }
+            )
+        messages = [{'role': 'user', 'content': content}]
+
+        if self.llm is None:
+            print('SimpleNPC has no LLM client (initialization failed); no request was sent.')
+            return None
+        result = None  # also the outcome when max_turn <= 0, which used to raise UnboundLocalError
+        cnt = 0
+        while cnt < self.max_turn:
+            try:
+                response = self.llm.chat.completions.create(
+                    model=self.model_name,
+                    messages=messages,
+                    max_tokens=2048,
+                    top_p=1,
+                    frequency_penalty=0,
+                    presence_penalty=0,
+                )
+                result = response.choices[0].message.content
+                break
+            except Exception as e:  # best-effort: report the failure and retry
+                print(e)
+                cnt += 1
+        return result
diff --git a/scripts/eval/bash/srun_eval_dialog.sh b/scripts/eval/bash/srun_eval_dialog.sh
new file mode 100755
index 00000000..704d4e9d
--- /dev/null
+++ b/scripts/eval/bash/srun_eval_dialog.sh
@@ -0,0 +1,15 @@
+# Run the dialog evaluation through srun on a single node with 1 GPU (1 task).
+export MAGNUM_LOG=quiet HABITAT_SIM_LOG=quiet
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_2,mlx5_3,mlx5_4,mlx5_5
+
+srun -p mozi_t \
+    --gres=gpu:1 \
+    --ntasks=1 \
+    --time=0-20:00:00 \
+    --ntasks-per-node=1 \
+    --cpus-per-task=8 \
+    --exclude=HOST-10-140-66-53,HOST-10-140-66-69 \
+    --kill-on-bad-exit=1 \
+    python scripts/eval/eval.py \
+      --config scripts/eval/configs/habitat_dialog_cfg.py
\ No newline at end of file
diff --git a/scripts/eval/bash/srun_eval_object.sh b/scripts/eval/bash/srun_eval_object.sh
new file mode 100755
index 00000000..2a2746d1
--- /dev/null
+++ b/scripts/eval/bash/srun_eval_object.sh
@@ -0,0 +1,15 @@
+# Run the object-navigation evaluation through srun with 8 GPUs (8 tasks) on a single node.
+export MAGNUM_LOG=quiet HABITAT_SIM_LOG=quiet
+export NCCL_SOCKET_IFNAME=bond0
+export NCCL_IB_HCA=mlx5_2,mlx5_3,mlx5_4,mlx5_5
+
+srun -p efm_t \
+    --gres=gpu:8 \
+    --ntasks=8 \
+    --time=0-20:00:00 \
+    --ntasks-per-node=8 \
+    --cpus-per-task=8 \
+    --exclude=HOST-10-140-66-53,HOST-10-140-66-69 \
+    --kill-on-bad-exit=1 \
+    python scripts/eval/eval.py \
+      --config scripts/eval/configs/habitat_object_cfg.py
\ No newline at end of file
diff --git a/scripts/eval/configs/gen_videos.yaml b/scripts/eval/configs/gen_videos.yaml
new file mode 100644
index 00000000..b431e2f9
--- /dev/null
+++ b/scripts/eval/configs/gen_videos.yaml
@@ -0,0 +1,64 @@
+exp:
+  num_environments: 1
+  dump_location: datadump_qualitative
+  exp_name: llava3d
+  no_gpu: 0
+  seed: 1
+  visualize: True
+  print_images: 0
+  num_sem_categories: 10  
+  goal_type: object
+  goal_map_only: False
+  detector: mp3d_gt
+  frame_width: 90
+  frame_height: 160
+  vocabulary: hm3d
+  task: object
+
+agent:
+  max_steps: 500
+  pointnav_stop_radius: 0.9
+  num_goal_categories: 6
+  use_vision_language_feat: False
+  model_path: checkpoints/llava-v1.5-7b-waypoint-position-p-encoder
+  conv_mode: llava_v1
+  seq: ","
+  temperature: 0.2
+  top_p: null
+  num_beams: 1
+  max_new_tokens: 512
+  semantic_map:
+    semantic_categories: hm3d_9cat # "hm3d_90cat" #map semantic channel categories ("coco_indoor", "longtail_indoor", "mukul_indoor")
+    num_sem_categories: 10           # number of map semantic channel categories (16, 257, 35)
+    agent_height_project: False
+    map_size_cm: 4800        # global map size (in centimeters)
+    map_resolution: 5        # size of map bins (in centimeters)
+    vision_range: 100        # diameter of local map region visible by the agent (in cells)
+    global_downscaling: 2    # ratio of global over local map
+    du_scale: 1              # frame downscaling before projecting to point cloud
+    cat_pred_threshold: 5.0  # number of depth points to be in bin to classify it as a certain semantic category
+    exp_pred_threshold: 1.0  # number of depth points to be in bin to consider it as explored
+    map_pred_threshold: 0.1  # number of depth points to be in bin to consider it as obstacle
+    explored_radius: 150     # radius (in centimeters) of visually explored region
+    been_close_to_radius: 200  # radius (in centimeters) of been close to region
+    must_explore_close: False
+    min_obs_height_cm: 25    # minimum height (in centimeters) of obstacle to be considered as obstacle
+    global_update_steps: 500
+    # erosion and filtering to reduce the number of spurious artifacts
+    dilate_obstacles: False
+    dilate_size: 3
+    dilate_iter: 1
+    goal_reached_dist: 20
+
+  planner:
+    collision_threshold: 0.20       # forward move distance under which we consider there's a collision (in meters)
+    min_obs_dilation_selem_radius: 3    # radius (in cells) of obstacle dilation structuring element
+    obs_dilation_selem_radius: 5    # radius (in cells) of obstacle dilation structuring element
+    goal_dilation_selem_radius: 5  # radius (in cells) of goal dilation structuring element
+    use_dilation_for_stg: True      # use dilated goals for estimating short-term goals - or just reaching
+    map_downsample_factor: 1            # optional downsampling of traversible and goal map before fmm distance call (1 for no downsampling, 2 for halving resolution)
+    map_update_frequency: 1             # compute fmm distance map every n steps 
+    step_size: 10                    # maximum distance of the short-term goal selected by the planner
+    magnify_goal_when_hard: 100
+    discrete_actions: True  
+    verbose: False
\ No newline at end of file
diff --git a/scripts/eval/configs/habitat_dialog_cfg.py b/scripts/eval/configs/habitat_dialog_cfg.py
new file mode 100755
index 00000000..3ca503c3
--- /dev/null
+++ b/scripts/eval/configs/habitat_dialog_cfg.py
@@ -0,0 +1,61 @@
+from internnav.configs.agent import AgentCfg
+from internnav.configs.evaluator import EnvCfg, EvalCfg, TaskCfg
+# Evaluation config for the "instance_dialog" task: local agent, habitat env, dialog enabled, NPC settings.
+eval_cfg = EvalCfg(
+    remote_agent = False,
+    agent=AgentCfg(
+        server_port=8087,
+        model_name='dialog',
+        ckpt_path='',
+        model_settings={
+            "mode": "system2",  # inference mode: dual_system or system2
+            "dialog_enabled": True,
+            "model_path": "checkpoints/Vlln-dialog",  # path to model checkpoint
+            "append_look_down": False,
+            "num_history": 8,
+            "resize_w": 384,  # image resize width
+            "resize_h": 384,  # image resize height
+            "max_new_tokens": 128,  # maximum number of tokens for generation
+        },
+    ),
+    env=EnvCfg(
+        env_type='habitat',
+        env_settings={
+            # habitat sim specifications - agent, sensors, tasks, measures etc. are defined in the habitat config file
+
+            'baseline_config_path': 'scripts/eval/configs/gen_videos.yaml',
+            'habitat_config_path': 'scripts/eval/configs/instance_dialog.yaml',
+            # 'habitat_config_path': 'scripts/eval/configs/objectnav_hm3d.yaml',
+            # 'habitat_config_path': 'scripts/eval/configs/instance.yaml',
+        },
+    ),
+    task=TaskCfg(
+        task_name = "instance_dialog"
+        # task_name = "object", 
+        # task_name = "instance", 
+    ),
+    eval_type="habitat_dialog",
+    eval_settings={
+        # all current parse args
+        "output_path": "./logs/habitat/dialog",  # output directory for logs/results
+        "epoch": 0,  # epoch number for logging
+        "max_steps_per_episode": 500,  # maximum steps per episode
+
+        # task setting
+        "eval_split": "easy_same",
+        # "eval_split": "val",
+        # "eval_split": "easy_same",
+        "turn": 5,
+        "save_video": False,  # whether to save videos
+
+        # npc setting
+        "base_url": 'http://35.220.164.252:3888/v1',  # NOTE(review): hard-coded endpoint address; confirm it belongs in the repo
+        "model_name": "gpt-4o",
+        "openai_api_key": 'internnav/internnav_habitat/simple_npc/api_key.txt',  # a file path, presumably opened by SimpleNPC to read the key -- verify
+        "scene_summary": 'internnav/internnav_habitat/simple_npc/scene_summary',  # NOTE(review): added in this patch as a symlink to an absolute /mnt/... path
+        
+        # distributed settings
+        "port": "2333",  # communication port
+        "dist_url": "env://",  # url for distributed setup
+    },
+)
diff --git a/scripts/eval/configs/habitat_object_cfg.py b/scripts/eval/configs/habitat_object_cfg.py
new file mode 100755
index 00000000..c28daa42
--- /dev/null
+++ b/scripts/eval/configs/habitat_object_cfg.py
@@ -0,0 +1,61 @@
+from internnav.configs.agent import AgentCfg
+from internnav.configs.evaluator import EnvCfg, EvalCfg, TaskCfg
+# Evaluation config for the "objectnav" task on objectnav_hm3d.yaml: local agent, habitat env, dialog disabled.
+eval_cfg = EvalCfg(
+    remote_agent = False,
+    agent=AgentCfg(
+        server_port=8087,
+        model_name='dialog',
+        ckpt_path='',
+        model_settings={
+            "mode": "system2",  # inference mode: dual_system or system2
+            "dialog_enabled": False,
+            "model_path": "checkpoints/Vlln-object",  # path to model checkpoint
+            "append_look_down": True,
+            "num_history": 8,
+            "resize_w": 384,  # image resize width
+            "resize_h": 384,  # image resize height
+            "max_new_tokens": 128,  # maximum number of tokens for generation
+        },
+    ),
+    env=EnvCfg(
+        env_type='habitat',
+        env_settings={
+            # habitat sim specifications - agent, sensors, tasks, measures etc. are defined in the habitat config file
+
+            'baseline_config_path': 'scripts/eval/configs/gen_videos.yaml',
+            # 'habitat_config_path': 'scripts/eval/configs/instance_dialog.yaml',
+            'habitat_config_path': 'scripts/eval/configs/objectnav_hm3d.yaml',
+            # 'habitat_config_path': 'scripts/eval/configs/instance.yaml',
+        },
+    ),
+    task=TaskCfg(
+        # task_name = "instance_dialog"
+        task_name = "objectnav", 
+        # task_name = "instance", 
+    ),
+    eval_type="habitat_dialog",
+    eval_settings={
+        # all current parse args
+        "output_path": "./logs/habitat/object",  # output directory for logs/results
+        "epoch": 0,  # epoch number for logging
+        "max_steps_per_episode": 500,  # maximum steps per episode
+
+        # task setting
+        # "eval_split": "easy_same",
+        "eval_split": "val",
+        # "eval_split": "easy_same",
+        "turn": 5,
+        "save_video": False,  # whether to save videos
+
+        # npc setting
+        "base_url": 'http://35.220.164.252:3888/v1',  # NOTE(review): hard-coded endpoint address; confirm it belongs in the repo
+        "model_name": "gpt-4o",
+        "openai_api_key": 'internnav/internnav_habitat/simple_npc/api_key.txt',  # a file path, presumably opened by SimpleNPC to read the key -- verify
+        "scene_summary": 'internnav/internnav_habitat/simple_npc/scene_summary',  # NOTE(review): added in this patch as a symlink to an absolute /mnt/... path
+        
+        # distributed settings
+        "port": "2333",  # communication port
+        "dist_url": "env://",  # url for distributed setup
+    },
+)
diff --git a/scripts/eval/configs/instance_dialog.yaml b/scripts/eval/configs/instance_dialog.yaml
new file mode 100644
index 00000000..55746512
--- /dev/null
+++ b/scripts/eval/configs/instance_dialog.yaml
@@ -0,0 +1,79 @@
+# @package _global_
+
+defaults:
+  - /habitat: habitat_config_base
+  - /habitat/task: objectnav
+  # - habitat/task/measurements: dialog_success
+  - /habitat/simulator/agents@habitat.simulator.agents.main_agent: rgbd_agent
+  - /habitat/dataset/objectnav: mp3d
+  - /habitat/task/lab_sensors:
+    - gps_sensor
+    - compass_sensor
+  - _self_
+
+habitat:
+  environment:
+    max_episode_steps: 2000
+    iterator_options:
+      max_scene_repeat_steps: 50000
+      shuffle: False
+  simulator:
+    agents:
+      main_agent:
+        sim_sensors:
+          rgb_sensor:
+            width: 640
+            height: 480
+            hfov: 79
+          depth_sensor:
+            width: 640
+            height: 480
+            hfov: 79
+            min_depth: 0.0
+            max_depth: 10.0
+    forward_step_size: 0.25
+    turn_angle: 30
+    tilt_angle: 15
+    action_space_config: "v1"
+    habitat_sim_v0:
+      allow_sliding: True
+      gpu_device_id: 0
+  task:
+    measurements:
+      distance_to_goal:
+        type: DistanceToGoal
+        distance_to: VIEW_POINTS
+      success:
+        type: Success
+        success_distance: 0.25
+      spl:
+        type: SPL
+      oracle_success:
+        type: OracleSuccess
+      oracle_navigation_error:
+        type: OracleNavigationError
+    actions:
+      stop:
+        type: StopAction
+        agent_index: 0
+      move_forward:
+        type: MoveForwardAction
+        agent_index: 0
+      turn_left:
+        type: TurnLeftAction
+        agent_index: 0
+      turn_right:
+        type: TurnRightAction
+        agent_index: 0
+      look_up:
+        type: LookUpAction
+        agent_index: 0
+      look_down:
+        type: LookDownAction
+        agent_index: 0
+
+  dataset:
+    type: dialog
+    split: unseen_mini
+    scenes_dir: data/scene_datasets/
+    data_path: data/datasets/instance_goal_dialog/unseen/final_VLLN_testset.json.gz
\ No newline at end of file
diff --git a/scripts/eval/configs/objectnav_hm3d.yaml b/scripts/eval/configs/objectnav_hm3d.yaml
new file mode 100644
index 00000000..5a2f2ef1
--- /dev/null
+++ b/scripts/eval/configs/objectnav_hm3d.yaml
@@ -0,0 +1,83 @@
+# @package _global_
+
+defaults:
+  - /habitat: habitat_config_base
+  - /habitat/task: objectnav
+  - /habitat/simulator/agents@habitat.simulator.agents.main_agent: rgbd_agent
+  - /habitat/dataset/objectnav: hm3d
+  - /habitat/task/lab_sensors:
+    - gps_sensor
+    - compass_sensor
+  - _self_
+
+habitat:
+  environment:
+    max_episode_steps: 2000
+    iterator_options:
+      max_scene_repeat_steps: 50000
+      shuffle: False
+  simulator:
+    agents:
+      main_agent:
+        sim_sensors:
+          rgb_sensor:
+            width: 640
+            height: 480
+            hfov: 79
+            position: [0, 1.25, 0]
+          depth_sensor:
+            width: 640
+            height: 480
+            hfov: 79
+            min_depth: 0.0
+            max_depth: 10.0
+            position: [0, 1.25, 0]
+        height: 1.25
+        radius: 0.18
+    forward_step_size: 0.25
+    turn_angle: 30
+    tilt_angle: 15
+    action_space_config: "v1"
+    habitat_sim_v0:
+      gpu_device_id: 0
+      allow_sliding: True
+  task:
+    measurements:
+      distance_to_goal:
+        type: DistanceToGoal
+        distance_to: VIEW_POINTS
+      success:
+        type: Success
+        success_distance: 0.25
+      spl:
+        type: SPL
+      oracle_success:
+        type: OracleSuccess
+      oracle_navigation_error:
+        type: OracleNavigationError
+    actions:
+      stop:
+        type: StopAction
+        agent_index: 0
+      move_forward:
+        type: MoveForwardAction
+        agent_index: 0
+      turn_left:
+        type: TurnLeftAction
+        agent_index: 0
+      turn_right:
+        type: TurnRightAction
+        agent_index: 0
+      look_up:
+        type: LookUpAction
+        agent_index: 0
+      look_down:
+        type: LookDownAction
+        agent_index: 0
+
+  dataset:
+    type: ObjectNav-v1
+    split: val
+    scenes_dir: data/scene_datasets/
+    data_path: data/datasets/objectnav_hm3d_v2/{split}/{split}.json.gz
+    # data_path: data/datasets/objectnav/mp3d/v1/val/val.json.gz
\ No newline at end of file