From cc5bb8130ae1b6a93ef0cf7aa7bd0a489f70039d Mon Sep 17 00:00:00 2001 From: wzcai99 Date: Thu, 16 Oct 2025 11:32:33 +0000 Subject: [PATCH 1/7] [FEAT] Add support for navdp finetuning --- internnav/dataset/navdp_dataset_lerobot.py | 5 +- .../model/basemodel/navdp/navdp_policy.py | 11 +-- internnav/model/encoder/navdp_backbone.py | 4 +- scripts/train/configs/navdp.py | 9 +- scripts/train/train.py | 95 +++++++++---------- 5 files changed, 62 insertions(+), 62 deletions(-) diff --git a/internnav/dataset/navdp_dataset_lerobot.py b/internnav/dataset/navdp_dataset_lerobot.py index 4437e7c..f5b3a2f 100644 --- a/internnav/dataset/navdp_dataset_lerobot.py +++ b/internnav/dataset/navdp_dataset_lerobot.py @@ -41,6 +41,7 @@ def __init__( image_size=224, scene_data_scale=1.0, trajectory_data_scale=1.0, + pixel_channel=7, debug=False, preload=False, random_digit=False, @@ -61,6 +62,7 @@ def __init__( self.trajectory_afford_path = [] self.random_digit = random_digit self.prior_sample = prior_sample + self.pixel_channel = pixel_channel self.item_cnt = 0 self.batch_size = batch_size self.batch_time_sum = 0.0 @@ -509,7 +511,8 @@ def __getitem__(self, index): camera_intrinsic, trajectory_base_extrinsic, ) - pixel_goal = np.concatenate((pixel_goal, memory_images[-1]), axis=-1) + if self.pixel_channel == 7: + pixel_goal = np.concatenate((pixel_goal, memory_images[-1]), axis=-1) pred_actions = (pred_actions[1:] - pred_actions[:-1]) * 4.0 augment_actions = (augment_actions[1:] - augment_actions[:-1]) * 4.0 diff --git a/internnav/model/basemodel/navdp/navdp_policy.py b/internnav/model/basemodel/navdp/navdp_policy.py index 6a8da42..784044d 100644 --- a/internnav/model/basemodel/navdp/navdp_policy.py +++ b/internnav/model/basemodel/navdp/navdp_policy.py @@ -51,9 +51,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): elif pretrained_model_name_or_path is None or len(pretrained_model_name_or_path) == 0: pass else: - incompatible_keys, _ = model.load_state_dict( - torch.load(pretrained_model_name_or_path)['state_dict'], strict=False - ) + incompatible_keys, _ = model.load_state_dict(torch.load(pretrained_model_name_or_path), strict=False) if len(incompatible_keys) > 0: print(f'Incompatible keys: {incompatible_keys}') @@ -66,13 +64,12 @@ def __init__(self, config: NavDPModelConfig): self.model_config = ModelCfg(**config.model_cfg['model']) else: self.model_config = config - self.config.model_cfg['il'] - self._device = torch.device(f"cuda:{config.model_cfg['local_rank']}") self.image_size = self.config.model_cfg['il']['image_size'] self.memory_size = self.config.model_cfg['il']['memory_size'] self.predict_size = self.config.model_cfg['il']['predict_size'] + self.pixel_channel = self.config.model_cfg['il']['pixel_channel'] self.temporal_depth = self.config.model_cfg['il']['temporal_depth'] self.attention_heads = self.config.model_cfg['il']['heads'] self.input_channels = self.config.model_cfg['il']['channels'] @@ -83,7 +80,9 @@ def __init__(self, config: NavDPModelConfig): self.rgbd_encoder = NavDP_RGBD_Backbone( self.image_size, self.token_dim, memory_size=self.memory_size, finetune=self.finetune, device=self._device ) - self.pixel_encoder = NavDP_PixelGoal_Backbone(self.image_size, self.token_dim, device=self._device) + self.pixel_encoder = NavDP_PixelGoal_Backbone( + self.image_size, self.token_dim, pixel_channel=self.pixel_channel, device=self._device + ) self.image_encoder = NavDP_ImageGoal_Backbone(self.image_size, self.token_dim, device=self._device) self.point_encoder = nn.Linear(3, self.token_dim) diff --git a/internnav/model/encoder/navdp_backbone.py b/internnav/model/encoder/navdp_backbone.py index cd2e879..680c67b 100644 --- a/internnav/model/encoder/navdp_backbone.py +++ b/internnav/model/encoder/navdp_backbone.py @@ -377,7 +377,7 @@ def _get_device(self): class NavDP_PixelGoal_Backbone(nn.Module): - def __init__(self, image_size=224, embed_size=512, device='cuda:0'): + def __init__(self, image_size=224, embed_size=512, pixel_channel=7, device='cuda:0'): super().__init__() if device is None: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -392,7 +392,7 @@ def __init__(self, image_size=224, embed_size=512, device='cuda:0'): self.pixelgoal_encoder = DepthAnythingV2(**model_configs['vits']) self.pixelgoal_encoder = self.pixelgoal_encoder.pretrained.float() self.pixelgoal_encoder.patch_embed.proj = nn.Conv2d( - in_channels=7, + in_channels=pixel_channel, out_channels=self.pixelgoal_encoder.patch_embed.proj.out_channels, kernel_size=self.pixelgoal_encoder.patch_embed.proj.kernel_size, stride=self.pixelgoal_encoder.patch_embed.proj.stride, diff --git a/scripts/train/configs/navdp.py b/scripts/train/configs/navdp.py index 4085f8a..7858c17 100644 --- a/scripts/train/configs/navdp.py +++ b/scripts/train/configs/navdp.py @@ -38,8 +38,8 @@ inflection_weight_coef=3.2, save_interval_epochs=5, save_filter_frozen_weights=False, - load_from_ckpt=False, - ckpt_to_load='', + load_from_ckpt=True, + ckpt_to_load='/shared/smartbot_new/caiwenzhe/InternNav/checkpoints/cross-waic-final4-125.ckpt', lmdb_map_size=1e12, dataset_r2r_root_dir='data/vln_pe/raw_data/r2r', dataset_3dgs_root_dir='', @@ -48,9 +48,10 @@ lerobot_features_dir='data/vln_pe/traj_data/r2r', camera_name='pano_camera_0', report_to='tensorboard', # wandb, tensorboard, none - dataset_navdp='data/datasets/navdp_dataset_lerobot.json', - root_dir='data/datasets/InternData-N1/vln_n1/traj_data', + dataset_navdp='./navdp_dataset_lerobot.json', + root_dir='/shared/smartbot_new/liuyu/vln-n1-minival/', image_size=224, + pixel_channel=4, scene_scale=1.0, preload=False, random_digit=False, diff --git a/scripts/train/train.py b/scripts/train/train.py index 060af53..538a9c2 100755 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -3,40 +3,41 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) import logging +import sys +from datetime import datetime from pathlib import Path -import torch.distributed as dist import torch +import torch.distributed as dist import tyro from pydantic import BaseModel from transformers import TrainerCallback, TrainingArguments from internnav.dataset.cma_lerobot_dataset import CMALerobotDataset, cma_collate_fn -from internnav.dataset.rdp_lerobot_dataset import RDP_LerobotDataset, rdp_collate_fn from internnav.dataset.navdp_dataset_lerobot import NavDP_Base_Datset, navdp_collate_fn +from internnav.dataset.rdp_lerobot_dataset import RDP_LerobotDataset, rdp_collate_fn from internnav.model import ( CMAModelConfig, CMANet, + NavDPModelConfig, + NavDPNet, RDPModelConfig, RDPNet, Seq2SeqModelConfig, Seq2SeqNet, - NavDPNet, - NavDPModelConfig, ) from internnav.model.utils.logger import MyLogger from internnav.model.utils.utils import load_dataset -from internnav.trainer import CMATrainer, RDPTrainer, NavDPTrainer +from internnav.trainer import CMATrainer, NavDPTrainer, RDPTrainer from scripts.train.configs import ( cma_exp_cfg, cma_plus_exp_cfg, + navdp_exp_cfg, rdp_exp_cfg, seq2seq_exp_cfg, seq2seq_plus_exp_cfg, - navdp_exp_cfg, ) -import sys -from datetime import datetime + class TrainCfg(BaseModel): """Training configuration class""" @@ -68,16 +69,16 @@ def on_save(self, args, state, control, **kwargs): def _make_dir(config): - config.tensorboard_dir = config.tensorboard_dir % config.name + config.tensorboard_dir = config.tensorboard_dir % config.name config.checkpoint_folder = config.checkpoint_folder % config.name config.log_dir = config.log_dir % config.name config.output_dir = config.output_dir % config.name if not os.path.exists(config.tensorboard_dir): - os.makedirs(config.tensorboard_dir,exist_ok=True) + os.makedirs(config.tensorboard_dir, exist_ok=True) if not os.path.exists(config.checkpoint_folder): - os.makedirs(config.checkpoint_folder,exist_ok=True) + os.makedirs(config.checkpoint_folder, exist_ok=True) if not os.path.exists(config.log_dir): - os.makedirs(config.log_dir,exist_ok=True) + os.makedirs(config.log_dir, exist_ok=True) def main(config, model_class, model_config_class): @@ -85,12 +86,12 @@ def main(config, model_class, model_config_class): """Main training function.""" _make_dir(config) - print(f"=== Start training ===") + print("=== Start training ===") print(f"Current time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") print(f"PyTorch version: {torch.__version__}") print(f"CUDA available: {torch.cuda.is_available()}") print(f"CUDA device count: {torch.cuda.device_count()}") - print(f"Environment variables:") + print("Environment variables:") print(f" RANK: {os.getenv('RANK', 'Not set')}") print(f" LOCAL_RANK: {os.getenv('LOCAL_RANK', 'Not set')}") print(f" WORLD_SIZE: {os.getenv('WORLD_SIZE', 'Not set')}") @@ -101,28 +102,23 @@ def main(config, model_class, model_config_class): local_rank = int(os.getenv('LOCAL_RANK', '0')) world_size = int(os.getenv('WORLD_SIZE', '1')) rank = int(os.getenv('RANK', '0')) - + # Set CUDA device for each process device_id = local_rank torch.cuda.set_device(device_id) device = torch.device(f'cuda:{device_id}') print(f"World size: {world_size}, Local rank: {local_rank}, Global rank: {rank}") - + # Initialize distributed training environment if world_size > 1: try: - dist.init_process_group( - backend='nccl', - init_method='env://', - world_size=world_size, - rank=rank - ) + dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank) print("Distributed initialization SUCCESS") except Exception as e: print(f"Distributed initialization FAILED: {str(e)}") world_size = 1 - print("="*50) + print("=" * 50) print("After distributed init:") print(f"LOCAL_RANK: {local_rank}") print(f"WORLD_SIZE: {world_size}") @@ -150,26 +146,24 @@ def main(config, model_class, model_config_class): if buffer.device != device: print(f"Buffer {name} is on wrong device {buffer.device}, should be moved to {device}") buffer.data = buffer.data.to(device) - + # If distributed training, wrap the model with DDP if world_size > 1: model = torch.nn.parallel.DistributedDataParallel( - model, - device_ids=[local_rank], - output_device=local_rank, - find_unused_parameters=True + model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True ) # ------------ load logger ------------ train_logger_filename = os.path.join(config.log_dir, 'train.log') if dist.is_initialized() and dist.get_rank() == 0: train_logger = MyLogger( - name='train', level=logging.INFO, format_str='%(asctime)-15s %(message)s', filename=train_logger_filename + name='train', + level=logging.INFO, + format_str='%(asctime)-15s %(message)s', + filename=train_logger_filename, ) else: # Other processes use console logging - train_logger = MyLogger( - name='train', level=logging.INFO, format_str='%(asctime)-15s %(message)s' - ) + train_logger = MyLogger(name='train', level=logging.INFO, format_str='%(asctime)-15s %(message)s') transformers_logger = logging.getLogger("transformers") if transformers_logger.hasHandlers(): transformers_logger.handlers = [] @@ -177,19 +171,21 @@ def main(config, model_class, model_config_class): transformers_logger.addHandler(train_logger.handlers[0]) transformers_logger.setLevel(logging.INFO) - # ------------ load dataset ------------ if config.model_name == "navdp": - train_dataset_data = NavDP_Base_Datset(config.il.root_dir, - config.il.dataset_navdp, - config.il.memory_size, - config.il.predict_size, - config.il.batch_size, - config.il.image_size, - config.il.scene_scale, - preload = config.il.preload, - random_digit = config.il.random_digit, - prior_sample = config.il.prior_sample) + train_dataset_data = NavDP_Base_Datset( + config.il.root_dir, + config.il.dataset_navdp, + config.il.memory_size, + config.il.predict_size, + config.il.batch_size, + config.il.image_size, + config.il.scene_scale, + pixel_channel=config.il.pixel_channel, + preload=config.il.preload, + random_digit=config.il.random_digit, + prior_sample=config.il.prior_sample, + ) else: if '3dgs' in config.il.lmdb_features_dir or '3dgs' in config.il.lmdb_features_dir: dataset_root_dir = config.il.dataset_six_floor_root_dir @@ -223,7 +219,7 @@ def main(config, model_class, model_config_class): config, config.il.lerobot_features_dir, dataset_data=train_dataset_data, - batch_size=config.il.batch_size, + batch_size=config.il.batch_size, ) collate_fn = rdp_collate_fn(global_batch_size=global_batch_size) elif config.model_name == 'navdp': @@ -238,7 +234,7 @@ def main(config, model_class, model_config_class): remove_unused_columns=False, deepspeed='', gradient_checkpointing=False, - bf16=False,#fp16=False, + bf16=False, # fp16=False, tf32=False, per_device_train_batch_size=config.il.batch_size, gradient_accumulation_steps=1, @@ -249,7 +245,7 @@ def main(config, model_class, model_config_class): lr_scheduler_type='cosine', logging_steps=10.0, num_train_epochs=config.il.epochs, - save_strategy='epoch',# no + save_strategy='epoch', # no save_steps=config.il.save_interval_epochs, save_total_limit=8, report_to=config.il.report_to, @@ -260,7 +256,7 @@ def main(config, model_class, model_config_class): torch_compile_mode=None, dataloader_drop_last=True, disable_tqdm=True, - log_level="info" + log_level="info", ) # Create the trainer @@ -279,14 +275,15 @@ def main(config, model_class, model_config_class): handler.flush() except Exception as e: import traceback + print(f"Unhandled exception: {str(e)}") print("Stack trace:") traceback.print_exc() - + # If distributed environment, ensure all processes exit if dist.is_initialized(): dist.destroy_process_group() - + raise From ce6d62d23409b3d63e3ea46411be54a1ee9f9b2b Mon Sep 17 00:00:00 2001 From: wzcai99 Date: Wed, 29 Oct 2025 10:47:42 +0000 Subject: [PATCH 2/7] [FIX] NavDP Training Gradient --- internnav/model/basemodel/navdp/navdp_policy.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/internnav/model/basemodel/navdp/navdp_policy.py b/internnav/model/basemodel/navdp/navdp_policy.py index 784044d..1d16017 100644 --- a/internnav/model/basemodel/navdp/navdp_policy.py +++ b/internnav/model/basemodel/navdp/navdp_policy.py @@ -87,9 +87,9 @@ def __init__(self, config: NavDPModelConfig): self.point_encoder = nn.Linear(3, self.token_dim) if not self.finetune: - for p in self.rgbd_encoder.parameters(): + for p in self.rgbd_encoder.rgb_model.parameters(): p.requires_grad = False - self.rgbd_encoder.eval() + self.rgbd_encoder.rgb_model.eval() decoder_layer = nn.TransformerDecoderLayer( d_model=self.token_dim, @@ -348,3 +348,7 @@ def predict_nogoal_batch_action_vel(self, input_images, input_depths, sample_num negative_trajectory = torch.cumsum(naction / 4.0, dim=1)[(critic_values).argsort()[0:8]] positive_trajectory = torch.cumsum(naction / 4.0, dim=1)[(-critic_values).argsort()[0:8]] return negative_trajectory, positive_trajectory + + +# if __name__ == "__main__": +# policy = NavDPNet(config=) \ No newline at end of file From 656626108e3e75536c9a7bd8563229e8baebb77f Mon Sep 17 00:00:00 2001 From: wzcai99 Date: Mon, 3 Nov 2025 06:30:03 +0000 Subject: [PATCH 3/7] [FIX] Support NavDP finetune --- scripts/train/configs/navdp.py | 9 ++++----- scripts/train/train.py | 11 ++++------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/scripts/train/configs/navdp.py b/scripts/train/configs/navdp.py index 7858c17..4085f8a 100644 --- a/scripts/train/configs/navdp.py +++ b/scripts/train/configs/navdp.py @@ -38,8 +38,8 @@ inflection_weight_coef=3.2, save_interval_epochs=5, save_filter_frozen_weights=False, - load_from_ckpt=True, - ckpt_to_load='/shared/smartbot_new/caiwenzhe/InternNav/checkpoints/cross-waic-final4-125.ckpt', + load_from_ckpt=False, + ckpt_to_load='', lmdb_map_size=1e12, dataset_r2r_root_dir='data/vln_pe/raw_data/r2r', dataset_3dgs_root_dir='', @@ -48,10 +48,9 @@ lerobot_features_dir='data/vln_pe/traj_data/r2r', camera_name='pano_camera_0', report_to='tensorboard', # wandb, tensorboard, none - dataset_navdp='./navdp_dataset_lerobot.json', - root_dir='/shared/smartbot_new/liuyu/vln-n1-minival/', + dataset_navdp='data/datasets/navdp_dataset_lerobot.json', + root_dir='data/datasets/InternData-N1/vln_n1/traj_data', image_size=224, - pixel_channel=4, scene_scale=1.0, preload=False, random_digit=False, diff --git a/scripts/train/train.py b/scripts/train/train.py index 538a9c2..d6745a9 100755 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -16,13 +16,10 @@ from internnav.dataset.cma_lerobot_dataset import CMALerobotDataset, cma_collate_fn from internnav.dataset.navdp_dataset_lerobot import NavDP_Base_Datset, navdp_collate_fn from internnav.dataset.rdp_lerobot_dataset import RDP_LerobotDataset, rdp_collate_fn -from internnav.model import ( - CMAModelConfig, - CMANet, - NavDPModelConfig, - NavDPNet, - RDPModelConfig, - RDPNet, +from internnav.model.basemodel.cma.cma_policy import CMAModelConfig, CMANet +from internnav.model.basemodel.navdp.navdp_policy import NavDPModelConfig, NavDPNet +from internnav.model.basemodel.rdp.rdp_policy import RDPModelConfig, RDPNet +from internnav.model.basemodel.seq2seq.seq2seq_policy import ( Seq2SeqModelConfig, Seq2SeqNet, ) From b73c85e03b8451a3f3b4b4901073e7b340eb37ff Mon Sep 17 00:00:00 2001 From: wzcai99 Date: Mon, 3 Nov 2025 07:30:52 +0000 Subject: [PATCH 4/7] [FIX] update navdp training parameters --- .../model/basemodel/navdp/navdp_policy.py | 14 +++++--- scripts/train/configs/navdp.py | 3 +- scripts/train/train.py | 36 +------------------ 3 files changed, 13 insertions(+), 40 deletions(-) diff --git a/internnav/model/basemodel/navdp/navdp_policy.py b/internnav/model/basemodel/navdp/navdp_policy.py index 1d16017..f08f131 100644 --- a/internnav/model/basemodel/navdp/navdp_policy.py +++ b/internnav/model/basemodel/navdp/navdp_policy.py @@ -7,7 +7,13 @@ from internnav.configs.model.base_encoders import ModelCfg from internnav.configs.trainer.exp import ExpCfg -from internnav.model.encoder.navdp_backbone import * +from internnav.model.encoder.navdp_backbone import ( + LearnablePositionalEncoding, + NavDP_ImageGoal_Backbone, + NavDP_PixelGoal_Backbone, + NavDP_RGBD_Backbone, + SinusoidalPosEmb, +) class NavDPModelConfig(PretrainedConfig): @@ -324,7 +330,7 @@ def predict_pointgoal_batch_action_vel(self, goal_point, input_images, input_dep naction = self.noise_scheduler.step(model_output=noise_pred, timestep=k, sample=naction).prev_sample critic_values = self.predict_critic(naction, rgbd_embed) - all_trajectory = torch.cumsum(naction / 4.0, dim=1) + # all_trajectory = torch.cumsum(naction / 4.0, dim=1) negative_trajectory = torch.cumsum(naction / 4.0, dim=1)[(critic_values).argsort()[0:8]] positive_trajectory = torch.cumsum(naction / 4.0, dim=1)[(-critic_values).argsort()[0:8]] @@ -343,7 +349,7 @@ def predict_nogoal_batch_action_vel(self, input_images, input_depths, sample_num naction = self.noise_scheduler.step(model_output=noise_pred, timestep=k, sample=naction).prev_sample critic_values = self.predict_critic(naction, rgbd_embed) - all_trajectory = torch.cumsum(naction / 4.0, dim=1) + # all_trajectory = torch.cumsum(naction / 4.0, dim=1) negative_trajectory = torch.cumsum(naction / 4.0, dim=1)[(critic_values).argsort()[0:8]] positive_trajectory = torch.cumsum(naction / 4.0, dim=1)[(-critic_values).argsort()[0:8]] @@ -351,4 +357,4 @@ def predict_nogoal_batch_action_vel(self, input_images, input_depths, sample_num # if __name__ == "__main__": -# policy = NavDPNet(config=) \ No newline at end of file +# policy = NavDPNet(config=) diff --git a/scripts/train/configs/navdp.py b/scripts/train/configs/navdp.py index 4085f8a..4329da2 100644 --- a/scripts/train/configs/navdp.py +++ b/scripts/train/configs/navdp.py @@ -29,7 +29,7 @@ ), il=IlCfg( epochs=1000, - batch_size=16, + batch_size=32, lr=1e-4, num_workers=8, weight_decay=1e-4, # TODO @@ -57,6 +57,7 @@ prior_sample=False, memory_size=8, predict_size=24, + pixel_channel=4, temporal_depth=16, heads=8, token_dim=384, diff --git a/scripts/train/train.py b/scripts/train/train.py index cc9f998..0c254dd 100755 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -60,20 +60,16 @@ def on_save(self, args, state, control, **kwargs): def _make_dir(config): - config.tensorboard_dir = config.tensorboard_dir % config.name config.tensorboard_dir = config.tensorboard_dir % config.name config.checkpoint_folder = config.checkpoint_folder % config.name config.log_dir = config.log_dir % config.name config.output_dir = config.output_dir % config.name if not os.path.exists(config.tensorboard_dir): os.makedirs(config.tensorboard_dir, exist_ok=True) - os.makedirs(config.tensorboard_dir, exist_ok=True) if not os.path.exists(config.checkpoint_folder): os.makedirs(config.checkpoint_folder, exist_ok=True) - os.makedirs(config.checkpoint_folder, exist_ok=True) if not os.path.exists(config.log_dir): os.makedirs(config.log_dir, exist_ok=True) - os.makedirs(config.log_dir, exist_ok=True) def main(config, model_class, model_config_class): @@ -98,14 +94,12 @@ def main(config, model_class, model_config_class): world_size = int(os.getenv('WORLD_SIZE', '1')) rank = int(os.getenv('RANK', '0')) - # Set CUDA device for each process device_id = local_rank torch.cuda.set_device(device_id) device = torch.device(f'cuda:{device_id}') print(f"World size: {world_size}, Local rank: {local_rank}, Global rank: {rank}") - # Initialize distributed training environment if world_size > 1: try: @@ -116,7 +110,6 @@ def main(config, model_class, model_config_class): print(f"Distributed initialization FAILED: {str(e)}") world_size = 1 - print("=" * 50) print("=" * 50) print("After distributed init:") print(f"LOCAL_RANK: {local_rank}") @@ -146,13 +139,10 @@ def main(config, model_class, model_config_class): print(f"Buffer {name} is on wrong device {buffer.device}, should be moved to {device}") buffer.data = buffer.data.to(device) - # If distributed training, wrap the model with DDP if world_size > 1: model = torch.nn.parallel.DistributedDataParallel( - model, device_ids=[local_rank], - output_device=local_rank, - find_unused_parameters=True + model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True ) # ------------ load logger ------------ train_logger_filename = os.path.join(config.log_dir, 'train.log') @@ -162,15 +152,10 @@ def main(config, model_class, model_config_class): level=logging.INFO, format_str='%(asctime)-15s %(message)s', filename=train_logger_filename, - name='train', - level=logging.INFO, - format_str='%(asctime)-15s %(message)s', - filename=train_logger_filename, ) else: # Other processes use console logging train_logger = MyLogger(name='train', level=logging.INFO, format_str='%(asctime)-15s %(message)s') - train_logger = MyLogger(name='train', level=logging.INFO, format_str='%(asctime)-15s %(message)s') transformers_logger = logging.getLogger("transformers") if transformers_logger.hasHandlers(): transformers_logger.handlers = [] @@ -180,18 +165,6 @@ def main(config, model_class, model_config_class): # ------------ load dataset ------------ if config.model_name == "navdp": - train_dataset_data = NavDP_Base_Datset( - config.il.root_dir, - config.il.dataset_navdp, - config.il.memory_size, - config.il.predict_size, - config.il.batch_size, - config.il.image_size, - config.il.scene_scale, - preload=config.il.preload, - random_digit=config.il.random_digit, - prior_sample=config.il.prior_sample, - ) train_dataset_data = NavDP_Base_Datset( config.il.root_dir, config.il.dataset_navdp, @@ -239,7 +212,6 @@ def main(config, model_class, model_config_class): config.il.lerobot_features_dir, dataset_data=train_dataset_data, batch_size=config.il.batch_size, - batch_size=config.il.batch_size, ) collate_fn = rdp_collate_fn(global_batch_size=global_batch_size) elif config.model_name == 'navdp': @@ -255,7 +227,6 @@ def main(config, model_class, model_config_class): deepspeed='', gradient_checkpointing=False, bf16=False, # fp16=False, - bf16=False, # fp16=False, tf32=False, per_device_train_batch_size=config.il.batch_size, gradient_accumulation_steps=1, @@ -267,7 +238,6 @@ def main(config, model_class, model_config_class): logging_steps=10.0, num_train_epochs=config.il.epochs, save_strategy='epoch', # no - save_strategy='epoch', # no save_steps=config.il.save_interval_epochs, save_total_limit=8, report_to=config.il.report_to, @@ -279,7 +249,6 @@ def main(config, model_class, model_config_class): dataloader_drop_last=True, disable_tqdm=True, log_level="info", - log_level="info", ) # Create the trainer @@ -299,17 +268,14 @@ def main(config, model_class, model_config_class): except Exception as e: import traceback - print(f"Unhandled exception: {str(e)}") print("Stack trace:") traceback.print_exc() - # If distributed environment, ensure all processes exit if dist.is_initialized(): dist.destroy_process_group() - raise From 1545c203fe8169643f1ae7a979b5290a84e7320e Mon Sep 17 00:00:00 2001 From: wzcai99 Date: Wed, 5 Nov 2025 03:46:46 +0000 Subject: [PATCH 5/7] [FIX] Support NavDP finetuning --- internnav/dataset/navdp_dataset_lerobot.py | 4 ++ .../model/basemodel/navdp/navdp_policy.py | 41 +++++-------------- scripts/train/train.py | 1 - 3 files changed, 14 insertions(+), 32 deletions(-) diff --git a/internnav/dataset/navdp_dataset_lerobot.py b/internnav/dataset/navdp_dataset_lerobot.py index f5b3a2f..95753d9 100644 --- a/internnav/dataset/navdp_dataset_lerobot.py +++ b/internnav/dataset/navdp_dataset_lerobot.py @@ -511,6 +511,10 @@ def __getitem__(self, index): camera_intrinsic, trajectory_base_extrinsic, ) + # pixel channel == 7 represents the navdp works pixel navigation under asynchronous pace, + # pixel_mask (1), the history image with the assigned pixel goal (3), current image (3) + # if pixel_channel == 4, pixel goal is assigned at current frame, therefore, + # only pixel_mask (1) and current image (3) are needed if self.pixel_channel == 7: pixel_goal = np.concatenate((pixel_goal, memory_images[-1]), axis=-1) diff --git a/internnav/model/basemodel/navdp/navdp_policy.py b/internnav/model/basemodel/navdp/navdp_policy.py index f08f131..a166473 100644 --- a/internnav/model/basemodel/navdp/navdp_policy.py +++ b/internnav/model/basemodel/navdp/navdp_policy.py @@ -7,13 +7,15 @@ from internnav.configs.model.base_encoders import ModelCfg from internnav.configs.trainer.exp import ExpCfg +from internnav.model.encoder.navdp_backbone import LearnablePositionalEncoding from internnav.model.encoder.navdp_backbone import ( - LearnablePositionalEncoding, - NavDP_ImageGoal_Backbone, - NavDP_PixelGoal_Backbone, - NavDP_RGBD_Backbone, - SinusoidalPosEmb, + NavDP_ImageGoal_Backbone as ImageGoal_Backbone, ) +from internnav.model.encoder.navdp_backbone import ( + NavDP_PixelGoal_Backbone as PixelGoal_Backbone, +) +from internnav.model.encoder.navdp_backbone import NavDP_RGBD_Backbone as RGBD_Backbone +from internnav.model.encoder.navdp_backbone import SinusoidalPosEmb class NavDPModelConfig(PretrainedConfig): @@ -83,13 +85,13 @@ def __init__(self, config: NavDPModelConfig): self.token_dim = self.config.model_cfg['il']['token_dim'] self.scratch = self.config.model_cfg['il']['scratch'] self.finetune = self.config.model_cfg['il']['finetune'] - self.rgbd_encoder = NavDP_RGBD_Backbone( + self.rgbd_encoder = RGBD_Backbone( self.image_size, self.token_dim, memory_size=self.memory_size, finetune=self.finetune, device=self._device ) - self.pixel_encoder = NavDP_PixelGoal_Backbone( + self.pixel_encoder = PixelGoal_Backbone( self.image_size, self.token_dim, pixel_channel=self.pixel_channel, device=self._device ) - self.image_encoder = NavDP_ImageGoal_Backbone(self.image_size, self.token_dim, device=self._device) + self.image_encoder = ImageGoal_Backbone(self.image_size, self.token_dim, device=self._device) self.point_encoder = nn.Linear(3, self.token_dim) if not self.finetune: @@ -185,23 +187,6 @@ def predict_critic(self, predict_trajectory, rgbd_embed): return critic_output def forward(self, goal_point, goal_image, goal_pixel, input_images, input_depths, output_actions, augment_actions): - # """get device safely""" - # # get device safely - # try: - # # try to get device through model parameters - # device = next(self.parameters()).device - # except StopIteration: - # # model has no parameters, use the default device - # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - # # move all inputs to model device - # goal_point = goal_point.to(device) - # goal_image = goal_image.to(device) - # input_images = input_images.to(device) - # input_depths = input_depths.to(device) - # output_actions = output_actions.to(device) - # augment_actions = augment_actions.to(device) - # device = self._device - # print(f"self.parameters() is:{self.parameters()}") device = next(self.parameters()).device assert input_images.shape[1] == self.memory_size @@ -330,7 +315,6 @@ def predict_pointgoal_batch_action_vel(self, goal_point, input_images, input_dep naction = self.noise_scheduler.step(model_output=noise_pred, timestep=k, sample=naction).prev_sample critic_values = self.predict_critic(naction, rgbd_embed) - # all_trajectory = torch.cumsum(naction / 4.0, dim=1) negative_trajectory = torch.cumsum(naction / 4.0, dim=1)[(critic_values).argsort()[0:8]] positive_trajectory = torch.cumsum(naction / 4.0, dim=1)[(-critic_values).argsort()[0:8]] @@ -349,12 +333,7 @@ def predict_nogoal_batch_action_vel(self, input_images, input_depths, sample_num naction = self.noise_scheduler.step(model_output=noise_pred, timestep=k, sample=naction).prev_sample critic_values = self.predict_critic(naction, rgbd_embed) - # all_trajectory = torch.cumsum(naction / 4.0, dim=1) negative_trajectory = torch.cumsum(naction / 4.0, dim=1)[(critic_values).argsort()[0:8]] positive_trajectory = torch.cumsum(naction / 4.0, dim=1)[(-critic_values).argsort()[0:8]] return negative_trajectory, positive_trajectory - - -# if __name__ == "__main__": -# policy = NavDPNet(config=) diff --git a/scripts/train/train.py b/scripts/train/train.py index 0c254dd..d2cc965 100755 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -103,7 +103,6 @@ def main(config, model_class, model_config_class): # Initialize distributed training environment if world_size > 1: try: - dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank) dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank) print("Distributed initialization SUCCESS") except Exception as e: From 644b59d07ee6bd1cad5a3d07bf038e657c5681ec Mon Sep 17 00:00:00 2001 From: wzcai99 Date: Thu, 6 Nov 2025 07:08:19 +0000 Subject: [PATCH 6/7] modify the class name for NavDP --- internnav/model/basemodel/navdp/navdp_policy.py | 12 +++++------- internnav/model/encoder/navdp_backbone.py | 6 +++--- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/internnav/model/basemodel/navdp/navdp_policy.py b/internnav/model/basemodel/navdp/navdp_policy.py index a166473..9ac26b9 100644 --- a/internnav/model/basemodel/navdp/navdp_policy.py +++ b/internnav/model/basemodel/navdp/navdp_policy.py @@ -7,15 +7,13 @@ from internnav.configs.model.base_encoders import ModelCfg from internnav.configs.trainer.exp import ExpCfg -from internnav.model.encoder.navdp_backbone import LearnablePositionalEncoding from internnav.model.encoder.navdp_backbone import ( - NavDP_ImageGoal_Backbone as ImageGoal_Backbone, + ImageGoal_Backbone, + LearnablePositionalEncoding, + PixelGoal_Backbone, + RGBD_Backbone, + SinusoidalPosEmb, ) -from internnav.model.encoder.navdp_backbone import ( - NavDP_PixelGoal_Backbone as PixelGoal_Backbone, -) -from internnav.model.encoder.navdp_backbone import NavDP_RGBD_Backbone as RGBD_Backbone -from internnav.model.encoder.navdp_backbone import SinusoidalPosEmb class NavDPModelConfig(PretrainedConfig): diff --git a/internnav/model/encoder/navdp_backbone.py b/internnav/model/encoder/navdp_backbone.py index 680c67b..8f2a9b8 100644 --- a/internnav/model/encoder/navdp_backbone.py +++ b/internnav/model/encoder/navdp_backbone.py @@ -202,7 +202,7 @@ def forward(self, images, depths): return memory_token -class NavDP_RGBD_Backbone(nn.Module): +class RGBD_Backbone(nn.Module): def __init__( self, image_size=224, @@ -313,7 +313,7 @@ def _get_device(self): return torch.device("cuda" if torch.cuda.is_available() else "cpu") -class NavDP_ImageGoal_Backbone(nn.Module): +class ImageGoal_Backbone(nn.Module): def __init__(self, image_size=224, embed_size=512, device='cuda:0'): super().__init__() if device is None: @@ -376,7 +376,7 @@ def _get_device(self): return torch.device("cuda" if torch.cuda.is_available() else "cpu") -class NavDP_PixelGoal_Backbone(nn.Module): +class PixelGoal_Backbone(nn.Module): def __init__(self, image_size=224, embed_size=512, pixel_channel=7, device='cuda:0'): super().__init__() if device is None: From 8e9574e3329c50da8d417cad23ef8edb283a39be Mon Sep 17 00:00:00 2001 From: wzcai99 Date: Fri, 7 Nov 2025 06:15:59 +0000 Subject: [PATCH 7/7] update class name --- internnav/model/basemodel/navdp/navdp_policy.py | 12 ++++++------ internnav/model/encoder/navdp_backbone.py | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/internnav/model/basemodel/navdp/navdp_policy.py b/internnav/model/basemodel/navdp/navdp_policy.py index 9ac26b9..e235352 100644 --- a/internnav/model/basemodel/navdp/navdp_policy.py +++ b/internnav/model/basemodel/navdp/navdp_policy.py @@ -8,10 +8,10 @@ from internnav.configs.model.base_encoders import ModelCfg from internnav.configs.trainer.exp import ExpCfg from internnav.model.encoder.navdp_backbone import ( - ImageGoal_Backbone, + ImageGoalBackbone, LearnablePositionalEncoding, - PixelGoal_Backbone, - RGBD_Backbone, + PixelGoalBackbone, + RGBDBackbone, SinusoidalPosEmb, ) @@ -83,13 +83,13 @@ def __init__(self, config: NavDPModelConfig): self.token_dim = self.config.model_cfg['il']['token_dim'] self.scratch = self.config.model_cfg['il']['scratch'] self.finetune = self.config.model_cfg['il']['finetune'] - self.rgbd_encoder = RGBD_Backbone( + self.rgbd_encoder = RGBDBackbone( self.image_size, self.token_dim, memory_size=self.memory_size, finetune=self.finetune, device=self._device ) - self.pixel_encoder = PixelGoal_Backbone( + self.pixel_encoder = PixelGoalBackbone( self.image_size, self.token_dim, pixel_channel=self.pixel_channel, device=self._device ) - self.image_encoder = ImageGoal_Backbone(self.image_size, self.token_dim, device=self._device) + self.image_encoder = ImageGoalBackbone(self.image_size, self.token_dim, device=self._device) self.point_encoder = nn.Linear(3, self.token_dim) if not self.finetune: diff --git a/internnav/model/encoder/navdp_backbone.py b/internnav/model/encoder/navdp_backbone.py index 8f2a9b8..34cb2c9 100644 --- a/internnav/model/encoder/navdp_backbone.py +++ b/internnav/model/encoder/navdp_backbone.py @@ -202,7 +202,7 @@ def forward(self, images, depths): return memory_token -class RGBD_Backbone(nn.Module): +class RGBDBackbone(nn.Module): def __init__( self, image_size=224, @@ -313,7 +313,7 @@ def _get_device(self): return torch.device("cuda" if torch.cuda.is_available() else "cpu") -class ImageGoal_Backbone(nn.Module): +class ImageGoalBackbone(nn.Module): def __init__(self, image_size=224, embed_size=512, device='cuda:0'): super().__init__() if device is None: @@ -376,7 +376,7 @@ def _get_device(self): return torch.device("cuda" if torch.cuda.is_available() else "cpu") -class PixelGoal_Backbone(nn.Module): +class PixelGoalBackbone(nn.Module): def __init__(self, image_size=224, embed_size=512, pixel_channel=7, device='cuda:0'): super().__init__() if device is None: