diff --git a/src/sagemaker/hyperpod/cli/commands/training.py b/src/sagemaker/hyperpod/cli/commands/training.py index 25688902..8bfbee9d 100644 --- a/src/sagemaker/hyperpod/cli/commands/training.py +++ b/src/sagemaker/hyperpod/cli/commands/training.py @@ -1,16 +1,7 @@ import click -import logging -import os -import yaml -import shutil -import subprocess -from pathlib import Path from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob from sagemaker.hyperpod.common.config import Metadata -import tempfile -from typing import List, Dict, Any, Optional, Callable, get_args, get_origin, Literal from sagemaker.hyperpod.cli.training_utils import generate_click_command -from importlib.metadata import entry_points from hyperpod_pytorch_job_template.registry import SCHEMA_REGISTRY from sagemaker.hyperpod.common.telemetry.telemetry_logging import ( _hyperpod_telemetry_emitter, diff --git a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py index e44b217e..5d2c370a 100644 --- a/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py +++ b/src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py @@ -3,8 +3,8 @@ _HyperPodPytorchJob, HyperPodPytorchJobStatus ) from sagemaker.hyperpod.common.config.metadata import Metadata -from kubernetes import client, config, __version__ as kubernetes_client_version -from typing import List, Optional, ClassVar, Tuple +from kubernetes import client, config +from typing import List, Optional, ClassVar from sagemaker.hyperpod.common.utils import ( handle_exception, get_default_namespace, @@ -84,7 +84,7 @@ def create(self, debug=False): plural=PLURAL, body=config, ) - logger.info("Successfully submitted HyperPodPytorchJob!") + logger.info(f"Successfully submitted HyperPodPytorchJob '{self.metadata.name}'!") except Exception as e: logger.error(f"Failed to create HyperPodPytorchJob {self.metadata.name}!") handle_exception(e, self.metadata.name, self.metadata.namespace) @@ -131,7 +131,7 @@ def delete(self): plural=PLURAL, name=self.metadata.name, ) - logger.info(f"Successful deleted HyperPodPytorchJob!") + logger.info(f"Successful deleted HyperPodPytorchJob '{self.metadata.name}'!") except Exception as e: logger.error(f"Failed to delete HyperPodPytorchJob {self.metadata.name}!") handle_exception(e, self.metadata.name, self.metadata.namespace) diff --git a/test/integration_tests/training/cli/test_cli_training.py b/test/integration_tests/training/cli/test_cli_training.py index 4cc9dd9a..dd12f06f 100644 --- a/test/integration_tests/training/cli/test_cli_training.py +++ b/test/integration_tests/training/cli/test_cli_training.py @@ -27,16 +27,6 @@ def test_list_clusters(self, cluster_name): """Test listing clusters """ assert cluster_name - def test_set_cluster_context(self, cluster_name): - """Test setting cluster context.""" - result = execute_command([ - "hyp", "set-cluster-context", - "--cluster-name", cluster_name - ]) - assert result.returncode == 0 - context_line = result.stdout.strip().splitlines()[-1] - assert any(text in context_line for text in ["Updated context", "Added new context"]) - def test_get_cluster_context(self): """Test getting current cluster context.""" result = execute_command(["hyp", "get-cluster-context"]) diff --git a/test/integration_tests/training/sdk/test_sdk_training.py b/test/integration_tests/training/sdk/test_sdk_training.py index 970e9b62..f7dc4574 100644 --- a/test/integration_tests/training/sdk/test_sdk_training.py +++ b/test/integration_tests/training/sdk/test_sdk_training.py @@ -70,10 +70,9 @@ def test_list_jobs(self, pytorch_job): job_names = [job.metadata.name for job in jobs] assert pytorch_job.metadata.name in job_names - # def test_refresh_job(self, pytorch_job): pytorch_job.refresh() - time.sleep(15) + time.sleep(30) assert pytorch_job.status is not None, "Job status should not be None" logger.info(f"Refreshed job status:\n{yaml.dump(pytorch_job.status)}")