diff --git a/tools/AutoTuner/src/autotuner/distributed.py b/tools/AutoTuner/src/autotuner/distributed.py
index c5b07dc74c..2a264a4c36 100644
--- a/tools/AutoTuner/src/autotuner/distributed.py
+++ b/tools/AutoTuner/src/autotuner/distributed.py
@@ -92,14 +92,15 @@
     read_config,
     read_metrics,
     prepare_ray_server,
+    calculate_score,
+    ERROR_METRIC,
     CONSTRAINTS_SDC,
     FASTROUTE_TCL,
 )
+from autotuner.tensorboard_logger import TensorBoardLogger
 
 # Name of the final metric
 METRIC = "metric"
-# The worst of optimized metric
-ERROR_METRIC = 9e99
 # Path to the FLOW_HOME directory
 ORFS_FLOW_DIR = os.path.abspath(
     os.path.join(os.path.dirname(__file__), "../../../../flow")
@@ -172,16 +173,7 @@ def evaluate(self, metrics):
         It can change in any form to minimize the score (return value).
         Default evaluation function optimizes effective clock period.
         """
-        error = "ERR" in metrics.values()
-        not_found = "N/A" in metrics.values()
-        if error or not_found:
-            return (ERROR_METRIC, "-", "-", "-")
-        effective_clk_period = metrics["clk_period"] - metrics["worst_slack"]
-        num_drc = metrics["num_drc"]
-        gamma = effective_clk_period / 10
-        score = effective_clk_period
-        score = score * (100 / self.step_) + gamma * num_drc
-        return (score, effective_clk_period, num_drc, metrics["die_area"])
+        return calculate_score(metrics, step=self.step_)
 
     def _is_valid_config(self, config):
         """
@@ -566,6 +558,14 @@ def sweep():
     else:
         repo_dir = os.path.abspath(os.path.join(ORFS_FLOW_DIR, ".."))
     print(f"[INFO TUN-0012] Log folder {LOCAL_DIR}.")
+
+    tb_log_dir = os.path.join(LOCAL_DIR, args.experiment)
+    print(
+        f"[INFO TUN-0034] TensorBoard logging enabled. Run: tensorboard --logdir={tb_log_dir}"
+    )
+
+    tb_logger = TensorBoardLogger.remote(log_dir=tb_log_dir)
+
     queue = Queue()
     parameter_list = list()
     for name, content in config_dict.items():
@@ -581,10 +581,22 @@ def sweep():
         temp = dict()
         for value in parameter:
             temp.update(value)
-        queue.put([args, repo_dir, temp, SDC_ORIGINAL, FR_ORIGINAL, INSTALL_PATH])
+        queue.put(
+            [
+                args,
+                repo_dir,
+                temp,
+                SDC_ORIGINAL,
+                FR_ORIGINAL,
+                INSTALL_PATH,
+                tb_logger,
+            ]
+        )
     workers = [consumer.remote(queue) for _ in range(args.jobs)]
     print("[INFO TUN-0009] Waiting for results.")
     ray.get(workers)
+    ray.get(tb_logger.close.remote())
+    print(f"[INFO TUN-0035] TensorBoard events written to {tb_log_dir}")
     print("[INFO TUN-0010] Sweep complete.")
 
 
diff --git a/tools/AutoTuner/src/autotuner/tensorboard_logger.py b/tools/AutoTuner/src/autotuner/tensorboard_logger.py
new file mode 100644
index 0000000000..6e0dbf9200
--- /dev/null
+++ b/tools/AutoTuner/src/autotuner/tensorboard_logger.py
@@ -0,0 +1,67 @@
+import logging
+import os
+from typing import Any, Union
+
+import ray
+from tensorboardX import SummaryWriter
+
+from autotuner.utils import ERROR_METRIC
+
+logger = logging.getLogger(__name__)
+
+
+@ray.remote
+class TensorBoardLogger:
+    """TensorBoard logger for AutoTuner experiments"""
+
+    def __init__(self, log_dir: str):
+        os.makedirs(log_dir, exist_ok=True)
+        self.writer = SummaryWriter(log_dir=log_dir)
+        self.log_dir = log_dir
+        self.step = 0
+        logger.info(f"TensorBoard logs will be written to {log_dir}")
+
+    def log_sweep_metrics(
+        self,
+        params: dict[str, Any],
+        metrics: dict[str, Any],
+        score: float,
+        effective_clk_period: Union[float, str],
+        num_drc: Union[int, str],
+        die_area: Union[float, str],
+    ) -> None:
+        """Log metrics from a single sweep run"""
+        self.writer.add_scalar("sweep/score", score, self.step)
+
+        if isinstance(effective_clk_period, (int, float)):
+            self.writer.add_scalar(
+                "sweep/effective_clk_period", effective_clk_period, self.step
+            )
+
+        if isinstance(num_drc, (int, float)):
+            self.writer.add_scalar("sweep/num_drc", num_drc, self.step)
+
+        if isinstance(die_area, (int, float)):
+            self.writer.add_scalar("sweep/die_area", die_area, self.step)
+
+        for key, value in metrics.items():
+            if isinstance(value, (int, float)):
+                self.writer.add_scalar(f"metrics/{key}", value, self.step)
+
+        self.writer.add_hparams(
+            {
+                k: v if isinstance(v, (int, float, str, bool)) else str(v)
+                for k, v in params.items()
+            },
+            {"hparam/metric": score},
+        )
+
+        self.step += 1
+
+    def close(self) -> None:
+        """Close the TensorBoard writer and log completion message"""
+        self.writer.close()
+        logger.info(
+            f"Sweep complete. View results with: tensorboard --logdir={self.log_dir}"
+        )
+        logger.info(f"Total runs logged: {self.step}")
diff --git a/tools/AutoTuner/src/autotuner/utils.py b/tools/AutoTuner/src/autotuner/utils.py
index fadab40325..7b43e25c83 100644
--- a/tools/AutoTuner/src/autotuner/utils.py
+++ b/tools/AutoTuner/src/autotuner/utils.py
@@ -69,6 +69,24 @@
 # Name of the TCL script run before routing
 FASTROUTE_TCL = "fastroute.tcl"
 DATE = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+# The worst of optimized metric
+ERROR_METRIC = 9e99
+
+
+def calculate_score(metrics, step=1):
+    """Calculate optimization score from metrics."""
+    error = "ERR" in metrics.values()
+    not_found = "N/A" in metrics.values()
+
+    if error or not_found:
+        return (ERROR_METRIC, "-", "-", "-")
+
+    effective_clk_period = metrics["clk_period"] - metrics["worst_slack"]
+    num_drc = metrics["num_drc"]
+    gamma = effective_clk_period / 10
+    score = effective_clk_period * (100 / step) + gamma * num_drc
+
+    return (score, effective_clk_period, num_drc, metrics["die_area"])
 
 
 def write_sdc(variables, path, sdc_original, constraints_sdc):
@@ -287,6 +305,21 @@ def run_command(
         raise RuntimeError
 
 
+def calculate_trial_path(args, base_dir, flow_variant):
+    """
+    Calculate the log path and flow variant
+    """
+    flow_variant_with_experiment = f"{args.experiment}/{flow_variant}"
+    log_path = os.path.abspath(
+        os.path.join(
+            base_dir,
+            f"flow/logs/{args.platform}/{args.design}",
+            flow_variant_with_experiment,
+        )
+    )
+    return log_path, flow_variant_with_experiment
+
+
 def openroad(
     args,
     base_dir,
@@ -297,10 +330,8 @@ def openroad(
     """
     Run OpenROAD-flow-scripts with a given set of parameters.
     """
-    # Make sure path ends in a slash, i.e., is a folder
-    flow_variant = f"{args.experiment}/{flow_variant}"
-    log_path = os.path.abspath(
-        os.path.join(base_dir, f"flow/logs/{args.platform}/{args.design}", flow_variant)
+    log_path, flow_variant = calculate_trial_path(
+        args=args, base_dir=base_dir, flow_variant=flow_variant
     )
     report_path = os.path.abspath(
         os.path.join(
@@ -643,6 +674,20 @@ def openroad_distributed(
     variant=None,
 ):
     """Simple wrapper to run openroad distributed with Ray."""
+    if variant is None:
+        variant_parts = []
+        for key, value in config.items():
+            if key not in ["_SDC_FILE_PATH", "_FR_FILE_PATH"]:
+                variant_parts.append(f"{key}_{value}")
+        variant = "_".join(variant_parts) if variant_parts else ""
+    flow_variant = f"{uuid.uuid4()}-{variant}" if variant else f"{uuid.uuid4()}"
+
+    trial_path, _ = calculate_trial_path(
+        args=args, base_dir=repo_dir, flow_variant=flow_variant
+    )
+
+    os.makedirs(trial_path, exist_ok=True)
+
     config = parse_config(
         config=config,
         base_dir=repo_dir,
@@ -651,15 +696,15 @@ def openroad_distributed(
         constraints_sdc=CONSTRAINTS_SDC,
         fr_original=fr_original,
         fastroute_tcl=FASTROUTE_TCL,
+        path=trial_path,
     )
-    if variant is None:
-        variant = config.replace(" ", "_").replace("=", "_")
+
     t = time.time()
     metric_file = openroad(
         args=args,
         base_dir=repo_dir,
         parameters=config,
-        flow_variant=f"{uuid.uuid4()}-{variant}" if variant else f"{uuid.uuid4()}",
+        flow_variant=flow_variant,
         install_path=install_path,
     )
     duration = time.time() - t
@@ -669,9 +714,39 @@ def openroad_distributed(
 @ray.remote
 def consumer(queue):
-    """consumer"""
-    while not queue.empty():
-        next_item = queue.get()
-        name = next_item[1]
-        print(f"[INFO TUN-0007] Scheduling run for parameter {name}.")
-        ray.get(openroad_distributed.remote(*next_item))
-        print(f"[INFO TUN-0008] Finished run for parameter {name}.")
+    """Drain sweep jobs from ``queue``, run each one and log its score.
+
+    Every queue item is the list built by ``sweep()``:
+    ``[args, repo_dir, config, sdc, fr, install, tb_logger]``.
+    """
+    # Local import: Empty is only needed by this loop.
+    from ray.util.queue import Empty
+
+    while True:
+        try:
+            # Never block here: another consumer may take the last item
+            # between an empty() check and get(), and a worker started on an
+            # already drained queue would otherwise hang ray.get(workers).
+            item = queue.get(block=False)
+        except Empty:
+            break
+
+        args, repo_dir, config, sdc, fr, install, tb_logger = item
+        print(f"[INFO TUN-0007] Scheduling run for parameter {config}.")
+        metric_file, _ = ray.get(
+            openroad_distributed.remote(args, repo_dir, config, sdc, fr, install)
+        )
+        print(f"[INFO TUN-0008] Finished run for parameter {config}.")
+
+        metrics = read_metrics(metric_file, args.stop_stage)
+        score, effective_clk_period, num_drc, die_area = calculate_score(metrics)
+
+        ray.get(
+            tb_logger.log_sweep_metrics.remote(
+                params=config,
+                metrics=metrics,
+                score=score,
+                effective_clk_period=effective_clk_period,
+                num_drc=num_drc,
+                die_area=die_area,
+            )
+        )