Merged
38 changes: 25 additions & 13 deletions tools/AutoTuner/src/autotuner/distributed.py
@@ -92,14 +92,15 @@
read_config,
read_metrics,
prepare_ray_server,
calculate_score,
ERROR_METRIC,
CONSTRAINTS_SDC,
FASTROUTE_TCL,
)
from autotuner.tensorboard_logger import TensorBoardLogger

# Name of the final metric
METRIC = "metric"
# The worst of optimized metric
ERROR_METRIC = 9e99
# Path to the FLOW_HOME directory
ORFS_FLOW_DIR = os.path.abspath(
os.path.join(os.path.dirname(__file__), "../../../../flow")
@@ -172,16 +173,7 @@ def evaluate(self, metrics):
It can change in any form to minimize the score (return value).
Default evaluation function optimizes effective clock period.
"""
error = "ERR" in metrics.values()
not_found = "N/A" in metrics.values()
if error or not_found:
return (ERROR_METRIC, "-", "-", "-")
effective_clk_period = metrics["clk_period"] - metrics["worst_slack"]
num_drc = metrics["num_drc"]
gamma = effective_clk_period / 10
score = effective_clk_period
score = score * (100 / self.step_) + gamma * num_drc
return (score, effective_clk_period, num_drc, metrics["die_area"])
return calculate_score(metrics, step=self.step_)

def _is_valid_config(self, config):
"""
Expand Down Expand Up @@ -566,6 +558,14 @@ def sweep():
else:
repo_dir = os.path.abspath(os.path.join(ORFS_FLOW_DIR, ".."))
print(f"[INFO TUN-0012] Log folder {LOCAL_DIR}.")

tb_log_dir = os.path.join(LOCAL_DIR, args.experiment)
print(
f"[INFO TUN-0034] TensorBoard logging enabled. Run: tensorboard --logdir={tb_log_dir}"
)

tb_logger = TensorBoardLogger.remote(log_dir=tb_log_dir)

queue = Queue()
parameter_list = list()
for name, content in config_dict.items():
@@ -581,10 +581,22 @@
temp = dict()
for value in parameter:
temp.update(value)
queue.put([args, repo_dir, temp, SDC_ORIGINAL, FR_ORIGINAL, INSTALL_PATH])
queue.put(
[
args,
repo_dir,
temp,
SDC_ORIGINAL,
FR_ORIGINAL,
INSTALL_PATH,
tb_logger,
]
)
workers = [consumer.remote(queue) for _ in range(args.jobs)]
print("[INFO TUN-0009] Waiting for results.")
ray.get(workers)
ray.get(tb_logger.close.remote())
print(f"[INFO TUN-0035] TensorBoard events written to {tb_log_dir}")
print("[INFO TUN-0010] Sweep complete.")


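The refactored sweep() above hands the same TensorBoardLogger actor handle to every queued work item, and the consumer pulls it back off the queue. The underlying Ray pattern (an actor handle travels through a ray.util.queue.Queue, and any worker that pulls it can call methods on the shared actor) is sketched below with illustrative names that are not part of the AutoTuner code:

import ray
from ray.util.queue import Empty, Queue

ray.init()


@ray.remote
class SharedLogger:
    """Stands in for TensorBoardLogger: one actor shared by all workers."""

    def __init__(self):
        self.count = 0

    def log(self, item):
        self.count += 1
        return self.count


@ray.remote
def worker(queue):
    # Drain the queue; each item carries a payload plus the shared actor handle.
    while True:
        try:
            payload, logger = queue.get(block=False)
        except Empty:
            break
        ray.get(logger.log.remote(payload))


logger = SharedLogger.remote()
queue = Queue()
for i in range(4):
    queue.put((i, logger))
ray.get([worker.remote(queue) for _ in range(2)])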
67 changes: 67 additions & 0 deletions tools/AutoTuner/src/autotuner/tensorboard_logger.py
@@ -0,0 +1,67 @@
import logging
import os
from typing import Any, Union

import ray
from tensorboardX import SummaryWriter

from autotuner.utils import ERROR_METRIC

logger = logging.getLogger(__name__)


@ray.remote
class TensorBoardLogger:
"""TensorBoard logger for AutoTuner experiments"""

def __init__(self, log_dir: str):
os.makedirs(log_dir, exist_ok=True)
self.writer = SummaryWriter(log_dir=log_dir)
self.log_dir = log_dir
self.step = 0
logger.info(f"TensorBoard logs will be written to {log_dir}")

def log_sweep_metrics(
self,
params: dict[str, Any],
metrics: dict[str, Any],
score: float,
effective_clk_period: Union[float, str],
num_drc: Union[int, str],
die_area: Union[float, str],
) -> None:
"""Log metrics from a single sweep run"""
self.writer.add_scalar("sweep/score", score, self.step)

if isinstance(effective_clk_period, (int, float)):
self.writer.add_scalar(
"sweep/effective_clk_period", effective_clk_period, self.step
)

if isinstance(num_drc, (int, float)):
self.writer.add_scalar("sweep/num_drc", num_drc, self.step)

if isinstance(die_area, (int, float)):
self.writer.add_scalar("sweep/die_area", die_area, self.step)

for key, value in metrics.items():
if isinstance(value, (int, float)):
self.writer.add_scalar(f"metrics/{key}", value, self.step)

self.writer.add_hparams(
{
k: v if isinstance(v, (int, float, str, bool)) else str(v)
for k, v in params.items()
},
{"hparam/metric": score},
)

self.step += 1

def close(self) -> None:
"""Close the TensorBoard writer and log completion message"""
self.writer.close()
logger.info(
f"Sweep complete. View results with: tensorboard --logdir={self.log_dir}"
)
logger.info(f"Total runs logged: {self.step}")
91 changes: 78 additions & 13 deletions tools/AutoTuner/src/autotuner/utils.py
@@ -69,6 +69,24 @@
# Name of the TCL script run before routing
FASTROUTE_TCL = "fastroute.tcl"
DATE = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
# The worst of optimized metric
ERROR_METRIC = 9e99


def calculate_score(metrics, step=1):
"""Calculate optimization score from metrics."""
error = "ERR" in metrics.values()
not_found = "N/A" in metrics.values()

if error or not_found:
return (ERROR_METRIC, "-", "-", "-")

effective_clk_period = metrics["clk_period"] - metrics["worst_slack"]
num_drc = metrics["num_drc"]
gamma = effective_clk_period / 10
score = effective_clk_period * (100 / step) + gamma * num_drc

return (score, effective_clk_period, num_drc, metrics["die_area"])
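A quick worked example of the calculate_score formula above, with made-up metric values (step=1 matches how the sweep consumer calls it):

# Hypothetical metrics, shaped like the dict returned by read_metrics().
metrics = {
    "clk_period": 5.0,
    "worst_slack": -0.3,
    "num_drc": 2,
    "die_area": 1200.0,
}
score, eff_clk, num_drc, die_area = calculate_score(metrics, step=1)
# effective_clk_period = 5.0 - (-0.3) = 5.3
# gamma = 5.3 / 10 = 0.53
# score = 5.3 * (100 / 1) + 0.53 * 2 = 531.06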


def write_sdc(variables, path, sdc_original, constraints_sdc):
@@ -287,6 +305,21 @@ def run_command(
raise RuntimeError


def calculate_trial_path(args, base_dir, flow_variant):
"""
Calculate the log path and flow variant
"""
flow_variant_with_experiment = f"{args.experiment}/{flow_variant}"
log_path = os.path.abspath(
os.path.join(
base_dir,
f"flow/logs/{args.platform}/{args.design}",
flow_variant_with_experiment,
)
)
return log_path, flow_variant_with_experiment
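For illustration, here is what calculate_trial_path returns for a hypothetical set of arguments (only the fields the function reads are filled in; the values are made up):

from types import SimpleNamespace

args = SimpleNamespace(experiment="my-sweep", platform="asap7", design="gcd")
log_path, variant = calculate_trial_path(
    args=args,
    base_dir="/work/OpenROAD-flow-scripts",
    flow_variant="abc123-CLOCK_PERIOD_5.0",
)
# variant  -> "my-sweep/abc123-CLOCK_PERIOD_5.0"
# log_path -> "/work/OpenROAD-flow-scripts/flow/logs/asap7/gcd/my-sweep/abc123-CLOCK_PERIOD_5.0"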


def openroad(
args,
base_dir,
@@ -297,10 +330,8 @@
"""
Run OpenROAD-flow-scripts with a given set of parameters.
"""
# Make sure path ends in a slash, i.e., is a folder
flow_variant = f"{args.experiment}/{flow_variant}"
log_path = os.path.abspath(
os.path.join(base_dir, f"flow/logs/{args.platform}/{args.design}", flow_variant)
log_path, flow_variant = calculate_trial_path(
args=args, base_dir=base_dir, flow_variant=flow_variant
)
report_path = os.path.abspath(
os.path.join(
@@ -643,6 +674,20 @@ def openroad_distributed(
variant=None,
):
"""Simple wrapper to run openroad distributed with Ray."""
if variant is None:
variant_parts = []
for key, value in config.items():
if key not in ["_SDC_FILE_PATH", "_FR_FILE_PATH"]:
variant_parts.append(f"{key}_{value}")
variant = "_".join(variant_parts) if variant_parts else ""
flow_variant = f"{uuid.uuid4()}-{variant}" if variant else f"{uuid.uuid4()}"

trial_path, _ = calculate_trial_path(
args=args, base_dir=repo_dir, flow_variant=flow_variant
)

os.makedirs(trial_path, exist_ok=True)

config = parse_config(
config=config,
base_dir=repo_dir,
@@ -651,15 +696,15 @@
constraints_sdc=CONSTRAINTS_SDC,
fr_original=fr_original,
fastroute_tcl=FASTROUTE_TCL,
path=trial_path,
)
if variant is None:
variant = config.replace(" ", "_").replace("=", "_")

t = time.time()
metric_file = openroad(
args=args,
base_dir=repo_dir,
parameters=config,
flow_variant=f"{uuid.uuid4()}-{variant}" if variant else f"{uuid.uuid4()}",
flow_variant=flow_variant,
install_path=install_path,
)
duration = time.time() - t
@@ -669,9 +714,29 @@
@ray.remote
def consumer(queue):
"""consumer"""
while not queue.empty():
next_item = queue.get()
name = next_item[1]
print(f"[INFO TUN-0007] Scheduling run for parameter {name}.")
ray.get(openroad_distributed.remote(*next_item))
print(f"[INFO TUN-0008] Finished run for parameter {name}.")
item = queue.get()
tb_logger = item[6]

while item:
args, repo_dir, config, sdc, fr, install, tb_logger = item
print(f"[INFO TUN-0007] Scheduling run for parameter {config}.")
metric_file, _ = ray.get(
openroad_distributed.remote(args, repo_dir, config, sdc, fr, install)
)
print(f"[INFO TUN-0008] Finished run for parameter {config}.")

metrics = read_metrics(metric_file, args.stop_stage)
score, effective_clk_period, num_drc, die_area = calculate_score(metrics)

ray.get(
tb_logger.log_sweep_metrics.remote(
params=config,
metrics=metrics,
score=score,
effective_clk_period=effective_clk_period,
num_drc=num_drc,
die_area=die_area,
)
)

item = queue.get() if not queue.empty() else None
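
Once a sweep has finished, the scalars written by the logger can be checked programmatically as well as in the TensorBoard UI. A hedged sketch using TensorBoard's event reader; the log path is hypothetical and this assumes the tensorboard package is available alongside tensorboardX:

from tensorboard.backend.event_processing import event_accumulator

# Point at the same directory that was passed to TensorBoardLogger.
ea = event_accumulator.EventAccumulator("logs/asap7/gcd/test-sweep")
ea.Reload()
for event in ea.Scalars("sweep/score"):
    print(event.step, event.value)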