|
25 | 25 | Union,
|
26 | 26 | )
|
27 | 27 |
|
| 28 | +import pandas as pd |
28 | 29 | from torch import nn
|
29 | 30 |
|
30 | 31 | from torchrec.distributed.embedding_types import EmbeddingComputeKernel
|
|
36 | 37 | InferenceStorageReservation,
|
37 | 38 | )
|
38 | 39 | from torchrec.distributed.planner.types import (
|
| 40 | + CriticalPathEstimate, |
39 | 41 | ParameterConstraints,
|
40 | 42 | Perf,
|
41 | 43 | ShardingOption,
|
@@ -319,7 +321,7 @@ def log(
|
319 | 321 | )
|
320 | 322 |
|
321 | 323 | # Max perf and HBM to help root cause imbalance
|
322 |
| - self._log_max_perf_and_max_hbm(perf, used_hbm) |
| 324 | + self._log_max_perf_and_max_hbm(perf, used_hbm, best_plan) |
323 | 325 | self._log_storage_reservation_stats(
|
324 | 326 | storage_reservation,
|
325 | 327 | topology,
|
@@ -445,7 +447,9 @@ def _log_plan_imbalance_stats(
|
445 | 447 | f"# {'Imbalance stats range 0-1, higher means more imbalanced' : <{self._width-3}}#"
|
446 | 448 | )
|
447 | 449 |
|
448 |
| - def _log_max_perf_and_max_hbm(self, perfs: List[Perf], used_hbm: List[int]) -> None: |
| 450 | + def _log_max_perf_and_max_hbm( |
| 451 | + self, perfs: List[Perf], used_hbm: List[int], best_plan: List[ShardingOption] |
| 452 | + ) -> None: |
449 | 453 | total_perfs = [perf.total for perf in perfs]
|
450 | 454 |
|
451 | 455 | max_total_perf_text = f"Longest Critical Path (Maximum of Total Perf): {_generate_max_text(total_perfs)}"
|
@@ -480,6 +484,8 @@ def _log_max_perf_and_max_hbm(self, perfs: List[Perf], used_hbm: List[int]) -> N
|
480 | 484 | )
|
481 | 485 | sum_of_maxima_text = f"Sum of Maxima: {round(sum_of_maxima, 3)} ms"
|
482 | 486 |
|
| 487 | + critical_path_estimate = _calculate_critical_path(best_plan) |
| 488 | + |
483 | 489 | self._stats_table.append(f"#{'' : ^{self._width-2}}#")
|
484 | 490 | self._stats_table.append(f"# {max_total_perf_text : <{self._width-3}}#")
|
485 | 491 | self._stats_table.append(f"# {mean_total_perf_text : <{self._width-3}}#")
|
@@ -512,6 +518,15 @@ def _log_max_perf_and_max_hbm(self, perfs: List[Perf], used_hbm: List[int]) -> N
|
512 | 518 | self._stats_table.append(
|
513 | 519 | f"# {'High Median HBM: '+_generate_rank_hbm_stats(used_hbm, statistics.median_high) : <{self._width-3}}#"
|
514 | 520 | )
|
| 521 | + self._stats_table.append( |
| 522 | + f"# {'Critical Path (comms): '+str(round(critical_path_estimate.comms_estimate, 3)) : <{self._width-3}}#" |
| 523 | + ) |
| 524 | + self._stats_table.append( |
| 525 | + f"# {'Critical Path (compute): '+str(round(critical_path_estimate.comp_estimate, 3)) : <{self._width-3}}#" |
| 526 | + ) |
| 527 | + self._stats_table.append( |
| 528 | + f"# {'Critical Path (comms + compute): '+str(round(critical_path_estimate.comp_estimate, 3)) : <{self._width-3}}#" |
| 529 | + ) |
515 | 530 |
|
516 | 531 | max_used_hbm = max(used_hbm)
|
517 | 532 | mean_used_hbm = statistics.mean(used_hbm)
|
@@ -1052,6 +1067,76 @@ def _reduce_int_list(input_list: List[int]) -> str:
|
1052 | 1067 | return ", ".join(reduced)
|
1053 | 1068 |
|
1054 | 1069 |
|
def _calculate_critical_path(best_plan: List[ShardingOption]) -> CriticalPathEstimate:
    """
    Calculates the critical path of the sharding plan. Makes the following assumptions:

    1. There is a synchronization point across the ranks after each of the 4 events: Fwd/Bwd x Comms/Comp.
    2. There are additional synchronization points during communication (both fwd & bwd) for each module <> sharding type combination.
        i. Communication operations for each shard from the same module <> sharding type group are executed sequentially.
        ii. Ranks need to synchronize before they can begin the communication operation for the next module <> sharding type group.
    3. There are additional synchronization points during computation (both fwd & bwd) at the rank level.
        i. Computation operations for each shard from the same module are executed sequentially.
        ii. Ranks need to synchronize before they can begin the next set of events.

    Args:
        best_plan (List[ShardingOption]): the chosen sharding option per module,
            each carrying per-shard rank and Perf estimates.

    Returns:
        CriticalPathEstimate: (comms_estimate, comp_estimate) in the same time
            units as the underlying Perf fields.
    """

    # Accumulated perf keyed by (rank, module, sharding_type, direction, perf_type),
    # where direction is "fwd"/"bwd"/"prefetch" and perf_type is "compute"/"comms".
    perf_data = defaultdict(float)
    for so in best_plan:
        module = so.module
        sharding_type = so.sharding_type
        # BUG FIX: sort (rank, perf) pairs *together*. The previous code sorted
        # the rank list independently of the perf list and then zipped them,
        # which attributes perfs to the wrong ranks whenever so.shards is not
        # already ordered by rank.
        rank_perfs = sorted(
            ((cast(int, shard.rank), cast(Perf, shard.perf)) for shard in so.shards),
            key=lambda rank_perf: rank_perf[0],
        )
        for rank, perf in rank_perfs:
            # "prefetch_compute" splits into direction="prefetch", type="compute"
            # and thus contributes to the compute estimate below.
            for perf_type, value in (
                ("fwd_compute", perf.fwd_compute),
                ("fwd_comms", perf.fwd_comms),
                ("bwd_compute", perf.bwd_compute),
                ("bwd_comms", perf.bwd_comms),
                ("prefetch_compute", perf.prefetch_compute),
            ):
                direction, kind = perf_type.split("_")
                perf_data[(rank, module, sharding_type, direction, kind)] += value

    perf_df = pd.DataFrame.from_dict(perf_data, orient="index", columns=["perf"])
    perf_df.index = pd.MultiIndex.from_tuples(
        perf_df.index,
        names=["rank", "module", "sharding_type", "direction", "perf_type"],
    )

    # Comms: per (module, sharding_type, direction) group, ranks synchronize,
    # so the group's cost is the max over ranks; groups run sequentially (sum).
    comms_estimate = (
        perf_df.xs("comms", level="perf_type")
        .groupby(["rank", "module", "sharding_type", "direction"])
        .sum()
        .groupby(["module", "sharding_type", "direction"])
        .max()
        .sum()
        .item()
    )

    # Compute: each rank runs its shards sequentially (sum per rank), ranks
    # synchronize per direction (max over ranks), directions run sequentially.
    comp_estimate = (
        perf_df.xs("compute", level="perf_type")
        .groupby(["rank", "direction"])
        .sum()
        .groupby(["direction"])
        .max()
        .sum()
        .item()
    )

    return CriticalPathEstimate(comms_estimate, comp_estimate)
| 1139 | + |
1055 | 1140 | class NoopEmbeddingStats(Stats):
|
1056 | 1141 | """
|
1057 | 1142 | Noop Stats for a sharding planner execution.
|
|
0 commit comments