From 7be8bc1262080d359dd97be976d971a04139c9f4 Mon Sep 17 00:00:00 2001
From: Nadia Yvette Chambers <nadia.chambers@iohk.io>
Date: Mon, 5 Aug 2024 17:42:11 +0000
Subject: [PATCH] span-calculation: comment 85% CPU span calculation

Anomalies were seen with 85% CPU span calculations. This is at least
trying to figure out what's going on and explain it to everyone else.
---
 bench/locli/src/Cardano/Analysis/API/Chain.hs |  3 ++
 bench/locli/src/Cardano/Analysis/MachPerf.hs  | 47 +++++++++++++++++++
 bench/locli/src/Cardano/Command.hs            | 19 ++++++++
 bench/locli/src/Cardano/Render.hs             |  3 ++
 bench/locli/src/Cardano/Util.hs               |  5 ++
 5 files changed, 77 insertions(+)

diff --git a/bench/locli/src/Cardano/Analysis/API/Chain.hs b/bench/locli/src/Cardano/Analysis/API/Chain.hs
index e15de28c001..1d372270b29 100644
--- a/bench/locli/src/Cardano/Analysis/API/Chain.hs
+++ b/bench/locli/src/Cardano/Analysis/API/Chain.hs
@@ -40,6 +40,9 @@ slotStart Genesis{..} =
   . fromIntegral
   . unSlotNo
 
+-- | `impliedSlot` calculates slot numbers from timestamps and genesis
+--   parameters. The fields `systemStart` time and `slotLength` are
+--   both fields of the `Genesis` record.
 impliedSlot :: Genesis -> UTCTime -> SlotNo
 impliedSlot Genesis{..} =
   SlotNo
diff --git a/bench/locli/src/Cardano/Analysis/MachPerf.hs b/bench/locli/src/Cardano/Analysis/MachPerf.hs
index 4ee40096443..69de6c452e4 100644
--- a/bench/locli/src/Cardano/Analysis/MachPerf.hs
+++ b/bench/locli/src/Cardano/Analysis/MachPerf.hs
@@ -33,11 +33,24 @@ import Cardano.Unlog.Resources
 
 -- * 1. Collect SlotStats & RunScalars:
 --
+-- | `collectSlotStats` processes logfiles from different cluster
+--   nodes in parallel. The right-hand argument of `(<$>)` is a
+--   partially-applied functipn, so it's really just `(.)`, but then
+--   `sequence` being composed with the function within the map has
+--   some complications from the forcing etc. from `deepseq` and the
+--   `evaluate` IO primitive.
 collectSlotStats :: Run -> [(JsonLogfile, [LogObject])]
                  -> IO (Either Text [(JsonLogfile, (RunScalars, [SlotStats UTCTime]))])
 collectSlotStats run = fmap sequence <$> mapConcurrentlyPure (timelineFromLogObjects run)
 
 
+-- | `timelineFromLogObjects` processes a list of `LogObject` to
+--   assemble the log entries into time slots. Per-slot data structures
+--   absorb the log entries describing the events ongoing during the
+--   time period they're intended to track over the course of the fold.
+--   The per-slot structures are also instantiated as the time periods
+--   in need of having events tracked within them are encountered in
+--   the list of log entries.
 timelineFromLogObjects :: Run -> (JsonLogfile, [LogObject])
                        -> Either Text (JsonLogfile, (RunScalars, [SlotStats UTCTime]))
 timelineFromLogObjects _ (JsonLogfile f, []) =
@@ -54,6 +67,8 @@ timelineFromLogObjects run@Run{genesis} (f, xs') =
    firstLogObjectHost :: Host
    firstLogObjectHost = loHost (head xs)
 
+   -- | `zeroTimelineAccum` represents the initial state of the
+   --   timeline accumulator to be fed to the fold.
    zeroTimelineAccum :: TimelineAccum
    zeroTimelineAccum =
      TimelineAccum
@@ -107,6 +122,9 @@ timelineFromLogObjects run@Run{genesis} (f, xs') =
      , slLogObjects  = []
      }
 
+-- | `timelineStep` processes a single `LogObject` to incorporate it
+--   into the `TimelineAccum` accumulator. A more detailed description
+--   of the case analysis inside it could be worthwhile.
 timelineStep :: Run -> JsonLogfile -> TimelineAccum -> LogObject -> TimelineAccum
 timelineStep Run{genesis} f accum@TimelineAccum{aSlotStats=cur:_, ..} lo =
   -- 1. skip pre-historic events not subject to performance analysis;
@@ -357,6 +375,9 @@ lastBlockSlot new TimelineAccum{aSlotStats=SlotStats{..}:_,..} =
   then slSlot
   else aLastBlockSlot
 
+-- | `patchSlotGap` walks through a gap in slot numbers adding slots
+--   to the accumulator. The gap is taken to start after the head of
+--   the accumulator, whose list is reversed.
 patchSlotGap :: Genesis -> SlotNo -> TimelineAccum -> TimelineAccum
 patchSlotGap genesis curSlot a@TimelineAccum{aSlotStats=last:_, ..} =
   a & if gapLen < 1000
@@ -372,9 +393,12 @@ patchSlotGap genesis curSlot a@TimelineAccum{aSlotStats=last:_, ..} =
 
    go :: Word64 -> SlotNo -> TimelineAccum -> TimelineAccum
    go 0      _         acc = acc
+
    go remainingGap patchSlot acc =
      go (remainingGap - 1) (patchSlot + 1) (acc & addGapSlot patchSlot)
 
+   -- | `addGapSlot` constructs a single slot and adds it to the front
+   --   of the accumulator list.
    addGapSlot :: SlotNo -> TimelineAccum -> TimelineAccum
    addGapSlot slot acc =
     let (epoch, epochSlot) = genesis `unsafeParseSlot` slot in
@@ -418,6 +442,10 @@ patchSlotGap genesis curSlot a@TimelineAccum{aSlotStats=last:_, ..} =
         }
     where slStart = slotStart genesis slot
 
+-- | `addTimelineSlot` updates the current position of the
+--   `TimelineAccum` accumulator. `timelineStep` invokes it via its
+--   `continue` internal function, which calls it when the slot is
+--   strictly greater than the `slSlot` if properly deciphered.
 addTimelineSlot :: Genesis -> SlotNo -> UTCTime -> TimelineAccum -> TimelineAccum
 addTimelineSlot genesis slot _time a@TimelineAccum{..} =
   let (epoch, epochSlot) = genesis `unsafeParseSlot` slot in
@@ -556,6 +584,23 @@ slotStatsSummary Run{genesis=Genesis{epochLength}} slots =
           in if tailEpoch == slEpoch (Vec.head v) then v
              else Vec.dropWhile ((tailEpoch == ) . slEpoch) v
 
+   -- | `spanLen` in effect does (slSlot last) - (slSlot head) with
+   --   `unSlotNo` only unpacking it from the `SlotNo` newtype and
+   --   `fromIntegral` converting it from a `Word64` to an `Int`, where
+   --   last and head refer to the vector's elements.
+   --
+   --   In plain words, this represents the number of slots in a half-open
+   --   interval bounded by the slot numbers at the first and last
+   --   positions within the vector of slot numbers. Some thoughts could
+   --   be had about the potential for a discontiguous set of slots
+   --   within the vector and whether a closed interval should be
+   --   preferred over a half-open one. Answers could be turned up over
+   --   the course of this documentation audit.
+   --
+   --   Simplifying the expression within the executable code might help
+   --   with maintainability, but it's best left undisturbed for the sake
+   --   of reducing the audit surface area in the event of regressions,
+   --   barring sufficiently broad consensus.
    spanLen :: Vector (SlotStats a) -> Int
    spanLen = fromIntegral . unSlotNo . uncurry (-) . (slSlot *** slSlot) . (Vec.last &&& Vec.head)
 
@@ -598,6 +643,8 @@ slotStatsMachPerf run (f, slots) =
 
    (,) sFirst sLast = (slSlot . head &&& slSlot . last) slots
 
+   -- | `dist` assembles the sample into a distribution `CDF` according
+   --   to the standard list of centiles `stdCentiles` using `cdfZ`.
    dist :: Divisible a => [a] -> CDF I a
    dist = cdfZ stdCentiles
 
diff --git a/bench/locli/src/Cardano/Command.hs b/bench/locli/src/Cardano/Command.hs
index 41d3738477e..60215eebf94 100644
--- a/bench/locli/src/Cardano/Command.hs
+++ b/bench/locli/src/Cardano/Command.hs
@@ -425,6 +425,9 @@ runChainCommand s@State{sMultiSummary=Just summ@Summary{..}}
 runChainCommand _ c@WriteMetaGenesis{} = missingCommandData c
   ["multi objects"]
 
+-- | Slots are input here, in the `Unlog` command.
+--   `runLiftLogObjects` does the IO and `Aeson` processing with
+--   `readLogObjectStream` as a helper.
 runChainCommand s
   c@(Unlog rlf okDErr loAnyLimit) = do
   progress "logs" (Q $ printf "reading run log manifest %s" $ unJsonInputFile rlf)
@@ -537,6 +540,8 @@ runChainCommand s@State{sRun=Just _run, sChain=Just Chain{..}}
 runChainCommand _ c@TimelineChain{} = missingCommandData c
   ["run metadata & genesis", "chain"]
 
+-- | `CollectSlots` is the step slots make from `sRunLogs` to
+--   `sSlotsRaw`. `deltifySlotStats` is the crucial post-processing after
 runChainCommand s@State{sRun=Just run, sRunLogs=Just (rlLogs -> objs)}
   c@(CollectSlots ignores) = do
   let nonIgnored = flip filter objs $ (`notElem` ignores) . fst
@@ -544,6 +549,17 @@ runChainCommand s@State{sRun=Just run, sRunLogs=Just (rlLogs -> objs)}
     progress "perf-ignored-log" . R . unJsonLogfile
   progress "slots" (Q $ printf "building slot %d timelines" $ length objs)
   (scalars, slotsRaw) <-
+    -- `mapAndUnzip` just maps using a function producing a pair and
+    -- unzips the result. `redistribute` just takes a triple in the
+    -- form of (t1, (t2, t3)) mapping it to ((t1, t2), (t1, t3)),
+    -- repeating the first component to pair it with both halves of the
+    -- second component. `collectSlotStats` just maps coalescing the log
+    -- entries into per- (time) slot structures collecting everything
+    -- described in one of the `LogObject` lists, done in parallel across
+    -- the distinct log files. `deltifySlotStats` then differences the
+    -- timestamps within a slot against the `Genesis` to provide a
+    -- relative slot start time, and then differences other fields
+    -- against each other and that baseline to measure timings.
     fmap (mapAndUnzip redistribute) <$> collectSlotStats run nonIgnored
     & newExceptT
     & firstExceptT (CommandError c)
@@ -560,6 +576,8 @@ runChainCommand s@State{sSlotsRaw=Just slotsRaw}
 runChainCommand _ c@DumpSlotsRaw = missingCommandData c
   ["unfiltered slots"]
 
+-- | `FilterSlots` takes raw slots in `sSlotsRaw` and cleans them up to
+--   be put at long last in `sSlots` for consumption by higher-level code.
 runChainCommand s@State{sRun=Just run, sSlotsRaw=Just slotsRaw}
   c@(FilterSlots fltfs fltExprs) = do
   progress "slots" (Q $ printf "filtering %d slot timelines" $ length slotsRaw)
@@ -591,6 +609,7 @@ runChainCommand s@State{sSlots=Just slots}
 runChainCommand _ c@DumpSlots = missingCommandData c
   ["filtered slots"]
 
+-- | `TimelineSlots` exports the processed slots in renderable form.
 runChainCommand s@State{sRun=Just _run, sSlots=Just slots}
   c@(TimelineSlots rc comments) = do
   progress "mach" (Q $ printf "dumping %d slot timelines: %s" (length slots) (show rc :: String))
diff --git a/bench/locli/src/Cardano/Render.hs b/bench/locli/src/Cardano/Render.hs
index 784df301d7f..db72a83c87e 100644
--- a/bench/locli/src/Cardano/Render.hs
+++ b/bench/locli/src/Cardano/Render.hs
@@ -245,6 +245,9 @@ renderProfilingData rc a flt pd =
      , fDescription = peSrcLoc pe
      }
 
+-- | `renderTimelineWithClass` and `renderTimeline` output textual
+--   representations of timelines for postprocessing with ede, em, and
+--   other such tools for producing benchmark reports.
 renderTimelineWithClass :: forall (a :: Type). TimelineFields a => (Field ISelect I a -> Bool) -> RenderConfig -> Anchor -> [TimelineComments a] -> [a] -> [Text]
 renderTimelineWithClass flt = renderTimeline (filter flt timelineFields) rtCommentary
 
diff --git a/bench/locli/src/Cardano/Util.hs b/bench/locli/src/Cardano/Util.hs
index 7ccbe0b6643..aada3f1cc14 100644
--- a/bench/locli/src/Cardano/Util.hs
+++ b/bench/locli/src/Cardano/Util.hs
@@ -133,6 +133,9 @@ mapSMaybeFB cons f x next = case f x of
   SNothing -> next
   SJust r -> cons r next
 
+-- | `mapConcurrentlyPure` mostly fully forces the list elements
+--   produced in parallel. `evaluate` seems to have mostly to do with
+--   exceptions.
 mapConcurrentlyPure :: NFData b => (a -> b) -> [a] -> IO [b]
 mapConcurrentlyPure f =
   mapConcurrently
@@ -178,6 +181,8 @@ replaceExtension :: FilePath -> String -> FilePath
 replaceExtension f new = F.dropExtension f <> "." <> new
 
 
+-- | `spans` creates a list of `Vector` where each list element is all
+--   of the list elements comprising a span put into a `Vector`.
 spans :: forall a. (a -> Bool) -> [a] -> [Vector a]
 spans f = go []
  where