From 7be8bc1262080d359dd97be976d971a04139c9f4 Mon Sep 17 00:00:00 2001 From: Nadia Yvette Chambers Date: Mon, 5 Aug 2024 17:42:11 +0000 Subject: [PATCH] span-calculation: comment 85% CPU span calculation Anomalies were seen with 85% CPU span calculations. This is at least trying to figure out what's going on and explain it to everyone else. --- bench/locli/src/Cardano/Analysis/API/Chain.hs | 3 ++ bench/locli/src/Cardano/Analysis/MachPerf.hs | 47 +++++++++++++++++++ bench/locli/src/Cardano/Command.hs | 19 ++++++++ bench/locli/src/Cardano/Render.hs | 3 ++ bench/locli/src/Cardano/Util.hs | 5 ++ 5 files changed, 77 insertions(+) diff --git a/bench/locli/src/Cardano/Analysis/API/Chain.hs b/bench/locli/src/Cardano/Analysis/API/Chain.hs index e15de28c001..1d372270b29 100644 --- a/bench/locli/src/Cardano/Analysis/API/Chain.hs +++ b/bench/locli/src/Cardano/Analysis/API/Chain.hs @@ -40,6 +40,9 @@ slotStart Genesis{..} = . fromIntegral . unSlotNo +-- | `impliedSlot` calculates slot numbers from timestamps and genesis +-- parameters. The fields `systemStart` time and `slotLength` are +-- both fields of the `Genesis` record. impliedSlot :: Genesis -> UTCTime -> SlotNo impliedSlot Genesis{..} = SlotNo diff --git a/bench/locli/src/Cardano/Analysis/MachPerf.hs b/bench/locli/src/Cardano/Analysis/MachPerf.hs index 4ee40096443..69de6c452e4 100644 --- a/bench/locli/src/Cardano/Analysis/MachPerf.hs +++ b/bench/locli/src/Cardano/Analysis/MachPerf.hs @@ -33,11 +33,24 @@ import Cardano.Unlog.Resources -- * 1. Collect SlotStats & RunScalars: -- +-- | `collectSlotStats` processes logfiles from different cluster +-- nodes in parallel. The right-hand argument of `(<$>)` is a +-- partially-applied functipn, so it's really just `(.)`, but then +-- `sequence` being composed with the function within the map has +-- some complications from the forcing etc. from `deepseq` and the +-- `evaluate` IO primitive. collectSlotStats :: Run -> [(JsonLogfile, [LogObject])] -> IO (Either Text [(JsonLogfile, (RunScalars, [SlotStats UTCTime]))]) collectSlotStats run = fmap sequence <$> mapConcurrentlyPure (timelineFromLogObjects run) +-- | `timelineFromLogObjects` processes a list of `LogObject` to +-- assemble the log entries into time slots. Per-slot data structures +-- absorb the log entries describing the events ongoing during the +-- time period they're intended to track over the course of the fold. +-- The per-slot structures are also instantiated as the time periods +-- in need of having events tracked within them are encountered in +-- the list of log entries. timelineFromLogObjects :: Run -> (JsonLogfile, [LogObject]) -> Either Text (JsonLogfile, (RunScalars, [SlotStats UTCTime])) timelineFromLogObjects _ (JsonLogfile f, []) = @@ -54,6 +67,8 @@ timelineFromLogObjects run@Run{genesis} (f, xs') = firstLogObjectHost :: Host firstLogObjectHost = loHost (head xs) + -- | `zeroTimelineAccum` represents the initial state of the + -- timeline accumulator to be fed to the fold. zeroTimelineAccum :: TimelineAccum zeroTimelineAccum = TimelineAccum @@ -107,6 +122,9 @@ timelineFromLogObjects run@Run{genesis} (f, xs') = , slLogObjects = [] } +-- | `timelineStep` processes a single `LogObject` to incorporate it +-- into the `TimelineAccum` accumulator. A more detailed description +-- of the case analysis inside it could be worthwhile. timelineStep :: Run -> JsonLogfile -> TimelineAccum -> LogObject -> TimelineAccum timelineStep Run{genesis} f accum@TimelineAccum{aSlotStats=cur:_, ..} lo = -- 1. skip pre-historic events not subject to performance analysis; @@ -357,6 +375,9 @@ lastBlockSlot new TimelineAccum{aSlotStats=SlotStats{..}:_,..} = then slSlot else aLastBlockSlot +-- | `patchSlotGap` walks through a gap in slot numbers adding slots +-- to the accumulator. The gap is taken to start after the head of +-- the accumulator, whose list is reversed. patchSlotGap :: Genesis -> SlotNo -> TimelineAccum -> TimelineAccum patchSlotGap genesis curSlot a@TimelineAccum{aSlotStats=last:_, ..} = a & if gapLen < 1000 @@ -372,9 +393,12 @@ patchSlotGap genesis curSlot a@TimelineAccum{aSlotStats=last:_, ..} = go :: Word64 -> SlotNo -> TimelineAccum -> TimelineAccum go 0 _ acc = acc + go remainingGap patchSlot acc = go (remainingGap - 1) (patchSlot + 1) (acc & addGapSlot patchSlot) + -- | `addGapSlot` constructs a single slot and adds it to the front + -- of the accumulator list. addGapSlot :: SlotNo -> TimelineAccum -> TimelineAccum addGapSlot slot acc = let (epoch, epochSlot) = genesis `unsafeParseSlot` slot in @@ -418,6 +442,10 @@ patchSlotGap genesis curSlot a@TimelineAccum{aSlotStats=last:_, ..} = } where slStart = slotStart genesis slot +-- | `addTimelineSlot` updates the current position of the +-- `TimelineAccum` accumulator. `timelineStep` invokes it via its +-- `continue` internal function, which calls it when the slot is +-- strictly greater than the `slSlot` if properly deciphered. addTimelineSlot :: Genesis -> SlotNo -> UTCTime -> TimelineAccum -> TimelineAccum addTimelineSlot genesis slot _time a@TimelineAccum{..} = let (epoch, epochSlot) = genesis `unsafeParseSlot` slot in @@ -556,6 +584,23 @@ slotStatsSummary Run{genesis=Genesis{epochLength}} slots = in if tailEpoch == slEpoch (Vec.head v) then v else Vec.dropWhile ((tailEpoch == ) . slEpoch) v + -- | `spanLen` in effect does (slSlot last) - (slSlot head) with + -- `unSlotNo` only unpacking it from the `SlotNo` newtype and + -- `fromIntegral` converting it from a `Word64` to an `Int`, where + -- last and head refer to the vector's elements. + -- + -- In plain words, this represents the number of slots in a half-open + -- interval bounded by the slot numbers at the first and last + -- positions within the vector of slot numbers. Some thoughts could + -- be had about the potential for a discontiguous set of slots + -- within the vector and whether a closed interval should be + -- preferred over a half-open one. Answers could be turned up over + -- the course of this documentation audit. + -- + -- Simplifying the expression within the executable code might help + -- with maintainability, but it's best left undisturbed for the sake + -- of reducing the audit surface area in the event of regressions, + -- barring sufficiently broad consensus. spanLen :: Vector (SlotStats a) -> Int spanLen = fromIntegral . unSlotNo . uncurry (-) . (slSlot *** slSlot) . (Vec.last &&& Vec.head) @@ -598,6 +643,8 @@ slotStatsMachPerf run (f, slots) = (,) sFirst sLast = (slSlot . head &&& slSlot . last) slots + -- | `dist` assembles the sample into a distribution `CDF` according + -- to the standard list of centiles `stdCentiles` using `cdfZ`. dist :: Divisible a => [a] -> CDF I a dist = cdfZ stdCentiles diff --git a/bench/locli/src/Cardano/Command.hs b/bench/locli/src/Cardano/Command.hs index 41d3738477e..60215eebf94 100644 --- a/bench/locli/src/Cardano/Command.hs +++ b/bench/locli/src/Cardano/Command.hs @@ -425,6 +425,9 @@ runChainCommand s@State{sMultiSummary=Just summ@Summary{..}} runChainCommand _ c@WriteMetaGenesis{} = missingCommandData c ["multi objects"] +-- | Slots are input here, in the `Unlog` command. +-- `runLiftLogObjects` does the IO and `Aeson` processing with +-- `readLogObjectStream` as a helper. runChainCommand s c@(Unlog rlf okDErr loAnyLimit) = do progress "logs" (Q $ printf "reading run log manifest %s" $ unJsonInputFile rlf) @@ -537,6 +540,8 @@ runChainCommand s@State{sRun=Just _run, sChain=Just Chain{..}} runChainCommand _ c@TimelineChain{} = missingCommandData c ["run metadata & genesis", "chain"] +-- | `CollectSlots` is the step slots make from `sRunLogs` to +-- `sSlotsRaw`. `deltifySlotStats` is the crucial post-processing after runChainCommand s@State{sRun=Just run, sRunLogs=Just (rlLogs -> objs)} c@(CollectSlots ignores) = do let nonIgnored = flip filter objs $ (`notElem` ignores) . fst @@ -544,6 +549,17 @@ runChainCommand s@State{sRun=Just run, sRunLogs=Just (rlLogs -> objs)} progress "perf-ignored-log" . R . unJsonLogfile progress "slots" (Q $ printf "building slot %d timelines" $ length objs) (scalars, slotsRaw) <- + -- `mapAndUnzip` just maps using a function producing a pair and + -- unzips the result. `redistribute` just takes a triple in the + -- form of (t1, (t2, t3)) mapping it to ((t1, t2), (t1, t3)), + -- repeating the first component to pair it with both halves of the + -- second component. `collectSlotStats` just maps coalescing the log + -- entries into per- (time) slot structures collecting everything + -- described in one of the `LogObject` lists, done in parallel across + -- the distinct log files. `deltifySlotStats` then differences the + -- timestamps within a slot against the `Genesis` to provide a + -- relative slot start time, and then differences other fields + -- against each other and that baseline to measure timings. fmap (mapAndUnzip redistribute) <$> collectSlotStats run nonIgnored & newExceptT & firstExceptT (CommandError c) @@ -560,6 +576,8 @@ runChainCommand s@State{sSlotsRaw=Just slotsRaw} runChainCommand _ c@DumpSlotsRaw = missingCommandData c ["unfiltered slots"] +-- | `FilterSlots` takes raw slots in `sSlotsRaw` and cleans them up to +-- be put at long last in `sSlots` for consumption by higher-level code. runChainCommand s@State{sRun=Just run, sSlotsRaw=Just slotsRaw} c@(FilterSlots fltfs fltExprs) = do progress "slots" (Q $ printf "filtering %d slot timelines" $ length slotsRaw) @@ -591,6 +609,7 @@ runChainCommand s@State{sSlots=Just slots} runChainCommand _ c@DumpSlots = missingCommandData c ["filtered slots"] +-- | `TimelineSlots` exports the processed slots in renderable form. runChainCommand s@State{sRun=Just _run, sSlots=Just slots} c@(TimelineSlots rc comments) = do progress "mach" (Q $ printf "dumping %d slot timelines: %s" (length slots) (show rc :: String)) diff --git a/bench/locli/src/Cardano/Render.hs b/bench/locli/src/Cardano/Render.hs index 784df301d7f..db72a83c87e 100644 --- a/bench/locli/src/Cardano/Render.hs +++ b/bench/locli/src/Cardano/Render.hs @@ -245,6 +245,9 @@ renderProfilingData rc a flt pd = , fDescription = peSrcLoc pe } +-- | `renderTimelineWithClass` and `renderTimeline` output textual +-- representations of timelines for postprocessing with ede, em, and +-- other such tools for producing benchmark reports. renderTimelineWithClass :: forall (a :: Type). TimelineFields a => (Field ISelect I a -> Bool) -> RenderConfig -> Anchor -> [TimelineComments a] -> [a] -> [Text] renderTimelineWithClass flt = renderTimeline (filter flt timelineFields) rtCommentary diff --git a/bench/locli/src/Cardano/Util.hs b/bench/locli/src/Cardano/Util.hs index 7ccbe0b6643..aada3f1cc14 100644 --- a/bench/locli/src/Cardano/Util.hs +++ b/bench/locli/src/Cardano/Util.hs @@ -133,6 +133,9 @@ mapSMaybeFB cons f x next = case f x of SNothing -> next SJust r -> cons r next +-- | `mapConcurrentlyPure` mostly fully forces the list elements +-- produced in parallel. `evaluate` seems to have mostly to do with +-- exceptions. mapConcurrentlyPure :: NFData b => (a -> b) -> [a] -> IO [b] mapConcurrentlyPure f = mapConcurrently @@ -178,6 +181,8 @@ replaceExtension :: FilePath -> String -> FilePath replaceExtension f new = F.dropExtension f <> "." <> new +-- | `spans` creates a list of `Vector` where each list element is all +-- of the list elements comprising a span put into a `Vector`. spans :: forall a. (a -> Bool) -> [a] -> [Vector a] spans f = go [] where