Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add New Metrics to Tracing #5991

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cardano-node/src/Cardano/Node/Tracing/Tracers.hs
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,8 @@ mkConsensusTracers configReflection trBase trForward mbTrEKG _trDataPoint trConf
["BlockFetch", "Server"]
configureTracers configReflection trConfig [blockFetchServerTr]

!servedBlockLatestTr <- servedBlockLatest mbTrEKG

!forgeKESInfoTr <- mkCardanoTracer
trBase trForward mbTrEKG
["Forge", "StateInfo"]
Expand Down Expand Up @@ -350,6 +352,7 @@ mkConsensusTracers configReflection trBase trForward mbTrEKG _trDataPoint trConf
<> traceWith blockFetchClientMetricsTr
, Consensus.blockFetchServerTracer = Tracer $
traceWith blockFetchServerTr
<> traceWith servedBlockLatestTr
, Consensus.forgeStateInfoTracer = Tracer $
traceWith (traceAsKESInfo (Proxy @blk) forgeKESInfoTr)
, Consensus.gddTracer = Tracer $
Expand Down
215 changes: 136 additions & 79 deletions cardano-node/src/Cardano/Node/Tracing/Tracers/Consensus.hs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ module Cardano.Node.Tracing.Tracers.Consensus
, forgeTracerTransform
, initialClientMetrics
, calculateBlockFetchClientMetrics
, servedBlockLatest
, ClientMetrics
) where

Expand Down Expand Up @@ -82,6 +83,7 @@ import qualified Data.Text as Text
import Data.Time (DiffTime, NominalDiffTime)
import Data.Word (Word32, Word64)


instance (LogFormatting adr, Show adr) => LogFormatting (ConnectionId adr) where
forMachine _dtal (ConnectionId local' remote) =
mconcat [ "connectionId" .= String (showT local'
Expand Down Expand Up @@ -515,44 +517,49 @@ incCdf v cdf =
then cdf {counter = counter cdf + 1}
else cdf


data ClientMetrics = ClientMetrics {
cmSlotMap :: IntPSQ Word64 NominalDiffTime
, cmCdf1sVar :: CdfCounter
, cmCdf3sVar :: CdfCounter
, cmCdf5sVar :: CdfCounter
, cmDelay :: Double
cmSlotMap :: IntPSQ Word64 NominalDiffTime
, cmCdf1sVar :: CdfCounter
, cmCdf3sVar :: CdfCounter
, cmCdf5sVar :: CdfCounter
, cmDelay :: Double
, cmBlockSize :: Word32
, cmTraceIt :: Bool
, cmTraceIt :: Bool
, cmTraceVars :: Bool
}

instance LogFormatting ClientMetrics where
forMachine _dtal _ = mempty
asMetrics ClientMetrics {..} =
if cmTraceIt
then
let size = Pq.size cmSlotMap
msgs =
[ DoubleM "blockfetchclient.blockdelay"
cmDelay
, IntM "blockfetchclient.blocksize"
(fromIntegral cmBlockSize)
, DoubleM "blockfetchclient.blockdelay.cdfOne"
(fromIntegral (counter cmCdf1sVar) / fromIntegral size)
, DoubleM "blockfetchclient.blockdelay.cdfThree"
(fromIntegral (counter cmCdf3sVar) / fromIntegral size)
, DoubleM "blockfetchclient.blockdelay.cdfFive"
(fromIntegral (counter cmCdf5sVar) / fromIntegral size)
]
in if cmDelay > 5
then
CounterM "blockfetchclient.lateblocks" Nothing
: msgs
else msgs
else []
let size = Pq.size cmSlotMap
msgs =
[ DoubleM "blockfetchclient.blockdelay" cmDelay
, IntM "blockfetchclient.blocksize" (fromIntegral cmBlockSize)
]
<> if cmTraceVars
then cdfMetric "blockfetchclient.blockdelay.cdfOne" cmCdf1sVar
<> cdfMetric "blockfetchclient.blockdelay.cdfThree" cmCdf3sVar
<> cdfMetric "blockfetchclient.blockdelay.cdfFive" cmCdf5sVar
<> lateBlockMetric cmDelay
else []
where
cdfMetric name var =
[ DoubleM name (fromIntegral (counter var) / fromIntegral size)
]

lateBlockMetric delay =
[ CounterM "blockfetchclient.lateblocks" Nothing
| delay > 5
]
in msgs
else []

instance MetaTrace ClientMetrics where
namespaceFor _ = Namespace [] ["ClientMetrics"]
severityFor _ _ = Just Info
severityFor _ _ = Nothing
documentFor _ = Just ""

metricsDocFor (Namespace _ ["ClientMetrics"]) =
Expand All @@ -579,68 +586,74 @@ initialClientMetrics =
0
0
False
False

calculateBlockFetchClientMetrics ::
ClientMetrics
-> LoggingContext
-> BlockFetch.TraceLabelPeer peer (BlockFetch.TraceFetchClientState header)
-> IO ClientMetrics
calculateBlockFetchClientMetrics cm@ClientMetrics {..} _lc
(TraceLabelPeer _ (BlockFetch.CompletedBlockFetch p _ _ _ forgeDelay blockSize)) =
case pointSlot p of
Origin -> pure cm {cmTraceIt = False} -- Nothing to do.
At (SlotNo slotNo) -> do
if Pq.null cmSlotMap && forgeDelay > 20
then pure cm {cmTraceIt = False} -- During startup wait until we are in sync
else case Pq.lookup (fromIntegral slotNo) cmSlotMap of
Just _ -> pure cm {cmTraceIt = False} -- dupe, we only track the first
Nothing -> do
let slotMap' = Pq.insert (fromIntegral slotNo) slotNo forgeDelay cmSlotMap
if Pq.size slotMap' > 1080 -- TODO k/2, should come from config file
then case Pq.minView slotMap' of
Nothing -> pure cm {cmTraceIt = False} -- Err. We just inserted an element!
Just (_, minSlotNo, minDelay, slotMap'') ->
if minSlotNo == slotNo
then pure cm {cmTraceIt = False, cmSlotMap = slotMap'}
else let
cdf1sVar = decCdf minDelay cmCdf1sVar
cdf3sVar = decCdf minDelay cmCdf3sVar
cdf5sVar = decCdf minDelay cmCdf5sVar
cdf1sVar' = incCdf forgeDelay cdf1sVar
cdf3sVar' = incCdf forgeDelay cdf3sVar
cdf5sVar' = incCdf forgeDelay cdf5sVar
in pure cm {
cmCdf1sVar = cdf1sVar'
, cmCdf3sVar = cdf3sVar'
, cmCdf5sVar = cdf5sVar'
, cmDelay = realToFrac forgeDelay
, cmBlockSize = getSizeInBytes blockSize
, cmTraceIt = True
, cmSlotMap = slotMap''}
else let
cdf1sVar' = incCdf forgeDelay cmCdf1sVar
cdf3sVar' = incCdf forgeDelay cmCdf3sVar
cdf5sVar' = incCdf forgeDelay cmCdf5sVar
-- -- Wait until we have at least 45 samples before we start providing
-- -- cdf estimates.
in if Pq.size slotMap' >= 45
then pure cm {
cmCdf1sVar = cdf1sVar'
, cmCdf3sVar = cdf3sVar'
, cmCdf5sVar = cdf5sVar'
, cmDelay = realToFrac forgeDelay
, cmBlockSize = getSizeInBytes blockSize
, cmTraceIt = True
, cmSlotMap = slotMap'}
else pure cm {
cmCdf1sVar = cdf1sVar'
, cmCdf3sVar = cdf3sVar'
, cmCdf5sVar = cdf5sVar'
, cmTraceIt = False
, cmSlotMap = slotMap'}
(TraceLabelPeer _ (BlockFetch.CompletedBlockFetch p _ _ _ forgeDelay blockSize)) =
case pointSlot p of
Origin -> pure cm {cmTraceIt = False} -- Nothing to do for Origin
At (SlotNo slotNo) ->
if Pq.null cmSlotMap && forgeDelay > 20
then pure cm {cmTraceIt = False} -- During startup wait until we are in sync
else processSlot slotNo
where
processSlot slotNo =
case Pq.lookup (fromIntegral slotNo) cmSlotMap of
Just _ -> pure cm {cmTraceIt = False} -- Duplicate, only track the first
Nothing -> let slotMap' = Pq.insert (fromIntegral slotNo) slotNo forgeDelay cmSlotMap
in if Pq.size slotMap' > 1080
then trimSlotMap slotMap' slotNo
else updateMetrics slotMap' slotNo

trimSlotMap slotMap' slotNo =
case Pq.minView slotMap' of
Nothing -> pure cm {cmTraceIt = False} -- Error: Just inserted element
Just (_, minSlotNo, minDelay, slotMap'') ->
if minSlotNo == slotNo
then pure cm { cmTraceIt = False, cmSlotMap = slotMap' }
else let updatedMetrics = updateCDFs minDelay forgeDelay
in pure cm
{ cmCdf1sVar = fst3 updatedMetrics
, cmCdf3sVar = snd3 updatedMetrics
, cmCdf5sVar = thd3 updatedMetrics
, cmDelay = realToFrac forgeDelay
, cmBlockSize = getSizeInBytes blockSize
, cmSlotMap = slotMap'' }

updateMetrics slotMap' _slotNo =
let updatedMetrics = updateCDFs 0 forgeDelay
in if Pq.size slotMap' >= 45
then pure cm
{ cmCdf1sVar = fst3 updatedMetrics
, cmCdf3sVar = snd3 updatedMetrics
, cmCdf5sVar = thd3 updatedMetrics
, cmDelay = realToFrac forgeDelay
, cmBlockSize = getSizeInBytes blockSize
, cmSlotMap = slotMap' }
else pure cm
{ cmCdf1sVar = fst3 updatedMetrics
, cmCdf3sVar = snd3 updatedMetrics
, cmCdf5sVar = thd3 updatedMetrics
, cmTraceVars = False
, cmSlotMap = slotMap' }

updateCDFs minDelay forgeDelay' =
( incCdf forgeDelay' (decCdf minDelay cmCdf1sVar)
, incCdf forgeDelay' (decCdf minDelay cmCdf3sVar)
, incCdf forgeDelay' (decCdf minDelay cmCdf5sVar) )

fst3 (x, _, _) = x
snd3 (_, y, _) = y
thd3 (_, _, z) = z

calculateBlockFetchClientMetrics cm _lc _ = pure cm


--------------------------------------------------------------------------------
-- BlockFetchDecision Tracer
--------------------------------------------------------------------------------
Expand Down Expand Up @@ -863,7 +876,9 @@ instance MetaTrace (TraceBlockFetchServerEvent blk) where
severityFor _ _ = Nothing

metricsDocFor (Namespace [] ["SendBlock"]) =
[("served.block", "")]
[("served.block", "This counter metric indicates how many blocks this node has served.")
,("served.block.latest", "This counter metric indicates how many chain tip blocks this node has served.")]

metricsDocFor _ = []

documentFor (Namespace [] ["SendBlock"]) = Just
Expand All @@ -872,6 +887,47 @@ instance MetaTrace (TraceBlockFetchServerEvent blk) where

allNamespaces = [Namespace [] ["SendBlock"]]

--------------------------------------------------------------------------------
-- Metric for server block latest
--------------------------------------------------------------------------------

data ServedBlock = ServedBlock {
maxSlotNo :: SlotNo
, localUp :: Word64
, servedBlocksLatest :: Word64
}

instance LogFormatting ServedBlock where
forMachine _mDtal ServedBlock {} = mempty

asMetrics ServedBlock {..} =
[IntM "served.block.latest" (fromIntegral servedBlocksLatest)]

emptyServedBlocks :: ServedBlock
emptyServedBlocks = ServedBlock 0 0 0

servedBlockLatest ::
Maybe (Trace IO FormattedMessage)
-> IO (Trace IO (TraceLabelPeer peer (TraceBlockFetchServerEvent blk)))
servedBlockLatest mbTrEKG =
foldTraceM calculateServedBlockLatest emptyServedBlocks
(metricsFormatter
(mkMetricsTracer mbTrEKG))

calculateServedBlockLatest :: ServedBlock
-> LoggingContext
-> TraceLabelPeer peer (TraceBlockFetchServerEvent blk)
-> IO ServedBlock
calculateServedBlockLatest ServedBlock{..} _lc (TraceLabelPeer _ (TraceBlockFetchServerSendBlock p)) =
case pointSlot p of
Origin -> return $ ServedBlock maxSlotNo localUp servedBlocksLatest
At slotNo ->
case compare maxSlotNo slotNo of
LT -> return $ ServedBlock slotNo (localUp + 1) (localUp + 1)
GT -> return $ ServedBlock maxSlotNo localUp servedBlocksLatest
EQ -> return $ ServedBlock maxSlotNo (localUp + 1) (localUp + 1)


--------------------------------------------------------------------------------
-- Gdd Tracer
--------------------------------------------------------------------------------
Expand Down Expand Up @@ -975,6 +1031,7 @@ instance LogFormatting SanityCheckIssue where
"Configuration contains multiple security parameters: " <> Text.pack (show e)



--------------------------------------------------------------------------------
-- TxInbound Tracer
--------------------------------------------------------------------------------
Expand Down
16 changes: 8 additions & 8 deletions cardano-node/src/Cardano/Node/Tracing/Tracers/P2P.hs
Original file line number Diff line number Diff line change
Expand Up @@ -921,29 +921,29 @@ instance LogFormatting PeerSelectionCounters where
PeerSelectionCountersHWC {..} ->
-- Deprecated metrics; they will be removed in a future version.
[ IntM
"peerSelection.cold"
"peerSelection.Cold"
(fromIntegral numberOfColdPeers)
, IntM
"peerSelection.warm"
"peerSelection.Warm"
(fromIntegral numberOfWarmPeers)
, IntM
"peerSelection.hot"
"peerSelection.Hot"
(fromIntegral numberOfHotPeers)
, IntM
"peerSelection.coldBigLedgerPeers"
"peerSelection.ColdBigLedgerPeers"
(fromIntegral numberOfColdBigLedgerPeers)
, IntM
"peerSelection.warmBigLedgerPeers"
"peerSelection.WarmBigLedgerPeers"
(fromIntegral numberOfWarmBigLedgerPeers)
, IntM
"peerSelection.hotBigLedgerPeers"
"peerSelection.HotBigLedgerPeers"
(fromIntegral numberOfHotBigLedgerPeers)

, IntM
"peerSelection.warmLocalRoots"
"peerSelection.WarmLocalRoots"
(fromIntegral $ numberOfActiveLocalRootPeers psc)
, IntM
"peerSelection.hotLocalRoots"
"peerSelection.HotLocalRoots"
(fromIntegral $ numberOfEstablishedLocalRootPeers psc
- numberOfActiveLocalRootPeers psc)
]
Expand Down
7 changes: 5 additions & 2 deletions cardano-node/src/Cardano/Node/Tracing/Tracers/Startup.hs
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,9 @@ instance ( Show (BlockNodeToNodeVersion blk)
NotEffective -> 0
)]
asMetrics (BICommon BasicInfoCommon {..}) =
[ PrometheusM "basicInfo" [("nodeStartTime", (pack . show) biNodeStartTime)]]
[ PrometheusM "basicInfo" [("nodeStartTime", (pack . show) biNodeStartTime)]
, IntM "node.start.time" ((ceiling . utcTimeToPOSIXSeconds) biNodeStartTime)
]
asMetrics _ = []

instance MetaTrace (StartupTrace blk) where
Expand Down Expand Up @@ -409,7 +411,8 @@ instance MetaTrace (StartupTrace blk) where
metricsDocFor (Namespace _ ["BlockForgingUpdate"]) =
[("forging_enabled","Can this node forge blocks? (Is it provided with block forging credentials) 0 = no, 1 = yes")]
metricsDocFor (Namespace _ ["Common"]) =
[("systemStartTime","The UTC time this node was started.")]
[("systemStartTime","The UTC time this node was started."),
("node.start.time","The UTC time this node was started represented in POSIX seconds.")]


metricsDocFor _ = []
Expand Down
Loading