The logic does seem to help, as you can tell below:
<Event Severity="10" Time="1538815785.209264" Type="TLogLocked" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" TLog="0c09b93d6932812d" end="27672122785" logGroup="default"/>
<Event Severity="10" Time="1538815785.209264" Type="DelayingMasterRecovery" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" StatusCode="2" Status="locking_old_transaction_servers" MissingIDs="b2a01e059992bff38bff75465ae563c8, 4059bc9019d0c3b086f720e232db82eb, 16b0804b7f2b76c138c6afd2c1fe8995, 7ca18acd06cb634d5866fa62ca3e36ac" logGroup="default" TrackLatestType="Original"/>
<Event Severity="10" Time="1538815786.209325" Type="DelayingMasterRecovery" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" StatusCode="2" Status="locking_old_transaction_servers" MissingIDs="b2a01e059992bff38bff75465ae563c8, 4059bc9019d0c3b086f720e232db82eb, 16b0804b7f2b76c138c6afd2c1fe8995, 7ca18acd06cb634d5866fa62ca3e36ac" logGroup="default" TrackLatestType="Original"/>
<Event Severity="10" Time="1538815786.733162" Type="ConnectionFrom" Machine="10.42.2.10:4683" ID="c4419d54e87a14ab" FromAddress="10.42.2.59:59396" SuppressedEventCount="35" logGroup="default"/>
<Event Severity="10" Time="1538815786.771376" Type="ConnectionEstablished" Machine="10.42.2.10:4683" ID="c4419d54e87a14ab" Peer="10.42.2.59:59396" ConnectionId="1" SuppressedEventCount="35" logGroup="default"/>
<Event Severity="10" Time="1538815786.771376" Type="IncomingConnection" Machine="10.42.2.10:4683" ID="c4419d54e87a14ab" FromAddr="10.42.2.59:59396" CanonicalAddr="10.42.2.59:4653" IsPublic="1" logGroup="default"/>
<Event Severity="10" Time="1538815786.771376" Type="ConnectionClosed" Machine="10.42.2.10:4683" ID="0000000000000000" PeerAddr="10.42.2.59:4653" Error="operation_cancelled" ErrorDescription="Asynchronous operation cancelled" ErrorCode="1101" SuppressedEventCount="35" logGroup="default"/>
<Event Severity="10" Time="1538815786.771376" Type="TLogJoinedMeUnknown" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" TLog="ca88d97401ad13cf" Address="10.42.2.59:4653" logGroup="default"/>
<Event Severity="10" Time="1538815786.771376" Type="TLogJoinedMe" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" TLog="7ca18acd06cb634d" Address="10.42.2.59:4653" logGroup="default"/>
<Event Severity="10" Time="1538815786.772171" Type="TLogLocked" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" TLog="7ca18acd06cb634d" end="27672122785" logGroup="default"/>
<Event Severity="10" Time="1538815787.035086" Type="IncomingConnection" Machine="10.42.2.10:4683" ID="f57166692cca38e1" FromAddr="10.42.2.59:59398" CanonicalAddr="10.42.2.59:4650" IsPublic="1" logGroup="default"/>
<Event Severity="10" Time="1538815787.036583" Type="TLogJoinedMe" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" TLog="b2a01e059992bff3" Address="10.42.2.59:4650" logGroup="default"/>
<Event Severity="10" Time="1538815787.036583" Type="TLogJoinedMeUnknown" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" TLog="a7b2380a1d47c806" Address="10.42.2.59:4650" logGroup="default"/>
<Event Severity="10" Time="1538815787.037139" Type="TLogLocked" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" TLog="b2a01e059992bff3" end="27672122785" logGroup="default"/>
<Event Severity="10" Time="1538815787.129738" Type="IncomingConnection" Machine="10.42.2.10:4683" ID="67e25797c8b3551f" FromAddr="10.42.2.59:59400" CanonicalAddr="10.42.2.59:4652" IsPublic="1" logGroup="default"/>
<Event Severity="10" Time="1538815787.132086" Type="TLogJoinedMe" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" TLog="16b0804b7f2b76c1" Address="10.42.2.59:4652" logGroup="default"/>
<Event Severity="10" Time="1538815787.132086" Type="TLogJoinedMeUnknown" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" TLog="9e23d610a8de4849" Address="10.42.2.59:4652" logGroup="default"/>
<Event Severity="10" Time="1538815787.132872" Type="TLogLocked" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" TLog="16b0804b7f2b76c1" end="27672122785" logGroup="default"/>
<Event Severity="10" Time="1538815787.209444" Type="DelayingMasterRecovery" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" StatusCode="2" Status="locking_old_transaction_servers" MissingIDs="4059bc9019d0c3b086f720e232db82eb" logGroup="default" TrackLatestType="Original"/>
<Event Severity="10" Time="1538815787.296189" Type="IncomingConnection" Machine="10.42.2.10:4683" ID="7a609d4744a84de5" FromAddr="10.42.2.59:59402" CanonicalAddr="10.42.2.59:4651" IsPublic="1" logGroup="default"/>
<Event Severity="10" Time="1538815787.296522" Type="TLogJoinedMe" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" TLog="4059bc9019d0c3b0" Address="10.42.2.59:4651" logGroup="default"/>
<Event Severity="10" Time="1538815787.296522" Type="TLogJoinedMeUnknown" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" TLog="fe60a433f5e28635" Address="10.42.2.59:4651" logGroup="default"/>
<Event Severity="10" Time="1538815787.296953" Type="TLogLocked" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" TLog="4059bc9019d0c3b0" end="27672122785" logGroup="default"/>
<Event Severity="10" Time="1538815788.209522" Type="LogSystemRecovery" Machine="10.42.2.10:4683" ID="8bfb28dc474f8e23" Cycles="40" TotalServers="40" Present="40" Available="40" Absent="0" ServerState="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" ReplicationFactor="2" AntiQuorum="0" Policy="zoneid^2 x 1" TooManyFailures="0" LastVersion="-1" RecoveryVersion="27671204392" EndVersion="27671204392" SafeBegin="0" SafeEnd="2" NewSafeBegin="0" LogZones="a188c033405bf0c7e3e3f4ab4db7619d,5f207adfb8bce3482a137e74236271f9,08e17799c6882ca97ffd786f10738e2e,5f207adfb8bce3482a137e74236271f9,6034975fa591c43bbd18532021638a03,b98e961a7cd5aca9ccf113d9dc902674,0439c79a19ca33c549459b4190c83245,f1659782186ec6fd118af10c247de0b0,7b3a48e3dde447444009d7c0c54f88d1,b98e961a7cd5aca9ccf113d9dc902674,0439c79a19ca33c549459b4190c83245,0439c79a19ca33c549459b4190c83245,5f207adfb8bce3482a137e74236271f9,0439c79a19ca33c549459b4190c83245,08e17799c6882ca97ffd786f10738e2e,\..." LogDataHalls="[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset],[unset]" tLogs="40" oldTlogsSize="0" logSystemType="2" At="27671204392" AvailableServers="40" knownCommittedVersion="27671202579" logGroup="default"/>
Specifically, because of the delay, it waited until all logs have joined before attempting recovery which meant it likely had a higher knownCommittedVersion than before (I should have logged that information when the code attempted the delay).
Obviously, I think for SSD storage engines, this should not be the default (meaning the knob should default to zero) but for memory, since the trade-off is huge (we still had some storage servers rollback in our tests), it’s probably a good idea to wait a couple seconds (up to 10 perhaps in limited experiments I just did) just in case the logs were just restarted or a temporary issue prevented them from responding.