I’m running two clusters and one of those clusters runs a set of dr_agent processes.
I can’t seem to make the switchover work.
It seems I am running into the `DBA_SwitchOverPossibleDRAgentsIncorrectSetup` trace event.
This is the event trace from the `fdbdr switch` command:
bash-5.1$ fdbdr switch -s /etc/foundationdb/primary_fdb.cluster -d /etc/foundationdb/fdb.cluster --log
ERROR: An error was encountered during submission
Fatal Error: Backup error
bash-5.1$ cat trace.172.26.8.190.325.1750417328.sFK5VL.0.1.xml
<?xml version="1.0"?>
<Trace>
<Event Severity="10" Time="1750417328.054636" DateTime="2025-06-20T11:02:08Z" Type="Net2Starting" ID="0000000000000000" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.055318" DateTime="2025-06-20T11:02:08Z" Type="ProgramStart" ID="0000000000000000" SourceVersion="52d9b323d90e0f8a682bc786d706997dbc7f05b0" Version="7.3.59" PackageName="7.3" ActualTime="1750417328" CommandLine="fdbdr switch -s /etc/foundationdb/primary_fdb.cluster -d /etc/foundationdb/fdb.cluster --log" MemoryLimit="8589934592" Proxy="" ThreadID="17344581416006569760" TrackLatestType="Original" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.054736" DateTime="2025-06-20T11:02:08Z" Type="ClientStart" ID="0000000000000000" SourceVersion="52d9b323d90e0f8a682bc786d706997dbc7f05b0" Version="7.3.59" PackageName="7.3" ActualTime="1750417328" ApiVersion="730" ClientLibrary="fdbdr" ImageOffset="(nil)" Primary="1" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" TrackLatestType="Original" />
<Event Severity="10" Time="1750417328.054736" DateTime="2025-06-20T11:02:08Z" Type="MachineLoadDetail" ID="0000000000000000" User="47697765" Nice="334178" System="28434570" Idle="967418241" IOWait="5206855" IRQ="0" SoftIRQ="7133392" Steal="184572" Guest="0" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.054736" DateTime="2025-06-20T11:02:08Z" Type="Net2TLSConfig" ID="0000000000000000" CAPath="/etc/foundationdb/ssl/ca.pem" CertificatePath="/etc/foundationdb/ssl/cert.pem" KeyPath="/etc/foundationdb/ssl/key.pem" HasPassword="0" VerifyPeers="Check.Valid=1" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.054736" DateTime="2025-06-20T11:02:08Z" Type="NotifyAddressHealthy" ID="0000000000000000" SuppressedEventCount="0" Address="172.26.11.128:4600:tls" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.054736" DateTime="2025-06-20T11:02:08Z" Type="DatabaseContextCreated" ID="873cefdda41b27ff" Backtrace="addr2line -e fdbdr.debug -p -C -f -i 0x16e8e6d 0xf1a412 0xf29005 0x988eef 0x9920b5 0x98d065 0x7efcca5e9590" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.054736" DateTime="2025-06-20T11:02:08Z" Type="ConnectToDatabase" ID="873cefdda41b27ff" Version="7.3.59" ClusterFile="file:///etc/foundationdb/fdb.cluster" ConnectionString="store_fdb_dr_pp_fra1:YTiJnPcG@172.26.11.128:4600:tls,172.26.11.3:4600:tls,172.26.8.190:4600:tls" ClientLibrary="fdbdr" Primary="1" Internal="1" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" TrackLatestType="Original" />
<Event Severity="10" Time="1750417328.054736" DateTime="2025-06-20T11:02:08Z" Type="DatabaseContextCreated" ID="3268247b4027f23a" Backtrace="addr2line -e fdbdr.debug -p -C -f -i 0x16e8e6d 0xf1a412 0xf29005 0x988eef 0x992986 0x98d092 0x7efcca5e9590" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.054736" DateTime="2025-06-20T11:02:08Z" Type="ConnectToDatabase" ID="3268247b4027f23a" Version="7.3.59" ClusterFile="file:///etc/foundationdb/primary_fdb.cluster" ConnectionString="store_fdb_pp_fra1:dAz0JWb92cXkWQQILX62B2MqEfMrKvvz@172.26.8.40:4600:tls,172.26.8.188:4600:tls,172.26.9.12:4600:tls,172.26.9.196:4600:tls,172.26.10.146:4600:tls" ClientLibrary="fdbdr" Primary="1" Internal="1" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" TrackLatestType="Original" />
<Event Severity="10" Time="1750417328.054736" DateTime="2025-06-20T11:02:08Z" Type="CodeCoverage" ID="0000000000000000" File="fdbclient/ReadYourWrites.actor.cpp" Line="1632" Condition="true" Covered="1" Comment="ReadYourWritesTransaction::get" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.054736" DateTime="2025-06-20T11:02:08Z" Type="StartingRunLoopProfilingThread" ID="0000000000000000" Interval="0.125" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.054736" DateTime="2025-06-20T11:02:08Z" Type="Net2Running" ID="0000000000000000" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.054736" DateTime="2025-06-20T11:02:08Z" Type="ProcessTimeOffset" ID="0000000000000000" ProcessTime="1750417328.060683" SystemTime="1750417328.060683" OffsetFromSystemTime="-0.000000" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.060713" DateTime="2025-06-20T11:02:08Z" Type="ConnectingTo" ID="0000000000000000" SuppressedEventCount="0" PeerAddr="172.26.8.190:4600:tls" PeerAddress="172.26.8.190:4600:tls" PeerReferences="3" FailureStatus="OK" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.067115" DateTime="2025-06-20T11:02:08Z" Type="ConnectionExchangingConnectPacket" ID="75dff09623f041b8" SuppressedEventCount="0" PeerAddr="172.26.9.12:4600:tls" PeerAddress="172.26.9.12:4600:tls" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.067671" DateTime="2025-06-20T11:02:08Z" Type="ConnectionEstablished" ID="75dff09623f041b8" SuppressedEventCount="0" Peer="172.26.9.12:4600:tls" PeerAddress="172.26.9.12:4600:tls" ConnectionId="1" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.067671" DateTime="2025-06-20T11:02:08Z" Type="ConnectedOutgoing" ID="0000000000000000" SuppressedEventCount="0" PeerAddr="172.26.9.12:4600" PeerAddress="172.26.9.12:4600" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.089691" DateTime="2025-06-20T11:02:08Z" Type="DBA_SwitchoverStart" ID="0000000000000000" Status="3" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.089691" DateTime="2025-06-20T11:02:08Z" Type="MonitorLeaderChange" ID="0000000000000000" NewLeader="0000000000000001" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.090632" DateTime="2025-06-20T11:02:08Z" Type="GetLeaderReply" ID="0000000000000000" SuppressedEventCount="0" Coordinator="172.26.9.12:4600:tls" Nominee="640552aa70627052" ClusterKey="store_fdb_pp_fra1:dAz0JWb92cXkWQQILX62B2MqEfMrKvvz" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.090632" DateTime="2025-06-20T11:02:08Z" Type="MonitorLeaderChange" ID="0000000000000000" NewLeader="640552aa70627052" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.090632" DateTime="2025-06-20T11:02:08Z" Type="MonitorLeaderChange" ID="0000000000000000" NewLeader="640552aa70627052" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.097641" DateTime="2025-06-20T11:02:08Z" Type="MonitorLeaderChange" ID="0000000000000000" NewLeader="640552aa70627052" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.097775" DateTime="2025-06-20T11:02:08Z" Type="MonitorLeaderChange" ID="0000000000000000" NewLeader="640552aa70627052" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.098440" DateTime="2025-06-20T11:02:08Z" Type="MonitorLeaderChange" ID="0000000000000000" NewLeader="640552aa70627052" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.138812" DateTime="2025-06-20T11:02:08Z" Type="MonitorLeaderChange" ID="0000000000000000" NewLeader="0000000000000001" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.148620" DateTime="2025-06-20T11:02:08Z" Type="MonitorLeaderChange" ID="0000000000000000" NewLeader="6425267b92c9a9ed" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.154705" DateTime="2025-06-20T11:02:08Z" Type="MonitorLeaderChange" ID="0000000000000000" NewLeader="6425267b92c9a9ed" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.155693" DateTime="2025-06-20T11:02:08Z" Type="MonitorLeaderChange" ID="0000000000000000" NewLeader="6425267b92c9a9ed" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="20" Time="1750417328.179483" DateTime="2025-06-20T11:02:08Z" Type="DBA_SwitchOverPossibleDRAgentsIncorrectSetup" ID="0000000000000000" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="40" ErrorKind="Unset" Time="1750417328.179483" DateTime="2025-06-20T11:02:08Z" Type="StopAfterError" ID="0000000000000000" Error="backup_error" ErrorDescription="Backup error" ErrorCode="2300" ThreadID="17344581416006569760" Backtrace="addr2line -e fdbdr.debug -p -C -f -i 0x16e8e6d 0x16e9133 0x16e3334 0x9c18f1 0x9c1d37 0x9c9929 0x9ab60f 0x9ab987 0x9c9929 0xc71631 0x13f4008 0x13f4706 0x13f4567 0x13f2c1c 0x13f2761 0x13f23b4 0x13f0a8b 0x13f1ce8 0x13f11fc 0xee4368 0xee403a 0x14cb5bd 0x14cae93 0x1672398 0xf2ccd6 0x98dd75 0x7efcca5e9590" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.179483" DateTime="2025-06-20T11:02:08Z" Type="MachineLoadDetail" ID="0000000000000000" User="47697777" Nice="334179" System="28434573" Idle="967418276" IOWait="5206855" IRQ="0" SoftIRQ="7133393" Steal="184572" Guest="0" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.179483" DateTime="2025-06-20T11:02:08Z" Type="ProcessMetrics" ID="0000000000000000" Elapsed="0.129026" CPUSeconds="0.049436" MainThreadCPUSeconds="0.048017" UptimeSeconds="0.124747" Memory="415588352" ResidentMemory="27611136" UnusedAllocatedMemory="130816" MbpsSent="0.518098" MbpsReceived="9.88901" DiskTotalBytes="0" DiskFreeBytes="0" DiskQueueDepth="0" DiskIdleSeconds="0" DiskReads="0" DiskReadSeconds="0" DiskWrites="0" DiskWriteSeconds="0" DiskReadsCount="0" DiskWritesCount="0" DiskWriteSectors="0" DiskReadSectors="0" FileWrites="0" FileReads="0" CacheReadBytes="0" CacheFinds="0" CacheWritesBlocked="0" CacheReadsBlocked="0" CachePageReadsMerged="0" CacheWrites="0" CacheReads="0" CacheHits="0" CacheMisses="0" CacheEvictions="0" DCID="[not set]" ZoneID="[not set]" MachineID="[not set]" Version="[not set]" AIOSubmitCount="0" AIOCollectCount="0" AIOSubmitLag="0" AIODiskStall="0" CurrentConnections="12" ConnectionsEstablished="93.0047" ConnectionsClosed="0" ConnectionErrors="0" TLSPolicyFailures="0" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" TrackLatestType="Original" />
<Event Severity="10" Time="1750417328.179483" DateTime="2025-06-20T11:02:08Z" Type="MemoryMetrics" ID="0000000000000000" TotalMemory16="0" ApproximateUnusedMemory16="0" ActiveThreads16="0" TotalMemory32="131072" ApproximateUnusedMemory32="0" ActiveThreads32="1" TotalMemory64="393216" ApproximateUnusedMemory64="130816" ActiveThreads64="2" TotalMemory96="131040" ApproximateUnusedMemory96="0" ActiveThreads96="1" TotalMemory128="131072" ApproximateUnusedMemory128="0" ActiveThreads128="1" TotalMemory256="262144" ApproximateUnusedMemory256="0" ActiveThreads256="2" TotalMemory512="0" ApproximateUnusedMemory512="0" ActiveThreads512="0" TotalMemory1024="0" ApproximateUnusedMemory1024="0" ActiveThreads1024="0" TotalMemory2048="0" ApproximateUnusedMemory2048="0" ActiveThreads2048="0" TotalMemory4096="0" ApproximateUnusedMemory4096="0" ActiveThreads4096="0" TotalMemory8192="0" ApproximateUnusedMemory8192="0" ActiveThreads8192="0" TotalMemory16384="0" ApproximateUnusedMemory16384="0" ActiveThreads16384="0" HugeArenaMemory="120128" DCID="[not set]" ZoneID="[not set]" MachineID="[not set]" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.179483" DateTime="2025-06-20T11:02:08Z" Type="FastAllocMemoryUsage" ID="0000000000000000" TotalMemory="1048544" UnusedMemory="130816" Utilization="87.524033%" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" />
<Event Severity="10" Time="1750417328.179483" DateTime="2025-06-20T11:02:08Z" Type="NetworkMetrics" ID="0000000000000000" Elapsed="0.129026" CantSleep="0" WontSleep="2" Yields="5" YieldCalls="61" YieldCallsTrue="2" RunLoopProfilingSignals="0" YieldBigStack="0" RunLoopIterations="81" TimersExecuted="28" TasksExecuted="152" ASIOEventsProcessed="187" ReadCalls="82" WriteCalls="27" ReadProbes="43" WriteProbes="0" PacketsRead="35" PacketsGenerated="45" WouldBlock="43" LaunchTime="2.16961e-05" ReactTime="0.00838661" DCID="[not set]" ZoneID="[not set]" MachineID="[not set]" SlowTask2M="3" SlowTask4M="1" SlowTask8M="1" SlowTask16M="1" PriorityBusy0="0.0943108" PriorityBusy8000="0.0156324" PriorityBusy20001="0.00838661" PriorityStarvedBelow1="0.024466" PriorityMaxStarvedBelow1="0.00971866" PriorityStarvedBelow3500="0.024466" PriorityMaxStarvedBelow3500="0.00971866" PriorityStarvedBelow7000="0.0242898" PriorityMaxStarvedBelow7000="0.00971866" PriorityStarvedBelow7500="0.0227959" PriorityMaxStarvedBelow7500="0.00960493" PriorityStarvedBelow8500="0.00711441" PriorityMaxStarvedBelow8500="0.00292015" PriorityStarvedBelow8900="0.00608134" PriorityMaxStarvedBelow8900="0.00292015" PriorityStarvedBelow10500="0.00327492" PriorityMaxStarvedBelow10500="0.00270176" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" TrackLatestType="Original" />
<Event Severity="10" Time="1750417328.179483" DateTime="2025-06-20T11:02:08Z" Type="MachineMetrics" ID="0000000000000000" Elapsed="0.129026" MbpsSent="17.2265" MbpsReceived="24.3237" OutSegs="928" RetransSegs="0" CPUSeconds="0.0421815" TotalMemory="16760942592" CommittedMemory="1158873088" AvailableMemory="15602069504" DCID="[not set]" ZoneID="[not set]" MachineID="[not set]" DatahallID="[not set]" ThreadID="17344581416006569760" Machine="172.26.8.190:325" LogGroup="default" ClientDescription="primary-7.3.59-7349041801802113034" TrackLatestType="Original" />
</Trace>
I start my dr_agents in the `foundationdb.conf` file like this:
...
[dr_agent]
command = /usr/bin/dr_agent
logdir = /var/log/foundationdb/dr_agent
trace_format = json
source = /etc/foundationdb/primary_fdb.cluster
[fdbserver.4600]
[fdbserver.4601]
[fdbserver.4602]
[dr_agent.0]
[dr_agent.1]
[dr_agent.2]
The secondary cluster gets up to speed when I start the DR with `fdbdr`, so the `dr_agents` are working fine. Only the `switch` operation seems to error out.
What’s “incorrect setup” about them?